5.14-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 13 Sep 2021 11:36:42 +0000 (13:36 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 13 Sep 2021 11:36:42 +0000 (13:36 +0200)
added patches:
arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch
char-tpm-kconfig-remove-bad-i2c-cr50-select.patch
fuse-flush-extending-writes.patch
fuse-truncate-pagecache-on-atomic_o_trunc.patch
fuse-wait-for-writepages-in-syncfs.patch
ima-remove-the-dependency-on-crypto_md5.patch
ima-remove-wmissing-prototypes-warning.patch
io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch
kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch
kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch
kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch
kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch
kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch
kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch
kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch
kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch
md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch
perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch

20 files changed:
queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch [new file with mode: 0644]
queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch [new file with mode: 0644]
queue-5.14/fuse-flush-extending-writes.patch [new file with mode: 0644]
queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch [new file with mode: 0644]
queue-5.14/fuse-wait-for-writepages-in-syncfs.patch [new file with mode: 0644]
queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch [new file with mode: 0644]
queue-5.14/ima-remove-wmissing-prototypes-warning.patch [new file with mode: 0644]
queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch [new file with mode: 0644]
queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch [new file with mode: 0644]
queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch [new file with mode: 0644]
queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch [new file with mode: 0644]
queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch [new file with mode: 0644]
queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch [new file with mode: 0644]
queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch [new file with mode: 0644]
queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch [new file with mode: 0644]
queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch [new file with mode: 0644]
queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch [new file with mode: 0644]
queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch [new file with mode: 0644]
queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch [new file with mode: 0644]
queue-5.14/series

diff --git a/queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch b/queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch
new file mode 100644 (file)
index 0000000..3acb65d
--- /dev/null
@@ -0,0 +1,189 @@
+From bf781869e5cf3e4ec1a47dad69b6f0df97629cbd Mon Sep 17 00:00:00 2001
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+Date: Tue, 27 Jul 2021 10:40:05 +0300
+Subject: ARM: dts: at91: add pinctrl-{names, 0} for all gpios
+
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+
+commit bf781869e5cf3e4ec1a47dad69b6f0df97629cbd upstream.
+
+Add pinctrl-names and pinctrl-0 properties on controllers that claim to
+use pins, to avoid failures due to
+commit 2ab73c6d8323 ("gpio: Support GPIO controllers without pin-ranges")
+and also to avoid using pins that may be claimed by other IPs.
+
+Fixes: b7c2b6157079 ("ARM: at91: add Atmel's SAMA5D3 Xplained board")
+Fixes: 1e5f532c2737 ("ARM: dts: at91: sam9x60: add device tree for soc and board")
+Fixes: 38153a017896 ("ARM: at91/dt: sama5d4: add dts for sama5d4 xplained board")
+Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
+Link: https://lore.kernel.org/r/20210727074006.1609989-1-claudiu.beznea@microchip.com
+Cc: <stable@vger.kernel.org> # v5.7+
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/boot/dts/at91-sam9x60ek.dts        |   16 ++++++++++++++-
+ arch/arm/boot/dts/at91-sama5d3_xplained.dts |   29 ++++++++++++++++++++++++++++
+ arch/arm/boot/dts/at91-sama5d4_xplained.dts |   19 ++++++++++++++++++
+ 3 files changed, 63 insertions(+), 1 deletion(-)
+
+--- a/arch/arm/boot/dts/at91-sam9x60ek.dts
++++ b/arch/arm/boot/dts/at91-sam9x60ek.dts
+@@ -92,6 +92,8 @@
+       leds {
+               compatible = "gpio-leds";
++              pinctrl-names = "default";
++              pinctrl-0 = <&pinctrl_gpio_leds>;
+               status = "okay"; /* Conflict with pwm0. */
+               red {
+@@ -537,6 +539,10 @@
+                                AT91_PIOA 19 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI)     /* PA19 DAT2 periph A with pullup */
+                                AT91_PIOA 20 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI)>;   /* PA20 DAT3 periph A with pullup */
+               };
++              pinctrl_sdmmc0_cd: sdmmc0_cd {
++                      atmel,pins =
++                              <AT91_PIOA 23 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++              };
+       };
+       sdmmc1 {
+@@ -569,6 +575,14 @@
+                                     AT91_PIOD 16 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
+               };
+       };
++
++      leds {
++              pinctrl_gpio_leds: gpio_leds {
++                      atmel,pins = <AT91_PIOB 11 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
++                                    AT91_PIOB 12 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
++                                    AT91_PIOB 13 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++              };
++      };
+ }; /* pinctrl */
+ &pwm0 {
+@@ -580,7 +594,7 @@
+ &sdmmc0 {
+       bus-width = <4>;
+       pinctrl-names = "default";
+-      pinctrl-0 = <&pinctrl_sdmmc0_default>;
++      pinctrl-0 = <&pinctrl_sdmmc0_default &pinctrl_sdmmc0_cd>;
+       status = "okay";
+       cd-gpios = <&pioA 23 GPIO_ACTIVE_LOW>;
+       disable-wp;
+--- a/arch/arm/boot/dts/at91-sama5d3_xplained.dts
++++ b/arch/arm/boot/dts/at91-sama5d3_xplained.dts
+@@ -57,6 +57,8 @@
+                       };
+                       spi0: spi@f0004000 {
++                              pinctrl-names = "default";
++                              pinctrl-0 = <&pinctrl_spi0_cs>;
+                               cs-gpios = <&pioD 13 0>, <0>, <0>, <&pioD 16 0>;
+                               status = "okay";
+                       };
+@@ -169,6 +171,8 @@
+                       };
+                       spi1: spi@f8008000 {
++                              pinctrl-names = "default";
++                              pinctrl-0 = <&pinctrl_spi1_cs>;
+                               cs-gpios = <&pioC 25 0>;
+                               status = "okay";
+                       };
+@@ -248,6 +252,26 @@
+                                                       <AT91_PIOE 3 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
+                                                        AT91_PIOE 4 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
+                                       };
++
++                                      pinctrl_gpio_leds: gpio_leds_default {
++                                              atmel,pins =
++                                                      <AT91_PIOE 23 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
++                                                       AT91_PIOE 24 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
++
++                                      pinctrl_spi0_cs: spi0_cs_default {
++                                              atmel,pins =
++                                                      <AT91_PIOD 13 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
++                                                       AT91_PIOD 16 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
++
++                                      pinctrl_spi1_cs: spi1_cs_default {
++                                              atmel,pins = <AT91_PIOC 25 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
++
++                                      pinctrl_vcc_mmc0_reg_gpio: vcc_mmc0_reg_gpio_default {
++                                              atmel,pins = <AT91_PIOE 2 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
+                               };
+                       };
+               };
+@@ -339,6 +363,8 @@
+       vcc_mmc0_reg: fixedregulator_mmc0 {
+               compatible = "regulator-fixed";
++              pinctrl-names = "default";
++              pinctrl-0 = <&pinctrl_vcc_mmc0_reg_gpio>;
+               gpio = <&pioE 2 GPIO_ACTIVE_LOW>;
+               regulator-name = "mmc0-card-supply";
+               regulator-min-microvolt = <3300000>;
+@@ -362,6 +388,9 @@
+       leds {
+               compatible = "gpio-leds";
++              pinctrl-names = "default";
++              pinctrl-0 = <&pinctrl_gpio_leds>;
++              status = "okay";
+               d2 {
+                       label = "d2";
+--- a/arch/arm/boot/dts/at91-sama5d4_xplained.dts
++++ b/arch/arm/boot/dts/at91-sama5d4_xplained.dts
+@@ -90,6 +90,8 @@
+                       };
+                       spi1: spi@fc018000 {
++                              pinctrl-names = "default";
++                              pinctrl-0 = <&pinctrl_spi0_cs>;
+                               cs-gpios = <&pioB 21 0>;
+                               status = "okay";
+                       };
+@@ -147,6 +149,19 @@
+                                               atmel,pins =
+                                                       <AT91_PIOE 1 AT91_PERIPH_GPIO AT91_PINCTRL_PULL_UP_DEGLITCH>;
+                                       };
++                                      pinctrl_spi0_cs: spi0_cs_default {
++                                              atmel,pins =
++                                                      <AT91_PIOB 21 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
++                                      pinctrl_gpio_leds: gpio_leds_default {
++                                              atmel,pins =
++                                                      <AT91_PIOD 30 AT91_PERIPH_GPIO AT91_PINCTRL_NONE
++                                                       AT91_PIOE 15 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
++                                      pinctrl_vcc_mmc1_reg: vcc_mmc1_reg {
++                                              atmel,pins =
++                                                      <AT91_PIOE 4 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
++                                      };
+                               };
+                       };
+               };
+@@ -252,6 +267,8 @@
+       leds {
+               compatible = "gpio-leds";
++              pinctrl-names = "default";
++              pinctrl-0 = <&pinctrl_gpio_leds>;
+               status = "okay";
+               d8 {
+@@ -278,6 +295,8 @@
+       vcc_mmc1_reg: fixedregulator_mmc1 {
+               compatible = "regulator-fixed";
++              pinctrl-names = "default";
++              pinctrl-0 = <&pinctrl_vcc_mmc1_reg>;
+               gpio = <&pioE 4 GPIO_ACTIVE_LOW>;
+               regulator-name = "VDD MCI1";
+               regulator-min-microvolt = <3300000>;
diff --git a/queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch b/queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch
new file mode 100644 (file)
index 0000000..18cafc4
--- /dev/null
@@ -0,0 +1,33 @@
+From 847fdae1579f4ee930b01f24a7847b8043bf468c Mon Sep 17 00:00:00 2001
+From: Adrian Ratiu <adrian.ratiu@collabora.com>
+Date: Tue, 27 Jul 2021 20:13:12 +0300
+Subject: char: tpm: Kconfig: remove bad i2c cr50 select
+
+From: Adrian Ratiu <adrian.ratiu@collabora.com>
+
+commit 847fdae1579f4ee930b01f24a7847b8043bf468c upstream.
+
+This fixes a minor bug which went unnoticed during the initial
+driver upstreaming review: TCG_CR50 does not exist in mainline
+kernels, so remove it.
+
+Fixes: 3a253caaad11 ("char: tpm: add i2c driver for cr50")
+Cc: stable@vger.kernel.org
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Adrian Ratiu <adrian.ratiu@collabora.com>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/char/tpm/Kconfig |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/char/tpm/Kconfig
++++ b/drivers/char/tpm/Kconfig
+@@ -89,7 +89,6 @@ config TCG_TIS_SYNQUACER
+ config TCG_TIS_I2C_CR50
+       tristate "TPM Interface Specification 2.0 Interface (I2C - CR50)"
+       depends on I2C
+-      select TCG_CR50
+       help
+         This is a driver for the Google cr50 I2C TPM interface which is a
+         custom microcontroller and requires a custom i2c protocol interface
diff --git a/queue-5.14/fuse-flush-extending-writes.patch b/queue-5.14/fuse-flush-extending-writes.patch
new file mode 100644 (file)
index 0000000..5441180
--- /dev/null
@@ -0,0 +1,49 @@
+From 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi <mszeredi@redhat.com>
+Date: Tue, 31 Aug 2021 14:18:08 +0200
+Subject: fuse: flush extending writes
+
+From: Miklos Szeredi <mszeredi@redhat.com>
+
+commit 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd upstream.
+
+Callers of fuse_writeback_range() assume that the file is ready for
+modification by the server in the supplied byte range after the call
+returns.
+
+If there's a write that extends the file beyond the end of the supplied
+range, then the file needs to be extended to at least the end of the range,
+but currently that's not done.
+
+There are at least two cases where this can cause problems:
+
+ - copy_file_range() will return a short count if the file is not extended
+   up to the end of the source range.
+
+ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE will not extend the file,
+   hence the region may not be fully allocated.
+
+Fix by flushing writes from the start of the range up to the end of the
+file.  This could be optimized if the writes are non-extending, etc, but
+it's probably not worth the trouble.
+
+Fixes: a2bc92362941 ("fuse: fix copy_file_range() in the writeback case")
+Fixes: 6b1bdb56b17c ("fuse: allow fallocate(FALLOC_FL_ZERO_RANGE)")
+Cc: <stable@vger.kernel.org>  # v5.2
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fuse/file.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -2884,7 +2884,7 @@ fuse_direct_IO(struct kiocb *iocb, struc
+ static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+ {
+-      int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
++      int err = filemap_write_and_wait_range(inode->i_mapping, start, -1);
+       if (!err)
+               fuse_sync_writes(inode);
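
A minimal userspace sketch of the copy_file_range() scenario described in
the commit message (the mount path and transfer size are assumptions for
illustration; this is not part of the patch). On an unfixed fuse mount
with a cached extending write, the copy below could return a short count:

/* build: cc -o cfr_demo cfr_demo.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int src = open("/mnt/fuse/src", O_RDWR | O_CREAT, 0644);
	int dst = open("/mnt/fuse/dst", O_RDWR | O_CREAT, 0644);
	char buf[8192];
	ssize_t n;

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	memset(buf, 'x', sizeof(buf));
	/* Extending write: may still sit in the page cache. */
	if (pwrite(src, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf)) {
		perror("pwrite");
		return 1;
	}
	/* Before the fix, the server may not see the full source size. */
	n = copy_file_range(src, NULL, dst, NULL, sizeof(buf), 0);
	printf("copied %zd of %zu bytes\n", n, sizeof(buf));
	close(src);
	close(dst);
	return n == (ssize_t)sizeof(buf) ? 0 : 1;
}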
diff --git a/queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch b/queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch
new file mode 100644 (file)
index 0000000..04fd413
--- /dev/null
@@ -0,0 +1,58 @@
+From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi <mszeredi@redhat.com>
+Date: Tue, 17 Aug 2021 21:05:16 +0200
+Subject: fuse: truncate pagecache on atomic_o_trunc
+
+From: Miklos Szeredi <mszeredi@redhat.com>
+
+commit 76224355db7570cbe6b6f75c8929a1558828dd55 upstream.
+
+fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic
+O_TRUNC.  This can deadlock with fuse_wait_on_page_writeback() in
+fuse_launder_page() triggered by invalidate_inode_pages2().
+
+Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a
+truncate_pagecache() call.  This makes sense regardless of FOPEN_KEEP_CACHE
+or fc->writeback cache, so do it unconditionally.
+
+Reported-by: Xie Yongji <xieyongji@bytedance.com>
+Reported-and-tested-by: syzbot+bea44a5189836d956894@syzkaller.appspotmail.com
+Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fuse/file.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inod
+       struct fuse_file *ff = file->private_data;
+       struct fuse_conn *fc = get_fuse_conn(inode);
+-      if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+-              invalidate_inode_pages2(inode->i_mapping);
+       if (ff->open_flags & FOPEN_STREAM)
+               stream_open(inode, file);
+       else if (ff->open_flags & FOPEN_NONSEEKABLE)
+               nonseekable_open(inode, file);
++
+       if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+               struct fuse_inode *fi = get_fuse_inode(inode);
+@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inod
+               fi->attr_version = atomic64_inc_return(&fc->attr_version);
+               i_size_write(inode, 0);
+               spin_unlock(&fi->lock);
++              truncate_pagecache(inode, 0);
+               fuse_invalidate_attr(inode);
+               if (fc->writeback_cache)
+                       file_update_time(file);
++      } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
++              invalidate_inode_pages2(inode->i_mapping);
+       }
++
+       if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+               fuse_link_write_file(file);
+ }
diff --git a/queue-5.14/fuse-wait-for-writepages-in-syncfs.patch b/queue-5.14/fuse-wait-for-writepages-in-syncfs.patch
new file mode 100644 (file)
index 0000000..62b68e0
--- /dev/null
@@ -0,0 +1,242 @@
+From 660585b56e63ca034ad506ea53c807c5cdca3196 Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi <mszeredi@redhat.com>
+Date: Wed, 1 Sep 2021 12:39:02 +0200
+Subject: fuse: wait for writepages in syncfs
+
+From: Miklos Szeredi <mszeredi@redhat.com>
+
+commit 660585b56e63ca034ad506ea53c807c5cdca3196 upstream.
+
+In case of fuse the MM subsystem doesn't guarantee that page writeback
+completes by the time ->sync_fs() is called.  This is because fuse
+completes page writeback immediately to prevent DoS of memory reclaim by
+the userspace file server.
+
+This means that fuse itself must ensure that writes are synced before
+sending the SYNCFS request to the server.
+
+Introduce sync buckets that hold a counter for the number of outstanding
+write requests.  On syncfs, replace the current bucket with a new one and
+wait until the old bucket's counter goes down to zero.
+
+It is possible to have multiple syncfs calls in parallel, in which case
+there could be more than one waited-on buckets.  Descendant buckets must
+not complete until the parent completes.  Add a count to the child (new)
+bucket until the (parent) old bucket completes.
+
+Use RCU protection to dereference the current bucket and to wake up an
+emptied bucket.  Use fc->lock to protect against parallel assignments to
+the current bucket.
+
+This leaves just the counter to be a possible scalability issue.  The
+fc->num_waiting counter has a similar issue, so both should be addressed at
+the same time.
+
+Reported-by: Amir Goldstein <amir73il@gmail.com>
+Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server")
+Cc: <stable@vger.kernel.org> # v5.14
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fuse/file.c   |   21 +++++++++++++++++++
+ fs/fuse/fuse_i.h |   19 +++++++++++++++++
+ fs/fuse/inode.c  |   60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 100 insertions(+)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -392,6 +392,7 @@ struct fuse_writepage_args {
+       struct list_head queue_entry;
+       struct fuse_writepage_args *next;
+       struct inode *inode;
++      struct fuse_sync_bucket *bucket;
+ };
+ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
+@@ -1611,6 +1612,9 @@ static void fuse_writepage_free(struct f
+       struct fuse_args_pages *ap = &wpa->ia.ap;
+       int i;
++      if (wpa->bucket)
++              fuse_sync_bucket_dec(wpa->bucket);
++
+       for (i = 0; i < ap->num_pages; i++)
+               __free_page(ap->pages[i]);
+@@ -1874,6 +1878,20 @@ static struct fuse_writepage_args *fuse_
+ }
++static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
++                                       struct fuse_writepage_args *wpa)
++{
++      if (!fc->sync_fs)
++              return;
++
++      rcu_read_lock();
++      /* Prevent resurrection of dead bucket in unlikely race with syncfs */
++      do {
++              wpa->bucket = rcu_dereference(fc->curr_bucket);
++      } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
++      rcu_read_unlock();
++}
++
+ static int fuse_writepage_locked(struct page *page)
+ {
+       struct address_space *mapping = page->mapping;
+@@ -1901,6 +1919,7 @@ static int fuse_writepage_locked(struct
+       if (!wpa->ia.ff)
+               goto err_nofile;
++      fuse_writepage_add_to_bucket(fc, wpa);
+       fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
+       copy_highpage(tmp_page, page);
+@@ -2151,6 +2170,8 @@ static int fuse_writepages_fill(struct p
+                       __free_page(tmp_page);
+                       goto out_unlock;
+               }
++              fuse_writepage_add_to_bucket(fc, wpa);
++
+               data->max_pages = 1;
+               ap = &wpa->ia.ap;
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -515,6 +515,13 @@ struct fuse_fs_context {
+       void **fudptr;
+ };
++struct fuse_sync_bucket {
++      /* count is a possible scalability bottleneck */
++      atomic_t count;
++      wait_queue_head_t waitq;
++      struct rcu_head rcu;
++};
++
+ /**
+  * A Fuse connection.
+  *
+@@ -807,6 +814,9 @@ struct fuse_conn {
+       /** List of filesystems using this connection */
+       struct list_head mounts;
++
++      /* New writepages go into this bucket */
++      struct fuse_sync_bucket __rcu *curr_bucket;
+ };
+ /*
+@@ -910,6 +920,15 @@ static inline void fuse_page_descs_lengt
+               descs[i].length = PAGE_SIZE - descs[i].offset;
+ }
++static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
++{
++      /* Need RCU protection to prevent use after free after the decrement */
++      rcu_read_lock();
++      if (atomic_dec_and_test(&bucket->count))
++              wake_up(&bucket->waitq);
++      rcu_read_unlock();
++}
++
+ /** Device operations */
+ extern const struct file_operations fuse_dev_operations;
+--- a/fs/fuse/inode.c
++++ b/fs/fuse/inode.c
+@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *de
+       return err;
+ }
++static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
++{
++      struct fuse_sync_bucket *bucket;
++
++      bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
++      if (bucket) {
++              init_waitqueue_head(&bucket->waitq);
++              /* Initial active count */
++              atomic_set(&bucket->count, 1);
++      }
++      return bucket;
++}
++
++static void fuse_sync_fs_writes(struct fuse_conn *fc)
++{
++      struct fuse_sync_bucket *bucket, *new_bucket;
++      int count;
++
++      new_bucket = fuse_sync_bucket_alloc();
++      spin_lock(&fc->lock);
++      bucket = rcu_dereference_protected(fc->curr_bucket, 1);
++      count = atomic_read(&bucket->count);
++      WARN_ON(count < 1);
++      /* No outstanding writes? */
++      if (count == 1) {
++              spin_unlock(&fc->lock);
++              kfree(new_bucket);
++              return;
++      }
++
++      /*
++       * Completion of new bucket depends on completion of this bucket, so add
++       * one more count.
++       */
++      atomic_inc(&new_bucket->count);
++      rcu_assign_pointer(fc->curr_bucket, new_bucket);
++      spin_unlock(&fc->lock);
++      /*
++       * Drop initial active count.  At this point if all writes in this and
++       * ancestor buckets complete, the count will go to zero and this task
++       * will be woken up.
++       */
++      atomic_dec(&bucket->count);
++
++      wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
++
++      /* Drop temp count on descendant bucket */
++      fuse_sync_bucket_dec(new_bucket);
++      kfree_rcu(bucket, rcu);
++}
++
+ static int fuse_sync_fs(struct super_block *sb, int wait)
+ {
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_blo
+       if (!fc->sync_fs)
+               return 0;
++      fuse_sync_fs_writes(fc);
++
+       memset(&inarg, 0, sizeof(inarg));
+       args.in_numargs = 1;
+       args.in_args[0].size = sizeof(inarg);
+@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc)
+ {
+       if (refcount_dec_and_test(&fc->count)) {
+               struct fuse_iqueue *fiq = &fc->iq;
++              struct fuse_sync_bucket *bucket;
+               if (IS_ENABLED(CONFIG_FUSE_DAX))
+                       fuse_dax_conn_free(fc);
+@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc)
+                       fiq->ops->release(fiq);
+               put_pid_ns(fc->pid_ns);
+               put_user_ns(fc->user_ns);
++              bucket = rcu_dereference_protected(fc->curr_bucket, 1);
++              if (bucket) {
++                      WARN_ON(atomic_read(&bucket->count) != 1);
++                      kfree(bucket);
++              }
+               fc->release(fc);
+       }
+ }
+@@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_
+       if (sb->s_flags & SB_MANDLOCK)
+               goto err;
++      rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
+       fuse_sb_defaults(sb);
+       if (ctx->is_bdev) {
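
The bucket scheme in the commit message can be modeled outside the kernel.
The sketch below is a userspace approximation only: C11 atomics plus a
mutex/condvar stand in for the kernel's atomics, RCU and fc->lock, and the
names are illustrative rather than taken from fs/fuse/.

/* build: cc -pthread -o buckets buckets.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct bucket {
	atomic_int count;		/* 1 == no outstanding writes */
	pthread_mutex_t mtx;
	pthread_cond_t waitq;
};

static struct bucket *curr_bucket;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static struct bucket *bucket_alloc(void)
{
	struct bucket *b = calloc(1, sizeof(*b));

	atomic_store(&b->count, 1);	/* initial active count */
	pthread_mutex_init(&b->mtx, NULL);
	pthread_cond_init(&b->waitq, NULL);
	return b;
}

static void bucket_dec(struct bucket *b)
{
	if (atomic_fetch_sub(&b->count, 1) == 1) {
		pthread_mutex_lock(&b->mtx);
		pthread_cond_broadcast(&b->waitq);
		pthread_mutex_unlock(&b->mtx);
	}
}

/* A write starts: charge it to the current bucket. */
static struct bucket *write_begin(void)
{
	struct bucket *b;

	pthread_mutex_lock(&lock);
	b = curr_bucket;
	atomic_fetch_add(&b->count, 1);
	pthread_mutex_unlock(&lock);
	return b;
}

/* syncfs-like path: swap in a new bucket, drain the old one. */
static void sync_writes(void)
{
	struct bucket *b, *nb = bucket_alloc();

	pthread_mutex_lock(&lock);
	b = curr_bucket;
	if (atomic_load(&b->count) == 1) {	/* nothing outstanding */
		pthread_mutex_unlock(&lock);
		free(nb);
		return;
	}
	/* Child bucket must not complete before the parent does. */
	atomic_fetch_add(&nb->count, 1);
	curr_bucket = nb;
	pthread_mutex_unlock(&lock);

	bucket_dec(b);			/* drop initial active count */
	pthread_mutex_lock(&b->mtx);
	while (atomic_load(&b->count) != 0)
		pthread_cond_wait(&b->waitq, &b->mtx);
	pthread_mutex_unlock(&b->mtx);
	bucket_dec(nb);			/* drop temp count on child */
	/*
	 * Freeing here is only safe because this demo is single-threaded;
	 * the kernel defers the free with kfree_rcu() so that concurrent
	 * fuse_sync_bucket_dec() callers cannot hit a use-after-free.
	 */
	free(b);
}

int main(void)
{
	curr_bucket = bucket_alloc();
	struct bucket *b = write_begin();	/* one write in flight */

	bucket_dec(b);			/* ...which completes */
	sync_writes();			/* nothing left: returns at once */
	puts("drained");
	free(curr_bucket);
	return 0;
}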
diff --git a/queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch b/queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch
new file mode 100644 (file)
index 0000000..d8acf2e
--- /dev/null
@@ -0,0 +1,45 @@
+From 8510505d55e194d3f6c9644c9f9d12c4f6b0395a Mon Sep 17 00:00:00 2001
+From: THOBY Simon <Simon.THOBY@viveris.fr>
+Date: Mon, 16 Aug 2021 08:10:59 +0000
+Subject: IMA: remove the dependency on CRYPTO_MD5
+
+From: THOBY Simon <Simon.THOBY@viveris.fr>
+
+commit 8510505d55e194d3f6c9644c9f9d12c4f6b0395a upstream.
+
+MD5 is a weak digest algorithm that shouldn't be used for cryptographic
+operations. It hinders the efficiency of a patch set that aims to limit
+the digests allowed for the extended file attribute, namely security.ima.
+MD5 is no longer a requirement for IMA, nor should it be used there.
+
+The sole place where we still use the MD5 algorithm inside IMA is setting
+the ima_hash algorithm to MD5, if the user supplies 'ima_hash=md5'
+parameter on the command line.  With commit ab60368ab6a4 ("ima: Fallback
+to the builtin hash algorithm"), setting "ima_hash=md5" fails gracefully
+when CRYPTO_MD5 is not set:
+       ima: Can not allocate md5 (reason: -2)
+       ima: Allocating md5 failed, going to use default hash algorithm sha256
+
+Remove the CRYPTO_MD5 dependency for IMA.
+
+Signed-off-by: THOBY Simon <Simon.THOBY@viveris.fr>
+Reviewed-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
+[zohar@linux.ibm.com: include commit number in patch description for
+stable.]
+Cc: stable@vger.kernel.org # 4.17
+Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/integrity/ima/Kconfig |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/security/integrity/ima/Kconfig
++++ b/security/integrity/ima/Kconfig
+@@ -6,7 +6,6 @@ config IMA
+       select SECURITYFS
+       select CRYPTO
+       select CRYPTO_HMAC
+-      select CRYPTO_MD5
+       select CRYPTO_SHA1
+       select CRYPTO_HASH_INFO
+       select TCG_TPM if HAS_IOMEM && !UML
diff --git a/queue-5.14/ima-remove-wmissing-prototypes-warning.patch b/queue-5.14/ima-remove-wmissing-prototypes-warning.patch
new file mode 100644 (file)
index 0000000..4464b56
--- /dev/null
@@ -0,0 +1,40 @@
+From a32ad90426a9c8eb3915eed26e08ce133bd9e0da Mon Sep 17 00:00:00 2001
+From: Austin Kim <austin.kim@lge.com>
+Date: Tue, 29 Jun 2021 14:50:50 +0100
+Subject: IMA: remove -Wmissing-prototypes warning
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Austin Kim <austin.kim@lge.com>
+
+commit a32ad90426a9c8eb3915eed26e08ce133bd9e0da upstream.
+
+With W=1 build, the compiler throws warning message as below:
+
+   security/integrity/ima/ima_mok.c:24:12: warning:
+   no previous prototype for ‘ima_mok_init’ [-Wmissing-prototypes]
+       __init int ima_mok_init(void)
+
+Silence the warning by adding static keyword to ima_mok_init().
+
+Signed-off-by: Austin Kim <austin.kim@lge.com>
+Fixes: 41c89b64d718 ("IMA: create machine owner and blacklist keyrings")
+Cc: stable@vger.kernel.org
+Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/integrity/ima/ima_mok.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/security/integrity/ima/ima_mok.c
++++ b/security/integrity/ima/ima_mok.c
+@@ -21,7 +21,7 @@ struct key *ima_blacklist_keyring;
+ /*
+  * Allocate the IMA blacklist keyring
+  */
+-__init int ima_mok_init(void)
++static __init int ima_mok_init(void)
+ {
+       struct key_restriction *restriction;
diff --git a/queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch b/queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch
new file mode 100644 (file)
index 0000000..d02440c
--- /dev/null
@@ -0,0 +1,104 @@
+From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sun, 29 Aug 2021 16:13:03 -0600
+Subject: io-wq: check max_worker limits if a worker transitions bound state
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit ecc53c48c13d995e6fe5559e30ffee48d92784fd upstream.
+
+For the two places where new workers are created, we diligently check if
+we are allowed to create a new worker. If we're currently at the limit
+of how many workers of a given type we can have, then we don't create
+any new ones.
+
+If you have a mixed workload with various types of bound and unbounded
+work, then it can happen that a worker finishes one type of work and
+is then transitioned to the other type. For this case, we don't check
+if we are actually allowed to do so. This can cause io-wq to temporarily
+exceed the allowed number of workers for a given type.
+
+When retrieving work, check that the types match. If they don't, check
+if we are allowed to transition to the other type. If not, then don't
+handle the new work.
+
+Cc: stable@vger.kernel.org
+Reported-by: Johannes Lundberg <johalun0@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io-wq.c |   33 ++++++++++++++++++++++++++++++---
+ 1 file changed, 30 insertions(+), 3 deletions(-)
+
+--- a/fs/io-wq.c
++++ b/fs/io-wq.c
+@@ -423,7 +423,28 @@ static void io_wait_on_hash(struct io_wq
+       spin_unlock(&wq->hash->wait.lock);
+ }
+-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
++/*
++ * We can always run the work if the worker is currently the same type as
++ * the work (eg both are bound, or both are unbound). If they are not the
++ * same, only allow it if incrementing the worker count would be allowed.
++ */
++static bool io_worker_can_run_work(struct io_worker *worker,
++                                 struct io_wq_work *work)
++{
++      struct io_wqe_acct *acct;
++
++      if (!(worker->flags & IO_WORKER_F_BOUND) !=
++          !(work->flags & IO_WQ_WORK_UNBOUND))
++              return true;
++
++      /* not the same type, check if we'd go over the limit */
++      acct = io_work_get_acct(worker->wqe, work);
++      return acct->nr_workers < acct->max_workers;
++}
++
++static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
++                                         struct io_worker *worker,
++                                         bool *stalled)
+       __must_hold(wqe->lock)
+ {
+       struct io_wq_work_node *node, *prev;
+@@ -435,6 +456,9 @@ static struct io_wq_work *io_get_next_wo
+               work = container_of(node, struct io_wq_work, list);
++              if (!io_worker_can_run_work(worker, work))
++                      break;
++
+               /* not hashed, can run anytime */
+               if (!io_wq_is_hashed(work)) {
+                       wq_list_del(&wqe->work_list, node, prev);
+@@ -461,6 +485,7 @@ static struct io_wq_work *io_get_next_wo
+               raw_spin_unlock(&wqe->lock);
+               io_wait_on_hash(wqe, stall_hash);
+               raw_spin_lock(&wqe->lock);
++              *stalled = true;
+       }
+       return NULL;
+@@ -500,6 +525,7 @@ static void io_worker_handle_work(struct
+       do {
+               struct io_wq_work *work;
++              bool stalled;
+ get_next:
+               /*
+                * If we got some work, mark us as busy. If we didn't, but
+@@ -508,10 +534,11 @@ get_next:
+                * can't make progress, any work completion or insertion will
+                * clear the stalled flag.
+                */
+-              work = io_get_next_work(wqe);
++              stalled = false;
++              work = io_get_next_work(wqe, worker, &stalled);
+               if (work)
+                       __io_worker_busy(wqe, worker, work);
+-              else if (!wq_list_empty(&wqe->work_list))
++              else if (stalled)
+                       wqe->flags |= IO_WQE_FLAG_STALLED;
+               raw_spin_unlock_irq(&wqe->lock);
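
The transition rule above reduces to a small accounting check. Below is a
standalone model with made-up worker counts; the structure and function
names are illustrative stand-ins, not the fs/io-wq.c definitions:

#include <stdbool.h>
#include <stdio.h>

struct acct {
	int nr_workers;
	int max_workers;
};

static struct acct bound_acct   = { .nr_workers = 4, .max_workers = 4 };
static struct acct unbound_acct = { .nr_workers = 2, .max_workers = 8 };

static bool worker_can_run_work(bool worker_bound, bool work_bound)
{
	struct acct *acct;

	/* Same type: always allowed. */
	if (worker_bound == work_bound)
		return true;

	/* Transition: only if the target type still has headroom. */
	acct = work_bound ? &bound_acct : &unbound_acct;
	return acct->nr_workers < acct->max_workers;
}

int main(void)
{
	/* Bound worker picking up unbound work: 2 < 8, allowed. */
	printf("bound -> unbound: %s\n",
	       worker_can_run_work(true, false) ? "run" : "stall");
	/* Unbound worker picking up bound work: 4 < 4 fails. */
	printf("unbound -> bound: %s\n",
	       worker_can_run_work(false, true) ? "run" : "stall");
	return 0;
}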
diff --git a/queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch b/queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch
new file mode 100644 (file)
index 0000000..ebe2642
--- /dev/null
@@ -0,0 +1,54 @@
+From 47e6223c841e029bfc23c3ce594dac5525cebaf8 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Mon, 2 Aug 2021 13:38:30 +0100
+Subject: KVM: arm64: Unregister HYP sections from kmemleak in protected mode
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 47e6223c841e029bfc23c3ce594dac5525cebaf8 upstream.
+
+Booting a KVM host in protected mode with kmemleak quickly results
+in a pretty bad crash, as kmemleak doesn't know that the HYP sections
+have been taken away. This is especially true for the BSS section,
+which is part of the kernel BSS section and registered at boot time
+by kmemleak itself.
+
+Unregister the HYP part of the BSS before making that section
+HYP-private. The rest of the HYP-specific data is obtained via
+the page allocator or lives in other sections, none of which is
+subjected to kmemleak.
+
+Fixes: 90134ac9cabb ("KVM: arm64: Protect the .hyp sections from the host")
+Reviewed-by: Quentin Perret <qperret@google.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org # 5.13
+Link: https://lore.kernel.org/r/20210802123830.2195174-3-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/arm.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/arch/arm64/kvm/arm.c
++++ b/arch/arm64/kvm/arm.c
+@@ -15,6 +15,7 @@
+ #include <linux/fs.h>
+ #include <linux/mman.h>
+ #include <linux/sched.h>
++#include <linux/kmemleak.h>
+ #include <linux/kvm.h>
+ #include <linux/kvm_irqfd.h>
+ #include <linux/irqbypass.h>
+@@ -1986,6 +1987,12 @@ static int finalize_hyp_mode(void)
+       if (ret)
+               return ret;
++      /*
++       * Exclude HYP BSS from kmemleak so that it doesn't get peeked
++       * at, which would end badly once the section is inaccessible.
++       * None of other sections should ever be introspected.
++       */
++      kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
+       ret = pkvm_mark_hyp_section(__hyp_bss);
+       if (ret)
+               return ret;
diff --git a/queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch b/queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch
new file mode 100644 (file)
index 0000000..9ba4888
--- /dev/null
@@ -0,0 +1,216 @@
+From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Thu, 19 Aug 2021 19:03:05 +0100
+Subject: KVM: arm64: vgic: Resample HW pending state on deactivation
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 3134cc8beb69d0db9de651081707c4651c011621 upstream.
+
+When a mapped level interrupt (a timer, for example) is deactivated
+by the guest, the corresponding host interrupt is equally deactivated.
+However, the fate of the pending state still needs to be dealt
+with in SW.
+
+This is especially true when the interrupt was in the active+pending
+state in the virtual distributor at the point where the guest
+was entered. On exit, the pending state is potentially stale
+(the guest may have put the interrupt in a non-pending state).
+
+If we don't do anything, the interrupt will be spuriously injected
+in the guest. Although this shouldn't have any ill effect (spurious
+interrupts are always possible), we can improve the emulation by
+detecting the deactivation-while-pending case and resampling the
+interrupt.
+
+While we're at it, move the logic into a common helper that can
+be shared between the two GIC implementations.
+
+Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
+Reported-by: Raghavendra Rao Ananta <rananta@google.com>
+Tested-by: Raghavendra Rao Ananta <rananta@google.com>
+Reviewed-by: Oliver Upton <oupton@google.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/vgic/vgic-v2.c |   36 +++++-------------------------------
+ arch/arm64/kvm/vgic/vgic-v3.c |   36 +++++-------------------------------
+ arch/arm64/kvm/vgic/vgic.c    |   38 ++++++++++++++++++++++++++++++++++++++
+ arch/arm64/kvm/vgic/vgic.h    |    2 ++
+ 4 files changed, 50 insertions(+), 62 deletions(-)
+
+--- a/arch/arm64/kvm/vgic/vgic-v2.c
++++ b/arch/arm64/kvm/vgic/vgic-v2.c
+@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vc
+               u32 val = cpuif->vgic_lr[lr];
+               u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+               struct vgic_irq *irq;
++              bool deactivated;
+               /* Extract the source vCPU id from the LR */
+               cpuid = val & GICH_LR_PHYSID_CPUID;
+@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vc
+               raw_spin_lock(&irq->irq_lock);
+-              /* Always preserve the active bit */
++              /* Always preserve the active bit, note deactivation */
++              deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
+               irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+               if (irq->active && vgic_irq_is_sgi(intid))
+@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vc
+               if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
+                       irq->pending_latch = false;
+-              /*
+-               * Level-triggered mapped IRQs are special because we only
+-               * observe rising edges as input to the VGIC.
+-               *
+-               * If the guest never acked the interrupt we have to sample
+-               * the physical line and set the line level, because the
+-               * device state could have changed or we simply need to
+-               * process the still pending interrupt later.
+-               *
+-               * If this causes us to lower the level, we have to also clear
+-               * the physical active state, since we will otherwise never be
+-               * told when the interrupt becomes asserted again.
+-               *
+-               * Another case is when the interrupt requires a helping hand
+-               * on deactivation (no HW deactivation, for example).
+-               */
+-              if (vgic_irq_is_mapped_level(irq)) {
+-                      bool resample = false;
+-
+-                      if (val & GICH_LR_PENDING_BIT) {
+-                              irq->line_level = vgic_get_phys_line_level(irq);
+-                              resample = !irq->line_level;
+-                      } else if (vgic_irq_needs_resampling(irq) &&
+-                                 !(irq->active || irq->pending_latch)) {
+-                              resample = true;
+-                      }
+-
+-                      if (resample)
+-                              vgic_irq_set_phys_active(irq, false);
+-              }
++              /* Handle resampling for mapped interrupts if required */
++              vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
+               raw_spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
+--- a/arch/arm64/kvm/vgic/vgic-v3.c
++++ b/arch/arm64/kvm/vgic/vgic-v3.c
+@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vc
+               u32 intid, cpuid;
+               struct vgic_irq *irq;
+               bool is_v2_sgi = false;
++              bool deactivated;
+               cpuid = val & GICH_LR_PHYSID_CPUID;
+               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vc
+               raw_spin_lock(&irq->irq_lock);
+-              /* Always preserve the active bit */
++              /* Always preserve the active bit, note deactivation */
++              deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
+               irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+               if (irq->active && is_v2_sgi)
+@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vc
+               if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+                       irq->pending_latch = false;
+-              /*
+-               * Level-triggered mapped IRQs are special because we only
+-               * observe rising edges as input to the VGIC.
+-               *
+-               * If the guest never acked the interrupt we have to sample
+-               * the physical line and set the line level, because the
+-               * device state could have changed or we simply need to
+-               * process the still pending interrupt later.
+-               *
+-               * If this causes us to lower the level, we have to also clear
+-               * the physical active state, since we will otherwise never be
+-               * told when the interrupt becomes asserted again.
+-               *
+-               * Another case is when the interrupt requires a helping hand
+-               * on deactivation (no HW deactivation, for example).
+-               */
+-              if (vgic_irq_is_mapped_level(irq)) {
+-                      bool resample = false;
+-
+-                      if (val & ICH_LR_PENDING_BIT) {
+-                              irq->line_level = vgic_get_phys_line_level(irq);
+-                              resample = !irq->line_level;
+-                      } else if (vgic_irq_needs_resampling(irq) &&
+-                                 !(irq->active || irq->pending_latch)) {
+-                              resample = true;
+-                      }
+-
+-                      if (resample)
+-                              vgic_irq_set_phys_active(irq, false);
+-              }
++              /* Handle resampling for mapped interrupts if required */
++              vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+               raw_spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
+--- a/arch/arm64/kvm/vgic/vgic.c
++++ b/arch/arm64/kvm/vgic/vgic.c
+@@ -1022,3 +1022,41 @@ bool kvm_vgic_map_is_active(struct kvm_v
+       return map_is_active;
+ }
++
++/*
++ * Level-triggered mapped IRQs are special because we only observe rising
++ * edges as input to the VGIC.
++ *
++ * If the guest never acked the interrupt we have to sample the physical
++ * line and set the line level, because the device state could have changed
++ * or we simply need to process the still pending interrupt later.
++ *
++ * We could also have entered the guest with the interrupt active+pending.
++ * On the next exit, we need to re-evaluate the pending state, as it could
++ * otherwise result in a spurious interrupt by injecting a now potentially
++ * stale pending state.
++ *
++ * If this causes us to lower the level, we have to also clear the physical
++ * active state, since we will otherwise never be told when the interrupt
++ * becomes asserted again.
++ *
++ * Another case is when the interrupt requires a helping hand on
++ * deactivation (no HW deactivation, for example).
++ */
++void vgic_irq_handle_resampling(struct vgic_irq *irq,
++                              bool lr_deactivated, bool lr_pending)
++{
++      if (vgic_irq_is_mapped_level(irq)) {
++              bool resample = false;
++
++              if (unlikely(vgic_irq_needs_resampling(irq))) {
++                      resample = !(irq->active || irq->pending_latch);
++              } else if (lr_pending || (lr_deactivated && irq->line_level)) {
++                      irq->line_level = vgic_get_phys_line_level(irq);
++                      resample = !irq->line_level;
++              }
++
++              if (resample)
++                      vgic_irq_set_phys_active(irq, false);
++      }
++}
+--- a/arch/arm64/kvm/vgic/vgic.h
++++ b/arch/arm64/kvm/vgic/vgic.h
+@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgi
+ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+                          unsigned long flags);
+ void vgic_kick_vcpus(struct kvm *kvm);
++void vgic_irq_handle_resampling(struct vgic_irq *irq,
++                              bool lr_deactivated, bool lr_pending);
+ int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment);
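
The consolidated helper boils down to a decision over a handful of
booleans. The standalone model below mirrors only the mapped-level branch
of the new vgic_irq_handle_resampling(); the struct fields and the sampled
line value are stand-ins for the vgic internals, not the kernel types:

#include <stdbool.h>
#include <stdio.h>

struct irq_model {
	bool active, pending_latch, line_level;	/* SW view of the IRQ */
	bool needs_resampling;			/* no HW deactivation */
	bool phys_line;				/* level read from HW */
};

/* Returns true when the physical active state must be cleared. */
static bool handle_resampling(struct irq_model *irq,
			      bool lr_deactivated, bool lr_pending)
{
	bool resample = false;

	if (irq->needs_resampling) {
		resample = !(irq->active || irq->pending_latch);
	} else if (lr_pending || (lr_deactivated && irq->line_level)) {
		irq->line_level = irq->phys_line;	/* resample the line */
		resample = !irq->line_level;
	}
	return resample;
}

int main(void)
{
	/*
	 * Deactivated while SW still saw the line high, HW line now low:
	 * the stale pending state is dropped and phys active is cleared.
	 */
	struct irq_model irq = { .line_level = true, .phys_line = false };

	printf("clear phys active: %s\n",
	       handle_resampling(&irq, true, false) ? "yes" : "no");
	return 0;
}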
diff --git a/queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch b/queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch
new file mode 100644 (file)
index 0000000..1dbdd5f
--- /dev/null
@@ -0,0 +1,60 @@
+From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Tue, 10 Aug 2021 07:45:26 -0700
+Subject: KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit f7782bb8d818d8f47c26b22079db10599922787a upstream.
+
+Clear nested.pi_pending on nested VM-Enter even if L2 will run without
+posted interrupts enabled.  If nested.pi_pending is left set from a
+previous L2, vmx_complete_nested_posted_interrupt() will pick up the
+stale flag and exit to userspace with an "internal emulation error" due
+to the new L2 not having a valid nested.pi_desc.
+
+Arguably, vmx_complete_nested_posted_interrupt() should first check for
+posted interrupts being enabled, but it's also completely reasonable that
+KVM wouldn't screw up a fundamental flag.  Not to mention that the mere
+existence of nested.pi_pending is a long-standing bug as KVM shouldn't
+move the posted interrupt out of the IRR until it's actually processed,
+e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit
+with a "pending" posted interrupt.  Fixing the mess is a future problem.
+
+Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI
+descriptor as an error, this was a benign bug as the null PI descriptor
+effectively served as a check on PI not being enabled.  Even then, the
+new flow did not become problematic until KVM started checking the result
+of kvm_check_nested_events().
+
+Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
+Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable")
+Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails")
+Cc: stable@vger.kernel.org
+Cc: Jim Mattson <jmattson@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210810144526.2662272-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/nested.c |    7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2223,12 +2223,11 @@ static void prepare_vmcs02_early(struct
+                        ~PIN_BASED_VMX_PREEMPTION_TIMER);
+       /* Posted interrupts setting is only taken from vmcs12.  */
+-      if (nested_cpu_has_posted_intr(vmcs12)) {
++      vmx->nested.pi_pending = false;
++      if (nested_cpu_has_posted_intr(vmcs12))
+               vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+-              vmx->nested.pi_pending = false;
+-      } else {
++      else
+               exec_control &= ~PIN_BASED_POSTED_INTR;
+-      }
+       pin_controls_set(vmx, exec_control);
+       /*
diff --git a/queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch b/queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch
new file mode 100644 (file)
index 0000000..5ff7222
--- /dev/null
@@ -0,0 +1,122 @@
+From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001
+From: Halil Pasic <pasic@linux.ibm.com>
+Date: Fri, 27 Aug 2021 14:54:29 +0200
+Subject: KVM: s390: index kvm->arch.idle_mask by vcpu_idx
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Halil Pasic <pasic@linux.ibm.com>
+
+commit a3e03bc1368c1bc16e19b001fc96dc7430573cc8 upstream.
+
+While in practice vcpu->vcpu_idx == vcpu->vcpu_id is often true, it may
+not always be, and we must not rely on this. Reason is that KVM decides
+the vcpu_idx, userspace decides the vcpu_id, thus the two might not
+match.
+
+Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies
+that code like
+for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
+        vcpu = kvm_get_vcpu(kvm, vcpu_id);
+        do_stuff(vcpu);
+}
+is not legit. Reason is that kvm_get_vcpu expects a vcpu_idx, not a
+vcpu_id.  The trouble is, we do actually use kvm->arch.idle_mask like
+this. To fix this problem we have two options. Either use
+kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id,
+or switch to indexing via vcpu_idx. The latter is preferable for obvious
+reasons.
+
+Let us switch from indexing kvm->arch.idle_mask by vcpu_id to
+indexing it by vcpu_idx.  To keep gisa_int.kicked_mask indexed by the
+same index as idle_mask, let us make the same change for it as well.
+
+Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array")
+Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
+Reviewed-by: Christian Bornträger <borntraeger@de.ibm.com>
+Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
+Cc: <stable@vger.kernel.org> # 3.15+
+Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/include/asm/kvm_host.h |    1 +
+ arch/s390/kvm/interrupt.c        |   12 ++++++------
+ arch/s390/kvm/kvm-s390.c         |    2 +-
+ arch/s390/kvm/kvm-s390.h         |    2 +-
+ 4 files changed, 9 insertions(+), 8 deletions(-)
+
+--- a/arch/s390/include/asm/kvm_host.h
++++ b/arch/s390/include/asm/kvm_host.h
+@@ -957,6 +957,7 @@ struct kvm_arch{
+       atomic64_t cmma_dirty_pages;
+       /* subset of available cpu features enabled by user space */
+       DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
++      /* indexed by vcpu_idx */
+       DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
+       struct kvm_s390_gisa_interrupt gisa_int;
+       struct kvm_s390_pv pv;
+--- a/arch/s390/kvm/interrupt.c
++++ b/arch/s390/kvm/interrupt.c
+@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(st
+ static void __set_cpu_idle(struct kvm_vcpu *vcpu)
+ {
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
+-      set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
++      set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+ }
+ static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
+ {
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
+-      clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
++      clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+ }
+ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
+@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vc
+ static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
+ {
+-      int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
++      int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
+       struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+       struct kvm_vcpu *vcpu;
+-      for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
+-              vcpu = kvm_get_vcpu(kvm, vcpu_id);
++      for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
++              vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+               if (psw_ioint_disabled(vcpu))
+                       continue;
+               deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
+               if (deliverable_mask) {
+                       /* lately kicked but not yet running */
+-                      if (test_and_set_bit(vcpu_id, gi->kicked_mask))
++                      if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
+                               return;
+                       kvm_s390_vcpu_wakeup(vcpu);
+                       return;
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -4044,7 +4044,7 @@ static int vcpu_pre_run(struct kvm_vcpu
+               kvm_s390_patch_guest_per_regs(vcpu);
+       }
+-      clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
++      clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
+       vcpu->arch.sie_block->icptcode = 0;
+       cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
+--- a/arch/s390/kvm/kvm-s390.h
++++ b/arch/s390/kvm/kvm-s390.h
+@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct
+ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
+ {
+-      return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
++      return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+ }
+ static inline int kvm_is_ucontrol(struct kvm *kvm)
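
The idx/id mismatch in the commit message is easy to demonstrate with a
plain bitmap. In this userspace illustration the id values are invented;
the point is only that a bit set by vcpu_id and consumed as a vcpu_idx
names the wrong (or a nonexistent) vcpu:

#include <stdio.h>

struct vcpu {
	int idx;	/* chosen by KVM: dense, creation order */
	int id;		/* chosen by userspace: may be sparse */
};

int main(void)
{
	struct vcpu vcpus[] = { { 0, 7 }, { 1, 3 }, { 2, 11 } };
	unsigned long idle_mask = 0;
	int bit;

	/* Buggy pattern: mark vcpus[1] idle using its id... */
	idle_mask |= 1UL << vcpus[1].id;	/* sets bit 3 */

	/* ...then consume the mask as if the bits were indices. */
	for (bit = 0; bit < 64; bit++)
		if (idle_mask & (1UL << bit))
			/* bit 3 names no vcpu: valid idx values are 0..2 */
			printf("would kick vcpu at idx %d\n", bit);

	/* Fixed pattern: index the mask by idx on both sides. */
	idle_mask = 1UL << vcpus[1].idx;	/* sets bit 1 */
	printf("correct: idx %d (id %d) is idle\n",
	       vcpus[1].idx, vcpus[1].id);
	return 0;
}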
diff --git a/queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch b/queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch
new file mode 100644 (file)
index 0000000..ce946b3
--- /dev/null
@@ -0,0 +1,34 @@
+From 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 26 Aug 2021 12:57:49 +0300
+Subject: KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 upstream.
+
+If we are emulating an invalid guest state, we don't have a correct
+exit reason, and thus we shouldn't do anything in this function.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20210826095750.1650467-2-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Fixes: 95b5a48c4f2b ("KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn", 2019-06-18)
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6368,6 +6368,9 @@ static void vmx_handle_exit_irqoff(struc
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
++      if (vmx->emulation_required)
++              return;
++
+       if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+               handle_external_interrupt_irqoff(vcpu);
+      else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
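
The guard added above follows a general rule: do not dispatch on cached exit state that the
current path never filled in.  A rough standalone sketch of the same pattern, with invented
names rather than the real vmx structures:

    /* Sketch of the guard (invented names, not the kernel's structures):
     * never dispatch on cached state the current path did not fill in. */
    struct vcpu_state {
        int emulating;          /* this exit was synthesized by the emulator   */
        unsigned exit_reason;   /* only valid when !emulating, stale otherwise */
    };

    void handle_exit_irqoff(struct vcpu_state *v)
    {
        if (v->emulating)
            return;     /* stale exit_reason: dispatching on it could run
                         * an interrupt/NMI handler for an exit that never
                         * happened */
        switch (v->exit_reason) {
        /* ... one handler per hardware exit reason ... */
        default:
            break;
        }
    }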
diff --git a/queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch b/queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch
new file mode 100644 (file)
index 0000000..7241413
--- /dev/null
@@ -0,0 +1,74 @@
+From ec607a564f70519b340f7eb4cfc0f4a6b55285ac Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 6 Aug 2021 07:05:58 -0400
+Subject: KVM: x86: clamp host mapping level to max_level in kvm_mmu_max_mapping_level
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit ec607a564f70519b340f7eb4cfc0f4a6b55285ac upstream.
+
+This change started as a way to make kvm_mmu_hugepage_adjust a bit simpler,
+but it does fix two bugs as well.
+
+One bug is in zapping collapsible PTEs.  If one large page size is
+disallowed but not all of them, kvm_mmu_max_mapping_level will return the
+host mapping level and the small PTEs will be zapped up to that level.
+However, if e.g. 1GB pages are prohibited, we can still zap 4KB mappings
+and preserve the 2MB ones. This can happen for example when NX huge pages
+are in use.
+
+The second bug can happen when userspace backs guest memory
+with a 1GB hugepage but only assigns a subset of the page to
+the guest.  1GB pages would be disallowed by the memslot, but
+not 2MB ones.  kvm_mmu_max_mapping_level() would fall through to the
+host_pfn_mapping_level() logic, see the 1GB hugepage, and map the whole
+thing into the guest.
+
+Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c |   13 +++++--------
+ 1 file changed, 5 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -2846,6 +2846,7 @@ int kvm_mmu_max_mapping_level(struct kvm
+                             kvm_pfn_t pfn, int max_level)
+ {
+       struct kvm_lpage_info *linfo;
++      int host_level;
+       max_level = min(max_level, max_huge_page_level);
+       for ( ; max_level > PG_LEVEL_4K; max_level--) {
+@@ -2857,7 +2858,8 @@ int kvm_mmu_max_mapping_level(struct kvm
+       if (max_level == PG_LEVEL_4K)
+               return PG_LEVEL_4K;
+-      return host_pfn_mapping_level(kvm, gfn, pfn, slot);
++      host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
++      return min(host_level, max_level);
+ }
+ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+@@ -2881,17 +2883,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_v
+       if (!slot)
+               return PG_LEVEL_4K;
+-      level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
+-      if (level == PG_LEVEL_4K)
+-              return level;
+-
+-      *req_level = level = min(level, max_level);
+-
+       /*
+        * Enforce the iTLB multihit workaround after capturing the requested
+        * level, which will be used to do precise, accurate accounting.
+        */
+-      if (huge_page_disallowed)
++      *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
++      if (level == PG_LEVEL_4K || huge_page_disallowed)
+               return PG_LEVEL_4K;
+       /*
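
The essence of the fix is the final min(host_level, max_level): the loop only walks max_level
down past levels that kvm_lpage_info explicitly disallows, so the level derived from the host
page size still has to be clamped against whatever survived.  A simplified sketch of the
resulting logic, with stand-in helpers in place of the memslot and host page-table walks:

    /* Simplified sketch of the clamped lookup; not the kernel function. */
    enum { PG_LEVEL_4K = 1, PG_LEVEL_2M = 2, PG_LEVEL_1G = 3 };

    int max_mapping_level(int max_level, int host_level,
                          int (*level_disallowed)(int level))
    {
        /* Walk down past levels the memslot or NX workaround forbids. */
        for (; max_level > PG_LEVEL_4K; max_level--)
            if (!level_disallowed(max_level))
                break;
        if (max_level == PG_LEVEL_4K)
            return PG_LEVEL_4K;

        /* The bug being fixed: returning host_level directly can exceed
         * max_level, e.g. the host backs the gfn with 1G while the
         * memslot only permits 2M.  Clamp to the smaller of the two. */
        return host_level < max_level ? host_level : max_level;
    }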
diff --git a/queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch b/queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch
new file mode 100644 (file)
index 0000000..1e3f5fa
--- /dev/null
@@ -0,0 +1,82 @@
+From 088acd23526647844aec1c39db4ad02552c86c7b Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Mon, 2 Aug 2021 21:46:06 -0700
+Subject: KVM: x86/mmu: Avoid collision with !PRESENT SPTEs in TDP MMU lpage stats
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 088acd23526647844aec1c39db4ad02552c86c7b upstream.
+
+Factor in whether or not the old/new SPTEs are shadow-present when
+adjusting the large page stats in the TDP MMU.  A modified MMIO SPTE can
+toggle the page size bit, as bit 7 is used to store the MMIO generation,
+i.e. is_large_pte() can get a false positive when called on an MMIO SPTE.
+Ditto for nuking SPTEs with REMOVED_SPTE, which sets bit 7 in its magic
+value.
+
+Opportunistically move the logic below the check to verify at least one
+of the old/new SPTEs is shadow present.
+
+Use is/was_leaf even though is/was_present would suffice.  The code
+generation is roughly equivalent since all flags need to be computed
+prior to the code in question, and using the *_leaf flags will minimize
+the diff in a future enhancement to account all pages, i.e. will change
+the check to "is_leaf != was_leaf".
+
+Reviewed-by: David Matlack <dmatlack@google.com>
+Reviewed-by: Ben Gardon <bgardon@google.com>
+
+Fixes: 1699f65c8b65 ("kvm/x86: Fix 'lpages' kvm stat for TDP MMU")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Mingwei Zhang <mizhang@google.com>
+Message-Id: <20210803044607.599629-3-mizhang@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/tdp_mmu.c |   20 +++++++++++++-------
+ 1 file changed, 13 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -412,6 +412,7 @@ static void __handle_changed_spte(struct
+       bool was_leaf = was_present && is_last_spte(old_spte, level);
+       bool is_leaf = is_present && is_last_spte(new_spte, level);
+       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
++      bool was_large, is_large;
+       WARN_ON(level > PT64_ROOT_MAX_LEVEL);
+       WARN_ON(level < PG_LEVEL_4K);
+@@ -445,13 +446,6 @@ static void __handle_changed_spte(struct
+       trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+-      if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
+-              if (is_large_pte(old_spte))
+-                      atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
+-              else
+-                      atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
+-      }
+-
+       /*
+        * The only times a SPTE should be changed from a non-present to
+        * non-present state is when an MMIO entry is installed/modified/
+@@ -477,6 +471,18 @@ static void __handle_changed_spte(struct
+               return;
+       }
++      /*
++       * Update large page stats if a large page is being zapped, created, or
++       * is replacing an existing shadow page.
++       */
++      was_large = was_leaf && is_large_pte(old_spte);
++      is_large = is_leaf && is_large_pte(new_spte);
++      if (was_large != is_large) {
++              if (was_large)
++                      atomic64_sub(1, (atomic64_t *)&kvm->stat.lpages);
++              else
++                      atomic64_add(1, (atomic64_t *)&kvm->stat.lpages);
++      }
+       if (was_leaf && is_dirty_spte(old_spte) &&
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
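
What makes the old is_large_pte(old) != is_large_pte(new) comparison unsafe is that bit 7 of
an SPTE is overloaded: it is the page-size bit only in a shadow-present leaf, while MMIO SPTEs
keep generation bits there and REMOVED_SPTE's magic value sets it too.  A condensed sketch of
the corrected accounting (illustrative mask, not the real SPTE encodings):

    #include <stdint.h>
    #include <stdbool.h>

    #define PS_BIT (1ULL << 7)   /* page-size bit, valid only in a present leaf */

    static bool is_large_pte(uint64_t s) { return s & PS_BIT; }

    /* was_leaf/is_leaf mean "shadow-present && last level", computed by
     * the caller as in __handle_changed_spte().  Returns the delta to
     * apply to the large-page counter. */
    int lpage_delta(uint64_t old_spte, bool was_leaf,
                    uint64_t new_spte, bool is_leaf)
    {
        /* Only trust bit 7 on a shadow-present leaf.  A !present MMIO
         * SPTE can have bit 7 set by its generation bits and must not
         * be counted as a large page. */
        bool was_large = was_leaf && is_large_pte(old_spte);
        bool is_large  = is_leaf  && is_large_pte(new_spte);

        if (was_large == is_large)
            return 0;
        return was_large ? -1 : +1;   /* applied to kvm->stat.lpages */
    }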
diff --git a/queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch b/queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch
new file mode 100644 (file)
index 0000000..431b7d9
--- /dev/null
@@ -0,0 +1,40 @@
+From d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 Mon Sep 17 00:00:00 2001
+From: Zelin Deng <zelin.deng@linux.alibaba.com>
+Date: Wed, 28 Apr 2021 10:22:01 +0800
+Subject: KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
+
+From: Zelin Deng <zelin.deng@linux.alibaba.com>
+
+commit d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 upstream.
+
+When MSR_IA32_TSC_ADJUST is written by the guest via the TSC_ADJUST
+feature, especially when there is a big TSC warp (e.g. a new vCPU is
+hot-added into a VM which has been up for a long time), a large value is
+added to tsc_offset before returning to the guest. This makes the guest's
+system time jump, because tsc_timestamp is not adjusted in the meantime
+and pvclock must stay monotonic.
+To fix this, notify KVM to update the vCPU's guest time before going back
+to the guest.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Zelin Deng <zelin.deng@linux.alibaba.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Message-Id: <1619576521-81399-2-git-send-email-zelin.deng@linux.alibaba.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3316,6 +3316,10 @@ int kvm_set_msr_common(struct kvm_vcpu *
+                       if (!msr_info->host_initiated) {
+                               s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+                               adjust_tsc_offset_guest(vcpu, adj);
+                              /* Before going back to the guest, tsc_timestamp must be
+                               * adjusted as well; otherwise the guest's per-CPU pvclock
+                               * time could jump. */
++                              kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+                       }
+                       vcpu->arch.ia32_tsc_adjust_msr = data;
+               }
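
The jump falls straight out of the pvclock read formula, guest_time = system_time +
scale(guest_tsc - tsc_timestamp): enlarging tsc_offset enlarges what the guest reads as its
TSC, so leaving tsc_timestamp stale inflates the delta.  A toy calculation with made-up
numbers and a scale factor of 1:

    #include <stdio.h>
    #include <stdint.h>

    /* pvclock with a scale factor of 1 for readability:
     * guest_time = system_time + (guest_tsc - tsc_timestamp) */
    static uint64_t pvclock_read(uint64_t system_time, uint64_t tsc_timestamp,
                                 uint64_t guest_tsc)
    {
        return system_time + (guest_tsc - tsc_timestamp);
    }

    int main(void)
    {
        uint64_t system_time = 1000, tsc_timestamp = 5000;
        uint64_t host_tsc = 6000, tsc_offset = 0;

        printf("%llu\n", (unsigned long long)
               pvclock_read(system_time, tsc_timestamp, host_tsc + tsc_offset)); /* 2000 */

        tsc_offset += 1000000;   /* guest wrote TSC_ADJUST: big warp */

        /* Without KVM_REQ_CLOCK_UPDATE, tsc_timestamp stays stale and
         * the reported time leaps forward by the whole warp: */
        printf("%llu\n", (unsigned long long)
               pvclock_read(system_time, tsc_timestamp, host_tsc + tsc_offset)); /* 1002000 */
        return 0;
    }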
diff --git a/queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch b/queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch
new file mode 100644 (file)
index 0000000..1d24ed8
--- /dev/null
@@ -0,0 +1,82 @@
+From 46d4703b1db4c86ab5acb2331b10df999f005e8e Mon Sep 17 00:00:00 2001
+From: Xiao Ni <xni@redhat.com>
+Date: Wed, 18 Aug 2021 13:57:48 +0800
+Subject: md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard
+
+From: Xiao Ni <xni@redhat.com>
+
+commit 46d4703b1db4c86ab5acb2331b10df999f005e8e upstream.
+
+We are seeing the following warning in raid10_handle_discard.
+[  695.110751] =============================
+[  695.131439] WARNING: suspicious RCU usage
+[  695.151389] 4.18.0-319.el8.x86_64+debug #1 Not tainted
+[  695.174413] -----------------------------
+[  695.192603] drivers/md/raid10.c:1776 suspicious
+rcu_dereference_check() usage!
+[  695.225107] other info that might help us debug this:
+[  695.260940] rcu_scheduler_active = 2, debug_locks = 1
+[  695.290157] no locks held by mkfs.xfs/10186.
+
+The first loop of raid10_handle_discard already determines which
+disks need to handle the discard request and takes a reference on
+each of their rdevs by incrementing rdev->nr_pending. Therefore
+conf->mirrors will not change until all bios come back from the
+underlying disks, and the second loop does not need rcu_dereference
+to get the rdevs.
+
+Cc: stable@vger.kernel.org
+Fixes: d30588b2731f ('md/raid10: improve raid10 discard request')
+Signed-off-by: Xiao Ni <xni@redhat.com>
+Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>
+Signed-off-by: Song Liu <songliubraving@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/raid10.c |   14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1712,6 +1712,11 @@ retry_discard:
+       } else
+               r10_bio->master_bio = (struct bio *)first_r10bio;
++      /*
++       * first select target devices under rcu_lock and
++       * inc refcount on their rdev.  Record them by setting
++       * bios[x] to bio
++       */
+       rcu_read_lock();
+       for (disk = 0; disk < geo->raid_disks; disk++) {
+               struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+@@ -1743,9 +1748,6 @@ retry_discard:
+       for (disk = 0; disk < geo->raid_disks; disk++) {
+               sector_t dev_start, dev_end;
+               struct bio *mbio, *rbio = NULL;
+-              struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+-              struct md_rdev *rrdev = rcu_dereference(
+-                      conf->mirrors[disk].replacement);
+               /*
+                * Now start to calculate the start and end address for each disk.
+@@ -1775,9 +1777,12 @@ retry_discard:
+               /*
+                * It only handles discard bio which size is >= stripe size, so
+-               * dev_end > dev_start all the time
++               * dev_end > dev_start all the time.
++               * It doesn't need to use rcu lock to get rdev here. We already
++               * add rdev->nr_pending in the first loop.
+                */
+               if (r10_bio->devs[disk].bio) {
++                      struct md_rdev *rdev = conf->mirrors[disk].rdev;
+                       mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+                       mbio->bi_end_io = raid10_end_discard_request;
+                       mbio->bi_private = r10_bio;
+@@ -1790,6 +1795,7 @@ retry_discard:
+                       bio_endio(mbio);
+               }
+               if (r10_bio->devs[disk].repl_bio) {
++                      struct md_rdev *rrdev = conf->mirrors[disk].replacement;
+                       rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+                       rbio->bi_end_io = raid10_end_discard_request;
+                       rbio->bi_private = r10_bio;
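
The underlying pattern is "pin under RCU once, then dereference plainly": once nr_pending is
elevated inside the first rcu_read_lock() section, the rdev cannot be detached from
conf->mirrors, so later loops may use plain loads.  A miniature of the two-phase pattern,
using liburcu as a stand-in for the kernel RCU API and toy types in place of the md
structures:

    /* Two-phase access in miniature; hypothetical toy types, not the md code. */
    #include <urcu.h>      /* liburcu; link with -lurcu */
    #include <stdio.h>

    struct rdev   { int nr_pending; };   /* >0 means "cannot be detached" */
    struct mirror { struct rdev *rdev; };

    static void handle_discard(struct mirror *mirrors, int ndisks)
    {
        /* Pass 1: select targets under RCU and pin them via nr_pending. */
        rcu_read_lock();
        for (int i = 0; i < ndisks; i++) {
            struct rdev *r = rcu_dereference(mirrors[i].rdev);
            if (r)
                __atomic_fetch_add(&r->nr_pending, 1, __ATOMIC_RELAXED);
        }
        rcu_read_unlock();

        /* Pass 2: every selected rdev is pinned, so mirrors[i].rdev cannot
         * change underneath us; a plain load needs no rcu_dereference(). */
        for (int i = 0; i < ndisks; i++) {
            struct rdev *r = mirrors[i].rdev;
            if (r)
                printf("issuing discard to pinned rdev %d\n", i);
        }
    }

    int main(void)
    {
        struct rdev r0 = { 0 }, r1 = { 0 };
        struct mirror mirrors[2] = { { &r0 }, { &r1 } };
        rcu_register_thread();
        handle_discard(mirrors, 2);
        rcu_unregister_thread();
        return 0;
    }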
diff --git a/queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch b/queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
new file mode 100644 (file)
index 0000000..afac78e
--- /dev/null
@@ -0,0 +1,113 @@
+From 3f2cbe3810a60111a33f5f6267bd5a237b826fc9 Mon Sep 17 00:00:00 2001
+From: Alexander Antonov <alexander.antonov@linux.intel.com>
+Date: Tue, 6 Jul 2021 12:07:23 +0300
+Subject: perf/x86/intel/uncore: Fix IIO cleanup mapping procedure for SNR/ICX
+
+From: Alexander Antonov <alexander.antonov@linux.intel.com>
+
+commit 3f2cbe3810a60111a33f5f6267bd5a237b826fc9 upstream.
+
+skx_iio_cleanup_mapping() is re-used for snr and icx, but in those
+cases it fails to use the appropriate XXX_iio_mapping_group and as
+such fails to free previously allocated resources, leading to memory
+leaks.
+
+Fixes: 10337e95e04c ("perf/x86/intel/uncore: Enable I/O stacks to IIO PMON mapping on ICX")
+Signed-off-by: Alexander Antonov <alexander.antonov@linux.intel.com>
+[peterz: Changelog]
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210706090723.41850-1-alexander.antonov@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/events/intel/uncore_snbep.c |   40 ++++++++++++++++++++++++-----------
+ 1 file changed, 28 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/events/intel/uncore_snbep.c
++++ b/arch/x86/events/intel/uncore_snbep.c
+@@ -3838,26 +3838,32 @@ clear_attr_update:
+       return ret;
+ }
+-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+-{
+-      return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+-}
+-
+-static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
++static void
++pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+ {
+-      struct attribute **attr = skx_iio_mapping_group.attrs;
++      struct attribute **attr = ag->attrs;
+       if (!attr)
+               return;
+       for (; *attr; attr++)
+               kfree((*attr)->name);
+-      kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
+-      kfree(skx_iio_mapping_group.attrs);
+-      skx_iio_mapping_group.attrs = NULL;
++      kfree(attr_to_ext_attr(*ag->attrs));
++      kfree(ag->attrs);
++      ag->attrs = NULL;
+       kfree(type->topology);
+ }
++static int skx_iio_set_mapping(struct intel_uncore_type *type)
++{
++      return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
++}
++
++static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++      pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
++}
++
+ static struct intel_uncore_type skx_uncore_iio = {
+       .name                   = "iio",
+       .num_counters           = 4,
+@@ -4501,6 +4507,11 @@ static int snr_iio_set_mapping(struct in
+       return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+ }
++static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++      pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
++}
++
+ static struct intel_uncore_type snr_uncore_iio = {
+       .name                   = "iio",
+       .num_counters           = 4,
+@@ -4517,7 +4528,7 @@ static struct intel_uncore_type snr_unco
+       .attr_update            = snr_iio_attr_update,
+       .get_topology           = snr_iio_get_topology,
+       .set_mapping            = snr_iio_set_mapping,
+-      .cleanup_mapping        = skx_iio_cleanup_mapping,
++      .cleanup_mapping        = snr_iio_cleanup_mapping,
+ };
+ static struct intel_uncore_type snr_uncore_irp = {
+@@ -5092,6 +5103,11 @@ static int icx_iio_set_mapping(struct in
+       return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+ }
++static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++      pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
++}
++
+ static struct intel_uncore_type icx_uncore_iio = {
+       .name                   = "iio",
+       .num_counters           = 4,
+@@ -5109,7 +5125,7 @@ static struct intel_uncore_type icx_unco
+       .attr_update            = icx_iio_attr_update,
+       .get_topology           = icx_iio_get_topology,
+       .set_mapping            = icx_iio_set_mapping,
+-      .cleanup_mapping        = skx_iio_cleanup_mapping,
++      .cleanup_mapping        = icx_iio_cleanup_mapping,
+ };
+ static struct intel_uncore_type icx_uncore_irp = {
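
The leak mechanism is simple: snr and icx reused skx_iio_cleanup_mapping(), which was
hard-wired to skx_iio_mapping_group, so on SNR/ICX the snr/icx attribute allocations were
never freed.  The patch parameterizes the helper over the group that owns the allocations;
the same shape in miniature (toy types, not the perf code):

    #include <stdlib.h>
    #include <string.h>

    struct group { char **attrs; };   /* NULL-terminated attribute list */

    /* The fix in miniature: one helper, parameterized by the group that
     * actually owns the allocations, reusable for skx, snr and icx. */
    static void cleanup_mapping(struct group *g)
    {
        if (!g->attrs)
            return;
        for (char **a = g->attrs; *a; a++)
            free(*a);
        free(g->attrs);
        g->attrs = NULL;
    }

    int main(void)
    {
        struct group snr_group = { 0 };
        snr_group.attrs = calloc(2, sizeof(char *));
        snr_group.attrs[0] = strdup("die0");

        /* Old bug in one line: snr's .cleanup_mapping pointed at a
         * function hard-coded to the skx group, so these allocations
         * leaked.  Passing the owning group frees them. */
        cleanup_mapping(&snr_group);
        return 0;
    }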
diff --git a/queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch b/queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch
new file mode 100644 (file)
index 0000000..0803203
--- /dev/null
@@ -0,0 +1,72 @@
+From e7177339d7b5f9594b316842122b5fda9513d5e2 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Tue, 31 Aug 2021 09:42:22 -0700
+Subject: Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit e7177339d7b5f9594b316842122b5fda9513d5e2 upstream.
+
+Revert a misguided illegal GPA check when "translating" a non-nested GPA.
+The check is woefully incomplete as it does not fill in @exception as
+expected by all callers, which leads to KVM attempting to inject a bogus
+exception, potentially exposing kernel stack information in the process.
+
+ WARNING: CPU: 0 PID: 8469 at arch/x86/kvm/x86.c:525 exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525
+ CPU: 1 PID: 8469 Comm: syz-executor531 Not tainted 5.14.0-rc7-syzkaller #0
+ RIP: 0010:exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525
+ Call Trace:
+  x86_emulate_instruction+0xef6/0x1460 arch/x86/kvm/x86.c:7853
+  kvm_mmu_page_fault+0x2f0/0x1810 arch/x86/kvm/mmu/mmu.c:5199
+  handle_ept_misconfig+0xdf/0x3e0 arch/x86/kvm/vmx/vmx.c:5336
+  __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6021 [inline]
+  vmx_handle_exit+0x336/0x1800 arch/x86/kvm/vmx/vmx.c:6038
+  vcpu_enter_guest+0x2a1c/0x4430 arch/x86/kvm/x86.c:9712
+  vcpu_run arch/x86/kvm/x86.c:9779 [inline]
+  kvm_arch_vcpu_ioctl_run+0x47d/0x1b20 arch/x86/kvm/x86.c:10010
+  kvm_vcpu_ioctl+0x49e/0xe50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3652
+
+The bug has escaped notice because practically speaking the GPA check is
+useless.  The GPA check in question only comes into play when KVM is
+walking guest page tables (or "translating" CR3), and KVM already handles
+illegal GPA checks by setting reserved bits in rsvd_bits_mask for each
+PxE, or in the case of CR3 for loading PTDPTRs, manually checks for an
+illegal CR3.  This particular failure doesn't hit the existing reserved
+bits checks because syzbot sets guest.MAXPHYADDR=1, and the IA32 architecture
+simply doesn't allow for such an absurd MAXPHYADDR, e.g. 32-bit paging
+doesn't define any reserved PA bits checks, which KVM emulates by only
+incorporating the reserved PA bits into the "high" bits, i.e. bits 63:32.
+
+Simply remove the bogus check.  There is zero meaningful value and no
+architectural justification for supporting guest.MAXPHYADDR < 32, and
+properly filling the exception would introduce non-trivial complexity.
+
+This reverts commit ec7771ab471ba6a945350353617e2e3385d0e013.
+
+Fixes: ec7771ab471b ("KVM: x86: mmu: Add guest physical address check in translate_gpa()")
+Cc: stable@vger.kernel.org
+Reported-by: syzbot+200c08e88ae818f849ce@syzkaller.appspotmail.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210831164224.1119728-2-seanjc@google.com>
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c |    6 ------
+ 1 file changed, 6 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -323,12 +323,6 @@ static bool check_mmio_spte(struct kvm_v
+ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+                                   struct x86_exception *exception)
+ {
+-      /* Check if guest physical address doesn't exceed guest maximum */
+-      if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
+-              exception->error_code |= PFERR_RSVD_MASK;
+-              return UNMAPPED_GVA;
+-      }
+-
+         return gpa;
+ }
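
The |= on error_code is the tell-tale: the removed check assumed the caller's x86_exception
was already initialized, but callers only expect it to be filled in completely when a fault
is reported, so ORing PFERR_RSVD_MASK into uninitialized stack memory and returning
UNMAPPED_GVA manufactured a bogus exception.  A contrived sketch of that hazard, with
invented names:

    #include <stdint.h>

    /* Contrived illustration (invented names, not kvm's x86_exception). */
    struct exception_info {
        uint16_t vector;        /* callers expect every field to be set */
        uint32_t error_code;    /* whenever a fault is reported         */
    };

    #define PFERR_RSVD   (1u << 3)
    #define UNMAPPED_GVA (~0ULL)

    uint64_t translate_buggy(uint64_t gpa, int gpa_is_illegal,
                             struct exception_info *ex)
    {
        if (gpa_is_illegal) {
            /* Bug shape: only error_code is touched, and with |= at that,
             * so vector keeps whatever garbage was on the caller's stack.
             * The caller then injects a bogus fault built from it. */
            ex->error_code |= PFERR_RSVD;
            return UNMAPPED_GVA;
        }
        return gpa;   /* non-nested case: translation is the identity */
    }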
diff --git a/queue-5.14/series b/queue-5.14/series
index 5508e0caff06cac96ec640787cdd8de18872081d..957b327ecf44a3695304aabd898221e882118b39 100644 (file)
--- a/queue-5.14/series
@@ -306,3 +306,22 @@ raid1-ensure-write-behind-bio-has-less-than-bio_max_vecs-sectors.patch
 cifs-do-not-leak-edeadlk-to-dgetents64-for-status_user_session_deleted.patch
 smb3-fix-posix-extensions-mount-option.patch
 tty-fix-data-race-between-tiocsti-and-flush_to_ldisc.patch
+perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
+revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch
+kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch
+kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch
+kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch
+kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch
+kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch
+kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch
+kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch
+kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch
+arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch
+io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch
+md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch
+char-tpm-kconfig-remove-bad-i2c-cr50-select.patch
+fuse-truncate-pagecache-on-atomic_o_trunc.patch
+fuse-flush-extending-writes.patch
+fuse-wait-for-writepages-in-syncfs.patch
+ima-remove-wmissing-prototypes-warning.patch
+ima-remove-the-dependency-on-crypto_md5.patch