From: Greg Kroah-Hartman Date: Mon, 13 Sep 2021 11:36:42 +0000 (+0200) Subject: 5.14-stable patches X-Git-Tag: v5.4.146~12 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=74cc39ad1bb73035227d6289892a4eebc4b3f852;p=thirdparty%2Fkernel%2Fstable-queue.git 5.14-stable patches added patches: arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch char-tpm-kconfig-remove-bad-i2c-cr50-select.patch fuse-flush-extending-writes.patch fuse-truncate-pagecache-on-atomic_o_trunc.patch fuse-wait-for-writepages-in-syncfs.patch ima-remove-the-dependency-on-crypto_md5.patch ima-remove-wmissing-prototypes-warning.patch io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch --- diff --git a/queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch b/queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch new file mode 100644 index 00000000000..3acb65df343 --- /dev/null +++ b/queue-5.14/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch @@ -0,0 +1,189 @@ +From bf781869e5cf3e4ec1a47dad69b6f0df97629cbd Mon Sep 17 00:00:00 2001 +From: Claudiu Beznea +Date: Tue, 27 Jul 2021 10:40:05 +0300 +Subject: ARM: dts: at91: add pinctrl-{names, 0} for all gpios + +From: Claudiu Beznea + +commit bf781869e5cf3e4ec1a47dad69b6f0df97629cbd upstream. + +Add pinctrl-names and pinctrl-0 properties on controllers that claims to +use pins to avoid failures due to +commit 2ab73c6d8323 ("gpio: Support GPIO controllers without pin-ranges") +and also to avoid using pins that may be claimed my other IPs. + +Fixes: b7c2b6157079 ("ARM: at91: add Atmel's SAMA5D3 Xplained board") +Fixes: 1e5f532c2737 ("ARM: dts: at91: sam9x60: add device tree for soc and board") +Fixes: 38153a017896 ("ARM: at91/dt: sama5d4: add dts for sama5d4 xplained board") +Signed-off-by: Claudiu Beznea +Signed-off-by: Nicolas Ferre +Link: https://lore.kernel.org/r/20210727074006.1609989-1-claudiu.beznea@microchip.com +Cc: # v5.7+ +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/boot/dts/at91-sam9x60ek.dts | 16 ++++++++++++++- + arch/arm/boot/dts/at91-sama5d3_xplained.dts | 29 ++++++++++++++++++++++++++++ + arch/arm/boot/dts/at91-sama5d4_xplained.dts | 19 ++++++++++++++++++ + 3 files changed, 63 insertions(+), 1 deletion(-) + +--- a/arch/arm/boot/dts/at91-sam9x60ek.dts ++++ b/arch/arm/boot/dts/at91-sam9x60ek.dts +@@ -92,6 +92,8 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; + status = "okay"; /* Conflict with pwm0. 
*/ + + red { +@@ -537,6 +539,10 @@ + AT91_PIOA 19 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI) /* PA19 DAT2 periph A with pullup */ + AT91_PIOA 20 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI)>; /* PA20 DAT3 periph A with pullup */ + }; ++ pinctrl_sdmmc0_cd: sdmmc0_cd { ++ atmel,pins = ++ ; ++ }; + }; + + sdmmc1 { +@@ -569,6 +575,14 @@ + AT91_PIOD 16 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>; + }; + }; ++ ++ leds { ++ pinctrl_gpio_leds: gpio_leds { ++ atmel,pins = ; ++ }; ++ }; + }; /* pinctrl */ + + &pwm0 { +@@ -580,7 +594,7 @@ + &sdmmc0 { + bus-width = <4>; + pinctrl-names = "default"; +- pinctrl-0 = <&pinctrl_sdmmc0_default>; ++ pinctrl-0 = <&pinctrl_sdmmc0_default &pinctrl_sdmmc0_cd>; + status = "okay"; + cd-gpios = <&pioA 23 GPIO_ACTIVE_LOW>; + disable-wp; +--- a/arch/arm/boot/dts/at91-sama5d3_xplained.dts ++++ b/arch/arm/boot/dts/at91-sama5d3_xplained.dts +@@ -57,6 +57,8 @@ + }; + + spi0: spi@f0004000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi0_cs>; + cs-gpios = <&pioD 13 0>, <0>, <0>, <&pioD 16 0>; + status = "okay"; + }; +@@ -169,6 +171,8 @@ + }; + + spi1: spi@f8008000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi1_cs>; + cs-gpios = <&pioC 25 0>; + status = "okay"; + }; +@@ -248,6 +252,26 @@ + ; + }; ++ ++ pinctrl_gpio_leds: gpio_leds_default { ++ atmel,pins = ++ ; ++ }; ++ ++ pinctrl_spi0_cs: spi0_cs_default { ++ atmel,pins = ++ ; ++ }; ++ ++ pinctrl_spi1_cs: spi1_cs_default { ++ atmel,pins = ; ++ }; ++ ++ pinctrl_vcc_mmc0_reg_gpio: vcc_mmc0_reg_gpio_default { ++ atmel,pins = ; ++ }; + }; + }; + }; +@@ -339,6 +363,8 @@ + + vcc_mmc0_reg: fixedregulator_mmc0 { + compatible = "regulator-fixed"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_vcc_mmc0_reg_gpio>; + gpio = <&pioE 2 GPIO_ACTIVE_LOW>; + regulator-name = "mmc0-card-supply"; + regulator-min-microvolt = <3300000>; +@@ -362,6 +388,9 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; ++ status = "okay"; + + d2 { + label = "d2"; +--- a/arch/arm/boot/dts/at91-sama5d4_xplained.dts ++++ b/arch/arm/boot/dts/at91-sama5d4_xplained.dts +@@ -90,6 +90,8 @@ + }; + + spi1: spi@fc018000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi0_cs>; + cs-gpios = <&pioB 21 0>; + status = "okay"; + }; +@@ -147,6 +149,19 @@ + atmel,pins = + ; + }; ++ pinctrl_spi0_cs: spi0_cs_default { ++ atmel,pins = ++ ; ++ }; ++ pinctrl_gpio_leds: gpio_leds_default { ++ atmel,pins = ++ ; ++ }; ++ pinctrl_vcc_mmc1_reg: vcc_mmc1_reg { ++ atmel,pins = ++ ; ++ }; + }; + }; + }; +@@ -252,6 +267,8 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; + status = "okay"; + + d8 { +@@ -278,6 +295,8 @@ + + vcc_mmc1_reg: fixedregulator_mmc1 { + compatible = "regulator-fixed"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_vcc_mmc1_reg>; + gpio = <&pioE 4 GPIO_ACTIVE_LOW>; + regulator-name = "VDD MCI1"; + regulator-min-microvolt = <3300000>; diff --git a/queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch b/queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch new file mode 100644 index 00000000000..18cafc45e41 --- /dev/null +++ b/queue-5.14/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch @@ -0,0 +1,33 @@ +From 847fdae1579f4ee930b01f24a7847b8043bf468c Mon Sep 17 00:00:00 2001 +From: Adrian Ratiu +Date: Tue, 27 Jul 2021 20:13:12 +0300 +Subject: char: tpm: Kconfig: remove bad i2c cr50 select + +From: Adrian Ratiu + +commit 
847fdae1579f4ee930b01f24a7847b8043bf468c upstream. + +This fixes a minor bug which went unnoticed during the initial +driver upstreaming review: TCG_CR50 does not exist in mainline +kernels, so remove it. + +Fixes: 3a253caaad11 ("char: tpm: add i2c driver for cr50") +Cc: stable@vger.kernel.org +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Adrian Ratiu +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/char/tpm/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/char/tpm/Kconfig ++++ b/drivers/char/tpm/Kconfig +@@ -89,7 +89,6 @@ config TCG_TIS_SYNQUACER + config TCG_TIS_I2C_CR50 + tristate "TPM Interface Specification 2.0 Interface (I2C - CR50)" + depends on I2C +- select TCG_CR50 + help + This is a driver for the Google cr50 I2C TPM interface which is a + custom microcontroller and requires a custom i2c protocol interface diff --git a/queue-5.14/fuse-flush-extending-writes.patch b/queue-5.14/fuse-flush-extending-writes.patch new file mode 100644 index 00000000000..54411809a10 --- /dev/null +++ b/queue-5.14/fuse-flush-extending-writes.patch @@ -0,0 +1,49 @@ +From 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Tue, 31 Aug 2021 14:18:08 +0200 +Subject: fuse: flush extending writes + +From: Miklos Szeredi + +commit 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd upstream. + +Callers of fuse_writeback_range() assume that the file is ready for +modification by the server in the supplied byte range after the call +returns. + +If there's a write that extends the file beyond the end of the supplied +range, then the file needs to be extended to at least the end of the range, +but currently that's not done. + +There are at least two cases where this can cause problems: + + - copy_file_range() will return short count if the file is not extended + up to end of the source range. + + - FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE will not extend the file, + hence the region may not be fully allocated. + +Fix by flushing writes from the start of the range up to the end of the +file. This could be optimized if the writes are non-extending, etc, but +it's probably not worth the trouble. + +Fixes: a2bc92362941 ("fuse: fix copy_file_range() in the writeback case") +Fixes: 6b1bdb56b17c ("fuse: allow fallocate(FALLOC_FL_ZERO_RANGE)") +Cc: # v5.2 +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -2884,7 +2884,7 @@ fuse_direct_IO(struct kiocb *iocb, struc + + static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) + { +- int err = filemap_write_and_wait_range(inode->i_mapping, start, end); ++ int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + + if (!err) + fuse_sync_writes(inode); diff --git a/queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch b/queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch new file mode 100644 index 00000000000..04fd413702f --- /dev/null +++ b/queue-5.14/fuse-truncate-pagecache-on-atomic_o_trunc.patch @@ -0,0 +1,58 @@ +From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Tue, 17 Aug 2021 21:05:16 +0200 +Subject: fuse: truncate pagecache on atomic_o_trunc + +From: Miklos Szeredi + +commit 76224355db7570cbe6b6f75c8929a1558828dd55 upstream. + +fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic +O_TRUNC. 
This can deadlock with fuse_wait_on_page_writeback() in +fuse_launder_page() triggered by invalidate_inode_pages2(). + +Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a +truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE +or fc->writeback cache, so do it unconditionally. + +Reported-by: Xie Yongji +Reported-and-tested-by: syzbot+bea44a5189836d956894@syzkaller.appspotmail.com +Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") +Cc: +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inod + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + +- if (!(ff->open_flags & FOPEN_KEEP_CACHE)) +- invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); ++ + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + +@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inod + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); ++ truncate_pagecache(inode, 0); + fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); ++ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { ++ invalidate_inode_pages2(inode->i_mapping); + } ++ + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); + } diff --git a/queue-5.14/fuse-wait-for-writepages-in-syncfs.patch b/queue-5.14/fuse-wait-for-writepages-in-syncfs.patch new file mode 100644 index 00000000000..62b68e0069a --- /dev/null +++ b/queue-5.14/fuse-wait-for-writepages-in-syncfs.patch @@ -0,0 +1,242 @@ +From 660585b56e63ca034ad506ea53c807c5cdca3196 Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Wed, 1 Sep 2021 12:39:02 +0200 +Subject: fuse: wait for writepages in syncfs + +From: Miklos Szeredi + +commit 660585b56e63ca034ad506ea53c807c5cdca3196 upstream. + +In case of fuse the MM subsystem doesn't guarantee that page writeback +completes by the time ->sync_fs() is called. This is because fuse +completes page writeback immediately to prevent DoS of memory reclaim by +the userspace file server. + +This means that fuse itself must ensure that writes are synced before +sending the SYNCFS request to the server. + +Introduce sync buckets, that hold a counter for the number of outstanding +write requests. On syncfs replace the current bucket with a new one and +wait until the old bucket's counter goes down to zero. + +It is possible to have multiple syncfs calls in parallel, in which case +there could be more than one waited-on buckets. Descendant buckets must +not complete until the parent completes. Add a count to the child (new) +bucket until the (parent) old bucket completes. + +Use RCU protection to dereference the current bucket and to wake up an +emptied bucket. Use fc->lock to protect against parallel assignments to +the current bucket. + +This leaves just the counter to be a possible scalability issue. The +fc->num_waiting counter has a similar issue, so both should be addressed at +the same time. 
+ +Reported-by: Amir Goldstein +Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server") +Cc: # v5.14 +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 21 +++++++++++++++++++ + fs/fuse/fuse_i.h | 19 +++++++++++++++++ + fs/fuse/inode.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 100 insertions(+) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -392,6 +392,7 @@ struct fuse_writepage_args { + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; ++ struct fuse_sync_bucket *bucket; + }; + + static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, +@@ -1611,6 +1612,9 @@ static void fuse_writepage_free(struct f + struct fuse_args_pages *ap = &wpa->ia.ap; + int i; + ++ if (wpa->bucket) ++ fuse_sync_bucket_dec(wpa->bucket); ++ + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); + +@@ -1874,6 +1878,20 @@ static struct fuse_writepage_args *fuse_ + + } + ++static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, ++ struct fuse_writepage_args *wpa) ++{ ++ if (!fc->sync_fs) ++ return; ++ ++ rcu_read_lock(); ++ /* Prevent resurrection of dead bucket in unlikely race with syncfs */ ++ do { ++ wpa->bucket = rcu_dereference(fc->curr_bucket); ++ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); ++ rcu_read_unlock(); ++} ++ + static int fuse_writepage_locked(struct page *page) + { + struct address_space *mapping = page->mapping; +@@ -1901,6 +1919,7 @@ static int fuse_writepage_locked(struct + if (!wpa->ia.ff) + goto err_nofile; + ++ fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); +@@ -2151,6 +2170,8 @@ static int fuse_writepages_fill(struct p + __free_page(tmp_page); + goto out_unlock; + } ++ fuse_writepage_add_to_bucket(fc, wpa); ++ + data->max_pages = 1; + + ap = &wpa->ia.ap; +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -515,6 +515,13 @@ struct fuse_fs_context { + void **fudptr; + }; + ++struct fuse_sync_bucket { ++ /* count is a possible scalability bottleneck */ ++ atomic_t count; ++ wait_queue_head_t waitq; ++ struct rcu_head rcu; ++}; ++ + /** + * A Fuse connection. 
+ * +@@ -807,6 +814,9 @@ struct fuse_conn { + + /** List of filesystems using this connection */ + struct list_head mounts; ++ ++ /* New writepages go into this bucket */ ++ struct fuse_sync_bucket __rcu *curr_bucket; + }; + + /* +@@ -910,6 +920,15 @@ static inline void fuse_page_descs_lengt + descs[i].length = PAGE_SIZE - descs[i].offset; + } + ++static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) ++{ ++ /* Need RCU protection to prevent use after free after the decrement */ ++ rcu_read_lock(); ++ if (atomic_dec_and_test(&bucket->count)) ++ wake_up(&bucket->waitq); ++ rcu_read_unlock(); ++} ++ + /** Device operations */ + extern const struct file_operations fuse_dev_operations; + +--- a/fs/fuse/inode.c ++++ b/fs/fuse/inode.c +@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *de + return err; + } + ++static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) ++{ ++ struct fuse_sync_bucket *bucket; ++ ++ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); ++ if (bucket) { ++ init_waitqueue_head(&bucket->waitq); ++ /* Initial active count */ ++ atomic_set(&bucket->count, 1); ++ } ++ return bucket; ++} ++ ++static void fuse_sync_fs_writes(struct fuse_conn *fc) ++{ ++ struct fuse_sync_bucket *bucket, *new_bucket; ++ int count; ++ ++ new_bucket = fuse_sync_bucket_alloc(); ++ spin_lock(&fc->lock); ++ bucket = rcu_dereference_protected(fc->curr_bucket, 1); ++ count = atomic_read(&bucket->count); ++ WARN_ON(count < 1); ++ /* No outstanding writes? */ ++ if (count == 1) { ++ spin_unlock(&fc->lock); ++ kfree(new_bucket); ++ return; ++ } ++ ++ /* ++ * Completion of new bucket depends on completion of this bucket, so add ++ * one more count. ++ */ ++ atomic_inc(&new_bucket->count); ++ rcu_assign_pointer(fc->curr_bucket, new_bucket); ++ spin_unlock(&fc->lock); ++ /* ++ * Drop initial active count. At this point if all writes in this and ++ * ancestor buckets complete, the count will go to zero and this task ++ * will be woken up. 
++ */ ++ atomic_dec(&bucket->count); ++ ++ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); ++ ++ /* Drop temp count on descendant bucket */ ++ fuse_sync_bucket_dec(new_bucket); ++ kfree_rcu(bucket, rcu); ++} ++ + static int fuse_sync_fs(struct super_block *sb, int wait) + { + struct fuse_mount *fm = get_fuse_mount_super(sb); +@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_blo + if (!fc->sync_fs) + return 0; + ++ fuse_sync_fs_writes(fc); ++ + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); +@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc) + { + if (refcount_dec_and_test(&fc->count)) { + struct fuse_iqueue *fiq = &fc->iq; ++ struct fuse_sync_bucket *bucket; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); +@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); ++ bucket = rcu_dereference_protected(fc->curr_bucket, 1); ++ if (bucket) { ++ WARN_ON(atomic_read(&bucket->count) != 1); ++ kfree(bucket); ++ } + fc->release(fc); + } + } +@@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_ + if (sb->s_flags & SB_MANDLOCK) + goto err; + ++ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); + fuse_sb_defaults(sb); + + if (ctx->is_bdev) { diff --git a/queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch b/queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch new file mode 100644 index 00000000000..d8acf2e5ca7 --- /dev/null +++ b/queue-5.14/ima-remove-the-dependency-on-crypto_md5.patch @@ -0,0 +1,45 @@ +From 8510505d55e194d3f6c9644c9f9d12c4f6b0395a Mon Sep 17 00:00:00 2001 +From: THOBY Simon +Date: Mon, 16 Aug 2021 08:10:59 +0000 +Subject: IMA: remove the dependency on CRYPTO_MD5 + +From: THOBY Simon + +commit 8510505d55e194d3f6c9644c9f9d12c4f6b0395a upstream. + +MD5 is a weak digest algorithm that shouldn't be used for cryptographic +operation. It hinders the efficiency of a patch set that aims to limit +the digests allowed for the extended file attribute namely security.ima. +MD5 is no longer a requirement for IMA, nor should it be used there. + +The sole place where we still use the MD5 algorithm inside IMA is setting +the ima_hash algorithm to MD5, if the user supplies 'ima_hash=md5' +parameter on the command line. With commit ab60368ab6a4 ("ima: Fallback +to the builtin hash algorithm"), setting "ima_hash=md5" fails gracefully +when CRYPTO_MD5 is not set: + ima: Can not allocate md5 (reason: -2) + ima: Allocating md5 failed, going to use default hash algorithm sha256 + +Remove the CRYPTO_MD5 dependency for IMA. + +Signed-off-by: THOBY Simon +Reviewed-by: Lakshmi Ramasubramanian +[zohar@linux.ibm.com: include commit number in patch description for +stable.] 
+Cc: stable@vger.kernel.org # 4.17 +Signed-off-by: Mimi Zohar +Signed-off-by: Greg Kroah-Hartman +--- + security/integrity/ima/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/security/integrity/ima/Kconfig ++++ b/security/integrity/ima/Kconfig +@@ -6,7 +6,6 @@ config IMA + select SECURITYFS + select CRYPTO + select CRYPTO_HMAC +- select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_HASH_INFO + select TCG_TPM if HAS_IOMEM && !UML diff --git a/queue-5.14/ima-remove-wmissing-prototypes-warning.patch b/queue-5.14/ima-remove-wmissing-prototypes-warning.patch new file mode 100644 index 00000000000..4464b563fdb --- /dev/null +++ b/queue-5.14/ima-remove-wmissing-prototypes-warning.patch @@ -0,0 +1,40 @@ +From a32ad90426a9c8eb3915eed26e08ce133bd9e0da Mon Sep 17 00:00:00 2001 +From: Austin Kim +Date: Tue, 29 Jun 2021 14:50:50 +0100 +Subject: IMA: remove -Wmissing-prototypes warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Austin Kim + +commit a32ad90426a9c8eb3915eed26e08ce133bd9e0da upstream. + +With W=1 build, the compiler throws warning message as below: + + security/integrity/ima/ima_mok.c:24:12: warning: + no previous prototype for ‘ima_mok_init’ [-Wmissing-prototypes] + __init int ima_mok_init(void) + +Silence the warning by adding static keyword to ima_mok_init(). + +Signed-off-by: Austin Kim +Fixes: 41c89b64d718 ("IMA: create machine owner and blacklist keyrings") +Cc: stable@vger.kernel.org +Signed-off-by: Mimi Zohar +Signed-off-by: Greg Kroah-Hartman +--- + security/integrity/ima/ima_mok.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/security/integrity/ima/ima_mok.c ++++ b/security/integrity/ima/ima_mok.c +@@ -21,7 +21,7 @@ struct key *ima_blacklist_keyring; + /* + * Allocate the IMA blacklist keyring + */ +-__init int ima_mok_init(void) ++static __init int ima_mok_init(void) + { + struct key_restriction *restriction; + diff --git a/queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch b/queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch new file mode 100644 index 00000000000..d02440ca4e0 --- /dev/null +++ b/queue-5.14/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch @@ -0,0 +1,104 @@ +From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sun, 29 Aug 2021 16:13:03 -0600 +Subject: io-wq: check max_worker limits if a worker transitions bound state + +From: Jens Axboe + +commit ecc53c48c13d995e6fe5559e30ffee48d92784fd upstream. + +For the two places where new workers are created, we diligently check if +we are allowed to create a new worker. If we're currently at the limit +of how many workers of a given type we can have, then we don't create +any new ones. + +If you have a mixed workload with various types of bound and unbounded +work, then it can happen that a worker finishes one type of work and +is then transitioned to the other type. For this case, we don't check +if we are actually allowed to do so. This can cause io-wq to temporarily +exceed the allowed number of workers for a given type. + +When retrieving work, check that the types match. If they don't, check +if we are allowed to transition to the other type. If not, then don't +handle the new work. 
+ +Cc: stable@vger.kernel.org +Reported-by: Johannes Lundberg +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io-wq.c | 33 ++++++++++++++++++++++++++++++--- + 1 file changed, 30 insertions(+), 3 deletions(-) + +--- a/fs/io-wq.c ++++ b/fs/io-wq.c +@@ -423,7 +423,28 @@ static void io_wait_on_hash(struct io_wq + spin_unlock(&wq->hash->wait.lock); + } + +-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) ++/* ++ * We can always run the work if the worker is currently the same type as ++ * the work (eg both are bound, or both are unbound). If they are not the ++ * same, only allow it if incrementing the worker count would be allowed. ++ */ ++static bool io_worker_can_run_work(struct io_worker *worker, ++ struct io_wq_work *work) ++{ ++ struct io_wqe_acct *acct; ++ ++ if (!(worker->flags & IO_WORKER_F_BOUND) != ++ !(work->flags & IO_WQ_WORK_UNBOUND)) ++ return true; ++ ++ /* not the same type, check if we'd go over the limit */ ++ acct = io_work_get_acct(worker->wqe, work); ++ return acct->nr_workers < acct->max_workers; ++} ++ ++static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, ++ struct io_worker *worker, ++ bool *stalled) + __must_hold(wqe->lock) + { + struct io_wq_work_node *node, *prev; +@@ -435,6 +456,9 @@ static struct io_wq_work *io_get_next_wo + + work = container_of(node, struct io_wq_work, list); + ++ if (!io_worker_can_run_work(worker, work)) ++ break; ++ + /* not hashed, can run anytime */ + if (!io_wq_is_hashed(work)) { + wq_list_del(&wqe->work_list, node, prev); +@@ -461,6 +485,7 @@ static struct io_wq_work *io_get_next_wo + raw_spin_unlock(&wqe->lock); + io_wait_on_hash(wqe, stall_hash); + raw_spin_lock(&wqe->lock); ++ *stalled = true; + } + + return NULL; +@@ -500,6 +525,7 @@ static void io_worker_handle_work(struct + + do { + struct io_wq_work *work; ++ bool stalled; + get_next: + /* + * If we got some work, mark us as busy. If we didn't, but +@@ -508,10 +534,11 @@ get_next: + * can't make progress, any work completion or insertion will + * clear the stalled flag. + */ +- work = io_get_next_work(wqe); ++ stalled = false; ++ work = io_get_next_work(wqe, worker, &stalled); + if (work) + __io_worker_busy(wqe, worker, work); +- else if (!wq_list_empty(&wqe->work_list)) ++ else if (stalled) + wqe->flags |= IO_WQE_FLAG_STALLED; + + raw_spin_unlock_irq(&wqe->lock); diff --git a/queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch b/queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch new file mode 100644 index 00000000000..ebe2642d477 --- /dev/null +++ b/queue-5.14/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch @@ -0,0 +1,54 @@ +From 47e6223c841e029bfc23c3ce594dac5525cebaf8 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Mon, 2 Aug 2021 13:38:30 +0100 +Subject: KVM: arm64: Unregister HYP sections from kmemleak in protected mode + +From: Marc Zyngier + +commit 47e6223c841e029bfc23c3ce594dac5525cebaf8 upstream. + +Booting a KVM host in protected mode with kmemleak quickly results +in a pretty bad crash, as kmemleak doesn't know that the HYP sections +have been taken away. This is specially true for the BSS section, +which is part of the kernel BSS section and registered at boot time +by kmemleak itself. + +Unregister the HYP part of the BSS before making that section +HYP-private. The rest of the HYP-specific data is obtained via +the page allocator or lives in other sections, none of which is +subjected to kmemleak. 
+ +Fixes: 90134ac9cabb ("KVM: arm64: Protect the .hyp sections from the host") +Reviewed-by: Quentin Perret +Reviewed-by: Catalin Marinas +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org # 5.13 +Link: https://lore.kernel.org/r/20210802123830.2195174-3-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/arm.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1986,6 +1987,12 @@ static int finalize_hyp_mode(void) + if (ret) + return ret; + ++ /* ++ * Exclude HYP BSS from kmemleak so that it doesn't get peeked ++ * at, which would end badly once the section is inaccessible. ++ * None of other sections should ever be introspected. ++ */ ++ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + ret = pkvm_mark_hyp_section(__hyp_bss); + if (ret) + return ret; diff --git a/queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch b/queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch new file mode 100644 index 00000000000..9ba4888ca79 --- /dev/null +++ b/queue-5.14/kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch @@ -0,0 +1,216 @@ +From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 19 Aug 2021 19:03:05 +0100 +Subject: KVM: arm64: vgic: Resample HW pending state on deactivation + +From: Marc Zyngier + +commit 3134cc8beb69d0db9de651081707c4651c011621 upstream. + +When a mapped level interrupt (a timer, for example) is deactivated +by the guest, the corresponding host interrupt is equally deactivated. +However, the fate of the pending state still needs to be dealt +with in SW. + +This is specially true when the interrupt was in the active+pending +state in the virtual distributor at the point where the guest +was entered. On exit, the pending state is potentially stale +(the guest may have put the interrupt in a non-pending state). + +If we don't do anything, the interrupt will be spuriously injected +in the guest. Although this shouldn't have any ill effect (spurious +interrupts are always possible), we can improve the emulation by +detecting the deactivation-while-pending case and resample the +interrupt. + +While we're at it, move the logic into a common helper that can +be shared between the two GIC implementations. 
+ +Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts") +Reported-by: Raghavendra Rao Ananta +Tested-by: Raghavendra Rao Ananta +Reviewed-by: Oliver Upton +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/vgic/vgic-v2.c | 36 +++++------------------------------- + arch/arm64/kvm/vgic/vgic-v3.c | 36 +++++------------------------------- + arch/arm64/kvm/vgic/vgic.c | 38 ++++++++++++++++++++++++++++++++++++++ + arch/arm64/kvm/vgic/vgic.h | 2 ++ + 4 files changed, 50 insertions(+), 62 deletions(-) + +--- a/arch/arm64/kvm/vgic/vgic-v2.c ++++ b/arch/arm64/kvm/vgic/vgic-v2.c +@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vc + u32 val = cpuif->vgic_lr[lr]; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; ++ bool deactivated; + + /* Extract the source vCPU id from the LR */ + cpuid = val & GICH_LR_PHYSID_CPUID; +@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vc + + raw_spin_lock(&irq->irq_lock); + +- /* Always preserve the active bit */ ++ /* Always preserve the active bit, note deactivation */ ++ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + if (irq->active && vgic_irq_is_sgi(intid)) +@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vc + if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) + irq->pending_latch = false; + +- /* +- * Level-triggered mapped IRQs are special because we only +- * observe rising edges as input to the VGIC. +- * +- * If the guest never acked the interrupt we have to sample +- * the physical line and set the line level, because the +- * device state could have changed or we simply need to +- * process the still pending interrupt later. +- * +- * If this causes us to lower the level, we have to also clear +- * the physical active state, since we will otherwise never be +- * told when the interrupt becomes asserted again. +- * +- * Another case is when the interrupt requires a helping hand +- * on deactivation (no HW deactivation, for example). 
+- */ +- if (vgic_irq_is_mapped_level(irq)) { +- bool resample = false; +- +- if (val & GICH_LR_PENDING_BIT) { +- irq->line_level = vgic_get_phys_line_level(irq); +- resample = !irq->line_level; +- } else if (vgic_irq_needs_resampling(irq) && +- !(irq->active || irq->pending_latch)) { +- resample = true; +- } +- +- if (resample) +- vgic_irq_set_phys_active(irq, false); +- } ++ /* Handle resampling for mapped interrupts if required */ ++ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); +--- a/arch/arm64/kvm/vgic/vgic-v3.c ++++ b/arch/arm64/kvm/vgic/vgic-v3.c +@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vc + u32 intid, cpuid; + struct vgic_irq *irq; + bool is_v2_sgi = false; ++ bool deactivated; + + cpuid = val & GICH_LR_PHYSID_CPUID; + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; +@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vc + + raw_spin_lock(&irq->irq_lock); + +- /* Always preserve the active bit */ ++ /* Always preserve the active bit, note deactivation */ ++ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + if (irq->active && is_v2_sgi) +@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vc + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + +- /* +- * Level-triggered mapped IRQs are special because we only +- * observe rising edges as input to the VGIC. +- * +- * If the guest never acked the interrupt we have to sample +- * the physical line and set the line level, because the +- * device state could have changed or we simply need to +- * process the still pending interrupt later. +- * +- * If this causes us to lower the level, we have to also clear +- * the physical active state, since we will otherwise never be +- * told when the interrupt becomes asserted again. +- * +- * Another case is when the interrupt requires a helping hand +- * on deactivation (no HW deactivation, for example). +- */ +- if (vgic_irq_is_mapped_level(irq)) { +- bool resample = false; +- +- if (val & ICH_LR_PENDING_BIT) { +- irq->line_level = vgic_get_phys_line_level(irq); +- resample = !irq->line_level; +- } else if (vgic_irq_needs_resampling(irq) && +- !(irq->active || irq->pending_latch)) { +- resample = true; +- } +- +- if (resample) +- vgic_irq_set_phys_active(irq, false); +- } ++ /* Handle resampling for mapped interrupts if required */ ++ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); +--- a/arch/arm64/kvm/vgic/vgic.c ++++ b/arch/arm64/kvm/vgic/vgic.c +@@ -1022,3 +1022,41 @@ bool kvm_vgic_map_is_active(struct kvm_v + + return map_is_active; + } ++ ++/* ++ * Level-triggered mapped IRQs are special because we only observe rising ++ * edges as input to the VGIC. ++ * ++ * If the guest never acked the interrupt we have to sample the physical ++ * line and set the line level, because the device state could have changed ++ * or we simply need to process the still pending interrupt later. ++ * ++ * We could also have entered the guest with the interrupt active+pending. ++ * On the next exit, we need to re-evaluate the pending state, as it could ++ * otherwise result in a spurious interrupt by injecting a now potentially ++ * stale pending state. 
++ * ++ * If this causes us to lower the level, we have to also clear the physical ++ * active state, since we will otherwise never be told when the interrupt ++ * becomes asserted again. ++ * ++ * Another case is when the interrupt requires a helping hand on ++ * deactivation (no HW deactivation, for example). ++ */ ++void vgic_irq_handle_resampling(struct vgic_irq *irq, ++ bool lr_deactivated, bool lr_pending) ++{ ++ if (vgic_irq_is_mapped_level(irq)) { ++ bool resample = false; ++ ++ if (unlikely(vgic_irq_needs_resampling(irq))) { ++ resample = !(irq->active || irq->pending_latch); ++ } else if (lr_pending || (lr_deactivated && irq->line_level)) { ++ irq->line_level = vgic_get_phys_line_level(irq); ++ resample = !irq->line_level; ++ } ++ ++ if (resample) ++ vgic_irq_set_phys_active(irq, false); ++ } ++} +--- a/arch/arm64/kvm/vgic/vgic.h ++++ b/arch/arm64/kvm/vgic/vgic.h +@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgi + bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags); + void vgic_kick_vcpus(struct kvm *kvm); ++void vgic_irq_handle_resampling(struct vgic_irq *irq, ++ bool lr_deactivated, bool lr_pending); + + int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); diff --git a/queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch b/queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch new file mode 100644 index 00000000000..1dbdd5fb096 --- /dev/null +++ b/queue-5.14/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch @@ -0,0 +1,60 @@ +From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 10 Aug 2021 07:45:26 -0700 +Subject: KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter + +From: Sean Christopherson + +commit f7782bb8d818d8f47c26b22079db10599922787a upstream. + +Clear nested.pi_pending on nested VM-Enter even if L2 will run without +posted interrupts enabled. If nested.pi_pending is left set from a +previous L2, vmx_complete_nested_posted_interrupt() will pick up the +stale flag and exit to userspace with an "internal emulation error" due +the new L2 not having a valid nested.pi_desc. + +Arguably, vmx_complete_nested_posted_interrupt() should first check for +posted interrupts being enabled, but it's also completely reasonable that +KVM wouldn't screw up a fundamental flag. Not to mention that the mere +existence of nested.pi_pending is a long-standing bug as KVM shouldn't +move the posted interrupt out of the IRR until it's actually processed, +e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit +with a "pending" posted interrupt. Fixing the mess is a future problem. + +Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI +descriptor as an error, this was a benign bug as the null PI descriptor +effectively served as a check on PI not being enabled. Even then, the +new flow did not become problematic until KVM started checking the result +of kvm_check_nested_events(). 
+ +Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") +Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable") +Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails") +Cc: stable@vger.kernel.org +Cc: Jim Mattson +Signed-off-by: Sean Christopherson +Message-Id: <20210810144526.2662272-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2223,12 +2223,11 @@ static void prepare_vmcs02_early(struct + ~PIN_BASED_VMX_PREEMPTION_TIMER); + + /* Posted interrupts setting is only taken from vmcs12. */ +- if (nested_cpu_has_posted_intr(vmcs12)) { ++ vmx->nested.pi_pending = false; ++ if (nested_cpu_has_posted_intr(vmcs12)) + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; +- vmx->nested.pi_pending = false; +- } else { ++ else + exec_control &= ~PIN_BASED_POSTED_INTR; +- } + pin_controls_set(vmx, exec_control); + + /* diff --git a/queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch b/queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch new file mode 100644 index 00000000000..5ff7222f84a --- /dev/null +++ b/queue-5.14/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch @@ -0,0 +1,122 @@ +From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001 +From: Halil Pasic +Date: Fri, 27 Aug 2021 14:54:29 +0200 +Subject: KVM: s390: index kvm->arch.idle_mask by vcpu_idx +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Halil Pasic + +commit a3e03bc1368c1bc16e19b001fc96dc7430573cc8 upstream. + +While in practice vcpu->vcpu_idx == vcpu->vcp_id is often true, it may +not always be, and we must not rely on this. Reason is that KVM decides +the vcpu_idx, userspace decides the vcpu_id, thus the two might not +match. + +Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies +that code like +for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_id); + do_stuff(vcpu); +} +is not legit. Reason is that kvm_get_vcpu expects an vcpu_idx, not an +vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like +this. To fix this problem we have two options. Either use +kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id, +or switch to indexing via vcpu_idx. The latter is preferable for obvious +reasons. + +Let us make switch from indexing kvm->arch.idle_mask by vcpu_id to +indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the +same index as idle_mask lets make the same change for it as well. 
+ +Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array") +Signed-off-by: Halil Pasic +Reviewed-by: Christian Bornträger +Reviewed-by: Claudio Imbrenda +Cc: # 3.15+ +Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com +Signed-off-by: Christian Borntraeger +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/include/asm/kvm_host.h | 1 + + arch/s390/kvm/interrupt.c | 12 ++++++------ + arch/s390/kvm/kvm-s390.c | 2 +- + arch/s390/kvm/kvm-s390.h | 2 +- + 4 files changed, 9 insertions(+), 8 deletions(-) + +--- a/arch/s390/include/asm/kvm_host.h ++++ b/arch/s390/include/asm/kvm_host.h +@@ -957,6 +957,7 @@ struct kvm_arch{ + atomic64_t cmma_dirty_pages; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); ++ /* indexed by vcpu_idx */ + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; +--- a/arch/s390/kvm/interrupt.c ++++ b/arch/s390/kvm/interrupt.c +@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(st + static void __set_cpu_idle(struct kvm_vcpu *vcpu) + { + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); +- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static void __unset_cpu_idle(struct kvm_vcpu *vcpu) + { + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); +- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) +@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vc + + static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) + { +- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); ++ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + +- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { +- vcpu = kvm_get_vcpu(kvm, vcpu_id); ++ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { ++ vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (psw_ioint_disabled(vcpu)) + continue; + deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask) { + /* lately kicked but not yet running */ +- if (test_and_set_bit(vcpu_id, gi->kicked_mask)) ++ if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -4044,7 +4044,7 @@ static int vcpu_pre_run(struct kvm_vcpu + kvm_s390_patch_guest_per_regs(vcpu); + } + +- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); ++ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); + + vcpu->arch.sie_block->icptcode = 0; + cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); +--- a/arch/s390/kvm/kvm-s390.h ++++ b/arch/s390/kvm/kvm-s390.h +@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct + + static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) + { +- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch b/queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch new file mode 100644 index 00000000000..ce946b3cfdb --- /dev/null +++ 
b/queue-5.14/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch @@ -0,0 +1,34 @@ +From 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Aug 2021 12:57:49 +0300 +Subject: KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation + +From: Maxim Levitsky + +commit 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 upstream. + +If we are emulating an invalid guest state, we don't have a correct +exit reason, and thus we shouldn't do anything in this function. + +Signed-off-by: Maxim Levitsky +Message-Id: <20210826095750.1650467-2-mlevitsk@redhat.com> +Cc: stable@vger.kernel.org +Fixes: 95b5a48c4f2b ("KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn", 2019-06-18) +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/vmx.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6368,6 +6368,9 @@ static void vmx_handle_exit_irqoff(struc + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + ++ if (vmx->emulation_required) ++ return; ++ + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu); + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) diff --git a/queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch b/queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch new file mode 100644 index 00000000000..72414135269 --- /dev/null +++ b/queue-5.14/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch @@ -0,0 +1,74 @@ +From ec607a564f70519b340f7eb4cfc0f4a6b55285ac Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 6 Aug 2021 07:05:58 -0400 +Subject: KVM: x86: clamp host mapping level to max_level in kvm_mmu_max_mapping_level + +From: Paolo Bonzini + +commit ec607a564f70519b340f7eb4cfc0f4a6b55285ac upstream. + +This change started as a way to make kvm_mmu_hugepage_adjust a bit simpler, +but it does fix two bugs as well. + +One bug is in zapping collapsible PTEs. If a large page size is +disallowed but not all of them, kvm_mmu_max_mapping_level will return the +host mapping level and the small PTEs will be zapped up to that level. +However, if e.g. 1GB are prohibited, we can still zap 4KB mapping and +preserve the 2MB ones. This can happen for example when NX huge pages +are in use. + +The second would happen when userspace backs guest memory +with a 1gb hugepage but only assign a subset of the page to +the guest. 1gb pages would be disallowed by the memslot, but +not 2mb. kvm_mmu_max_mapping_level() would fall through to the +host_pfn_mapping_level() logic, see the 1gb hugepage, and map the whole +thing into the guest. 
+ +Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2846,6 +2846,7 @@ int kvm_mmu_max_mapping_level(struct kvm + kvm_pfn_t pfn, int max_level) + { + struct kvm_lpage_info *linfo; ++ int host_level; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { +@@ -2857,7 +2858,8 @@ int kvm_mmu_max_mapping_level(struct kvm + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + +- return host_pfn_mapping_level(kvm, gfn, pfn, slot); ++ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); ++ return min(host_level, max_level); + } + + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, +@@ -2881,17 +2883,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_v + if (!slot) + return PG_LEVEL_4K; + +- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); +- if (level == PG_LEVEL_4K) +- return level; +- +- *req_level = level = min(level, max_level); +- + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ +- if (huge_page_disallowed) ++ *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); ++ if (level == PG_LEVEL_4K || huge_page_disallowed) + return PG_LEVEL_4K; + + /* diff --git a/queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch b/queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch new file mode 100644 index 00000000000..1e3f5fa1d56 --- /dev/null +++ b/queue-5.14/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch @@ -0,0 +1,82 @@ +From 088acd23526647844aec1c39db4ad02552c86c7b Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 2 Aug 2021 21:46:06 -0700 +Subject: KVM: x86/mmu: Avoid collision with !PRESENT SPTEs in TDP MMU lpage stats + +From: Sean Christopherson + +commit 088acd23526647844aec1c39db4ad02552c86c7b upstream. + +Factor in whether or not the old/new SPTEs are shadow-present when +adjusting the large page stats in the TDP MMU. A modified MMIO SPTE can +toggle the page size bit, as bit 7 is used to store the MMIO generation, +i.e. is_large_pte() can get a false positive when called on a MMIO SPTE. +Ditto for nuking SPTEs with REMOVED_SPTE, which sets bit 7 in its magic +value. + +Opportunistically move the logic below the check to verify at least one +of the old/new SPTEs is shadow present. + +Use is/was_leaf even though is/was_present would suffice. The code +generation is roughly equivalent since all flags need to be computed +prior to the code in question, and using the *_leaf flags will minimize +the diff in a future enhancement to account all pages, i.e. will change +the check to "is_leaf != was_leaf". 
+ +Reviewed-by: David Matlack +Reviewed-by: Ben Gardon + +Fixes: 1699f65c8b65 ("kvm/x86: Fix 'lpages' kvm stat for TDM MMU") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Mingwei Zhang +Message-Id: <20210803044607.599629-3-mizhang@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++++++------- + 1 file changed, 13 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -412,6 +412,7 @@ static void __handle_changed_spte(struct + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); ++ bool was_large, is_large; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); +@@ -445,13 +446,6 @@ static void __handle_changed_spte(struct + + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + +- if (is_large_pte(old_spte) != is_large_pte(new_spte)) { +- if (is_large_pte(old_spte)) +- atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); +- else +- atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); +- } +- + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ +@@ -477,6 +471,18 @@ static void __handle_changed_spte(struct + return; + } + ++ /* ++ * Update large page stats if a large page is being zapped, created, or ++ * is replacing an existing shadow page. ++ */ ++ was_large = was_leaf && is_large_pte(old_spte); ++ is_large = is_leaf && is_large_pte(new_spte); ++ if (was_large != is_large) { ++ if (was_large) ++ atomic64_sub(1, (atomic64_t *)&kvm->stat.lpages); ++ else ++ atomic64_add(1, (atomic64_t *)&kvm->stat.lpages); ++ } + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) diff --git a/queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch b/queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch new file mode 100644 index 00000000000..431b7d94eb2 --- /dev/null +++ b/queue-5.14/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch @@ -0,0 +1,40 @@ +From d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 Mon Sep 17 00:00:00 2001 +From: Zelin Deng +Date: Wed, 28 Apr 2021 10:22:01 +0800 +Subject: KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted + +From: Zelin Deng + +commit d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 upstream. + +When MSR_IA32_TSC_ADJUST is written by guest due to TSC ADJUST feature +especially there's a big tsc warp (like a new vCPU is hot-added into VM +which has been up for a long time), tsc_offset is added by a large value +then go back to guest. This causes system time jump as tsc_timestamp is +not adjusted in the meantime and pvclock monotonic character. +To fix this, just notify kvm to update vCPU's guest time before back to +guest. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Zelin Deng +Signed-off-by: Paolo Bonzini +Message-Id: <1619576521-81399-2-git-send-email-zelin.deng@linux.alibaba.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3316,6 +3316,10 @@ int kvm_set_msr_common(struct kvm_vcpu * + if (!msr_info->host_initiated) { + s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + adjust_tsc_offset_guest(vcpu, adj); ++ /* Before back to guest, tsc_timestamp must be adjusted ++ * as well, otherwise guest's percpu pvclock time could jump. ++ */ ++ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } diff --git a/queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch b/queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch new file mode 100644 index 00000000000..1d24ed89911 --- /dev/null +++ b/queue-5.14/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch @@ -0,0 +1,82 @@ +From 46d4703b1db4c86ab5acb2331b10df999f005e8e Mon Sep 17 00:00:00 2001 +From: Xiao Ni +Date: Wed, 18 Aug 2021 13:57:48 +0800 +Subject: md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard + +From: Xiao Ni + +commit 46d4703b1db4c86ab5acb2331b10df999f005e8e upstream. + +We are seeing the following warning in raid10_handle_discard. +[ 695.110751] ============================= +[ 695.131439] WARNING: suspicious RCU usage +[ 695.151389] 4.18.0-319.el8.x86_64+debug #1 Not tainted +[ 695.174413] ----------------------------- +[ 695.192603] drivers/md/raid10.c:1776 suspicious +rcu_dereference_check() usage! +[ 695.225107] other info that might help us debug this: +[ 695.260940] rcu_scheduler_active = 2, debug_locks = 1 +[ 695.290157] no locks held by mkfs.xfs/10186. + +In the first loop of function raid10_handle_discard. It already +determines which disk need to handle discard request and add the +rdev reference count rdev->nr_pending. So the conf->mirrors will +not change until all bios come back from underlayer disks. It +doesn't need to use rcu_dereference to get rdev. + +Cc: stable@vger.kernel.org +Fixes: d30588b2731f ('md/raid10: improve raid10 discard request') +Signed-off-by: Xiao Ni +Acked-by: Guoqing Jiang +Signed-off-by: Song Liu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/raid10.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1712,6 +1712,11 @@ retry_discard: + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + ++ /* ++ * first select target devices under rcu_lock and ++ * inc refcount on their rdev. Record them by setting ++ * bios[x] to bio ++ */ + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); +@@ -1743,9 +1748,6 @@ retry_discard: + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; +- struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); +- struct md_rdev *rrdev = rcu_dereference( +- conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. +@@ -1775,9 +1777,12 @@ retry_discard: + + /* + * It only handles discard bio which size is >= stripe size, so +- * dev_end > dev_start all the time ++ * dev_end > dev_start all the time. 
++ * It doesn't need to use rcu lock to get rdev here. We already
++ * add rdev->nr_pending in the first loop.
+ */
+ if (r10_bio->devs[disk].bio) {
++ struct md_rdev *rdev = conf->mirrors[disk].rdev;
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ mbio->bi_end_io = raid10_end_discard_request;
+ mbio->bi_private = r10_bio;
+@@ -1790,6 +1795,7 @@ retry_discard:
+ bio_endio(mbio);
+ }
+ if (r10_bio->devs[disk].repl_bio) {
++ struct md_rdev *rrdev = conf->mirrors[disk].replacement;
+ rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ rbio->bi_end_io = raid10_end_discard_request;
+ rbio->bi_private = r10_bio;
diff --git a/queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch b/queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
new file mode 100644
index 00000000000..afac78ef61e
--- /dev/null
+++ b/queue-5.14/perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
@@ -0,0 +1,113 @@
+From 3f2cbe3810a60111a33f5f6267bd5a237b826fc9 Mon Sep 17 00:00:00 2001
+From: Alexander Antonov
+Date: Tue, 6 Jul 2021 12:07:23 +0300
+Subject: perf/x86/intel/uncore: Fix IIO cleanup mapping procedure for SNR/ICX
+
+From: Alexander Antonov
+
+commit 3f2cbe3810a60111a33f5f6267bd5a237b826fc9 upstream.
+
+skx_iio_cleanup_mapping() is re-used for snr and icx, but in those
+cases it fails to use the appropriate XXX_iio_mapping_group and as
+such fails to free previously allocated resources, leading to memory
+leaks.
+
+Fixes: 10337e95e04c ("perf/x86/intel/uncore: Enable I/O stacks to IIO PMON mapping on ICX")
+Signed-off-by: Alexander Antonov
+[peterz: Changelog]
+Signed-off-by: Peter Zijlstra (Intel)
+Reviewed-by: Kan Liang
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210706090723.41850-1-alexander.antonov@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/events/intel/uncore_snbep.c | 40 ++++++++++++++++++++++++------------
+ 1 file changed, 28 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/events/intel/uncore_snbep.c
++++ b/arch/x86/events/intel/uncore_snbep.c
+@@ -3838,26 +3838,32 @@ clear_attr_update:
+ return ret;
+ }
+
+-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+-{
+- return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+-}
+-
+-static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
++static void
++pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+ {
+- struct attribute **attr = skx_iio_mapping_group.attrs;
++ struct attribute **attr = ag->attrs;
+
+ if (!attr)
+ return;
+
+ for (; *attr; attr++)
+ kfree((*attr)->name);
+- kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
+- kfree(skx_iio_mapping_group.attrs);
+- skx_iio_mapping_group.attrs = NULL;
++ kfree(attr_to_ext_attr(*ag->attrs));
++ kfree(ag->attrs);
++ ag->attrs = NULL;
+ kfree(type->topology);
+ }
+
++static int skx_iio_set_mapping(struct intel_uncore_type *type)
++{
++ return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
++}
++
++static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++ pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
++}
++
+ static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+@@ -4501,6 +4507,11 @@ static int snr_iio_set_mapping(struct in
+ return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+ }
+
++static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++ pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
++}
++
+ static struct intel_uncore_type snr_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+@@ -4517,7 +4528,7 @@ static struct intel_uncore_type snr_unco
+ .attr_update = snr_iio_attr_update,
+ .get_topology = snr_iio_get_topology,
+ .set_mapping = snr_iio_set_mapping,
+- .cleanup_mapping = skx_iio_cleanup_mapping,
++ .cleanup_mapping = snr_iio_cleanup_mapping,
+ };
+
+ static struct intel_uncore_type snr_uncore_irp = {
+@@ -5092,6 +5103,11 @@ static int icx_iio_set_mapping(struct in
+ return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+ }
+
++static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
++{
++ pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
++}
++
+ static struct intel_uncore_type icx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+@@ -5109,7 +5125,7 @@ static struct intel_uncore_type icx_unco
+ .attr_update = icx_iio_attr_update,
+ .get_topology = icx_iio_get_topology,
+ .set_mapping = icx_iio_set_mapping,
+- .cleanup_mapping = skx_iio_cleanup_mapping,
++ .cleanup_mapping = icx_iio_cleanup_mapping,
+ };
+
+ static struct intel_uncore_type icx_uncore_irp = {
diff --git a/queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch b/queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch
new file mode 100644
index 00000000000..080320384b0
--- /dev/null
+++ b/queue-5.14/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch
@@ -0,0 +1,72 @@
+From e7177339d7b5f9594b316842122b5fda9513d5e2 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson
+Date: Tue, 31 Aug 2021 09:42:22 -0700
+Subject: Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
+
+From: Sean Christopherson
+
+commit e7177339d7b5f9594b316842122b5fda9513d5e2 upstream.
+
+Revert a misguided illegal GPA check when "translating" a non-nested GPA.
+The check is woefully incomplete as it does not fill in @exception as
+expected by all callers, which leads to KVM attempting to inject a bogus
+exception, potentially exposing kernel stack information in the process.
+
+ WARNING: CPU: 0 PID: 8469 at arch/x86/kvm/x86.c:525 exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525
+ CPU: 1 PID: 8469 Comm: syz-executor531 Not tainted 5.14.0-rc7-syzkaller #0
+ RIP: 0010:exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525
+ Call Trace:
+ x86_emulate_instruction+0xef6/0x1460 arch/x86/kvm/x86.c:7853
+ kvm_mmu_page_fault+0x2f0/0x1810 arch/x86/kvm/mmu/mmu.c:5199
+ handle_ept_misconfig+0xdf/0x3e0 arch/x86/kvm/vmx/vmx.c:5336
+ __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6021 [inline]
+ vmx_handle_exit+0x336/0x1800 arch/x86/kvm/vmx/vmx.c:6038
+ vcpu_enter_guest+0x2a1c/0x4430 arch/x86/kvm/x86.c:9712
+ vcpu_run arch/x86/kvm/x86.c:9779 [inline]
+ kvm_arch_vcpu_ioctl_run+0x47d/0x1b20 arch/x86/kvm/x86.c:10010
+ kvm_vcpu_ioctl+0x49e/0xe50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3652
+
+The bug has escaped notice because practically speaking the GPA check is
+useless. The GPA check in question only comes into play when KVM is
+walking guest page tables (or "translating" CR3), and KVM already handles
+illegal GPA checks by setting reserved bits in rsvd_bits_mask for each
+PxE, or in the case of CR3 for loading PTDPTRs, manually checks for an
+illegal CR3. This particular failure doesn't hit the existing reserved
+bits checks because syzbot sets guest.MAXPHYADDR=1, and IA32 architecture
+simply doesn't allow for such an absurd MAXPHYADDR, e.g. 32-bit paging
+doesn't define any reserved PA bits checks, which KVM emulates by only
+incorporating the reserved PA bits into the "high" bits, i.e. bits 63:32.
+
+Simply remove the bogus check. There is zero meaningful value and no
+architectural justification for supporting guest.MAXPHYADDR < 32, and
+properly filling the exception would introduce non-trivial complexity.
+
+This reverts commit ec7771ab471ba6a945350353617e2e3385d0e013.
+
+Fixes: ec7771ab471b ("KVM: x86: mmu: Add guest physical address check in translate_gpa()")
+Cc: stable@vger.kernel.org
+Reported-by: syzbot+200c08e88ae818f849ce@syzkaller.appspotmail.com
+Signed-off-by: Sean Christopherson
+Message-Id: <20210831164224.1119728-2-seanjc@google.com>
+Reviewed-by: Vitaly Kuznetsov
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/mmu/mmu.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -323,12 +323,6 @@ static bool check_mmio_spte(struct kvm_v
+ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+ {
+- /* Check if guest physical address doesn't exceed guest maximum */
+- if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
+- exception->error_code |= PFERR_RSVD_MASK;
+- return UNMAPPED_GVA;
+- }
+-
+ return gpa;
+ }
+
diff --git a/queue-5.14/series b/queue-5.14/series
index 5508e0caff0..957b327ecf4 100644
--- a/queue-5.14/series
+++ b/queue-5.14/series
@@ -306,3 +306,22 @@ raid1-ensure-write-behind-bio-has-less-than-bio_max_vecs-sectors.patch
 cifs-do-not-leak-edeadlk-to-dgetents64-for-status_user_session_deleted.patch
 smb3-fix-posix-extensions-mount-option.patch
 tty-fix-data-race-between-tiocsti-and-flush_to_ldisc.patch
+perf-x86-intel-uncore-fix-iio-cleanup-mapping-procedure-for-snr-icx.patch
+revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch
+kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch
+kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch
+kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch
+kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch
+kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch
+kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch
+kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch
+kvm-arm64-vgic-resample-hw-pending-state-on-deactivation.patch
+arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch
+io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch
+md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch
+char-tpm-kconfig-remove-bad-i2c-cr50-select.patch
+fuse-truncate-pagecache-on-atomic_o_trunc.patch
+fuse-flush-extending-writes.patch
+fuse-wait-for-writepages-in-syncfs.patch
+ima-remove-wmissing-prototypes-warning.patch
+ima-remove-the-dependency-on-crypto_md5.patch