From: Greg Kroah-Hartman Date: Mon, 13 Sep 2021 11:36:21 +0000 (+0200) Subject: 5.13-stable patches X-Git-Tag: v5.4.146~13 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d1b8330e86cb823c98f77a5edfdc4fb764187a8d;p=thirdparty%2Fkernel%2Fstable-queue.git 5.13-stable patches added patches: arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch char-tpm-kconfig-remove-bad-i2c-cr50-select.patch fuse-flush-extending-writes.patch fuse-truncate-pagecache-on-atomic_o_trunc.patch fuse-wait-for-writepages-in-syncfs.patch ima-remove-the-dependency-on-crypto_md5.patch ima-remove-wmissing-prototypes-warning.patch io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch perf-x86-amd-ibs-extend-perf_pmu_cap_no_exclude-to-ibs-op.patch revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch x86-efi-restore-firmware-idt-before-calling-exitbootservices.patch x86-resctrl-fix-a-maybe-uninitialized-build-warning-treated-as-error.patch --- diff --git a/queue-5.13/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch b/queue-5.13/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch new file mode 100644 index 00000000000..3acb65df343 --- /dev/null +++ b/queue-5.13/arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch @@ -0,0 +1,189 @@ +From bf781869e5cf3e4ec1a47dad69b6f0df97629cbd Mon Sep 17 00:00:00 2001 +From: Claudiu Beznea +Date: Tue, 27 Jul 2021 10:40:05 +0300 +Subject: ARM: dts: at91: add pinctrl-{names, 0} for all gpios + +From: Claudiu Beznea + +commit bf781869e5cf3e4ec1a47dad69b6f0df97629cbd upstream. + +Add pinctrl-names and pinctrl-0 properties on controllers that claims to +use pins to avoid failures due to +commit 2ab73c6d8323 ("gpio: Support GPIO controllers without pin-ranges") +and also to avoid using pins that may be claimed my other IPs. + +Fixes: b7c2b6157079 ("ARM: at91: add Atmel's SAMA5D3 Xplained board") +Fixes: 1e5f532c2737 ("ARM: dts: at91: sam9x60: add device tree for soc and board") +Fixes: 38153a017896 ("ARM: at91/dt: sama5d4: add dts for sama5d4 xplained board") +Signed-off-by: Claudiu Beznea +Signed-off-by: Nicolas Ferre +Link: https://lore.kernel.org/r/20210727074006.1609989-1-claudiu.beznea@microchip.com +Cc: # v5.7+ +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/boot/dts/at91-sam9x60ek.dts | 16 ++++++++++++++- + arch/arm/boot/dts/at91-sama5d3_xplained.dts | 29 ++++++++++++++++++++++++++++ + arch/arm/boot/dts/at91-sama5d4_xplained.dts | 19 ++++++++++++++++++ + 3 files changed, 63 insertions(+), 1 deletion(-) + +--- a/arch/arm/boot/dts/at91-sam9x60ek.dts ++++ b/arch/arm/boot/dts/at91-sam9x60ek.dts +@@ -92,6 +92,8 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; + status = "okay"; /* Conflict with pwm0. 
*/ + + red { +@@ -537,6 +539,10 @@ + AT91_PIOA 19 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI) /* PA19 DAT2 periph A with pullup */ + AT91_PIOA 20 AT91_PERIPH_A (AT91_PINCTRL_PULL_UP | AT91_PINCTRL_DRIVE_STRENGTH_HI)>; /* PA20 DAT3 periph A with pullup */ + }; ++ pinctrl_sdmmc0_cd: sdmmc0_cd { ++ atmel,pins = ++ ; ++ }; + }; + + sdmmc1 { +@@ -569,6 +575,14 @@ + AT91_PIOD 16 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>; + }; + }; ++ ++ leds { ++ pinctrl_gpio_leds: gpio_leds { ++ atmel,pins = ; ++ }; ++ }; + }; /* pinctrl */ + + &pwm0 { +@@ -580,7 +594,7 @@ + &sdmmc0 { + bus-width = <4>; + pinctrl-names = "default"; +- pinctrl-0 = <&pinctrl_sdmmc0_default>; ++ pinctrl-0 = <&pinctrl_sdmmc0_default &pinctrl_sdmmc0_cd>; + status = "okay"; + cd-gpios = <&pioA 23 GPIO_ACTIVE_LOW>; + disable-wp; +--- a/arch/arm/boot/dts/at91-sama5d3_xplained.dts ++++ b/arch/arm/boot/dts/at91-sama5d3_xplained.dts +@@ -57,6 +57,8 @@ + }; + + spi0: spi@f0004000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi0_cs>; + cs-gpios = <&pioD 13 0>, <0>, <0>, <&pioD 16 0>; + status = "okay"; + }; +@@ -169,6 +171,8 @@ + }; + + spi1: spi@f8008000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi1_cs>; + cs-gpios = <&pioC 25 0>; + status = "okay"; + }; +@@ -248,6 +252,26 @@ + ; + }; ++ ++ pinctrl_gpio_leds: gpio_leds_default { ++ atmel,pins = ++ ; ++ }; ++ ++ pinctrl_spi0_cs: spi0_cs_default { ++ atmel,pins = ++ ; ++ }; ++ ++ pinctrl_spi1_cs: spi1_cs_default { ++ atmel,pins = ; ++ }; ++ ++ pinctrl_vcc_mmc0_reg_gpio: vcc_mmc0_reg_gpio_default { ++ atmel,pins = ; ++ }; + }; + }; + }; +@@ -339,6 +363,8 @@ + + vcc_mmc0_reg: fixedregulator_mmc0 { + compatible = "regulator-fixed"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_vcc_mmc0_reg_gpio>; + gpio = <&pioE 2 GPIO_ACTIVE_LOW>; + regulator-name = "mmc0-card-supply"; + regulator-min-microvolt = <3300000>; +@@ -362,6 +388,9 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; ++ status = "okay"; + + d2 { + label = "d2"; +--- a/arch/arm/boot/dts/at91-sama5d4_xplained.dts ++++ b/arch/arm/boot/dts/at91-sama5d4_xplained.dts +@@ -90,6 +90,8 @@ + }; + + spi1: spi@fc018000 { ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_spi0_cs>; + cs-gpios = <&pioB 21 0>; + status = "okay"; + }; +@@ -147,6 +149,19 @@ + atmel,pins = + ; + }; ++ pinctrl_spi0_cs: spi0_cs_default { ++ atmel,pins = ++ ; ++ }; ++ pinctrl_gpio_leds: gpio_leds_default { ++ atmel,pins = ++ ; ++ }; ++ pinctrl_vcc_mmc1_reg: vcc_mmc1_reg { ++ atmel,pins = ++ ; ++ }; + }; + }; + }; +@@ -252,6 +267,8 @@ + + leds { + compatible = "gpio-leds"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_gpio_leds>; + status = "okay"; + + d8 { +@@ -278,6 +295,8 @@ + + vcc_mmc1_reg: fixedregulator_mmc1 { + compatible = "regulator-fixed"; ++ pinctrl-names = "default"; ++ pinctrl-0 = <&pinctrl_vcc_mmc1_reg>; + gpio = <&pioE 4 GPIO_ACTIVE_LOW>; + regulator-name = "VDD MCI1"; + regulator-min-microvolt = <3300000>; diff --git a/queue-5.13/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch b/queue-5.13/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch new file mode 100644 index 00000000000..18cafc45e41 --- /dev/null +++ b/queue-5.13/char-tpm-kconfig-remove-bad-i2c-cr50-select.patch @@ -0,0 +1,33 @@ +From 847fdae1579f4ee930b01f24a7847b8043bf468c Mon Sep 17 00:00:00 2001 +From: Adrian Ratiu +Date: Tue, 27 Jul 2021 20:13:12 +0300 +Subject: char: tpm: Kconfig: remove bad i2c cr50 select + +From: Adrian Ratiu + +commit 
847fdae1579f4ee930b01f24a7847b8043bf468c upstream. + +This fixes a minor bug which went unnoticed during the initial +driver upstreaming review: TCG_CR50 does not exist in mainline +kernels, so remove it. + +Fixes: 3a253caaad11 ("char: tpm: add i2c driver for cr50") +Cc: stable@vger.kernel.org +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Adrian Ratiu +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/char/tpm/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/char/tpm/Kconfig ++++ b/drivers/char/tpm/Kconfig +@@ -89,7 +89,6 @@ config TCG_TIS_SYNQUACER + config TCG_TIS_I2C_CR50 + tristate "TPM Interface Specification 2.0 Interface (I2C - CR50)" + depends on I2C +- select TCG_CR50 + help + This is a driver for the Google cr50 I2C TPM interface which is a + custom microcontroller and requires a custom i2c protocol interface diff --git a/queue-5.13/fuse-flush-extending-writes.patch b/queue-5.13/fuse-flush-extending-writes.patch new file mode 100644 index 00000000000..b18abd3853d --- /dev/null +++ b/queue-5.13/fuse-flush-extending-writes.patch @@ -0,0 +1,49 @@ +From 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Tue, 31 Aug 2021 14:18:08 +0200 +Subject: fuse: flush extending writes + +From: Miklos Szeredi + +commit 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd upstream. + +Callers of fuse_writeback_range() assume that the file is ready for +modification by the server in the supplied byte range after the call +returns. + +If there's a write that extends the file beyond the end of the supplied +range, then the file needs to be extended to at least the end of the range, +but currently that's not done. + +There are at least two cases where this can cause problems: + + - copy_file_range() will return short count if the file is not extended + up to end of the source range. + + - FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE will not extend the file, + hence the region may not be fully allocated. + +Fix by flushing writes from the start of the range up to the end of the +file. This could be optimized if the writes are non-extending, etc, but +it's probably not worth the trouble. + +Fixes: a2bc92362941 ("fuse: fix copy_file_range() in the writeback case") +Fixes: 6b1bdb56b17c ("fuse: allow fallocate(FALLOC_FL_ZERO_RANGE)") +Cc: # v5.2 +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -2886,7 +2886,7 @@ fuse_direct_IO(struct kiocb *iocb, struc + + static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) + { +- int err = filemap_write_and_wait_range(inode->i_mapping, start, end); ++ int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + + if (!err) + fuse_sync_writes(inode); diff --git a/queue-5.13/fuse-truncate-pagecache-on-atomic_o_trunc.patch b/queue-5.13/fuse-truncate-pagecache-on-atomic_o_trunc.patch new file mode 100644 index 00000000000..04fd413702f --- /dev/null +++ b/queue-5.13/fuse-truncate-pagecache-on-atomic_o_trunc.patch @@ -0,0 +1,58 @@ +From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Tue, 17 Aug 2021 21:05:16 +0200 +Subject: fuse: truncate pagecache on atomic_o_trunc + +From: Miklos Szeredi + +commit 76224355db7570cbe6b6f75c8929a1558828dd55 upstream. + +fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic +O_TRUNC. 
This can deadlock with fuse_wait_on_page_writeback() in +fuse_launder_page() triggered by invalidate_inode_pages2(). + +Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a +truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE +or fc->writeback cache, so do it unconditionally. + +Reported-by: Xie Yongji +Reported-and-tested-by: syzbot+bea44a5189836d956894@syzkaller.appspotmail.com +Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") +Cc: +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inod + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + +- if (!(ff->open_flags & FOPEN_KEEP_CACHE)) +- invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); ++ + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + +@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inod + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); ++ truncate_pagecache(inode, 0); + fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); ++ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { ++ invalidate_inode_pages2(inode->i_mapping); + } ++ + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); + } diff --git a/queue-5.13/fuse-wait-for-writepages-in-syncfs.patch b/queue-5.13/fuse-wait-for-writepages-in-syncfs.patch new file mode 100644 index 00000000000..15f09799c2d --- /dev/null +++ b/queue-5.13/fuse-wait-for-writepages-in-syncfs.patch @@ -0,0 +1,242 @@ +From 660585b56e63ca034ad506ea53c807c5cdca3196 Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Wed, 1 Sep 2021 12:39:02 +0200 +Subject: fuse: wait for writepages in syncfs + +From: Miklos Szeredi + +commit 660585b56e63ca034ad506ea53c807c5cdca3196 upstream. + +In case of fuse the MM subsystem doesn't guarantee that page writeback +completes by the time ->sync_fs() is called. This is because fuse +completes page writeback immediately to prevent DoS of memory reclaim by +the userspace file server. + +This means that fuse itself must ensure that writes are synced before +sending the SYNCFS request to the server. + +Introduce sync buckets, that hold a counter for the number of outstanding +write requests. On syncfs replace the current bucket with a new one and +wait until the old bucket's counter goes down to zero. + +It is possible to have multiple syncfs calls in parallel, in which case +there could be more than one waited-on buckets. Descendant buckets must +not complete until the parent completes. Add a count to the child (new) +bucket until the (parent) old bucket completes. + +Use RCU protection to dereference the current bucket and to wake up an +emptied bucket. Use fc->lock to protect against parallel assignments to +the current bucket. + +This leaves just the counter to be a possible scalability issue. The +fc->num_waiting counter has a similar issue, so both should be addressed at +the same time. 
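+
+In simplified form (locking, RCU details and the fast path for the
+case of no outstanding writes are elided; see the diff below),
+fuse_sync_fs_writes() amounts to:
+
+	new_bucket = fuse_sync_bucket_alloc();	/* starts with count 1 */
+	bucket = fc->curr_bucket;
+	atomic_inc(&new_bucket->count);		/* new waits on old */
+	rcu_assign_pointer(fc->curr_bucket, new_bucket);
+	atomic_dec(&bucket->count);		/* drop initial count */
+	wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+	fuse_sync_bucket_dec(new_bucket);	/* drop temporary count */
+	kfree_rcu(bucket, rcu);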
+ +Reported-by: Amir Goldstein +Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server") +Cc: # v5.14 +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 21 +++++++++++++++++++ + fs/fuse/fuse_i.h | 19 +++++++++++++++++ + fs/fuse/inode.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 100 insertions(+) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -392,6 +392,7 @@ struct fuse_writepage_args { + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; ++ struct fuse_sync_bucket *bucket; + }; + + static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, +@@ -1613,6 +1614,9 @@ static void fuse_writepage_free(struct f + struct fuse_args_pages *ap = &wpa->ia.ap; + int i; + ++ if (wpa->bucket) ++ fuse_sync_bucket_dec(wpa->bucket); ++ + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); + +@@ -1876,6 +1880,20 @@ static struct fuse_writepage_args *fuse_ + + } + ++static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, ++ struct fuse_writepage_args *wpa) ++{ ++ if (!fc->sync_fs) ++ return; ++ ++ rcu_read_lock(); ++ /* Prevent resurrection of dead bucket in unlikely race with syncfs */ ++ do { ++ wpa->bucket = rcu_dereference(fc->curr_bucket); ++ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); ++ rcu_read_unlock(); ++} ++ + static int fuse_writepage_locked(struct page *page) + { + struct address_space *mapping = page->mapping; +@@ -1903,6 +1921,7 @@ static int fuse_writepage_locked(struct + if (!wpa->ia.ff) + goto err_nofile; + ++ fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); +@@ -2153,6 +2172,8 @@ static int fuse_writepages_fill(struct p + __free_page(tmp_page); + goto out_unlock; + } ++ fuse_writepage_add_to_bucket(fc, wpa); ++ + data->max_pages = 1; + + ap = &wpa->ia.ap; +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -515,6 +515,13 @@ struct fuse_fs_context { + void **fudptr; + }; + ++struct fuse_sync_bucket { ++ /* count is a possible scalability bottleneck */ ++ atomic_t count; ++ wait_queue_head_t waitq; ++ struct rcu_head rcu; ++}; ++ + /** + * A Fuse connection. 
+ * +@@ -807,6 +814,9 @@ struct fuse_conn { + + /** List of filesystems using this connection */ + struct list_head mounts; ++ ++ /* New writepages go into this bucket */ ++ struct fuse_sync_bucket __rcu *curr_bucket; + }; + + /* +@@ -910,6 +920,15 @@ static inline void fuse_page_descs_lengt + descs[i].length = PAGE_SIZE - descs[i].offset; + } + ++static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) ++{ ++ /* Need RCU protection to prevent use after free after the decrement */ ++ rcu_read_lock(); ++ if (atomic_dec_and_test(&bucket->count)) ++ wake_up(&bucket->waitq); ++ rcu_read_unlock(); ++} ++ + /** Device operations */ + extern const struct file_operations fuse_dev_operations; + +--- a/fs/fuse/inode.c ++++ b/fs/fuse/inode.c +@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *de + return err; + } + ++static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) ++{ ++ struct fuse_sync_bucket *bucket; ++ ++ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); ++ if (bucket) { ++ init_waitqueue_head(&bucket->waitq); ++ /* Initial active count */ ++ atomic_set(&bucket->count, 1); ++ } ++ return bucket; ++} ++ ++static void fuse_sync_fs_writes(struct fuse_conn *fc) ++{ ++ struct fuse_sync_bucket *bucket, *new_bucket; ++ int count; ++ ++ new_bucket = fuse_sync_bucket_alloc(); ++ spin_lock(&fc->lock); ++ bucket = rcu_dereference_protected(fc->curr_bucket, 1); ++ count = atomic_read(&bucket->count); ++ WARN_ON(count < 1); ++ /* No outstanding writes? */ ++ if (count == 1) { ++ spin_unlock(&fc->lock); ++ kfree(new_bucket); ++ return; ++ } ++ ++ /* ++ * Completion of new bucket depends on completion of this bucket, so add ++ * one more count. ++ */ ++ atomic_inc(&new_bucket->count); ++ rcu_assign_pointer(fc->curr_bucket, new_bucket); ++ spin_unlock(&fc->lock); ++ /* ++ * Drop initial active count. At this point if all writes in this and ++ * ancestor buckets complete, the count will go to zero and this task ++ * will be woken up. 
++ */ ++ atomic_dec(&bucket->count); ++ ++ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); ++ ++ /* Drop temp count on descendant bucket */ ++ fuse_sync_bucket_dec(new_bucket); ++ kfree_rcu(bucket, rcu); ++} ++ + static int fuse_sync_fs(struct super_block *sb, int wait) + { + struct fuse_mount *fm = get_fuse_mount_super(sb); +@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_blo + if (!fc->sync_fs) + return 0; + ++ fuse_sync_fs_writes(fc); ++ + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); +@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc) + { + if (refcount_dec_and_test(&fc->count)) { + struct fuse_iqueue *fiq = &fc->iq; ++ struct fuse_sync_bucket *bucket; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); +@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); ++ bucket = rcu_dereference_protected(fc->curr_bucket, 1); ++ if (bucket) { ++ WARN_ON(atomic_read(&bucket->count) != 1); ++ kfree(bucket); ++ } + fc->release(fc); + } + } +@@ -1366,6 +1425,7 @@ int fuse_fill_super_common(struct super_ + if (sb->s_flags & SB_MANDLOCK) + goto err; + ++ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); + fuse_sb_defaults(sb); + + if (ctx->is_bdev) { diff --git a/queue-5.13/ima-remove-the-dependency-on-crypto_md5.patch b/queue-5.13/ima-remove-the-dependency-on-crypto_md5.patch new file mode 100644 index 00000000000..d8acf2e5ca7 --- /dev/null +++ b/queue-5.13/ima-remove-the-dependency-on-crypto_md5.patch @@ -0,0 +1,45 @@ +From 8510505d55e194d3f6c9644c9f9d12c4f6b0395a Mon Sep 17 00:00:00 2001 +From: THOBY Simon +Date: Mon, 16 Aug 2021 08:10:59 +0000 +Subject: IMA: remove the dependency on CRYPTO_MD5 + +From: THOBY Simon + +commit 8510505d55e194d3f6c9644c9f9d12c4f6b0395a upstream. + +MD5 is a weak digest algorithm that shouldn't be used for cryptographic +operation. It hinders the efficiency of a patch set that aims to limit +the digests allowed for the extended file attribute namely security.ima. +MD5 is no longer a requirement for IMA, nor should it be used there. + +The sole place where we still use the MD5 algorithm inside IMA is setting +the ima_hash algorithm to MD5, if the user supplies 'ima_hash=md5' +parameter on the command line. With commit ab60368ab6a4 ("ima: Fallback +to the builtin hash algorithm"), setting "ima_hash=md5" fails gracefully +when CRYPTO_MD5 is not set: + ima: Can not allocate md5 (reason: -2) + ima: Allocating md5 failed, going to use default hash algorithm sha256 + +Remove the CRYPTO_MD5 dependency for IMA. + +Signed-off-by: THOBY Simon +Reviewed-by: Lakshmi Ramasubramanian +[zohar@linux.ibm.com: include commit number in patch description for +stable.] 
+Cc: stable@vger.kernel.org # 4.17 +Signed-off-by: Mimi Zohar +Signed-off-by: Greg Kroah-Hartman +--- + security/integrity/ima/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/security/integrity/ima/Kconfig ++++ b/security/integrity/ima/Kconfig +@@ -6,7 +6,6 @@ config IMA + select SECURITYFS + select CRYPTO + select CRYPTO_HMAC +- select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_HASH_INFO + select TCG_TPM if HAS_IOMEM && !UML diff --git a/queue-5.13/ima-remove-wmissing-prototypes-warning.patch b/queue-5.13/ima-remove-wmissing-prototypes-warning.patch new file mode 100644 index 00000000000..4464b563fdb --- /dev/null +++ b/queue-5.13/ima-remove-wmissing-prototypes-warning.patch @@ -0,0 +1,40 @@ +From a32ad90426a9c8eb3915eed26e08ce133bd9e0da Mon Sep 17 00:00:00 2001 +From: Austin Kim +Date: Tue, 29 Jun 2021 14:50:50 +0100 +Subject: IMA: remove -Wmissing-prototypes warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Austin Kim + +commit a32ad90426a9c8eb3915eed26e08ce133bd9e0da upstream. + +With W=1 build, the compiler throws warning message as below: + + security/integrity/ima/ima_mok.c:24:12: warning: + no previous prototype for ‘ima_mok_init’ [-Wmissing-prototypes] + __init int ima_mok_init(void) + +Silence the warning by adding static keyword to ima_mok_init(). + +Signed-off-by: Austin Kim +Fixes: 41c89b64d718 ("IMA: create machine owner and blacklist keyrings") +Cc: stable@vger.kernel.org +Signed-off-by: Mimi Zohar +Signed-off-by: Greg Kroah-Hartman +--- + security/integrity/ima/ima_mok.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/security/integrity/ima/ima_mok.c ++++ b/security/integrity/ima/ima_mok.c +@@ -21,7 +21,7 @@ struct key *ima_blacklist_keyring; + /* + * Allocate the IMA blacklist keyring + */ +-__init int ima_mok_init(void) ++static __init int ima_mok_init(void) + { + struct key_restriction *restriction; + diff --git a/queue-5.13/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch b/queue-5.13/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch new file mode 100644 index 00000000000..feb447f1a85 --- /dev/null +++ b/queue-5.13/io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch @@ -0,0 +1,104 @@ +From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sun, 29 Aug 2021 16:13:03 -0600 +Subject: io-wq: check max_worker limits if a worker transitions bound state + +From: Jens Axboe + +commit ecc53c48c13d995e6fe5559e30ffee48d92784fd upstream. + +For the two places where new workers are created, we diligently check if +we are allowed to create a new worker. If we're currently at the limit +of how many workers of a given type we can have, then we don't create +any new ones. + +If you have a mixed workload with various types of bound and unbounded +work, then it can happen that a worker finishes one type of work and +is then transitioned to the other type. For this case, we don't check +if we are actually allowed to do so. This can cause io-wq to temporarily +exceed the allowed number of workers for a given type. + +When retrieving work, check that the types match. If they don't, check +if we are allowed to transition to the other type. If not, then don't +handle the new work. 
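+
+The rule is captured by the new helper io_worker_can_run_work() in the
+diff below, which boils down to:
+
+	/* same type (bound vs unbound): always allowed */
+	if (!(worker->flags & IO_WORKER_F_BOUND) !=
+	    !(work->flags & IO_WQ_WORK_UNBOUND))
+		return true;
+	/* transition: only if the target type still has headroom */
+	acct = io_work_get_acct(worker->wqe, work);
+	return acct->nr_workers < acct->max_workers;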
+ +Cc: stable@vger.kernel.org +Reported-by: Johannes Lundberg +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io-wq.c | 33 ++++++++++++++++++++++++++++++--- + 1 file changed, 30 insertions(+), 3 deletions(-) + +--- a/fs/io-wq.c ++++ b/fs/io-wq.c +@@ -424,7 +424,28 @@ static void io_wait_on_hash(struct io_wq + spin_unlock(&wq->hash->wait.lock); + } + +-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) ++/* ++ * We can always run the work if the worker is currently the same type as ++ * the work (eg both are bound, or both are unbound). If they are not the ++ * same, only allow it if incrementing the worker count would be allowed. ++ */ ++static bool io_worker_can_run_work(struct io_worker *worker, ++ struct io_wq_work *work) ++{ ++ struct io_wqe_acct *acct; ++ ++ if (!(worker->flags & IO_WORKER_F_BOUND) != ++ !(work->flags & IO_WQ_WORK_UNBOUND)) ++ return true; ++ ++ /* not the same type, check if we'd go over the limit */ ++ acct = io_work_get_acct(worker->wqe, work); ++ return acct->nr_workers < acct->max_workers; ++} ++ ++static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, ++ struct io_worker *worker, ++ bool *stalled) + __must_hold(wqe->lock) + { + struct io_wq_work_node *node, *prev; +@@ -436,6 +457,9 @@ static struct io_wq_work *io_get_next_wo + + work = container_of(node, struct io_wq_work, list); + ++ if (!io_worker_can_run_work(worker, work)) ++ break; ++ + /* not hashed, can run anytime */ + if (!io_wq_is_hashed(work)) { + wq_list_del(&wqe->work_list, node, prev); +@@ -462,6 +486,7 @@ static struct io_wq_work *io_get_next_wo + raw_spin_unlock(&wqe->lock); + io_wait_on_hash(wqe, stall_hash); + raw_spin_lock(&wqe->lock); ++ *stalled = true; + } + + return NULL; +@@ -501,6 +526,7 @@ static void io_worker_handle_work(struct + + do { + struct io_wq_work *work; ++ bool stalled; + get_next: + /* + * If we got some work, mark us as busy. If we didn't, but +@@ -509,10 +535,11 @@ get_next: + * can't make progress, any work completion or insertion will + * clear the stalled flag. + */ +- work = io_get_next_work(wqe); ++ stalled = false; ++ work = io_get_next_work(wqe, worker, &stalled); + if (work) + __io_worker_busy(wqe, worker, work); +- else if (!wq_list_empty(&wqe->work_list)) ++ else if (stalled) + wqe->flags |= IO_WQE_FLAG_STALLED; + + raw_spin_unlock_irq(&wqe->lock); diff --git a/queue-5.13/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch b/queue-5.13/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch new file mode 100644 index 00000000000..2324c2f37e0 --- /dev/null +++ b/queue-5.13/kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch @@ -0,0 +1,54 @@ +From 47e6223c841e029bfc23c3ce594dac5525cebaf8 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Mon, 2 Aug 2021 13:38:30 +0100 +Subject: KVM: arm64: Unregister HYP sections from kmemleak in protected mode + +From: Marc Zyngier + +commit 47e6223c841e029bfc23c3ce594dac5525cebaf8 upstream. + +Booting a KVM host in protected mode with kmemleak quickly results +in a pretty bad crash, as kmemleak doesn't know that the HYP sections +have been taken away. This is specially true for the BSS section, +which is part of the kernel BSS section and registered at boot time +by kmemleak itself. + +Unregister the HYP part of the BSS before making that section +HYP-private. The rest of the HYP-specific data is obtained via +the page allocator or lives in other sections, none of which is +subjected to kmemleak. 
+ +Fixes: 90134ac9cabb ("KVM: arm64: Protect the .hyp sections from the host") +Reviewed-by: Quentin Perret +Reviewed-by: Catalin Marinas +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org # 5.13 +Link: https://lore.kernel.org/r/20210802123830.2195174-3-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/arm.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1957,6 +1958,12 @@ static int finalize_hyp_mode(void) + if (ret) + return ret; + ++ /* ++ * Exclude HYP BSS from kmemleak so that it doesn't get peeked ++ * at, which would end badly once the section is inaccessible. ++ * None of other sections should ever be introspected. ++ */ ++ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + ret = pkvm_mark_hyp_section(__hyp_bss); + if (ret) + return ret; diff --git a/queue-5.13/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch b/queue-5.13/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch new file mode 100644 index 00000000000..0855571e720 --- /dev/null +++ b/queue-5.13/kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch @@ -0,0 +1,60 @@ +From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 10 Aug 2021 07:45:26 -0700 +Subject: KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter + +From: Sean Christopherson + +commit f7782bb8d818d8f47c26b22079db10599922787a upstream. + +Clear nested.pi_pending on nested VM-Enter even if L2 will run without +posted interrupts enabled. If nested.pi_pending is left set from a +previous L2, vmx_complete_nested_posted_interrupt() will pick up the +stale flag and exit to userspace with an "internal emulation error" due +the new L2 not having a valid nested.pi_desc. + +Arguably, vmx_complete_nested_posted_interrupt() should first check for +posted interrupts being enabled, but it's also completely reasonable that +KVM wouldn't screw up a fundamental flag. Not to mention that the mere +existence of nested.pi_pending is a long-standing bug as KVM shouldn't +move the posted interrupt out of the IRR until it's actually processed, +e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit +with a "pending" posted interrupt. Fixing the mess is a future problem. + +Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI +descriptor as an error, this was a benign bug as the null PI descriptor +effectively served as a check on PI not being enabled. Even then, the +new flow did not become problematic until KVM started checking the result +of kvm_check_nested_events(). 
+ +Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") +Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable") +Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails") +Cc: stable@vger.kernel.org +Cc: Jim Mattson +Signed-off-by: Sean Christopherson +Message-Id: <20210810144526.2662272-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2226,12 +2226,11 @@ static void prepare_vmcs02_early(struct + ~PIN_BASED_VMX_PREEMPTION_TIMER); + + /* Posted interrupts setting is only taken from vmcs12. */ +- if (nested_cpu_has_posted_intr(vmcs12)) { ++ vmx->nested.pi_pending = false; ++ if (nested_cpu_has_posted_intr(vmcs12)) + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; +- vmx->nested.pi_pending = false; +- } else { ++ else + exec_control &= ~PIN_BASED_POSTED_INTR; +- } + pin_controls_set(vmx, exec_control); + + /* diff --git a/queue-5.13/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch b/queue-5.13/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch new file mode 100644 index 00000000000..ae66001ce58 --- /dev/null +++ b/queue-5.13/kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch @@ -0,0 +1,122 @@ +From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001 +From: Halil Pasic +Date: Fri, 27 Aug 2021 14:54:29 +0200 +Subject: KVM: s390: index kvm->arch.idle_mask by vcpu_idx +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Halil Pasic + +commit a3e03bc1368c1bc16e19b001fc96dc7430573cc8 upstream. + +While in practice vcpu->vcpu_idx == vcpu->vcp_id is often true, it may +not always be, and we must not rely on this. Reason is that KVM decides +the vcpu_idx, userspace decides the vcpu_id, thus the two might not +match. + +Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies +that code like +for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_id); + do_stuff(vcpu); +} +is not legit. Reason is that kvm_get_vcpu expects an vcpu_idx, not an +vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like +this. To fix this problem we have two options. Either use +kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id, +or switch to indexing via vcpu_idx. The latter is preferable for obvious +reasons. + +Let us make switch from indexing kvm->arch.idle_mask by vcpu_id to +indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the +same index as idle_mask lets make the same change for it as well. 
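+
+With that switch the loop pattern quoted above becomes legitimate:
+
+for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+	vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+	do_stuff(vcpu);
+}
+
+as kvm_get_vcpu() is now handed the vcpu_idx it expects.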
+ +Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array") +Signed-off-by: Halil Pasic +Reviewed-by: Christian Bornträger +Reviewed-by: Claudio Imbrenda +Cc: # 3.15+ +Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com +Signed-off-by: Christian Borntraeger +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/include/asm/kvm_host.h | 1 + + arch/s390/kvm/interrupt.c | 12 ++++++------ + arch/s390/kvm/kvm-s390.c | 2 +- + arch/s390/kvm/kvm-s390.h | 2 +- + 4 files changed, 9 insertions(+), 8 deletions(-) + +--- a/arch/s390/include/asm/kvm_host.h ++++ b/arch/s390/include/asm/kvm_host.h +@@ -962,6 +962,7 @@ struct kvm_arch{ + atomic64_t cmma_dirty_pages; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); ++ /* indexed by vcpu_idx */ + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; +--- a/arch/s390/kvm/interrupt.c ++++ b/arch/s390/kvm/interrupt.c +@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(st + static void __set_cpu_idle(struct kvm_vcpu *vcpu) + { + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); +- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static void __unset_cpu_idle(struct kvm_vcpu *vcpu) + { + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); +- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) +@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vc + + static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) + { +- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); ++ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + +- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { +- vcpu = kvm_get_vcpu(kvm, vcpu_id); ++ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { ++ vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (psw_ioint_disabled(vcpu)) + continue; + deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask) { + /* lately kicked but not yet running */ +- if (test_and_set_bit(vcpu_id, gi->kicked_mask)) ++ if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -4020,7 +4020,7 @@ static int vcpu_pre_run(struct kvm_vcpu + kvm_s390_patch_guest_per_regs(vcpu); + } + +- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); ++ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); + + vcpu->arch.sie_block->icptcode = 0; + cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); +--- a/arch/s390/kvm/kvm-s390.h ++++ b/arch/s390/kvm/kvm-s390.h +@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct + + static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) + { +- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); ++ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + } + + static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/queue-5.13/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch b/queue-5.13/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch new file mode 100644 index 00000000000..9ccd5ebd2b9 --- /dev/null +++ 
b/queue-5.13/kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch @@ -0,0 +1,34 @@ +From 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Aug 2021 12:57:49 +0300 +Subject: KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation + +From: Maxim Levitsky + +commit 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 upstream. + +If we are emulating an invalid guest state, we don't have a correct +exit reason, and thus we shouldn't do anything in this function. + +Signed-off-by: Maxim Levitsky +Message-Id: <20210826095750.1650467-2-mlevitsk@redhat.com> +Cc: stable@vger.kernel.org +Fixes: 95b5a48c4f2b ("KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn", 2019-06-18) +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/vmx.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6452,6 +6452,9 @@ static void vmx_handle_exit_irqoff(struc + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + ++ if (vmx->emulation_required) ++ return; ++ + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu); + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) diff --git a/queue-5.13/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch b/queue-5.13/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch new file mode 100644 index 00000000000..1e7b967c816 --- /dev/null +++ b/queue-5.13/kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch @@ -0,0 +1,74 @@ +From ec607a564f70519b340f7eb4cfc0f4a6b55285ac Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 6 Aug 2021 07:05:58 -0400 +Subject: KVM: x86: clamp host mapping level to max_level in kvm_mmu_max_mapping_level + +From: Paolo Bonzini + +commit ec607a564f70519b340f7eb4cfc0f4a6b55285ac upstream. + +This change started as a way to make kvm_mmu_hugepage_adjust a bit simpler, +but it does fix two bugs as well. + +One bug is in zapping collapsible PTEs. If a large page size is +disallowed but not all of them, kvm_mmu_max_mapping_level will return the +host mapping level and the small PTEs will be zapped up to that level. +However, if e.g. 1GB are prohibited, we can still zap 4KB mapping and +preserve the 2MB ones. This can happen for example when NX huge pages +are in use. + +The second would happen when userspace backs guest memory +with a 1gb hugepage but only assign a subset of the page to +the guest. 1gb pages would be disallowed by the memslot, but +not 2mb. kvm_mmu_max_mapping_level() would fall through to the +host_pfn_mapping_level() logic, see the 1gb hugepage, and map the whole +thing into the guest. 
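+
+The resulting kvm_mmu_max_mapping_level() (see the diff below) simply
+clamps the host-derived level:
+
+	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+	return min(host_level, max_level);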
+ +Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2754,6 +2754,7 @@ int kvm_mmu_max_mapping_level(struct kvm + kvm_pfn_t pfn, int max_level) + { + struct kvm_lpage_info *linfo; ++ int host_level; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { +@@ -2765,7 +2766,8 @@ int kvm_mmu_max_mapping_level(struct kvm + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + +- return host_pfn_mapping_level(kvm, gfn, pfn, slot); ++ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); ++ return min(host_level, max_level); + } + + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, +@@ -2789,17 +2791,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_v + if (!slot) + return PG_LEVEL_4K; + +- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); +- if (level == PG_LEVEL_4K) +- return level; +- +- *req_level = level = min(level, max_level); +- + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ +- if (huge_page_disallowed) ++ *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); ++ if (level == PG_LEVEL_4K || huge_page_disallowed) + return PG_LEVEL_4K; + + /* diff --git a/queue-5.13/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch b/queue-5.13/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch new file mode 100644 index 00000000000..9bea6ae9a5a --- /dev/null +++ b/queue-5.13/kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch @@ -0,0 +1,82 @@ +From 088acd23526647844aec1c39db4ad02552c86c7b Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 2 Aug 2021 21:46:06 -0700 +Subject: KVM: x86/mmu: Avoid collision with !PRESENT SPTEs in TDP MMU lpage stats + +From: Sean Christopherson + +commit 088acd23526647844aec1c39db4ad02552c86c7b upstream. + +Factor in whether or not the old/new SPTEs are shadow-present when +adjusting the large page stats in the TDP MMU. A modified MMIO SPTE can +toggle the page size bit, as bit 7 is used to store the MMIO generation, +i.e. is_large_pte() can get a false positive when called on a MMIO SPTE. +Ditto for nuking SPTEs with REMOVED_SPTE, which sets bit 7 in its magic +value. + +Opportunistically move the logic below the check to verify at least one +of the old/new SPTEs is shadow present. + +Use is/was_leaf even though is/was_present would suffice. The code +generation is roughly equivalent since all flags need to be computed +prior to the code in question, and using the *_leaf flags will minimize +the diff in a future enhancement to account all pages, i.e. will change +the check to "is_leaf != was_leaf". 
+ +Reviewed-by: David Matlack +Reviewed-by: Ben Gardon + +Fixes: 1699f65c8b65 ("kvm/x86: Fix 'lpages' kvm stat for TDM MMU") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Mingwei Zhang +Message-Id: <20210803044607.599629-3-mizhang@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++++++------- + 1 file changed, 13 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -410,6 +410,7 @@ static void __handle_changed_spte(struct + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); ++ bool was_large, is_large; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); +@@ -443,13 +444,6 @@ static void __handle_changed_spte(struct + + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + +- if (is_large_pte(old_spte) != is_large_pte(new_spte)) { +- if (is_large_pte(old_spte)) +- atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); +- else +- atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); +- } +- + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ +@@ -475,6 +469,18 @@ static void __handle_changed_spte(struct + return; + } + ++ /* ++ * Update large page stats if a large page is being zapped, created, or ++ * is replacing an existing shadow page. ++ */ ++ was_large = was_leaf && is_large_pte(old_spte); ++ is_large = is_leaf && is_large_pte(new_spte); ++ if (was_large != is_large) { ++ if (was_large) ++ atomic64_sub(1, (atomic64_t *)&kvm->stat.lpages); ++ else ++ atomic64_add(1, (atomic64_t *)&kvm->stat.lpages); ++ } + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) diff --git a/queue-5.13/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch b/queue-5.13/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch new file mode 100644 index 00000000000..a69dd791e61 --- /dev/null +++ b/queue-5.13/kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch @@ -0,0 +1,40 @@ +From d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 Mon Sep 17 00:00:00 2001 +From: Zelin Deng +Date: Wed, 28 Apr 2021 10:22:01 +0800 +Subject: KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted + +From: Zelin Deng + +commit d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 upstream. + +When MSR_IA32_TSC_ADJUST is written by guest due to TSC ADJUST feature +especially there's a big tsc warp (like a new vCPU is hot-added into VM +which has been up for a long time), tsc_offset is added by a large value +then go back to guest. This causes system time jump as tsc_timestamp is +not adjusted in the meantime and pvclock monotonic character. +To fix this, just notify kvm to update vCPU's guest time before back to +guest. 
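+
+That is, the MSR_IA32_TSC_ADJUST write path (diff below) now queues a
+clock update before reentering the guest:
+
+	adjust_tsc_offset_guest(vcpu, adj);
+	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);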
+ +Cc: stable@vger.kernel.org +Signed-off-by: Zelin Deng +Signed-off-by: Paolo Bonzini +Message-Id: <1619576521-81399-2-git-send-email-zelin.deng@linux.alibaba.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3223,6 +3223,10 @@ int kvm_set_msr_common(struct kvm_vcpu * + if (!msr_info->host_initiated) { + s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + adjust_tsc_offset_guest(vcpu, adj); ++ /* Before back to guest, tsc_timestamp must be adjusted ++ * as well, otherwise guest's percpu pvclock time could jump. ++ */ ++ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } diff --git a/queue-5.13/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch b/queue-5.13/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch new file mode 100644 index 00000000000..d60b8e8eeaf --- /dev/null +++ b/queue-5.13/md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch @@ -0,0 +1,82 @@ +From 46d4703b1db4c86ab5acb2331b10df999f005e8e Mon Sep 17 00:00:00 2001 +From: Xiao Ni +Date: Wed, 18 Aug 2021 13:57:48 +0800 +Subject: md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard + +From: Xiao Ni + +commit 46d4703b1db4c86ab5acb2331b10df999f005e8e upstream. + +We are seeing the following warning in raid10_handle_discard. +[ 695.110751] ============================= +[ 695.131439] WARNING: suspicious RCU usage +[ 695.151389] 4.18.0-319.el8.x86_64+debug #1 Not tainted +[ 695.174413] ----------------------------- +[ 695.192603] drivers/md/raid10.c:1776 suspicious +rcu_dereference_check() usage! +[ 695.225107] other info that might help us debug this: +[ 695.260940] rcu_scheduler_active = 2, debug_locks = 1 +[ 695.290157] no locks held by mkfs.xfs/10186. + +In the first loop of function raid10_handle_discard. It already +determines which disk need to handle discard request and add the +rdev reference count rdev->nr_pending. So the conf->mirrors will +not change until all bios come back from underlayer disks. It +doesn't need to use rcu_dereference to get rdev. + +Cc: stable@vger.kernel.org +Fixes: d30588b2731f ('md/raid10: improve raid10 discard request') +Signed-off-by: Xiao Ni +Acked-by: Guoqing Jiang +Signed-off-by: Song Liu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/raid10.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1706,6 +1706,11 @@ retry_discard: + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + ++ /* ++ * first select target devices under rcu_lock and ++ * inc refcount on their rdev. Record them by setting ++ * bios[x] to bio ++ */ + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); +@@ -1737,9 +1742,6 @@ retry_discard: + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; +- struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); +- struct md_rdev *rrdev = rcu_dereference( +- conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. +@@ -1769,9 +1771,12 @@ retry_discard: + + /* + * It only handles discard bio which size is >= stripe size, so +- * dev_end > dev_start all the time ++ * dev_end > dev_start all the time. 
++ * It doesn't need to use rcu lock to get rdev here. We already ++ * add rdev->nr_pending in the first loop. + */ + if (r10_bio->devs[disk].bio) { ++ struct md_rdev *rdev = conf->mirrors[disk].rdev; + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; +@@ -1784,6 +1789,7 @@ retry_discard: + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { ++ struct md_rdev *rrdev = conf->mirrors[disk].replacement; + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; diff --git a/queue-5.13/perf-x86-amd-ibs-extend-perf_pmu_cap_no_exclude-to-ibs-op.patch b/queue-5.13/perf-x86-amd-ibs-extend-perf_pmu_cap_no_exclude-to-ibs-op.patch new file mode 100644 index 00000000000..8140948df3a --- /dev/null +++ b/queue-5.13/perf-x86-amd-ibs-extend-perf_pmu_cap_no_exclude-to-ibs-op.patch @@ -0,0 +1,36 @@ +From f11dd0d80555cdc8eaf5cfc9e19c9e198217f9f1 Mon Sep 17 00:00:00 2001 +From: Kim Phillips +Date: Tue, 17 Aug 2021 17:10:41 -0500 +Subject: perf/x86/amd/ibs: Extend PERF_PMU_CAP_NO_EXCLUDE to IBS Op + +From: Kim Phillips + +commit f11dd0d80555cdc8eaf5cfc9e19c9e198217f9f1 upstream. + +Commit: + + 2ff40250691e ("perf/core, arch/x86: Use PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs") + +neglected to do so. + +Fixes: 2ff40250691e ("perf/core, arch/x86: Use PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs") +Signed-off-by: Kim Phillips +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210817221048.88063-2-kim.phillips@amd.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/events/amd/ibs.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/events/amd/ibs.c ++++ b/arch/x86/events/amd/ibs.c +@@ -571,6 +571,7 @@ static struct perf_ibs perf_ibs_op = { + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, ++ .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, diff --git a/queue-5.13/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch b/queue-5.13/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch new file mode 100644 index 00000000000..86e52692400 --- /dev/null +++ b/queue-5.13/revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch @@ -0,0 +1,72 @@ +From e7177339d7b5f9594b316842122b5fda9513d5e2 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 31 Aug 2021 09:42:22 -0700 +Subject: Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()" + +From: Sean Christopherson + +commit e7177339d7b5f9594b316842122b5fda9513d5e2 upstream. + +Revert a misguided illegal GPA check when "translating" a non-nested GPA. +The check is woefully incomplete as it does not fill in @exception as +expected by all callers, which leads to KVM attempting to inject a bogus +exception, potentially exposing kernel stack information in the process. 
+ + WARNING: CPU: 0 PID: 8469 at arch/x86/kvm/x86.c:525 exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525 + CPU: 1 PID: 8469 Comm: syz-executor531 Not tainted 5.14.0-rc7-syzkaller #0 + RIP: 0010:exception_type+0x98/0xb0 arch/x86/kvm/x86.c:525 + Call Trace: + x86_emulate_instruction+0xef6/0x1460 arch/x86/kvm/x86.c:7853 + kvm_mmu_page_fault+0x2f0/0x1810 arch/x86/kvm/mmu/mmu.c:5199 + handle_ept_misconfig+0xdf/0x3e0 arch/x86/kvm/vmx/vmx.c:5336 + __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6021 [inline] + vmx_handle_exit+0x336/0x1800 arch/x86/kvm/vmx/vmx.c:6038 + vcpu_enter_guest+0x2a1c/0x4430 arch/x86/kvm/x86.c:9712 + vcpu_run arch/x86/kvm/x86.c:9779 [inline] + kvm_arch_vcpu_ioctl_run+0x47d/0x1b20 arch/x86/kvm/x86.c:10010 + kvm_vcpu_ioctl+0x49e/0xe50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3652 + +The bug has escaped notice because practically speaking the GPA check is +useless. The GPA check in question only comes into play when KVM is +walking guest page tables (or "translating" CR3), and KVM already handles +illegal GPA checks by setting reserved bits in rsvd_bits_mask for each +PxE, or in the case of CR3 for loading PTDPTRs, manually checks for an +illegal CR3. This particular failure doesn't hit the existing reserved +bits checks because syzbot sets guest.MAXPHYADDR=1, and IA32 architecture +simply doesn't allow for such an absurd MAXPHYADDR, e.g. 32-bit paging +doesn't define any reserved PA bits checks, which KVM emulates by only +incorporating the reserved PA bits into the "high" bits, i.e. bits 63:32. + +Simply remove the bogus check. There is zero meaningful value and no +architectural justification for supporting guest.MAXPHYADDR < 32, and +properly filling the exception would introduce non-trivial complexity. + +This reverts commit ec7771ab471ba6a945350353617e2e3385d0e013. 
+ +Fixes: ec7771ab471b ("KVM: x86: mmu: Add guest physical address check in translate_gpa()") +Cc: stable@vger.kernel.org +Reported-by: syzbot+200c08e88ae818f849ce@syzkaller.appspotmail.com +Signed-off-by: Sean Christopherson +Message-Id: <20210831164224.1119728-2-seanjc@google.com> +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -257,12 +257,6 @@ static bool check_mmio_spte(struct kvm_v + static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) + { +- /* Check if guest physical address doesn't exceed guest maximum */ +- if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { +- exception->error_code |= PFERR_RSVD_MASK; +- return UNMAPPED_GVA; +- } +- + return gpa; + } + diff --git a/queue-5.13/series b/queue-5.13/series index bbcf2dacf03..69863b22cf0 100644 --- a/queue-5.13/series +++ b/queue-5.13/series @@ -274,3 +274,23 @@ raid1-ensure-write-behind-bio-has-less-than-bio_max_vecs-sectors.patch cifs-do-not-leak-edeadlk-to-dgetents64-for-status_user_session_deleted.patch smb3-fix-posix-extensions-mount-option.patch tty-fix-data-race-between-tiocsti-and-flush_to_ldisc.patch +x86-efi-restore-firmware-idt-before-calling-exitbootservices.patch +perf-x86-amd-ibs-extend-perf_pmu_cap_no_exclude-to-ibs-op.patch +x86-resctrl-fix-a-maybe-uninitialized-build-warning-treated-as-error.patch +revert-kvm-x86-mmu-add-guest-physical-address-check-in-translate_gpa.patch +kvm-s390-index-kvm-arch.idle_mask-by-vcpu_idx.patch +kvm-x86-update-vcpu-s-hv_clock-before-back-to-guest-when-tsc_offset-is-adjusted.patch +kvm-x86-clamp-host-mapping-level-to-max_level-in-kvm_mmu_max_mapping_level.patch +kvm-x86-mmu-avoid-collision-with-present-sptes-in-tdp-mmu-lpage-stats.patch +kvm-vmx-avoid-running-vmx_handle_exit_irqoff-in-case-of-emulation.patch +kvm-nvmx-unconditionally-clear-nested.pi_pending-on-nested-vm-enter.patch +kvm-arm64-unregister-hyp-sections-from-kmemleak-in-protected-mode.patch +arm-dts-at91-add-pinctrl-names-0-for-all-gpios.patch +io-wq-check-max_worker-limits-if-a-worker-transitions-bound-state.patch +md-raid10-remove-unnecessary-rcu_dereference-in-raid10_handle_discard.patch +char-tpm-kconfig-remove-bad-i2c-cr50-select.patch +fuse-truncate-pagecache-on-atomic_o_trunc.patch +fuse-flush-extending-writes.patch +fuse-wait-for-writepages-in-syncfs.patch +ima-remove-wmissing-prototypes-warning.patch +ima-remove-the-dependency-on-crypto_md5.patch diff --git a/queue-5.13/x86-efi-restore-firmware-idt-before-calling-exitbootservices.patch b/queue-5.13/x86-efi-restore-firmware-idt-before-calling-exitbootservices.patch new file mode 100644 index 00000000000..1c87eb794fb --- /dev/null +++ b/queue-5.13/x86-efi-restore-firmware-idt-before-calling-exitbootservices.patch @@ -0,0 +1,127 @@ +From 22aa45cb465be474e97666b3f7587ccb06ee411b Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Fri, 20 Aug 2021 14:57:03 +0200 +Subject: x86/efi: Restore Firmware IDT before calling ExitBootServices() + +From: Joerg Roedel + +commit 22aa45cb465be474e97666b3f7587ccb06ee411b upstream. + +Commit + + 79419e13e808 ("x86/boot/compressed/64: Setup IDT in startup_32 boot path") + +introduced an IDT into the 32-bit boot path of the decompressor stub. +But the IDT is set up before ExitBootServices() is called, and some UEFI +firmwares rely on their own IDT. 
+ +Save the firmware IDT on boot and restore it before calling into EFI +functions to fix boot failures introduced by above commit. + +Fixes: 79419e13e808 ("x86/boot/compressed/64: Setup IDT in startup_32 boot path") +Reported-by: Fabio Aiuto +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Acked-by: Ard Biesheuvel +Cc: stable@vger.kernel.org # 5.13+ +Link: https://lkml.kernel.org/r/20210820125703.32410-1-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/boot/compressed/efi_thunk_64.S | 30 +++++++++++++++++++++--------- + arch/x86/boot/compressed/head_64.S | 3 +++ + 2 files changed, 24 insertions(+), 9 deletions(-) + +--- a/arch/x86/boot/compressed/efi_thunk_64.S ++++ b/arch/x86/boot/compressed/efi_thunk_64.S +@@ -5,9 +5,8 @@ + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to +- * restore the firmware's 32-bit GDT before we make EFI service calls, +- * since the firmware's 32-bit IDT is still currently installed and it +- * needs to be able to service interrupts. ++ * restore the firmware's 32-bit GDT and IDT before we make EFI service ++ * calls. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identity +@@ -39,7 +38,7 @@ SYM_FUNC_START(__efi64_thunk) + /* + * Convert x86-64 ABI params to i386 ABI + */ +- subq $32, %rsp ++ subq $64, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) +@@ -49,14 +48,19 @@ SYM_FUNC_START(__efi64_thunk) + leaq 0x14(%rsp), %rbx + sgdt (%rbx) + ++ addq $16, %rbx ++ sidt (%rbx) ++ + /* +- * Switch to gdt with 32-bit segments. This is the firmware GDT +- * that was installed when the kernel started executing. This +- * pointer was saved at the EFI stub entry point in head_64.S. ++ * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT ++ * and IDT that was installed when the kernel started executing. The ++ * pointers were saved at the EFI stub entry point in head_64.S. + * + * Pass the saved DS selector to the 32-bit code, and use far return to + * restore the saved CS selector. + */ ++ leaq efi32_boot_idt(%rip), %rax ++ lidt (%rax) + leaq efi32_boot_gdt(%rip), %rax + lgdt (%rax) + +@@ -67,7 +71,7 @@ SYM_FUNC_START(__efi64_thunk) + pushq %rax + lretq + +-1: addq $32, %rsp ++1: addq $64, %rsp + movq %rdi, %rax + + pop %rbx +@@ -128,10 +132,13 @@ SYM_FUNC_START_LOCAL(efi_enter32) + + /* + * Some firmware will return with interrupts enabled. Be sure to +- * disable them before we switch GDTs. ++ * disable them before we switch GDTs and IDTs. 
+ */ + cli + ++ lidtl (%ebx) ++ subl $16, %ebx ++ + lgdtl (%ebx) + + movl %cr4, %eax +@@ -166,6 +173,11 @@ SYM_DATA_START(efi32_boot_gdt) + .quad 0 + SYM_DATA_END(efi32_boot_gdt) + ++SYM_DATA_START(efi32_boot_idt) ++ .word 0 ++ .quad 0 ++SYM_DATA_END(efi32_boot_idt) ++ + SYM_DATA_START(efi32_boot_cs) + .word 0 + SYM_DATA_END(efi32_boot_cs) +--- a/arch/x86/boot/compressed/head_64.S ++++ b/arch/x86/boot/compressed/head_64.S +@@ -319,6 +319,9 @@ SYM_INNER_LABEL(efi32_pe_stub_entry, SYM + movw %cs, rva(efi32_boot_cs)(%ebp) + movw %ds, rva(efi32_boot_ds)(%ebp) + ++ /* Store firmware IDT descriptor */ ++ sidtl rva(efi32_boot_idt)(%ebp) ++ + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax diff --git a/queue-5.13/x86-resctrl-fix-a-maybe-uninitialized-build-warning-treated-as-error.patch b/queue-5.13/x86-resctrl-fix-a-maybe-uninitialized-build-warning-treated-as-error.patch new file mode 100644 index 00000000000..e358cb4346c --- /dev/null +++ b/queue-5.13/x86-resctrl-fix-a-maybe-uninitialized-build-warning-treated-as-error.patch @@ -0,0 +1,67 @@ +From 527f721478bce3f49b513a733bacd19d6f34b08c Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Fri, 20 Aug 2021 16:52:42 -0500 +Subject: x86/resctrl: Fix a maybe-uninitialized build warning treated as error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Babu Moger + +commit 527f721478bce3f49b513a733bacd19d6f34b08c upstream. + +The recent commit + + 064855a69003 ("x86/resctrl: Fix default monitoring groups reporting") + +caused a RHEL build failure with an uninitialized variable warning +treated as an error because it removed the default case snippet. + +The RHEL Makefile uses '-Werror=maybe-uninitialized' to force possibly +uninitialized variable warnings to be treated as errors. This is also +reported by smatch via the 0day robot. + +The error from the RHEL build is: + + arch/x86/kernel/cpu/resctrl/monitor.c: In function ‘__mon_event_count’: + arch/x86/kernel/cpu/resctrl/monitor.c:261:12: error: ‘m’ may be used + uninitialized in this function [-Werror=maybe-uninitialized] + m->chunks += chunks; + ^~ + +The upstream Makefile does not build using '-Werror=maybe-uninitialized'. +So, the problem is not seen there. Fix the problem by putting back the +default case snippet. + + [ bp: note that there's nothing wrong with the code and other compilers + do not trigger this warning - this is being done just so the RHEL compiler + is happy. ] + +Fixes: 064855a69003 ("x86/resctrl: Fix default monitoring groups reporting") +Reported-by: Terry Bowman +Reported-by: kernel test robot +Signed-off-by: Babu Moger +Signed-off-by: Borislav Petkov +Reviewed-by: Reinette Chatre +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/162949631908.23903.17090272726012848523.stgit@bmoger-ubuntu +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/resctrl/monitor.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/arch/x86/kernel/cpu/resctrl/monitor.c ++++ b/arch/x86/kernel/cpu/resctrl/monitor.c +@@ -304,6 +304,12 @@ static u64 __mon_event_count(u32 rmid, s + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; ++ default: ++ /* ++ * Code would never reach here because an invalid ++ * event id would fail the __rmid_read. ++ */ ++ return RMID_VAL_ERROR; + } + + if (rr->first) {