From: Greg Kroah-Hartman Date: Sat, 29 Jan 2022 14:48:35 +0000 (+0100) Subject: 5.15-stable patches X-Git-Tag: v5.4.176~82 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e1dd415f163c5acf610d1976b097811679740ba5;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: block-add-bio_start_io_acct_time-to-control-start_time.patch ceph-properly-put-ceph_string-reference-after-async-create-attempt.patch ceph-set-pool_ns-in-new-inode-layout-for-async-creates.patch dm-properly-fix-redundant-bio-based-io-accounting.patch dm-revert-partial-fix-for-redundant-bio-based-io-accounting.patch drm-amd-display-fix-fp-start-end-for-dcn30_internal_validate_bw.patch drm-atomic-add-the-crtc-to-affected-crtc-only-if-uapi.enable-true.patch drm-etnaviv-relax-submit-size-limits.patch efi-runtime-avoid-efiv2-runtime-services-on-apple-x86-machines.patch fsnotify-fix-fsnotify-hooks-in-pseudo-filesystems.patch kvm-lapic-also-cancel-preemption-timer-during-set_lapic.patch kvm-ppc-book3s-hv-nested-fix-nested-hfscr-being-clobbered-with-multiple-vcpus.patch kvm-svm-don-t-intercept-gp-for-sev-guests.patch kvm-svm-never-reject-emulation-due-to-smap-errata-for-sev-guests.patch kvm-x86-forcibly-leave-nested-virt-when-smm-state-is-toggled.patch kvm-x86-keep-msr_ia32_xss-unchanged-for-init.patch kvm-x86-nsvm-skip-eax-alignment-check-for-non-svm-instructions.patch kvm-x86-sync-the-states-size-with-the-xcr0-ia32_xss-at-any-time.patch kvm-x86-update-vcpu-s-runtime-cpuid-on-write-to-msr_ia32_xss.patch perf-x86-intel-add-a-quirk-for-the-calculation-of-the-number-of-counters-on-alder-lake.patch perf-x86-intel-uncore-fix-cas_count_write-issue-for-icx.patch pm-wakeup-simplify-the-output-logic-of-pm_show_wakelocks.patch powerpc-audit-fix-syscall_get_arch.patch psi-fix-uaf-issue-when-psi-trigger-is-destroyed-while-being-polled.patch revert-kvm-svm-avoid-infinite-loop-on-npf-from-bad-address.patch tracing-don-t-inc-err_log-entry-count-if-entry-allocation-fails.patch 
tracing-histogram-fix-a-potential-memory-leak-for-kstrdup.patch --- diff --git a/queue-5.15/block-add-bio_start_io_acct_time-to-control-start_time.patch b/queue-5.15/block-add-bio_start_io_acct_time-to-control-start_time.patch new file mode 100644 index 00000000000..bcbd71594b6 --- /dev/null +++ b/queue-5.15/block-add-bio_start_io_acct_time-to-control-start_time.patch @@ -0,0 +1,93 @@ +From e45c47d1f94e0cc7b6b079fdb4bcce2995e2adc4 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Fri, 28 Jan 2022 10:58:39 -0500 +Subject: block: add bio_start_io_acct_time() to control start_time + +From: Mike Snitzer + +commit e45c47d1f94e0cc7b6b079fdb4bcce2995e2adc4 upstream. + +bio_start_io_acct_time() interface is like bio_start_io_acct() that +allows start_time to be passed in. This gives drivers the ability to +defer starting accounting until after IO is issued (but possibily not +entirely due to bio splitting). + +Reviewed-by: Christoph Hellwig +Signed-off-by: Mike Snitzer +Link: https://lore.kernel.org/r/20220128155841.39644-2-snitzer@redhat.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-core.c | 25 +++++++++++++++++++------ + include/linux/blkdev.h | 1 + + 2 files changed, 20 insertions(+), 6 deletions(-) + +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -1293,22 +1293,34 @@ void blk_account_io_start(struct request + } + + static unsigned long __part_start_io_acct(struct block_device *part, +- unsigned int sectors, unsigned int op) ++ unsigned int sectors, unsigned int op, ++ unsigned long start_time) + { + const int sgrp = op_stat_group(op); +- unsigned long now = READ_ONCE(jiffies); + + part_stat_lock(); +- update_io_ticks(part, now, false); ++ update_io_ticks(part, start_time, false); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + part_stat_local_inc(part, in_flight[op_is_write(op)]); + part_stat_unlock(); + +- return now; ++ return start_time; + } + + /** ++ * bio_start_io_acct_time - start I/O 
accounting for bio based drivers ++ * @bio: bio to start account for ++ * @start_time: start time that should be passed back to bio_end_io_acct(). ++ */ ++void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) ++{ ++ __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), ++ bio_op(bio), start_time); ++} ++EXPORT_SYMBOL_GPL(bio_start_io_acct_time); ++ ++/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * +@@ -1316,14 +1328,15 @@ static unsigned long __part_start_io_acc + */ + unsigned long bio_start_io_acct(struct bio *bio) + { +- return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); ++ return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), ++ bio_op(bio), jiffies); + } + EXPORT_SYMBOL_GPL(bio_start_io_acct); + + unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) + { +- return __part_start_io_acct(disk->part0, sectors, op); ++ return __part_start_io_acct(disk->part0, sectors, op, jiffies); + } + EXPORT_SYMBOL(disk_start_io_acct); + +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1947,6 +1947,7 @@ unsigned long disk_start_io_acct(struct + void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time); + ++void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); + unsigned long bio_start_io_acct(struct bio *bio); + void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev); diff --git a/queue-5.15/ceph-properly-put-ceph_string-reference-after-async-create-attempt.patch b/queue-5.15/ceph-properly-put-ceph_string-reference-after-async-create-attempt.patch new file mode 100644 index 00000000000..cf7720a08f1 --- /dev/null +++ b/queue-5.15/ceph-properly-put-ceph_string-reference-after-async-create-attempt.patch @@ -0,0 +1,35 @@ +From 932a9b5870d38b87ba0a9923c804b1af7d3605b9 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Tue, 
25 Jan 2022 15:39:16 -0500 +Subject: ceph: properly put ceph_string reference after async create attempt + +From: Jeff Layton + +commit 932a9b5870d38b87ba0a9923c804b1af7d3605b9 upstream. + +The reference acquired by try_prep_async_create is currently leaked. +Ensure we put it. + +Cc: stable@vger.kernel.org +Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") +Signed-off-by: Jeff Layton +Reviewed-by: Ilya Dryomov +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/file.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ceph/file.c ++++ b/fs/ceph/file.c +@@ -744,8 +744,10 @@ retry: + restore_deleg_ino(dir, req->r_deleg_ino); + ceph_mdsc_put_request(req); + try_async = false; ++ ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto retry; + } ++ ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto out_req; + } + } diff --git a/queue-5.15/ceph-set-pool_ns-in-new-inode-layout-for-async-creates.patch b/queue-5.15/ceph-set-pool_ns-in-new-inode-layout-for-async-creates.patch new file mode 100644 index 00000000000..3cc5269f108 --- /dev/null +++ b/queue-5.15/ceph-set-pool_ns-in-new-inode-layout-for-async-creates.patch @@ -0,0 +1,52 @@ +From 4584a768f22b7669cdebabc911543621ac661341 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Wed, 26 Jan 2022 12:36:49 -0500 +Subject: ceph: set pool_ns in new inode layout for async creates + +From: Jeff Layton + +commit 4584a768f22b7669cdebabc911543621ac661341 upstream. + +Dan reported that he was unable to write to files that had been +asynchronously created when the client's OSD caps are restricted to a +particular namespace. + +The issue is that the layout for the new inode is only partially being +filled. Ensure that we populate the pool_ns_data and pool_ns_len in the +iinfo before calling ceph_fill_inode. 
+ +Cc: stable@vger.kernel.org +URL: https://tracker.ceph.com/issues/54013 +Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") +Reported-by: Dan van der Ster +Signed-off-by: Jeff Layton +Reviewed-by: Ilya Dryomov +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/file.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/fs/ceph/file.c ++++ b/fs/ceph/file.c +@@ -577,6 +577,7 @@ static int ceph_finish_async_create(stru + struct ceph_inode_info *ci = ceph_inode(dir); + struct inode *inode; + struct timespec64 now; ++ struct ceph_string *pool_ns; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; +@@ -626,6 +627,12 @@ static int ceph_finish_async_create(stru + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); ++ /* lo is private, so pool_ns can't change */ ++ pool_ns = rcu_dereference_raw(lo->pool_ns); ++ if (pool_ns) { ++ iinfo.pool_ns_len = pool_ns->len; ++ iinfo.pool_ns_data = pool_ns->str; ++ } + + down_read(&mdsc->snap_rwsem); + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, diff --git a/queue-5.15/dm-properly-fix-redundant-bio-based-io-accounting.patch b/queue-5.15/dm-properly-fix-redundant-bio-based-io-accounting.patch new file mode 100644 index 00000000000..beba0bda0c5 --- /dev/null +++ b/queue-5.15/dm-properly-fix-redundant-bio-based-io-accounting.patch @@ -0,0 +1,56 @@ +From b879f915bc48a18d4f4462729192435bb0f17052 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Fri, 28 Jan 2022 10:58:41 -0500 +Subject: dm: properly fix redundant bio-based IO accounting + +From: Mike Snitzer + +commit b879f915bc48a18d4f4462729192435bb0f17052 upstream. + +Record the start_time for a bio but defer the starting block core's IO +accounting until after IO is submitted using bio_start_io_acct_time(). 
+ +This approach avoids the need to mess around with any of the +individual IO stats in response to a bio_split() that follows bio +submission. + +Reported-by: Bud Brown +Reviewed-by: Christoph Hellwig +Cc: stable@vger.kernel.org +Depends-on: e45c47d1f94e ("block: add bio_start_io_acct_time() to control start_time") +Signed-off-by: Mike Snitzer +Link: https://lore.kernel.org/r/20220128155841.39644-4-snitzer@redhat.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -489,7 +489,7 @@ static void start_io_acct(struct dm_io * + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; + +- io->start_time = bio_start_io_acct(bio); ++ bio_start_io_acct_time(bio, io->start_time); + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio_data_dir(bio), + bio->bi_iter.bi_sector, bio_sectors(bio), +@@ -535,7 +535,7 @@ static struct dm_io *alloc_io(struct map + io->md = md; + spin_lock_init(&io->endio_lock); + +- start_io_acct(io); ++ io->start_time = jiffies; + + return io; + } +@@ -1555,6 +1555,7 @@ static blk_qc_t __split_and_process_bio( + ret = submit_bio_noacct(bio); + } + } ++ start_io_acct(ci.io); + + /* drop the extra reference count */ + dm_io_dec_pending(ci.io, errno_to_blk_status(error)); diff --git a/queue-5.15/dm-revert-partial-fix-for-redundant-bio-based-io-accounting.patch b/queue-5.15/dm-revert-partial-fix-for-redundant-bio-based-io-accounting.patch new file mode 100644 index 00000000000..b2f88fae476 --- /dev/null +++ b/queue-5.15/dm-revert-partial-fix-for-redundant-bio-based-io-accounting.patch @@ -0,0 +1,53 @@ +From f524d9c95fab54783d0038f7a3e8c014d5b56857 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Fri, 28 Jan 2022 10:58:40 -0500 +Subject: dm: revert partial fix for redundant bio-based IO accounting + +From: Mike Snitzer + +commit 
f524d9c95fab54783d0038f7a3e8c014d5b56857 upstream. + +Reverts a1e1cb72d9649 ("dm: fix redundant IO accounting for bios that +need splitting") because it was too narrow in scope (only addressed +redundant 'sectors[]' accounting and not ios, nsecs[], etc). + +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Link: https://lore.kernel.org/r/20220128155841.39644-3-snitzer@redhat.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm.c | 15 --------------- + 1 file changed, 15 deletions(-) + +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -1514,9 +1514,6 @@ static void init_clone_info(struct clone + ci->sector = bio->bi_iter.bi_sector; + } + +-#define __dm_part_stat_sub(part, field, subnd) \ +- (part_stat_get(part, field) -= (subnd)) +- + /* + * Entry point to split a bio into clones and submit them to the targets. + */ +@@ -1553,18 +1550,6 @@ static blk_qc_t __split_and_process_bio( + GFP_NOIO, &md->queue->bio_split); + ci.io->orig_bio = b; + +- /* +- * Adjust IO stats for each split, otherwise upon queue +- * reentry there will be redundant IO accounting. 
+- * NOTE: this is a stop-gap fix, a proper fix involves +- * significant refactoring of DM core's bio splitting +- * (by eliminating DM's splitting and just using bio_split) +- */ +- part_stat_lock(); +- __dm_part_stat_sub(dm_disk(md)->part0, +- sectors[op_stat_group(bio_op(bio))], ci.sector_count); +- part_stat_unlock(); +- + bio_chain(b, bio); + trace_block_split(b, bio->bi_iter.bi_sector); + ret = submit_bio_noacct(bio); diff --git a/queue-5.15/drm-amd-display-fix-fp-start-end-for-dcn30_internal_validate_bw.patch b/queue-5.15/drm-amd-display-fix-fp-start-end-for-dcn30_internal_validate_bw.patch new file mode 100644 index 00000000000..8624fd75f93 --- /dev/null +++ b/queue-5.15/drm-amd-display-fix-fp-start-end-for-dcn30_internal_validate_bw.patch @@ -0,0 +1,48 @@ +From 72a8d87b87270bff0c0b2fed4d59c48d0dd840d7 Mon Sep 17 00:00:00 2001 +From: Bas Nieuwenhuizen +Date: Mon, 24 Jan 2022 01:23:35 +0100 +Subject: drm/amd/display: Fix FP start/end for dcn30_internal_validate_bw. + +From: Bas Nieuwenhuizen + +commit 72a8d87b87270bff0c0b2fed4d59c48d0dd840d7 upstream. + +It calls populate_dml_pipes which uses doubles to initialize the +scale_ratio_depth params. Mirrors the dcn20 logic. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Bas Nieuwenhuizen +Signed-off-by: Alex Deucher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c +@@ -1879,7 +1879,6 @@ static noinline bool dcn30_internal_vali + dc->res_pool->funcs->update_soc_for_wm_a(dc, context); + pipe_cnt = dc->res_pool->funcs->populate_dml_pipes(dc, context, pipes, fast_validate); + +- DC_FP_START(); + if (!pipe_cnt) { + out = true; + goto validate_out; +@@ -2103,7 +2102,6 @@ validate_fail: + out = false; + + validate_out: +- DC_FP_END(); + return out; + } + +@@ -2304,7 +2302,9 @@ bool dcn30_validate_bandwidth(struct dc + + BW_VAL_TRACE_COUNT(); + ++ DC_FP_START(); + out = dcn30_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, fast_validate); ++ DC_FP_END(); + + if (pipe_cnt == 0) + goto validate_out; diff --git a/queue-5.15/drm-atomic-add-the-crtc-to-affected-crtc-only-if-uapi.enable-true.patch b/queue-5.15/drm-atomic-add-the-crtc-to-affected-crtc-only-if-uapi.enable-true.patch new file mode 100644 index 00000000000..3a7fcd16acf --- /dev/null +++ b/queue-5.15/drm-atomic-add-the-crtc-to-affected-crtc-only-if-uapi.enable-true.patch @@ -0,0 +1,69 @@ +From 5ec1cebd59300ddd26dbaa96c17c508764eef911 Mon Sep 17 00:00:00 2001 +From: Manasi Navare +Date: Mon, 4 Oct 2021 04:59:13 -0700 +Subject: drm/atomic: Add the crtc to affected crtc only if uapi.enable = true +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Manasi Navare + +commit 5ec1cebd59300ddd26dbaa96c17c508764eef911 upstream. 
+ +In case of a modeset where a mode gets split across multiple CRTCs +in the driver specific implementation (bigjoiner in i915) we wrongly count +the affected CRTCs based on the drm_crtc_mask and indicate the stolen CRTC as +an affected CRTC in atomic_check_only(). +This triggers a warning since affected CRTCs doent match requested CRTC. + +To fix this in such bigjoiner configurations, we should only +increment affected crtcs if that CRTC is enabled in UAPI not +if it is just used internally in the driver to split the mode. + +v3: Add the same uapi crtc_state->enable check in requested +crtc calc (Ville) + +Cc: Ville Syrjälä +Cc: Simon Ser +Cc: Pekka Paalanen +Cc: Daniel Stone +Cc: Daniel Vetter +Cc: dri-devel@lists.freedesktop.org +Cc: # v5.11+ +Fixes: 919c2299a893 ("drm/i915: Enable bigjoiner") +Signed-off-by: Manasi Navare +Reviewed-by: Ville Syrjälä +Link: https://patchwork.freedesktop.org/patch/msgid/20211004115913.23889-1-manasi.d.navare@intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/drm_atomic.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/gpu/drm/drm_atomic.c ++++ b/drivers/gpu/drm/drm_atomic.c +@@ -1310,8 +1310,10 @@ int drm_atomic_check_only(struct drm_ato + + DRM_DEBUG_ATOMIC("checking %p\n", state); + +- for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) +- requested_crtc |= drm_crtc_mask(crtc); ++ for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { ++ if (new_crtc_state->enable) ++ requested_crtc |= drm_crtc_mask(crtc); ++ } + + for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { + ret = drm_atomic_plane_check(old_plane_state, new_plane_state); +@@ -1360,8 +1362,10 @@ int drm_atomic_check_only(struct drm_ato + } + } + +- for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) +- affected_crtc |= drm_crtc_mask(crtc); ++ for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { ++ if (new_crtc_state->enable) ++ affected_crtc |= 
drm_crtc_mask(crtc); ++ } + + /* + * For commits that allow modesets drivers can add other CRTCs to the diff --git a/queue-5.15/drm-etnaviv-relax-submit-size-limits.patch b/queue-5.15/drm-etnaviv-relax-submit-size-limits.patch new file mode 100644 index 00000000000..9ffdb37f766 --- /dev/null +++ b/queue-5.15/drm-etnaviv-relax-submit-size-limits.patch @@ -0,0 +1,35 @@ +From e3d26528e083e612314d4dcd713f3d5a26143ddc Mon Sep 17 00:00:00 2001 +From: Lucas Stach +Date: Thu, 6 Jan 2022 19:10:21 +0100 +Subject: drm/etnaviv: relax submit size limits + +From: Lucas Stach + +commit e3d26528e083e612314d4dcd713f3d5a26143ddc upstream. + +While all userspace tried to limit commandstreams to 64K in size, +a bug in the Mesa driver lead to command streams of up to 128K +being submitted. Allow those to avoid breaking existing userspace. + +Fixes: 6dfa2fab8ddd ("drm/etnaviv: limit submit sizes") +Cc: stable@vger.kernel.org +Signed-off-by: Lucas Stach +Reviewed-by: Christian Gmeiner +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c ++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +@@ -469,8 +469,8 @@ int etnaviv_ioctl_gem_submit(struct drm_ + return -EINVAL; + } + +- if (args->stream_size > SZ_64K || args->nr_relocs > SZ_64K || +- args->nr_bos > SZ_64K || args->nr_pmrs > 128) { ++ if (args->stream_size > SZ_128K || args->nr_relocs > SZ_128K || ++ args->nr_bos > SZ_128K || args->nr_pmrs > 128) { + DRM_ERROR("submit arguments out of size limits\n"); + return -EINVAL; + } diff --git a/queue-5.15/efi-runtime-avoid-efiv2-runtime-services-on-apple-x86-machines.patch b/queue-5.15/efi-runtime-avoid-efiv2-runtime-services-on-apple-x86-machines.patch new file mode 100644 index 00000000000..e9f5c63a84f --- /dev/null +++ b/queue-5.15/efi-runtime-avoid-efiv2-runtime-services-on-apple-x86-machines.patch @@ -0,0 +1,62 @@ +From 
f5390cd0b43c2e54c7cf5506c7da4a37c5cef746 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Wed, 12 Jan 2022 11:14:13 +0100 +Subject: efi: runtime: avoid EFIv2 runtime services on Apple x86 machines + +From: Ard Biesheuvel + +commit f5390cd0b43c2e54c7cf5506c7da4a37c5cef746 upstream. + +Aditya reports [0] that his recent MacbookPro crashes in the firmware +when using the variable services at runtime. The culprit appears to be a +call to QueryVariableInfo(), which we did not use to call on Apple x86 +machines in the past as they only upgraded from EFI v1.10 to EFI v2.40 +firmware fairly recently, and QueryVariableInfo() (along with +UpdateCapsule() et al) was added in EFI v2.00. + +The only runtime service introduced in EFI v2.00 that we actually use in +Linux is QueryVariableInfo(), as the capsule based ones are optional, +generally not used at runtime (all the LVFS/fwupd firmware update +infrastructure uses helper EFI programs that invoke capsule update at +boot time, not runtime), and not implemented by Apple machines in the +first place. QueryVariableInfo() is used to 'safely' set variables, +i.e., only when there is enough space. This prevents machines with buggy +firmwares from corrupting their NVRAMs when they run out of space. + +Given that Apple machines have been using EFI v1.10 services only for +the longest time (the EFI v2.0 spec was released in 2006, and Linux +support for the newly introduced runtime services was added in 2011, but +the MacbookPro12,1 released in 2015 still claims to be EFI v1.10 only), +let's avoid the EFI v2.0 ones on all Apple x86 machines. 
+ +[0] https://lore.kernel.org/all/6D757C75-65B1-468B-842D-10410081A8E4@live.com/ + +Cc: +Cc: Jeremy Kerr +Cc: Matthew Garrett +Reported-by: Aditya Garg +Tested-by: Orlando Chamberlain +Signed-off-by: Ard Biesheuvel +Tested-by: Aditya Garg +Link: https://bugzilla.kernel.org/show_bug.cgi?id=215277 +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/efi.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -719,6 +719,13 @@ void __init efi_systab_report_header(con + systab_hdr->revision >> 16, + systab_hdr->revision & 0xffff, + vendor); ++ ++ if (IS_ENABLED(CONFIG_X86_64) && ++ systab_hdr->revision > EFI_1_10_SYSTEM_TABLE_REVISION && ++ !strcmp(vendor, "Apple")) { ++ pr_info("Apple Mac detected, using EFI v1.10 runtime services only\n"); ++ efi.runtime_version = EFI_1_10_SYSTEM_TABLE_REVISION; ++ } + } + + static __initdata char memory_type_name[][13] = { diff --git a/queue-5.15/fsnotify-fix-fsnotify-hooks-in-pseudo-filesystems.patch b/queue-5.15/fsnotify-fix-fsnotify-hooks-in-pseudo-filesystems.patch new file mode 100644 index 00000000000..1198b7939d4 --- /dev/null +++ b/queue-5.15/fsnotify-fix-fsnotify-hooks-in-pseudo-filesystems.patch @@ -0,0 +1,124 @@ +From 29044dae2e746949ad4b9cbdbfb248994d1dcdb4 Mon Sep 17 00:00:00 2001 +From: Amir Goldstein +Date: Thu, 20 Jan 2022 23:53:05 +0200 +Subject: fsnotify: fix fsnotify hooks in pseudo filesystems + +From: Amir Goldstein + +commit 29044dae2e746949ad4b9cbdbfb248994d1dcdb4 upstream. + +Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of +d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify +will have access to a positive dentry. + +This allowed a race where opening the deleted file via cached dentry +is now possible after receiving the IN_DELETE event. 
+ +To fix the regression in pseudo filesystems, convert d_delete() calls +to d_drop() (see commit 46c46f8df9aa ("devpts_pty_kill(): don't bother +with d_delete()") and move the fsnotify hook after d_drop(). + +Add a missing fsnotify_unlink() hook in nfsdfs that was found during +the audit of fsnotify hooks in pseudo filesystems. + +Note that the fsnotify hooks in simple_recursive_removal() follow +d_invalidate(), so they require no change. + +Link: https://lore.kernel.org/r/20220120215305.282577-2-amir73il@gmail.com +Reported-by: Ivan Delalande +Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/ +Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") +Cc: stable@vger.kernel.org # v5.3+ +Signed-off-by: Amir Goldstein +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman +--- + fs/configfs/dir.c | 6 +++--- + fs/devpts/inode.c | 2 +- + fs/nfsd/nfsctl.c | 5 +++-- + net/sunrpc/rpc_pipe.c | 4 ++-- + 4 files changed, 9 insertions(+), 8 deletions(-) + +--- a/fs/configfs/dir.c ++++ b/fs/configfs/dir.c +@@ -1780,8 +1780,8 @@ void configfs_unregister_group(struct co + configfs_detach_group(&group->cg_item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); ++ d_drop(dentry); + fsnotify_rmdir(d_inode(parent), dentry); +- d_delete(dentry); + inode_unlock(d_inode(parent)); + + dput(dentry); +@@ -1922,10 +1922,10 @@ void configfs_unregister_subsystem(struc + configfs_detach_group(&group->cg_item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); +- fsnotify_rmdir(d_inode(root), dentry); + inode_unlock(d_inode(dentry)); + +- d_delete(dentry); ++ d_drop(dentry); ++ fsnotify_rmdir(d_inode(root), dentry); + + inode_unlock(d_inode(root)); + +--- a/fs/devpts/inode.c ++++ b/fs/devpts/inode.c +@@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dent + + dentry->d_fsdata = NULL; + drop_nlink(dentry->d_inode); +- fsnotify_unlink(d_inode(dentry->d_parent), dentry); + d_drop(dentry); ++ 
fsnotify_unlink(d_inode(dentry->d_parent), dentry); + dput(dentry); /* d_alloc_name() in devpts_pty_new() */ + } + +--- a/fs/nfsd/nfsctl.c ++++ b/fs/nfsd/nfsctl.c +@@ -1249,7 +1249,8 @@ static void nfsdfs_remove_file(struct in + clear_ncl(d_inode(dentry)); + dget(dentry); + ret = simple_unlink(dir, dentry); +- d_delete(dentry); ++ d_drop(dentry); ++ fsnotify_unlink(dir, dentry); + dput(dentry); + WARN_ON_ONCE(ret); + } +@@ -1340,8 +1341,8 @@ void nfsd_client_rmdir(struct dentry *de + dget(dentry); + ret = simple_rmdir(dir, dentry); + WARN_ON_ONCE(ret); ++ d_drop(dentry); + fsnotify_rmdir(dir, dentry); +- d_delete(dentry); + dput(dentry); + inode_unlock(dir); + } +--- a/net/sunrpc/rpc_pipe.c ++++ b/net/sunrpc/rpc_pipe.c +@@ -600,9 +600,9 @@ static int __rpc_rmdir(struct inode *dir + + dget(dentry); + ret = simple_rmdir(dir, dentry); ++ d_drop(dentry); + if (!ret) + fsnotify_rmdir(dir, dentry); +- d_delete(dentry); + dput(dentry); + return ret; + } +@@ -613,9 +613,9 @@ static int __rpc_unlink(struct inode *di + + dget(dentry); + ret = simple_unlink(dir, dentry); ++ d_drop(dentry); + if (!ret) + fsnotify_unlink(dir, dentry); +- d_delete(dentry); + dput(dentry); + return ret; + } diff --git a/queue-5.15/kvm-lapic-also-cancel-preemption-timer-during-set_lapic.patch b/queue-5.15/kvm-lapic-also-cancel-preemption-timer-during-set_lapic.patch new file mode 100644 index 00000000000..2ce412dab47 --- /dev/null +++ b/queue-5.15/kvm-lapic-also-cancel-preemption-timer-during-set_lapic.patch @@ -0,0 +1,53 @@ +From 35fe7cfbab2e81f1afb23fc4212210b1de6d9633 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Tue, 25 Jan 2022 01:17:00 -0800 +Subject: KVM: LAPIC: Also cancel preemption timer during SET_LAPIC + +From: Wanpeng Li + +commit 35fe7cfbab2e81f1afb23fc4212210b1de6d9633 upstream. + +The below warning is splatting during guest reboot. 
+ + ------------[ cut here ]------------ + WARNING: CPU: 0 PID: 1931 at arch/x86/kvm/x86.c:10322 kvm_arch_vcpu_ioctl_run+0x874/0x880 [kvm] + CPU: 0 PID: 1931 Comm: qemu-system-x86 Tainted: G I 5.17.0-rc1+ #5 + RIP: 0010:kvm_arch_vcpu_ioctl_run+0x874/0x880 [kvm] + Call Trace: + + kvm_vcpu_ioctl+0x279/0x710 [kvm] + __x64_sys_ioctl+0x83/0xb0 + do_syscall_64+0x3b/0xc0 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7fd39797350b + +This can be triggered by not exposing tsc-deadline mode and doing a reboot in +the guest. The lapic_shutdown() function which is called in sys_reboot path +will not disarm the flying timer, it just masks LVTT. lapic_shutdown() clears +APIC state w/ LVT_MASKED and timer-mode bit is 0, this can trigger timer-mode +switch between tsc-deadline and oneshot/periodic, which can result in preemption +timer be cancelled in apic_update_lvtt(). However, We can't depend on this when +not exposing tsc-deadline mode and oneshot/periodic modes emulated by preemption +timer. Qemu will synchronise states around reset, let's cancel preemption timer +under KVM_SET_LAPIC. 
+ +Signed-off-by: Wanpeng Li +Message-Id: <1643102220-35667-1-git-send-email-wanpengli@tencent.com> +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/lapic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2623,7 +2623,7 @@ int kvm_apic_set_state(struct kvm_vcpu * + kvm_apic_set_version(vcpu); + + apic_update_ppr(apic); +- hrtimer_cancel(&apic->lapic_timer.timer); ++ cancel_apic_timer(apic); + apic->lapic_timer.expired_tscdeadline = 0; + apic_update_lvtt(apic); + apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); diff --git a/queue-5.15/kvm-ppc-book3s-hv-nested-fix-nested-hfscr-being-clobbered-with-multiple-vcpus.patch b/queue-5.15/kvm-ppc-book3s-hv-nested-fix-nested-hfscr-being-clobbered-with-multiple-vcpus.patch new file mode 100644 index 00000000000..91599b7145c --- /dev/null +++ b/queue-5.15/kvm-ppc-book3s-hv-nested-fix-nested-hfscr-being-clobbered-with-multiple-vcpus.patch @@ -0,0 +1,80 @@ +From 22f7ff0dea9491e90b6fe808ed40c30bd791e5c2 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Sat, 22 Jan 2022 20:55:30 +1000 +Subject: KVM: PPC: Book3S HV Nested: Fix nested HFSCR being clobbered with multiple vCPUs + +From: Nicholas Piggin + +commit 22f7ff0dea9491e90b6fe808ed40c30bd791e5c2 upstream. + +The L0 is storing HFSCR requested by the L1 for the L2 in struct +kvm_nested_guest when the L1 requests a vCPU enter L2. kvm_nested_guest +is not a per-vCPU structure. Hilarity ensues. + +Fix it by moving the nested hfscr into the vCPU structure together with +the other per-vCPU nested fields. 
+ +Fixes: 8b210a880b35 ("KVM: PPC: Book3S HV Nested: Make nested HFSCR state accessible") +Cc: stable@vger.kernel.org # v5.15+ +Signed-off-by: Nicholas Piggin +Reviewed-by: Fabiano Rosas +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20220122105530.3477250-1-npiggin@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/kvm_book3s_64.h | 1 - + arch/powerpc/include/asm/kvm_host.h | 1 + + arch/powerpc/kvm/book3s_hv.c | 3 +-- + arch/powerpc/kvm/book3s_hv_nested.c | 2 +- + 4 files changed, 3 insertions(+), 4 deletions(-) + +--- a/arch/powerpc/include/asm/kvm_book3s_64.h ++++ b/arch/powerpc/include/asm/kvm_book3s_64.h +@@ -39,7 +39,6 @@ struct kvm_nested_guest { + pgd_t *shadow_pgtable; /* our page table for this guest */ + u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ + u64 process_table; /* process table entry for this guest */ +- u64 hfscr; /* HFSCR that the L1 requested for this nested guest */ + long refcnt; /* number of pointers to this struct */ + struct mutex tlb_lock; /* serialize page faults and tlbies */ + struct kvm_nested_guest *next; +--- a/arch/powerpc/include/asm/kvm_host.h ++++ b/arch/powerpc/include/asm/kvm_host.h +@@ -814,6 +814,7 @@ struct kvm_vcpu_arch { + + /* For support of nested guests */ + struct kvm_nested_guest *nested; ++ u64 nested_hfscr; /* HFSCR that the L1 requested for the nested guest */ + u32 nested_vcpu_id; + gpa_t nested_io_gpr; + #endif +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -1731,7 +1731,6 @@ static int kvmppc_handle_exit_hv(struct + + static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) + { +- struct kvm_nested_guest *nested = vcpu->arch.nested; + int r; + int srcu_idx; + +@@ -1831,7 +1830,7 @@ static int kvmppc_handle_nested_exit(str + * it into a HEAI. 
+ */ + if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || +- (nested->hfscr & (1UL << cause))) { ++ (vcpu->arch.nested_hfscr & (1UL << cause))) { + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + + /* +--- a/arch/powerpc/kvm/book3s_hv_nested.c ++++ b/arch/powerpc/kvm/book3s_hv_nested.c +@@ -362,7 +362,7 @@ long kvmhv_enter_nested_guest(struct kvm + /* set L1 state to L2 state */ + vcpu->arch.nested = l2; + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; +- l2->hfscr = l2_hv.hfscr; ++ vcpu->arch.nested_hfscr = l2_hv.hfscr; + vcpu->arch.regs = l2_regs; + + /* Guest must always run with ME enabled, HV disabled. */ diff --git a/queue-5.15/kvm-svm-don-t-intercept-gp-for-sev-guests.patch b/queue-5.15/kvm-svm-don-t-intercept-gp-for-sev-guests.patch new file mode 100644 index 00000000000..4d770ebbc1e --- /dev/null +++ b/queue-5.15/kvm-svm-don-t-intercept-gp-for-sev-guests.patch @@ -0,0 +1,52 @@ +From 0b0be065b7563ac708aaa9f69dd4941c80b3446d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 20 Jan 2022 01:07:13 +0000 +Subject: KVM: SVM: Don't intercept #GP for SEV guests + +From: Sean Christopherson + +commit 0b0be065b7563ac708aaa9f69dd4941c80b3446d upstream. + +Never intercept #GP for SEV guests as reading SEV guest private memory +will return cyphertext, i.e. emulating on #GP can't work as intended. + +Cc: stable@vger.kernel.org +Cc: Tom Lendacky +Cc: Brijesh Singh +Signed-off-by: Sean Christopherson +Reviewed-by: Liam Merwick +Message-Id: <20220120010719.711476-4-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -303,7 +303,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, + return ret; + } + +- if (svm_gp_erratum_intercept) ++ /* ++ * Never intercept #GP for SEV guests, KVM can't ++ * decrypt guest memory to workaround the erratum. 
++ */ ++ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) + set_exception_intercept(svm, GP_VECTOR); + } + } +@@ -1176,9 +1180,10 @@ static void init_vmcb(struct kvm_vcpu *v + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway +- * as VMware does. ++ * as VMware does. Don't intercept #GP for SEV guests as KVM can't ++ * decrypt guest memory to decode the faulting instruction. + */ +- if (enable_vmware_backdoor) ++ if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) + set_exception_intercept(svm, GP_VECTOR); + + svm_set_intercept(svm, INTERCEPT_INTR); diff --git a/queue-5.15/kvm-svm-never-reject-emulation-due-to-smap-errata-for-sev-guests.patch b/queue-5.15/kvm-svm-never-reject-emulation-due-to-smap-errata-for-sev-guests.patch new file mode 100644 index 00000000000..1635f02d477 --- /dev/null +++ b/queue-5.15/kvm-svm-never-reject-emulation-due-to-smap-errata-for-sev-guests.patch @@ -0,0 +1,55 @@ +From 55467fcd55b89c622e62b4afe60ac0eb2fae91f2 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 20 Jan 2022 01:07:11 +0000 +Subject: KVM: SVM: Never reject emulation due to SMAP errata for !SEV guests + +From: Sean Christopherson + +commit 55467fcd55b89c622e62b4afe60ac0eb2fae91f2 upstream. + +Always signal that emulation is possible for !SEV guests regardless of +whether or not the CPU provided a valid instruction byte stream. KVM can +read all guest state (memory and registers) for !SEV guests, i.e. can +fetch the code stream from memory even if the CPU failed to do so because +of the SMAP errata. 
+ +Fixes: 05d5a4863525 ("KVM: SVM: Workaround errata#1096 (insn_len maybe zero on SMAP violation)") +Cc: stable@vger.kernel.org +Cc: Tom Lendacky +Cc: Brijesh Singh +Signed-off-by: Sean Christopherson +Reviewed-by: Liam Merwick +Message-Id: <20220120010719.711476-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4407,8 +4407,13 @@ static bool svm_can_emulate_instruction( + bool smep, smap, is_user; + unsigned long cr4; + ++ /* Emulation is always possible when KVM has access to all guest state. */ ++ if (!sev_guest(vcpu->kvm)) ++ return true; ++ + /* +- * When the guest is an SEV-ES guest, emulation is not possible. ++ * Emulation is impossible for SEV-ES guests as KVM doesn't have access ++ * to guest register state. + */ + if (sev_es_guest(vcpu->kvm)) + return false; +@@ -4461,9 +4466,6 @@ static bool svm_can_emulate_instruction( + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; + if (smap && (!smep || is_user)) { +- if (!sev_guest(vcpu->kvm)) +- return true; +- + pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } diff --git a/queue-5.15/kvm-x86-forcibly-leave-nested-virt-when-smm-state-is-toggled.patch b/queue-5.15/kvm-x86-forcibly-leave-nested-virt-when-smm-state-is-toggled.patch new file mode 100644 index 00000000000..a8b5b0dc70d --- /dev/null +++ b/queue-5.15/kvm-x86-forcibly-leave-nested-virt-when-smm-state-is-toggled.patch @@ -0,0 +1,173 @@ +From f7e570780efc5cec9b2ed1e0472a7da14e864fdb Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 25 Jan 2022 22:03:58 +0000 +Subject: KVM: x86: Forcibly leave nested virt when SMM state is toggled + +From: Sean Christopherson + +commit f7e570780efc5cec9b2ed1e0472a7da14e864fdb upstream. 
+
+Forcibly leave nested virtualization operation if userspace toggles SMM
+state via KVM_SET_VCPU_EVENTS or KVM_SYNC_X86_EVENTS. If userspace
+forces the vCPU out of SMM while it's post-VMXON and then injects an SMI,
+vmx_enter_smm() will overwrite vmx->nested.smm.vmxon and end up with both
+vmxon=false and smm.vmxon=false, but all other nVMX state allocated.
+
+Don't attempt to gracefully handle the transition as (a) most transitions
+are nonsensical, e.g. forcing SMM while L2 is running, (b) there isn't
+sufficient information to handle all transitions, e.g. SVM wants access
+to the SMRAM save state, and (c) KVM_SET_VCPU_EVENTS must precede
+KVM_SET_NESTED_STATE during state restore as the latter disallows putting
+the vCPU into L2 if SMM is active, and disallows tagging the vCPU as
+being post-VMXON in SMM if SMM is not active.
+
+Abuse of KVM_SET_VCPU_EVENTS manifests as a WARN and memory leak in nVMX
+due to failure to free vmcs01's shadow VMCS, but the bug goes far beyond
+just a memory leak, e.g. toggling SMM on while L2 is active puts the vCPU
+in an architecturally impossible state.
+ + WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] + WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 + Modules linked in: + CPU: 1 PID: 3606 Comm: syz-executor725 Not tainted 5.17.0-rc1-syzkaller #0 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + RIP: 0010:free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] + RIP: 0010:free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 + Code: <0f> 0b eb b3 e8 8f 4d 9f 00 e9 f7 fe ff ff 48 89 df e8 92 4d 9f 00 + Call Trace: + + kvm_arch_vcpu_destroy+0x72/0x2f0 arch/x86/kvm/x86.c:11123 + kvm_vcpu_destroy arch/x86/kvm/../../../virt/kvm/kvm_main.c:441 [inline] + kvm_destroy_vcpus+0x11f/0x290 arch/x86/kvm/../../../virt/kvm/kvm_main.c:460 + kvm_free_vcpus arch/x86/kvm/x86.c:11564 [inline] + kvm_arch_destroy_vm+0x2e8/0x470 arch/x86/kvm/x86.c:11676 + kvm_destroy_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1217 [inline] + kvm_put_kvm+0x4fa/0xb00 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1250 + kvm_vm_release+0x3f/0x50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1273 + __fput+0x286/0x9f0 fs/file_table.c:311 + task_work_run+0xdd/0x1a0 kernel/task_work.c:164 + exit_task_work include/linux/task_work.h:32 [inline] + do_exit+0xb29/0x2a30 kernel/exit.c:806 + do_group_exit+0xd2/0x2f0 kernel/exit.c:935 + get_signal+0x4b0/0x28c0 kernel/signal.c:2862 + arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 + handle_signal_work kernel/entry/common.c:148 [inline] + exit_to_user_mode_loop kernel/entry/common.c:172 [inline] + exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 + __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] + syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 + do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 + entry_SYSCALL_64_after_hwframe+0x44/0xae + + +Cc: stable@vger.kernel.org +Reported-by: syzbot+8112db3ab20e70d50c31@syzkaller.appspotmail.com 
+Signed-off-by: Sean Christopherson +Message-Id: <20220125220358.2091737-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/svm/nested.c | 9 +++++---- + arch/x86/kvm/svm/svm.c | 2 +- + arch/x86/kvm/svm/svm.h | 2 +- + arch/x86/kvm/vmx/nested.c | 1 + + arch/x86/kvm/x86.c | 4 +++- + 6 files changed, 12 insertions(+), 7 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1487,6 +1487,7 @@ struct kvm_x86_ops { + }; + + struct kvm_x86_nested_ops { ++ void (*leave_nested)(struct kvm_vcpu *vcpu); + int (*check_events)(struct kvm_vcpu *vcpu); + bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + void (*triple_fault)(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/svm/nested.c ++++ b/arch/x86/kvm/svm/nested.c +@@ -942,9 +942,9 @@ void svm_free_nested(struct vcpu_svm *sv + /* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +-void svm_leave_nested(struct vcpu_svm *svm) ++void svm_leave_nested(struct kvm_vcpu *vcpu) + { +- struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu)) { + svm->nested.nested_run_pending = 0; +@@ -1313,7 +1313,7 @@ static int svm_set_nested_state(struct k + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { +- svm_leave_nested(svm); ++ svm_leave_nested(vcpu); + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; + } +@@ -1378,7 +1378,7 @@ static int svm_set_nested_state(struct k + */ + + if (is_guest_mode(vcpu)) +- svm_leave_nested(svm); ++ svm_leave_nested(vcpu); + else + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + +@@ -1432,6 +1432,7 @@ static bool svm_get_nested_state_pages(s + } + + struct kvm_x86_nested_ops svm_nested_ops = { ++ .leave_nested = svm_leave_nested, + .check_events = svm_check_nested_events, + .triple_fault = nested_svm_triple_fault, + 
.get_nested_state_pages = svm_get_nested_state_pages, +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -281,7 +281,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, + + if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { + if (!(efer & EFER_SVME)) { +- svm_leave_nested(svm); ++ svm_leave_nested(vcpu); + svm_set_gif(svm, true); + /* #GP intercept is still needed for vmware backdoor */ + if (!enable_vmware_backdoor) +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -461,7 +461,7 @@ static inline bool nested_exit_on_nmi(st + + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, + u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); +-void svm_leave_nested(struct vcpu_svm *svm); ++void svm_leave_nested(struct kvm_vcpu *vcpu); + void svm_free_nested(struct vcpu_svm *svm); + int svm_allocate_nested(struct vcpu_svm *svm); + int nested_svm_vmrun(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -6748,6 +6748,7 @@ __init int nested_vmx_hardware_setup(int + } + + struct kvm_x86_nested_ops vmx_nested_ops = { ++ .leave_nested = vmx_leave_nested, + .check_events = vmx_check_nested_events, + .hv_timer_pending = nested_vmx_preemption_timer_pending, + .triple_fault = nested_vmx_triple_fault, +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4727,8 +4727,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e + vcpu->arch.apic->sipi_vector = events->sipi_vector; + + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { +- if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) ++ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { ++ kvm_x86_ops.nested_ops->leave_nested(vcpu); + kvm_smm_changed(vcpu, events->smi.smm); ++ } + + vcpu->arch.smi_pending = events->smi.pending; + diff --git a/queue-5.15/kvm-x86-keep-msr_ia32_xss-unchanged-for-init.patch b/queue-5.15/kvm-x86-keep-msr_ia32_xss-unchanged-for-init.patch new file mode 100644 index 00000000000..a3dcb8ef783 --- /dev/null +++ 
b/queue-5.15/kvm-x86-keep-msr_ia32_xss-unchanged-for-init.patch @@ -0,0 +1,42 @@ +From be4f3b3f82271c3193ce200a996dc70682c8e622 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 26 Jan 2022 17:22:24 +0000 +Subject: KVM: x86: Keep MSR_IA32_XSS unchanged for INIT + +From: Xiaoyao Li + +commit be4f3b3f82271c3193ce200a996dc70682c8e622 upstream. + +It has been corrected from SDM version 075 that MSR_IA32_XSS is reset to +zero on Power up and Reset but keeps unchanged on INIT. + +Fixes: a554d207dc46 ("KVM: X86: Processor States following Reset or INIT") +Cc: stable@vger.kernel.org +Signed-off-by: Xiaoyao Li +Signed-off-by: Sean Christopherson +Message-Id: <20220126172226.2298529-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10990,6 +10990,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp + vcpu->arch.msr_misc_features_enables = 0; + + vcpu->arch.xcr0 = XFEATURE_MASK_FP; ++ vcpu->arch.ia32_xss = 0; + } + + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); +@@ -11008,8 +11009,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp + eax = 0x600; + kvm_rdx_write(vcpu, eax); + +- vcpu->arch.ia32_xss = 0; +- + static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); diff --git a/queue-5.15/kvm-x86-nsvm-skip-eax-alignment-check-for-non-svm-instructions.patch b/queue-5.15/kvm-x86-nsvm-skip-eax-alignment-check-for-non-svm-instructions.patch new file mode 100644 index 00000000000..8a7bef1fb4e --- /dev/null +++ b/queue-5.15/kvm-x86-nsvm-skip-eax-alignment-check-for-non-svm-instructions.patch @@ -0,0 +1,53 @@ +From 47c28d436f409f5b009dc82bd82d4971088aa391 Mon Sep 17 00:00:00 2001 +From: Denis Valeev +Date: Sat, 22 Jan 2022 23:13:57 +0300 +Subject: KVM: x86: nSVM: skip eax alignment check for non-SVM instructions + +From: Denis Valeev + +commit 
47c28d436f409f5b009dc82bd82d4971088aa391 upstream. + +The bug occurs on #GP triggered by VMware backdoor when eax value is +unaligned. eax alignment check should not be applied to non-SVM +instructions because it leads to incorrect omission of the instructions +emulation. +Apply the alignment check only to SVM instructions to fix. + +Fixes: d1cba6c92237 ("KVM: x86: nSVM: test eax for 4K alignment for GP errata workaround") +Signed-off-by: Denis Valeev +Message-Id: +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -2238,10 +2238,6 @@ static int gp_interception(struct kvm_vc + if (error_code) + goto reinject; + +- /* All SVM instructions expect page aligned RAX */ +- if (svm->vmcb->save.rax & ~PAGE_MASK) +- goto reinject; +- + /* Decode the instruction for usage later */ + if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) + goto reinject; +@@ -2259,8 +2255,13 @@ static int gp_interception(struct kvm_vc + if (!is_guest_mode(vcpu)) + return kvm_emulate_instruction(vcpu, + EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); +- } else ++ } else { ++ /* All SVM instructions expect page aligned RAX */ ++ if (svm->vmcb->save.rax & ~PAGE_MASK) ++ goto reinject; ++ + return emulate_svm_instr(vcpu, opcode); ++ } + + reinject: + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); diff --git a/queue-5.15/kvm-x86-sync-the-states-size-with-the-xcr0-ia32_xss-at-any-time.patch b/queue-5.15/kvm-x86-sync-the-states-size-with-the-xcr0-ia32_xss-at-any-time.patch new file mode 100644 index 00000000000..c191ff8fe0a --- /dev/null +++ b/queue-5.15/kvm-x86-sync-the-states-size-with-the-xcr0-ia32_xss-at-any-time.patch @@ -0,0 +1,42 @@ +From 05a9e065059e566f218f8778c4d17ee75db56c55 Mon Sep 17 00:00:00 2001 +From: Like Xu +Date: Wed, 26 Jan 2022 17:22:26 +0000 +Subject: KVM: x86: 
Sync the states size with the XCR0/IA32_XSS at, any time + +From: Like Xu + +commit 05a9e065059e566f218f8778c4d17ee75db56c55 upstream. + +XCR0 is reset to 1 by RESET but not INIT and IA32_XSS is zeroed by +both RESET and INIT. The kvm_set_msr_common()'s handling of MSR_IA32_XSS +also needs to update kvm_update_cpuid_runtime(). In the above cases, the +size in bytes of the XSAVE area containing all states enabled by XCR0 or +(XCRO | IA32_XSS) needs to be updated. + +For simplicity and consistency, existing helpers are used to write values +and call kvm_update_cpuid_runtime(), and it's not exactly a fast path. + +Fixes: a554d207dc46 ("KVM: X86: Processor States following Reset or INIT") +Cc: stable@vger.kernel.org +Signed-off-by: Like Xu +Signed-off-by: Sean Christopherson +Message-Id: <20220126172226.2298529-4-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10990,8 +10990,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp + + vcpu->arch.msr_misc_features_enables = 0; + +- vcpu->arch.xcr0 = XFEATURE_MASK_FP; +- vcpu->arch.ia32_xss = 0; ++ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); ++ __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); + } + + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); diff --git a/queue-5.15/kvm-x86-update-vcpu-s-runtime-cpuid-on-write-to-msr_ia32_xss.patch b/queue-5.15/kvm-x86-update-vcpu-s-runtime-cpuid-on-write-to-msr_ia32_xss.patch new file mode 100644 index 00000000000..778ca26987a --- /dev/null +++ b/queue-5.15/kvm-x86-update-vcpu-s-runtime-cpuid-on-write-to-msr_ia32_xss.patch @@ -0,0 +1,34 @@ +From 4c282e51e4450b94680d6ca3b10f830483b1f243 Mon Sep 17 00:00:00 2001 +From: Like Xu +Date: Wed, 26 Jan 2022 17:22:25 +0000 +Subject: KVM: x86: Update vCPU's runtime CPUID on write to MSR_IA32_XSS + +From: Like Xu + +commit 4c282e51e4450b94680d6ca3b10f830483b1f243 upstream. 
+ +Do a runtime CPUID update for a vCPU if MSR_IA32_XSS is written, as the +size in bytes of the XSAVE area is affected by the states enabled in XSS. + +Fixes: 203000993de5 ("kvm: vmx: add MSR logic for XSAVES") +Cc: stable@vger.kernel.org +Signed-off-by: Like Xu +[sean: split out as a separate patch, adjust Fixes tag] +Signed-off-by: Sean Christopherson +Message-Id: <20220126172226.2298529-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3453,6 +3453,7 @@ int kvm_set_msr_common(struct kvm_vcpu * + if (data & ~supported_xss) + return 1; + vcpu->arch.ia32_xss = data; ++ kvm_update_cpuid_runtime(vcpu); + break; + case MSR_SMI_COUNT: + if (!msr_info->host_initiated) diff --git a/queue-5.15/perf-x86-intel-add-a-quirk-for-the-calculation-of-the-number-of-counters-on-alder-lake.patch b/queue-5.15/perf-x86-intel-add-a-quirk-for-the-calculation-of-the-number-of-counters-on-alder-lake.patch new file mode 100644 index 00000000000..e06a80d468a --- /dev/null +++ b/queue-5.15/perf-x86-intel-add-a-quirk-for-the-calculation-of-the-number-of-counters-on-alder-lake.patch @@ -0,0 +1,76 @@ +From 7fa981cad216e9f64f49e22112f610c0bfed91bc Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Tue, 11 Jan 2022 10:20:38 -0800 +Subject: perf/x86/intel: Add a quirk for the calculation of the number of counters on Alder Lake + +From: Kan Liang + +commit 7fa981cad216e9f64f49e22112f610c0bfed91bc upstream. + +For some Alder Lake machine with all E-cores disabled in a BIOS, the +below warning may be triggered. + +[ 2.010766] hw perf events fixed 5 > max(4), clipping! + +Current perf code relies on the CPUID leaf 0xA and leaf 7.EDX[15] to +calculate the number of the counters and follow the below assumption. + +For a hybrid configuration, the leaf 7.EDX[15] (X86_FEATURE_HYBRID_CPU) +is set. The leaf 0xA only enumerate the common counters. 
Linux perf has +to manually add the extra GP counters and fixed counters for P-cores. +For a non-hybrid configuration, the X86_FEATURE_HYBRID_CPU should not +be set. The leaf 0xA enumerates all counters. + +However, that's not the case when all E-cores are disabled in a BIOS. +Although there are only P-cores in the system, the leaf 7.EDX[15] +(X86_FEATURE_HYBRID_CPU) is still set. But the leaf 0xA is updated +to enumerate all counters of P-cores. The inconsistency triggers the +warning. + +Several software ways were considered to handle the inconsistency. +- Drop the leaf 0xA and leaf 7.EDX[15] CPUID enumeration support. + Hardcode the number of counters. This solution may be a problem for + virtualization. A hypervisor cannot control the number of counters + in a Linux guest via changing the guest CPUID enumeration anymore. +- Find another CPUID bit that is also updated with E-cores disabled. + There may be a problem in the virtualization environment too. Because + a hypervisor may disable the feature/CPUID bit. +- The P-cores have a maximum of 8 GP counters and 4 fixed counters on + ADL. The maximum number can be used to detect the case. + This solution is implemented in this patch. 
+ +Fixes: ee72a94ea4a6 ("perf/x86/intel: Fix fixed counter check warning for some Alder Lake") +Reported-by: Damjan Marion (damarion) +Reported-by: Chan Edison +Signed-off-by: Kan Liang +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Damjan Marion (damarion) +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1641925238-149288-1-git-send-email-kan.liang@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/events/intel/core.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -6187,6 +6187,19 @@ __init int intel_pmu_init(void) + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } ++ ++ /* ++ * Quirk: For some Alder Lake machine, when all E-cores are disabled in ++ * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However, ++ * the X86_FEATURE_HYBRID_CPU is still set. The above codes will ++ * mistakenly add extra counters for P-cores. Correct the number of ++ * counters here. 
++ */ ++ if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { ++ pmu->num_counters = x86_pmu.num_counters; ++ pmu->num_counters_fixed = x86_pmu.num_counters_fixed; ++ } ++ + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, diff --git a/queue-5.15/perf-x86-intel-uncore-fix-cas_count_write-issue-for-icx.patch b/queue-5.15/perf-x86-intel-uncore-fix-cas_count_write-issue-for-icx.patch new file mode 100644 index 00000000000..6edee622b50 --- /dev/null +++ b/queue-5.15/perf-x86-intel-uncore-fix-cas_count_write-issue-for-icx.patch @@ -0,0 +1,62 @@ +From 96fd2e89fba1aaada6f4b1e5d25a9d9ecbe1943d Mon Sep 17 00:00:00 2001 +From: Zhengjun Xing +Date: Thu, 23 Dec 2021 22:48:26 +0800 +Subject: perf/x86/intel/uncore: Fix CAS_COUNT_WRITE issue for ICX +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Zhengjun Xing + +commit 96fd2e89fba1aaada6f4b1e5d25a9d9ecbe1943d upstream. + +The user recently report a perf issue in the ICX platform, when test by +perf event “uncore_imc_x/cas_count_write”,the write bandwidth is always +very small (only 0.38MB/s), it is caused by the wrong "umask" for the +"cas_count_write" event. When double-checking, find "cas_count_read" +also is wrong. + +The public document for ICX uncore: + +3rd Gen Intel® Xeon® Processor Scalable Family, Codename Ice Lake,Uncore +Performance Monitoring Reference Manual, Revision 1.00, May 2021 + +On 2.4.7, it defines Unit Masks for CAS_COUNT: +RD b00001111 +WR b00110000 + +So corrected both "cas_count_read" and "cas_count_write" for ICX. 
+ +Old settings: + hswep_uncore_imc_events + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03") + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c") + +New settings: + snr_uncore_imc_events + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f") + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30") + +Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") +Signed-off-by: Zhengjun Xing +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Adrian Hunter +Reviewed-by: Kan Liang +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20211223144826.841267-1-zhengjun.xing@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/events/intel/uncore_snbep.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/events/intel/uncore_snbep.c ++++ b/arch/x86/events/intel/uncore_snbep.c +@@ -5482,7 +5482,7 @@ static struct intel_uncore_type icx_unco + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, +- .event_descs = hswep_uncore_imc_events, ++ .event_descs = snr_uncore_imc_events, + .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, + .event_ctl = SNR_IMC_MMIO_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, diff --git a/queue-5.15/pm-wakeup-simplify-the-output-logic-of-pm_show_wakelocks.patch b/queue-5.15/pm-wakeup-simplify-the-output-logic-of-pm_show_wakelocks.patch new file mode 100644 index 00000000000..6c48495eb07 --- /dev/null +++ b/queue-5.15/pm-wakeup-simplify-the-output-logic-of-pm_show_wakelocks.patch @@ -0,0 +1,51 @@ +From c9d967b2ce40d71e968eb839f36c936b8a9cf1ea Mon Sep 17 00:00:00 2001 +From: Greg Kroah-Hartman +Date: Thu, 13 Jan 2022 19:44:20 +0100 +Subject: PM: wakeup: simplify the output logic of pm_show_wakelocks() + +From: Greg Kroah-Hartman + +commit c9d967b2ce40d71e968eb839f36c936b8a9cf1ea upstream. + +The buffer handling in pm_show_wakelocks() is tricky, and hopefully +correct. 
Ensure it really is correct by using sysfs_emit_at() which +handles all of the tricky string handling logic in a PAGE_SIZE buffer +for us automatically as this is a sysfs file being read from. + +Reviewed-by: Lee Jones +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Greg Kroah-Hartman +--- + kernel/power/wakelock.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +--- a/kernel/power/wakelock.c ++++ b/kernel/power/wakelock.c +@@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, boo + { + struct rb_node *node; + struct wakelock *wl; +- char *str = buf; +- char *end = buf + PAGE_SIZE; ++ int len = 0; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws->active == show_active) +- str += scnprintf(str, end - str, "%s ", wl->name); ++ len += sysfs_emit_at(buf, len, "%s ", wl->name); + } +- if (str > buf) +- str--; + +- str += scnprintf(str, end - str, "\n"); ++ len += sysfs_emit_at(buf, len, "\n"); + + mutex_unlock(&wakelocks_lock); +- return (str - buf); ++ return len; + } + + #if CONFIG_PM_WAKELOCKS_LIMIT > 0 diff --git a/queue-5.15/powerpc-audit-fix-syscall_get_arch.patch b/queue-5.15/powerpc-audit-fix-syscall_get_arch.patch new file mode 100644 index 00000000000..f67d8925d85 --- /dev/null +++ b/queue-5.15/powerpc-audit-fix-syscall_get_arch.patch @@ -0,0 +1,65 @@ +From 252745240ba0ae774d2f80c5e185ed59fbc4fb41 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Fri, 14 Jan 2022 11:26:25 +0000 +Subject: powerpc/audit: Fix syscall_get_arch() + +From: Christophe Leroy + +commit 252745240ba0ae774d2f80c5e185ed59fbc4fb41 upstream. + +Commit 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") +and commit 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef +in syscall_get_arguments()") +replaced test_tsk_thread_flag(task, TIF_32BIT)) by is_32bit_task(). 
+ +But is_32bit_task() applies on current task while be want the test +done on task 'task' + +So add a new macro is_tsk_32bit_task() to check any task. + +Fixes: 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") +Fixes: 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments()") +Cc: stable@vger.kernel.org +Reported-by: Dmitry V. Levin +Signed-off-by: Christophe Leroy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/c55cddb8f65713bf5859ed675d75a50cb37d5995.1642159570.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/syscall.h | 4 ++-- + arch/powerpc/include/asm/thread_info.h | 2 ++ + 2 files changed, 4 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/include/asm/syscall.h ++++ b/arch/powerpc/include/asm/syscall.h +@@ -90,7 +90,7 @@ static inline void syscall_get_arguments + unsigned long val, mask = -1UL; + unsigned int n = 6; + +- if (is_32bit_task()) ++ if (is_tsk_32bit_task(task)) + mask = 0xffffffff; + + while (n--) { +@@ -115,7 +115,7 @@ static inline void syscall_set_arguments + + static inline int syscall_get_arch(struct task_struct *task) + { +- if (is_32bit_task()) ++ if (is_tsk_32bit_task(task)) + return AUDIT_ARCH_PPC; + else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + return AUDIT_ARCH_PPC64LE; +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -165,8 +165,10 @@ static inline bool test_thread_local_fla + + #ifdef CONFIG_COMPAT + #define is_32bit_task() (test_thread_flag(TIF_32BIT)) ++#define is_tsk_32bit_task(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT)) + #else + #define is_32bit_task() (IS_ENABLED(CONFIG_PPC32)) ++#define is_tsk_32bit_task(tsk) (IS_ENABLED(CONFIG_PPC32)) + #endif + + #if defined(CONFIG_PPC64) diff --git a/queue-5.15/psi-fix-uaf-issue-when-psi-trigger-is-destroyed-while-being-polled.patch b/queue-5.15/psi-fix-uaf-issue-when-psi-trigger-is-destroyed-while-being-polled.patch new file mode 
100644 index 00000000000..333c61a90eb --- /dev/null +++ b/queue-5.15/psi-fix-uaf-issue-when-psi-trigger-is-destroyed-while-being-polled.patch @@ -0,0 +1,240 @@ +From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Tue, 11 Jan 2022 15:23:09 -0800 +Subject: psi: Fix uaf issue when psi trigger is destroyed while being polled + +From: Suren Baghdasaryan + +commit a06247c6804f1a7c86a2e5398a4c1f1db1471848 upstream. + +With write operation on psi files replacing old trigger with a new one, +the lifetime of its waitqueue is totally arbitrary. Overwriting an +existing trigger causes its waitqueue to be freed and pending poll() +will stumble on trigger->event_wait which was destroyed. +Fix this by disallowing to redefine an existing psi trigger. If a write +operation is used on a file descriptor with an already existing psi +trigger, the operation will fail with EBUSY error. +Also bypass a check for psi_disabled in the psi_trigger_destroy as the +flag can be flipped after the trigger is created, leading to a memory +leak. + +Fixes: 0e94682b73bf ("psi: introduce psi monitor") +Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com +Suggested-by: Linus Torvalds +Analyzed-by: Eric Biggers +Signed-off-by: Suren Baghdasaryan +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Eric Biggers +Acked-by: Johannes Weiner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/accounting/psi.rst | 3 + + include/linux/psi.h | 2 - + include/linux/psi_types.h | 3 - + kernel/cgroup/cgroup.c | 11 ++++-- + kernel/sched/psi.c | 66 +++++++++++++++++---------------------- + 5 files changed, 40 insertions(+), 45 deletions(-) + +--- a/Documentation/accounting/psi.rst ++++ b/Documentation/accounting/psi.rst +@@ -92,7 +92,8 @@ Triggers can be set on more than one psi + for the same psi metric can be specified. 
However for each trigger a separate + file descriptor is required to be able to poll it separately from others, + therefore for each trigger a separate open() syscall should be made even +-when opening the same psi interface file. ++when opening the same psi interface file. Write operations to a file descriptor ++with an already existing psi trigger will fail with EBUSY. + + Monitors activate only when system enters stall state for the monitored + psi metric and deactivates upon exit from the stall state. While system is +--- a/include/linux/psi.h ++++ b/include/linux/psi.h +@@ -32,7 +32,7 @@ void cgroup_move_task(struct task_struct + + struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res); +-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); ++void psi_trigger_destroy(struct psi_trigger *t); + + __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, + poll_table *wait); +--- a/include/linux/psi_types.h ++++ b/include/linux/psi_types.h +@@ -140,9 +140,6 @@ struct psi_trigger { + * events to one per window + */ + u64 last_event_time; +- +- /* Refcounting to prevent premature destruction */ +- struct kref refcount; + }; + + struct psi_group { +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(str + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + ++ /* Allow only one trigger per file descriptor */ ++ if (ctx->psi.trigger) { ++ cgroup_put(cgrp); ++ return -EBUSY; ++ } ++ + psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : &cgrp->psi; + new = psi_trigger_create(psi, buf, nbytes, res); + if (IS_ERR(new)) { +@@ -3649,8 +3655,7 @@ static ssize_t cgroup_pressure_write(str + return PTR_ERR(new); + } + +- psi_trigger_replace(&ctx->psi.trigger, new); +- ++ smp_store_release(&ctx->psi.trigger, new); + cgroup_put(cgrp); + + return nbytes; +@@ -3689,7 +3694,7 @@ static void cgroup_pressure_release(stru + { + struct cgroup_file_ctx *ctx = of->priv; + +- psi_trigger_replace(&ctx->psi.trigger, NULL); ++ psi_trigger_destroy(ctx->psi.trigger); + } + + bool cgroup_psi_enabled(void) +--- a/kernel/sched/psi.c ++++ b/kernel/sched/psi.c +@@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(s + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); +- kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + +@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(s + return t; + } + +-static void psi_trigger_destroy(struct kref *ref) ++void psi_trigger_destroy(struct psi_trigger *t) + { +- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); +- struct psi_group *group = t->group; ++ struct psi_group *group; + struct task_struct *task_to_destroy = NULL; + +- if (static_branch_likely(&psi_disabled)) ++ /* ++ * We do not check psi_disabled since it might have been disabled after ++ * the trigger got created. ++ */ ++ if (!t) + return; + ++ group = t->group; + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. +@@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct k + mutex_unlock(&group->trigger_lock); + + /* +- * Wait for both *trigger_ptr from psi_trigger_replace and +- * poll_task RCUs to complete their read-side critical sections +- * before destroying the trigger and optionally the poll_task ++ * Wait for psi_schedule_poll_work RCU to complete its read-side ++ * critical section before destroying the trigger and optionally the ++ * poll_task. 
+ */ + synchronize_rcu(); + /* +@@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct k + kfree(t); + } + +-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +-{ +- struct psi_trigger *old = *trigger_ptr; +- +- if (static_branch_likely(&psi_disabled)) +- return; +- +- rcu_assign_pointer(*trigger_ptr, new); +- if (old) +- kref_put(&old->refcount, psi_trigger_destroy); +-} +- + __poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) + { +@@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + +- rcu_read_lock(); +- +- t = rcu_dereference(*(void __rcu __force **)trigger_ptr); +- if (!t) { +- rcu_read_unlock(); ++ t = smp_load_acquire(trigger_ptr); ++ if (!t) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; +- } +- kref_get(&t->refcount); +- +- rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + +- kref_put(&t->refcount, psi_trigger_destroy); +- + return ret; + } + +@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *fi + + buf[buf_size - 1] = '\0'; + +- new = psi_trigger_create(&psi_system, buf, nbytes, res); +- if (IS_ERR(new)) +- return PTR_ERR(new); +- + seq = file->private_data; ++ + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); +- psi_trigger_replace(&seq->private, new); ++ ++ /* Allow only one trigger per file descriptor */ ++ if (seq->private) { ++ mutex_unlock(&seq->lock); ++ return -EBUSY; ++ } ++ ++ new = psi_trigger_create(&psi_system, buf, nbytes, res); ++ if (IS_ERR(new)) { ++ mutex_unlock(&seq->lock); ++ return PTR_ERR(new); ++ } ++ ++ smp_store_release(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode + { + struct seq_file *seq = file->private_data; + +- psi_trigger_replace(&seq->private, 
NULL); ++ psi_trigger_destroy(seq->private); + return single_release(inode, file); + } + diff --git a/queue-5.15/revert-kvm-svm-avoid-infinite-loop-on-npf-from-bad-address.patch b/queue-5.15/revert-kvm-svm-avoid-infinite-loop-on-npf-from-bad-address.patch new file mode 100644 index 00000000000..b5a844bc0be --- /dev/null +++ b/queue-5.15/revert-kvm-svm-avoid-infinite-loop-on-npf-from-bad-address.patch @@ -0,0 +1,60 @@ +From 31c25585695abdf03d6160aa6d829e855b256329 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 20 Jan 2022 01:07:12 +0000 +Subject: Revert "KVM: SVM: avoid infinite loop on NPF from bad address" + +From: Sean Christopherson + +commit 31c25585695abdf03d6160aa6d829e855b256329 upstream. + +Revert a completely broken check on an "invalid" RIP in SVM's workaround +for the DecodeAssists SMAP errata. kvm_vcpu_gfn_to_memslot() obviously +expects a gfn, i.e. operates in the guest physical address space, whereas +RIP is a virtual (not even linear) address. The "fix" worked for the +problematic KVM selftest because the test identity mapped RIP. + +Fully revert the hack instead of trying to translate RIP to a GPA, as the +non-SEV case is now handled earlier, and KVM cannot access guest page +tables to translate RIP. + +This reverts commit e72436bc3a5206f95bb384e741154166ddb3202e. 
+ +Fixes: e72436bc3a52 ("KVM: SVM: avoid infinite loop on NPF from bad address") +Reported-by: Liam Merwick +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Reviewed-by: Liam Merwick +Message-Id: <20220120010719.711476-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 7 ------- + virt/kvm/kvm_main.c | 1 - + 2 files changed, 8 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4456,13 +4456,6 @@ static bool svm_can_emulate_instruction( + if (likely(!insn || insn_len)) + return true; + +- /* +- * If RIP is invalid, go ahead with emulation which will cause an +- * internal error exit. +- */ +- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) +- return true; +- + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2104,7 +2104,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_ + + return NULL; + } +-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); + + bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) + { diff --git a/queue-5.15/series b/queue-5.15/series index f5eaf2a1256..b8fb92c89af 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -17,3 +17,30 @@ ucount-make-get_ucount-a-safe-get_user-replacement.patch scsi-zfcp-fix-failed-recovery-on-gone-remote-port-with-non-npiv-fcp-devices.patch udf-restore-i_lenalloc-when-inode-expansion-fails.patch udf-fix-null-ptr-deref-when-converting-from-inline-format.patch +efi-runtime-avoid-efiv2-runtime-services-on-apple-x86-machines.patch +pm-wakeup-simplify-the-output-logic-of-pm_show_wakelocks.patch +tracing-histogram-fix-a-potential-memory-leak-for-kstrdup.patch +tracing-don-t-inc-err_log-entry-count-if-entry-allocation-fails.patch +ceph-properly-put-ceph_string-reference-after-async-create-attempt.patch +ceph-set-pool_ns-in-new-inode-layout-for-async-creates.patch 
+fsnotify-fix-fsnotify-hooks-in-pseudo-filesystems.patch +revert-kvm-svm-avoid-infinite-loop-on-npf-from-bad-address.patch +psi-fix-uaf-issue-when-psi-trigger-is-destroyed-while-being-polled.patch +powerpc-audit-fix-syscall_get_arch.patch +perf-x86-intel-uncore-fix-cas_count_write-issue-for-icx.patch +perf-x86-intel-add-a-quirk-for-the-calculation-of-the-number-of-counters-on-alder-lake.patch +drm-etnaviv-relax-submit-size-limits.patch +drm-atomic-add-the-crtc-to-affected-crtc-only-if-uapi.enable-true.patch +drm-amd-display-fix-fp-start-end-for-dcn30_internal_validate_bw.patch +kvm-lapic-also-cancel-preemption-timer-during-set_lapic.patch +kvm-svm-never-reject-emulation-due-to-smap-errata-for-sev-guests.patch +kvm-svm-don-t-intercept-gp-for-sev-guests.patch +kvm-x86-nsvm-skip-eax-alignment-check-for-non-svm-instructions.patch +kvm-x86-forcibly-leave-nested-virt-when-smm-state-is-toggled.patch +kvm-x86-keep-msr_ia32_xss-unchanged-for-init.patch +kvm-x86-update-vcpu-s-runtime-cpuid-on-write-to-msr_ia32_xss.patch +kvm-x86-sync-the-states-size-with-the-xcr0-ia32_xss-at-any-time.patch +kvm-ppc-book3s-hv-nested-fix-nested-hfscr-being-clobbered-with-multiple-vcpus.patch +dm-revert-partial-fix-for-redundant-bio-based-io-accounting.patch +block-add-bio_start_io_acct_time-to-control-start_time.patch +dm-properly-fix-redundant-bio-based-io-accounting.patch diff --git a/queue-5.15/tracing-don-t-inc-err_log-entry-count-if-entry-allocation-fails.patch b/queue-5.15/tracing-don-t-inc-err_log-entry-count-if-entry-allocation-fails.patch new file mode 100644 index 00000000000..205995df069 --- /dev/null +++ b/queue-5.15/tracing-don-t-inc-err_log-entry-count-if-entry-allocation-fails.patch @@ -0,0 +1,38 @@ +From 67ab5eb71b37b55f7c5522d080a1b42823351776 Mon Sep 17 00:00:00 2001 +From: Tom Zanussi +Date: Thu, 27 Jan 2022 15:44:18 -0600 +Subject: tracing: Don't inc err_log entry count if entry allocation fails + +From: Tom Zanussi + +commit 67ab5eb71b37b55f7c5522d080a1b42823351776 
upstream. + +tr->n_err_log_entries should only be increased if entry allocation +succeeds. + +Doing it when it fails won't cause any problems other than wasting an +entry, but should be fixed anyway. + +Link: https://lkml.kernel.org/r/cad1ab28f75968db0f466925e7cba5970cec6c29.1643319703.git.zanussi@kernel.org + +Cc: stable@vger.kernel.org +Fixes: 2f754e771b1a6 ("tracing: Have the error logs show up in the proper instances") +Signed-off-by: Tom Zanussi +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -7749,7 +7749,8 @@ static struct tracing_log_err *get_traci + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); +- tr->n_err_log_entries++; ++ else ++ tr->n_err_log_entries++; + + return err; + } diff --git a/queue-5.15/tracing-histogram-fix-a-potential-memory-leak-for-kstrdup.patch b/queue-5.15/tracing-histogram-fix-a-potential-memory-leak-for-kstrdup.patch new file mode 100644 index 00000000000..e5d25dd7f2b --- /dev/null +++ b/queue-5.15/tracing-histogram-fix-a-potential-memory-leak-for-kstrdup.patch @@ -0,0 +1,37 @@ +From e629e7b525a179e29d53463d992bdee759c950fb Mon Sep 17 00:00:00 2001 +From: Xiaoke Wang +Date: Tue, 25 Jan 2022 12:07:15 +0800 +Subject: tracing/histogram: Fix a potential memory leak for kstrdup() + +From: Xiaoke Wang + +commit e629e7b525a179e29d53463d992bdee759c950fb upstream. + +kfree() is missing on an error path to free the memory allocated by +kstrdup(): + + p = param = kstrdup(data->params[i], GFP_KERNEL); + +So it is better to free it via kfree(p). 
+ +Link: https://lkml.kernel.org/r/tencent_C52895FD37802832A3E5B272D05008866F0A@qq.com + +Cc: stable@vger.kernel.org +Fixes: d380dcde9a07c ("tracing: Fix now invalid var_ref_vals assumption in trace action") +Signed-off-by: Xiaoke Wang +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events_hist.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -3581,6 +3581,7 @@ static int trace_action_create(struct hi + + var_ref_idx = find_var_ref_idx(hist_data, var_ref); + if (WARN_ON(var_ref_idx < 0)) { ++ kfree(p); + ret = var_ref_idx; + goto err; + }