]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 20 Feb 2023 11:21:11 +0000 (12:21 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 20 Feb 2023 11:21:11 +0000 (12:21 +0100)
added patches:
alarmtimer-prevent-starvation-by-small-intervals-and-sig_ign.patch
kvm-initialize-all-of-the-kvm_debugregs-structure-before-sending-it-to-userspace.patch
kvm-x86-pmu-disable-vpmu-support-on-hybrid-cpus-host-pmus.patch
nvme-pci-refresh-visible-attrs-for-cmb-attributes.patch
perf-x86-refuse-to-export-capabilities-for-hybrid-pmus.patch

queue-6.1/alarmtimer-prevent-starvation-by-small-intervals-and-sig_ign.patch [new file with mode: 0644]
queue-6.1/kvm-initialize-all-of-the-kvm_debugregs-structure-before-sending-it-to-userspace.patch [new file with mode: 0644]
queue-6.1/kvm-x86-pmu-disable-vpmu-support-on-hybrid-cpus-host-pmus.patch [new file with mode: 0644]
queue-6.1/nvme-pci-refresh-visible-attrs-for-cmb-attributes.patch [new file with mode: 0644]
queue-6.1/perf-x86-refuse-to-export-capabilities-for-hybrid-pmus.patch [new file with mode: 0644]
queue-6.1/series

diff --git a/queue-6.1/alarmtimer-prevent-starvation-by-small-intervals-and-sig_ign.patch b/queue-6.1/alarmtimer-prevent-starvation-by-small-intervals-and-sig_ign.patch
new file mode 100644 (file)
index 0000000..bc89e50
--- /dev/null
@@ -0,0 +1,132 @@
+From d125d1349abeb46945dc5e98f7824bf688266f13 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 9 Feb 2023 23:25:49 +0100
+Subject: alarmtimer: Prevent starvation by small intervals and SIG_IGN
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit d125d1349abeb46945dc5e98f7824bf688266f13 upstream.
+
+syzbot reported a RCU stall which is caused by setting up an alarmtimer
+with a very small interval and ignoring the signal. The reproducer arms the
+alarm timer with a relative expiry of 8ns and an interval of 9ns. Not a
+problem per se, but that's an issue when the signal is ignored because then
+the timer is immediately rearmed because there is no way to delay that
+rearming to the signal delivery path.  See posix_timer_fn() and commit
+58229a189942 ("posix-timers: Prevent softirq starvation by small intervals
+and SIG_IGN") for details.
+
+The reproducer does not set SIG_IGN explicitly, but it sets up the timer's
+signal with SIGCONT. That has the same effect as explicitly setting
+SIG_IGN for a signal as SIGCONT is ignored if there is no handler set and
+the task is not ptraced.
+
+The log clearly shows that:
+
+   [pid  5102] --- SIGCONT {si_signo=SIGCONT, si_code=SI_TIMER, si_timerid=0, si_overrun=316014, si_int=0, si_ptr=NULL} ---
+
+It works because the tasks are traced and therefore the signal is queued so
+the tracer can see it, which delays the restart of the timer to the signal
+delivery path. But then the tracer is killed:
+
+   [pid  5087] kill(-5102, SIGKILL <unfinished ...>
+   ...
+   ./strace-static-x86_64: Process 5107 detached
+
+and after it's gone the stall can be observed:
+
+   syzkaller login: [   79.439102][    C0] hrtimer: interrupt took 68471 ns
+   [  184.460538][    C1] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
+   ...
+   [  184.658237][    C1] rcu: Stack dump where RCU GP kthread last ran:
+   [  184.664574][    C1] Sending NMI from CPU 1 to CPUs 0:
+   [  184.669821][    C0] NMI backtrace for cpu 0
+   [  184.669831][    C0] CPU: 0 PID: 5108 Comm: syz-executor192 Not tainted 6.2.0-rc6-next-20230203-syzkaller #0
+   ...
+   [  184.670036][    C0] Call Trace:
+   [  184.670041][    C0]  <IRQ>
+   [  184.670045][    C0]  alarmtimer_fired+0x327/0x670
+
+posix_timer_fn() prevents that by checking whether the interval for
+timers which have the signal ignored is smaller than a jiffie and
+artificially delaying it by shifting the next expiry out by a jiffie. That's
+accurate vs. the overrun accounting, but slightly inaccurate
+vs. timer_gettime(2).
+
+The comment in that function says what needs to be done and there was a fix
+available for the regular userspace induced SIG_IGN mechanism, but that did
+not work due to the implicit ignore for SIGCONT and similar signals. This
+needs to be worked on, but for now the only available workaround is to do
+exactly what posix_timer_fn() does:
+
+Increase the interval of self-rearming timers, which have their signal
+ignored, to at least a jiffie.
+
+Interestingly this has been fixed before via commit ff86bf0c65f1
+("alarmtimer: Rate limit periodic intervals") already, but that fix got
+lost in a later rework.
+
+Reported-by: syzbot+b9564ba6e8e00694511b@syzkaller.appspotmail.com
+Fixes: f2c45807d399 ("alarmtimer: Switch over to generic set/get/rearm routine")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: John Stultz <jstultz@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/87k00q1no2.ffs@tglx
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/time/alarmtimer.c |   33 +++++++++++++++++++++++++++++----
+ 1 file changed, 29 insertions(+), 4 deletions(-)
+
+--- a/kernel/time/alarmtimer.c
++++ b/kernel/time/alarmtimer.c
+@@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, k
+ }
+ EXPORT_SYMBOL_GPL(alarm_forward);
+-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
++static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
+ {
+       struct alarm_base *base = &alarm_bases[alarm->type];
++      ktime_t now = base->get_ktime();
++
++      if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
++              /*
++               * Same issue as with posix_timer_fn(). Timers which are
++               * periodic but the signal is ignored can starve the system
++               * with a very small interval. The real fix which was
++               * promised in the context of posix_timer_fn() never
++               * materialized, but someone should really work on it.
++               *
++               * To prevent DOS fake @now to be 1 jiffie out which keeps
++               * the overrun accounting correct but creates an
++               * inconsistency vs. timer_gettime(2).
++               */
++              ktime_t kj = NSEC_PER_SEC / HZ;
++
++              if (interval < kj)
++                      now = ktime_add(now, kj);
++      }
++
++      return alarm_forward(alarm, now, interval);
++}
+-      return alarm_forward(alarm, base->get_ktime(), interval);
++u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
++{
++      return __alarm_forward_now(alarm, interval, false);
+ }
+ EXPORT_SYMBOL_GPL(alarm_forward_now);
+@@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_han
+       if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
+               /*
+                * Handle ignored signals and rearm the timer. This will go
+-               * away once we handle ignored signals proper.
++               * away once we handle ignored signals proper. Ensure that
++               * small intervals cannot starve the system.
+                */
+-              ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
++              ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
+               ++ptr->it_requeue_pending;
+               ptr->it_active = 1;
+               result = ALARMTIMER_RESTART;
diff --git a/queue-6.1/kvm-initialize-all-of-the-kvm_debugregs-structure-before-sending-it-to-userspace.patch b/queue-6.1/kvm-initialize-all-of-the-kvm_debugregs-structure-before-sending-it-to-userspace.patch
new file mode 100644 (file)
index 0000000..87886ed
--- /dev/null
@@ -0,0 +1,53 @@
+From 2c10b61421a28e95a46ab489fd56c0f442ff6952 Mon Sep 17 00:00:00 2001
+From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Date: Tue, 14 Feb 2023 11:33:04 +0100
+Subject: kvm: initialize all of the kvm_debugregs structure before sending it to userspace
+
+From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+commit 2c10b61421a28e95a46ab489fd56c0f442ff6952 upstream.
+
+When calling the KVM_GET_DEBUGREGS ioctl, on some configurations, there
+might be some uninitialized portions of the kvm_debugregs structure that
+could be copied to userspace.  Prevent this as is done in the other kvm
+ioctls, by setting the whole structure to 0 before copying anything into
+it.
+
+Bonus is that this reduces the lines of code as the explicit flag
+setting and reserved space zeroing out can be removed.
+
+Cc: Sean Christopherson <seanjc@google.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: <x86@kernel.org>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: stable <stable@kernel.org>
+Reported-by: Xingyuan Mo <hdthky0@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Message-Id: <20230214103304.3689213-1-gregkh@linuxfoundation.org>
+Tested-by: Xingyuan Mo <hdthky0@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -5250,12 +5250,11 @@ static void kvm_vcpu_ioctl_x86_get_debug
+ {
+       unsigned long val;
++      memset(dbgregs, 0, sizeof(*dbgregs));
+       memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+       kvm_get_dr(vcpu, 6, &val);
+       dbgregs->dr6 = val;
+       dbgregs->dr7 = vcpu->arch.dr7;
+-      dbgregs->flags = 0;
+-      memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
+ }
+ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
diff --git a/queue-6.1/kvm-x86-pmu-disable-vpmu-support-on-hybrid-cpus-host-pmus.patch b/queue-6.1/kvm-x86-pmu-disable-vpmu-support-on-hybrid-cpus-host-pmus.patch
new file mode 100644 (file)
index 0000000..3850328
--- /dev/null
@@ -0,0 +1,87 @@
+From 4d7404e5ee0066e9a9e8268675de8a273b568b08 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 8 Feb 2023 20:42:29 +0000
+Subject: KVM: x86/pmu: Disable vPMU support on hybrid CPUs (host PMUs)
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 4d7404e5ee0066e9a9e8268675de8a273b568b08 upstream.
+
+Disable KVM support for virtualizing PMUs on hosts with hybrid PMUs until
+KVM gains a sane way to enumerate the hybrid vPMU to userspace and/or
+gains a mechanism to let userspace opt-in to the dangers of exposing a
+hybrid vPMU to KVM guests.  Virtualizing a hybrid PMU, or at least part of
+a hybrid PMU, is possible, but it requires careful, deliberate
+configuration from userspace.
+
+E.g. to expose full functionality, vCPUs need to be pinned to pCPUs to
+prevent migrating a vCPU between a big core and a little core, userspace
+must enumerate a reasonable topology to the guest, and guest CPUID must be
+curated per vCPU to enumerate accurate vPMU capabilities.
+
+The last point is especially problematic, as KVM doesn't control which
+pCPU it runs on when enumerating KVM's vPMU capabilities to userspace,
+i.e. userspace can't rely on KVM_GET_SUPPORTED_CPUID in its current form.
+
+Alternatively, userspace could enable vPMU support by enumerating the
+set of features that are common and coherent across all cores, e.g. by
+filtering PMU events and restricting guest capabilities.  But again, that
+requires userspace to take action far beyond reflecting KVM's supported
+feature set into the guest.
+
+For now, simply disable vPMU support on hybrid CPUs to avoid inducing
+seemingly random #GPs in guests, and punt support for hybrid CPUs to a
+future enabling effort.
+
+Reported-by: Jianfeng Gao <jianfeng.gao@intel.com>
+Cc: stable@vger.kernel.org
+Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Kan Liang <kan.liang@linux.intel.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Link: https://lore.kernel.org/all/20220818181530.2355034-1-kan.liang@linux.intel.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20230208204230.1360502-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/pmu.h |   26 +++++++++++++++++++-------
+ 1 file changed, 19 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/pmu.h
++++ b/arch/x86/kvm/pmu.h
+@@ -164,15 +164,27 @@ static inline void kvm_init_pmu_capabili
+ {
+       bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+-      perf_get_x86_pmu_capability(&kvm_pmu_cap);
+-
+-       /*
+-        * For Intel, only support guest architectural pmu
+-        * on a host with architectural pmu.
+-        */
+-      if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
++      /*
++       * Hybrid PMUs don't play nice with virtualization without careful
++       * configuration by userspace, and KVM's APIs for reporting supported
++       * vPMU features do not account for hybrid PMUs.  Disable vPMU support
++       * for hybrid PMUs until KVM gains a way to let userspace opt-in.
++       */
++      if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+               enable_pmu = false;
++      if (enable_pmu) {
++              perf_get_x86_pmu_capability(&kvm_pmu_cap);
++
++              /*
++               * For Intel, only support guest architectural pmu
++               * on a host with architectural pmu.
++               */
++              if ((is_intel && !kvm_pmu_cap.version) ||
++                  !kvm_pmu_cap.num_counters_gp)
++                      enable_pmu = false;
++      }
++
+       if (!enable_pmu) {
+               memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
+               return;
diff --git a/queue-6.1/nvme-pci-refresh-visible-attrs-for-cmb-attributes.patch b/queue-6.1/nvme-pci-refresh-visible-attrs-for-cmb-attributes.patch
new file mode 100644 (file)
index 0000000..19ba2e2
--- /dev/null
@@ -0,0 +1,54 @@
+From e917a849c3fc317c4a5f82bb18726000173d39e6 Mon Sep 17 00:00:00 2001
+From: Keith Busch <kbusch@kernel.org>
+Date: Thu, 16 Feb 2023 08:44:03 -0800
+Subject: nvme-pci: refresh visible attrs for cmb attributes
+
+From: Keith Busch <kbusch@kernel.org>
+
+commit e917a849c3fc317c4a5f82bb18726000173d39e6 upstream.
+
+The sysfs group containing the cmb attributes is registered before the
+driver knows if they need to be visible or not. Update the group when
+cmb attributes are known to exist so the visibility setting is correct.
+
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=217037
+Fixes: 86adbf0cdb9ec65 ("nvme: simplify transport specific device attribute handling")
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/pci.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -109,6 +109,7 @@ struct nvme_queue;
+ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
++static void nvme_update_attrs(struct nvme_dev *dev);
+ /*
+  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+@@ -1967,6 +1968,8 @@ static void nvme_map_cmb(struct nvme_dev
+       if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
+                       (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
+               pci_p2pmem_publish(pdev, true);
++
++      nvme_update_attrs(dev);
+ }
+ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
+@@ -2250,6 +2253,11 @@ static const struct attribute_group *nvm
+       NULL,
+ };
++static void nvme_update_attrs(struct nvme_dev *dev)
++{
++      sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group);
++}
++
+ /*
+  * nirqs is the number of interrupts available for write and read
+  * queues. The core already reserved an interrupt for the admin queue.
diff --git a/queue-6.1/perf-x86-refuse-to-export-capabilities-for-hybrid-pmus.patch b/queue-6.1/perf-x86-refuse-to-export-capabilities-for-hybrid-pmus.patch
new file mode 100644 (file)
index 0000000..4a84300
--- /dev/null
@@ -0,0 +1,56 @@
+From 4b4191b8ae1278bde3642acaaef8f92810ed111a Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 8 Feb 2023 20:42:30 +0000
+Subject: perf/x86: Refuse to export capabilities for hybrid PMUs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 4b4191b8ae1278bde3642acaaef8f92810ed111a upstream.
+
+Now that KVM disables vPMU support on hybrid CPUs, WARN and return zeros
+if perf_get_x86_pmu_capability() is invoked on a hybrid CPU.  The helper
+doesn't provide an accurate accounting of the PMU capabilities for hybrid
+CPUs and needs to be enhanced if KVM, or anything else outside of perf,
+wants to act on the PMU capabilities.
+
+Cc: stable@vger.kernel.org
+Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Kan Liang <kan.liang@linux.intel.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Link: https://lore.kernel.org/all/20220818181530.2355034-1-kan.liang@linux.intel.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20230208204230.1360502-3-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/events/core.c |   12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/events/core.c
++++ b/arch/x86/events/core.c
+@@ -2994,17 +2994,19 @@ unsigned long perf_misc_flags(struct pt_
+ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+ {
+-      if (!x86_pmu_initialized()) {
++      /* This API doesn't currently support enumerating hybrid PMUs. */
++      if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
++          !x86_pmu_initialized()) {
+               memset(cap, 0, sizeof(*cap));
+               return;
+       }
+-      cap->version            = x86_pmu.version;
+       /*
+-       * KVM doesn't support the hybrid PMU yet.
+-       * Return the common value in global x86_pmu,
+-       * which available for all cores.
++       * Note, hybrid CPU models get tracked as having hybrid PMUs even when
++       * all E-cores are disabled via BIOS.  When E-cores are disabled, the
++       * base PMU holds the correct number of counters for P-cores.
+        */
++      cap->version            = x86_pmu.version;
+       cap->num_counters_gp    = x86_pmu.num_counters;
+       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+       cap->bit_width_gp       = x86_pmu.cntval_bits;
index 9c31c2aae67593e1b4a6b0dd2bb8023c7fd54011..dd8377373bd4a539a713756f0fa203c27f362e2a 100644 (file)
@@ -109,3 +109,8 @@ net-sched-tcindex-search-key-must-be-16-bits.patch
 nvme-tcp-stop-auth-work-after-tearing-down-queues-in.patch
 nvme-rdma-stop-auth-work-after-tearing-down-queues-i.patch
 nvme-apple-fix-controller-shutdown-in-apple_nvme_dis.patch
+kvm-x86-pmu-disable-vpmu-support-on-hybrid-cpus-host-pmus.patch
+kvm-initialize-all-of-the-kvm_debugregs-structure-before-sending-it-to-userspace.patch
+perf-x86-refuse-to-export-capabilities-for-hybrid-pmus.patch
+alarmtimer-prevent-starvation-by-small-intervals-and-sig_ign.patch
+nvme-pci-refresh-visible-attrs-for-cmb-attributes.patch