From: Greg Kroah-Hartman Date: Sun, 9 Feb 2020 23:41:15 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v4.19.103~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=06ba48d0251ca113261c23bc98e1ed3ab425f4ff;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch perf-core-fix-mlock-accounting-in-perf_mmap.patch x86-apic-msi-plug-non-maskable-msi-affinity-race.patch --- diff --git a/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch b/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch new file mode 100644 index 00000000000..d347c6c56df --- /dev/null +++ b/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch @@ -0,0 +1,51 @@ +From b0dd940e582b6a60296b9847a54012a4b080dc72 Mon Sep 17 00:00:00 2001 +From: Ronnie Sahlberg +Date: Wed, 5 Feb 2020 11:08:01 +1000 +Subject: cifs: fail i/o on soft mounts if sessionsetup errors out + +From: Ronnie Sahlberg + +commit b0dd940e582b6a60296b9847a54012a4b080dc72 upstream. + +RHBZ: 1579050 + +If we have a soft mount we should fail commands for session-setup +failures (such as the password having changed/ account being deleted/ ...) +and return an error back to the application. + +Signed-off-by: Ronnie Sahlberg +Signed-off-by: Steve French +CC: Stable +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/smb2pdu.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -350,9 +350,14 @@ smb2_reconnect(__le16 smb2_command, stru + } + + rc = cifs_negotiate_protocol(0, tcon->ses); +- if (!rc && tcon->ses->need_reconnect) ++ if (!rc && tcon->ses->need_reconnect) { + rc = cifs_setup_session(0, tcon->ses, nls_codepage); +- ++ if ((rc == -EACCES) && !tcon->retry) { ++ rc = -EHOSTDOWN; ++ mutex_unlock(&tcon->ses->session_mutex); ++ goto failed; ++ } ++ } + if (rc || !tcon->need_reconnect) { + mutex_unlock(&tcon->ses->session_mutex); + goto out; +@@ -397,6 +402,7 @@ out: + case SMB2_SET_INFO: + rc = -EAGAIN; + } ++failed: + unload_nls(nls_codepage); + return rc; + } diff --git a/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch b/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch new file mode 100644 index 00000000000..3c781f9b966 --- /dev/null +++ b/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch @@ -0,0 +1,52 @@ +From e3e056c35108661e418c803adfc054bf683426e7 Mon Sep 17 00:00:00 2001 +From: Aurelien Aptel +Date: Thu, 6 Feb 2020 18:16:55 +0100 +Subject: cifs: fix mode bits from dir listing when mounted with modefromsid + +From: Aurelien Aptel + +commit e3e056c35108661e418c803adfc054bf683426e7 upstream. + +When mounting with -o modefromsid, the mode bits are stored in an +ACE. Directory enumeration (e.g. ls -l /mnt) triggers an SMB Query Dir +which does not include ACEs in its response. The mode bits in this +case are silently set to a default value of 755 instead. + +This patch marks the dentry created during the directory enumeration +as needing re-evaluation (i.e. additional Query Info with ACEs) so +that the mode bits can be properly extracted. 
+ +Quick repro: + +$ mount.cifs //win19.test/data /mnt -o ...,modefromsid +$ touch /mnt/foo && chmod 751 /mnt/foo +$ stat /mnt/foo + # reports 751 (OK) +$ sleep 2 + # dentry older than 1s by default get invalidated +$ ls -l /mnt + # since dentry invalid, ls does a Query Dir + # and reports foo as 755 (WRONG) + +Signed-off-by: Aurelien Aptel +Signed-off-by: Steve French +CC: Stable +Reviewed-by: Pavel Shilovsky +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/readdir.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -174,7 +174,8 @@ cifs_fill_common_info(struct cifs_fattr + * may look wrong since the inodes may not have timed out by the time + * "ls" does a stat() call on them. + */ +- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) ++ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || ++ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && diff --git a/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch b/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch new file mode 100644 index 00000000000..472ccaf7515 --- /dev/null +++ b/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch @@ -0,0 +1,96 @@ +From febac332a819f0e764aa4da62757ba21d18c182b Mon Sep 17 00:00:00 2001 +From: Konstantin Khlebnikov +Date: Fri, 31 Jan 2020 19:08:59 +0300 +Subject: clocksource: Prevent double add_timer_on() for watchdog_timer + +From: Konstantin Khlebnikov + +commit febac332a819f0e764aa4da62757ba21d18c182b upstream. + +Kernel crashes inside QEMU/KVM are observed: + + kernel BUG at kernel/time/timer.c:1154! + BUG_ON(timer_pending(timer) || !timer->function) in add_timer_on(). + +At the same time another cpu got: + + general protection fault: 0000 [#1] SMP PTI of poinson pointer 0xdead000000000200 in: + + __hlist_del at include/linux/list.h:681 + (inlined by) detach_timer at kernel/time/timer.c:818 + (inlined by) expire_timers at kernel/time/timer.c:1355 + (inlined by) __run_timers at kernel/time/timer.c:1686 + (inlined by) run_timer_softirq at kernel/time/timer.c:1699 + +Unfortunately kernel logs are badly scrambled, stacktraces are lost. + +Printing the timer->function before the BUG_ON() pointed to +clocksource_watchdog(). + +The execution of clocksource_watchdog() can race with a sequence of +clocksource_stop_watchdog() .. clocksource_start_watchdog(): + +expire_timers() + detach_timer(timer, true); + timer->entry.pprev = NULL; + raw_spin_unlock_irq(&base->lock); + call_timer_fn + clocksource_watchdog() + + clocksource_watchdog_kthread() or + clocksource_unbind() + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_stop_watchdog(); + del_timer(&watchdog_timer); + watchdog_running = 0; + spin_unlock_irqrestore(&watchdog_lock, flags); + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_start_watchdog(); + add_timer_on(&watchdog_timer, ...); + watchdog_running = 1; + spin_unlock_irqrestore(&watchdog_lock, flags); + + spin_lock(&watchdog_lock); + add_timer_on(&watchdog_timer, ...); + BUG_ON(timer_pending(timer) || !timer->function); + timer_pending() -> true + BUG() + +I.e. inside clocksource_watchdog() watchdog_timer could be already armed. + +Check timer_pending() before calling add_timer_on(). This is sufficient as +all operations are synchronized by watchdog_lock. 
+ +Fixes: 75c5158f70c0 ("timekeeping: Update clocksource with stop_machine") +Signed-off-by: Konstantin Khlebnikov +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/158048693917.4378.13823603769948933793.stgit@buzz +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/clocksource.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/kernel/time/clocksource.c ++++ b/kernel/time/clocksource.c +@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); +- watchdog_timer.expires += WATCHDOG_INTERVAL; +- add_timer_on(&watchdog_timer, next_cpu); ++ ++ /* ++ * Arm timer if not already pending: could race with concurrent ++ * pair clocksource_stop_watchdog() clocksource_start_watchdog(). ++ */ ++ if (!timer_pending(&watchdog_timer)) { ++ watchdog_timer.expires += WATCHDOG_INTERVAL; ++ add_timer_on(&watchdog_timer, next_cpu); ++ } + out: + spin_unlock(&watchdog_lock); + } diff --git a/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch b/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch new file mode 100644 index 00000000000..f85fda26241 --- /dev/null +++ b/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch @@ -0,0 +1,51 @@ +From 003461559ef7a9bd0239bae35a22ad8924d6e9ad Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Thu, 23 Jan 2020 10:11:46 -0800 +Subject: perf/core: Fix mlock accounting in perf_mmap() + +From: Song Liu + +commit 003461559ef7a9bd0239bae35a22ad8924d6e9ad upstream. + +Decreasing sysctl_perf_event_mlock between two consecutive perf_mmap()s of +a perf ring buffer may lead to an integer underflow in locked memory +accounting. This may lead to the undesired behaviors, such as failures in +BPF map creation. + +Address this by adjusting the accounting logic to take into account the +possibility that the amount of already locked memory may exceed the +current limit. 
+ +Fixes: c4b75479741c ("perf/core: Make the mlock accounting simple again") +Suggested-by: Alexander Shishkin +Signed-off-by: Song Liu +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Cc: +Acked-by: Alexander Shishkin +Link: https://lkml.kernel.org/r/20200123181146.2238074-1-songliubraving@fb.com +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/events/core.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -5823,7 +5823,15 @@ accounting: + */ + user_lock_limit *= num_online_cpus(); + +- user_locked = atomic_long_read(&user->locked_vm) + user_extra; ++ user_locked = atomic_long_read(&user->locked_vm); ++ ++ /* ++ * sysctl_perf_event_mlock may have changed, so that ++ * user->locked_vm > user_lock_limit ++ */ ++ if (user_locked > user_lock_limit) ++ user_locked = user_lock_limit; ++ user_locked += user_extra; + + if (user_locked <= user_lock_limit) { + /* charge all to locked_vm */ diff --git a/queue-5.4/series b/queue-5.4/series index cac44e1c9e1..6bb59e01f93 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -299,3 +299,8 @@ kvm-x86-fix-overlap-between-spte_mmio_mask-and-gener.patch kvm-nvmx-vmread-should-not-set-rflags-to-specify-suc.patch kvm-use-vcpu-specific-gva-hva-translation-when-query.patch kvm-play-nice-with-read-only-memslots-when-querying-.patch +cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch +cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch +x86-apic-msi-plug-non-maskable-msi-affinity-race.patch +clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch +perf-core-fix-mlock-accounting-in-perf_mmap.patch diff --git a/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch b/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch new file mode 100644 index 00000000000..7a34136f8bc --- /dev/null +++ b/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch @@ -0,0 +1,370 @@ +From 6f1a4891a5928a5969c87fa5a584844c983ec823 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Fri, 31 Jan 2020 15:26:52 +0100 +Subject: x86/apic/msi: Plug non-maskable MSI affinity race + +From: Thomas Gleixner + +commit 6f1a4891a5928a5969c87fa5a584844c983ec823 upstream. + +Evan tracked down a subtle race between the update of the MSI message and +the device raising an interrupt internally on PCI devices which do not +support MSI masking. The update of the MSI message is non-atomic and +consists of either 2 or 3 sequential 32bit wide writes to the PCI config +space. + + - Write address low 32bits + - Write address high 32bits (If supported by device) + - Write data + +When an interrupt is migrated then both address and data might change, so +the kernel attempts to mask the MSI interrupt first. But for MSI masking is +optional, so there exist devices which do not provide it. That means that +if the device raises an interrupt internally between the writes then a MSI +message is sent built from half updated state. + +On x86 this can lead to spurious interrupts on the wrong interrupt +vector when the affinity setting changes both address and data. As a +consequence the device interrupt can be lost causing the device to +become stuck or malfunctioning. + +Evan tried to handle that by disabling MSI accross an MSI message +update. That's not feasible because disabling MSI has issues on its own: + + If MSI is disabled the PCI device is routing an interrupt to the legacy + INTx mechanism. 
The INTx delivery can be disabled, but the disablement is
+ not working on all devices.
+
+ Some devices lose interrupts when both MSI and INTx delivery are disabled.
+
+Another way to solve this would be to enforce the allocation of the same
+vector on all CPUs in the system for this kind of screwed device. That
+could be done, but it would bring back the vector space exhaustion problems
+which got solved a few years ago.
+
+Fortunately the high address (if supported by the device) is only relevant
+when X2APIC is enabled, which implies interrupt remapping. In the interrupt
+remapping case the affinity setting happens at the interrupt remapping
+unit and the PCI MSI message is programmed only once when the PCI device is
+initialized.
+
+That makes it possible to solve it with a two step update:
+
+ 1) Target the MSI msg to the new vector on the current target CPU
+
+ 2) Target the MSI msg to the new vector on the new target CPU
+
+In both cases writing the MSI message is only changing a single 32bit word
+which prevents the issue of inconsistency.
+
+After writing the final destination it is necessary to check whether the
+device issued an interrupt while the intermediate state #1 (new vector,
+current CPU) was in effect.
+
+This is possible because the affinity change is always happening on the
+current target CPU. The code runs with interrupts disabled, so the
+interrupt can be detected by checking the IRR of the local APIC. If the
+vector is pending in the IRR then the interrupt is retriggered on the new
+target CPU by sending an IPI for the associated vector on the target CPU.
+
+This can cause spurious interrupts on both the local and the new target
+CPU.
+
+ 1) If the new vector is not in use on the local CPU and the device
+    affected by the affinity change raised an interrupt during the
+    transitional state (step #1 above) then the interrupt entry code will
+    ignore that spurious interrupt. The vector is marked so that the
+    'No irq handler for vector' warning is suppressed once.
+
+ 2) If the new vector is already in use on the local CPU then the IRR check
+    might see a pending interrupt from the device which is using this
+    vector. The IPI to the new target CPU will then invoke the handler of
+    the device, which got the affinity change, even if that device did not
+    issue an interrupt.
+
+ 3) If the new vector is already in use on the local CPU and the device
+    affected by the affinity change raised an interrupt during the
+    transitional state (step #1 above) then the handler of the device which
+    uses that vector on the local CPU will be invoked.
+
+Cases #2 and #3 can expose issues in device driver interrupt handlers which
+are not prepared to handle a spurious interrupt correctly. This is not a
+regression, it's just exposing something which was already broken, as
+spurious interrupts can happen for a lot of reasons and all driver handlers
+need to be able to deal with them.
+ +Reported-by: Evan Green +Debugged-by: Evan Green +Signed-off-by: Thomas Gleixner +Tested-by: Evan Green +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/87imkr4s7n.fsf@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/apic.h | 8 ++ + arch/x86/kernel/apic/msi.c | 128 ++++++++++++++++++++++++++++++++++++++++++-- + include/linux/irq.h | 18 ++++++ + include/linux/irqdomain.h | 7 ++ + kernel/irq/debugfs.c | 1 + kernel/irq/msi.c | 5 + + 6 files changed, 163 insertions(+), 4 deletions(-) + +--- a/arch/x86/include/asm/apic.h ++++ b/arch/x86/include/asm/apic.h +@@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void) + apic_eoi(); + } + ++ ++static inline bool lapic_vector_set_in_irr(unsigned int vector) ++{ ++ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); ++ ++ return !!(irr & (1U << (vector % 32))); ++} ++ + static inline unsigned default_get_apic_id(unsigned long x) + { + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); +--- a/arch/x86/kernel/apic/msi.c ++++ b/arch/x86/kernel/apic/msi.c +@@ -23,10 +23,8 @@ + + static struct irq_domain *msi_default_domain; + +-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ++static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) + { +- struct irq_cfg *cfg = irqd_cfg(data); +- + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) +@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct i + MSI_DATA_VECTOR(cfg->vector); + } + ++static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ++{ ++ __irq_msi_compose_msg(irqd_cfg(data), msg); ++} ++ ++static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) ++{ ++ struct msi_msg msg[2] = { [1] = { }, }; ++ ++ __irq_msi_compose_msg(cfg, msg); ++ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); ++} ++ ++static int ++msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) ++{ ++ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); ++ struct irq_data *parent = irqd->parent_data; ++ unsigned int cpu; ++ int ret; ++ ++ /* Save the current configuration */ ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); ++ old_cfg = *cfg; ++ ++ /* Allocate a new target vector */ ++ ret = parent->chip->irq_set_affinity(parent, mask, force); ++ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) ++ return ret; ++ ++ /* ++ * For non-maskable and non-remapped MSI interrupts the migration ++ * to a different destination CPU and a different vector has to be ++ * done careful to handle the possible stray interrupt which can be ++ * caused by the non-atomic update of the address/data pair. ++ * ++ * Direct update is possible when: ++ * - The MSI is maskable (remapped MSI does not use this code path)). ++ * The quirk bit is not set in this case. ++ * - The new vector is the same as the old vector ++ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) ++ * - The new destination CPU is the same as the old destination CPU ++ */ ++ if (!irqd_msi_nomask_quirk(irqd) || ++ cfg->vector == old_cfg.vector || ++ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || ++ cfg->dest_apicid == old_cfg.dest_apicid) { ++ irq_msi_update_msg(irqd, cfg); ++ return ret; ++ } ++ ++ /* ++ * Paranoia: Validate that the interrupt target is the local ++ * CPU. ++ */ ++ if (WARN_ON_ONCE(cpu != smp_processor_id())) { ++ irq_msi_update_msg(irqd, cfg); ++ return ret; ++ } ++ ++ /* ++ * Redirect the interrupt to the new vector on the current CPU ++ * first. 
This might cause a spurious interrupt on this vector if ++ * the device raises an interrupt right between this update and the ++ * update to the final destination CPU. ++ * ++ * If the vector is in use then the installed device handler will ++ * denote it as spurious which is no harm as this is a rare event ++ * and interrupt handlers have to cope with spurious interrupts ++ * anyway. If the vector is unused, then it is marked so it won't ++ * trigger the 'No irq handler for vector' warning in do_IRQ(). ++ * ++ * This requires to hold vector lock to prevent concurrent updates to ++ * the affected vector. ++ */ ++ lock_vector_lock(); ++ ++ /* ++ * Mark the new target vector on the local CPU if it is currently ++ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in ++ * the CPU hotplug path for a similar purpose. This cannot be ++ * undone here as the current CPU has interrupts disabled and ++ * cannot handle the interrupt before the whole set_affinity() ++ * section is done. In the CPU unplug case, the current CPU is ++ * about to vanish and will not handle any interrupts anymore. The ++ * vector is cleaned up when the CPU comes online again. ++ */ ++ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) ++ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); ++ ++ /* Redirect it to the new vector on the local CPU temporarily */ ++ old_cfg.vector = cfg->vector; ++ irq_msi_update_msg(irqd, &old_cfg); ++ ++ /* Now transition it to the target CPU */ ++ irq_msi_update_msg(irqd, cfg); ++ ++ /* ++ * All interrupts after this point are now targeted at the new ++ * vector/CPU. ++ * ++ * Drop vector lock before testing whether the temporary assignment ++ * to the local CPU was hit by an interrupt raised in the device, ++ * because the retrigger function acquires vector lock again. ++ */ ++ unlock_vector_lock(); ++ ++ /* ++ * Check whether the transition raced with a device interrupt and ++ * is pending in the local APICs IRR. It is safe to do this outside ++ * of vector lock as the irq_desc::lock of this interrupt is still ++ * held and interrupts are disabled: The check is not accessing the ++ * underlying vector store. It's just checking the local APIC's ++ * IRR. ++ */ ++ if (lapic_vector_set_in_irr(cfg->vector)) ++ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); ++ ++ return ret; ++} ++ + /* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. 
+@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controlle + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, ++ .irq_set_affinity = msi_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, + }; + +@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct + } + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); ++ else ++ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } + + #ifdef CONFIG_IRQ_REMAP +--- a/include/linux/irq.h ++++ b/include/linux/irq.h +@@ -209,6 +209,8 @@ struct irq_data { + * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target + * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set + * IRQD_CAN_RESERVE - Can use reservation mode ++ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change ++ * required + */ + enum { + IRQD_TRIGGER_MASK = 0xf, +@@ -231,6 +233,7 @@ enum { + IRQD_SINGLE_TARGET = (1 << 24), + IRQD_DEFAULT_TRIGGER_SET = (1 << 25), + IRQD_CAN_RESERVE = (1 << 26), ++ IRQD_MSI_NOMASK_QUIRK = (1 << 27), + }; + + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) +@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(stru + return __irqd_to_state(d) & IRQD_CAN_RESERVE; + } + ++static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) ++{ ++ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; ++} ++ ++static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) ++{ ++ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; ++} ++ ++static inline bool irqd_msi_nomask_quirk(struct irq_data *d) ++{ ++ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; ++} ++ + #undef __irqd_to_state + + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) +--- a/include/linux/irqdomain.h ++++ b/include/linux/irqdomain.h +@@ -206,6 +206,13 @@ enum { + IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + + /* ++ * Quirk to handle MSI implementations which do not provide ++ * masking. Currently known to affect x86, but partially ++ * handled in core code. ++ */ ++ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), ++ ++ /* + * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved + * for implementation specific purposes and ignored by the + * core code. +--- a/kernel/irq/debugfs.c ++++ b/kernel/irq/debugfs.c +@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdat + BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), ++ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), + + BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), + +--- a/kernel/irq/msi.c ++++ b/kernel/irq/msi.c +@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_dom + continue; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); +- if (!can_reserve) ++ if (!can_reserve) { + irqd_clr_can_reserve(irq_data); ++ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) ++ irqd_set_msi_nomask_quirk(irq_data); ++ } + ret = irq_domain_activate_irq(irq_data, can_reserve); + if (ret) + goto cleanup;