From: Greg Kroah-Hartman Date: Sun, 9 Feb 2020 23:41:15 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v4.19.103~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=06ba48d0251ca113261c23bc98e1ed3ab425f4ff;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch perf-core-fix-mlock-accounting-in-perf_mmap.patch x86-apic-msi-plug-non-maskable-msi-affinity-race.patch --- diff --git a/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch b/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch new file mode 100644 index 00000000000..d347c6c56df --- /dev/null +++ b/queue-5.4/cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch @@ -0,0 +1,51 @@ +From b0dd940e582b6a60296b9847a54012a4b080dc72 Mon Sep 17 00:00:00 2001 +From: Ronnie Sahlberg +Date: Wed, 5 Feb 2020 11:08:01 +1000 +Subject: cifs: fail i/o on soft mounts if sessionsetup errors out + +From: Ronnie Sahlberg + +commit b0dd940e582b6a60296b9847a54012a4b080dc72 upstream. + +RHBZ: 1579050 + +If we have a soft mount we should fail commands for session-setup +failures (such as the password having changed/ account being deleted/ ...) +and return an error back to the application. + +Signed-off-by: Ronnie Sahlberg +Signed-off-by: Steve French +CC: Stable +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/smb2pdu.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -350,9 +350,14 @@ smb2_reconnect(__le16 smb2_command, stru + } + + rc = cifs_negotiate_protocol(0, tcon->ses); +- if (!rc && tcon->ses->need_reconnect) ++ if (!rc && tcon->ses->need_reconnect) { + rc = cifs_setup_session(0, tcon->ses, nls_codepage); +- ++ if ((rc == -EACCES) && !tcon->retry) { ++ rc = -EHOSTDOWN; ++ mutex_unlock(&tcon->ses->session_mutex); ++ goto failed; ++ } ++ } + if (rc || !tcon->need_reconnect) { + mutex_unlock(&tcon->ses->session_mutex); + goto out; +@@ -397,6 +402,7 @@ out: + case SMB2_SET_INFO: + rc = -EAGAIN; + } ++failed: + unload_nls(nls_codepage); + return rc; + } diff --git a/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch b/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch new file mode 100644 index 00000000000..3c781f9b966 --- /dev/null +++ b/queue-5.4/cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch @@ -0,0 +1,52 @@ +From e3e056c35108661e418c803adfc054bf683426e7 Mon Sep 17 00:00:00 2001 +From: Aurelien Aptel +Date: Thu, 6 Feb 2020 18:16:55 +0100 +Subject: cifs: fix mode bits from dir listing when mounted with modefromsid + +From: Aurelien Aptel + +commit e3e056c35108661e418c803adfc054bf683426e7 upstream. + +When mounting with -o modefromsid, the mode bits are stored in an +ACE. Directory enumeration (e.g. ls -l /mnt) triggers an SMB Query Dir +which does not include ACEs in its response. The mode bits in this +case are silently set to a default value of 755 instead. + +This patch marks the dentry created during the directory enumeration +as needing re-evaluation (i.e. additional Query Info with ACEs) so +that the mode bits can be properly extracted. 
+ +Quick repro: + +$ mount.cifs //win19.test/data /mnt -o ...,modefromsid +$ touch /mnt/foo && chmod 751 /mnt/foo +$ stat /mnt/foo + # reports 751 (OK) +$ sleep 2 + # dentry older than 1s by default get invalidated +$ ls -l /mnt + # since dentry invalid, ls does a Query Dir + # and reports foo as 755 (WRONG) + +Signed-off-by: Aurelien Aptel +Signed-off-by: Steve French +CC: Stable +Reviewed-by: Pavel Shilovsky +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/readdir.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -174,7 +174,8 @@ cifs_fill_common_info(struct cifs_fattr + * may look wrong since the inodes may not have timed out by the time + * "ls" does a stat() call on them. + */ +- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) ++ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || ++ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && diff --git a/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch b/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch new file mode 100644 index 00000000000..472ccaf7515 --- /dev/null +++ b/queue-5.4/clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch @@ -0,0 +1,96 @@ +From febac332a819f0e764aa4da62757ba21d18c182b Mon Sep 17 00:00:00 2001 +From: Konstantin Khlebnikov +Date: Fri, 31 Jan 2020 19:08:59 +0300 +Subject: clocksource: Prevent double add_timer_on() for watchdog_timer + +From: Konstantin Khlebnikov + +commit febac332a819f0e764aa4da62757ba21d18c182b upstream. + +Kernel crashes inside QEMU/KVM are observed: + + kernel BUG at kernel/time/timer.c:1154! + BUG_ON(timer_pending(timer) || !timer->function) in add_timer_on(). + +At the same time another cpu got: + + general protection fault: 0000 [#1] SMP PTI of poinson pointer 0xdead000000000200 in: + + __hlist_del at include/linux/list.h:681 + (inlined by) detach_timer at kernel/time/timer.c:818 + (inlined by) expire_timers at kernel/time/timer.c:1355 + (inlined by) __run_timers at kernel/time/timer.c:1686 + (inlined by) run_timer_softirq at kernel/time/timer.c:1699 + +Unfortunately kernel logs are badly scrambled, stacktraces are lost. + +Printing the timer->function before the BUG_ON() pointed to +clocksource_watchdog(). + +The execution of clocksource_watchdog() can race with a sequence of +clocksource_stop_watchdog() .. clocksource_start_watchdog(): + +expire_timers() + detach_timer(timer, true); + timer->entry.pprev = NULL; + raw_spin_unlock_irq(&base->lock); + call_timer_fn + clocksource_watchdog() + + clocksource_watchdog_kthread() or + clocksource_unbind() + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_stop_watchdog(); + del_timer(&watchdog_timer); + watchdog_running = 0; + spin_unlock_irqrestore(&watchdog_lock, flags); + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_start_watchdog(); + add_timer_on(&watchdog_timer, ...); + watchdog_running = 1; + spin_unlock_irqrestore(&watchdog_lock, flags); + + spin_lock(&watchdog_lock); + add_timer_on(&watchdog_timer, ...); + BUG_ON(timer_pending(timer) || !timer->function); + timer_pending() -> true + BUG() + +I.e. inside clocksource_watchdog() watchdog_timer could be already armed. + +Check timer_pending() before calling add_timer_on(). This is sufficient as +all operations are synchronized by watchdog_lock. 
+ +Fixes: 75c5158f70c0 ("timekeeping: Update clocksource with stop_machine") +Signed-off-by: Konstantin Khlebnikov +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/158048693917.4378.13823603769948933793.stgit@buzz +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/clocksource.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/kernel/time/clocksource.c ++++ b/kernel/time/clocksource.c +@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); +- watchdog_timer.expires += WATCHDOG_INTERVAL; +- add_timer_on(&watchdog_timer, next_cpu); ++ ++ /* ++ * Arm timer if not already pending: could race with concurrent ++ * pair clocksource_stop_watchdog() clocksource_start_watchdog(). ++ */ ++ if (!timer_pending(&watchdog_timer)) { ++ watchdog_timer.expires += WATCHDOG_INTERVAL; ++ add_timer_on(&watchdog_timer, next_cpu); ++ } + out: + spin_unlock(&watchdog_lock); + } diff --git a/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch b/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch new file mode 100644 index 00000000000..f85fda26241 --- /dev/null +++ b/queue-5.4/perf-core-fix-mlock-accounting-in-perf_mmap.patch @@ -0,0 +1,51 @@ +From 003461559ef7a9bd0239bae35a22ad8924d6e9ad Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Thu, 23 Jan 2020 10:11:46 -0800 +Subject: perf/core: Fix mlock accounting in perf_mmap() + +From: Song Liu + +commit 003461559ef7a9bd0239bae35a22ad8924d6e9ad upstream. + +Decreasing sysctl_perf_event_mlock between two consecutive perf_mmap()s of +a perf ring buffer may lead to an integer underflow in locked memory +accounting. This may lead to the undesired behaviors, such as failures in +BPF map creation. + +Address this by adjusting the accounting logic to take into account the +possibility that the amount of already locked memory may exceed the +current limit. 
+ +Fixes: c4b75479741c ("perf/core: Make the mlock accounting simple again") +Suggested-by: Alexander Shishkin +Signed-off-by: Song Liu +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Cc: +Acked-by: Alexander Shishkin +Link: https://lkml.kernel.org/r/20200123181146.2238074-1-songliubraving@fb.com +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/events/core.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -5823,7 +5823,15 @@ accounting: + */ + user_lock_limit *= num_online_cpus(); + +- user_locked = atomic_long_read(&user->locked_vm) + user_extra; ++ user_locked = atomic_long_read(&user->locked_vm); ++ ++ /* ++ * sysctl_perf_event_mlock may have changed, so that ++ * user->locked_vm > user_lock_limit ++ */ ++ if (user_locked > user_lock_limit) ++ user_locked = user_lock_limit; ++ user_locked += user_extra; + + if (user_locked <= user_lock_limit) { + /* charge all to locked_vm */ diff --git a/queue-5.4/series b/queue-5.4/series index cac44e1c9e1..6bb59e01f93 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -299,3 +299,8 @@ kvm-x86-fix-overlap-between-spte_mmio_mask-and-gener.patch kvm-nvmx-vmread-should-not-set-rflags-to-specify-suc.patch kvm-use-vcpu-specific-gva-hva-translation-when-query.patch kvm-play-nice-with-read-only-memslots-when-querying-.patch +cifs-fail-i-o-on-soft-mounts-if-sessionsetup-errors-out.patch +cifs-fix-mode-bits-from-dir-listing-when-mounted-with-modefromsid.patch +x86-apic-msi-plug-non-maskable-msi-affinity-race.patch +clocksource-prevent-double-add_timer_on-for-watchdog_timer.patch +perf-core-fix-mlock-accounting-in-perf_mmap.patch diff --git a/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch b/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch new file mode 100644 index 00000000000..7a34136f8bc --- /dev/null +++ b/queue-5.4/x86-apic-msi-plug-non-maskable-msi-affinity-race.patch @@ -0,0 +1,370 @@ +From 6f1a4891a5928a5969c87fa5a584844c983ec823 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Fri, 31 Jan 2020 15:26:52 +0100 +Subject: x86/apic/msi: Plug non-maskable MSI affinity race + +From: Thomas Gleixner + +commit 6f1a4891a5928a5969c87fa5a584844c983ec823 upstream. + +Evan tracked down a subtle race between the update of the MSI message and +the device raising an interrupt internally on PCI devices which do not +support MSI masking. The update of the MSI message is non-atomic and +consists of either 2 or 3 sequential 32bit wide writes to the PCI config +space. + + - Write address low 32bits + - Write address high 32bits (If supported by device) + - Write data + +When an interrupt is migrated then both address and data might change, so +the kernel attempts to mask the MSI interrupt first. But for MSI masking is +optional, so there exist devices which do not provide it. That means that +if the device raises an interrupt internally between the writes then a MSI +message is sent built from half updated state. + +On x86 this can lead to spurious interrupts on the wrong interrupt +vector when the affinity setting changes both address and data. As a +consequence the device interrupt can be lost causing the device to +become stuck or malfunctioning. + +Evan tried to handle that by disabling MSI accross an MSI message +update. That's not feasible because disabling MSI has issues on its own: + + If MSI is disabled the PCI device is routing an interrupt to the legacy + INTx mechanism. 
The INTx delivery can be disabled, but the disablement is
+ not working on all devices.
+
+ Some devices lose interrupts when both MSI and INTx delivery are disabled.
+
+Another way to solve this would be to enforce the allocation of the same
+vector on all CPUs in the system for this kind of screwed device. That
+could be done, but it would bring back the vector space exhaustion problems
+which got solved a few years ago.
+
+Fortunately the high address (if supported by the device) is only relevant
+when X2APIC is enabled, which implies interrupt remapping. In the interrupt
+remapping case the affinity setting happens at the interrupt remapping
+unit and the PCI MSI message is programmed only once when the PCI device is
+initialized.
+
+That makes it possible to solve it with a two step update:
+
+ 1) Target the MSI msg to the new vector on the current target CPU
+
+ 2) Target the MSI msg to the new vector on the new target CPU
+
+In both cases writing the MSI message is only changing a single 32bit word
+which prevents the issue of inconsistency.
+
+After writing the final destination it is necessary to check whether the
+device issued an interrupt while the intermediate state #1 (new vector,
+current CPU) was in effect.
+
+This is possible because the affinity change is always happening on the
+current target CPU. The code runs with interrupts disabled, so the
+interrupt can be detected by checking the IRR of the local APIC. If the
+vector is pending in the IRR then the interrupt is retriggered on the new
+target CPU by sending an IPI for the associated vector on the target CPU.
+
+This can cause spurious interrupts on both the local and the new target
+CPU.
+
+ 1) If the new vector is not in use on the local CPU and the device
+    affected by the affinity change raised an interrupt during the
+    transitional state (step #1 above) then the interrupt entry code will
+    ignore that spurious interrupt. The vector is marked so that the
+    'No irq handler for vector' warning is suppressed once.
+
+ 2) If the new vector is already in use on the local CPU then the IRR check
+    might see a pending interrupt from the device which is using this
+    vector. The IPI to the new target CPU will then invoke the handler of
+    the device, which got the affinity change, even if that device did not
+    issue an interrupt.
+
+ 3) If the new vector is already in use on the local CPU and the device
+    affected by the affinity change raised an interrupt during the
+    transitional state (step #1 above) then the handler of the device which
+    uses that vector on the local CPU will be invoked.
+
+Cases #2 and #3 can expose issues in device driver interrupt handlers which
+are not prepared to handle a spurious interrupt correctly. This is not a
+regression, it's just exposing something which was already broken, as
+spurious interrupts can happen for a lot of reasons and all driver handlers
+need to be able to deal with them.
+ +Reported-by: Evan Green +Debugged-by: Evan Green +Signed-off-by: Thomas Gleixner +Tested-by: Evan Green +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/87imkr4s7n.fsf@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/apic.h | 8 ++ + arch/x86/kernel/apic/msi.c | 128 ++++++++++++++++++++++++++++++++++++++++++-- + include/linux/irq.h | 18 ++++++ + include/linux/irqdomain.h | 7 ++ + kernel/irq/debugfs.c | 1 + kernel/irq/msi.c | 5 + + 6 files changed, 163 insertions(+), 4 deletions(-) + +--- a/arch/x86/include/asm/apic.h ++++ b/arch/x86/include/asm/apic.h +@@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void) + apic_eoi(); + } + ++ ++static inline bool lapic_vector_set_in_irr(unsigned int vector) ++{ ++ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); ++ ++ return !!(irr & (1U << (vector % 32))); ++} ++ + static inline unsigned default_get_apic_id(unsigned long x) + { + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); +--- a/arch/x86/kernel/apic/msi.c ++++ b/arch/x86/kernel/apic/msi.c +@@ -23,10 +23,8 @@ + + static struct irq_domain *msi_default_domain; + +-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ++static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) + { +- struct irq_cfg *cfg = irqd_cfg(data); +- + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) +@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct i + MSI_DATA_VECTOR(cfg->vector); + } + ++static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ++{ ++ __irq_msi_compose_msg(irqd_cfg(data), msg); ++} ++ ++static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) ++{ ++ struct msi_msg msg[2] = { [1] = { }, }; ++ ++ __irq_msi_compose_msg(cfg, msg); ++ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); ++} ++ ++static int ++msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) ++{ ++ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); ++ struct irq_data *parent = irqd->parent_data; ++ unsigned int cpu; ++ int ret; ++ ++ /* Save the current configuration */ ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); ++ old_cfg = *cfg; ++ ++ /* Allocate a new target vector */ ++ ret = parent->chip->irq_set_affinity(parent, mask, force); ++ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) ++ return ret; ++ ++ /* ++ * For non-maskable and non-remapped MSI interrupts the migration ++ * to a different destination CPU and a different vector has to be ++ * done careful to handle the possible stray interrupt which can be ++ * caused by the non-atomic update of the address/data pair. ++ * ++ * Direct update is possible when: ++ * - The MSI is maskable (remapped MSI does not use this code path)). ++ * The quirk bit is not set in this case. ++ * - The new vector is the same as the old vector ++ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) ++ * - The new destination CPU is the same as the old destination CPU ++ */ ++ if (!irqd_msi_nomask_quirk(irqd) || ++ cfg->vector == old_cfg.vector || ++ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || ++ cfg->dest_apicid == old_cfg.dest_apicid) { ++ irq_msi_update_msg(irqd, cfg); ++ return ret; ++ } ++ ++ /* ++ * Paranoia: Validate that the interrupt target is the local ++ * CPU. ++ */ ++ if (WARN_ON_ONCE(cpu != smp_processor_id())) { ++ irq_msi_update_msg(irqd, cfg); ++ return ret; ++ } ++ ++ /* ++ * Redirect the interrupt to the new vector on the current CPU ++ * first. 
This might cause a spurious interrupt on this vector if ++ * the device raises an interrupt right between this update and the ++ * update to the final destination CPU. ++ * ++ * If the vector is in use then the installed device handler will ++ * denote it as spurious which is no harm as this is a rare event ++ * and interrupt handlers have to cope with spurious interrupts ++ * anyway. If the vector is unused, then it is marked so it won't ++ * trigger the 'No irq handler for vector' warning in do_IRQ(). ++ * ++ * This requires to hold vector lock to prevent concurrent updates to ++ * the affected vector. ++ */ ++ lock_vector_lock(); ++ ++ /* ++ * Mark the new target vector on the local CPU if it is currently ++ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in ++ * the CPU hotplug path for a similar purpose. This cannot be ++ * undone here as the current CPU has interrupts disabled and ++ * cannot handle the interrupt before the whole set_affinity() ++ * section is done. In the CPU unplug case, the current CPU is ++ * about to vanish and will not handle any interrupts anymore. The ++ * vector is cleaned up when the CPU comes online again. ++ */ ++ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) ++ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); ++ ++ /* Redirect it to the new vector on the local CPU temporarily */ ++ old_cfg.vector = cfg->vector; ++ irq_msi_update_msg(irqd, &old_cfg); ++ ++ /* Now transition it to the target CPU */ ++ irq_msi_update_msg(irqd, cfg); ++ ++ /* ++ * All interrupts after this point are now targeted at the new ++ * vector/CPU. ++ * ++ * Drop vector lock before testing whether the temporary assignment ++ * to the local CPU was hit by an interrupt raised in the device, ++ * because the retrigger function acquires vector lock again. ++ */ ++ unlock_vector_lock(); ++ ++ /* ++ * Check whether the transition raced with a device interrupt and ++ * is pending in the local APICs IRR. It is safe to do this outside ++ * of vector lock as the irq_desc::lock of this interrupt is still ++ * held and interrupts are disabled: The check is not accessing the ++ * underlying vector store. It's just checking the local APIC's ++ * IRR. ++ */ ++ if (lapic_vector_set_in_irr(cfg->vector)) ++ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); ++ ++ return ret; ++} ++ + /* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. 
+@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controlle + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, ++ .irq_set_affinity = msi_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, + }; + +@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct + } + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); ++ else ++ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } + + #ifdef CONFIG_IRQ_REMAP +--- a/include/linux/irq.h ++++ b/include/linux/irq.h +@@ -209,6 +209,8 @@ struct irq_data { + * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target + * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set + * IRQD_CAN_RESERVE - Can use reservation mode ++ * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change ++ * required + */ + enum { + IRQD_TRIGGER_MASK = 0xf, +@@ -231,6 +233,7 @@ enum { + IRQD_SINGLE_TARGET = (1 << 24), + IRQD_DEFAULT_TRIGGER_SET = (1 << 25), + IRQD_CAN_RESERVE = (1 << 26), ++ IRQD_MSI_NOMASK_QUIRK = (1 << 27), + }; + + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) +@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(stru + return __irqd_to_state(d) & IRQD_CAN_RESERVE; + } + ++static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) ++{ ++ __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; ++} ++ ++static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) ++{ ++ __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; ++} ++ ++static inline bool irqd_msi_nomask_quirk(struct irq_data *d) ++{ ++ return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; ++} ++ + #undef __irqd_to_state + + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) +--- a/include/linux/irqdomain.h ++++ b/include/linux/irqdomain.h +@@ -206,6 +206,13 @@ enum { + IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + + /* ++ * Quirk to handle MSI implementations which do not provide ++ * masking. Currently known to affect x86, but partially ++ * handled in core code. ++ */ ++ IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), ++ ++ /* + * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved + * for implementation specific purposes and ignored by the + * core code. +--- a/kernel/irq/debugfs.c ++++ b/kernel/irq/debugfs.c +@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdat + BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), ++ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), + + BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), + +--- a/kernel/irq/msi.c ++++ b/kernel/irq/msi.c +@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_dom + continue; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); +- if (!can_reserve) ++ if (!can_reserve) { + irqd_clr_can_reserve(irq_data); ++ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) ++ irqd_set_msi_nomask_quirk(irq_data); ++ } + ret = irq_domain_activate_irq(irq_data, can_reserve); + if (ret) + goto cleanup;