--- /dev/null
+From 21b9f1186850b2c860401d9131c2e90bd719dd20 Mon Sep 17 00:00:00 2001
+From: Long Li <longli@microsoft.com>
+Date: Tue, 6 Nov 2018 04:00:00 +0000
+Subject: genirq/matrix: Improve target CPU selection for managed interrupts.
+
+[ Upstream commit e8da8794a7fd9eef1ec9a07f0d4897c68581c72b ]
+
+On large systems with multiple devices of the same class (e.g. NVMe disks,
+using managed interrupts), the kernel can affinitize these interrupts to a
+small subset of CPUs instead of spreading them out evenly.
+
+irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask
+of possible target CPUs which has the lowest number of interrupt vectors
+allocated.
+
+This is done by searching for the CPU with the highest number of available
+vectors. While this is correct for non-managed interrupts it can select the
+wrong CPU for managed interrupts. Under certain constellations this results
+in affinitizing the managed interrupts of several devices to a single CPU
+in a set.
+
+The bookkeeping of available vectors works the following way:
+
+ 1) Non-managed interrupts:
+
+ available is decremented when the interrupt is actually requested by
+ the device driver and a vector is assigned. It's incremented when the
+ interrupt and the vector are freed.
+
+ 2) Managed interrupts:
+
+ Managed interrupts guarantee vector reservation when the MSI/MSI-X
+ functionality of a device is enabled, which is achieved by reserving
+ vectors in the bitmaps of the possible target CPUs. This reservation
+ decrements the available count on each possible target CPU.
+
+ When the interrupt is requested by the device driver then a vector is
+ allocated from the reserved region. The operation is reversed when the
+ interrupt is freed by the device driver. Neither of these operations
+ affects the available count.
+
+ The reservation persists up to the point where the MSI/MSI-X
+ functionality is disabled and only this operation increments the
+ available count again.
+
+For non-managed interrupts the available count is the correct selection
+criterion because the guaranteed reservations need to be taken into
+account. Using the allocated counter could lead to a failing allocation in
+the following situation (total vector space of 10 assumed):
+
+ CPU0 CPU1
+ available: 2 0
+ allocated: 5 3 <--- CPU1 is selected, but available space = 0
+ managed reserved: 3 7
+
+ while available yields the correct result.
+
+For managed interrupts the available count is not the appropriate
+selection criterion because as explained above the available count is not
+affected by the actual vector allocation.
+
+The following example illustrates that. Total vector space of 10
+assumed. The starting point is:
+
+ CPU0 CPU1
+ available: 5 4
+ allocated: 2 3
+ managed reserved: 3 3
+
+ Allocating vectors for three non-managed interrupts will result in
+ affinitizing the first two to CPU0 and the third one to CPU1 because the
+ available count is adjusted with each allocation:
+
+ CPU0 CPU1
+ available: 5 4 <- Select CPU0 for 1st allocation
+ --> allocated: 3 3
+
+ available: 4 4 <- Select CPU0 for 2nd allocation
+ --> allocated: 4 3
+
+ available: 3 4 <- Select CPU1 for 3rd allocation
+ --> allocated: 4 4
+
+ But the allocation of three managed interrupts starting from the same
+ point will affinitize all of them to CPU0 because the available count is
+ not affected by the allocation (see above). So the end result is:
+
+ CPU0 CPU1
+ available: 5 4
+ allocated: 5 3
+
+Introduce a "managed_allocated" field in struct cpumap to track the vector
+allocation for managed interrupts separately. Use this information to
+select the target CPU when a vector is allocated for a managed interrupt,
+which results in more evenly distributed vector assignments. The above
+example results in the following allocations:
+
+ CPU0 CPU1
+ managed_allocated: 0 0 <- Select CPU0 for 1st allocation
+ --> allocated: 3 3
+
+ managed_allocated: 1 0 <- Select CPU1 for 2nd allocation
+ --> allocated: 3 4
+
+ managed_allocated: 1 1 <- Select CPU0 for 3rd allocation
+ --> allocated: 4 4
+
+The allocation of non-managed interrupts is not affected by this change and
+is still evaluating the available count.
+
+The overall distribution of interrupt vectors for both types of interrupts
+might still not be perfectly even depending on the number of non-managed
+and managed interrupts in a system, but due to the reservation guarantee
+for managed interrupts this cannot be avoided.
+
+Expose the new field in debugfs as well.
+
+[ tglx: Clarified the background of the problem in the changelog and
+ described it independently of NVMe ]
+
+Signed-off-by: Long Li <longli@microsoft.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Michael Kelley <mikelley@microsoft.com>
+Link: https://lkml.kernel.org/r/20181106040000.27316-1-longli@linuxonhyperv.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/irq/matrix.c | 34 ++++++++++++++++++++++++++++++----
+ 1 file changed, 30 insertions(+), 4 deletions(-)
+
+--- a/kernel/irq/matrix.c
++++ b/kernel/irq/matrix.c
+@@ -14,6 +14,7 @@ struct cpumap {
+ unsigned int available;
+ unsigned int allocated;
+ unsigned int managed;
++ unsigned int managed_allocated;
+ bool initialized;
+ bool online;
+ unsigned long alloc_map[IRQ_MATRIX_SIZE];
+@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu
+ return best_cpu;
+ }
+
++/* Find the best CPU which has the lowest number of managed IRQs allocated */
++static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
++ const struct cpumask *msk)
++{
++ unsigned int cpu, best_cpu, allocated = UINT_MAX;
++ struct cpumap *cm;
++
++ best_cpu = UINT_MAX;
++
++ for_each_cpu(cpu, msk) {
++ cm = per_cpu_ptr(m->maps, cpu);
++
++ if (!cm->online || cm->managed_allocated > allocated)
++ continue;
++
++ best_cpu = cpu;
++ allocated = cm->managed_allocated;
++ }
++ return best_cpu;
++}
++
+ /**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_
+ if (cpumask_empty(msk))
+ return -EINVAL;
+
+- cpu = matrix_find_best_cpu(m, msk);
++ cpu = matrix_find_best_cpu_managed(m, msk);
+ if (cpu == UINT_MAX)
+ return -ENOSPC;
+
+@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_
+ return -ENOSPC;
+ set_bit(bit, cm->alloc_map);
+ cm->allocated++;
++ cm->managed_allocated++;
+ m->total_allocated++;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *
+
+ clear_bit(bit, cm->alloc_map);
+ cm->allocated--;
++ if(managed)
++ cm->managed_allocated--;
+
+ if (cm->online)
+ m->total_allocated--;
+@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_fi
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+- seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
++ seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+- seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+- cpu, cm->available, cm->managed, cm->allocated,
++ seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ",
++ cpu, cm->available, cm->managed,
++ cm->managed_allocated, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();