From f6cd226ea6f946a2da9f5b68d043430a83495550 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Mon, 25 Feb 2019 19:49:56 -0500
Subject: [PATCH] patches for 4.19

Signed-off-by: Sasha Levin
---
 ...prove-target-cpu-selection-for-manag.patch | 216 ++++++++++++++++++
 ...-out-the-cpu-selection-code-into-a-h.patch | 113 +++++++++
 ...ead-managed-interrupts-on-allocation.patch | 115 ++++++++++
 queue-4.19/series                             |   3 +
 4 files changed, 447 insertions(+)
 create mode 100644 queue-4.19/genirq-matrix-improve-target-cpu-selection-for-manag.patch
 create mode 100644 queue-4.19/irq-matrix-split-out-the-cpu-selection-code-into-a-h.patch
 create mode 100644 queue-4.19/irq-matrix-spread-managed-interrupts-on-allocation.patch

diff --git a/queue-4.19/genirq-matrix-improve-target-cpu-selection-for-manag.patch b/queue-4.19/genirq-matrix-improve-target-cpu-selection-for-manag.patch
new file mode 100644
index 00000000000..b63da6120b2
--- /dev/null
+++ b/queue-4.19/genirq-matrix-improve-target-cpu-selection-for-manag.patch
@@ -0,0 +1,216 @@
+From c7ca3df05628b3d8f8a33e2f69b1b0bd8411f0c5 Mon Sep 17 00:00:00 2001
+From: Long Li
+Date: Tue, 6 Nov 2018 04:00:00 +0000
+Subject: genirq/matrix: Improve target CPU selection for managed interrupts.
+
+[ Upstream commit e8da8794a7fd9eef1ec9a07f0d4897c68581c72b ]
+
+On large systems with multiple devices of the same class (e.g. NVMe disks,
+using managed interrupts), the kernel can affinitize these interrupts to a
+small subset of CPUs instead of spreading them out evenly.
+
+irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask
+of possible target CPUs which has the lowest number of interrupt vectors
+allocated.
+
+This is done by searching the CPU with the highest number of available
+vectors. While this is correct for non-managed interrupts, it can select
+the wrong CPU for managed interrupts. Under certain constellations this
+results in affinitizing the managed interrupts of several devices to a
+single CPU in a set.
+
+The bookkeeping of available vectors works the following way:
+
+ 1) Non-managed interrupts:
+
+    available is decremented when the interrupt is actually requested by
+    the device driver and a vector is assigned. It's incremented when the
+    interrupt and the vector are freed.
+
+ 2) Managed interrupts:
+
+    Managed interrupts guarantee vector reservation when the MSI/MSI-X
+    functionality of a device is enabled, which is achieved by reserving
+    vectors in the bitmaps of the possible target CPUs. This reservation
+    decrements the available count on each possible target CPU.
+
+    When the interrupt is requested by the device driver then a vector is
+    allocated from the reserved region. The operation is reversed when the
+    interrupt is freed by the device driver. Neither of these operations
+    affects the available count.
+
+    The reservation persists up to the point where the MSI/MSI-X
+    functionality is disabled and only this operation increments the
+    available count again.
+
+For non-managed interrupts the available count is the correct selection
+criterion because the guaranteed reservations need to be taken into
+account. Using the allocated counter could lead to a failing allocation in
+the following situation (total vector space of 10 assumed):
+
+                    CPU0    CPU1
+ available:            2       0
+ allocated:            5       3   <--- CPU1 is selected, but available space = 0
+ managed reserved:     3       7
+
+ while available yields the correct result.
+
+For managed interrupts the available count is not the appropriate
+selection criterion because as explained above the available count is not
+affected by the actual vector allocation.
+
+The following example illustrates that. Total vector space of 10
+assumed. The starting point is:
+
+                    CPU0    CPU1
+ available:            5       4
+ allocated:            2       3
+ managed reserved:     3       3
+
+ Allocating vectors for three non-managed interrupts will result in
+ affinitizing the first two to CPU0 and the third one to CPU1 because the
+ available count is adjusted with each allocation:
+
+                    CPU0    CPU1
+ available:            5       4   <- Select CPU0 for 1st allocation
+ --> allocated:        3       3
+
+ available:            4       4   <- Select CPU0 for 2nd allocation
+ --> allocated:        4       3
+
+ available:            3       4   <- Select CPU1 for 3rd allocation
+ --> allocated:        4       4
+
+ But the allocation of three managed interrupts starting from the same
+ point will affinitize all of them to CPU0 because the available count is
+ not affected by the allocation (see above). So the end result is:
+
+                    CPU0    CPU1
+ available:            5       4
+ allocated:            5       3
+
+Introduce a "managed_allocated" field in struct cpumap to track the vector
+allocation for managed interrupts separately. Use this information to
+select the target CPU when a vector is allocated for a managed interrupt,
+which results in more evenly distributed vector assignments. The above
+example results in the following allocations:
+
+                       CPU0    CPU1
+ managed_allocated:       0       0   <- Select CPU0 for 1st allocation
+ --> allocated:           3       3
+
+ managed_allocated:       1       0   <- Select CPU1 for 2nd allocation
+ --> allocated:           3       4
+
+ managed_allocated:       1       1   <- Select CPU0 for 3rd allocation
+ --> allocated:           4       4
+
+The allocation of non-managed interrupts is not affected by this change
+and still uses the available count as its selection criterion.
+
+The overall distribution of interrupt vectors for both types of interrupts
+might still not be perfectly even depending on the number of non-managed
+and managed interrupts in a system, but due to the reservation guarantee
+for managed interrupts this cannot be avoided.
+
+Expose the new field in debugfs as well.
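+
+As a purely illustrative aside, and not part of the upstream change: the
+effect of the two selection criteria can be reproduced with a small
+standalone userspace program. struct cpu and the pick_by_*() helpers below
+are made-up stand-ins for the kernel's struct cpumap and matrix helpers,
+reduced to the counters that matter for the decision:
+
+    #include <stdio.h>
+
+    struct cpu {
+        unsigned int available;          /* free vector slots */
+        unsigned int managed_allocated;  /* managed vectors in use */
+    };
+
+    /* Old criterion: pick the CPU with the most available vectors */
+    static int pick_by_available(const struct cpu *c, int n)
+    {
+        unsigned int maxavl = 0;
+        int i, best = -1;
+
+        for (i = 0; i < n; i++) {
+            if (c[i].available <= maxavl)
+                continue;
+            best = i;
+            maxavl = c[i].available;
+        }
+        return best;
+    }
+
+    /* New criterion: pick the CPU with the fewest managed vectors in use */
+    static int pick_by_managed_allocated(const struct cpu *c, int n)
+    {
+        unsigned int lowest = ~0u;
+        int i, best = -1;
+
+        for (i = 0; i < n; i++) {
+            if (c[i].managed_allocated > lowest)
+                continue;
+            best = i;
+            lowest = c[i].managed_allocated;
+        }
+        return best;
+    }
+
+    int main(void)
+    {
+        /* Starting point of the example above */
+        struct cpu cpus[2] = { { 5, 0 }, { 4, 0 } };
+        int i, cpu;
+
+        for (i = 0; i < 3; i++) {
+            /* Managed allocation does not touch the available count */
+            cpu = pick_by_managed_allocated(cpus, 2);
+            cpus[cpu].managed_allocated++;
+            printf("managed irq %d -> CPU%d\n", i, cpu);
+        }
+        return 0;
+    }
+
+The three managed interrupts alternate between the two CPUs (exact
+tie-breaking order aside), whereas pick_by_available() would put all three
+on CPU0, because available never changes while the loop runs.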
+
+[ tglx: Clarified the background of the problem in the changelog and
+  described it independent of NVME ]
+
+Signed-off-by: Long Li
+Signed-off-by: Thomas Gleixner
+Cc: Michael Kelley
+Link: https://lkml.kernel.org/r/20181106040000.27316-1-longli@linuxonhyperv.com
+Signed-off-by: Sasha Levin
+---
+ kernel/irq/matrix.c | 34 ++++++++++++++++++++++++++++++----
+ 1 file changed, 30 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
+index 6e6d467f3dec5..92337703ca9fd 100644
+--- a/kernel/irq/matrix.c
++++ b/kernel/irq/matrix.c
+@@ -14,6 +14,7 @@ struct cpumap {
+ 	unsigned int	available;
+ 	unsigned int	allocated;
+ 	unsigned int	managed;
++	unsigned int	managed_allocated;
+ 	bool		initialized;
+ 	bool		online;
+ 	unsigned long	alloc_map[IRQ_MATRIX_SIZE];
+@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
+ 	return best_cpu;
+ }
+ 
++/* Find the best CPU which has the lowest number of managed IRQs allocated */
++static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
++						const struct cpumask *msk)
++{
++	unsigned int cpu, best_cpu, allocated = UINT_MAX;
++	struct cpumap *cm;
++
++	best_cpu = UINT_MAX;
++
++	for_each_cpu(cpu, msk) {
++		cm = per_cpu_ptr(m->maps, cpu);
++
++		if (!cm->online || cm->managed_allocated > allocated)
++			continue;
++
++		best_cpu = cpu;
++		allocated = cm->managed_allocated;
++	}
++	return best_cpu;
++}
++
+ /**
+  * irq_matrix_assign_system - Assign system wide entry in the matrix
+  * @m: Matrix pointer
+@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
+ 	if (cpumask_empty(msk))
+ 		return -EINVAL;
+ 
+-	cpu = matrix_find_best_cpu(m, msk);
++	cpu = matrix_find_best_cpu_managed(m, msk);
+ 	if (cpu == UINT_MAX)
+ 		return -ENOSPC;
+ 
+@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
+ 		return -ENOSPC;
+ 	set_bit(bit, cm->alloc_map);
+ 	cm->allocated++;
++	cm->managed_allocated++;
+ 	m->total_allocated++;
+ 	*mapped_cpu = cpu;
+ 	trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ 
+ 	clear_bit(bit, cm->alloc_map);
+ 	cm->allocated--;
++	if(managed)
++		cm->managed_allocated--;
+ 
+ 	if (cm->online)
+ 		m->total_allocated--;
+@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+ 	seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ 	seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ 		   m->system_map);
+-	seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
++	seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
+ 	cpus_read_lock();
+ 	for_each_online_cpu(cpu) {
+ 		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ 
+-		seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+-			   cpu, cm->available, cm->managed, cm->allocated,
++		seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ",
++			   cpu, cm->available, cm->managed,
++			   cm->managed_allocated, cm->allocated,
+ 			   m->matrix_bits, cm->alloc_map);
+ 	}
+ 	cpus_read_unlock();
+-- 
+2.19.1
+
diff --git a/queue-4.19/irq-matrix-split-out-the-cpu-selection-code-into-a-h.patch b/queue-4.19/irq-matrix-split-out-the-cpu-selection-code-into-a-h.patch
new file mode 100644
index 00000000000..3f3a503edf2
--- /dev/null
+++ b/queue-4.19/irq-matrix-split-out-the-cpu-selection-code-into-a-h.patch
@@ -0,0 +1,113 @@
+From 504302562cb34ba1a9b73753f9735da29d8f5ef2 Mon Sep 17 00:00:00 2001
+From: Dou Liyang
+Date: Sun, 9 Sep 2018 01:58:37 +0800
+Subject: irq/matrix: Split out the CPU selection code into a helper
+
+[ Upstream commit 8ffe4e61c06a48324cfd97f1199bb9838acce2f2 ]
+
+Linux finds the CPU which has the lowest vector allocation count to spread
+out the non managed interrupts across the possible target CPUs, but does
+not do so for managed interrupts.
+
+Split out the CPU selection code into a helper function for reuse. No
+functional change.
+
+Signed-off-by: Dou Liyang
+Signed-off-by: Thomas Gleixner
+Cc: hpa@zytor.com
+Link: https://lkml.kernel.org/r/20180908175838.14450-1-dou_liyang@163.com
+Signed-off-by: Sasha Levin
+---
+ kernel/irq/matrix.c | 65 ++++++++++++++++++++++++++-------------------
+ 1 file changed, 38 insertions(+), 27 deletions(-)
+
+diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
+index 5092494bf2614..67768bbe736ed 100644
+--- a/kernel/irq/matrix.c
++++ b/kernel/irq/matrix.c
+@@ -124,6 +124,27 @@ static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+ 	return area;
+ }
+ 
++/* Find the best CPU which has the lowest vector allocation count */
++static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
++					const struct cpumask *msk)
++{
++	unsigned int cpu, best_cpu, maxavl = 0;
++	struct cpumap *cm;
++
++	best_cpu = UINT_MAX;
++
++	for_each_cpu(cpu, msk) {
++		cm = per_cpu_ptr(m->maps, cpu);
++
++		if (!cm->online || cm->available <= maxavl)
++			continue;
++
++		best_cpu = cpu;
++		maxavl = cm->available;
++	}
++	return best_cpu;
++}
++
+ /**
+  * irq_matrix_assign_system - Assign system wide entry in the matrix
+  * @m: Matrix pointer
+@@ -322,37 +343,27 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
+ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ 		     bool reserved, unsigned int *mapped_cpu)
+ {
+-	unsigned int cpu, best_cpu, maxavl = 0;
++	unsigned int cpu, bit;
+ 	struct cpumap *cm;
+-	unsigned int bit;
+ 
+-	best_cpu = UINT_MAX;
+-	for_each_cpu(cpu, msk) {
+-		cm = per_cpu_ptr(m->maps, cpu);
+-
+-		if (!cm->online || cm->available <= maxavl)
+-			continue;
++	cpu = matrix_find_best_cpu(m, msk);
++	if (cpu == UINT_MAX)
++		return -ENOSPC;
+ 
+-		best_cpu = cpu;
+-		maxavl = cm->available;
+-	}
++	cm = per_cpu_ptr(m->maps, cpu);
++	bit = matrix_alloc_area(m, cm, 1, false);
++	if (bit >= m->alloc_end)
++		return -ENOSPC;
++	cm->allocated++;
++	cm->available--;
++	m->total_allocated++;
++	m->global_available--;
++	if (reserved)
++		m->global_reserved--;
++	*mapped_cpu = cpu;
++	trace_irq_matrix_alloc(bit, cpu, m, cm);
++	return bit;
+ 
+-	if (maxavl) {
+-		cm = per_cpu_ptr(m->maps, best_cpu);
+-		bit = matrix_alloc_area(m, cm, 1, false);
+-		if (bit < m->alloc_end) {
+-			cm->allocated++;
+-			cm->available--;
+-			m->total_allocated++;
+-			m->global_available--;
+-			if (reserved)
+-				m->global_reserved--;
+-			*mapped_cpu = best_cpu;
+-			trace_irq_matrix_alloc(bit, best_cpu, m, cm);
+-			return bit;
+-		}
+-	}
+-	return -ENOSPC;
+ }
+ 
+ /**
+-- 
+2.19.1
+
diff --git a/queue-4.19/irq-matrix-spread-managed-interrupts-on-allocation.patch b/queue-4.19/irq-matrix-spread-managed-interrupts-on-allocation.patch
new file mode 100644
index 00000000000..82e82bc62f4
--- /dev/null
+++ b/queue-4.19/irq-matrix-spread-managed-interrupts-on-allocation.patch
@@ -0,0 +1,115 @@
+From ead271d20be11196f16560c385cc132a5a4f1a8a Mon Sep 17 00:00:00 2001
+From: Dou Liyang
+Date: Sun, 9 Sep 2018 01:58:38 +0800
+Subject: irq/matrix: Spread managed interrupts on allocation
+
+[ Upstream commit 76f99ae5b54d48430d1f0c5512a84da0ff9761e0 ]
+
+Linux spreads out the non managed interrupts across the possible target
+CPUs to avoid vector space exhaustion.
+
+Managed interrupts are treated differently, as for them the vectors are
+reserved (with guarantee) when the interrupt descriptors are initialized.
+
+When the interrupt is requested a real vector is assigned. The assignment
+logic uses the first CPU in the affinity mask for assignment. If the
+interrupt has more than one CPU in the affinity mask, which happens when a
+multi queue device has fewer queues than CPUs, then doing the same search
+as for non managed interrupts makes sense as it puts the interrupt on the
+least interrupt plagued CPU. For single CPU affine vectors that's obviously
+a NOOP.
+
+Restructure the matrix allocation code so it does the 'best CPU' search,
+add the sanity check for an empty affinity mask and adapt the call site in
+the x86 vector management code.
+
+[ tglx: Added the empty mask check to the core and improved change log ]
+
+Signed-off-by: Dou Liyang
+Signed-off-by: Thomas Gleixner
+Cc: hpa@zytor.com
+Link: https://lkml.kernel.org/r/20180908175838.14450-2-dou_liyang@163.com
+Signed-off-by: Sasha Levin
+---
+ arch/x86/kernel/apic/vector.c |  9 ++++-----
+ include/linux/irq.h           |  3 ++-
+ kernel/irq/matrix.c           | 17 ++++++++++++++---
+ 3 files changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
+index 7654febd51027..652e7ffa9b9de 100644
+--- a/arch/x86/kernel/apic/vector.c
++++ b/arch/x86/kernel/apic/vector.c
+@@ -313,14 +313,13 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
+ 	struct apic_chip_data *apicd = apic_chip_data(irqd);
+ 	int vector, cpu;
+ 
+-	cpumask_and(vector_searchmask, vector_searchmask, affmsk);
+-	cpu = cpumask_first(vector_searchmask);
+-	if (cpu >= nr_cpu_ids)
+-		return -EINVAL;
++	cpumask_and(vector_searchmask, dest, affmsk);
++
+ 	/* set_affinity might call here for nothing */
+ 	if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+ 		return 0;
+-	vector = irq_matrix_alloc_managed(vector_matrix, cpu);
++	vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask,
++					  &cpu);
+ 	trace_vector_alloc_managed(irqd->irq, vector, vector);
+ 	if (vector < 0)
+ 		return vector;
+diff --git a/include/linux/irq.h b/include/linux/irq.h
+index 201de12a99571..c9bffda04a450 100644
+--- a/include/linux/irq.h
++++ b/include/linux/irq.h
+@@ -1151,7 +1151,8 @@ void irq_matrix_offline(struct irq_matrix *m);
+ void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace);
+ int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk);
+ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk);
+-int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu);
++int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
++			     unsigned int *mapped_cpu);
+ void irq_matrix_reserve(struct irq_matrix *m);
+ void irq_matrix_remove_reserved(struct irq_matrix *m);
+ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
+index 67768bbe736ed..6e6d467f3dec5 100644
+--- a/kernel/irq/matrix.c
++++ b/kernel/irq/matrix.c
+@@ -260,11 +260,21 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+  * @m: Matrix pointer
+  * @cpu: On which CPU the interrupt should be allocated
+  */
+-int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
++int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
++			     unsigned int *mapped_cpu)
+ {
+-	struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+-	unsigned int bit, end = m->alloc_end;
++	unsigned int bit, cpu, end = m->alloc_end;
++	struct cpumap *cm;
++
++	if (cpumask_empty(msk))
++		return -EINVAL;
+ 
++	cpu = matrix_find_best_cpu(m, msk);
++	if (cpu == UINT_MAX)
++		return -ENOSPC;
++
++	cm = per_cpu_ptr(m->maps, cpu);
++	end = m->alloc_end;
+ 	/* Get managed bit which are not allocated */
+ 	bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+ 	bit = find_first_bit(m->scratch_map, end);
+@@ -273,6 +283,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+ 	set_bit(bit, cm->alloc_map);
+ 	cm->allocated++;
+ 	m->total_allocated++;
++	*mapped_cpu = cpu;
+ 	trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+ 	return bit;
+ }
+-- 
+2.19.1
+
diff --git a/queue-4.19/series b/queue-4.19/series
index e6455f04afb..856fd25ed1b 100644
--- a/queue-4.19/series
+++ b/queue-4.19/series
@@ -150,3 +150,6 @@ netfilter-nfnetlink_osf-add-missing-fmatch-check.patch
 netfilter-ipt_clusterip-fix-sleep-in-atomic-bug-in-clusterip_config_entry_put.patch
 udlfb-handle-unplug-properly.patch
 pinctrl-max77620-use-define-directive-for-max77620_pinconf_param-values.patch
+irq-matrix-split-out-the-cpu-selection-code-into-a-h.patch
+irq-matrix-spread-managed-interrupts-on-allocation.patch
+genirq-matrix-improve-target-cpu-selection-for-manag.patch
-- 
2.47.3
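
Illustration, not part of the series above: with all three patches applied,
the per-CPU matrix dump in debugfs (on x86 the VECTOR irq domain file,
typically /sys/kernel/debug/irq/domains/VECTOR) gains the new "mac"
(managed_allocated) column between "man" and "act", matching the
seq_printf() change in the first patch. With made-up numbers, a two-CPU
dump would read roughly:

    | CPU | avl | man | mac | act | vectors
         0   185    6    2     8  33-40
         1   183    6    1    10  33-42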