} else {
/* If dynamic allocation is enabled we have already allocated
* hwc msi
+ * Also, we make sure in this case the following is always true
+ * (num_msix_usable - 1 HWC) <= num_online_cpus()
*/
gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
}
* do the same thing.
*/
-static int irq_setup(unsigned int *irqs, unsigned int len, int node,
- bool skip_first_cpu)
+static int mana_irq_setup_numa_aware(unsigned int *irqs, unsigned int len,
+ int node, bool skip_first_cpu)
{
const struct cpumask *next, *prev = cpu_none_mask;
cpumask_var_t cpus __free(free_cpumask_var);
return 0;
}
+/* must be called with cpus_read_lock() held */
+static void mana_irq_setup_linear(unsigned int *irqs, unsigned int len)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (len == 0)
+ break;
+
+ irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
+ len--;
+ }
+}
+
static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
struct gdma_irq_context *gic;
- bool skip_first_cpu = false;
int *irqs, err, i, msi;
irqs = kmalloc_objs(int, nvec);
return -ENOMEM;
/*
+ * In this function, num_msix_usable = HWC IRQ + Queue IRQ.
+ * nvec is only Queue IRQ (HWC already setup).
* While processing the next pci irq vector, we start with index 1,
* as IRQ vector at index 0 is already processed for HWC.
* However, the population of irqs array starts with index 0, to be
- * further used in irq_setup()
+ * further used in mana_irq_setup_numa_aware()
*/
for (i = 1; i <= nvec; i++) {
msi = i;
}
/*
- * When calling irq_setup() for dynamically added IRQs, if number of
- * CPUs is more than or equal to allocated MSI-X, we need to skip the
- * first CPU sibling group since they are already affinitized to HWC IRQ
+ * When calling mana_irq_setup_numa_aware() for dynamically added IRQs,
+ * if number of CPUs is more than or equal to allocated MSI-X, we need to
+ * skip the first CPU sibling group since they are already affinitized to
+ * HWC IRQ
*/
cpus_read_lock();
- if (gc->num_msix_usable <= num_online_cpus())
- skip_first_cpu = true;
+ if (gc->num_msix_usable <= num_online_cpus()) {
+ err = mana_irq_setup_numa_aware(irqs, nvec, gc->numa_node,
+ true);
+ if (err) {
+ cpus_read_unlock();
+ goto free_irq;
+ }
+ } else {
+ /*
+ * When num_msix_usable are more than num_online_cpus, our
+ * queue IRQs should be equal to num of online vCPUs.
+ * We try to make sure queue IRQs spread across all vCPUs.
+ * In such a case NUMA or CPU core affinity does not matter.
+ * Note: in this case the total mana IRQ should always be
+ * num_online_cpus + 1. The first HWC IRQ is already handled
+ * in HWC setup calls
+ * However, if CPUs went offline since num_msix_usable was
+ * computed, queue IRQs will be more than num_online_cpus().
+ * In such cases remaining extra IRQs will retain their default
+ * affinity.
+ */
+ int first_unassigned = num_online_cpus();
- err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu);
- if (err) {
- cpus_read_unlock();
- goto free_irq;
+ if (nvec > first_unassigned) {
+ char buf[32];
+
+ if (first_unassigned == nvec - 1)
+ snprintf(buf, sizeof(buf), "%d",
+ first_unassigned);
+ else
+ snprintf(buf, sizeof(buf), "%d-%d",
+ first_unassigned, nvec - 1);
+
+ dev_dbg(&pdev->dev,
+ "MANA IRQ indices #%s will retain the default CPU affinity\n",
+ buf);
+ }
+
+ mana_irq_setup_linear(irqs, nvec);
}
cpus_read_unlock();
nvec -= 1;
}
- err = irq_setup(irqs, nvec, gc->numa_node, false);
+ err = mana_irq_setup_numa_aware(irqs, nvec, gc->numa_node, false);
if (err) {
cpus_read_unlock();
goto free_irq;