--- /dev/null
+From: Suresh Siddha <suresh.b.siddha@intel.com>
+Subject: x64, x2apic/intr-remap: IO-APIC support for interrupt-remapping
+References: fate #303948 and fate #303984
+Patch-Mainline: queued for .28
+Commit-ID: 89027d35aa5b8f45ce0f7fa0911db85b46563da0
+
+Signed-off-by: Thomas Renninger <trenn@suse.de>
+
+IO-APIC support in the presence of interrupt-remapping infrastructure.
+
+IO-APIC RTE will be programmed with interrupt-remapping table entry(IRTE)
+index and the IRTE will contain information about the vector, cpu destination,
+trigger mode etc, which traditionally was present in the IO-APIC RTE.
+
+Introduce a new irq_chip for cleaner irq migration (in process context,
+as opposed to the current irq migration in the context of an interrupt;
+the interrupt-remapping infrastructure will help us achieve this cleanly).
+
+For edge triggered, irq migration is a simple atomic update (of vector
+and cpu destination) of the IRTE followed by a hardware cache flush.
+
+For level triggered, we also need to modify the io-apic RTE with the updated
+vector information, along with modifying IRTE with vector and cpu destination.
+So irq migration for level triggered is a little bit more complex compared to
+edge triggered migration. But the good news is, we use the same algorithm
+for level triggered migration as we have today, only difference being,
+we now initiate the irq migration from process context instead of the
+interrupt context.
+
+In future, when we do a directed EOI (combined with cpu EOI broadcast
+suppression) to the IO-APIC, level triggered irq migration will also be
+as simple as edge triggered migration and we can do the irq migration
+with a simple atomic update to IO-APIC RTE.
+
+TBD: some tests/changes needed in the presence of fixup_irqs() for
+level triggered irq migration.
+
+Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
+Cc: akpm@linux-foundation.org
+Cc: arjan@linux.intel.com
+Cc: andi@firstfloor.org
+Cc: ebiederm@xmission.com
+Cc: jbarnes@virtuousgeek.org
+Cc: steiner@sgi.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+
+---
+ arch/x86/kernel/apic_64.c | 1
+ arch/x86/kernel/io_apic_64.c | 300 +++++++++++++++++++++++++++++++++++++---
+ drivers/pci/intr_remapping.c | 10 +
+ include/asm-x86/apic.h | 8 +
+ include/asm-x86/io_apic.h | 14 +
+ include/asm-x86/irq_remapping.h | 8 +
+ include/linux/dmar.h | 1
+ 7 files changed, 320 insertions(+), 22 deletions(-)
+
+Index: linux-2.6.26/arch/x86/kernel/apic_64.c
+===================================================================
+--- linux-2.6.26.orig/arch/x86/kernel/apic_64.c
++++ linux-2.6.26/arch/x86/kernel/apic_64.c
+@@ -46,6 +46,7 @@
+ static int disable_apic_timer __cpuinitdata;
+ static int apic_calibrate_pmtmr __initdata;
+ int disable_apic;
++int x2apic;
+
+ /* Local APIC timer works in C2 */
+ int local_apic_timer_c2_ok;
+Index: linux-2.6.26/arch/x86/kernel/io_apic_64.c
+===================================================================
+--- linux-2.6.26.orig/arch/x86/kernel/io_apic_64.c
++++ linux-2.6.26/arch/x86/kernel/io_apic_64.c
+@@ -37,6 +37,7 @@
+ #include <acpi/acpi_bus.h>
+ #endif
+ #include <linux/bootmem.h>
++#include <linux/dmar.h>
+
+ #include <asm/idle.h>
+ #include <asm/io.h>
+@@ -49,6 +50,7 @@
+ #include <asm/nmi.h>
+ #include <asm/msidef.h>
+ #include <asm/hypertransport.h>
++#include <asm/irq_remapping.h>
+
+ #include <mach_ipi.h>
+ #include <mach_apic.h>
+@@ -306,7 +308,12 @@ static void __target_IO_APIC_irq(unsigne
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+- io_apic_write(apic, 0x11 + pin*2, dest);
++ /*
++ * With interrupt-remapping, destination information comes
++ * from interrupt-remapping table entry.
++ */
++ if (!irq_remapped(irq))
++ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+@@ -905,18 +912,98 @@ void __setup_vector_irq(int cpu)
+ }
+
+ static struct irq_chip ioapic_chip;
++#ifdef CONFIG_INTR_REMAP
++static struct irq_chip ir_ioapic_chip;
++#endif
+
+ static void ioapic_register_intr(int irq, unsigned long trigger)
+ {
+- if (trigger) {
++ if (trigger)
+ irq_desc[irq].status |= IRQ_LEVEL;
+- set_irq_chip_and_handler_name(irq, &ioapic_chip,
+- handle_fasteoi_irq, "fasteoi");
+- } else {
++ else
+ irq_desc[irq].status &= ~IRQ_LEVEL;
++
++#ifdef CONFIG_INTR_REMAP
++ if (irq_remapped(irq)) {
++ irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
++ if (trigger)
++ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
++ handle_fasteoi_irq,
++ "fasteoi");
++ else
++ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
++ handle_edge_irq, "edge");
++ return;
++ }
++#endif
++ if (trigger)
++ set_irq_chip_and_handler_name(irq, &ioapic_chip,
++ handle_fasteoi_irq,
++ "fasteoi");
++ else
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_edge_irq, "edge");
++}
++
++static int setup_ioapic_entry(int apic, int irq,
++ struct IO_APIC_route_entry *entry,
++ unsigned int destination, int trigger,
++ int polarity, int vector)
++{
++ /*
++ * add it to the IO-APIC irq-routing table:
++ */
++ memset(entry,0,sizeof(*entry));
++
++#ifdef CONFIG_INTR_REMAP
++ if (intr_remapping_enabled) {
++ struct intel_iommu *iommu = map_ioapic_to_ir(apic);
++ struct irte irte;
++ struct IR_IO_APIC_route_entry *ir_entry =
++ (struct IR_IO_APIC_route_entry *) entry;
++ int index;
++
++ if (!iommu)
++ panic("No mapping iommu for ioapic %d\n", apic);
++
++ index = alloc_irte(iommu, irq, 1);
++ if (index < 0)
++ panic("Failed to allocate IRTE for ioapic %d\n", apic);
++
++ memset(&irte, 0, sizeof(irte));
++
++ irte.present = 1;
++ irte.dst_mode = INT_DEST_MODE;
++ irte.trigger_mode = trigger;
++ irte.dlvry_mode = INT_DELIVERY_MODE;
++ irte.vector = vector;
++ irte.dest_id = IRTE_DEST(destination);
++
++ modify_irte(irq, &irte);
++
++ ir_entry->index2 = (index >> 15) & 0x1;
++ ir_entry->zero = 0;
++ ir_entry->format = 1;
++ ir_entry->index = (index & 0x7fff);
++ } else
++#endif
++ {
++ entry->delivery_mode = INT_DELIVERY_MODE;
++ entry->dest_mode = INT_DEST_MODE;
++ entry->dest = destination;
+ }
++
++ entry->mask = 0; /* enable IRQ */
++ entry->trigger = trigger;
++ entry->polarity = polarity;
++ entry->vector = vector;
++
++ /* Mask level triggered irqs.
++ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
++ */
++ if (trigger)
++ entry->mask = 1;
++ return 0;
+ }
+
+ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+@@ -941,24 +1028,15 @@ static void setup_IO_APIC_irq(int apic,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+ irq, trigger, polarity);
+
+- /*
+- * add it to the IO-APIC irq-routing table:
+- */
+- memset(&entry,0,sizeof(entry));
+
+- entry.delivery_mode = INT_DELIVERY_MODE;
+- entry.dest_mode = INT_DEST_MODE;
+- entry.dest = cpu_mask_to_apicid(mask);
+- entry.mask = 0; /* enable IRQ */
+- entry.trigger = trigger;
+- entry.polarity = polarity;
+- entry.vector = cfg->vector;
+-
+- /* Mask level triggered irqs.
+- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+- */
+- if (trigger)
+- entry.mask = 1;
++ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
++ cpu_mask_to_apicid(mask), trigger, polarity,
++ cfg->vector)) {
++ printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
++ mp_ioapics[apic].mp_apicid, pin);
++ __clear_irq_vector(irq);
++ return;
++ }
+
+ ioapic_register_intr(irq, trigger);
+ if (irq < 16)
+@@ -1010,6 +1088,9 @@ static void __init setup_timer_IRQ0_pin(
+ {
+ struct IO_APIC_route_entry entry;
+
++ if (intr_remapping_enabled)
++ return;
++
+ memset(&entry, 0, sizeof(entry));
+
+ /*
+@@ -1463,6 +1544,147 @@ static int ioapic_retrigger_irq(unsigned
+ */
+
+ #ifdef CONFIG_SMP
++
++#ifdef CONFIG_INTR_REMAP
++static void ir_irq_migration(struct work_struct *work);
++
++static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
++
++/*
++ * Migrate the IO-APIC irq in the presence of intr-remapping.
++ *
++ * For edge triggered, irq migration is a simple atomic update (of vector
++ * and cpu destination) of the IRTE followed by a hardware cache flush.
++ *
++ * For level triggered, we also need to modify the io-apic RTE with the updated
++ * vector information, along with modifying IRTE with vector and destination.
++ * So irq migration for level triggered is a little bit more complex compared to
++ * edge triggered migration. But the good news is, we use the same algorithm
++ * for level triggered migration as we have today, only difference being,
++ * we now initiate the irq migration from process context instead of the
++ * interrupt context.
++ *
++ * In future, when we do a directed EOI (combined with cpu EOI broadcast
++ * suppression) to the IO-APIC, level triggered irq migration will also be
++ * as simple as edge triggered migration and we can do the irq migration
++ * with a simple atomic update to IO-APIC RTE.
++ */
++static void migrate_ioapic_irq(int irq, cpumask_t mask)
++{
++ struct irq_cfg *cfg = irq_cfg + irq;
++ struct irq_desc *desc = irq_desc + irq;
++ cpumask_t tmp, cleanup_mask;
++ struct irte irte;
++ int modify_ioapic_rte = desc->status & IRQ_LEVEL;
++ unsigned int dest;
++ unsigned long flags;
++
++ cpus_and(tmp, mask, cpu_online_map);
++ if (cpus_empty(tmp))
++ return;
++
++ if (get_irte(irq, &irte))
++ return;
++
++ if (assign_irq_vector(irq, mask))
++ return;
++
++ cpus_and(tmp, cfg->domain, mask);
++ dest = cpu_mask_to_apicid(tmp);
++
++ if (modify_ioapic_rte) {
++ spin_lock_irqsave(&ioapic_lock, flags);
++ __target_IO_APIC_irq(irq, dest, cfg->vector);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ }
++
++ irte.vector = cfg->vector;
++ irte.dest_id = IRTE_DEST(dest);
++
++ /*
++ * Modified the IRTE and flushes the Interrupt entry cache.
++ */
++ modify_irte(irq, &irte);
++
++ if (cfg->move_in_progress) {
++ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
++ cfg->move_cleanup_count = cpus_weight(cleanup_mask);
++ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
++ cfg->move_in_progress = 0;
++ }
++
++ irq_desc[irq].affinity = mask;
++}
++
++static int migrate_irq_remapped_level(int irq)
++{
++ int ret = -1;
++
++ mask_IO_APIC_irq(irq);
++
++ if (io_apic_level_ack_pending(irq)) {
++ /*
++ * Interrupt in progress. Migrating irq now will change the
++ * vector information in the IO-APIC RTE and that will confuse
++ * the EOI broadcast performed by cpu.
++ * So, delay the irq migration to the next instance.
++ */
++ schedule_delayed_work(&ir_migration_work, 1);
++ goto unmask;
++ }
++
++ /* everthing is clear. we have right of way */
++ migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
++
++ ret = 0;
++ irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
++ cpus_clear(irq_desc[irq].pending_mask);
++
++unmask:
++ unmask_IO_APIC_irq(irq);
++ return ret;
++}
++
++static void ir_irq_migration(struct work_struct *work)
++{
++ int irq;
++
++ for (irq = 0; irq < NR_IRQS; irq++) {
++ struct irq_desc *desc = irq_desc + irq;
++ if (desc->status & IRQ_MOVE_PENDING) {
++ unsigned long flags;
++
++ spin_lock_irqsave(&desc->lock, flags);
++ if (!desc->chip->set_affinity ||
++ !(desc->status & IRQ_MOVE_PENDING)) {
++ desc->status &= ~IRQ_MOVE_PENDING;
++ spin_unlock_irqrestore(&desc->lock, flags);
++ continue;
++ }
++
++ desc->chip->set_affinity(irq,
++ irq_desc[irq].pending_mask);
++ spin_unlock_irqrestore(&desc->lock, flags);
++ }
++ }
++}
++
++/*
++ * Migrates the IRQ destination in the process context.
++ */
++static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++{
++ if (irq_desc[irq].status & IRQ_LEVEL) {
++ irq_desc[irq].status |= IRQ_MOVE_PENDING;
++ irq_desc[irq].pending_mask = mask;
++ migrate_irq_remapped_level(irq);
++ return;
++ }
++
++ migrate_ioapic_irq(irq, mask);
++}
++#endif
++
+ asmlinkage void smp_irq_move_cleanup_interrupt(void)
+ {
+ unsigned vector, me;
+@@ -1519,6 +1741,17 @@ static void irq_complete_move(unsigned i
+ #else
+ static inline void irq_complete_move(unsigned int irq) {}
+ #endif
++#ifdef CONFIG_INTR_REMAP
++static void ack_x2apic_level(unsigned int irq)
++{
++ ack_x2APIC_irq();
++}
++
++static void ack_x2apic_edge(unsigned int irq)
++{
++ ack_x2APIC_irq();
++}
++#endif
+
+ static void ack_apic_edge(unsigned int irq)
+ {
+@@ -1593,6 +1826,21 @@ static struct irq_chip ioapic_chip __rea
+ .retrigger = ioapic_retrigger_irq,
+ };
+
++#ifdef CONFIG_INTR_REMAP
++static struct irq_chip ir_ioapic_chip __read_mostly = {
++ .name = "IR-IO-APIC",
++ .startup = startup_ioapic_irq,
++ .mask = mask_IO_APIC_irq,
++ .unmask = unmask_IO_APIC_irq,
++ .ack = ack_x2apic_edge,
++ .eoi = ack_x2apic_level,
++#ifdef CONFIG_SMP
++ .set_affinity = set_ir_ioapic_affinity_irq,
++#endif
++ .retrigger = ioapic_retrigger_irq,
++};
++#endif
++
+ static inline void init_IO_APIC_traps(void)
+ {
+ int irq;
+@@ -1778,6 +2026,8 @@ static inline void __init check_timer(vo
+ * 8259A.
+ */
+ if (pin1 == -1) {
++ if (intr_remapping_enabled)
++ panic("BIOS bug: timer not connected to IO-APIC");
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+@@ -1804,6 +2054,8 @@ static inline void __init check_timer(vo
+ clear_IO_APIC_pin(0, pin1);
+ goto out;
+ }
++ if (intr_remapping_enabled)
++ panic("timer doesn't work through Interrupt-remapped IO-APIC");
+ clear_IO_APIC_pin(apic1, pin1);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+@@ -2399,6 +2651,10 @@ void __init setup_ioapic_dest(void)
+ setup_IO_APIC_irq(ioapic, pin, irq,
+ irq_trigger(irq_entry),
+ irq_polarity(irq_entry));
++#ifdef CONFIG_INTR_REMAP
++ else if (intr_remapping_enabled)
++ set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
++#endif
+ else
+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
+ }
+Index: linux-2.6.26/drivers/pci/intr_remapping.c
+===================================================================
+--- linux-2.6.26.orig/drivers/pci/intr_remapping.c
++++ linux-2.6.26/drivers/pci/intr_remapping.c
+@@ -220,6 +220,16 @@ int flush_irte(int irq)
+ return 0;
+ }
+
++struct intel_iommu *map_ioapic_to_ir(int apic)
++{
++ int i;
++
++ for (i = 0; i < MAX_IO_APICS; i++)
++ if (ir_ioapic[i].id == apic)
++ return ir_ioapic[i].iommu;
++ return NULL;
++}
++
+ int free_irte(int irq)
+ {
+ int index, i;
+Index: linux-2.6.26/include/asm-x86/apic.h
+===================================================================
+--- linux-2.6.26.orig/include/asm-x86/apic.h
++++ linux-2.6.26/include/asm-x86/apic.h
+@@ -123,6 +123,14 @@ extern struct apic_ops *apic_ops;
+
+ extern int get_physical_broadcast(void);
+
++#ifdef CONFIG_X86_64
++static inline void ack_x2APIC_irq(void)
++{
++ /* Docs say use 0 for future compatibility */
++ native_apic_msr_write(APIC_EOI, 0);
++}
++#endif
++
+ static inline void ack_APIC_irq(void)
+ {
+ /*
+Index: linux-2.6.26/include/asm-x86/io_apic.h
+===================================================================
+--- linux-2.6.26.orig/include/asm-x86/io_apic.h
++++ linux-2.6.26/include/asm-x86/io_apic.h
+@@ -107,6 +107,20 @@ struct IO_APIC_route_entry {
+
+ } __attribute__ ((packed));
+
++struct IR_IO_APIC_route_entry {
++ __u64 vector : 8,
++ zero : 3,
++ index2 : 1,
++ delivery_status : 1,
++ polarity : 1,
++ irr : 1,
++ trigger : 1,
++ mask : 1,
++ reserved : 31,
++ format : 1,
++ index : 15;
++} __attribute__ ((packed));
++
+ #ifdef CONFIG_X86_IO_APIC
+
+ /*
+Index: linux-2.6.26/include/asm-x86/irq_remapping.h
+===================================================================
+--- /dev/null
++++ linux-2.6.26/include/asm-x86/irq_remapping.h
+@@ -0,0 +1,8 @@
++#ifndef _ASM_IRQ_REMAPPING_H
++#define _ASM_IRQ_REMAPPING_H
++
++extern int x2apic;
++
++#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
++
++#endif
+Index: linux-2.6.26/include/linux/dmar.h
+===================================================================
+--- linux-2.6.26.orig/include/linux/dmar.h
++++ linux-2.6.26/include/linux/dmar.h
+@@ -109,6 +109,7 @@ extern int flush_irte(int irq);
+ extern int free_irte(int irq);
+
+ extern int irq_remapped(int irq);
++extern struct intel_iommu *map_ioapic_to_ir(int apic);
+ #else
+ #define irq_remapped(irq) (0)
+ #define enable_intr_remapping(mode) (-1)