+++ /dev/null
-From: Arnd Bergmann <arnd.bergmann@de.ibm.com>
-Subject: powerpc/cell/axon-msi: retry on missing interrupt
-References: bnc#445964,bnc#467633
-
-The MSI capture logic on the axon bridge can sometimes
-lose interrupts in case of high DMA and interrupt load,
-when it signals an MSI interrupt to the MPIC interrupt
-controller while we are already handling another MSI.
-
-Each MSI vector gets written into a FIFO buffer in main
-memory using DMA, and that DMA access is normally flushed
-by the actual interrupt packet on the IOIF. An MMIO
-register in the MSIC holds the position of the last
-entry in the FIFO buffer that was written. However,
-reading that position does not flush the DMA, so that
-we can observe stale data in the buffer.
-
-In a stress test, we have observed the DMA to arrive
-up to 14 microseconds after reading the register.
-We can reliably detect this condition by writing
-an invalid MSI vector into the FIFO buffer after
-reading from it, assuming that all MSIs we get
-are valid. After detecting an invalid MSI vector,
-we call udelay(1) in the interrupt cascade up to
-100 times before giving up.
-
-Signed-off-by: Arnd Bergmann <arnd@arndb.de>
-Acked-by: John Jolly <jjolly@novell.com>
-
-commit 23e0e8afafd9ac065d81506524adf3339584044b
-Author: Arnd Bergmann <arnd@arndb.de>
-Date: Fri Dec 12 09:19:50 2008 +0000
-
- powerpc/cell/axon-msi: Fix MSI after kexec
-
- Commit d015fe995 'powerpc/cell/axon-msi: Retry on missing interrupt'
- has turned a rare failure to kexec on QS22 into a reproducible
- error, which we have now analysed.
-
- The problem is that after a kexec, the MSIC hardware still points
- into the middle of the old ring buffer. We set up the ring buffer
- during reboot, but not the offset into it. On older kernels, this
- would cause a storm of thousands of spurious interrupts after a
- kexec, which would most of the time get dropped silently.
-
- With the new code, we time out on each interrupt, waiting for
- it to become valid. If more interrupts come in that we time
- out on, this goes on indefinitely, which eventually leads to
- a hard crash.
-
- The solution in this commit is to read the current offset from
- the MSIC when reinitializing it. This now works correctly, as
- expected.
-
- Reported-by: Dirk Herrendoerfer <d.herrendoerfer@de.ibm.com>
- Signed-off-by: Arnd Bergmann <arnd@arndb.de>
- Acked-by: Michael Ellerman <michael@ellerman.id.au>
- Signed-off-by: Paul Mackerras <paulus@samba.org>
-
-
----
- arch/powerpc/platforms/cell/axon_msi.c | 39 ++++++++++++++++++++++++++++-----
- 1 file changed, 34 insertions(+), 5 deletions(-)
-
---- a/arch/powerpc/platforms/cell/axon_msi.c
-+++ b/arch/powerpc/platforms/cell/axon_msi.c
-@@ -95,6 +95,7 @@ static void axon_msi_cascade(unsigned in
- struct axon_msic *msic = get_irq_data(irq);
- u32 write_offset, msi;
- int idx;
-+ int retry = 0;
-
- write_offset = dcr_read(msic->dcr_host, MSIC_WRITE_OFFSET_REG);
- pr_debug("axon_msi: original write_offset 0x%x\n", write_offset);
-@@ -102,7 +103,7 @@ static void axon_msi_cascade(unsigned in
- /* write_offset doesn't wrap properly, so we have to mask it */
- write_offset &= MSIC_FIFO_SIZE_MASK;
-
-- while (msic->read_offset != write_offset) {
-+ while (msic->read_offset != write_offset && retry < 100) {
- idx = msic->read_offset / sizeof(__le32);
- msi = le32_to_cpu(msic->fifo_virt[idx]);
- msi &= 0xFFFF;
-@@ -110,13 +111,37 @@ static void axon_msi_cascade(unsigned in
- pr_debug("axon_msi: woff %x roff %x msi %x\n",
- write_offset, msic->read_offset, msi);
-
-+ if (msi < NR_IRQS && irq_map[msi].host == msic->irq_host) {
-+ generic_handle_irq(msi);
-+ msic->fifo_virt[idx] = cpu_to_le32(0xffffffff);
-+ } else {
-+ /*
-+ * Reading the MSIC_WRITE_OFFSET_REG does not
-+ * reliably flush the outstanding DMA to the
-+ * FIFO buffer. Here we were reading stale
-+ * data, so we need to retry.
-+ */
-+ udelay(1);
-+ retry++;
-+ pr_debug("axon_msi: invalid irq 0x%x!\n", msi);
-+ continue;
-+ }
-+
-+ if (retry) {
-+ pr_debug("axon_msi: late irq 0x%x, retry %d\n",
-+ msi, retry);
-+ retry = 0;
-+ }
-+
- msic->read_offset += MSIC_FIFO_ENTRY_SIZE;
- msic->read_offset &= MSIC_FIFO_SIZE_MASK;
-+ }
-
-- if (msi < NR_IRQS && irq_map[msi].host == msic->irq_host)
-- generic_handle_irq(msi);
-- else
-- pr_debug("axon_msi: invalid irq 0x%x!\n", msi);
-+ if (retry) {
-+ printk(KERN_WARNING "axon_msi: irq timed out\n");
-+
-+ msic->read_offset += MSIC_FIFO_ENTRY_SIZE;
-+ msic->read_offset &= MSIC_FIFO_SIZE_MASK;
- }
-
- desc->chip->eoi(irq);
-@@ -364,6 +389,7 @@ static int axon_msi_probe(struct of_devi
- dn->full_name);
- goto out_free_fifo;
- }
-+ memset(msic->fifo_virt, 0xff, MSIC_FIFO_SIZE_BYTES);
-
- msic->irq_host = irq_alloc_host(dn, IRQ_HOST_MAP_NOMAP,
- NR_IRQS, &msic_host_ops, 0);
-@@ -387,6 +413,9 @@ static int axon_msi_probe(struct of_devi
- MSIC_CTRL_IRQ_ENABLE | MSIC_CTRL_ENABLE |
- MSIC_CTRL_FIFO_SIZE);
-
-+ msic->read_offset = dcr_read(msic->dcr_host, MSIC_WRITE_OFFSET_REG)
-+ & MSIC_FIFO_SIZE_MASK;
-+
- device->dev.platform_data = msic;
-
- ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs;