obj-y += cpu.o
 obj-y += lowlevel.o
 obj-y += speed.o
+obj-$(CONFIG_MP) += mp.o
+obj-$(CONFIG_OF_LIBFDT) += fdt.o
 
 #include <asm/io.h>
 #include <asm/arch-fsl-lsch3/immap_lsch3.h>
 #include "cpu.h"
+#include "mp.h"
 #include "speed.h"
 #include <fsl_mc.h>
 
 #endif
        return error;
 }
+
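+/* Runs after relocation: release the secondary cores into their spin loops */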
+int arch_early_init_r(void)
+{
+       int rv;
+       rv = fsl_lsch3_wake_secondary_cores();
+
+       if (rv)
+               printf("Did not wake secondary cores\n");
+
+       return 0;
+}
 
  */
 
 int fsl_qoriq_core_to_cluster(unsigned int core);
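+/* Bitmask of present/enabled cores (implementation not part of this hunk) */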
+u32 cpu_mask(void);
 
--- /dev/null
+/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier:    GPL-2.0+
+ */
+
+#include <common.h>
+#include <libfdt.h>
+#include <fdt_support.h>
+#include "mp.h"
+
+#ifdef CONFIG_MP
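+/*
+ * Rewrite each cpu node for spin-table boot: set
+ * enable-method = "spin-table", point cpu-release-addr at the core's
+ * spin table element and memreserve the secondary boot code so the OS
+ * does not overwrite it.
+ */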
+void ft_fixup_cpu(void *blob)
+{
+       int off;
+       __maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
+       fdt32_t *reg;
+       int addr_cells;
+       u64 val;
+       size_t *boot_code_size = &(__secondary_boot_code_size);
+
+       off = fdt_path_offset(blob, "/cpus");
+       if (off < 0) {
+               puts("couldn't find /cpus node\n");
+               return;
+       }
+       of_bus_default_count_cells(blob, off, &addr_cells, NULL);
+
+       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
+       while (off != -FDT_ERR_NOTFOUND) {
+               reg = (fdt32_t *)fdt_getprop(blob, off, "reg", 0);
+               if (reg) {
+                       val = spin_tbl_addr;
+                       val += id_to_core(of_read_number(reg, addr_cells))
+                               * SPIN_TABLE_ELEM_SIZE;
+                       val = cpu_to_fdt64(val);
+                       fdt_setprop_string(blob, off, "enable-method",
+                                          "spin-table");
+                       fdt_setprop(blob, off, "cpu-release-addr",
+                                   &val, sizeof(val));
+               } else {
+                       puts("Warning: found cpu node without reg property\n");
+               }
+               off = fdt_node_offset_by_prop_value(blob, off, "device_type",
+                                                   "cpu", 4);
+       }
+
+       fdt_add_mem_rsv(blob, (uintptr_t)&secondary_boot_code,
+                       *boot_code_size);
+}
+#endif
+
+void ft_cpu_setup(void *blob, bd_t *bd)
+{
+#ifdef CONFIG_MP
+       ft_fixup_cpu(blob);
+#endif
+}
 
 
 #include <config.h>
 #include <linux/linkage.h>
+#include <asm/gic.h>
 #include <asm/macro.h>
+#include "mp.h"
 
 ENTRY(lowlevel_init)
        mov     x29, lr                 /* Save LR */
 #endif
 #endif
 
-       branch_if_master x0, x1, 1f
+       branch_if_master x0, x1, 2f
 
+       ldr     x0, =secondary_boot_func
+       blr     x0                      /* secondary cores do not return */
+2:
+       mov     lr, x29                 /* Restore LR */
+       ret
+ENDPROC(lowlevel_init)
+
+       /* Keep literals not used by the secondary boot code outside it */
+       .ltorg
+
+       /* Using 64 bit alignment since the spin table is accessed as data */
+       .align 4
+       .global secondary_boot_code
+       /* Secondary Boot Code starts here */
+secondary_boot_code:
+       .global __spin_table
+__spin_table:
+       .space CONFIG_MAX_CPUS*SPIN_TABLE_ELEM_SIZE
+
+       .align 2
+ENTRY(secondary_boot_func)
        /*
-        * Slave should wait for master clearing spin table.
-        * This sync prevent salves observing incorrect
-        * value of spin table and jumping to wrong place.
+        * MPIDR_EL1 Fields:
+        * MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
+        * MPIDR[7:2] = AFF0_RES
+        * MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
+        * MPIDR[23:16] = AFF2_CLUSTERID
+        * MPIDR[24] = MT
+        * MPIDR[29:25] = RES0
+        * MPIDR[30] = U
+        * MPIDR[31] = ME
+        * MPIDR[39:32] = AFF3
+        *
+        * Linear Processor ID (LPID) calculation from MPIDR_EL1:
+        * (We only use AFF0_CPUID and AFF1_CLUSTERID for now
+        * until AFF2_CLUSTERID and AFF3 have non-zero values)
+        *
+        * LPID = (MPIDR[15:8] << 2) | MPIDR[1:0]
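+        *
+        * Example: cluster 1, core 1 => MPIDR[15:0] = 0x0101,
+        * LPID = (1 << 2) | 1 = 5, and its spin table element is at
+        * offset 5 * 64 = 0x140 from __spin_table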
         */
-#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3)
-#ifdef CONFIG_GICV2
-       ldr     x0, =GICC_BASE
-#endif
-       bl      gic_wait_for_interrupt
-#endif
-
+       mrs     x0, mpidr_el1
+       ubfm    x1, x0, #8, #15
+       ubfm    x2, x0, #0, #1
+       orr     x10, x2, x1, lsl #2     /* x10 has LPID */
+       ubfm    x9, x0, #0, #15         /* x9 contains MPIDR[15:0] */
        /*
-        * All processors will enter EL2 and optionally EL1.
+        * offset of the spin table element for this core from start of spin
+        * table (each elem is padded to 64 bytes)
         */
-       bl      armv8_switch_to_el2
+       lsl     x1, x10, #6
+       ldr     x0, =__spin_table
+       /* physical address of this cpu's spin table element */
+       add     x11, x1, x0
+
+       str     x9, [x11, #16]  /* LPID */
+       mov     x4, #1
+       str     x4, [x11, #8]   /* STATUS */
+       dsb     sy
+#if defined(CONFIG_GICV3)
+       gic_wait_for_interrupt_m x0
+#elif defined(CONFIG_GICV2)
+       ldr     x0, =GICC_BASE
+       gic_wait_for_interrupt_m x0, w1
+#endif
+
+       bl      secondary_switch_to_el2
 #ifdef CONFIG_ARMV8_SWITCH_TO_EL1
-       bl      armv8_switch_to_el1
+       bl      secondary_switch_to_el1
 #endif
-       b       2f
 
-1:
-2:
-       mov     lr, x29                 /* Restore LR */
-       ret
-ENDPROC(lowlevel_init)
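+       /*
+        * Wait until the master (or later the OS, via cpu-release-addr)
+        * publishes a non-zero entry address, then jump to it.
+        */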
+slave_cpu:
+       wfe
+       ldr     x0, [x11]
+       cbz     x0, slave_cpu
+#ifndef CONFIG_ARMV8_SWITCH_TO_EL1
+       mrs     x1, sctlr_el2
+#else
+       mrs     x1, sctlr_el1
+#endif
+       tbz     x1, #25, cpu_is_le
+       rev     x0, x0                  /* BE to LE conversion */
+cpu_is_le:
+       br      x0                      /* branch to the given address */
+ENDPROC(secondary_boot_func)
+
+ENTRY(secondary_switch_to_el2)
+       switch_el x0, 1f, 0f, 0f
+0:     ret
+1:     armv8_switch_to_el2_m x0
+ENDPROC(secondary_switch_to_el2)
+
+ENTRY(secondary_switch_to_el1)
+       switch_el x0, 0f, 1f, 0f
+0:     ret
+1:     armv8_switch_to_el1_m x0, x1
+ENDPROC(secondary_switch_to_el1)
+
+       /* Ensure that the literals used by the secondary boot code are
+        * assembled within it (this is required so that we can protect
+        * this area with a single memreserve region)
+        */
+       .ltorg
+
+       /* 64 bit alignment for elements accessed as data */
+       .align 4
+       .globl __secondary_boot_code_size
+       .type __secondary_boot_code_size, %object
+       /* Secondary Boot Code ends here */
+__secondary_boot_code_size:
+       .quad .-secondary_boot_code
 
--- /dev/null
+/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier:    GPL-2.0+
+ */
+
+#include <common.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/arch-fsl-lsch3/immap_lsch3.h>
+#include "mp.h"
+
+DECLARE_GLOBAL_DATA_PTR;
+
+void *get_spin_tbl_addr(void)
+{
+       return &__spin_table;
+}
+
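+/* Physical base of the secondary boot code, i.e. the MP boot page */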
+phys_addr_t determine_mp_bootpg(void)
+{
+       return (phys_addr_t)&secondary_boot_code;
+}
+
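+/*
+ * Release the secondary cores from reset and poll the spin table until
+ * every enabled core has set its STATUS word; returns 0 on success,
+ * 1 on timeout.
+ */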
+int fsl_lsch3_wake_secondary_cores(void)
+{
+       struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
+       struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
+       u32 cores, cpu_up_mask = 1;
+       int i, timeout = 10;
+       u64 *table = get_spin_tbl_addr();
+
+       cores = cpu_mask();
+       /* Clear spin table so that secondary processors
+        * observe the correct value after waking up from wfe.
+        */
+       memset(table, 0, CONFIG_MAX_CPUS*SPIN_TABLE_ELEM_SIZE);
+       flush_dcache_range((unsigned long)table,
+                          (unsigned long)table +
+                          (CONFIG_MAX_CPUS*SPIN_TABLE_ELEM_SIZE));
+
+       printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
+       out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
+       out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
+       out_le32(&gur->scratchrw[6], 1);
+       asm volatile("dsb st" : : : "memory");
+       rst->brrl = cores;
+       asm volatile("dsb st" : : : "memory");
+
+       /* This is needed as a precautionary measure.
+        * If some code before this has accidentally released the secondary
+        * cores then the pre-bootloader code will trap them in a "wfe" unless
+        * the scratchrw[6] is set. In this case we need a sev here to get these
+        * cores moving again.
+        */
+       asm volatile("sev");
+
+       while (timeout--) {
+               flush_dcache_range((unsigned long)table, (unsigned long)table +
+                                  CONFIG_MAX_CPUS * 64);
+               for (i = 1; i < CONFIG_MAX_CPUS; i++) {
+                       if (table[i * WORDS_PER_SPIN_TABLE_ENTRY +
+                                       SPIN_TABLE_ELEM_STATUS_IDX])
+                               cpu_up_mask |= 1 << i;
+               }
+               if (hweight32(cpu_up_mask) == hweight32(cores))
+                       break;
+               udelay(10);
+       }
+       if (timeout <= 0) {
+               printf("Not all cores (0x%x) are up (0x%x)\n",
+                      cores, cpu_up_mask);
+               return 1;
+       }
+       printf("All (%d) cores are up.\n", hweight32(cores));
+
+       return 0;
+}
+
+int is_core_valid(unsigned int core)
+{
+       return !!((1 << core) & cpu_mask());
+}
+
+int cpu_reset(int nr)
+{
+       puts("Feature is not implemented.\n");
+
+       return 0;
+}
+
+int cpu_disable(int nr)
+{
+       puts("Feature is not implemented.\n");
+
+       return 0;
+}
+
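+/*
+ * Translate core number nr into its position among the enabled cores,
+ * which is also its index into the spin table; returns -1 if nr is
+ * out of range.
+ */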
+int core_to_pos(int nr)
+{
+       u32 cores = cpu_mask();
+       int i, count = 0;
+
+       if (nr == 0) {
+               return 0;
+       } else if (nr >= hweight32(cores)) {
+               puts("Not a valid core number.\n");
+               return -1;
+       }
+
+       for (i = 1; i < 32; i++) {
+               if (is_core_valid(i)) {
+                       count++;
+                       if (count == nr)
+                               break;
+               }
+       }
+
+       return count;
+}
+
+int cpu_status(int nr)
+{
+       u64 *table;
+       int pos;
+
+       if (nr == 0) {
+               table = (u64 *)get_spin_tbl_addr();
+               printf("table base @ 0x%p\n", table);
+       } else {
+               pos = core_to_pos(nr);
+               if (pos < 0)
+                       return -1;
+               table = (u64 *)get_spin_tbl_addr() + pos *
+                       WORDS_PER_SPIN_TABLE_ENTRY;
+               printf("table @ 0x%p\n", table);
+               printf("   addr   - 0x%016llx\n",
+                      table[SPIN_TABLE_ELEM_ENTRY_ADDR_IDX]);
+               printf("   status - 0x%016llx\n",
+                      table[SPIN_TABLE_ELEM_STATUS_IDX]);
+               printf("   lpid   - 0x%016llx\n",
+                      table[SPIN_TABLE_ELEM_LPID_IDX]);
+       }
+
+       return 0;
+}
+
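+/*
+ * Write the boot address (argv[0], hex) into core nr's spin table entry
+ * and kick all cores; backs the "cpu release" console command.
+ */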
+int cpu_release(int nr, int argc, char * const argv[])
+{
+       u64 boot_addr;
+       u64 *table = (u64 *)get_spin_tbl_addr();
+       int pos;
+
+       pos = core_to_pos(nr);
+       if (pos <= 0)
+               return -1;
+
+       table += pos * WORDS_PER_SPIN_TABLE_ENTRY;
+       boot_addr = simple_strtoull(argv[0], NULL, 16);
+       table[SPIN_TABLE_ELEM_ENTRY_ADDR_IDX] = boot_addr;
+       flush_dcache_range((unsigned long)table,
+                          (unsigned long)table + SPIN_TABLE_ELEM_SIZE);
+       asm volatile("dsb st");
+       smp_kick_all_cpus();    /* only those with entry addr set will run */
+
+       return 0;
+}
 
--- /dev/null
+/*
+ * Copyright 2014, Freescale Semiconductor
+ *
+ * SPDX-License-Identifier:    GPL-2.0+
+ */
+
+#ifndef _FSL_CH3_MP_H
+#define _FSL_CH3_MP_H
+
+/*
+ * Each spin table element is defined as:
+ *     struct {
+ *             uint64_t entry_addr;
+ *             uint64_t status;
+ *             uint64_t lpid;
+ *     };
+ * We pad this struct to 64 bytes so each entry sits in its own cache line;
+ * the actual spin table is an array of these structures.
+ */
+#define SPIN_TABLE_ELEM_ENTRY_ADDR_IDX 0
+#define SPIN_TABLE_ELEM_STATUS_IDX     1
+#define SPIN_TABLE_ELEM_LPID_IDX       2
+#define WORDS_PER_SPIN_TABLE_ENTRY     8       /* pad to 64 bytes */
+#define SPIN_TABLE_ELEM_SIZE           64
+
+#define id_to_core(x)  (((x) & 3) | ((x) >> 6))
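+/* Example: id 0x101 (cluster 1, core 1) -> (0x101 & 3) | (0x101 >> 6) = 1 | 4 = core 5 */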
+#ifndef __ASSEMBLY__
+extern u64 __spin_table[];
+extern u64 *secondary_boot_code;
+extern size_t __secondary_boot_code_size;
+int fsl_lsch3_wake_secondary_cores(void);
+void *get_spin_tbl_addr(void);
+phys_addr_t determine_mp_bootpg(void);
+void secondary_boot_func(void);
+#endif
+#endif /* _FSL_CH3_MP_H */
 
 ENTRY(armv8_switch_to_el2)
        switch_el x0, 1f, 0f, 0f
 0:     ret
-1:
-       mov     x0, #0x5b1      /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
-       msr     scr_el3, x0
-       msr     cptr_el3, xzr   /* Disable coprocessor traps to EL3 */
-       mov     x0, #0x33ff
-       msr     cptr_el2, x0    /* Disable coprocessor traps to EL2 */
-
-       /* Initialize SCTLR_EL2 */
-       msr     sctlr_el2, xzr
-
-       /* Return to the EL2_SP2 mode from EL3 */
-       mov     x0, sp
-       msr     sp_el2, x0      /* Migrate SP */
-       mrs     x0, vbar_el3
-       msr     vbar_el2, x0    /* Migrate VBAR */
-       mov     x0, #0x3c9
-       msr     spsr_el3, x0    /* EL2_SP2 | D | A | I | F */
-       msr     elr_el3, lr
-       eret
+1:     armv8_switch_to_el2_m x0
 ENDPROC(armv8_switch_to_el2)
 
 ENTRY(armv8_switch_to_el1)
        switch_el x0, 0f, 1f, 0f
 0:     ret
-1:
-       /* Initialize Generic Timers */
-       mrs     x0, cnthctl_el2
-       orr     x0, x0, #0x3            /* Enable EL1 access to timers */
-       msr     cnthctl_el2, x0
-       msr     cntvoff_el2, xzr
-       mrs     x0, cntkctl_el1
-       orr     x0, x0, #0x3            /* Enable EL0 access to timers */
-       msr     cntkctl_el1, x0
-
-       /* Initilize MPID/MPIDR registers */
-       mrs     x0, midr_el1
-       mrs     x1, mpidr_el1
-       msr     vpidr_el2, x0
-       msr     vmpidr_el2, x1
-
-       /* Disable coprocessor traps */
-       mov     x0, #0x33ff
-       msr     cptr_el2, x0            /* Disable coprocessor traps to EL2 */
-       msr     hstr_el2, xzr           /* Disable coprocessor traps to EL2 */
-       mov     x0, #3 << 20
-       msr     cpacr_el1, x0           /* Enable FP/SIMD at EL1 */
-
-       /* Initialize HCR_EL2 */
-       mov     x0, #(1 << 31)          /* 64bit EL1 */
-       orr     x0, x0, #(1 << 29)      /* Disable HVC */
-       msr     hcr_el2, x0
-
-       /* SCTLR_EL1 initialization */
-       mov     x0, #0x0800
-       movk    x0, #0x30d0, lsl #16
-       msr     sctlr_el1, x0
-
-       /* Return to the EL1_SP1 mode from EL2 */
-       mov     x0, sp
-       msr     sp_el1, x0              /* Migrate SP */
-       mrs     x0, vbar_el2
-       msr     vbar_el1, x0            /* Migrate VBAR */
-       mov     x0, #0x3c5
-       msr     spsr_el2, x0            /* EL1_SP1 | D | A | I | F */
-       msr     elr_el2, lr
-       eret
+1:     armv8_switch_to_el1_m x0, x1
 ENDPROC(armv8_switch_to_el1)
 
 #define _ASM_ARMV8_FSL_LSCH3_CONFIG_
 
 #include <fsl_ddrc_version.h>
-
+#define CONFIG_MP
 #define CONFIG_SYS_FSL_OCRAM_BASE      0x18000000      /* initial RAM */
 /* Link Definitions */
 #define CONFIG_SYS_INIT_SP_ADDR                (CONFIG_SYS_FSL_OCRAM_BASE + 0xfff0)
 #define CONFIG_SYS_FSL_DDR3_ADDR               0x08210000
 #define CONFIG_SYS_FSL_GUTS_ADDR               (CONFIG_SYS_IMMR + 0x00E00000)
 #define CONFIG_SYS_FSL_PMU_ADDR                        (CONFIG_SYS_IMMR + 0x00E30000)
+#define CONFIG_SYS_FSL_RST_ADDR                        (CONFIG_SYS_IMMR + 0x00E60000)
 #define CONFIG_SYS_FSL_CH3_CLK_GRPA_ADDR       (CONFIG_SYS_IMMR + 0x00300000)
 #define CONFIG_SYS_FSL_CH3_CLK_GRPB_ADDR       (CONFIG_SYS_IMMR + 0x00310000)
 #define CONFIG_SYS_FSL_CH3_CLK_CTRL_ADDR       (CONFIG_SYS_IMMR + 0x00370000)
 
                u8  res_04[0x20-0x04];
        } clkcncsr[8];
 };
+
+struct ccsr_reset {
+       u32 rstcr;                      /* 0x000 */
+       u32 rstcrsp;                    /* 0x004 */
+       u8 res_008[0x10-0x08];          /* 0x008 */
+       u32 rstrqmr1;                   /* 0x010 */
+       u32 rstrqmr2;                   /* 0x014 */
+       u32 rstrqsr1;                   /* 0x018 */
+       u32 rstrqsr2;                   /* 0x01c */
+       u32 rstrqwdtmrl;                /* 0x020 */
+       u32 rstrqwdtmru;                /* 0x024 */
+       u8 res_028[0x30-0x28];          /* 0x028 */
+       u32 rstrqwdtsrl;                /* 0x030 */
+       u32 rstrqwdtsru;                /* 0x034 */
+       u8 res_038[0x60-0x38];          /* 0x038 */
+       u32 brrl;                       /* 0x060 */
+       u32 brru;                       /* 0x064 */
+       u8 res_068[0x80-0x68];          /* 0x068 */
+       u32 pirset;                     /* 0x080 */
+       u32 pirclr;                     /* 0x084 */
+       u8 res_088[0x90-0x88];          /* 0x088 */
+       u32 brcorenbr;                  /* 0x090 */
+       u8 res_094[0x100-0x94];         /* 0x094 */
+       u32 rcw_reqr;                   /* 0x100 */
+       u32 rcw_completion;             /* 0x104 */
+       u8 res_108[0x110-0x108];        /* 0x108 */
+       u32 pbi_reqr;                   /* 0x110 */
+       u32 pbi_completion;             /* 0x114 */
+       u8 res_118[0xa00-0x118];        /* 0x118 */
+       u32 qmbm_warmrst;               /* 0xa00 */
+       u32 soc_warmrst;                /* 0xa04 */
+       u8 res_a08[0xbf8-0xa08];        /* 0xa08 */
+       u32 ip_rev1;                    /* 0xbf8 */
+       u32 ip_rev2;                    /* 0xbfc */
+};
 #endif /* __ARCH_FSL_LSCH3_IMMAP_H */
 
        cbz     \xreg1, \master_label
 .endm
 
+.macro armv8_switch_to_el2_m, xreg1
+       /* 64bit EL2 | HCE | SMD | RES1 (Bits[5:4]) | Non-secure EL0/EL1 */
+       mov     \xreg1, #0x5b1
+       msr     scr_el3, \xreg1
+       msr     cptr_el3, xzr           /* Disable coprocessor traps to EL3 */
+       mov     \xreg1, #0x33ff
+       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
+
+       /* Initialize SCTLR_EL2
+        *
+        * setting RES1 bits (29,28,23,22,18,16,11,5,4) to 1
+        * and RES0 bits (31,30,27,26,24,21,20,17,15-13,10-6) +
+        * EE,WXN,I,SA,C,A,M to 0
+        */
+       mov     \xreg1, #0x0830
+       movk    \xreg1, #0x30C5, lsl #16
+       msr     sctlr_el2, \xreg1
+
+       /* Return to the EL2_SP2 mode from EL3 */
+       mov     \xreg1, sp
+       msr     sp_el2, \xreg1          /* Migrate SP */
+       mrs     \xreg1, vbar_el3
+       msr     vbar_el2, \xreg1        /* Migrate VBAR */
+       mov     \xreg1, #0x3c9
+       msr     spsr_el3, \xreg1        /* EL2_SP2 | D | A | I | F */
+       msr     elr_el3, lr
+       eret
+.endm
+
+.macro armv8_switch_to_el1_m, xreg1, xreg2
+       /* Initialize Generic Timers */
+       mrs     \xreg1, cnthctl_el2
+       orr     \xreg1, \xreg1, #0x3    /* Enable EL1 access to timers */
+       msr     cnthctl_el2, \xreg1
+       msr     cntvoff_el2, xzr
+
+       /* Initialize MPID/MPIDR registers */
+       mrs     \xreg1, midr_el1
+       mrs     \xreg2, mpidr_el1
+       msr     vpidr_el2, \xreg1
+       msr     vmpidr_el2, \xreg2
+
+       /* Disable coprocessor traps */
+       mov     \xreg1, #0x33ff
+       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
+       msr     hstr_el2, xzr           /* Disable coprocessor traps to EL2 */
+       mov     \xreg1, #3 << 20
+       msr     cpacr_el1, \xreg1       /* Enable FP/SIMD at EL1 */
+
+       /* Initialize HCR_EL2 */
+       mov     \xreg1, #(1 << 31)              /* 64bit EL1 */
+       orr     \xreg1, \xreg1, #(1 << 29)      /* Disable HVC */
+       msr     hcr_el2, \xreg1
+
+       /* SCTLR_EL1 initialization
+        *
+        * setting RES1 bits (29,28,23,22,20,11) to 1
+        * and RES0 bits (31,30,27,21,17,13,10,6) +
+        * UCI,EE,EOE,WXN,nTWE,nTWI,UCT,DZE,I,UMA,SED,ITD,
+        * CP15BEN,SA0,SA,C,A,M to 0
+        */
+       mov     \xreg1, #0x0800
+       movk    \xreg1, #0x30d0, lsl #16
+       msr     sctlr_el1, \xreg1
+
+       /* Return to the EL1_SP1 mode from EL2 */
+       mov     \xreg1, sp
+       msr     sp_el1, \xreg1          /* Migrate SP */
+       mrs     \xreg1, vbar_el2
+       msr     vbar_el1, \xreg1        /* Migrate VBAR */
+       mov     \xreg1, #0x3c5
+       msr     spsr_el2, \xreg1        /* EL1_SP1 | D | A | I | F */
+       msr     elr_el2, lr
+       eret
+.endm
+
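+/*
+ * Park the core with wfi; acknowledge and EOI each interrupt, looping
+ * until one with interrupt ID 0 (an SGI) is received.
+ */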
+#if defined(CONFIG_GICV3)
+.macro gic_wait_for_interrupt_m xreg1
+0:     wfi
+       mrs     \xreg1, ICC_IAR1_EL1
+       msr     ICC_EOIR1_EL1, \xreg1
+       cbnz    \xreg1, 0b
+.endm
+#elif defined(CONFIG_GICV2)
+.macro gic_wait_for_interrupt_m xreg1, wreg2
+0:     wfi
+       ldr     \wreg2, [\xreg1, GICC_AIAR]
+       str     \wreg2, [\xreg1, GICC_AEOIR]
+       and     \wreg2, \wreg2, #0x3ff
+       cbnz    \wreg2, 0b
+.endm
+#endif
+
 #endif /* CONFIG_ARM64 */
 
 #endif /* __ASSEMBLY__ */
 
 #include <asm-offsets.h>
 #include <config.h>
 #include <linux/linkage.h>
-#include <asm/macro.h>
 #include <asm/gic.h>
+#include <asm/macro.h>
 
 
 /*************************************************************************
  *
  *************************************************************************/
 ENTRY(gic_wait_for_interrupt)
-0:     wfi
 #if defined(CONFIG_GICV3)
-       mrs     x9, ICC_IAR1_EL1
-       msr     ICC_EOIR1_EL1, x9
+       gic_wait_for_interrupt_m x9
 #elif defined(CONFIG_GICV2)
-       ldr     w9, [x0, GICC_AIAR]
-       str     w9, [x0, GICC_AEOIR]
+       gic_wait_for_interrupt_m x0, w9
 #endif
-       cbnz    w9, 0b
        ret
 ENDPROC(gic_wait_for_interrupt)