--- /dev/null
+From foo@baz Thu Aug 13 14:26:49 PDT 2015
+From: "David S. Miller" <davem@davemloft.net>
+Date: Tue, 14 Oct 2014 19:37:58 -0700
+Subject: sparc64: Fix FPU register corruption with AES crypto offload.
+
+From: "David S. Miller" <davem@davemloft.net>
+
+[ Upstream commit f4da3628dc7c32a59d1fb7116bb042e6f436d611 ]
+
+The AES loops in arch/sparc/crypto/aes_glue.c use a scheme where the
+key material is preloaded into the FPU registers, and then we loop
+over and over doing the crypt operation, reusing those pre-cooked key
+registers.
+
+There are intervening blkcipher*() calls between the crypt operation
+calls, and those might perform memcpy() and thus also try to use the
+FPU.
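+
+The pattern in question looks roughly like this (a condensed sketch of
+the ecb_encrypt() path in aes_glue.c, with setup and declarations
+elided; written from memory, not a verbatim quote):
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);  /* keys -> FPU regs */
+	while ((nbytes = walk.nbytes)) {
+		/* reuses the pre-loaded key registers */
+		ctx->ops->ecb_encrypt(&ctx->key[0],
+				      (const u64 *)walk.src.virt.addr,
+				      (u64 *)walk.dst.virt.addr,
+				      nbytes & AES_BLOCK_MASK);
+		/* may memcpy() bounce buffers, and thus use the FPU */
+		err = blkcipher_walk_done(desc, &walk,
+					  nbytes & (AES_BLOCK_SIZE - 1));
+	}
+	fprs_write(0);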
+
+The sparc64 kernel FPU usage mechanism is designed to allow such
+recursive uses, but with a catch.
+
+There has to be a trap between the two FPU-using threads of control.
+
+The mechanism works like this: when the FPU is already in use by the
+kernel, a slot for saving the FPU state is allocated at trap time.
+If, within the trap handler, we then try to use the FPU registers,
+the pre-trap FPU register state is saved into that slot, and at trap
+return time we notice this and restore the pre-trap FPU state.
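+
+For example (an illustrative trace of the intended recursion, not
+taken from the original commit):
+
+	kernel FPU use begins (e.g. AES keys loaded)
+	  ETRAP                  <- save slot allocated
+	    kernel FPU use       <- pre-trap FPU state saved into slot
+	  RTRAP                  <- slot noticed, pre-trap state restored
+
+In the AES loop above there is no trap between the key preload and the
+memcpy() performed inside the blkcipher*() calls, so no save slot
+exists and the pre-loaded key registers are silently clobbered.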
+
+Over the long term there are various more involved ways we can make
+this work, but for a quick fix let's take advantage of the fact that
+the situation where this happens is very limited.
+
+All sparc64 chips that support the crypto instructions also use the
+Niagara4 memcpy routine, and that routine only uses the FPU for
+large copies where we can't get the source aligned properly to a
+multiple of 8 bytes.
+
+We look to see if the FPU is already in use in this context, and if so
+we use the non-large copy path which only uses integer registers.
+
+Furthermore, we also limit this special logic to when we are doing a
+kernel copy, rather than a user copy.
+
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sparc/include/asm/visasm.h | 8 ++++++++
+ arch/sparc/lib/NG4memcpy.S | 14 +++++++++++++-
+ 2 files changed, 21 insertions(+), 1 deletion(-)
+
+--- a/arch/sparc/include/asm/visasm.h
++++ b/arch/sparc/include/asm/visasm.h
+@@ -39,6 +39,14 @@
+ 297: wr %o5, FPRS_FEF, %fprs; \
+ 298:
+
++#define VISEntryHalfFast(fail_label) \
++ rd %fprs, %o5; \
++ andcc %o5, FPRS_FEF, %g0; \
++ be,pt %icc, 297f; \
++ nop; \
++ ba,a,pt %xcc, fail_label; \
++297: wr %o5, FPRS_FEF, %fprs;
++
+ #define VISExitHalf \
+ wr %o5, 0, %fprs;
+
+--- a/arch/sparc/lib/NG4memcpy.S
++++ b/arch/sparc/lib/NG4memcpy.S
+@@ -41,6 +41,10 @@
+ #endif
+ #endif
+
++#if !defined(EX_LD) && !defined(EX_ST)
++#define NON_USER_COPY
++#endif
++
+ #ifndef EX_LD
+ #define EX_LD(x) x
+ #endif
+@@ -197,9 +201,13 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
+ mov EX_RETVAL(%o3), %o0
+
+ .Llarge_src_unaligned:
++#ifdef NON_USER_COPY
++ VISEntryHalfFast(.Lmedium_vis_entry_fail)
++#else
++ VISEntryHalf
++#endif
+ andn %o2, 0x3f, %o4
+ sub %o2, %o4, %o2
+- VISEntryHalf
+ alignaddr %o1, %g0, %g1
+ add %o1, %o4, %o1
+ EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
+@@ -240,6 +248,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
+ nop
+ ba,a,pt %icc, .Lmedium_unaligned
+
++#ifdef NON_USER_COPY
++.Lmedium_vis_entry_fail:
++ or %o0, %o1, %g2
++#endif
+ .Lmedium:
+ LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
+ andcc %g2, 0x7, %g0
--- /dev/null
+From foo@baz Thu Aug 13 14:26:49 PDT 2015
+From: "David S. Miller" <davem@davemloft.net>
+Date: Thu, 6 Aug 2015 19:13:25 -0700
+Subject: sparc64: Fix userspace FPU register corruptions.
+
+From: "David S. Miller" <davem@davemloft.net>
+
+[ Upstream commit 44922150d87cef616fd183220d43d8fde4d41390 ]
+
+If we have a series of events from userspace, with %fprs=FPRS_FEF,
+like follows:
+
+ETRAP
+ ETRAP
+ VIS_ENTRY(fprs=0x4)
+ VIS_EXIT
+ RTRAP (kernel FPU restore with fpu_saved=0x4)
+ RTRAP
+
+We will not restore the user registers that were clobbered by the
+FPU-using kernel code in the innermost trap.
+
+Traps allocate FPU save slots in the thread struct, and FPU-using
+sequences save the "dirty" FPU registers only.
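+
+Roughly, each trap level has its own save slot in the thread struct,
+and only the halves marked dirty in %fprs get written there.  A
+simplified model (with hypothetical helper names, not the real
+VISsave.S code):
+
+	slot = ti->fpdepth >> 1;
+	if (fprs & FPRS_DL)	/* lower half, %f0-%f31, dirty */
+		save_fp_lower(ti, slot);
+	if (fprs & FPRS_DU)	/* upper half, %f32-%f63, dirty */
+		save_fp_upper(ti, slot);
+	ti->fpsaved[slot] = fprs & (FPRS_DL | FPRS_DU | FPRS_FEF);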
+
+This works at the initial trap level because all of the registers
+get recorded into the top-level FPU save area, and we'll return
+to userspace with the FPU disabled so that any FPU use by the user
+will take an FPU-disabled trap wherein we'll load the registers
+back up properly.
+
+But this is not how trap returns from kernel to kernel operate.
+
+The simplest fix for this bug is to always save all FPU register state
+for anything other than the top-most FPU save area.
+
+Getting rid of the optimized inner-slot FPU saving code ends up
+making VISEntryHalf degenerate into plain VISEntry.
+
+Longer term we need to do something smarter to reinstate the partial
+save optimizations. Perhaps the fundamental error is having trap entry
+and exit allocate FPU save slots and restore register state. Instead,
+the VISEntry et al. calls should be doing that work.
+
+This bug is about two decades old.
+
+Reported-by: James Y Knight <jyknight@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sparc/include/asm/visasm.h | 16 ++-------
+ arch/sparc/lib/NG4memcpy.S | 5 ++
+ arch/sparc/lib/VISsave.S | 67 +---------------------------------------
+ arch/sparc/lib/ksyms.c | 4 --
+ 4 files changed, 11 insertions(+), 81 deletions(-)
+
+--- a/arch/sparc/include/asm/visasm.h
++++ b/arch/sparc/include/asm/visasm.h
+@@ -28,16 +28,10 @@
+ * Must preserve %o5 between VISEntryHalf and VISExitHalf */
+
+ #define VISEntryHalf \
+- rd %fprs, %o5; \
+- andcc %o5, FPRS_FEF, %g0; \
+- be,pt %icc, 297f; \
+- sethi %hi(298f), %g7; \
+- sethi %hi(VISenterhalf), %g1; \
+- jmpl %g1 + %lo(VISenterhalf), %g0; \
+- or %g7, %lo(298f), %g7; \
+- clr %o5; \
+-297: wr %o5, FPRS_FEF, %fprs; \
+-298:
++ VISEntry
++
++#define VISExitHalf \
++ VISExit
+
+ #define VISEntryHalfFast(fail_label) \
+ rd %fprs, %o5; \
+@@ -47,7 +41,7 @@
+ ba,a,pt %xcc, fail_label; \
+ 297: wr %o5, FPRS_FEF, %fprs;
+
+-#define VISExitHalf \
++#define VISExitHalfFast \
+ wr %o5, 0, %fprs;
+
+ #ifndef __ASSEMBLY__
+--- a/arch/sparc/lib/NG4memcpy.S
++++ b/arch/sparc/lib/NG4memcpy.S
+@@ -240,8 +240,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
+ add %o0, 0x40, %o0
+ bne,pt %icc, 1b
+ LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
++#ifdef NON_USER_COPY
++ VISExitHalfFast
++#else
+ VISExitHalf
+-
++#endif
+ brz,pn %o2, .Lexit
+ cmp %o2, 19
+ ble,pn %icc, .Lsmall_unaligned
+--- a/arch/sparc/lib/VISsave.S
++++ b/arch/sparc/lib/VISsave.S
+@@ -44,9 +44,8 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
+
+ stx %g3, [%g6 + TI_GSR]
+ 2: add %g6, %g1, %g3
+- cmp %o5, FPRS_DU
+- be,pn %icc, 6f
+- sll %g1, 3, %g1
++ mov FPRS_DU | FPRS_DL | FPRS_FEF, %o5
++ sll %g1, 3, %g1
+ stb %o5, [%g3 + TI_FPSAVED]
+ rd %gsr, %g2
+ add %g6, %g1, %g3
+@@ -80,65 +79,3 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
+ .align 32
+ 80: jmpl %g7 + %g0, %g0
+ nop
+-
+-6: ldub [%g3 + TI_FPSAVED], %o5
+- or %o5, FPRS_DU, %o5
+- add %g6, TI_FPREGS+0x80, %g2
+- stb %o5, [%g3 + TI_FPSAVED]
+-
+- sll %g1, 5, %g1
+- add %g6, TI_FPREGS+0xc0, %g3
+- wr %g0, FPRS_FEF, %fprs
+- membar #Sync
+- stda %f32, [%g2 + %g1] ASI_BLK_P
+- stda %f48, [%g3 + %g1] ASI_BLK_P
+- membar #Sync
+- ba,pt %xcc, 80f
+- nop
+-
+- .align 32
+-80: jmpl %g7 + %g0, %g0
+- nop
+-
+- .align 32
+-VISenterhalf:
+- ldub [%g6 + TI_FPDEPTH], %g1
+- brnz,a,pn %g1, 1f
+- cmp %g1, 1
+- stb %g0, [%g6 + TI_FPSAVED]
+- stx %fsr, [%g6 + TI_XFSR]
+- clr %o5
+- jmpl %g7 + %g0, %g0
+- wr %g0, FPRS_FEF, %fprs
+-
+-1: bne,pn %icc, 2f
+- srl %g1, 1, %g1
+- ba,pt %xcc, vis1
+- sub %g7, 8, %g7
+-2: addcc %g6, %g1, %g3
+- sll %g1, 3, %g1
+- andn %o5, FPRS_DU, %g2
+- stb %g2, [%g3 + TI_FPSAVED]
+-
+- rd %gsr, %g2
+- add %g6, %g1, %g3
+- stx %g2, [%g3 + TI_GSR]
+- add %g6, %g1, %g2
+- stx %fsr, [%g2 + TI_XFSR]
+- sll %g1, 5, %g1
+-3: andcc %o5, FPRS_DL, %g0
+- be,pn %icc, 4f
+- add %g6, TI_FPREGS, %g2
+-
+- add %g6, TI_FPREGS+0x40, %g3
+- membar #Sync
+- stda %f0, [%g2 + %g1] ASI_BLK_P
+- stda %f16, [%g3 + %g1] ASI_BLK_P
+- membar #Sync
+- ba,pt %xcc, 4f
+- nop
+-
+- .align 32
+-4: and %o5, FPRS_DU, %o5
+- jmpl %g7 + %g0, %g0
+- wr %o5, FPRS_FEF, %fprs
+--- a/arch/sparc/lib/ksyms.c
++++ b/arch/sparc/lib/ksyms.c
+@@ -126,10 +126,6 @@ EXPORT_SYMBOL(copy_user_page);
+ void VISenter(void);
+ EXPORT_SYMBOL(VISenter);
+
+-/* CRYPTO code needs this */
+-void VISenterhalf(void);
+-EXPORT_SYMBOL(VISenterhalf);
+-
+ extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
+ extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *);