armv7: Optimized aligned case of memxor, using 3-way unrolling.

author Niels Möller <nisse@lysator.liu.se>

Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
author Niels Möller <nisse@lysator.liu.se>
Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
diff --git a/ChangeLog b/ChangeLog

index 90f03b7d901487dd9c0947d4347a501fa845dd1c..e1a0d6f5329bec8c45a95361d3674a4f60b6bef6 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-02-12  Niels Möller  <nisse@lysator.liu.se>
+
+       * armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
+       unrolling.
+
  2013-02-06  Niels Möller  <nisse@lysator.liu.se>
  
         * armv7/memxor.asm (memxor, memxor3): Optimized aligned case, now
diff --git a/armv7/memxor.asm b/armv7/memxor.asm

index 52d4bf46c71f3f08af35d7679599a17bcaa2921d..94b8f532d1283257c1c637625e1b5828bbcbe59e 100644 (file)
--- a/armv7/memxor.asm
+++ b/armv7/memxor.asm
@@ -18,6 +18,12 @@ C along with the nettle library; see the file COPYING.LIB.  If not, write to
  C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  C MA 02111-1301, USA.
  
+C Possible speedups:
+C
+C The ldm instruction can load two registers per cycle
+C if the address is two-word aligned, or three registers in two
+C cycles regardless of alignment.
+
  C Register usage:
  
  define(<DST>, <r0>)
@@ -131,38 +137,49 @@ PROLOGUE(memxor)
         b       .Lmemxor_bytes
  
  .Lmemxor_same:
-       tst     N, #4
-       it      ne
-       subne   N, #4
-       bne     .Lmemxor_same_loop
-
-       ldr     r3, [SRC], #+4
-       ldr     r4, [DST]
-       eor     r3, r4
-       str     r3, [DST], #+4
-       
         subs    N, #8
         bcc     .Lmemxor_same_end
  
  .Lmemxor_same_loop:
-       C 6 cycles per iteration, 0.75 cycles/byte
-       ldr     r4, [SRC, #+4]
-       ldr     r3, [SRC], #+8
-       ldr     r6, [DST, #+4]
-       ldr     r5, [DST]
-       
-       eor     r4, r6
-       eor     r3, r5
-       subs    N, #8
-       
-       str     r4, [DST, #+4]
-       str     r3, [DST], #+8
+       C 8 cycles per iteration, 0.67 cycles/byte
+       ldmia   SRC!, {r3, r4, r5}
+       ldmia   DST, {r6, r7, r12}
+       subs    N, #12
+       eor     r3, r6
+       eor     r4, r7
+       eor     r5, r12
+       stmia   DST!, {r3, r4, r5}
         bcs     .Lmemxor_same_loop
         
  .Lmemxor_same_end:
-       adds    N, #8
+       C We have 0-11 bytes left to do, and N holds number of bytes -12.
+       adds    N, #4
+       bcc     .Lmemxor_same_lt_8
+       C Do 8 bytes more, leftover is in N
+       ldmia   SRC!, {r3, r4}
+       ldmia   DST, {r6, r7}
+       eor     r3, r6
+       eor     r4, r7
+       stmia   DST!, {r3, r4}
+       beq     .Lmemxor_done
+       b       .Lmemxor_bytes
+
+.Lmemxor_same_lt_8:
+       adds    N, #4
+       bcc     .Lmemxor_same_lt_4
+
+       ldr     r3, [SRC], #+4
+       ldr     r4, [DST]
+       eor     r3, r4
+       str     r3, [DST], #+4
         beq     .Lmemxor_done
         b       .Lmemxor_bytes
+
+.Lmemxor_same_lt_4:
+       adds    N, #4
+       beq     .Lmemxor_done
+       b       .Lmemxor_bytes
+       
  EPILOGUE(memxor)
  
  define(<DST>, <r0>)
author	Niels Möller <nisse@lysator.liu.se>
	Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
ChangeLog		patch \| blob \| blame \| history
armv7/memxor.asm		patch \| blob \| blame \| history