git.ipfire.org Git - thirdparty/nettle.git/commitdiff
armv7: Optimized aligned case of memxor, using 3-way unrolling.
author Niels Möller <nisse@lysator.liu.se>
Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Tue, 12 Feb 2013 14:57:37 +0000 (15:57 +0100)
ChangeLog
armv7/memxor.asm

index 90f03b7d901487dd9c0947d4347a501fa845dd1c..e1a0d6f5329bec8c45a95361d3674a4f60b6bef6 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-02-12  Niels Möller  <nisse@lysator.liu.se>
+
+       * armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
+       unrolling.
+
 2013-02-06  Niels Möller  <nisse@lysator.liu.se>
 
        * armv7/memxor.asm (memxor, memxor3): Optimized aligned case, now
index 52d4bf46c71f3f08af35d7679599a17bcaa2921d..94b8f532d1283257c1c637625e1b5828bbcbe59e 100644 (file)
@@ -18,6 +18,12 @@ C along with the nettle library; see the file COPYING.LIB.  If not, write to
 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 C MA 02111-1301, USA.
 
+C Possible speedups:
+C
+C The ldm instruction can load two registers per cycle,
+C if the address is two-word aligned. Or three registers in two
+C cycles, regardless of alignment.
+
 C Register usage:
 
 define(<DST>, <r0>)
@@ -131,38 +137,49 @@ PROLOGUE(memxor)
        b       .Lmemxor_bytes
 
 .Lmemxor_same:
-       tst     N, #4
-       it      ne
-       subne   N, #4
-       bne     .Lmemxor_same_loop
-
-       ldr     r3, [SRC], #+4
-       ldr     r4, [DST]
-       eor     r3, r4
-       str     r3, [DST], #+4
-       
        subs    N, #8
        bcc     .Lmemxor_same_end
 
 .Lmemxor_same_loop:
-       C 6 cycles per iteration, 0.75 cycles/byte
-       ldr     r4, [SRC, #+4]
-       ldr     r3, [SRC], #+8
-       ldr     r6, [DST, #+4]
-       ldr     r5, [DST]
-       
-       eor     r4, r6
-       eor     r3, r5
-       subs    N, #8
-       
-       str     r4, [DST, #+4]
-       str     r3, [DST], #+8
+       C 8 cycles per iteration, 0.67 cycles/byte
+       ldmia   SRC!, {r3, r4, r5}
+       ldmia   DST, {r6, r7, r12}
+       subs    N, #12
+       eor     r3, r6
+       eor     r4, r7
+       eor     r5, r12
+       stmia   DST!, {r3, r4, r5}
        bcs     .Lmemxor_same_loop
        
 .Lmemxor_same_end:
-       adds    N, #8
+       C We have 0-11 bytes left to do, and N holds the number of bytes left minus 12.
+       adds    N, #4
+       bcc     .Lmemxor_same_lt_8
+       C Do 8 bytes more, leftover is in N
+       ldmia   SRC!, {r3, r4}
+       ldmia   DST, {r6, r7}
+       eor     r3, r6
+       eor     r4, r7
+       stmia   DST!, {r3, r4}
+       beq     .Lmemxor_done
+       b       .Lmemxor_bytes
+
+.Lmemxor_same_lt_8:
+       adds    N, #4
+       bcc     .Lmemxor_same_lt_4
+
+       ldr     r3, [SRC], #+4
+       ldr     r4, [DST]
+       eor     r3, r4
+       str     r3, [DST], #+4
        beq     .Lmemxor_done
        b       .Lmemxor_bytes
+
+.Lmemxor_same_lt_4:
+       adds    N, #4
+       beq     .Lmemxor_done
+       b       .Lmemxor_bytes
+       
 EPILOGUE(memxor)
 
 define(<DST>, <r0>)