Optimized ARM memxor.

author Niels Möller <nisse@lysator.liu.se>

Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)
author Niels Möller <nisse@lysator.liu.se>
Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)
diff --git a/ChangeLog b/ChangeLog

index e1a0d6f5329bec8c45a95361d3674a4f60b6bef6..00100dc193a9547394b70b9a7b0b860cb53d77c7 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2013-02-19  Niels Möller  <nisse@lysator.liu.se>
+
+       * armv7/memxor.asm (memxor): Software pipelining for the aligned case.
+       (memxor3): Use 3-way unrolling also for aligned memxor3.
+       Both loops benchmarked at 7 cycles (0.58 cycles per byte), but
+       memxor3 seems to have a strange dependency on instruction
+       alignment.
+
  2013-02-12  Niels Möller  <nisse@lysator.liu.se>
  
         * armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
diff --git a/armv7/memxor.asm b/armv7/memxor.asm

index 94b8f532d1283257c1c637625e1b5828bbcbe59e..929fffcda6bb1cb6cea83abf4a9f09d7862886a7 100644 (file)
--- a/armv7/memxor.asm
+++ b/armv7/memxor.asm
@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
         .arm
  
         C memxor(uint8_t *dst, const uint8_t *src, size_t n)
-       .align 2
+       .align 4
  PROLOGUE(memxor)
         cmp     N, #0
         beq     .Lmemxor_ret
  
-       push    {r4, r5, r6, r7}
+       C FIXME: Delay push until we know how many registers we need.
+       push    {r4,r5,r6,r7,r8,r10,r11,r14}    C lr is the link register
  
         cmp     N, #7
         bcs     .Lmemxor_large
@@ -60,7 +61,7 @@ PROLOGUE(memxor)
         bne     .Lmemxor_bytes
  
  .Lmemxor_done:
-       pop     {r4,r5,r6,r7}
+       pop     {r4,r5,r6,r7,r8,r10,r11,r14}
  .Lmemxor_ret:
         bx      lr
  
@@ -140,16 +141,42 @@ PROLOGUE(memxor)
         subs    N, #8
         bcc     .Lmemxor_same_end
  
+       ldmia   SRC!, {r3, r4, r5}
+       C Keep address for loads in r14
+       mov     r14, DST
+       ldmia   r14!, {r6, r7, r8}
+       subs    N, #12
+       eor     r10, r3, r6
+       eor     r11, r4, r7
+       eor     r12, r5, r8
+       bcc     .Lmemxor_same_final_store
+       subs    N, #12
+       ldmia   r14!, {r6, r7, r8}
+       bcc     .Lmemxor_same_wind_down
+
+       C 7 cycles per iteration, 0.58 cycles/byte
+       C Loopmixer could perhaps get it down to 6 cycles.
  .Lmemxor_same_loop:
-       C 8 cycles per iteration, 0.67 cycles/byte
+       C r10-r12 contains values to be stored at DST
+       C r6-r8 contains values read from r14, in advance
         ldmia   SRC!, {r3, r4, r5}
-       ldmia   DST, {r6, r7, r12}
         subs    N, #12
-       eor     r3, r6
-       eor     r4, r7
-       eor     r5, r12
-       stmia   DST!, {r3, r4, r5}
+       stmia   DST!, {r10, r11, r12}
+       eor     r10, r3, r6
+       eor     r11, r4, r7
+       eor     r12, r5, r8
+       ldmia   r14!, {r6, r7, r8}
         bcs     .Lmemxor_same_loop
+
+.Lmemxor_same_wind_down:
+       C Wind down code
+       ldmia   SRC!, {r3, r4, r5}
+       stmia   DST!, {r10, r11, r12}
+       eor     r10, r3, r6
+       eor     r11, r4, r7
+       eor     r12, r5, r8
+.Lmemxor_same_final_store:
+       stmia   DST!, {r10, r11, r12}
         
  .Lmemxor_same_end:
         C We have 0-11 bytes left to do, and N holds number of bytes -12.
@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
         bne     .Lmemxor3_au ;
  
         C a, b and dst all have the same alignment.
-       sub     AP, #4
-       sub     BP, #4
-       sub     DST, #4
-       tst     N, #4
-       it      ne
-       subne   N, #4
-       bne     .Lmemxor3_aligned_word_loop
-
-       ldr     r4, [AP], #-4
-       ldr     r5, [BP], #-4
-       eor     r4, r5
-       str     r4, [DST], #-4
         subs    N, #8
         bcc     .Lmemxor3_aligned_word_end
+
+       C This loop runs at 7 cycles per iteration, but it seems to
+       C have a strange alignment requirement. For this speed, the
+       C loop started at offset 0x2ac in the object file, and all
+       C other offsets made it slower.
         
  .Lmemxor3_aligned_word_loop:
-       ldr     r4, [AP, #-4]
-       ldr     r5, [AP], #-8
-       ldr     r6, [BP, #-4]
-       ldr     r7, [BP], #-8
+       ldmdb   AP!, {r4,r5,r6}
+       ldmdb   BP!, {r7,r8,r10}
+       subs    N, #12
+       eor     r4, r7
+       eor     r5, r8
+       eor     r6, r10
+       stmdb   DST!, {r4, r5,r6}
+       bcs     .Lmemxor3_aligned_word_loop
  
+.Lmemxor3_aligned_word_end:
+       C We have 0-11 bytes left to do, and N holds number of bytes -12.
+       adds    N, #4
+       bcc     .Lmemxor3_aligned_lt_8
+       C Do 8 bytes more, leftover is in N
+       ldmdb   AP!, {r4, r5}
+       ldmdb   BP!, {r6, r7}
         eor     r4, r6
         eor     r5, r7
-       subs    N, #8
-       str     r4, [DST, #-4]
-       str     r5, [DST], #-8
+       stmdb   DST!, {r4,r5}
+       beq     .Lmemxor3_done
+       b       .Lmemxor3_bytes
  
-       bcs     .Lmemxor3_aligned_word_loop
-.Lmemxor3_aligned_word_end:
-       adds    N, #8
+.Lmemxor3_aligned_lt_8:
+       adds    N, #4
+       bcc     .Lmemxor3_aligned_lt_4
+
+       ldr     r4, [AP,#-4]!
+       ldr     r5, [BP,#-4]!
+       eor     r4, r5
+       str     r4, [DST,#-4]!
+       beq     .Lmemxor3_done
+       b       .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+       adds    N, #4   
         beq     .Lmemxor3_done
-       add     AP, #4
-       add     BP, #4
-       add     DST, #4
         b       .Lmemxor3_bytes
  
  .Lmemxor3_uu:
author	Niels Möller <nisse@lysator.liu.se>
	Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Tue, 19 Feb 2013 12:09:55 +0000 (13:09 +0100)
ChangeLog		patch \| blob \| blame \| history
armv7/memxor.asm		patch \| blob \| blame \| history