From: Niels Möller Date: Tue, 12 Feb 2013 14:57:37 +0000 (+0100) Subject: armv7: Optimized aligned case of memxor, using 3-way unrolling. X-Git-Tag: nettle_2.7_release_20130424~113 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=39c037437d8807e2660baabe766fa391fa762a52;p=thirdparty%2Fnettle.git armv7: Optimized aligned case of memxor, using 3-way unrolling. --- diff --git a/ChangeLog b/ChangeLog index 90f03b7d..e1a0d6f5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2013-02-12 Niels Möller + + * armv7/memxor.asm (memxor): Optimized aligned case, using 3-way + unrolling. + 2013-02-06 Niels Möller * armv7/memxor.asm (memxor, memxor3): Optimized aligned case, now diff --git a/armv7/memxor.asm b/armv7/memxor.asm index 52d4bf46..94b8f532 100644 --- a/armv7/memxor.asm +++ b/armv7/memxor.asm @@ -18,6 +18,12 @@ C along with the nettle library; see the file COPYING.LIB. If not, write to C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, C MA 02111-1301, USA. +C Possible speedups: +C +C The ldm instruction can load two registers per cycle, +C if the address is two-word aligned. Or three registers in two +C cycles, regardless of alignment. 
+ C Register usage: define(, ) @@ -131,38 +137,49 @@ PROLOGUE(memxor) b .Lmemxor_bytes .Lmemxor_same: - tst N, #4 - it ne - subne N, #4 - bne .Lmemxor_same_loop - - ldr r3, [SRC], #+4 - ldr r4, [DST] - eor r3, r4 - str r3, [DST], #+4 - subs N, #8 bcc .Lmemxor_same_end .Lmemxor_same_loop: - C 6 cycles per iteration, 0.75 cycles/byte - ldr r4, [SRC, #+4] - ldr r3, [SRC], #+8 - ldr r6, [DST, #+4] - ldr r5, [DST] - - eor r4, r6 - eor r3, r5 - subs N, #8 - - str r4, [DST, #+4] - str r3, [DST], #+8 + C 8 cycles per iteration, 0.67 cycles/byte + ldmia SRC!, {r3, r4, r5} + ldmia DST, {r6, r7, r12} + subs N, #12 + eor r3, r6 + eor r4, r7 + eor r5, r12 + stmia DST!, {r3, r4, r5} bcs .Lmemxor_same_loop .Lmemxor_same_end: - adds N, #8 + C We have 0-11 bytes left to do, and N holds number of bytes -12. + adds N, #4 + bcc .Lmemxor_same_lt_8 + C Do 8 bytes more, leftover is in N + ldmia SRC!, {r3, r4} + ldmia DST, {r6, r7} + eor r3, r6 + eor r4, r7 + stmia DST!, {r3, r4} + beq .Lmemxor_done + b .Lmemxor_bytes + +.Lmemxor_same_lt_8: + adds N, #4 + bcc .Lmemxor_same_lt_4 + + ldr r3, [SRC], #+4 + ldr r4, [DST] + eor r3, r4 + str r3, [DST], #+4 beq .Lmemxor_done b .Lmemxor_bytes + +.Lmemxor_same_lt_4: + adds N, #4 + beq .Lmemxor_done + b .Lmemxor_bytes + EPILOGUE(memxor) define(, )