From dd8652d4b27173f2adefa77922b5fd3e18ddc8ef Mon Sep 17 00:00:00 2001
From: =?utf8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Mon, 3 Oct 2011 09:43:08 +0200
Subject: [PATCH] Implemented sse2-loop. Configured at compile time, and
 currently disabled.

Rev: nettle/x86_64/memxor.asm:1.3
---
 x86_64/memxor.asm | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index d9b05b18..7a5a23b6 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -28,7 +28,9 @@ define(<TMP2>, <%r9>)
 define(<CNT>, <%rdi>)
 define(<S0>, <%r11>)
 define(<S1>, <%rdi>)	C Overlaps with CNT
-
+
+define(<USE_SSE2>, <no>)
+
 	.file "memxor.asm"
 
 	.text
@@ -78,6 +80,10 @@ PROLOGUE(memxor3)
 	jnz	.Lalign_loop
 
 .Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
 	C Check for the case that AP and BP have the same alignment,
 	C but different from DST.
 	mov	AP, TMP
@@ -209,4 +215,40 @@ C 	jz	.Ldone
 
 .Ldone:
 	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(4)
+.Lsse2_loop:
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	ret
+>)
+
+
 EPILOGUE(memxor3)
-- 
2.47.2