From: Niels Möller
Date: Thu, 30 Jun 2011 08:44:50 +0000 (+0200)
Subject: Added an SSE2 loop, doing four blocks at a time in parallel.
X-Git-Tag: nettle_2.2_release_20110711~18
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3342b197be9058d18928be7a670263f82ef4b331;p=thirdparty%2Fnettle.git

Added an SSE2 loop, doing four blocks at a time in parallel.

Rev: nettle/x86_64/serpent-decrypt.asm:1.3
---

diff --git a/x86_64/serpent-decrypt.asm b/x86_64/serpent-decrypt.asm
index 8cf91418..b2bca664 100644
--- a/x86_64/serpent-decrypt.asm
+++ b/x86_64/serpent-decrypt.asm
@@ -16,7 +16,9 @@ C You should have received a copy of the GNU Lesser General Public License
 C along with the nettle library; see the file COPYING.LIB. If not, write to
 C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 C MA 02111-1307, USA.
-
+
+include_src()
+
 C Register usage:
 
 C Single block serpent state, two copies
@@ -284,6 +286,236 @@ define(<LTI>, <
 	rol <$>19, $1
 >)
 
+define(<PNOT>, <
+	pxor MINUS1, $1
+>)
+
+define(<WSBOX0I>, <
+	movdqa $1, $5
+	pxor $3, $5
+	movdqa $1, $7
+	por $2, $7
+	movdqa $3, $6
+	pxor $4, $6
+	pxor $6, $7
+	pand $3, $6
+	por $2, $3
+	pxor $4, $2
+	por $1, $6
+	pand $3, $2
+	pxor $2, $6
+	por $7, $1
+	pxor $6, $1
+	movdqa $7, $2
+	pand $1, $2
+	PNOT($7)
+	por $7, $4
+	pxor $3, $4
+	movdqa $1, $8
+	pxor $4, $8
+	por $4, $2
+	pxor $2, $5
+>)
+
+define(<WSBOX1I>, <
+	movdqa $2, $6
+	por $4, $6
+	pxor $3, $6
+	movdqa $1, $8
+	pxor $2, $8
+	movdqa $1, $5
+	por $6, $5
+	pand $8, $5
+	pxor $5, $2
+	pxor $6, $8
+	pand $4, $2
+	movdqa $1, $7
+	pand $3, $7
+	por $7, $6
+	por $4, $7
+	pxor $5, $7
+	PNOT($7)
+	pxor $2, $6
+	pxor $6, $5
+	pxor $3, $5
+	por $7, $1
+	pxor $1, $5
+>)
+
+define(<WSBOX2I>, <
+	movdqa $1, $5
+	pxor $4, $5
+	movdqa $3, $7
+	pxor $4, $7
+	movdqa $2, $6
+	por $7, $6
+	pxor $6, $5
+	movdqa $4, $6
+	por $5, $6
+	pand $2, $6
+	PNOT($4)
+	movdqa $1, $8
+	por $3, $8
+	pand $8, $7
+	pxor $7, $6
+	pand $2, $8
+	pand $3, $1
+	por $4, $1
+	pxor $1, $8
+	pand $8, $3
+	pxor $1, $3
+	movdqa $5, $7
+	pxor $6, $7
+	pxor $3, $7
+>)
+
+define(<WSBOX3I>, <
+	movdqa $3, $8
+	por $4, $8
+	movdqa $2, $5
+	pand $8, $5
+	movdqa $1, $7
+	por $4, $7
+	movdqa $3, $6
+	pxor $7, $6
+	pxor $6, $5
+	pxor $1, $4
+	pxor $4, $8
+	pxor $2, $7
+	pand $6, $7
+	pxor $4, $7
+	pxor $1, $6
+	por $5, $4
+	pand $4, $6
+	pxor $2, $6
+	pand $7, $1
+	por $2, $1
+	pxor $1, $8
+>)
+
+define(<WSBOX4I>, <
+	movdqa $3, $6
+	pxor $4, $6
+	movdqa $3, $7
+	por $4, $7
+	pxor $2, $7
+	por $4, $2
+	movdqa $1, $5
+	pxor $7, $5
+	pxor $7, $4
+	pand $1, $7
+	pxor $7, $6
+	pxor $1, $7
+	por $3, $7
+	pand $2, $1
+	movdqa $1, $8
+	pxor $4, $8
+	PNOT($1)
+	por $6, $1
+	pxor $1, $5
+	pxor $2, $1
+	pxor $1, $7
+>)
+
+define(<WSBOX5I>, <
+	movdqa $1, $6
+	pand $4, $6
+	movdqa $3, $8
+	pxor $6, $8
+	movdqa $2, $5
+	pand $8, $5
+	movdqa $1, $7
+	pxor $4, $7
+	pxor $2, $4
+	pxor $7, $5
+	pand $1, $3
+	pand $5, $1
+	por $2, $3
+	pxor $5, $6
+	pxor $3, $6
+	movdqa $5, $7
+	por $6, $7
+	pxor $8, $7
+	pxor $4, $7
+	PNOT($2)
+	por $1, $2
+	pxor $2, $8
+>)
+
+define(<WSBOX6I>, <
+	movdqa $1, $7
+	pxor $3, $7
+	PNOT($3)
+	movdqa $2, $5
+	pxor $4, $5
+	movdqa $1, $6
+	por $3, $6
+	pxor $5, $6
+	movdqa $2, $8
+	pand $7, $8
+	por $4, $8
+	por $3, $4
+	por $2, $3
+	pand $1, $3
+	movdqa $3, $5
+	pxor $8, $5
+	PNOT($5)
+	pand $7, $8
+	pxor $3, $8
+	pxor $6, $1
+	pxor $1, $8
+	pand $5, $2
+	pxor $2, $7
+	pxor $4, $7
+>)
+
+define(<WSBOX7I>, <
+	movdqa $1, $8
+	pand $2, $8
+	movdqa $2, $7
+	pxor $4, $7
+	por $8, $7
+	movdqa $1, $6
+	por $4, $6
+	pand $3, $6
+	pxor $6, $7
+	por $3, $8
+	movdqa $1, $5
+	por $2, $5
+	pand $4, $5
+	pxor $5, $8
+	pxor $2, $5
+	movdqa $4, $6
+	pxor $8, $6
+	PNOT($6)
+	por $5, $6
+	pxor $3, $5
+	pxor $1, $6
+	por $6, $4
+	pxor $4, $5
+>)
+
+define(<WLTI>, <
+	WROL(10, $3)
+	WROL(27, $1)
+	movdqa $2, T0
+	pslld <$>7, T0
+	pxor $4, $3
+	pxor T0, $3
+	pxor $2, $1
+	pxor $4, $1
+	WROL(25, $4)
+	WROL(31, $2)
+	movdqa $1, T0
+	pslld <$>3, T0
+	pxor $3, $4
+	pxor T0, $4
+	pxor $1, $2
+	pxor $3, $2
+	WROL(29, $3)
+	WROL(19, $1)
+>)
+
 	.file "serpent-decrypt.asm"
 
 	C serpent_decrypt(struct serpent_context *ctx,
@@ -304,6 +536,79 @@ PROLOGUE(nettle_serpent_decrypt)
 	neg N
 	jz .Lend
 
+	cmp $-64, N
+	ja .Lblock_loop
+
+	pcmpeqd MINUS1, MINUS1
+
+.Lwblock_loop:
+	movups (SRC, N), X0
+	movups 16(SRC, N), X1
+	movups 32(SRC, N), X2
+	movups 48(SRC, N), X3
+
+	WTRANSPOSE(X0,X1,X2,X3)
+
+	mov $384, CNT
+
+	C FIXME: CNT known, no index register needed
+	WKEYXOR(128, X0,X1,X2,X3)
+
+	jmp .Lwround_start
+
+	ALIGN(4)
+
+.Lwround_loop:
+	WLTI(X0,X1,X2,X3)
+.Lwround_start:
+	WSBOX7I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+	WKEYXOR(112, Y0,Y1,Y2,Y3)
+
+	WLTI(Y0,Y1,Y2,Y3)
+	WSBOX6I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+	WKEYXOR(96, X0,X1,X2,X3)
+
+	WLTI(X0,X1,X2,X3)
+	WSBOX5I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+	WKEYXOR(80, Y0,Y1,Y2,Y3)
+
+	WLTI(Y0,Y1,Y2,Y3)
+	WSBOX4I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+	WKEYXOR(64, X0,X1,X2,X3)
+
+	WLTI(X0,X1,X2,X3)
+	WSBOX3I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+	WKEYXOR(48, Y0,Y1,Y2,Y3)
+
+	WLTI(Y0,Y1,Y2,Y3)
+	WSBOX2I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+	WKEYXOR(32, X0,X1,X2,X3)
+
+	WLTI(X0,X1,X2,X3)
+	WSBOX1I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+	WKEYXOR(16, Y0,Y1,Y2,Y3)
+
+	WLTI(Y0,Y1,Y2,Y3)
+	WSBOX0I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+	WKEYXOR(, X0,X1,X2,X3)
+
+	sub $128, CNT
+	jnc .Lwround_loop
+
+	WTRANSPOSE(X0,X1,X2,X3)
+
+	movups X0, (DST, N)
+	movups X1, 16(DST, N)
+	movups X2, 32(DST, N)
+	movups X3, 48(DST, N)
+
+	C FIXME: Adjust N, so we can use just jnc without an extra cmp.
+	add $64, N
+	jz .Lend
+
+	cmp $-64, N
+	jbe .Lwblock_loop
+
 .Lblock_loop:
 	movl (SRC, N), x0
 	movl 4(SRC, N), x1
@@ -363,7 +668,6 @@ PROLOGUE(nettle_serpent_decrypt)
 	xor 40(CTX, CNT), x2
 	xor 44(CTX, CNT), x3
 
-	C FIXME: Goes wrong in this round.
 	LTI(x0,x1,x2,x3)
 	SBOX1I(x0,x1,x2,x3, y0,y1,y2,y3)
 	xor 16(CTX, CNT), y0