From c851f481cd16b1bd0442c8baa9c4ed12ceb86de3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Mon, 24 Feb 2014 21:49:58 -0800
Subject: [PATCH] rdrand_asm.S: On x86-64 we have enough registers, avoid
 repeated loads

On x86-64 there are enough registers that there really is no point in
using a repeated memory operand for the key material. Load it into a
register instead; hopefully it will be slightly faster.

Signed-off-by: H. Peter Anvin
---
 rdrand_asm.S | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/rdrand_asm.S b/rdrand_asm.S
index 0bd4b04..4b8fdc5 100644
--- a/rdrand_asm.S
+++ b/rdrand_asm.S
@@ -122,7 +122,20 @@ ENTRY(x86_aes_mangle)
 	pxor	(6*16)(PTR1), %xmm6
 	pxor	(7*16)(PTR1), %xmm7
 
+offset = 0
 	.rept 10
+#ifdef __x86_64__
+	movdqa	offset(PTR2), %xmm8
+offset = offset + 16
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xc0	/* aesenc %xmm8, %xmm0 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc %xmm8, %xmm1 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xd0	/* aesenc %xmm8, %xmm2 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xd8	/* aesenc %xmm8, %xmm3 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xe0	/* aesenc %xmm8, %xmm4 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xe8	/* aesenc %xmm8, %xmm5 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xf0	/* aesenc %xmm8, %xmm6 */
+	.byte	0x66,0x41,0x0f,0x38,0xdc,0xf8	/* aesenc %xmm8, %xmm7 */
+#else
 	.byte	0x66,0x0f,0x38,0xdc,0x00+NPTR2	/* aesenc (PTR2), %xmm0 */
 	.byte	0x66,0x0f,0x38,0xdc,0x08+NPTR2	/* aesenc (PTR2), %xmm1 */
 	.byte	0x66,0x0f,0x38,0xdc,0x10+NPTR2	/* aesenc (PTR2), %xmm2 */
@@ -132,8 +145,20 @@ ENTRY(x86_aes_mangle)
 	.byte	0x66,0x0f,0x38,0xdc,0x30+NPTR2	/* aesenc (PTR2), %xmm6 */
 	.byte	0x66,0x0f,0x38,0xdc,0x38+NPTR2	/* aesenc (PTR2), %xmm7 */
 	add	$16, PTR2
+#endif
 	.endr
 
+#ifdef __x86_64__
+	movdqa	offset(PTR2), %xmm8
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xc0	/* aesenclast %xmm8, %xmm0 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xc8	/* aesenclast %xmm8, %xmm1 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xd0	/* aesenclast %xmm8, %xmm2 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xd8	/* aesenclast %xmm8, %xmm3 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xe0	/* aesenclast %xmm8, %xmm4 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xe8	/* aesenclast %xmm8, %xmm5 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xf0	/* aesenclast %xmm8, %xmm6 */
+	.byte	0x66,0x41,0x0f,0x38,0xdd,0xf8	/* aesenclast %xmm8, %xmm7 */
+#else
 	.byte	0x66,0x0f,0x38,0xdd,0x00+NPTR2	/* aesenclast (PTR2), %xmm0 */
 	.byte	0x66,0x0f,0x38,0xdd,0x08+NPTR2	/* aesenclast (PTR2), %xmm1 */
 	.byte	0x66,0x0f,0x38,0xdd,0x10+NPTR2	/* aesenclast (PTR2), %xmm2 */
@@ -142,6 +167,7 @@ ENTRY(x86_aes_mangle)
 	.byte	0x66,0x0f,0x38,0xdd,0x28+NPTR2	/* aesenclast (PTR2), %xmm5 */
 	.byte	0x66,0x0f,0x38,0xdd,0x30+NPTR2	/* aesenclast (PTR2), %xmm6 */
 	.byte	0x66,0x0f,0x38,0xdd,0x38+NPTR2	/* aesenclast (PTR2), %xmm7 */
+#endif
 
 	movdqa	%xmm0, (0*16)(PTR0)
 	movdqa	%xmm1, (1*16)(PTR0)
-- 
2.39.2
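
For readers who do not want to decode the hand-assembled .byte sequences, here
is a mnemonic-level sketch of the new x86-64 path, written as a standalone GAS
fragment. It is an illustration, not the project's code: it assumes an
assembler that accepts the AES-NI mnemonics directly (the patch emits raw
opcode bytes, presumably so it also builds with older binutils), it substitutes
%rsi for the PTR2 round-key pointer macro defined elsewhere in rdrand_asm.S,
the label name is made up, and the eight data blocks are assumed to already be
loaded in %xmm0..%xmm7.

	/*
	 * Sketch only: same structure as the patch's __x86_64__ branch,
	 * with %rsi standing in for PTR2 (the round-key pointer).
	 */
	.text
	.globl	aes_mangle_rounds_sketch
aes_mangle_rounds_sketch:
offset = 0
	.rept 10
	movdqa	offset(%rsi), %xmm8	/* fetch this round's key once */
offset = offset + 16
	aesenc	%xmm8, %xmm0		/* reuse it for all eight blocks */
	aesenc	%xmm8, %xmm1
	aesenc	%xmm8, %xmm2
	aesenc	%xmm8, %xmm3
	aesenc	%xmm8, %xmm4
	aesenc	%xmm8, %xmm5
	aesenc	%xmm8, %xmm6
	aesenc	%xmm8, %xmm7
	.endr
	movdqa	offset(%rsi), %xmm8	/* key for the final round */
	aesenclast %xmm8, %xmm0
	aesenclast %xmm8, %xmm1
	aesenclast %xmm8, %xmm2
	aesenclast %xmm8, %xmm3
	aesenclast %xmm8, %xmm4
	aesenclast %xmm8, %xmm5
	aesenclast %xmm8, %xmm6
	aesenclast %xmm8, %xmm7
	ret

The point of the change is visible in the loop body: each round key is loaded
from memory once into %xmm8 and then used as a register operand by all eight
aesenc/aesenclast instructions, instead of being re-read from memory by every
instruction as in the 32-bit path, which only has %xmm0..%xmm7 available and
therefore keeps the memory-operand form.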