From: Jonathan Swinney Date: Fri, 18 Oct 2024 16:55:07 +0000 (-0500) Subject: Optimize x86/aarch64 MD5 implementation X-Git-Tag: openssl-3.5.0-alpha1~786 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ebe34f9a62630b45a825bc07a2e9cf52731e836e;p=thirdparty%2Fopenssl.git Optimize x86/aarch64 MD5 implementation As suggested in https://github.com/animetosho/md5-optimisation?tab=readme-ov-file#dependency-shortcut-in-g-function, we can delay the dependency on 'x' by recognizing that ((x & z) | (y & ~z)) is equivalent to ((x & z) + (y + ~z)) in this scenario, and we can perform those additions independently, leaving our dependency on x to the final addition. This speeds it up around 5% on both platforms. Signed-off-by: Oli Gillespie Reviewed-by: Paul Dale Reviewed-by: Hugo Landau (Merged from https://github.com/openssl/openssl/pull/25737) --- diff --git a/crypto/md5/asm/md5-aarch64.pl b/crypto/md5/asm/md5-aarch64.pl index 5a860806969..6e4f2459a42 100755 --- a/crypto/md5/asm/md5-aarch64.pl +++ b/crypto/md5/asm/md5-aarch64.pl @@ -237,165 +237,165 @@ ossl_md5_blocks_loop: add w9, w9, w13 // Add constant 0x49b40821 add w9, w9, w6 // Add aux function result ror w9, w9, #10 // Rotate left s=22 bits - bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x8, x17 // Aux function round 2 (~z & y) add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15]) - and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x2562 // Load lower half of constant 0xf61e2562 movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562 add w4, w4, w20 // Add dest value add w4, w4, w13 // Add constant 0xf61e2562 - add w4, w4, w6 // Add aux function result + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) ror w4, w4, #27 // Rotate left s=5 bits - bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x9, x8 // Aux function round 2 (~z & y) add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1]) - and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xb340 // Load lower half of constant 0xc040b340 movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340 add w17, w17, w7 // Add dest value add w17, w17, w13 // Add constant 0xc040b340 - add w17, w17, w6 // Add aux function result + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) ror w17, w17, #23 // Rotate left s=9 bits - bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x4, x9 // Aux function round 2 (~z & y) add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6]) - and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x5a51 // Load lower half of constant 0x265e5a51 movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51 add w8, w8, w25 // Add dest value add w8, w8, w13 // Add constant 0x265e5a51 - add w8, w8, w6 // Add aux function result + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) ror w8, w8, #18 // Rotate left s=14 bits - bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x17, x4 // Aux function round 2 (~z & y) add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11]) - and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa add w9, w9, w15 // Add dest value add w9, w9, w13 // Add constant 0xe9b6c7aa - add w9, w9, w6 // Add aux function result + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) ror w9, w9, #12 // Rotate left s=20 bits - bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x8, x17 // Aux function round 2 (~z & y) add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0]) - and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x105d // Load lower half of constant 0xd62f105d movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d add w4, w4, w22 // Add dest value add w4, w4, w13 // Add constant 0xd62f105d - add w4, w4, w6 // Add aux function result + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) ror w4, w4, #27 // Rotate left s=5 bits - bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x9, x8 // Aux function round 2 (~z & y) add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5]) - and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x1453 // Load lower half of constant 0x2441453 movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453 add w17, w17, w16 // Add dest value add w17, w17, w13 // Add constant 0x2441453 - add w17, w17, w6 // Add aux function result + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) ror w17, w17, #23 // Rotate left s=9 bits - bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x4, x9 // Aux function round 2 (~z & y) add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10]) - and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xe681 // Load lower half of constant 0xd8a1e681 movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681 add w8, w8, w27 // Add dest value add w8, w8, w13 // Add constant 0xd8a1e681 - add w8, w8, w6 // Add aux function result + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) ror w8, w8, #18 // Rotate left s=14 bits - bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x17, x4 // Aux function round 2 (~z & y) add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15]) - and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8 movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8 add w9, w9, w14 // Add dest value add w9, w9, w13 // Add constant 0xe7d3fbc8 - add w9, w9, w6 // Add aux function result + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) ror w9, w9, #12 // Rotate left s=20 bits - bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x8, x17 // Aux function round 2 (~z & y) add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4]) - and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6 movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6 add w4, w4, w24 // Add dest value add w4, w4, w13 // Add constant 0x21e1cde6 - add w4, w4, w6 // Add aux function result + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) ror w4, w4, #27 // Rotate left s=5 bits - bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x9, x8 // Aux function round 2 (~z & y) add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9]) - and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x7d6 // Load lower half of constant 0xc33707d6 movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6 add w17, w17, w12 // Add dest value add w17, w17, w13 // Add constant 0xc33707d6 - add w17, w17, w6 // Add aux function result + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) ror w17, w17, #23 // Rotate left s=9 bits - bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x4, x9 // Aux function round 2 (~z & y) add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14]) - and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xd87 // Load lower half of constant 0xf4d50d87 movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87 add w8, w8, w21 // Add dest value add w8, w8, w13 // Add constant 0xf4d50d87 - add w8, w8, w6 // Add aux function result + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) ror w8, w8, #18 // Rotate left s=14 bits - bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x17, x4 // Aux function round 2 (~z & y) add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3]) - and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x14ed // Load lower half of constant 0x455a14ed movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed add w9, w9, w5 // Add dest value add w9, w9, w13 // Add constant 0x455a14ed - add w9, w9, w6 // Add aux function result + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) ror w9, w9, #12 // Rotate left s=20 bits - bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x8, x17 // Aux function round 2 (~z & y) add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8]) - and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xe905 // Load lower half of constant 0xa9e3e905 movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905 add w4, w4, w26 // Add dest value add w4, w4, w13 // Add constant 0xa9e3e905 - add w4, w4, w6 // Add aux function result + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) ror w4, w4, #27 // Rotate left s=5 bits - bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x9, x8 // Aux function round 2 (~z & y) add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13]) - and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8 movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8 add w17, w17, w3 // Add dest value add w17, w17, w13 // Add constant 0xfcefa3f8 - add w17, w17, w6 // Add aux function result + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) ror w17, w17, #23 // Rotate left s=9 bits - bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x4, x9 // Aux function round 2 (~z & y) add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2]) - and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x2d9 // Load lower half of constant 0x676f02d9 movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9 add w8, w8, w23 // Add dest value add w8, w8, w13 // Add constant 0x676f02d9 - add w8, w8, w6 // Add aux function result + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) ror w8, w8, #18 // Rotate left s=14 bits - bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + bic x6, x17, x4 // Aux function round 2 (~z & y) add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7]) - and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) - orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a add w9, w9, w11 // Add dest value add w9, w9, w13 // Add constant 0x8d2a4c8a - add w9, w9, w6 // Add aux function result + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) ror w9, w9, #12 // Rotate left s=20 bits movz x10, #0x3942 // Load lower half of constant 0xfffa3942 diff --git a/crypto/md5/asm/md5-x86_64.pl b/crypto/md5/asm/md5-x86_64.pl index 6625fb7d08a..aa2880a9cdc 100755 --- a/crypto/md5/asm/md5-x86_64.pl +++ b/crypto/md5/asm/md5-x86_64.pl @@ -41,7 +41,6 @@ EOF # %r10d = X[k_next] # %r11d = z' (copy of z for the next step) # %r12d = z' (copy of z for the next step) -# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC) sub round2_step { my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; @@ -53,9 +52,9 @@ sub round2_step lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ and $y, %r11d /* y & (not z) */ mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ - or %r11d, %r12d /* (y & (not z)) | (x & z) */ + add %r11d, $dst /* dst += (y & (not z)) */ mov $y, %r11d /* (NEXT STEP) z' = $y */ - add %r12d, $dst /* dst += ... */ + add %r12d, $dst /* dst += (x & z) */ mov $y, %r12d /* (NEXT STEP) z' = $y */ rol \$$s, $dst /* dst <<< s */ add $x, $dst /* dst += x */