#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence used for both SHA-256 and SHA-512. In the
# former case the instructions operate on 32-bit operands, while in the
# latter - on 64-bit ones. All I had to do was to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, then 1275 is a very good result for a
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to the SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits a perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
# 32-bit sha256_block:-( This is presumably because 64-bit shifts and
# rotates apparently are not atomic instructions, but implemented in
# microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the improvement is
# not estimated to be high enough, noticeably less than 9%, to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for a
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD instruction
# sequence as for AVX, but with %ymm as operands. The side effect is an
# increased stack frame, 448 additional bytes in SHA256 and 1152 in
# SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
#
# (*)	whichever best applicable;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

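# This file is normally processed by the OpenSSL build system; a manual
# run would look roughly like the following (the output names are
# purely illustrative):
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s	# SHA-512 flavour
#	perl sha512-x86_64.pl elf sha256-x86_64.s	# SHA-256 flavour
#
# The first argument selects the perlasm flavour (elf, macosx, mingw64,
# nasm, masm, ...), the second the output file; whether SHA-256 or
# SHA-512 code is generated is decided further down by whether the
# output name contains "512".
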
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";

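# For reference [nothing below uses this directly], the constants above
# parametrize the FIPS 180-4 functions:
#
#	Sigma0(x) = ROTR(x,$Sigma0[0]) ^ ROTR(x,$Sigma0[1]) ^ ROTR(x,$Sigma0[2])
#	Sigma1(x) = ROTR(x,$Sigma1[0]) ^ ROTR(x,$Sigma1[1]) ^ ROTR(x,$Sigma1[2])
#	sigma0(x) = ROTR(x,$sigma0[0]) ^ ROTR(x,$sigma0[1]) ^ SHR(x,$sigma0[2])
#	sigma1(x) = ROTR(x,$sigma1[0]) ^ ROTR(x,$sigma1[1]) ^ SHR(x,$sigma1[2])
#
# e.g. SHA-256 Sigma1(e) = (e>>>6)^(e>>>11)^(e>>>25). This is also why
# the round code below rotates by the *differences* of these constants
# with xors of the input in between, finishing with one rotate by the
# smallest constant.
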
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

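# ROUND_16_XX computes the message schedule on the fly in the 16-entry
# ring buffer kept on the stack: with all indices taken mod 16,
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# which is why it loads the (%rsp) slots ($i+14)&0xf, ($i+9)&0xf,
# ($i+1)&0xf and $i&0xf before falling through to the common
# ROUND_00_15 body.
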
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
___
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
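
# The run-time dispatch above consults the first three dwords of
# OPENSSL_ia32cap_P (in %r9d/%r10d/%r11d) and falls through to the
# integer-only code below when no suitable extension is found. Roughly,
# the order of preference is: SHA extensions (SHA256 only), XOP (SHA512
# only), AVX2+BMI1+BMI2, AVX [the check also requires SSSE3 and the
# "Intel CPU" bit], plain SSSE3 (SHA256 only), and finally scalar code.
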
$code.=<<___;
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
___

396if ($SZ==4) {
397$code.=<<___;
398.align 64
399.type $TABLE,\@object
400$TABLE:
401 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
c4558efb
AP
402 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
403 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
2337eb58
AP
404 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
405 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
c4558efb
AP
406 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
407 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
2337eb58
AP
408 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
409 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
c4558efb
AP
410 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
411 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
2337eb58
AP
412 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
413 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
c4558efb
AP
414 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
415 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
2337eb58
AP
416 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
417 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
c4558efb 418 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
2337eb58 419 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
c4558efb
AP
420 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
421 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
2337eb58
AP
422 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
423 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
c4558efb
AP
424 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
425 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
2337eb58
AP
426 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
427 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
c4558efb
AP
428 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
429 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
2337eb58
AP
430 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
431 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
c4558efb 432 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
a8f3b8b5 433
c4558efb 434 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
a8f3b8b5
AP
435 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
436 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
c4558efb
AP
437 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
438 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
a8f3b8b5 439 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
83698d31 440 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
441___
442} else {
443$code.=<<___;
444.align 64
445.type $TABLE,\@object
446$TABLE:
447 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
c4558efb
AP
448 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
449 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
2337eb58
AP
450 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
451 .quad 0x3956c25bf348b538,0x59f111f1b605d019
c4558efb
AP
452 .quad 0x3956c25bf348b538,0x59f111f1b605d019
453 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
2337eb58
AP
454 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
455 .quad 0xd807aa98a3030242,0x12835b0145706fbe
c4558efb
AP
456 .quad 0xd807aa98a3030242,0x12835b0145706fbe
457 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
2337eb58
AP
458 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
459 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
c4558efb 460 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
2337eb58 461 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
c4558efb
AP
462 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
463 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
2337eb58
AP
464 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
465 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
c4558efb
AP
466 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
467 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
2337eb58
AP
468 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
469 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
c4558efb
AP
470 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
471 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
2337eb58
AP
472 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
473 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
c4558efb
AP
474 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
475 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
2337eb58
AP
476 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
477 .quad 0x06ca6351e003826f,0x142929670a0e6e70
c4558efb
AP
478 .quad 0x06ca6351e003826f,0x142929670a0e6e70
479 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
2337eb58
AP
480 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
481 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
c4558efb
AP
482 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
483 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
2337eb58
AP
484 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
485 .quad 0x81c2c92e47edaee6,0x92722c851482353b
c4558efb
AP
486 .quad 0x81c2c92e47edaee6,0x92722c851482353b
487 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
2337eb58
AP
488 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
489 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
c4558efb
AP
490 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
491 .quad 0xd192e819d6ef5218,0xd69906245565a910
2337eb58
AP
492 .quad 0xd192e819d6ef5218,0xd69906245565a910
493 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
c4558efb 494 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
2337eb58 495 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
c4558efb
AP
496 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
497 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
2337eb58
AP
498 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
499 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
c4558efb
AP
500 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
501 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
2337eb58
AP
502 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
503 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
c4558efb
AP
504 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
505 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
2337eb58
AP
506 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
507 .quad 0x90befffa23631e28,0xa4506cebde82bde9
c4558efb
AP
508 .quad 0x90befffa23631e28,0xa4506cebde82bde9
509 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
2337eb58
AP
510 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
511 .quad 0xca273eceea26619c,0xd186b8c721c0c207
c4558efb
AP
512 .quad 0xca273eceea26619c,0xd186b8c721c0c207
513 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
2337eb58
AP
514 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
515 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
c4558efb
AP
516 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
517 .quad 0x113f9804bef90dae,0x1b710b35131c471b
2337eb58
AP
518 .quad 0x113f9804bef90dae,0x1b710b35131c471b
519 .quad 0x28db77f523047d84,0x32caab7b40c72493
c4558efb
AP
520 .quad 0x28db77f523047d84,0x32caab7b40c72493
521 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
2337eb58
AP
522 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
523 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
c4558efb
AP
524 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
525 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
2337eb58 526 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
a8f3b8b5
AP
527
528 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
c4558efb
AP
529 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
530 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
531___
532}
533
a8f3b8b5
AP
534######################################################################
535# SIMD code paths
536#
977f32e8 537if ($SZ==4 && $shaext) {{{
619b9466
AP
538######################################################################
539# Intel SHA Extensions implementation of SHA256 update function.
540#
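# A short orientation note [based on how the SHA extensions are
# documented]: sha256rnds2 performs two rounds per invocation, works on
# the state as two halves (ABEF and CDGH) and takes the two
# message-plus-constant words from the implicit %xmm0 operand ($Wi
# below). That is why every pair of sha256rnds2 is preceded by a paddd
# into $Wi and separated by "pshufd \$0x0e", which moves the next two
# words down into the low half.
#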
541my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
542
543my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
544my @MSG=map("%xmm$_",(3..6));
545
546$code.=<<___;
547.type sha256_block_data_order_shaext,\@function,3
548.align 64
549sha256_block_data_order_shaext:
550_shaext_shortcut:
551___
552$code.=<<___ if ($win64);
553 lea `-8-5*16`(%rsp),%rsp
554 movaps %xmm6,-8-5*16(%rax)
555 movaps %xmm7,-8-4*16(%rax)
556 movaps %xmm8,-8-3*16(%rax)
557 movaps %xmm9,-8-2*16(%rax)
558 movaps %xmm10,-8-1*16(%rax)
559.Lprologue_shaext:
560___
561$code.=<<___;
562 lea K256+0x80(%rip),$Tbl
563 movdqu ($ctx),$ABEF # DCBA
564 movdqu 16($ctx),$CDGH # HGFE
565 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
566
567 pshufd \$0x1b,$ABEF,$Wi # ABCD
568 pshufd \$0xb1,$ABEF,$ABEF # CDAB
569 pshufd \$0x1b,$CDGH,$CDGH # EFGH
570 movdqa $TMP,$BSWAP # offload
571 palignr \$8,$CDGH,$ABEF # ABEF
572 punpcklqdq $Wi,$CDGH # CDGH
573 jmp .Loop_shaext
574
575.align 16
576.Loop_shaext:
577 movdqu ($inp),@MSG[0]
578 movdqu 0x10($inp),@MSG[1]
579 movdqu 0x20($inp),@MSG[2]
580 pshufb $TMP,@MSG[0]
581 movdqu 0x30($inp),@MSG[3]
582
583 movdqa 0*32-0x80($Tbl),$Wi
584 paddd @MSG[0],$Wi
585 pshufb $TMP,@MSG[1]
586 movdqa $CDGH,$CDGH_SAVE # offload
587 sha256rnds2 $ABEF,$CDGH # 0-3
588 pshufd \$0x0e,$Wi,$Wi
589 nop
590 movdqa $ABEF,$ABEF_SAVE # offload
591 sha256rnds2 $CDGH,$ABEF
592
593 movdqa 1*32-0x80($Tbl),$Wi
594 paddd @MSG[1],$Wi
595 pshufb $TMP,@MSG[2]
596 sha256rnds2 $ABEF,$CDGH # 4-7
597 pshufd \$0x0e,$Wi,$Wi
598 lea 0x40($inp),$inp
599 sha256msg1 @MSG[1],@MSG[0]
600 sha256rnds2 $CDGH,$ABEF
601
602 movdqa 2*32-0x80($Tbl),$Wi
603 paddd @MSG[2],$Wi
604 pshufb $TMP,@MSG[3]
605 sha256rnds2 $ABEF,$CDGH # 8-11
606 pshufd \$0x0e,$Wi,$Wi
607 movdqa @MSG[3],$TMP
608 palignr \$4,@MSG[2],$TMP
609 nop
610 paddd $TMP,@MSG[0]
611 sha256msg1 @MSG[2],@MSG[1]
612 sha256rnds2 $CDGH,$ABEF
613
614 movdqa 3*32-0x80($Tbl),$Wi
615 paddd @MSG[3],$Wi
616 sha256msg2 @MSG[3],@MSG[0]
617 sha256rnds2 $ABEF,$CDGH # 12-15
618 pshufd \$0x0e,$Wi,$Wi
619 movdqa @MSG[0],$TMP
620 palignr \$4,@MSG[3],$TMP
621 nop
622 paddd $TMP,@MSG[1]
623 sha256msg1 @MSG[3],@MSG[2]
624 sha256rnds2 $CDGH,$ABEF
625___
626for($i=4;$i<16-3;$i++) {
627$code.=<<___;
628 movdqa $i*32-0x80($Tbl),$Wi
629 paddd @MSG[0],$Wi
630 sha256msg2 @MSG[0],@MSG[1]
631 sha256rnds2 $ABEF,$CDGH # 16-19...
632 pshufd \$0x0e,$Wi,$Wi
633 movdqa @MSG[1],$TMP
634 palignr \$4,@MSG[0],$TMP
635 nop
636 paddd $TMP,@MSG[2]
637 sha256msg1 @MSG[0],@MSG[3]
638 sha256rnds2 $CDGH,$ABEF
639___
640 push(@MSG,shift(@MSG));
641}
642$code.=<<___;
643 movdqa 13*32-0x80($Tbl),$Wi
644 paddd @MSG[0],$Wi
645 sha256msg2 @MSG[0],@MSG[1]
646 sha256rnds2 $ABEF,$CDGH # 52-55
647 pshufd \$0x0e,$Wi,$Wi
648 movdqa @MSG[1],$TMP
649 palignr \$4,@MSG[0],$TMP
650 sha256rnds2 $CDGH,$ABEF
651 paddd $TMP,@MSG[2]
652
653 movdqa 14*32-0x80($Tbl),$Wi
654 paddd @MSG[1],$Wi
655 sha256rnds2 $ABEF,$CDGH # 56-59
656 pshufd \$0x0e,$Wi,$Wi
657 sha256msg2 @MSG[1],@MSG[2]
658 movdqa $BSWAP,$TMP
659 sha256rnds2 $CDGH,$ABEF
660
661 movdqa 15*32-0x80($Tbl),$Wi
662 paddd @MSG[2],$Wi
663 nop
664 sha256rnds2 $ABEF,$CDGH # 60-63
665 pshufd \$0x0e,$Wi,$Wi
666 dec $num
667 nop
668 sha256rnds2 $CDGH,$ABEF
669
670 paddd $CDGH_SAVE,$CDGH
671 paddd $ABEF_SAVE,$ABEF
672 jnz .Loop_shaext
673
674 pshufd \$0xb1,$CDGH,$CDGH # DCHG
675 pshufd \$0x1b,$ABEF,$TMP # FEBA
676 pshufd \$0xb1,$ABEF,$ABEF # BAFE
677 punpckhqdq $CDGH,$ABEF # DCBA
678 palignr \$8,$TMP,$CDGH # HGFE
679
680 movdqu $ABEF,($ctx)
681 movdqu $CDGH,16($ctx)
682___
683$code.=<<___ if ($win64);
684 movaps -8-5*16(%rax),%xmm6
685 movaps -8-4*16(%rax),%xmm7
686 movaps -8-3*16(%rax),%xmm8
687 movaps -8-2*16(%rax),%xmm9
688 movaps -8-1*16(%rax),%xmm10
689 mov %rax,%rsp
690.Lepilogue_shaext:
691___
692$code.=<<___;
693 ret
694.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
695___
696}}}
a8f3b8b5
AP
697{{{
698
699my $a4=$T1;
700my ($a,$b,$c,$d,$e,$f,$g,$h);
701
702sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
703{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
704 my $arg = pop;
705 $arg = "\$$arg" if ($arg*1 eq $arg);
706 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
707}
708
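# The AUTOLOAD thunk above turns any undefined &instruction() call into
# a line of AT&T-syntax assembly appended to $code: the operand list is
# reversed and a purely numeric last argument gets a '$' prefix. For
# instance, with the SHA-512 register assignment ($a0="%r13"), a call
# such as
#
#	&ror	($a0,23);
#
# appends "\tror\t\$23,%r13\n" to $code.
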
709sub body_00_15 () {
710 (
711 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
712
713 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
714 '&mov ($a,$a1)',
715 '&mov ($a4,$f)',
716
a8f3b8b5 717 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
c7f690c2 718 '&xor ($a0,$e)',
a8f3b8b5
AP
719 '&xor ($a4,$g)', # f^g
720
721 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
722 '&xor ($a1,$a)',
723 '&and ($a4,$e)', # (f^g)&e
724
725 '&xor ($a0,$e)',
726 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
727 '&mov ($a2,$a)',
728
a8f3b8b5 729 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
c7f690c2 730 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
a8f3b8b5
AP
731 '&xor ($a2,$b)', # a^b, b^c in next round
732
a8f3b8b5 733 '&add ($h,$a4)', # h+=Ch(e,f,g)
c7f690c2 734 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
a8f3b8b5
AP
735 '&and ($a3,$a2)', # (b^c)&(a^b)
736
737 '&xor ($a1,$a)',
738 '&add ($h,$a0)', # h+=Sigma1(e)
739 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
740
a8f3b8b5 741 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
c7f690c2 742 '&add ($d,$h)', # d+=h
a8f3b8b5
AP
743 '&add ($h,$a3)', # h+=Maj(a,b,c)
744
745 '&mov ($a0,$d)',
746 '&add ($a1,$h);'. # h+=Sigma0(a)
747 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
748 );
749}
750
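# body_00_15 above is the integer round interleaved with all SIMD
# paths; per round it computes
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with two tricks carried across rounds: h+=Sigma0(a) is deferred to
# the next round (kept in $a1), and Maj(a,b,c) is computed as
# Ch(a^b,c,b) so that a^b can be reused as b^c of the next round.
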
751######################################################################
752# SSSE3 code path
753#
754if ($SZ==4) { # SHA256 only
755my @X = map("%xmm$_",(0..3));
756my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
757
758$code.=<<___;
c4558efb 759.type ${func}_ssse3,\@function,3
a8f3b8b5
AP
760.align 64
761${func}_ssse3:
762.Lssse3_shortcut:
763 push %rbx
764 push %rbp
765 push %r12
766 push %r13
767 push %r14
768 push %r15
769 mov %rsp,%r11 # copy %rsp
770 shl \$4,%rdx # num*16
771 sub \$`$framesz+$win64*16*4`,%rsp
772 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
773 and \$-64,%rsp # align stack frame
774 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
776 mov %rdx,$_end # save end pointer, "3rd" arg
777 mov %r11,$_rsp # save copy of %rsp
778___
779$code.=<<___ if ($win64);
780 movaps %xmm6,16*$SZ+32(%rsp)
781 movaps %xmm7,16*$SZ+48(%rsp)
782 movaps %xmm8,16*$SZ+64(%rsp)
783 movaps %xmm9,16*$SZ+80(%rsp)
784___
785$code.=<<___;
786.Lprologue_ssse3:
787
788 mov $SZ*0($ctx),$A
789 mov $SZ*1($ctx),$B
790 mov $SZ*2($ctx),$C
791 mov $SZ*3($ctx),$D
792 mov $SZ*4($ctx),$E
793 mov $SZ*5($ctx),$F
794 mov $SZ*6($ctx),$G
795 mov $SZ*7($ctx),$H
796___
797
798$code.=<<___;
504bbcf3
AP
799 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
800 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
801 jmp .Lloop_ssse3
802.align 16
803.Lloop_ssse3:
c4558efb 804 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
805 movdqu 0x00($inp),@X[0]
806 movdqu 0x10($inp),@X[1]
807 movdqu 0x20($inp),@X[2]
a8f3b8b5 808 pshufb $t3,@X[0]
619b9466 809 movdqu 0x30($inp),@X[3]
a8f3b8b5
AP
810 lea $TABLE(%rip),$Tbl
811 pshufb $t3,@X[1]
812 movdqa 0x00($Tbl),$t0
c4558efb 813 movdqa 0x20($Tbl),$t1
619b9466 814 pshufb $t3,@X[2]
a8f3b8b5 815 paddd @X[0],$t0
c4558efb 816 movdqa 0x40($Tbl),$t2
a8f3b8b5 817 pshufb $t3,@X[3]
c4558efb 818 movdqa 0x60($Tbl),$t3
a8f3b8b5
AP
819 paddd @X[1],$t1
820 paddd @X[2],$t2
821 paddd @X[3],$t3
822 movdqa $t0,0x00(%rsp)
823 mov $A,$a1
824 movdqa $t1,0x10(%rsp)
825 mov $B,$a3
826 movdqa $t2,0x20(%rsp)
827 xor $C,$a3 # magic
828 movdqa $t3,0x30(%rsp)
829 mov $E,$a0
830 jmp .Lssse3_00_47
831
832.align 16
833.Lssse3_00_47:
147cca8f 834 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
835___
836sub Xupdate_256_SSSE3 () {
837 (
838 '&movdqa ($t0,@X[1]);',
839 '&movdqa ($t3,@X[3])',
840 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
841 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
842 '&movdqa ($t1,$t0)',
843 '&movdqa ($t2,$t0);',
844 '&psrld ($t0,$sigma0[2])',
845 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
846 '&psrld ($t2,$sigma0[0])',
847 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
848 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
849 '&pxor ($t0,$t2)',
850 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
851 '&pxor ($t0,$t1)',
852 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
853 '&pxor ($t0,$t2);',
854 '&movdqa ($t2,$t3)',
855 '&pxor ($t0,$t1);', # sigma0(X[1..4])
856 '&psrld ($t3,$sigma1[2])',
857 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
858 '&psrlq ($t2,$sigma1[0])',
859 '&pxor ($t3,$t2);',
860 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
861 '&pxor ($t3,$t2)',
862 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
863 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
864 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
865 '&movdqa ($t2,$t3);',
866 '&psrld ($t3,$sigma1[2])',
867 '&psrlq ($t2,$sigma1[0])',
868 '&pxor ($t3,$t2);',
869 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
870 '&pxor ($t3,$t2);',
c4558efb 871 '&movdqa ($t2,16*2*$j."($Tbl)")',
a8f3b8b5
AP
872 '&pshufb ($t3,$t5)',
873 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
874 );
875}
876
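# Each pass of the Xupdate sequence above produces four new schedule
# words X[i..i+3] in one %xmm register. sigma0() is applied to all four
# lanes at once, but sigma1() of X[14..15] and of X[16..17] is done in
# two 2-lane halves, because the last two inputs depend on results
# produced earlier in the same pass.
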
877sub SSSE3_256_00_47 () {
878my $j = shift;
879my $body = shift;
880my @X = @_;
881my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
882
883 if (0) {
884 foreach (Xupdate_256_SSSE3()) { # 36 instructions
885 eval;
886 eval(shift(@insns));
887 eval(shift(@insns));
888 eval(shift(@insns));
889 }
c7f690c2 890 } else { # squeeze extra 4% on Westmere and 19% on Atom
a8f3b8b5 891 eval(shift(@insns)); #@
a8f3b8b5
AP
892 &movdqa ($t0,@X[1]);
893 eval(shift(@insns));
c7f690c2 894 eval(shift(@insns));
a8f3b8b5 895 &movdqa ($t3,@X[3]);
c7f690c2
AP
896 eval(shift(@insns)); #@
897 eval(shift(@insns));
a8f3b8b5
AP
898 eval(shift(@insns));
899 eval(shift(@insns)); #@
900 eval(shift(@insns));
901 &palignr ($t0,@X[0],$SZ); # X[1..4]
a8f3b8b5 902 eval(shift(@insns));
a8f3b8b5 903 eval(shift(@insns));
c7f690c2 904 &palignr ($t3,@X[2],$SZ); # X[9..12]
a8f3b8b5
AP
905 eval(shift(@insns));
906 eval(shift(@insns));
907 eval(shift(@insns));
908 eval(shift(@insns)); #@
a8f3b8b5
AP
909 &movdqa ($t1,$t0);
910 eval(shift(@insns));
c7f690c2 911 eval(shift(@insns));
a8f3b8b5
AP
912 &movdqa ($t2,$t0);
913 eval(shift(@insns)); #@
914 eval(shift(@insns));
a8f3b8b5
AP
915 &psrld ($t0,$sigma0[2]);
916 eval(shift(@insns));
917 eval(shift(@insns));
918 eval(shift(@insns));
919 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
a8f3b8b5
AP
920 eval(shift(@insns)); #@
921 eval(shift(@insns));
922 &psrld ($t2,$sigma0[0]);
923 eval(shift(@insns));
924 eval(shift(@insns));
a8f3b8b5
AP
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
926 eval(shift(@insns));
c7f690c2 927 eval(shift(@insns)); #@
a8f3b8b5
AP
928 &pslld ($t1,8*$SZ-$sigma0[1]);
929 eval(shift(@insns));
c7f690c2 930 eval(shift(@insns));
a8f3b8b5
AP
931 &pxor ($t0,$t2);
932 eval(shift(@insns)); #@
933 eval(shift(@insns));
c7f690c2 934 eval(shift(@insns));
a8f3b8b5 935 eval(shift(@insns)); #@
c7f690c2 936 &psrld ($t2,$sigma0[1]-$sigma0[0]);
a8f3b8b5
AP
937 eval(shift(@insns));
938 &pxor ($t0,$t1);
939 eval(shift(@insns));
940 eval(shift(@insns));
941 &pslld ($t1,$sigma0[1]-$sigma0[0]);
942 eval(shift(@insns));
c7f690c2 943 eval(shift(@insns));
a8f3b8b5
AP
944 &pxor ($t0,$t2);
945 eval(shift(@insns));
946 eval(shift(@insns)); #@
a8f3b8b5
AP
947 &movdqa ($t2,$t3);
948 eval(shift(@insns));
a8f3b8b5
AP
949 eval(shift(@insns));
950 &pxor ($t0,$t1); # sigma0(X[1..4])
c7f690c2 951 eval(shift(@insns)); #@
a8f3b8b5
AP
952 eval(shift(@insns));
953 eval(shift(@insns));
954 &psrld ($t3,$sigma1[2]);
955 eval(shift(@insns));
956 eval(shift(@insns));
957 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
a8f3b8b5
AP
958 eval(shift(@insns)); #@
959 eval(shift(@insns));
a8f3b8b5
AP
960 &psrlq ($t2,$sigma1[0]);
961 eval(shift(@insns));
a8f3b8b5
AP
962 eval(shift(@insns));
963 eval(shift(@insns));
964 &pxor ($t3,$t2);
c7f690c2
AP
965 eval(shift(@insns)); #@
966 eval(shift(@insns));
a8f3b8b5
AP
967 eval(shift(@insns));
968 eval(shift(@insns)); #@
969 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
970 eval(shift(@insns));
a8f3b8b5
AP
971 eval(shift(@insns));
972 &pxor ($t3,$t2);
c7f690c2 973 eval(shift(@insns)); #@
a8f3b8b5
AP
974 eval(shift(@insns));
975 eval(shift(@insns));
504bbcf3
AP
976 #&pshufb ($t3,$t4); # sigma1(X[14..15])
977 &pshufd ($t3,$t3,0b10000000);
a8f3b8b5 978 eval(shift(@insns));
c7f690c2 979 eval(shift(@insns));
a8f3b8b5 980 eval(shift(@insns));
504bbcf3 981 &psrldq ($t3,8);
a8f3b8b5
AP
982 eval(shift(@insns));
983 eval(shift(@insns)); #@
c7f690c2
AP
984 eval(shift(@insns));
985 eval(shift(@insns));
986 eval(shift(@insns)); #@
a8f3b8b5
AP
987 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
988 eval(shift(@insns));
a8f3b8b5
AP
989 eval(shift(@insns));
990 eval(shift(@insns));
c7f690c2 991 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
a8f3b8b5 992 eval(shift(@insns));
c7f690c2 993 eval(shift(@insns)); #@
a8f3b8b5
AP
994 eval(shift(@insns));
995 &movdqa ($t2,$t3);
996 eval(shift(@insns));
a8f3b8b5
AP
997 eval(shift(@insns));
998 &psrld ($t3,$sigma1[2]);
999 eval(shift(@insns));
a8f3b8b5 1000 eval(shift(@insns)); #@
c7f690c2 1001 &psrlq ($t2,$sigma1[0]);
a8f3b8b5
AP
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004 &pxor ($t3,$t2);
c7f690c2
AP
1005 eval(shift(@insns)); #@
1006 eval(shift(@insns));
a8f3b8b5
AP
1007 eval(shift(@insns));
1008 eval(shift(@insns)); #@
1009 eval(shift(@insns));
1010 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
a8f3b8b5
AP
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 eval(shift(@insns));
1014 &pxor ($t3,$t2);
1015 eval(shift(@insns));
1016 eval(shift(@insns));
a8f3b8b5 1017 eval(shift(@insns)); #@
504bbcf3
AP
1018 #&pshufb ($t3,$t5);
1019 &pshufd ($t3,$t3,0b00001000);
a8f3b8b5 1020 eval(shift(@insns));
c7f690c2
AP
1021 eval(shift(@insns));
1022 &movdqa ($t2,16*2*$j."($Tbl)");
a8f3b8b5
AP
1023 eval(shift(@insns)); #@
1024 eval(shift(@insns));
504bbcf3 1025 &pslldq ($t3,8);
a8f3b8b5
AP
1026 eval(shift(@insns));
1027 eval(shift(@insns));
a8f3b8b5 1028 eval(shift(@insns));
c7f690c2
AP
1029 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1030 eval(shift(@insns)); #@
a8f3b8b5
AP
1031 eval(shift(@insns));
1032 eval(shift(@insns));
1033 }
1034 &paddd ($t2,@X[0]);
1035 foreach (@insns) { eval; } # remaining instructions
1036 &movdqa (16*$j."(%rsp)",$t2);
1037}
1038
1039 for ($i=0,$j=0; $j<4; $j++) {
1040 &SSSE3_256_00_47($j,\&body_00_15,@X);
1041 push(@X,shift(@X)); # rotate(@X)
1042 }
c4558efb 1043 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1044 &jne (".Lssse3_00_47");
1045
1046 for ($i=0; $i<16; ) {
1047 foreach(body_00_15()) { eval; }
1048 }
1049$code.=<<___;
1050 mov $_ctx,$ctx
1051 mov $a1,$A
1052
1053 add $SZ*0($ctx),$A
1054 lea 16*$SZ($inp),$inp
1055 add $SZ*1($ctx),$B
1056 add $SZ*2($ctx),$C
1057 add $SZ*3($ctx),$D
1058 add $SZ*4($ctx),$E
1059 add $SZ*5($ctx),$F
1060 add $SZ*6($ctx),$G
1061 add $SZ*7($ctx),$H
1062
1063 cmp $_end,$inp
1064
1065 mov $A,$SZ*0($ctx)
1066 mov $B,$SZ*1($ctx)
1067 mov $C,$SZ*2($ctx)
1068 mov $D,$SZ*3($ctx)
1069 mov $E,$SZ*4($ctx)
1070 mov $F,$SZ*5($ctx)
1071 mov $G,$SZ*6($ctx)
1072 mov $H,$SZ*7($ctx)
1073 jb .Lloop_ssse3
1074
1075 mov $_rsp,%rsi
1076___
1077$code.=<<___ if ($win64);
1078 movaps 16*$SZ+32(%rsp),%xmm6
1079 movaps 16*$SZ+48(%rsp),%xmm7
1080 movaps 16*$SZ+64(%rsp),%xmm8
1081 movaps 16*$SZ+80(%rsp),%xmm9
1082___
1083$code.=<<___;
1084 mov (%rsi),%r15
1085 mov 8(%rsi),%r14
1086 mov 16(%rsi),%r13
1087 mov 24(%rsi),%r12
1088 mov 32(%rsi),%rbp
1089 mov 40(%rsi),%rbx
1090 lea 48(%rsi),%rsp
1091.Lepilogue_ssse3:
1092 ret
1093.size ${func}_ssse3,.-${func}_ssse3
1094___
1095}
1096
1097if ($avx) {{
1098######################################################################
1099# XOP code path
1100#
f6ff1aa8 1101if ($SZ==8) { # SHA512 only
a8f3b8b5 1102$code.=<<___;
c4558efb 1103.type ${func}_xop,\@function,3
a8f3b8b5
AP
1104.align 64
1105${func}_xop:
1106.Lxop_shortcut:
1107 push %rbx
1108 push %rbp
1109 push %r12
1110 push %r13
1111 push %r14
1112 push %r15
1113 mov %rsp,%r11 # copy %rsp
1114 shl \$4,%rdx # num*16
1115 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1116 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1117 and \$-64,%rsp # align stack frame
1118 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1120 mov %rdx,$_end # save end pointer, "3rd" arg
1121 mov %r11,$_rsp # save copy of %rsp
1122___
1123$code.=<<___ if ($win64);
1124 movaps %xmm6,16*$SZ+32(%rsp)
1125 movaps %xmm7,16*$SZ+48(%rsp)
1126 movaps %xmm8,16*$SZ+64(%rsp)
1127 movaps %xmm9,16*$SZ+80(%rsp)
1128___
1129$code.=<<___ if ($win64 && $SZ>4);
1130 movaps %xmm10,16*$SZ+96(%rsp)
1131 movaps %xmm11,16*$SZ+112(%rsp)
1132___
1133$code.=<<___;
1134.Lprologue_xop:
1135
00678437 1136 vzeroupper
a8f3b8b5
AP
1137 mov $SZ*0($ctx),$A
1138 mov $SZ*1($ctx),$B
1139 mov $SZ*2($ctx),$C
1140 mov $SZ*3($ctx),$D
1141 mov $SZ*4($ctx),$E
1142 mov $SZ*5($ctx),$F
1143 mov $SZ*6($ctx),$G
1144 mov $SZ*7($ctx),$H
1145 jmp .Lloop_xop
1146___
1147 if ($SZ==4) { # SHA256
1148 my @X = map("%xmm$_",(0..3));
1149 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1150
1151$code.=<<___;
1152.align 16
1153.Lloop_xop:
c4558efb 1154 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1155 vmovdqu 0x00($inp),@X[0]
1156 vmovdqu 0x10($inp),@X[1]
1157 vmovdqu 0x20($inp),@X[2]
1158 vmovdqu 0x30($inp),@X[3]
1159 vpshufb $t3,@X[0],@X[0]
1160 lea $TABLE(%rip),$Tbl
1161 vpshufb $t3,@X[1],@X[1]
1162 vpshufb $t3,@X[2],@X[2]
1163 vpaddd 0x00($Tbl),@X[0],$t0
1164 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1165 vpaddd 0x20($Tbl),@X[1],$t1
1166 vpaddd 0x40($Tbl),@X[2],$t2
1167 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1168 vmovdqa $t0,0x00(%rsp)
1169 mov $A,$a1
1170 vmovdqa $t1,0x10(%rsp)
1171 mov $B,$a3
1172 vmovdqa $t2,0x20(%rsp)
1173 xor $C,$a3 # magic
1174 vmovdqa $t3,0x30(%rsp)
1175 mov $E,$a0
1176 jmp .Lxop_00_47
1177
1178.align 16
1179.Lxop_00_47:
147cca8f 1180 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1181___
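
# The XOP schedule update below uses vprotd/vprotq, XOP's packed rotate
# instructions, so each sigma() is just rotate/shift/xor; the SSSE3 and
# AVX paths have to emulate every rotate with a shift-left, shift-right
# and xor triplet instead.
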
1182sub XOP_256_00_47 () {
1183my $j = shift;
1184my $body = shift;
1185my @X = @_;
1186my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1187
1188 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1189 eval(shift(@insns));
1190 eval(shift(@insns));
1191 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1192 eval(shift(@insns));
1193 eval(shift(@insns));
1194 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1195 eval(shift(@insns));
1196 eval(shift(@insns));
1197 &vpsrld ($t0,$t0,$sigma0[2]);
1198 eval(shift(@insns));
1199 eval(shift(@insns));
1200 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1201 eval(shift(@insns));
1202 eval(shift(@insns));
1203 eval(shift(@insns));
1204 eval(shift(@insns));
1205 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1206 eval(shift(@insns));
1207 eval(shift(@insns));
1208 &vpxor ($t0,$t0,$t1);
1209 eval(shift(@insns));
1210 eval(shift(@insns));
1211 eval(shift(@insns));
1212 eval(shift(@insns));
1213 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1214 eval(shift(@insns));
1215 eval(shift(@insns));
1216 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1217 eval(shift(@insns));
1218 eval(shift(@insns));
1219 &vpsrld ($t2,@X[3],$sigma1[2]);
1220 eval(shift(@insns));
1221 eval(shift(@insns));
1222 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1223 eval(shift(@insns));
1224 eval(shift(@insns));
1225 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1226 eval(shift(@insns));
1227 eval(shift(@insns));
1228 &vpxor ($t3,$t3,$t2);
1229 eval(shift(@insns));
1230 eval(shift(@insns));
1231 eval(shift(@insns));
1232 eval(shift(@insns));
1233 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1234 eval(shift(@insns));
1235 eval(shift(@insns));
1236 eval(shift(@insns));
1237 eval(shift(@insns));
1238 &vpsrldq ($t3,$t3,8);
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 eval(shift(@insns));
1242 eval(shift(@insns));
1243 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 eval(shift(@insns));
1247 eval(shift(@insns));
1248 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1249 eval(shift(@insns));
1250 eval(shift(@insns));
1251 &vpsrld ($t2,@X[0],$sigma1[2]);
1252 eval(shift(@insns));
1253 eval(shift(@insns));
1254 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 &vpxor ($t3,$t3,$t2);
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 eval(shift(@insns));
1261 eval(shift(@insns));
1262 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1263 eval(shift(@insns));
1264 eval(shift(@insns));
1265 eval(shift(@insns));
1266 eval(shift(@insns));
1267 &vpslldq ($t3,$t3,8); # 22 instructions
1268 eval(shift(@insns));
1269 eval(shift(@insns));
1270 eval(shift(@insns));
1271 eval(shift(@insns));
1272 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1273 eval(shift(@insns));
1274 eval(shift(@insns));
1275 eval(shift(@insns));
1276 eval(shift(@insns));
c4558efb 1277 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1278 foreach (@insns) { eval; } # remaining instructions
1279 &vmovdqa (16*$j."(%rsp)",$t2);
1280}
1281
1282 for ($i=0,$j=0; $j<4; $j++) {
1283 &XOP_256_00_47($j,\&body_00_15,@X);
1284 push(@X,shift(@X)); # rotate(@X)
1285 }
c4558efb 1286 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1287 &jne (".Lxop_00_47");
1288
1289 for ($i=0; $i<16; ) {
1290 foreach(body_00_15()) { eval; }
1291 }
1292
1293 } else { # SHA512
1294 my @X = map("%xmm$_",(0..7));
1295 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1296
1297$code.=<<___;
1298.align 16
1299.Lloop_xop:
c4558efb 1300 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5 1301 vmovdqu 0x00($inp),@X[0]
c4558efb 1302 lea $TABLE+0x80(%rip),$Tbl # size optimization
a8f3b8b5
AP
1303 vmovdqu 0x10($inp),@X[1]
1304 vmovdqu 0x20($inp),@X[2]
1305 vpshufb $t3,@X[0],@X[0]
1306 vmovdqu 0x30($inp),@X[3]
1307 vpshufb $t3,@X[1],@X[1]
1308 vmovdqu 0x40($inp),@X[4]
1309 vpshufb $t3,@X[2],@X[2]
1310 vmovdqu 0x50($inp),@X[5]
1311 vpshufb $t3,@X[3],@X[3]
1312 vmovdqu 0x60($inp),@X[6]
1313 vpshufb $t3,@X[4],@X[4]
1314 vmovdqu 0x70($inp),@X[7]
1315 vpshufb $t3,@X[5],@X[5]
c4558efb 1316 vpaddq -0x80($Tbl),@X[0],$t0
a8f3b8b5 1317 vpshufb $t3,@X[6],@X[6]
c4558efb 1318 vpaddq -0x60($Tbl),@X[1],$t1
a8f3b8b5 1319 vpshufb $t3,@X[7],@X[7]
c4558efb
AP
1320 vpaddq -0x40($Tbl),@X[2],$t2
1321 vpaddq -0x20($Tbl),@X[3],$t3
a8f3b8b5 1322 vmovdqa $t0,0x00(%rsp)
c4558efb 1323 vpaddq 0x00($Tbl),@X[4],$t0
a8f3b8b5 1324 vmovdqa $t1,0x10(%rsp)
c4558efb 1325 vpaddq 0x20($Tbl),@X[5],$t1
a8f3b8b5 1326 vmovdqa $t2,0x20(%rsp)
c4558efb 1327 vpaddq 0x40($Tbl),@X[6],$t2
a8f3b8b5 1328 vmovdqa $t3,0x30(%rsp)
c4558efb 1329 vpaddq 0x60($Tbl),@X[7],$t3
a8f3b8b5
AP
1330 vmovdqa $t0,0x40(%rsp)
1331 mov $A,$a1
1332 vmovdqa $t1,0x50(%rsp)
1333 mov $B,$a3
1334 vmovdqa $t2,0x60(%rsp)
1335 xor $C,$a3 # magic
1336 vmovdqa $t3,0x70(%rsp)
1337 mov $E,$a0
1338 jmp .Lxop_00_47
1339
1340.align 16
1341.Lxop_00_47:
147cca8f 1342 add \$`16*2*$SZ`,$Tbl
a8f3b8b5
AP
1343___
1344sub XOP_512_00_47 () {
1345my $j = shift;
1346my $body = shift;
1347my @X = @_;
1348my @insns = (&$body,&$body); # 52 instructions
1349
1350 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1351 eval(shift(@insns));
1352 eval(shift(@insns));
1353 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1354 eval(shift(@insns));
1355 eval(shift(@insns));
1356 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1357 eval(shift(@insns));
1358 eval(shift(@insns));
1359 &vpsrlq ($t0,$t0,$sigma0[2]);
1360 eval(shift(@insns));
1361 eval(shift(@insns));
1362 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1363 eval(shift(@insns));
1364 eval(shift(@insns));
1365 eval(shift(@insns));
1366 eval(shift(@insns));
1367 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1368 eval(shift(@insns));
1369 eval(shift(@insns));
1370 &vpxor ($t0,$t0,$t1);
1371 eval(shift(@insns));
1372 eval(shift(@insns));
1373 eval(shift(@insns));
1374 eval(shift(@insns));
1375 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1376 eval(shift(@insns));
1377 eval(shift(@insns));
1378 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1379 eval(shift(@insns));
1380 eval(shift(@insns));
1381 &vpsrlq ($t2,@X[7],$sigma1[2]);
1382 eval(shift(@insns));
1383 eval(shift(@insns));
1384 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1385 eval(shift(@insns));
1386 eval(shift(@insns));
1387 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1388 eval(shift(@insns));
1389 eval(shift(@insns));
1390 &vpxor ($t3,$t3,$t2);
1391 eval(shift(@insns));
1392 eval(shift(@insns));
1393 eval(shift(@insns));
1394 eval(shift(@insns));
1395 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1396 eval(shift(@insns));
1397 eval(shift(@insns));
1398 eval(shift(@insns));
1399 eval(shift(@insns));
1400 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1401 eval(shift(@insns));
1402 eval(shift(@insns));
1403 eval(shift(@insns));
1404 eval(shift(@insns));
c4558efb 1405 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
a8f3b8b5
AP
1406 foreach (@insns) { eval; } # remaining instructions
1407 &vmovdqa (16*$j."(%rsp)",$t2);
1408}
1409
1410 for ($i=0,$j=0; $j<8; $j++) {
1411 &XOP_512_00_47($j,\&body_00_15,@X);
1412 push(@X,shift(@X)); # rotate(@X)
1413 }
c4558efb 1414 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
a8f3b8b5
AP
1415 &jne (".Lxop_00_47");
1416
1417 for ($i=0; $i<16; ) {
1418 foreach(body_00_15()) { eval; }
1419 }
1420}
1421$code.=<<___;
1422 mov $_ctx,$ctx
1423 mov $a1,$A
1424
1425 add $SZ*0($ctx),$A
1426 lea 16*$SZ($inp),$inp
1427 add $SZ*1($ctx),$B
1428 add $SZ*2($ctx),$C
1429 add $SZ*3($ctx),$D
1430 add $SZ*4($ctx),$E
1431 add $SZ*5($ctx),$F
1432 add $SZ*6($ctx),$G
1433 add $SZ*7($ctx),$H
1434
1435 cmp $_end,$inp
1436
1437 mov $A,$SZ*0($ctx)
1438 mov $B,$SZ*1($ctx)
1439 mov $C,$SZ*2($ctx)
1440 mov $D,$SZ*3($ctx)
1441 mov $E,$SZ*4($ctx)
1442 mov $F,$SZ*5($ctx)
1443 mov $G,$SZ*6($ctx)
1444 mov $H,$SZ*7($ctx)
1445 jb .Lloop_xop
1446
1447 mov $_rsp,%rsi
00678437 1448 vzeroupper
a8f3b8b5
AP
1449___
1450$code.=<<___ if ($win64);
1451 movaps 16*$SZ+32(%rsp),%xmm6
1452 movaps 16*$SZ+48(%rsp),%xmm7
1453 movaps 16*$SZ+64(%rsp),%xmm8
1454 movaps 16*$SZ+80(%rsp),%xmm9
1455___
1456$code.=<<___ if ($win64 && $SZ>4);
1457 movaps 16*$SZ+96(%rsp),%xmm10
1458 movaps 16*$SZ+112(%rsp),%xmm11
1459___
1460$code.=<<___;
1461 mov (%rsi),%r15
1462 mov 8(%rsi),%r14
1463 mov 16(%rsi),%r13
1464 mov 24(%rsi),%r12
1465 mov 32(%rsi),%rbp
1466 mov 40(%rsi),%rbx
1467 lea 48(%rsi),%rsp
1468.Lepilogue_xop:
1469 ret
1470.size ${func}_xop,.-${func}_xop
1471___
1472}
1473######################################################################
1474# AVX+shrd code path
1475#
1476local *ror = sub { &shrd(@_[0],@_) };
1477
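# From here on &ror() is re-routed to shrd: with both operands being
# the same register, "shrd \$n,%reg,%reg" is a rotate right by n, and
# on the AVX-capable Intel cores it was measured to be the faster
# choice [see the (**) footnote in the performance table at the top].
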
1478$code.=<<___;
c4558efb 1479.type ${func}_avx,\@function,3
a8f3b8b5
AP
1480.align 64
1481${func}_avx:
1482.Lavx_shortcut:
1483 push %rbx
1484 push %rbp
1485 push %r12
1486 push %r13
1487 push %r14
1488 push %r15
1489 mov %rsp,%r11 # copy %rsp
1490 shl \$4,%rdx # num*16
1491 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1492 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1493 and \$-64,%rsp # align stack frame
1494 mov $ctx,$_ctx # save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1496 mov %rdx,$_end # save end pointer, "3rd" arg
1497 mov %r11,$_rsp # save copy of %rsp
1498___
1499$code.=<<___ if ($win64);
1500 movaps %xmm6,16*$SZ+32(%rsp)
1501 movaps %xmm7,16*$SZ+48(%rsp)
1502 movaps %xmm8,16*$SZ+64(%rsp)
1503 movaps %xmm9,16*$SZ+80(%rsp)
1504___
1505$code.=<<___ if ($win64 && $SZ>4);
1506 movaps %xmm10,16*$SZ+96(%rsp)
1507 movaps %xmm11,16*$SZ+112(%rsp)
1508___
1509$code.=<<___;
1510.Lprologue_avx:
1511
00678437 1512 vzeroupper
a8f3b8b5
AP
1513 mov $SZ*0($ctx),$A
1514 mov $SZ*1($ctx),$B
1515 mov $SZ*2($ctx),$C
1516 mov $SZ*3($ctx),$D
1517 mov $SZ*4($ctx),$E
1518 mov $SZ*5($ctx),$F
1519 mov $SZ*6($ctx),$G
1520 mov $SZ*7($ctx),$H
1521___
1522 if ($SZ==4) { # SHA256
1523 my @X = map("%xmm$_",(0..3));
1524 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1525
1526$code.=<<___;
c4558efb
AP
1527 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1528 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
1529 jmp .Lloop_avx
1530.align 16
1531.Lloop_avx:
c4558efb 1532 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1533 vmovdqu 0x00($inp),@X[0]
1534 vmovdqu 0x10($inp),@X[1]
1535 vmovdqu 0x20($inp),@X[2]
1536 vmovdqu 0x30($inp),@X[3]
1537 vpshufb $t3,@X[0],@X[0]
1538 lea $TABLE(%rip),$Tbl
1539 vpshufb $t3,@X[1],@X[1]
1540 vpshufb $t3,@X[2],@X[2]
1541 vpaddd 0x00($Tbl),@X[0],$t0
1542 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1543 vpaddd 0x20($Tbl),@X[1],$t1
1544 vpaddd 0x40($Tbl),@X[2],$t2
1545 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1546 vmovdqa $t0,0x00(%rsp)
1547 mov $A,$a1
1548 vmovdqa $t1,0x10(%rsp)
1549 mov $B,$a3
1550 vmovdqa $t2,0x20(%rsp)
1551 xor $C,$a3 # magic
1552 vmovdqa $t3,0x30(%rsp)
1553 mov $E,$a0
1554 jmp .Lavx_00_47
1555
1556.align 16
1557.Lavx_00_47:
147cca8f 1558 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1559___
1560sub Xupdate_256_AVX () {
1561 (
1562 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1563 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1564 '&vpsrld ($t2,$t0,$sigma0[0]);',
1565 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1566 '&vpsrld ($t3,$t0,$sigma0[2])',
1567 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1568 '&vpxor ($t0,$t3,$t2)',
1569 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1570 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1571 '&vpxor ($t0,$t0,$t1)',
1572 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1573 '&vpxor ($t0,$t0,$t2)',
1574 '&vpsrld ($t2,$t3,$sigma1[2]);',
1575 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1576 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1577 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1578 '&vpxor ($t2,$t2,$t3);',
1579 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1580 '&vpxor ($t2,$t2,$t3)',
1581 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1582 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1583 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1584 '&vpsrld ($t2,$t3,$sigma1[2])',
1585 '&vpsrlq ($t3,$t3,$sigma1[0])',
1586 '&vpxor ($t2,$t2,$t3);',
1587 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1588 '&vpxor ($t2,$t2,$t3)',
1589 '&vpshufb ($t2,$t2,$t5)',
1590 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1591 );
1592}
1593
1594sub AVX_256_00_47 () {
1595my $j = shift;
1596my $body = shift;
1597my @X = @_;
1598my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1599
1600 foreach (Xupdate_256_AVX()) { # 29 instructions
1601 eval;
1602 eval(shift(@insns));
1603 eval(shift(@insns));
1604 eval(shift(@insns));
1605 }
c4558efb 1606 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1607 foreach (@insns) { eval; } # remaining instructions
1608 &vmovdqa (16*$j."(%rsp)",$t2);
1609}
1610
1611 for ($i=0,$j=0; $j<4; $j++) {
1612 &AVX_256_00_47($j,\&body_00_15,@X);
1613 push(@X,shift(@X)); # rotate(@X)
1614 }
c4558efb 1615 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1616 &jne (".Lavx_00_47");
1617
1618 for ($i=0; $i<16; ) {
1619 foreach(body_00_15()) { eval; }
1620 }
1621
1622 } else { # SHA512
1623 my @X = map("%xmm$_",(0..7));
1624 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1625
1626$code.=<<___;
1627 jmp .Lloop_avx
1628.align 16
1629.Lloop_avx:
c4558efb 1630 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5 1631 vmovdqu 0x00($inp),@X[0]
c4558efb 1632 lea $TABLE+0x80(%rip),$Tbl # size optimization
a8f3b8b5
AP
1633 vmovdqu 0x10($inp),@X[1]
1634 vmovdqu 0x20($inp),@X[2]
1635 vpshufb $t3,@X[0],@X[0]
1636 vmovdqu 0x30($inp),@X[3]
1637 vpshufb $t3,@X[1],@X[1]
1638 vmovdqu 0x40($inp),@X[4]
1639 vpshufb $t3,@X[2],@X[2]
1640 vmovdqu 0x50($inp),@X[5]
1641 vpshufb $t3,@X[3],@X[3]
1642 vmovdqu 0x60($inp),@X[6]
1643 vpshufb $t3,@X[4],@X[4]
1644 vmovdqu 0x70($inp),@X[7]
1645 vpshufb $t3,@X[5],@X[5]
c4558efb 1646 vpaddq -0x80($Tbl),@X[0],$t0
a8f3b8b5 1647 vpshufb $t3,@X[6],@X[6]
c4558efb 1648 vpaddq -0x60($Tbl),@X[1],$t1
a8f3b8b5 1649 vpshufb $t3,@X[7],@X[7]
c4558efb
AP
1650 vpaddq -0x40($Tbl),@X[2],$t2
1651 vpaddq -0x20($Tbl),@X[3],$t3
a8f3b8b5 1652 vmovdqa $t0,0x00(%rsp)
c4558efb 1653 vpaddq 0x00($Tbl),@X[4],$t0
a8f3b8b5 1654 vmovdqa $t1,0x10(%rsp)
c4558efb 1655 vpaddq 0x20($Tbl),@X[5],$t1
a8f3b8b5 1656 vmovdqa $t2,0x20(%rsp)
c4558efb 1657 vpaddq 0x40($Tbl),@X[6],$t2
a8f3b8b5 1658 vmovdqa $t3,0x30(%rsp)
c4558efb 1659 vpaddq 0x60($Tbl),@X[7],$t3
a8f3b8b5
AP
1660 vmovdqa $t0,0x40(%rsp)
1661 mov $A,$a1
1662 vmovdqa $t1,0x50(%rsp)
1663 mov $B,$a3
1664 vmovdqa $t2,0x60(%rsp)
1665 xor $C,$a3 # magic
1666 vmovdqa $t3,0x70(%rsp)
1667 mov $E,$a0
1668 jmp .Lavx_00_47
1669
1670.align 16
1671.Lavx_00_47:
147cca8f 1672 add \$`16*2*$SZ`,$Tbl
a8f3b8b5
AP
1673___
1674sub Xupdate_512_AVX () {
1675 (
1676 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1677 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
c4558efb
AP
1678 '&vpsrlq ($t2,$t0,$sigma0[0])',
1679 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
a8f3b8b5
AP
1680 '&vpsrlq ($t3,$t0,$sigma0[2])',
1681 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1682 '&vpxor ($t0,$t3,$t2)',
1683 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1684 '&vpxor ($t0,$t0,$t1)',
1685 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1686 '&vpxor ($t0,$t0,$t2)',
1687 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1688 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
c4558efb 1689 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
a8f3b8b5
AP
1690 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1691 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1692 '&vpxor ($t3,$t3,$t2)',
1693 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1694 '&vpxor ($t3,$t3,$t1)',
1695 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1696 '&vpxor ($t3,$t3,$t2)',
1697 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1698 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1699 );
1700}
1701
1702sub AVX_512_00_47 () {
1703my $j = shift;
1704my $body = shift;
1705my @X = @_;
1706my @insns = (&$body,&$body); # 52 instructions
1707
1708 foreach (Xupdate_512_AVX()) { # 23 instructions
1709 eval;
1710 eval(shift(@insns));
1711 eval(shift(@insns));
1712 }
c4558efb 1713 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
a8f3b8b5
AP
1714 foreach (@insns) { eval; } # remaining instructions
1715 &vmovdqa (16*$j."(%rsp)",$t2);
1716}
1717
1718 for ($i=0,$j=0; $j<8; $j++) {
1719 &AVX_512_00_47($j,\&body_00_15,@X);
1720 push(@X,shift(@X)); # rotate(@X)
1721 }
c4558efb 1722 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
a8f3b8b5
AP
1723 &jne (".Lavx_00_47");
1724
1725 for ($i=0; $i<16; ) {
1726 foreach(body_00_15()) { eval; }
1727 }
1728}
1729$code.=<<___;
1730 mov $_ctx,$ctx
1731 mov $a1,$A
1732
1733 add $SZ*0($ctx),$A
1734 lea 16*$SZ($inp),$inp
1735 add $SZ*1($ctx),$B
1736 add $SZ*2($ctx),$C
1737 add $SZ*3($ctx),$D
1738 add $SZ*4($ctx),$E
1739 add $SZ*5($ctx),$F
1740 add $SZ*6($ctx),$G
1741 add $SZ*7($ctx),$H
1742
1743 cmp $_end,$inp
1744
1745 mov $A,$SZ*0($ctx)
1746 mov $B,$SZ*1($ctx)
1747 mov $C,$SZ*2($ctx)
1748 mov $D,$SZ*3($ctx)
1749 mov $E,$SZ*4($ctx)
1750 mov $F,$SZ*5($ctx)
1751 mov $G,$SZ*6($ctx)
1752 mov $H,$SZ*7($ctx)
1753 jb .Lloop_avx
1754
1755 mov $_rsp,%rsi
00678437 1756 vzeroupper
a8f3b8b5
AP
1757___
1758$code.=<<___ if ($win64);
1759 movaps 16*$SZ+32(%rsp),%xmm6
1760 movaps 16*$SZ+48(%rsp),%xmm7
1761 movaps 16*$SZ+64(%rsp),%xmm8
1762 movaps 16*$SZ+80(%rsp),%xmm9
1763___
1764$code.=<<___ if ($win64 && $SZ>4);
1765 movaps 16*$SZ+96(%rsp),%xmm10
1766 movaps 16*$SZ+112(%rsp),%xmm11
1767___
1768$code.=<<___;
1769 mov (%rsi),%r15
1770 mov 8(%rsi),%r14
1771 mov 16(%rsi),%r13
1772 mov 24(%rsi),%r12
1773 mov 32(%rsi),%rbp
1774 mov 40(%rsi),%rbx
1775 lea 48(%rsi),%rsp
1776.Lepilogue_avx:
1777 ret
1778.size ${func}_avx,.-${func}_avx
1779___
c4558efb
AP
1780
if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}
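# The Maj() folding above relies on the identity Maj(a,b,c) = Ch(a^b,c,b) =
# b ^ ((b^c) & (a^b)), which lets each round reuse the b^c value left behind
# in $a3 by the previous round. A scalar sketch of the folded form follows;
# it is illustrative only, never called by the generator, and the sub name
# is hypothetical.
sub _ref_maj {
	my ($a,$b,$c) = @_;
	return $b ^ (($b^$c) & ($a^$b));	# == ($a&$b)^($a&$c)^($b&$c)
}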

$code.=<<___;
.type	${func}_avx2,\@function,3
.align	64
${func}_avx2:
.Lavx2_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:

	vzeroupper
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$SZ*0($ctx),$A
	mov	$inp,%r12		# borrow $T1
	mov	$SZ*1($ctx),$B
	cmp	%rdx,$inp		# $_end
	mov	$SZ*2($ctx),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
	if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd );

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
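# For assemblers that do not know the SHA extension mnemonics, the sub above
# rewrites a two-xmm-operand instruction into its raw encoding (0F 38 /r,
# with the ModR/M byte built from the register numbers). An illustrative
# check, kept commented out so it does not pollute the generated output:
#
#	print sha256op38("sha256msg1","%xmm3,%xmm4"),"\n";
#	# should print ".byte 15,56,204,227" (decimal), i.e. 0x0f,0x38,0xcc
#	# followed by ModR/M 0xe3: mod=11, reg=%xmm4, r/m=%xmm3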

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;