]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/sha256-mb-x86_64.pl
aesv8-armx.pl: inclrease interleave factor.
[thirdparty/openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
CommitLineData
b7838586
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Multi-buffer SHA256 procedure processes n buffers in parallel by
11# placing buffer data to designated lane of SIMD register. n is
12# naturally limited to 4 on pre-AVX2 processors and to 8 on
13# AVX2-capable processors such as Haswell.
14#
61ba602a 15# this +aesni(i) sha256 aesni-sha256 gain(iv)
b7838586 16# -------------------------------------------------------------------
61ba602a 17# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
619b9466 18# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
b7838586
AP
19# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
20# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
61ba602a 21# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
b7838586
AP
22# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
23#
61ba602a
AP
24# (i) multi-block CBC encrypt with 128-bit key;
25# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
b7838586
AP
26# because of lower AES-NI instruction throughput, nor is there
27# AES-NI-SHA256 stitch for these processors;
61ba602a 28# (iii) "this" is for n=8, when we gather twice as much data, result
b7838586 29# for n=4 is 20.3+4.44=24.7;
3847d15d
AP
30# (iv) presented improvement coefficients are asymptotic limits and
31# in real-life application are somewhat lower, e.g. for 2KB
619b9466 32# fragments they range from 75% to 130% (on Haswell);
b7838586
AP
33
# Command-line convention shared by all perlasm scripts: first argument
# is the "flavour" (elf, macosx, mingw64, nasm, masm, ...), second the
# output file; a single argument containing a dot is treated as the
# output file with a default flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for AVX support: $avx=0 emits SSE only, 1 adds
# the AVX code path, 2 additionally enables the AVX2 code path.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Pipe all generated code through the translator.  Check the open:
# writing to an unopened handle would silently discard the output.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
64
# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);	/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# round-constant table pointer

# Transposed working state: one register per SHA-256 state word,
# one lane per 32-bit element.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
# Scratch registers plus the current/next message-schedule words.
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes; reset to 32 for the AVX2 path
87
# Return the memory operand string for message-schedule slot $idx.
# The 16-entry ring buffer is split across two base registers: bytes
# below 256 are addressed off %rax, the rest off %rbx; both are biased
# by -128 so that disp8 addressing covers the window.  The arithmetic
# is left in the operand text for the assembler to fold.
sub Xi_off {
my $idx  = shift;
my $byte = ($idx % 16) * $REG_SZ;

    return $byte < 256 ? sprintf("%d-128(%%rax)",     $byte)
                       : sprintf("%d-256-128(%%rbx)", $byte);
}
94
# Emit one SHA-256 round (shared by rounds 0-63) for the 4-way SSE path.
# For $i<16 the round first gathers message word X[i] from all four
# input streams into the lanes of $Xi; for $i>=16 the caller
# (ROUND_16_XX) has already produced $Xi.  Register arguments rotate
# one position per round, so the same code serves every round index.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

# Gather X[i]: one dword per lane, interleaved into $Xi.
$code.=<<___ if ($i<15);
	movd	`4*$i`(@ptr[0]),$Xi
	movd	`4*$i`(@ptr[1]),$t1
	movd	`4*$i`(@ptr[2]),$t2
	movd	`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Final gather of the 64-byte block: also advance all input pointers.
$code.=<<___ if ($i==15);
	movd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	`4*$i`(@ptr[2]),$t2
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	`4*$i`(@ptr[3]),$t3
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Round body proper: Sigma1(e), Ch(e,f,g), Sigma0(a) and Maj(a,b,c)
# interleaved for instruction-level parallelism.  The pshufb byte-swap
# of the freshly gathered word is slotted on alternate rounds to spread
# the shuffle-port pressure; statement order is deliberate scheduling.
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])"	if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])"	if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	pxor	$axb,$t1			# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb				# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])"	if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])"	if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
# K256 is laid out 8 rounds per 256-byte stride; bump the table pointer
# once every 8 rounds.
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	# Swap the a^b / b^c accumulators for the next round's Maj().
	($axb,$bxc)=($bxc,$axb);
}
185
# Emit one SHA-256 round for i>=16: first extend the message schedule,
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# (expressed below via the ring-buffer offsets Xi_off($i+1), ($i+9),
# ($i+14) relative to the oldest live slot), then fall through to the
# common round body in ROUND_00_15.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	# The freshly loaded next word becomes the current word.
	($Xi,$Xn)=($Xn,$Xi);
}
225
# Emit the SSE entry point sha256_multi_block().  The prologue probes
# OPENSSL_ia32cap_P and dispatches to the SHA-extension or AVX
# implementations when available; otherwise it runs the 4-way SSE path
# generated below.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: 16 message-schedule slots, lane counters, saved %rsp and $num.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: load input pointer and block count, track the maximum count
# in $num, and redirect empty lanes at the (read-only) table so loads
# stay valid.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
# Rounds 0-15 (with message gather), rotating the state registers.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
# Rounds 16-63: 16 schedule-extending rounds, run 3 times.
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
619b9466
AP
{{{
# SHA-extension (SHA-NI) implementation.  Processes two buffers per
# call interleaved at instruction level; $num is doubled because each
# iteration of the outer loop handles a pair of lanes.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: preserve callee-saved xmm6-15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
	xor	$num,$num
___
# Load the two lanes' pointers/counts; empty lanes are pointed at the
# stack so their (discarded) loads stay valid.
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
# Load both contexts and transpose into the ABEF/CDGH layout that
# sha256rnds2 expects.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq	0x00-0x80($ctx),$ABEF0		# A1.A0
	movq	0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq	0x40-0x80($ctx),$CDGH0		# C1.C0
	movq	0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq	0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq	0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq	0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq	0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0		# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0		# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]	# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]	# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0		# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0		# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1		# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1		# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	movdqu		0x20(@ptr[1]),@MSG1[2]
	pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	movdqu		0x30(@ptr[1]),@MSG1[3]
	lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	movdqa		0*16-0x80($Tbl),$TMP1
	pshufb		$TMPx,@MSG1[1]
	paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	pxor		$ABEF1,@MSG1[0]		# black magic
	movdqa		$TMP1,$Wi
	movdqa		$CDGH1,0x70(%rsp)
	sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	pxor		$ABEF1,@MSG1[0]		# black magic
	movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	movdqa		1*16-0x80($Tbl),$TMP1
	paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	pshufb		$TMPx,@MSG1[2]
	prefetcht0	127(@ptr[1])
	sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	movdqa		2*16-0x80($Tbl),$TMP1
	paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	sha256msg1	@MSG1[1],@MSG1[0]
	movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	movdqa		@MSG1[3],$TMPx
	palignr		\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	movdqa		3*16-0x80($Tbl),$TMP1
	paddd		$TMPx,@MSG1[0]
	paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1		# 12-15
	sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	movdqa		@MSG1[0],$TMPx
	palignr		\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[3],@MSG1[2]
___
# Middle rounds 16..51: identical schedule-extend + round pattern,
# with the message registers rotated in Perl between iterations.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	movdqa		$i*16-0x80($Tbl),$TMP1
	paddd		$TMPx,@MSG1[1]
	paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	movdqa		@MSG1[1],$TMPx
	palignr		\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Tail rounds 52-63, folded together with per-lane counter handling.
$code.=<<___;
	movdqa		$TMP0,$Wi
	movdqa		13*16-0x80($Tbl),$TMP1
	paddd		$TMPx,@MSG1[1]
	paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1		# 52-55
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	movdqa		@MSG1[1],$TMPx
	palignr		\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	movdqa		14*16-0x80($Tbl),$TMP1
	paddd		$TMPx,@MSG1[2]
	paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	movdqa		$TMP1,$Wi
	mov		\$1,%ecx
	pxor		@MSG0[1],@MSG0[1]	# zero
	sha256rnds2	$ABEF1,$CDGH1		# 56-59
	sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	movq		(%rbx),@MSG0[2]		# pull counters
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	movdqa		15*16-0x80($Tbl),$TMP1
	paddd		@MSG1[2],$TMP1
	sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	cmp		4*0(%rbx),%ecx		# examine counters
	cmovge		%rsp,@ptr[0]		# cancel input
	cmp		4*1(%rbx),%ecx
	cmovge		%rsp,@ptr[1]
	pshufd		\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	movdqa		$TMP1,$Wi
	pshufd		\$0x55,@MSG0[2],@MSG1[1]
	movdqa		@MSG0[2],@MSG1[2]
	sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	pcmpgtd		@MSG0[1],@MSG1[0]
	pcmpgtd		@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd		\$0x0e,$TMP1,$Wi
	pcmpgtd		@MSG0[1],@MSG1[2]	# counter mask
	movdqa		K256_shaext-0x10(%rip),$TMPx
	sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0		# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]		# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0		# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]		# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)	# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)	# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)	# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)	# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
b7838586
AP
756 if ($avx) {{{
# AVX/AVX2 variant of ROUND_00_15.  Same round logic, but three-operand
# vex encoding removes most register-copy instructions.  Serves both
# the 4-way xmm path ($REG_SZ==16) and the 8-way ymm path ($REG_SZ==32);
# the four gather templates below differ only in lane count and whether
# the input pointers are advanced ($i==15, end of block).
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[1]),$t1
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[4]),$t1
	vmovd	`4*$i`(@ptr[1]),$t2
	vmovd	`4*$i`(@ptr[5]),$t3
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[4]),$t1
	lea	`16*4`(@ptr[4]),@ptr[4]
	vmovd	`4*$i`(@ptr[1]),$t2
	lea	`16*4`(@ptr[1]),@ptr[1]
	vmovd	`4*$i`(@ptr[5]),$t3
	lea	`16*4`(@ptr[5]),@ptr[5]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea	`16*4`(@ptr[6]),@ptr[6]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea	`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
# Round body: Sigma1(e), Ch(e,f,g), Sigma0(a), Maj(a,b,c), interleaved
# with next-block prefetches on the final round of a block.
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])"	if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb			# borrow $axb
	`"prefetcht0	63(@ptr[1])"	if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	`"prefetcht0	63(@ptr[2])"	if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	vpxor	$a,$b,$axb			# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])"	if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])"	if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])"	if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])"	if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d			# d+=Xi
	`"prefetcht0	63(@ptr[7])"	if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
# Advance the round-constant pointer once every 8 rounds.
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	# Swap the a^b / b^c accumulators for the next round's Maj().
	($axb,$bxc)=($bxc,$axb);
}
873
# AVX/AVX2 variant of ROUND_16_XX: extend the message schedule with
# sigma0/sigma1 and then emit the common round body.  Offsets mirror
# the SSE version (ring-buffer slots $i+1, $i+9, $i+14).
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	# The freshly loaded next word becomes the current word.
	($Xi,$Xn)=($Xn,$Xi);
}
907
# Emit the 4-way AVX entry point sha256_multi_block_avx().  When the
# script was configured with $avx>1 the prologue additionally routes
# capable CPUs (and $num>=2) to the AVX2 8-way implementation.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: preserve callee-saved xmm6-15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: pointer/count load, maximum tracking, empty-lane redirect.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0-15 with message gather, then 3x16 schedule-extended rounds.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Masked state update: finished lanes contribute nothing; decrement the
# per-lane block counters via the comparison mask.
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
1083 if ($avx>1) {
1084$code =~ s/\`([^\`]*)\`/eval $1/gem;
1085
1086$REG_SZ=32;
1087@ptr=map("%r$_",(12..15,8..11));
1088
1089@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1090($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1091
1092$code.=<<___;
1093.type sha256_multi_block_avx2,\@function,3
1094.align 32
1095sha256_multi_block_avx2:
1096_avx2_shortcut:
1097 mov %rsp,%rax
1098 push %rbx
1099 push %rbp
1100 push %r12
1101 push %r13
1102 push %r14
1103 push %r15
1104___
# Body of sha256_multi_block_avx2 (prologue GPR pushes emitted above):
# spill xmm6-xmm15 on Win64, align the stack, then process up to 8
# buffers in parallel, one buffer per ymm lane ($REG_SZ==32 here).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Gather the 8 (pointer, block-count) descriptor pairs; lanes with a
# zero count have their input pointer redirected at $Tbl ("cancelled").
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0-15 consume message words; rounds 16-63 (3 x 16) run the
# message schedule in-line.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
1263 } }}}
# Emit the cache-line-aligned K256 constant block that the SIMD code
# paths address as K256+128(%rip); rows are generated by TABLE below.
$code.=<<___;
.align	256
K256:
___
# TABLE(@values): for each 32-bit constant, append two ".long" rows to
# the global $code, replicating the value across all 8 SIMD lanes so a
# single vmovdqa/vpaddd broadcasts it to every buffer being hashed.
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
# The 64 SHA-256 round constants (FIPS 180-4), emitted lane-replicated.
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Byte-swap shuffle mask (big-endian word load) plus a plain, non-lane-
# replicated K256 copy for the SHA-NI (shaext) code path.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler unwinds the SSE/AVX entry points ($REG_SZ==16): it pulls
# the saved original %rsp from 16*17(%rsp), restores the two GPRs and
# the ten spilled xmm registers into *context, then resumes the search.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler: same scheme for the AVX2 entry point ($REG_SZ==32),
# which additionally saves r12-r15.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	# was `32*17`($context) — the saved %rsp lives on the frame
	# (32*17 off the stack pointer just pulled into %rax), not
	# inside the CONTEXT record; cf. se_handler above.
	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# SEH function tables: map each entry point's range to its handler.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue		# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx	# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2	# HandlerData[]
___
}
####################################################################

# rex(\@opcode,$dst,$src): prepend a REX prefix byte to @opcode when
# either operand is an extended register (xmm8-xmm15); no-op otherwise.
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

# sha256op38($instr,$operands): hand-encode the SHA-NI 0F 38 xx
# instructions as ".byte" sequences for assemblers that predate them;
# anything unrecognized is passed through verbatim.
sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
  		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
# Post-process the accumulated assembly line by line: evaluate the
# `...` arithmetic placeholders, byte-encode SHA-NI mnemonics, and for
# the SSE/AVX (xmm-width) flavours rewrite ymm references the shared
# code templates produced down to their xmm forms; then print.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;