# Source: crypto/sha/asm/sha256-mb-x86_64.pl (OpenSSL, thirdparty mirror)
# Commit context: "RT4070: Improve struct/union regexp"
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

# Command-line arguments: assembler "flavour" (elf/macosx/mingw64/nasm/...)
# and/or output file name, in the usual perlasm convention.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for AVX/AVX2 support; $avx ends up 0, 1 or 2.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe all generated code through the perlasm translator.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

# SHA256 state lives in xmm8-15, scratch in xmm0-7.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes; becomes 32 for AVX2 path

# Map message-schedule index to its stack spill slot. Indices are taken
# mod 16 (the schedule is kept as a 16-entry ring); the first 256 bytes
# are addressed off %rax-128, the remainder off %rbx-128.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# Emit one SSE round (rounds 0..15 gather fresh input from the four
# lanes; round 15 also advances the input pointers and issues prefetch).
# Appends to $code; rotates ($axb,$bxc) to track the a^b/b^c recurrence.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd	`4*$i`(@ptr[0]),$Xi
	movd	`4*$i`(@ptr[1]),$t1
	movd	`4*$i`(@ptr[2]),$t2
	movd	`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	`4*$i`(@ptr[2]),$t2
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	`4*$i`(@ptr[3]),$t3
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	pxor	$axb,$t1			# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb				# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

# Emit one SSE round for rounds 16..63: update the message schedule
# (Xi += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])), then delegate the
# round proper to ROUND_00_15. Swaps ($Xi,$Xn) for the pipelined load.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

# Generate the baseline SSE entry point sha256_multi_block: dispatch to
# SHA-extension/AVX variants when available, then process up to four
# buffers in parallel, one 32-bit lane per buffer.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___

408 {{{
409my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
410my @MSG0=map("%xmm$_",(4..7));
411my @MSG1=map("%xmm$_",(8..11));
412
413$code.=<<___;
414.type sha256_multi_block_shaext,\@function,3
415.align 32
416sha256_multi_block_shaext:
417_shaext_shortcut:
418 mov %rsp,%rax
419 push %rbx
420 push %rbp
421___
422$code.=<<___ if ($win64);
423 lea -0xa8(%rsp),%rsp
424 movaps %xmm6,(%rsp)
425 movaps %xmm7,0x10(%rsp)
426 movaps %xmm8,0x20(%rsp)
427 movaps %xmm9,0x30(%rsp)
428 movaps %xmm10,-0x78(%rax)
429 movaps %xmm11,-0x68(%rax)
430 movaps %xmm12,-0x58(%rax)
431 movaps %xmm13,-0x48(%rax)
432 movaps %xmm14,-0x38(%rax)
433 movaps %xmm15,-0x28(%rax)
434___
435$code.=<<___;
436 sub \$`$REG_SZ*18`,%rsp
437 shl \$1,$num # we process pair at a time
438 and \$-256,%rsp
439 lea 0x80($ctx),$ctx # size optimization
440 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
441.Lbody_shaext:
442 lea `$REG_SZ*16`(%rsp),%rbx
443 lea K256_shaext+0x80(%rip),$Tbl
444
445.Loop_grande_shaext:
446 mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
447 xor $num,$num
448___
449for($i=0;$i<2;$i++) {
450 $code.=<<___;
451 mov `16*$i+0`($inp),@ptr[$i] # input pointer
452 mov `16*$i+8`($inp),%ecx # number of blocks
453 cmp $num,%ecx
454 cmovg %ecx,$num # find maximum
455 test %ecx,%ecx
456 mov %ecx,`4*$i`(%rbx) # initialize counters
457 cmovle %rsp,@ptr[$i] # cancel input
458___
459}
460$code.=<<___;
461 test $num,$num
462 jz .Ldone_shaext
463
464 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
465 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
466 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
467 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
468 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
469 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
470 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
471 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
472
473 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
474 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
475 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
476 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
477 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
478
479 movdqa $ABEF0,$ABEF1
480 movdqa $CDGH0,$CDGH1
481 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
482 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
483 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
484 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
485
486 pshufd \$0b00011011,$ABEF0,$ABEF0
487 pshufd \$0b00011011,$CDGH0,$CDGH0
488 pshufd \$0b00011011,$ABEF1,$ABEF1
489 pshufd \$0b00011011,$CDGH1,$CDGH1
490 jmp .Loop_shaext
491
492.align 32
493.Loop_shaext:
494 movdqu 0x00(@ptr[0]),@MSG0[0]
495 movdqu 0x00(@ptr[1]),@MSG1[0]
496 movdqu 0x10(@ptr[0]),@MSG0[1]
497 movdqu 0x10(@ptr[1]),@MSG1[1]
498 movdqu 0x20(@ptr[0]),@MSG0[2]
499 pshufb $TMPx,@MSG0[0]
500 movdqu 0x20(@ptr[1]),@MSG1[2]
501 pshufb $TMPx,@MSG1[0]
502 movdqu 0x30(@ptr[0]),@MSG0[3]
503 lea 0x40(@ptr[0]),@ptr[0]
504 movdqu 0x30(@ptr[1]),@MSG1[3]
505 lea 0x40(@ptr[1]),@ptr[1]
506
507 movdqa 0*16-0x80($Tbl),$Wi
508 pshufb $TMPx,@MSG0[1]
509 paddd @MSG0[0],$Wi
510 pxor $ABEF0,@MSG0[0] # black magic
511 movdqa $Wi,$TMP0
512 movdqa 0*16-0x80($Tbl),$TMP1
513 pshufb $TMPx,@MSG1[1]
514 paddd @MSG1[0],$TMP1
515 movdqa $CDGH0,0x50(%rsp) # offload
516 sha256rnds2 $ABEF0,$CDGH0 # 0-3
517 pxor $ABEF1,@MSG1[0] # black magic
518 movdqa $TMP1,$Wi
519 movdqa $CDGH1,0x70(%rsp)
520 sha256rnds2 $ABEF1,$CDGH1 # 0-3
521 pshufd \$0x0e,$TMP0,$Wi
522 pxor $ABEF0,@MSG0[0] # black magic
523 movdqa $ABEF0,0x40(%rsp) # offload
524 sha256rnds2 $CDGH0,$ABEF0
525 pshufd \$0x0e,$TMP1,$Wi
526 pxor $ABEF1,@MSG1[0] # black magic
527 movdqa $ABEF1,0x60(%rsp)
528 movdqa 1*16-0x80($Tbl),$TMP0
529 paddd @MSG0[1],$TMP0
530 pshufb $TMPx,@MSG0[2]
531 sha256rnds2 $CDGH1,$ABEF1
532
533 movdqa $TMP0,$Wi
534 movdqa 1*16-0x80($Tbl),$TMP1
535 paddd @MSG1[1],$TMP1
536 sha256rnds2 $ABEF0,$CDGH0 # 4-7
537 movdqa $TMP1,$Wi
538 prefetcht0 127(@ptr[0])
539 pshufb $TMPx,@MSG0[3]
540 pshufb $TMPx,@MSG1[2]
541 prefetcht0 127(@ptr[1])
542 sha256rnds2 $ABEF1,$CDGH1 # 4-7
543 pshufd \$0x0e,$TMP0,$Wi
544 pshufb $TMPx,@MSG1[3]
545 sha256msg1 @MSG0[1],@MSG0[0]
546 sha256rnds2 $CDGH0,$ABEF0
547 pshufd \$0x0e,$TMP1,$Wi
548 movdqa 2*16-0x80($Tbl),$TMP0
549 paddd @MSG0[2],$TMP0
550 sha256rnds2 $CDGH1,$ABEF1
551
552 movdqa $TMP0,$Wi
553 movdqa 2*16-0x80($Tbl),$TMP1
554 paddd @MSG1[2],$TMP1
555 sha256rnds2 $ABEF0,$CDGH0 # 8-11
556 sha256msg1 @MSG1[1],@MSG1[0]
557 movdqa $TMP1,$Wi
558 movdqa @MSG0[3],$TMPx
559 sha256rnds2 $ABEF1,$CDGH1 # 8-11
560 pshufd \$0x0e,$TMP0,$Wi
561 palignr \$4,@MSG0[2],$TMPx
562 paddd $TMPx,@MSG0[0]
563 movdqa @MSG1[3],$TMPx
564 palignr \$4,@MSG1[2],$TMPx
565 sha256msg1 @MSG0[2],@MSG0[1]
566 sha256rnds2 $CDGH0,$ABEF0
567 pshufd \$0x0e,$TMP1,$Wi
568 movdqa 3*16-0x80($Tbl),$TMP0
569 paddd @MSG0[3],$TMP0
570 sha256rnds2 $CDGH1,$ABEF1
571 sha256msg1 @MSG1[2],@MSG1[1]
572
573 movdqa $TMP0,$Wi
574 movdqa 3*16-0x80($Tbl),$TMP1
575 paddd $TMPx,@MSG1[0]
576 paddd @MSG1[3],$TMP1
577 sha256msg2 @MSG0[3],@MSG0[0]
578 sha256rnds2 $ABEF0,$CDGH0 # 12-15
579 movdqa $TMP1,$Wi
580 movdqa @MSG0[0],$TMPx
581 palignr \$4,@MSG0[3],$TMPx
582 sha256rnds2 $ABEF1,$CDGH1 # 12-15
583 sha256msg2 @MSG1[3],@MSG1[0]
584 pshufd \$0x0e,$TMP0,$Wi
585 paddd $TMPx,@MSG0[1]
586 movdqa @MSG1[0],$TMPx
587 palignr \$4,@MSG1[3],$TMPx
588 sha256msg1 @MSG0[3],@MSG0[2]
589 sha256rnds2 $CDGH0,$ABEF0
590 pshufd \$0x0e,$TMP1,$Wi
591 movdqa 4*16-0x80($Tbl),$TMP0
592 paddd @MSG0[0],$TMP0
593 sha256rnds2 $CDGH1,$ABEF1
594 sha256msg1 @MSG1[3],@MSG1[2]
595___
596for($i=4;$i<16-3;$i++) {
597$code.=<<___;
598 movdqa $TMP0,$Wi
599 movdqa $i*16-0x80($Tbl),$TMP1
600 paddd $TMPx,@MSG1[1]
601 paddd @MSG1[0],$TMP1
602 sha256msg2 @MSG0[0],@MSG0[1]
603 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
604 movdqa $TMP1,$Wi
605 movdqa @MSG0[1],$TMPx
606 palignr \$4,@MSG0[0],$TMPx
607 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
608 sha256msg2 @MSG1[0],@MSG1[1]
609 pshufd \$0x0e,$TMP0,$Wi
610 paddd $TMPx,@MSG0[2]
611 movdqa @MSG1[1],$TMPx
612 palignr \$4,@MSG1[0],$TMPx
613 sha256msg1 @MSG0[0],@MSG0[3]
614 sha256rnds2 $CDGH0,$ABEF0
615 pshufd \$0x0e,$TMP1,$Wi
616 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
617 paddd @MSG0[1],$TMP0
618 sha256rnds2 $CDGH1,$ABEF1
619 sha256msg1 @MSG1[0],@MSG1[3]
620___
621 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
622}
623$code.=<<___;
624 movdqa $TMP0,$Wi
625 movdqa 13*16-0x80($Tbl),$TMP1
626 paddd $TMPx,@MSG1[1]
627 paddd @MSG1[0],$TMP1
628 sha256msg2 @MSG0[0],@MSG0[1]
629 sha256rnds2 $ABEF0,$CDGH0 # 52-55
630 movdqa $TMP1,$Wi
631 movdqa @MSG0[1],$TMPx
632 palignr \$4,@MSG0[0],$TMPx
633 sha256rnds2 $ABEF1,$CDGH1 # 52-55
634 sha256msg2 @MSG1[0],@MSG1[1]
635 pshufd \$0x0e,$TMP0,$Wi
636 paddd $TMPx,@MSG0[2]
637 movdqa @MSG1[1],$TMPx
638 palignr \$4,@MSG1[0],$TMPx
639 nop
640 sha256rnds2 $CDGH0,$ABEF0
641 pshufd \$0x0e,$TMP1,$Wi
642 movdqa 14*16-0x80($Tbl),$TMP0
643 paddd @MSG0[1],$TMP0
644 sha256rnds2 $CDGH1,$ABEF1
645
646 movdqa $TMP0,$Wi
647 movdqa 14*16-0x80($Tbl),$TMP1
648 paddd $TMPx,@MSG1[2]
649 paddd @MSG1[1],$TMP1
650 sha256msg2 @MSG0[1],@MSG0[2]
651 nop
652 sha256rnds2 $ABEF0,$CDGH0 # 56-59
653 movdqa $TMP1,$Wi
654 mov \$1,%ecx
655 pxor @MSG0[1],@MSG0[1] # zero
656 sha256rnds2 $ABEF1,$CDGH1 # 56-59
657 sha256msg2 @MSG1[1],@MSG1[2]
658 pshufd \$0x0e,$TMP0,$Wi
659 movdqa 15*16-0x80($Tbl),$TMP0
660 paddd @MSG0[2],$TMP0
661 movq (%rbx),@MSG0[2] # pull counters
662 nop
663 sha256rnds2 $CDGH0,$ABEF0
664 pshufd \$0x0e,$TMP1,$Wi
665 movdqa 15*16-0x80($Tbl),$TMP1
666 paddd @MSG1[2],$TMP1
667 sha256rnds2 $CDGH1,$ABEF1
668
669 movdqa $TMP0,$Wi
670 cmp 4*0(%rbx),%ecx # examine counters
671 cmovge %rsp,@ptr[0] # cancel input
672 cmp 4*1(%rbx),%ecx
673 cmovge %rsp,@ptr[1]
674 pshufd \$0x00,@MSG0[2],@MSG1[0]
675 sha256rnds2 $ABEF0,$CDGH0 # 60-63
676 movdqa $TMP1,$Wi
677 pshufd \$0x55,@MSG0[2],@MSG1[1]
678 movdqa @MSG0[2],@MSG1[2]
679 sha256rnds2 $ABEF1,$CDGH1 # 60-63
680 pshufd \$0x0e,$TMP0,$Wi
681 pcmpgtd @MSG0[1],@MSG1[0]
682 pcmpgtd @MSG0[1],@MSG1[1]
683 sha256rnds2 $CDGH0,$ABEF0
684 pshufd \$0x0e,$TMP1,$Wi
685 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
686 movdqa K256_shaext-0x10(%rip),$TMPx
687 sha256rnds2 $CDGH1,$ABEF1
688
689 pand @MSG1[0],$CDGH0
690 pand @MSG1[1],$CDGH1
691 pand @MSG1[0],$ABEF0
692 pand @MSG1[1],$ABEF1
693 paddd @MSG0[2],@MSG1[2] # counters--
694
695 paddd 0x50(%rsp),$CDGH0
696 paddd 0x70(%rsp),$CDGH1
697 paddd 0x40(%rsp),$ABEF0
698 paddd 0x60(%rsp),$ABEF1
699
700 movq @MSG1[2],(%rbx) # save counters
701 dec $num
702 jnz .Loop_shaext
703
704 mov `$REG_SZ*17+8`(%rsp),$num
705
706 pshufd \$0b00011011,$ABEF0,$ABEF0
707 pshufd \$0b00011011,$CDGH0,$CDGH0
708 pshufd \$0b00011011,$ABEF1,$ABEF1
709 pshufd \$0b00011011,$CDGH1,$CDGH1
710
711 movdqa $ABEF0,@MSG0[0]
712 movdqa $CDGH0,@MSG0[1]
713 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
714 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
715 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
716 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
717
718 movq $ABEF0,0x00-0x80($ctx) # A1.A0
719 psrldq \$8,$ABEF0
720 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
721 psrldq \$8,@MSG0[0]
722 movq $ABEF0,0x20-0x80($ctx) # B1.B0
723 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
724
725 movq $CDGH0,0x40-0x80($ctx) # C1.C0
726 psrldq \$8,$CDGH0
727 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
728 psrldq \$8,@MSG0[1]
729 movq $CDGH0,0x60-0x80($ctx) # D1.D0
730 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
731
732 lea `$REG_SZ/2`($ctx),$ctx
733 lea `16*2`($inp),$inp
734 dec $num
735 jnz .Loop_grande_shaext
736
737.Ldone_shaext:
738 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
739___
740$code.=<<___ if ($win64);
741 movaps -0xb8(%rax),%xmm6
742 movaps -0xa8(%rax),%xmm7
743 movaps -0x98(%rax),%xmm8
744 movaps -0x88(%rax),%xmm9
745 movaps -0x78(%rax),%xmm10
746 movaps -0x68(%rax),%xmm11
747 movaps -0x58(%rax),%xmm12
748 movaps -0x48(%rax),%xmm13
749 movaps -0x38(%rax),%xmm14
750 movaps -0x28(%rax),%xmm15
751___
752$code.=<<___;
753 mov -16(%rax),%rbp
754 mov -8(%rax),%rbx
755 lea (%rax),%rsp
756.Lepilogue_shaext:
757 ret
758.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
759___
760 }}}
b7838586
AP
761 if ($avx) {{{
# Emit one AVX/AVX2 round. $REG_SZ selects the gather width: 16 bytes
# (xmm, 4 lanes) vs 32 bytes (ymm, 8 lanes via vinserti128). Rotates
# ($axb,$bxc) to carry the a^b/b^c recurrence between rounds.
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb			# borrow $axb
	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	vpxor	$a,$b,$axb			# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d			# d+=Xi
	`"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

# AVX counterpart of ROUND_16_XX: extend the message schedule with
# sigma0/sigma1 using three-operand AVX shifts/xors, then emit the
# round via ROUND_00_15_avx. Swaps ($Xi,$Xn) for the pipelined load.
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

# Generate the AVX (xmm, 4-lane) entry point sha256_multi_block_avx;
# when $avx>1 it first tests for AVX2 and falls through to the ymm
# variant for num>=2.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
1088 if ($avx>1) {
1089$code =~ s/\`([^\`]*)\`/eval $1/gem;
1090
1091$REG_SZ=32;
1092@ptr=map("%r$_",(12..15,8..11));
1093
1094@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1095($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1096
1097$code.=<<___;
1098.type sha256_multi_block_avx2,\@function,3
1099.align 32
1100sha256_multi_block_avx2:
1101_avx2_shortcut:
1102 mov %rsp,%rax
1103 push %rbx
1104 push %rbp
1105 push %r12
1106 push %r13
1107 push %r14
1108 push %r15
1109___
1110$code.=<<___ if ($win64);
1111 lea -0xa8(%rsp),%rsp
1112 movaps %xmm6,(%rsp)
1113 movaps %xmm7,0x10(%rsp)
1114 movaps %xmm8,0x20(%rsp)
1115 movaps %xmm9,0x30(%rsp)
1116 movaps %xmm10,0x40(%rsp)
1117 movaps %xmm11,0x50(%rsp)
1118 movaps %xmm12,-0x78(%rax)
1119 movaps %xmm13,-0x68(%rax)
1120 movaps %xmm14,-0x58(%rax)
1121 movaps %xmm15,-0x48(%rax)
1122___
1123$code.=<<___;
1124 sub \$`$REG_SZ*18`, %rsp
1125 and \$-256,%rsp
1126 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
619b9466 1127.Lbody_avx2:
b7838586
AP
1128 lea K256+128(%rip),$Tbl
1129 lea 0x80($ctx),$ctx # size optimization
1130
1131.Loop_grande_avx2:
1132 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1133 xor $num,$num
1134 lea `$REG_SZ*16`(%rsp),%rbx
1135___
1136for($i=0;$i<8;$i++) {
1137 $code.=<<___;
1138 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1139 mov `16*$i+8`($inp),%ecx # number of blocks
1140 cmp $num,%ecx
1141 cmovg %ecx,$num # find maximum
1142 test %ecx,%ecx
1143 mov %ecx,`4*$i`(%rbx) # initialize counters
1144 cmovle $Tbl,@ptr[$i] # cancel input
1145___
1146}
1147$code.=<<___;
1148 vmovdqu 0x00-0x80($ctx),$A # load context
1149 lea 128(%rsp),%rax
1150 vmovdqu 0x20-0x80($ctx),$B
1151 lea 256+128(%rsp),%rbx
1152 vmovdqu 0x40-0x80($ctx),$C
1153 vmovdqu 0x60-0x80($ctx),$D
1154 vmovdqu 0x80-0x80($ctx),$E
1155 vmovdqu 0xa0-0x80($ctx),$F
1156 vmovdqu 0xc0-0x80($ctx),$G
1157 vmovdqu 0xe0-0x80($ctx),$H
1158 vmovdqu .Lpbswap(%rip),$Xn
1159 jmp .Loop_avx2
1160
1161.align 32
1162.Loop_avx2:
1163 vpxor $B,$C,$bxc # magic seed
1164___
# Rounds 0..15 via ROUND_00_15_avx (defined earlier in the file); rotating
# @V keeps each round's outputs lined up with the next round's inputs.
for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
# Three passes (%ecx=3) of 16 message-scheduled rounds = 64 rounds total.
for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
# Re-point any lane whose block counter has fallen below 1 at the constant
# table (same trick as the prologue) before the next block iteration.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx	# examine counters
	cmovge	$Tbl,@ptr[$i]		# cancel input
___
}
# End-of-block bookkeeping: build a -1 mask for lanes whose counter is
# still positive (vpcmpgtd) and use it both to decrement the counters
# (vpaddd of the mask) and to zero A..H in exhausted lanes before the
# feed-forward add -- so a finished lane's stored digest is rewritten
# unchanged.  The commented-out tail is the 4-way outer-loop advance,
# unused in this 8-lane path.
$code.=<<___;
	vmovdqa	(%rbx),$sigma		# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma	# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)		# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax	# orignal %rsp
	vzeroupper
___
# Epilogue: restore the Win64 callee-saved XMM registers and the GPRs
# relative to the saved unaligned %rsp (%rax), then return.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
1268 } }}}
# Open the K256 round-constant table (filled in by TABLE below), aligned
# to 256 bytes.  The code above addresses it as K256+128, presumably to
# keep displacement encodings short -- TODO confirm.
$code.=<<___;
.align	256
K256:
___
# Append one table row per argument: each 32-bit round constant is
# replicated eight times (two .long quads) so every SIMD lane reads the
# same value.  Note the constants interpolate through $_ as decimal
# numbers in the emitted assembly.
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
# Emit the 64 SHA-256 round constants (FIPS 180-4) into K256, eight
# replicated copies each.  Plain call syntax: the legacy &-sigil form is
# unnecessary here and is best avoided (with no parens it would silently
# re-pass the caller's @_).
TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Tail of the constant pool: the byte-swap shuffle pattern (.Lpbswap,
# 16 bytes replicated so a full %ymm load works), a non-replicated copy
# of K256 (K256_shaext, used by the SHA-extension code path defined
# earlier, out of view), and the identification string.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1320
619b9466
AP
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler unwinds the SSE/AVX (16-byte lane) entry points: outside the
# body nothing is on the stack yet; inside, the original %rsp sits in
# slot 17 (16*17) of the aligned frame and %rbx/%rbp plus xmm6-15 must be
# recovered from it.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler: same job for the 8-way AVX2 frame ($REG_SZ==32), which
# additionally preserves %r12-%r15.
# FIX: the saved stack pointer lives at 32*17 off the fault-time stack
# pointer (already loaded into %rax from context->Rsp above), NOT off the
# CONTEXT record -- `32*17`($context) previously read from inside the
# CONTEXT XMM save area.  This mirrors se_handler's `16*17`(%rax).
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# Function table (.pdata) and unwind info (.xdata): one begin/end/info
# triple per emitted entry point, each info record naming its handler and
# body/epilogue labels as HandlerData.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue		# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx	# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2	# HandlerData[]
___
}
1516####################################################################
1517
# Prepend a REX prefix byte to @$opcode when either register operand is
# %xmm8..%xmm15.  $dst contributes REX.R (0x04), $src REX.B (0x01); the
# 0x40 base is added only when a prefix is needed at all.
# Takes an array reference (callers pass \@opcode) instead of the old
# `local *opcode` typeglob aliasing -- same effect, no glob manipulation.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);	# REX.R
    $rex |= 0x01 if ($src >= 8);	# REX.B
    unshift @$opcode, $rex | 0x40 if ($rex);
}
1527
# Assemble a SHA-NI instruction ($instr, operand string $args) into an
# explicit ".byte" sequence (0F 38 /opcode + ModR/M, with REX via rex())
# for register-register forms, so the output assembles even on tools
# without SHA extension mnemonics.  Anything unrecognized -- including
# memory-operand forms -- is passed through verbatim.
# Fix: the argument was read as @_[0], a one-element slice of @_ ($_[0]
# was meant; warns under -w); arguments are now unpacked explicitly.
sub sha256op38 {
    my ($instr, $args) = @_;
    my %opcodelet = (
	"sha256rnds2" => 0xcb,
	"sha256msg1"  => 0xcc,
	"sha256msg2"  => 0xcd );

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode = (0x0f, 0x38);
	rex(\@opcode, $2, $1);			# REX from (dst, src)
	push @opcode, $opcodelet{$instr};
	push @opcode, 0xc0 | ($1 & 7) | (($2 & 7) << 3);	# ModR/M
	return ".byte\t" . join(',', @opcode);	# bytes print in decimal
    } else {
	return $instr . "\t" . $args;
    }
}
1545
b7838586
AP
# Final pass over the accumulated assembly, line by line:
#  - evaluate `...` spans as compile-time Perl arithmetic (offsets etc.);
#  - turn sha256* mnemonics into explicit .byte encodings via sha256op38
#    (presumably for assemblers lacking SHA-NI mnemonics -- TODO confirm);
#  - rewrite %ymm operands to the %xmm forms these instructions actually
#    take (vmovd/q, vpinsr/vpextr, vinserti128, vpbroadcast).
# The chained `or`s make the rewrites mutually exclusive per line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}
1560
# Flush the generated assembly; an error on close (e.g. a full disk or a
# broken pipe to the assembler) must not be silently ignored.
close STDOUT or die "error closing STDOUT: $!";