#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data into designated lanes of SIMD registers. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
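# A hedged call-site sketch in C (illustrative only: the typedef names
# mirror the multi-block caller in OpenSSL's e_aes_cbc_hmac_sha256.c,
# the rest is assumed).  Each lane of ctx must be seeded with the SHA256
# IVs and each buffer pre-padded to whole 64-byte blocks beforehand:
#
#	typedef struct { unsigned int A[8],B[8],C[8],D[8],
#			 E[8],F[8],G[8],H[8];	} SHA256_MB_CTX;
#	typedef struct { const void *ptr; int blocks; } HASH_DESC;
#
#	SHA256_MB_CTX ctx;		/* lane i: A[i]=0x6a09e667, ...  */
#	HASH_DESC hash_d[8] = {0};	/* unused lanes keep blocks==0   */
#	for (i = 0; i < n; i++) {	/* n <= 8 buffers                */
#		hash_d[i].ptr    = inputs[i];
#		hash_d[i].blocks = lengths[i] / 64;
#	}
#	sha256_multi_block(&ctx, hash_d, n <= 4 ? 1 : 2);
#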
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
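# The message schedule lives in a 16-slot ring of $REG_SZ-byte entries:
# Xi_off($i) reduces $i mod 16 and returns an %rax-relative operand for
# the first 256 bytes and an %rbx-relative one beyond that, both biased
# by -128 (%rax is pointed at 128(%rsp) below) so that displacements fit
# in a signed byte.  E.g. with $REG_SZ==16, Xi_off(3) is "48-128(%rax)"
# and Xi_off(19) wraps around to the same slot.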

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd	`4*$i`(@ptr[0]),$Xi
	movd	`4*$i`(@ptr[1]),$t1
	movd	`4*$i`(@ptr[2]),$t2
	movd	`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	`4*$i`(@ptr[2]),$t2
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	`4*$i`(@ptr[3]),$t3
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])"	if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb			# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])"	if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma		# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi		# Xi+=Sigma1(e)
	pxor	$axb,$t1		# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb			# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])"	if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi			# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])"	if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d			# d+=Xi
	pxor	$t3,$sigma		# Sigma0(a)

	paddd	$Xi,$h			# h+=Xi
	paddd	$sigma,$h		# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
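
# A quick check of the Maj short-cut used above (a bitwise identity,
# independent of this code): Ch(x,c,b) selects c where x=1 and b where
# x=0, so with x=a^b it yields c wherever a!=b and b(==a) wherever a==b,
# which is exactly Maj(a,b,c).  The kernel keeps b^c from the previous
# round in $bxc and computes h = b ^ ((a^b)&(b^c)), the same function.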

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi	# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb		# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma		# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi		# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1			# sigma1(X[i+14])
	paddd	$t1,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
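
# SSE2 has no vector rotate, so each rotr is synthesized from a shift
# pair, e.g. rotr(x,7) = (x>>7)|(x<<25); the staggered counts written as
# "18-7" or "25-14" above reuse a register that already holds x>>7 or
# x<<14 and shift it further to the next rotation amount, saving a move
# per term of sigma0(x) = rotr(x,7)^rotr(x,18)^(x>>3) and its sigma1
# counterpart.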

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma		# pull counters
	cmp	4*0(%rbx),%ecx		# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]		# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma		# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq	0x00-0x80($ctx),$ABEF0		# A1.A0
	movq	0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq	0x40-0x80($ctx),$CDGH0		# C1.C0
	movq	0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq	0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq	0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq	0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq	0xe0-0x80($ctx),@MSG1[3]	# H1.H0

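	# sha256rnds2 expects the state as ABEF and CDGH quadruplets with
	# the most significant dword first, so the interleaves below
	# transpose the two lanes' A..H words and the 0b00011011 shuffles
	# reverse the dword order into that layout.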
	punpckldq	@MSG0[0],$ABEF0		# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0		# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]	# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]	# H1.G1.H0.G0
	movdqa	K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa	$ABEF0,$ABEF1
	movdqa	$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0		# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0		# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1		# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1		# H1.G1.D1.C1

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1
	jmp	.Loop_shaext

.align	32
.Loop_shaext:
	movdqu	0x00(@ptr[0]),@MSG0[0]
	movdqu	0x00(@ptr[1]),@MSG1[0]
	movdqu	0x10(@ptr[0]),@MSG0[1]
	movdqu	0x10(@ptr[1]),@MSG1[1]
	movdqu	0x20(@ptr[0]),@MSG0[2]
	pshufb	$TMPx,@MSG0[0]
	movdqu	0x20(@ptr[1]),@MSG1[2]
	pshufb	$TMPx,@MSG1[0]
	movdqu	0x30(@ptr[0]),@MSG0[3]
	lea	0x40(@ptr[0]),@ptr[0]
	movdqu	0x30(@ptr[1]),@MSG1[3]
	lea	0x40(@ptr[1]),@ptr[1]

	movdqa	0*16-0x80($Tbl),$Wi
	pshufb	$TMPx,@MSG0[1]
	paddd	@MSG0[0],$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$Wi,$TMP0
	movdqa	0*16-0x80($Tbl),$TMP1
	pshufb	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	movdqa	$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0	# 0-3
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$TMP1,$Wi
	movdqa	$CDGH1,0x70(%rsp)
	sha256rnds2	$ABEF1,$CDGH1	# 0-3
	pshufd	\$0x0e,$TMP0,$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$ABEF1,0x60(%rsp)
	movdqa	1*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	pshufb	$TMPx,@MSG0[2]
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	1*16-0x80($Tbl),$TMP1
	paddd	@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 4-7
	movdqa	$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb	$TMPx,@MSG0[3]
	pshufb	$TMPx,@MSG1[2]
	prefetcht0	127(@ptr[1])
	sha256rnds2	$ABEF1,$CDGH1	# 4-7
	pshufd	\$0x0e,$TMP0,$Wi
	pshufb	$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	2*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	2*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 8-11
	sha256msg1	@MSG1[1],@MSG1[0]
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 8-11
	pshufd	\$0x0e,$TMP0,$Wi
	palignr	\$4,@MSG0[2],$TMPx
	paddd	$TMPx,@MSG0[0]
	movdqa	@MSG1[3],$TMPx
	palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	3*16-0x80($Tbl),$TMP0
	paddd	@MSG0[3],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[2],@MSG1[1]

	movdqa	$TMP0,$Wi
	movdqa	3*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[0]
	paddd	@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0	# 12-15
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[0],$TMPx
	palignr	\$4,@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 12-15
	sha256msg2	@MSG1[3],@MSG1[0]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[1]
	movdqa	@MSG1[0],$TMPx
	palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	4*16-0x80($Tbl),$TMP0
	paddd	@MSG0[0],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	$i*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 16-19...
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 16-19...
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	`($i+1)*16`-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	13*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 52-55
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 52-55
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	14*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	14*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[2]
	paddd	@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0	# 56-59
	movdqa	$TMP1,$Wi
	mov	\$1,%ecx
	pxor	@MSG0[1],@MSG0[1]	# zero
	sha256rnds2	$ABEF1,$CDGH1	# 56-59
	sha256msg2	@MSG1[1],@MSG1[2]
	pshufd	\$0x0e,$TMP0,$Wi
	movdqa	15*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	movq	(%rbx),@MSG0[2]		# pull counters
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	15*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	cmp	4*0(%rbx),%ecx		# examine counters
	cmovge	%rsp,@ptr[0]		# cancel input
	cmp	4*1(%rbx),%ecx
	cmovge	%rsp,@ptr[1]
	pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0	# 60-63
	movdqa	$TMP1,$Wi
	pshufd	\$0x55,@MSG0[2],@MSG1[1]
	movdqa	@MSG0[2],@MSG1[2]
	sha256rnds2	$ABEF1,$CDGH1	# 60-63
	pshufd	\$0x0e,$TMP0,$Wi
	pcmpgtd	@MSG0[1],@MSG1[0]
	pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	movdqa	K256_shaext-0x10(%rip),$TMPx
	sha256rnds2	$CDGH1,$ABEF1

	pand	@MSG1[0],$CDGH0
	pand	@MSG1[1],$CDGH1
	pand	@MSG1[0],$ABEF0
	pand	@MSG1[1],$ABEF1
	paddd	@MSG0[2],@MSG1[2]	# counters--

	paddd	0x50(%rsp),$CDGH0
	paddd	0x70(%rsp),$CDGH1
	paddd	0x40(%rsp),$ABEF0
	paddd	0x60(%rsp),$ABEF1

	movq	@MSG1[2],(%rbx)		# save counters
	dec	$num
	jnz	.Loop_shaext

	mov	`$REG_SZ*17+8`(%rsp),$num

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1

	movdqa	$ABEF0,@MSG0[0]
	movdqa	$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0		# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]		# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0		# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]		# H1.H0.G1.G0

	movq	$ABEF0,0x00-0x80($ctx)	# A1.A0
	psrldq	\$8,$ABEF0
	movq	@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq	\$8,@MSG0[0]
	movq	$ABEF0,0x20-0x80($ctx)	# B1.B0
	movq	@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq	$CDGH0,0x40-0x80($ctx)	# C1.C0
	psrldq	\$8,$CDGH0
	movq	@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq	\$8,@MSG0[1]
	movq	$CDGH0,0x60-0x80($ctx)	# D1.D0
	movq	@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
}}}
if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[1]),$t1
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[4]),$t1
	vmovd	`4*$i`(@ptr[1]),$t2
	vmovd	`4*$i`(@ptr[5]),$t3
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[4]),$t1
	lea	`16*4`(@ptr[4]),@ptr[4]
	vmovd	`4*$i`(@ptr[1]),$t2
	lea	`16*4`(@ptr[1]),@ptr[1]
	vmovd	`4*$i`(@ptr[5]),$t3
	lea	`16*4`(@ptr[5]),@ptr[5]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea	`16*4`(@ptr[6]),@ptr[6]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea	`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])"	if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb		# borrow $axb
	`"prefetcht0	63(@ptr[1])"	if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h		# borrow $h
	vpxor	$t3,$sigma,$sigma	# Sigma1(e)
	`"prefetcht0	63(@ptr[2])"	if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1		# Ch(e,f,g)
	vpxor	$a,$b,$axb		# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])"	if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi		# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])"	if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi		# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])"	if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])"	if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h		# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d		# d+=Xi
	`"prefetcht0	63(@ptr[7])"	if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# Sigma0(a)

	vpaddd	$Xi,$h,$h		# h+=Xi
	vpaddd	$sigma,$h,$h		# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi	# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb		# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma	# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd	$t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
$code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd	$t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
} }}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue		# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx	# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2	# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
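
# A worked example of the hand-assembly above (illustrative arithmetic,
# not from the original source): "sha256rnds2 %xmm1,%xmm2" matches with
# $1=1 (source) and $2=2 (destination); both registers are below %xmm8,
# so no REX prefix is emitted, ModR/M = 0xc0|(1&7)|((2&7)<<3) = 0xd1,
# and the line becomes ".byte 0x0f,0x38,0xcb,0xd1" (%xmm0 is the
# instruction's implicit third operand).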

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;