#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data into a designated lane of a SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#                 this    +aesni(i)        sha256  aesni-sha256    gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)    23.3/n  +1.28=7.11(n=4)  12.3    +3.75=16.1      +126%
# Atom(ii)        38.7/n  +3.93=13.6(n=4)  20.8    +5.69=26.5      +95%
# Sandy Bridge    (20.5   +5.15=25.7)/n    11.6    13.0            +103%
# Ivy Bridge      (20.4   +5.14=25.5)/n    10.3    11.6            +82%
# Haswell(iii)    (21.0   +5.00=26.0)/n    7.80    8.79            +170%
# Skylake         (18.9   +5.00=23.9)/n    7.70    8.17            +170%
# Bulldozer       (21.6   +5.76=27.4)/n    13.6    13.7            +100%
#
# (i)   multi-block CBC encrypt with 128-bit key;
# (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#       because of lower AES-NI instruction throughput, nor is there
#       AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
#       for n=4 is 20.3+4.44=24.7;
# (iv)  presented improvement coefficients are asymptotic limits and
#       in real-life application are somewhat lower, e.g. for 2KB
#       fragments they range from 75% to 130% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

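# Note: the generated assembly is not written out directly; everything is
# accumulated in $code and piped through the perlasm translator selected by
# $flavour (elf, macosx, mingw64, nasm, ...), so a standalone run could look
# like, for example:
#
#       perl sha256-mb-x86_64.pl elf sha256-mb-x86_64.s
#
# $avx, probed above from the assembler version, decides which code paths are
# safe to emit: 0 = base SSE/SHA-NI paths only, 1 = also AVX, 2 = also AVX2.
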
# void sha256_multi_block (
#     struct {  unsigned int A[8];
#               unsigned int B[8];
#               unsigned int C[8];
#               unsigned int D[8];
#               unsigned int E[8];
#               unsigned int F[8];
#               unsigned int G[8];
#               unsigned int H[8];      } *ctx,
#     struct {  void *ptr; int blocks; } inp[8],
#     int num);         /* 1 or 2 */
#
$ctx="%rdi";    # 1st arg
$inp="%rsi";    # 2nd arg
$num="%edx";    # 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";
$inp_elm_size=2*$ptr_size;

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
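# Xi_off() addresses the 16-entry message-schedule ring kept in the stack
# frame: %rax is pointed 128 bytes into the frame so the returned displacements
# stay within the signed 8-bit range, and with the wider AVX2 registers
# ($REG_SZ=32) entries past the first 256 bytes are reached through %rbx
# instead.  A few evaluated examples (illustration only, not used as such):
#
#       $REG_SZ=16:  Xi_off(3)  -> "48-128(%rax)"
#       $REG_SZ=16:  Xi_off(19) -> "48-128(%rax)"       # 19%16==3, same slot
#       $REG_SZ=32:  Xi_off(12) -> "384-256-128(%rbx)"
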
114
115sub ROUND_00_15 {
116my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
117
118$code.=<<___ if ($i<15);
119 movd `4*$i`(@ptr[0]),$Xi
120 movd `4*$i`(@ptr[1]),$t1
121 movd `4*$i`(@ptr[2]),$t2
122 movd `4*$i`(@ptr[3]),$t3
123 punpckldq $t2,$Xi
124 punpckldq $t3,$t1
125 punpckldq $t1,$Xi
126___
127$code.=<<___ if ($i==15);
128 movd `4*$i`(@ptr[0]),$Xi
129 lea `16*4`(@ptr[0]),@ptr[0]
130 movd `4*$i`(@ptr[1]),$t1
131 lea `16*4`(@ptr[1]),@ptr[1]
132 movd `4*$i`(@ptr[2]),$t2
133 lea `16*4`(@ptr[2]),@ptr[2]
134 movd `4*$i`(@ptr[3]),$t3
135 lea `16*4`(@ptr[3]),@ptr[3]
136 punpckldq $t2,$Xi
137 punpckldq $t3,$t1
138 punpckldq $t1,$Xi
139___
140$code.=<<___;
141 movdqa $e,$sigma
619b9466 142 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
b7838586 143 movdqa $e,$t3
619b9466 144 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
145 psrld \$6,$sigma
146 movdqa $e,$t2
147 pslld \$7,$t3
148 movdqa $Xi,`&Xi_off($i)`
149 paddd $h,$Xi # Xi+=h
150
151 psrld \$11,$t2
152 pxor $t3,$sigma
153 pslld \$21-7,$t3
154 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
155 pxor $t2,$sigma
156
157 psrld \$25-11,$t2
158 movdqa $e,$t1
619b9466 159 `"prefetcht0 63(@ptr[0])" if ($i==15)`
160 pxor $t3,$sigma
161 movdqa $e,$axb # borrow $axb
162 pslld \$26-21,$t3
163 pandn $g,$t1
164 pand $f,$axb
165 pxor $t2,$sigma
166
619b9466 167 `"prefetcht0 63(@ptr[1])" if ($i==15)`
168 movdqa $a,$t2
169 pxor $t3,$sigma # Sigma1(e)
170 movdqa $a,$t3
171 psrld \$2,$t2
172 paddd $sigma,$Xi # Xi+=Sigma1(e)
173 pxor $axb,$t1 # Ch(e,f,g)
174 movdqa $b,$axb
175 movdqa $a,$sigma
176 pslld \$10,$t3
177 pxor $a,$axb # a^b, b^c in next round
178
619b9466 179 `"prefetcht0 63(@ptr[2])" if ($i==15)`
180 psrld \$13,$sigma
181 pxor $t3,$t2
182 paddd $t1,$Xi # Xi+=Ch(e,f,g)
183 pslld \$19-10,$t3
184 pand $axb,$bxc
185 pxor $sigma,$t2
186
619b9466 187 `"prefetcht0 63(@ptr[3])" if ($i==15)`
188 psrld \$22-13,$sigma
189 pxor $t3,$t2
190 movdqa $b,$h
191 pslld \$30-19,$t3
192 pxor $t2,$sigma
193 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
194 paddd $Xi,$d # d+=Xi
195 pxor $t3,$sigma # Sigma0(a)
196
197 paddd $Xi,$h # h+=Xi
198 paddd $sigma,$h # h+=Sigma0(a)
199___
200$code.=<<___ if (($i%8)==7);
201 lea `32*8`($Tbl),$Tbl
202___
203 ($axb,$bxc)=($bxc,$axb);
204}
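# For reference, the scalar computation that ROUND_00_15 above performs in
# every SIMD lane (standard SHA-256 round; this helper is illustrative only
# and is not called anywhere in the generator):
sub _ref_round_00_15 {
my ($Wi,$Ki,$a,$b,$c,$d,$e,$f,$g,$h)=@_;        # one 32-bit lane
my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
my $Sigma1 = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
my $Sigma0 = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
my $Ch  = ($e & $f) ^ (~$e & $g);
my $Maj = $b ^ (($a^$b) & ($b^$c));             # the "a^b, b^c" trick above
my $T1  = ($h + $Sigma1 + $Ch + $Ki + $Wi) & 0xffffffff;
    $d  = ($d + $T1) & 0xffffffff;              # d += Xi
    $h  = ($T1 + $Sigma0 + $Maj) & 0xffffffff;  # h = T1 + Sigma0(a) + Maj(a,b,c)
    ($a,$b,$c,$d,$e,$f,$g,$h);                  # caller rotates, cf. unshift(@V,pop(@V))
}
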
205
206sub ROUND_16_XX {
207my $i=shift;
208
209$code.=<<___;
210 movdqa `&Xi_off($i+1)`,$Xn
211 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
212
213 movdqa $Xn,$sigma
214 movdqa $Xn,$t2
215 psrld \$3,$sigma
216 movdqa $Xn,$t3
217
218 psrld \$7,$t2
219 movdqa `&Xi_off($i+14)`,$t1
220 pslld \$14,$t3
221 pxor $t2,$sigma
222 psrld \$18-7,$t2
223 movdqa $t1,$axb # borrow $axb
224 pxor $t3,$sigma
225 pslld \$25-14,$t3
226 pxor $t2,$sigma
227 psrld \$10,$t1
228 movdqa $axb,$t2
229
230 psrld \$17,$axb
231 pxor $t3,$sigma # sigma0(X[i+1])
232 pslld \$13,$t2
233 paddd $sigma,$Xi # Xi+=sigma0(e)
234 pxor $axb,$t1
235 psrld \$19-17,$axb
236 pxor $t2,$t1
237 pslld \$15-13,$t2
238 pxor $axb,$t1
239 pxor $t2,$t1 # sigma0(X[i+14])
240 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
241___
242 &ROUND_00_15($i,@_);
243 ($Xi,$Xn)=($Xn,$Xi);
244}
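# And the corresponding per-lane reference for ROUND_16_XX above: with
#       sigma0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x>>3)
#       sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x>>10)
# each new word is W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
# i.e. Xi + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]) in terms of the ring
# addressed by Xi_off().  Illustrative only, never called:
sub _ref_schedule_word {
my $w = shift;                          # ref to the words produced so far (>=16)
my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
my $s0 = $rotr->($w->[-15],7)  ^ $rotr->($w->[-15],18) ^ ($w->[-15]>>3);
my $s1 = $rotr->($w->[-2],17)  ^ $rotr->($w->[-2],19)  ^ ($w->[-2]>>10);
    ($s1 + $w->[-7] + $s0 + $w->[-16]) & 0xffffffff;
}
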
245
246$code.=<<___;
247.text
248
249.extern OPENSSL_ia32cap_P
250
251.globl sha256_multi_block
252.type sha256_multi_block,\@function,3
253.align 32
254sha256_multi_block:
399976c7 255.cfi_startproc
256 mov OPENSSL_ia32cap_P+4(%rip),%rcx
257 bt \$61,%rcx # check SHA bit
258 jc _shaext_shortcut
259___
260$code.=<<___ if ($avx);
261 test \$`1<<28`,%ecx
262 jnz _avx_shortcut
263___
264$code.=<<___;
265 mov %rsp,%rax
399976c7 266.cfi_def_cfa_register %rax
b7838586 267 push %rbx
399976c7 268.cfi_push %rbx
b7838586 269 push %rbp
399976c7 270.cfi_push %rbp
271___
272$code.=<<___ if ($win64);
273 lea -0xa8(%rsp),%rsp
274 movaps %xmm6,(%rsp)
275 movaps %xmm7,0x10(%rsp)
276 movaps %xmm8,0x20(%rsp)
277 movaps %xmm9,0x30(%rsp)
278 movaps %xmm10,-0x78(%rax)
279 movaps %xmm11,-0x68(%rax)
280 movaps %xmm12,-0x58(%rax)
281 movaps %xmm13,-0x48(%rax)
282 movaps %xmm14,-0x38(%rax)
283 movaps %xmm15,-0x28(%rax)
284___
285$code.=<<___;
286 sub \$`$REG_SZ*18`, %rsp
287 and \$-256,%rsp
288 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
399976c7 289.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
619b9466 290.Lbody:
291 lea K256+128(%rip),$Tbl
292 lea `$REG_SZ*16`(%rsp),%rbx
293 lea 0x80($ctx),$ctx # size optimization
294
295.Loop_grande:
296 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
297 xor $num,$num
298___
299for($i=0;$i<4;$i++) {
0d51cf3c 300 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
b7838586 301 $code.=<<___;
302 # input pointer
303 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
304 # number of blocks
305 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
306 cmp $num,%ecx
307 cmovg %ecx,$num # find maximum
308 test %ecx,%ecx
309 mov %ecx,`4*$i`(%rbx) # initialize counters
310 cmovle $Tbl,@ptr[$i] # cancel input
311___
312}
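# Note on the gather loop above: a lane with no blocks to process gets its
# data pointer redirected to $Tbl (the K256 table) by the cmovle, so the SIMD
# loads in the rounds below always read valid memory; whatever such a lane
# computes is discarded later by the per-lane counter masking.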
313$code.=<<___;
314 test $num,$num
315 jz .Ldone
316
317 movdqu 0x00-0x80($ctx),$A # load context
318 lea 128(%rsp),%rax
319 movdqu 0x20-0x80($ctx),$B
320 movdqu 0x40-0x80($ctx),$C
321 movdqu 0x60-0x80($ctx),$D
322 movdqu 0x80-0x80($ctx),$E
323 movdqu 0xa0-0x80($ctx),$F
324 movdqu 0xc0-0x80($ctx),$G
325 movdqu 0xe0-0x80($ctx),$H
326 movdqu .Lpbswap(%rip),$Xn
327 jmp .Loop
328
329.align 32
330.Loop:
331 movdqa $C,$bxc
332 pxor $B,$bxc # magic seed
333___
334for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
335$code.=<<___;
336 movdqu `&Xi_off($i)`,$Xi
337 mov \$3,%ecx
338 jmp .Loop_16_xx
339.align 32
340.Loop_16_xx:
341___
342for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
343$code.=<<___;
344 dec %ecx
345 jnz .Loop_16_xx
346
347 mov \$1,%ecx
348 lea K256+128(%rip),$Tbl
349
350 movdqa (%rbx),$sigma # pull counters
351 cmp 4*0(%rbx),%ecx # examine counters
352 pxor $t1,$t1
353 cmovge $Tbl,@ptr[0] # cancel input
354 cmp 4*1(%rbx),%ecx
355 movdqa $sigma,$Xn
356 cmovge $Tbl,@ptr[1]
357 cmp 4*2(%rbx),%ecx
358 pcmpgtd $t1,$Xn # mask value
359 cmovge $Tbl,@ptr[2]
360 cmp 4*3(%rbx),%ecx
361 paddd $Xn,$sigma # counters--
362 cmovge $Tbl,@ptr[3]
363
364 movdqu 0x00-0x80($ctx),$t1
365 pand $Xn,$A
366 movdqu 0x20-0x80($ctx),$t2
367 pand $Xn,$B
368 movdqu 0x40-0x80($ctx),$t3
369 pand $Xn,$C
370 movdqu 0x60-0x80($ctx),$Xi
371 pand $Xn,$D
372 paddd $t1,$A
373 movdqu 0x80-0x80($ctx),$t1
374 pand $Xn,$E
375 paddd $t2,$B
376 movdqu 0xa0-0x80($ctx),$t2
377 pand $Xn,$F
378 paddd $t3,$C
379 movdqu 0xc0-0x80($ctx),$t3
380 pand $Xn,$G
381 paddd $Xi,$D
382 movdqu 0xe0-0x80($ctx),$Xi
383 pand $Xn,$H
384 paddd $t1,$E
385 paddd $t2,$F
386 movdqu $A,0x00-0x80($ctx)
387 paddd $t3,$G
388 movdqu $B,0x20-0x80($ctx)
389 paddd $Xi,$H
390 movdqu $C,0x40-0x80($ctx)
391 movdqu $D,0x60-0x80($ctx)
392 movdqu $E,0x80-0x80($ctx)
393 movdqu $F,0xa0-0x80($ctx)
394 movdqu $G,0xc0-0x80($ctx)
395 movdqu $H,0xe0-0x80($ctx)
396
397 movdqa $sigma,(%rbx) # save counters
398 movdqa .Lpbswap(%rip),$Xn
399 dec $num
400 jnz .Loop
401
402 mov `$REG_SZ*17+8`(%rsp),$num
403 lea $REG_SZ($ctx),$ctx
0d51cf3c 404 lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
405 dec $num
406 jnz .Loop_grande
407
408.Ldone:
0d4fb843 409 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
399976c7 410.cfi_def_cfa %rax,8
411___
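# Lane bookkeeping in the loop emitted above, once per 64-byte block:
# (%rbx) holds one 32-bit "blocks remaining" counter per lane.  pcmpgtd turns
# every still-positive counter into an all-ones mask, so the following paddd
# is a decrement (adds -1), and the pand over the working state makes a
# finished lane store back its old context unchanged (masked state is zero,
# so ctx += 0).  For example, a lane with 2 blocks left becomes 1 and is
# updated normally; a lane already at 0 keeps both its counter and its digest,
# and the cmp/cmovge pair re-applies the pointer-cancelling trick for it.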
412$code.=<<___ if ($win64);
413 movaps -0xb8(%rax),%xmm6
414 movaps -0xa8(%rax),%xmm7
415 movaps -0x98(%rax),%xmm8
416 movaps -0x88(%rax),%xmm9
417 movaps -0x78(%rax),%xmm10
418 movaps -0x68(%rax),%xmm11
419 movaps -0x58(%rax),%xmm12
420 movaps -0x48(%rax),%xmm13
421 movaps -0x38(%rax),%xmm14
422 movaps -0x28(%rax),%xmm15
423___
424$code.=<<___;
425 mov -16(%rax),%rbp
399976c7 426.cfi_restore %rbp
b7838586 427 mov -8(%rax),%rbx
399976c7 428.cfi_restore %rbx
b7838586 429 lea (%rax),%rsp
399976c7 430.cfi_def_cfa_register %rsp
619b9466 431.Lepilogue:
b7838586 432 ret
399976c7 433.cfi_endproc
434.size sha256_multi_block,.-sha256_multi_block
435___
436 {{{
437my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
438my @MSG0=map("%xmm$_",(4..7));
439my @MSG1=map("%xmm$_",(8..11));
440
441$code.=<<___;
442.type sha256_multi_block_shaext,\@function,3
443.align 32
444sha256_multi_block_shaext:
399976c7 445.cfi_startproc
446_shaext_shortcut:
447 mov %rsp,%rax
399976c7 448.cfi_def_cfa_register %rax
619b9466 449 push %rbx
399976c7 450.cfi_push %rbx
619b9466 451 push %rbp
399976c7 452.cfi_push %rbp
453___
454$code.=<<___ if ($win64);
455 lea -0xa8(%rsp),%rsp
456 movaps %xmm6,(%rsp)
457 movaps %xmm7,0x10(%rsp)
458 movaps %xmm8,0x20(%rsp)
459 movaps %xmm9,0x30(%rsp)
460 movaps %xmm10,-0x78(%rax)
461 movaps %xmm11,-0x68(%rax)
462 movaps %xmm12,-0x58(%rax)
463 movaps %xmm13,-0x48(%rax)
464 movaps %xmm14,-0x38(%rax)
465 movaps %xmm15,-0x28(%rax)
466___
467$code.=<<___;
468 sub \$`$REG_SZ*18`,%rsp
469 shl \$1,$num # we process pair at a time
470 and \$-256,%rsp
471 lea 0x80($ctx),$ctx # size optimization
472 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
473.Lbody_shaext:
474 lea `$REG_SZ*16`(%rsp),%rbx
475 lea K256_shaext+0x80(%rip),$Tbl
476
477.Loop_grande_shaext:
0d4fb843 478 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
479 xor $num,$num
480___
481for($i=0;$i<2;$i++) {
0d51cf3c 482 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
619b9466 483 $code.=<<___;
484 # input pointer
485 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
486 # number of blocks
487 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
488 cmp $num,%ecx
489 cmovg %ecx,$num # find maximum
490 test %ecx,%ecx
491 mov %ecx,`4*$i`(%rbx) # initialize counters
492 cmovle %rsp,@ptr[$i] # cancel input
493___
494}
495$code.=<<___;
496 test $num,$num
497 jz .Ldone_shaext
498
499 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
500 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
501 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
502 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
503 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
504 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
505 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
506 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
507
508 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
509 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
510 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
511 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
512 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
513
514 movdqa $ABEF0,$ABEF1
515 movdqa $CDGH0,$CDGH1
516 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
517 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
518 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
519 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
520
521 pshufd \$0b00011011,$ABEF0,$ABEF0
522 pshufd \$0b00011011,$CDGH0,$CDGH0
523 pshufd \$0b00011011,$ABEF1,$ABEF1
524 pshufd \$0b00011011,$CDGH1,$CDGH1
525 jmp .Loop_shaext
526
527.align 32
528.Loop_shaext:
529 movdqu 0x00(@ptr[0]),@MSG0[0]
530 movdqu 0x00(@ptr[1]),@MSG1[0]
531 movdqu 0x10(@ptr[0]),@MSG0[1]
532 movdqu 0x10(@ptr[1]),@MSG1[1]
533 movdqu 0x20(@ptr[0]),@MSG0[2]
534 pshufb $TMPx,@MSG0[0]
535 movdqu 0x20(@ptr[1]),@MSG1[2]
536 pshufb $TMPx,@MSG1[0]
537 movdqu 0x30(@ptr[0]),@MSG0[3]
538 lea 0x40(@ptr[0]),@ptr[0]
539 movdqu 0x30(@ptr[1]),@MSG1[3]
540 lea 0x40(@ptr[1]),@ptr[1]
541
542 movdqa 0*16-0x80($Tbl),$Wi
543 pshufb $TMPx,@MSG0[1]
544 paddd @MSG0[0],$Wi
545 pxor $ABEF0,@MSG0[0] # black magic
546 movdqa $Wi,$TMP0
547 movdqa 0*16-0x80($Tbl),$TMP1
548 pshufb $TMPx,@MSG1[1]
549 paddd @MSG1[0],$TMP1
550 movdqa $CDGH0,0x50(%rsp) # offload
551 sha256rnds2 $ABEF0,$CDGH0 # 0-3
552 pxor $ABEF1,@MSG1[0] # black magic
553 movdqa $TMP1,$Wi
554 movdqa $CDGH1,0x70(%rsp)
555 sha256rnds2 $ABEF1,$CDGH1 # 0-3
556 pshufd \$0x0e,$TMP0,$Wi
557 pxor $ABEF0,@MSG0[0] # black magic
558 movdqa $ABEF0,0x40(%rsp) # offload
559 sha256rnds2 $CDGH0,$ABEF0
560 pshufd \$0x0e,$TMP1,$Wi
561 pxor $ABEF1,@MSG1[0] # black magic
562 movdqa $ABEF1,0x60(%rsp)
563 movdqa 1*16-0x80($Tbl),$TMP0
564 paddd @MSG0[1],$TMP0
565 pshufb $TMPx,@MSG0[2]
566 sha256rnds2 $CDGH1,$ABEF1
567
568 movdqa $TMP0,$Wi
569 movdqa 1*16-0x80($Tbl),$TMP1
570 paddd @MSG1[1],$TMP1
571 sha256rnds2 $ABEF0,$CDGH0 # 4-7
572 movdqa $TMP1,$Wi
573 prefetcht0 127(@ptr[0])
574 pshufb $TMPx,@MSG0[3]
575 pshufb $TMPx,@MSG1[2]
576 prefetcht0 127(@ptr[1])
577 sha256rnds2 $ABEF1,$CDGH1 # 4-7
578 pshufd \$0x0e,$TMP0,$Wi
579 pshufb $TMPx,@MSG1[3]
580 sha256msg1 @MSG0[1],@MSG0[0]
581 sha256rnds2 $CDGH0,$ABEF0
582 pshufd \$0x0e,$TMP1,$Wi
583 movdqa 2*16-0x80($Tbl),$TMP0
584 paddd @MSG0[2],$TMP0
585 sha256rnds2 $CDGH1,$ABEF1
586
587 movdqa $TMP0,$Wi
588 movdqa 2*16-0x80($Tbl),$TMP1
589 paddd @MSG1[2],$TMP1
590 sha256rnds2 $ABEF0,$CDGH0 # 8-11
591 sha256msg1 @MSG1[1],@MSG1[0]
592 movdqa $TMP1,$Wi
593 movdqa @MSG0[3],$TMPx
594 sha256rnds2 $ABEF1,$CDGH1 # 8-11
595 pshufd \$0x0e,$TMP0,$Wi
596 palignr \$4,@MSG0[2],$TMPx
597 paddd $TMPx,@MSG0[0]
598 movdqa @MSG1[3],$TMPx
599 palignr \$4,@MSG1[2],$TMPx
600 sha256msg1 @MSG0[2],@MSG0[1]
601 sha256rnds2 $CDGH0,$ABEF0
602 pshufd \$0x0e,$TMP1,$Wi
603 movdqa 3*16-0x80($Tbl),$TMP0
604 paddd @MSG0[3],$TMP0
605 sha256rnds2 $CDGH1,$ABEF1
606 sha256msg1 @MSG1[2],@MSG1[1]
607
608 movdqa $TMP0,$Wi
609 movdqa 3*16-0x80($Tbl),$TMP1
610 paddd $TMPx,@MSG1[0]
611 paddd @MSG1[3],$TMP1
612 sha256msg2 @MSG0[3],@MSG0[0]
613 sha256rnds2 $ABEF0,$CDGH0 # 12-15
614 movdqa $TMP1,$Wi
615 movdqa @MSG0[0],$TMPx
616 palignr \$4,@MSG0[3],$TMPx
617 sha256rnds2 $ABEF1,$CDGH1 # 12-15
618 sha256msg2 @MSG1[3],@MSG1[0]
619 pshufd \$0x0e,$TMP0,$Wi
620 paddd $TMPx,@MSG0[1]
621 movdqa @MSG1[0],$TMPx
622 palignr \$4,@MSG1[3],$TMPx
623 sha256msg1 @MSG0[3],@MSG0[2]
624 sha256rnds2 $CDGH0,$ABEF0
625 pshufd \$0x0e,$TMP1,$Wi
626 movdqa 4*16-0x80($Tbl),$TMP0
627 paddd @MSG0[0],$TMP0
628 sha256rnds2 $CDGH1,$ABEF1
629 sha256msg1 @MSG1[3],@MSG1[2]
630___
631for($i=4;$i<16-3;$i++) {
632$code.=<<___;
633 movdqa $TMP0,$Wi
634 movdqa $i*16-0x80($Tbl),$TMP1
635 paddd $TMPx,@MSG1[1]
636 paddd @MSG1[0],$TMP1
637 sha256msg2 @MSG0[0],@MSG0[1]
638 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
639 movdqa $TMP1,$Wi
640 movdqa @MSG0[1],$TMPx
641 palignr \$4,@MSG0[0],$TMPx
642 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
643 sha256msg2 @MSG1[0],@MSG1[1]
644 pshufd \$0x0e,$TMP0,$Wi
645 paddd $TMPx,@MSG0[2]
646 movdqa @MSG1[1],$TMPx
647 palignr \$4,@MSG1[0],$TMPx
648 sha256msg1 @MSG0[0],@MSG0[3]
649 sha256rnds2 $CDGH0,$ABEF0
650 pshufd \$0x0e,$TMP1,$Wi
651 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
652 paddd @MSG0[1],$TMP0
653 sha256rnds2 $CDGH1,$ABEF1
654 sha256msg1 @MSG1[0],@MSG1[3]
655___
656 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
657}
658$code.=<<___;
659 movdqa $TMP0,$Wi
660 movdqa 13*16-0x80($Tbl),$TMP1
661 paddd $TMPx,@MSG1[1]
662 paddd @MSG1[0],$TMP1
663 sha256msg2 @MSG0[0],@MSG0[1]
664 sha256rnds2 $ABEF0,$CDGH0 # 52-55
665 movdqa $TMP1,$Wi
666 movdqa @MSG0[1],$TMPx
667 palignr \$4,@MSG0[0],$TMPx
668 sha256rnds2 $ABEF1,$CDGH1 # 52-55
669 sha256msg2 @MSG1[0],@MSG1[1]
670 pshufd \$0x0e,$TMP0,$Wi
671 paddd $TMPx,@MSG0[2]
672 movdqa @MSG1[1],$TMPx
673 palignr \$4,@MSG1[0],$TMPx
674 nop
675 sha256rnds2 $CDGH0,$ABEF0
676 pshufd \$0x0e,$TMP1,$Wi
677 movdqa 14*16-0x80($Tbl),$TMP0
678 paddd @MSG0[1],$TMP0
679 sha256rnds2 $CDGH1,$ABEF1
680
681 movdqa $TMP0,$Wi
682 movdqa 14*16-0x80($Tbl),$TMP1
683 paddd $TMPx,@MSG1[2]
684 paddd @MSG1[1],$TMP1
685 sha256msg2 @MSG0[1],@MSG0[2]
686 nop
687 sha256rnds2 $ABEF0,$CDGH0 # 56-59
688 movdqa $TMP1,$Wi
689 mov \$1,%ecx
690 pxor @MSG0[1],@MSG0[1] # zero
691 sha256rnds2 $ABEF1,$CDGH1 # 56-59
692 sha256msg2 @MSG1[1],@MSG1[2]
693 pshufd \$0x0e,$TMP0,$Wi
694 movdqa 15*16-0x80($Tbl),$TMP0
695 paddd @MSG0[2],$TMP0
696 movq (%rbx),@MSG0[2] # pull counters
697 nop
698 sha256rnds2 $CDGH0,$ABEF0
699 pshufd \$0x0e,$TMP1,$Wi
700 movdqa 15*16-0x80($Tbl),$TMP1
701 paddd @MSG1[2],$TMP1
702 sha256rnds2 $CDGH1,$ABEF1
703
704 movdqa $TMP0,$Wi
705 cmp 4*0(%rbx),%ecx # examine counters
706 cmovge %rsp,@ptr[0] # cancel input
707 cmp 4*1(%rbx),%ecx
708 cmovge %rsp,@ptr[1]
709 pshufd \$0x00,@MSG0[2],@MSG1[0]
710 sha256rnds2 $ABEF0,$CDGH0 # 60-63
711 movdqa $TMP1,$Wi
712 pshufd \$0x55,@MSG0[2],@MSG1[1]
713 movdqa @MSG0[2],@MSG1[2]
714 sha256rnds2 $ABEF1,$CDGH1 # 60-63
715 pshufd \$0x0e,$TMP0,$Wi
716 pcmpgtd @MSG0[1],@MSG1[0]
717 pcmpgtd @MSG0[1],@MSG1[1]
718 sha256rnds2 $CDGH0,$ABEF0
719 pshufd \$0x0e,$TMP1,$Wi
720 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
721 movdqa K256_shaext-0x10(%rip),$TMPx
722 sha256rnds2 $CDGH1,$ABEF1
723
724 pand @MSG1[0],$CDGH0
725 pand @MSG1[1],$CDGH1
726 pand @MSG1[0],$ABEF0
727 pand @MSG1[1],$ABEF1
728 paddd @MSG0[2],@MSG1[2] # counters--
729
730 paddd 0x50(%rsp),$CDGH0
731 paddd 0x70(%rsp),$CDGH1
732 paddd 0x40(%rsp),$ABEF0
733 paddd 0x60(%rsp),$ABEF1
734
735 movq @MSG1[2],(%rbx) # save counters
736 dec $num
737 jnz .Loop_shaext
738
739 mov `$REG_SZ*17+8`(%rsp),$num
740
741 pshufd \$0b00011011,$ABEF0,$ABEF0
742 pshufd \$0b00011011,$CDGH0,$CDGH0
743 pshufd \$0b00011011,$ABEF1,$ABEF1
744 pshufd \$0b00011011,$CDGH1,$CDGH1
745
746 movdqa $ABEF0,@MSG0[0]
747 movdqa $CDGH0,@MSG0[1]
748 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
749 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
750 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
751 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
752
753 movq $ABEF0,0x00-0x80($ctx) # A1.A0
754 psrldq \$8,$ABEF0
755 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
756 psrldq \$8,@MSG0[0]
757 movq $ABEF0,0x20-0x80($ctx) # B1.B0
758 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
759
760 movq $CDGH0,0x40-0x80($ctx) # C1.C0
761 psrldq \$8,$CDGH0
762 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
763 psrldq \$8,@MSG0[1]
764 movq $CDGH0,0x60-0x80($ctx) # D1.D0
765 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
766
767 lea `$REG_SZ/2`($ctx),$ctx
0d51cf3c 768 lea `$inp_elm_size*2`($inp),$inp
769 dec $num
770 jnz .Loop_grande_shaext
771
772.Ldone_shaext:
773 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
774___
775$code.=<<___ if ($win64);
776 movaps -0xb8(%rax),%xmm6
777 movaps -0xa8(%rax),%xmm7
778 movaps -0x98(%rax),%xmm8
779 movaps -0x88(%rax),%xmm9
780 movaps -0x78(%rax),%xmm10
781 movaps -0x68(%rax),%xmm11
782 movaps -0x58(%rax),%xmm12
783 movaps -0x48(%rax),%xmm13
784 movaps -0x38(%rax),%xmm14
785 movaps -0x28(%rax),%xmm15
786___
787$code.=<<___;
788 mov -16(%rax),%rbp
399976c7 789.cfi_restore %rbp
619b9466 790 mov -8(%rax),%rbx
399976c7 791.cfi_restore %rbx
619b9466 792 lea (%rax),%rsp
399976c7 793.cfi_def_cfa_register %rsp
794.Lepilogue_shaext:
795 ret
399976c7 796.cfi_endproc
797.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
798___
799 }}}
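# Note on the _shaext path above: sha256rnds2 keeps the digest as two packed
# halves, ABEF and CDGH, and takes the message+constant words implicitly in
# %xmm0, which is why $Wi is pinned to that register.  The punpck sequence
# repacks the eight context rows into that layout, and "pshufd \$0b00011011"
# simply reverses dword order (e.g. F0.E0.B0.A0 -> A0.B0.E0.F0).  The routine
# tracks two independent buffers at a time (ABEF0/CDGH0 and ABEF1/CDGH1),
# which is why $num is doubled on entry ("we process pair at a time").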
800 if ($avx) {{{
801sub ROUND_00_15_avx {
802my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
803
804$code.=<<___ if ($i<15 && $REG_SZ==16);
805 vmovd `4*$i`(@ptr[0]),$Xi
806 vmovd `4*$i`(@ptr[1]),$t1
807 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
808 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
809 vpunpckldq $t1,$Xi,$Xi
810 vpshufb $Xn,$Xi,$Xi
811___
812$code.=<<___ if ($i==15 && $REG_SZ==16);
813 vmovd `4*$i`(@ptr[0]),$Xi
814 lea `16*4`(@ptr[0]),@ptr[0]
815 vmovd `4*$i`(@ptr[1]),$t1
816 lea `16*4`(@ptr[1]),@ptr[1]
817 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
818 lea `16*4`(@ptr[2]),@ptr[2]
819 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
820 lea `16*4`(@ptr[3]),@ptr[3]
821 vpunpckldq $t1,$Xi,$Xi
822 vpshufb $Xn,$Xi,$Xi
823___
824$code.=<<___ if ($i<15 && $REG_SZ==32);
825 vmovd `4*$i`(@ptr[0]),$Xi
826 vmovd `4*$i`(@ptr[4]),$t1
827 vmovd `4*$i`(@ptr[1]),$t2
828 vmovd `4*$i`(@ptr[5]),$t3
829 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
830 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
831 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
832 vpunpckldq $t2,$Xi,$Xi
833 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
834 vpunpckldq $t3,$t1,$t1
835 vinserti128 $t1,$Xi,$Xi
836 vpshufb $Xn,$Xi,$Xi
837___
838$code.=<<___ if ($i==15 && $REG_SZ==32);
839 vmovd `4*$i`(@ptr[0]),$Xi
840 lea `16*4`(@ptr[0]),@ptr[0]
841 vmovd `4*$i`(@ptr[4]),$t1
842 lea `16*4`(@ptr[4]),@ptr[4]
843 vmovd `4*$i`(@ptr[1]),$t2
844 lea `16*4`(@ptr[1]),@ptr[1]
845 vmovd `4*$i`(@ptr[5]),$t3
846 lea `16*4`(@ptr[5]),@ptr[5]
847 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
848 lea `16*4`(@ptr[2]),@ptr[2]
849 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
850 lea `16*4`(@ptr[6]),@ptr[6]
851 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
852 lea `16*4`(@ptr[3]),@ptr[3]
853 vpunpckldq $t2,$Xi,$Xi
854 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
855 lea `16*4`(@ptr[7]),@ptr[7]
856 vpunpckldq $t3,$t1,$t1
857 vinserti128 $t1,$Xi,$Xi
858 vpshufb $Xn,$Xi,$Xi
859___
860$code.=<<___;
861 vpsrld \$6,$e,$sigma
862 vpslld \$26,$e,$t3
863 vmovdqu $Xi,`&Xi_off($i)`
864 vpaddd $h,$Xi,$Xi # Xi+=h
865
866 vpsrld \$11,$e,$t2
867 vpxor $t3,$sigma,$sigma
868 vpslld \$21,$e,$t3
869 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
870 vpxor $t2,$sigma,$sigma
871
872 vpsrld \$25,$e,$t2
873 vpxor $t3,$sigma,$sigma
619b9466 874 `"prefetcht0 63(@ptr[0])" if ($i==15)`
875 vpslld \$7,$e,$t3
876 vpandn $g,$e,$t1
877 vpand $f,$e,$axb # borrow $axb
619b9466 878 `"prefetcht0 63(@ptr[1])" if ($i==15)`
879 vpxor $t2,$sigma,$sigma
880
881 vpsrld \$2,$a,$h # borrow $h
882 vpxor $t3,$sigma,$sigma # Sigma1(e)
619b9466 883 `"prefetcht0 63(@ptr[2])" if ($i==15)`
884 vpslld \$30,$a,$t2
885 vpxor $axb,$t1,$t1 # Ch(e,f,g)
886 vpxor $a,$b,$axb # a^b, b^c in next round
619b9466 887 `"prefetcht0 63(@ptr[3])" if ($i==15)`
888 vpxor $t2,$h,$h
889 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
890
891 vpsrld \$13,$a,$t2
619b9466 892 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
893 vpslld \$19,$a,$t3
894 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
895 vpand $axb,$bxc,$bxc
619b9466 896 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
897 vpxor $t2,$h,$sigma
898
899 vpsrld \$22,$a,$t2
900 vpxor $t3,$sigma,$sigma
619b9466 901 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
902 vpslld \$10,$a,$t3
903 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
904 vpaddd $Xi,$d,$d # d+=Xi
619b9466 905 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
906 vpxor $t2,$sigma,$sigma
907 vpxor $t3,$sigma,$sigma # Sigma0(a)
908
909 vpaddd $Xi,$h,$h # h+=Xi
910 vpaddd $sigma,$h,$h # h+=Sigma0(a)
911___
912$code.=<<___ if (($i%8)==7);
913 add \$`32*8`,$Tbl
914___
915 ($axb,$bxc)=($bxc,$axb);
916}
917
918sub ROUND_16_XX_avx {
919my $i=shift;
920
921$code.=<<___;
922 vmovdqu `&Xi_off($i+1)`,$Xn
923 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
924
925 vpsrld \$3,$Xn,$sigma
926 vpsrld \$7,$Xn,$t2
927 vpslld \$25,$Xn,$t3
928 vpxor $t2,$sigma,$sigma
929 vpsrld \$18,$Xn,$t2
930 vpxor $t3,$sigma,$sigma
931 vpslld \$14,$Xn,$t3
932 vmovdqu `&Xi_off($i+14)`,$t1
933 vpsrld \$10,$t1,$axb # borrow $axb
934
935 vpxor $t2,$sigma,$sigma
936 vpsrld \$17,$t1,$t2
937 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
938 vpslld \$15,$t1,$t3
939 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
940 vpxor $t2,$axb,$sigma
941 vpsrld \$19,$t1,$t2
942 vpxor $t3,$sigma,$sigma
943 vpslld \$13,$t1,$t3
944 vpxor $t2,$sigma,$sigma
945 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
946 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
947___
948 &ROUND_00_15_avx($i,@_);
949 ($Xi,$Xn)=($Xn,$Xi);
950}
951
952$code.=<<___;
953.type sha256_multi_block_avx,\@function,3
954.align 32
955sha256_multi_block_avx:
399976c7 956.cfi_startproc
957_avx_shortcut:
958___
959$code.=<<___ if ($avx>1);
960 shr \$32,%rcx
961 cmp \$2,$num
962 jb .Lavx
963 test \$`1<<5`,%ecx
964 jnz _avx2_shortcut
965 jmp .Lavx
966.align 32
967.Lavx:
968___
969$code.=<<___;
970 mov %rsp,%rax
399976c7 971.cfi_def_cfa_register %rax
b7838586 972 push %rbx
399976c7 973.cfi_push %rbx
b7838586 974 push %rbp
399976c7 975.cfi_push %rbp
976___
977$code.=<<___ if ($win64);
978 lea -0xa8(%rsp),%rsp
979 movaps %xmm6,(%rsp)
980 movaps %xmm7,0x10(%rsp)
981 movaps %xmm8,0x20(%rsp)
982 movaps %xmm9,0x30(%rsp)
983 movaps %xmm10,-0x78(%rax)
984 movaps %xmm11,-0x68(%rax)
985 movaps %xmm12,-0x58(%rax)
986 movaps %xmm13,-0x48(%rax)
987 movaps %xmm14,-0x38(%rax)
988 movaps %xmm15,-0x28(%rax)
989___
990$code.=<<___;
991 sub \$`$REG_SZ*18`, %rsp
992 and \$-256,%rsp
993 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
399976c7 994.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
619b9466 995.Lbody_avx:
996 lea K256+128(%rip),$Tbl
997 lea `$REG_SZ*16`(%rsp),%rbx
998 lea 0x80($ctx),$ctx # size optimization
999
1000.Loop_grande_avx:
1001 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1002 xor $num,$num
1003___
1004for($i=0;$i<4;$i++) {
0d51cf3c 1005 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
b7838586 1006 $code.=<<___;
1007 # input pointer
1008 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
1009 # number of blocks
1010 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1011 cmp $num,%ecx
1012 cmovg %ecx,$num # find maximum
1013 test %ecx,%ecx
1014 mov %ecx,`4*$i`(%rbx) # initialize counters
1015 cmovle $Tbl,@ptr[$i] # cancel input
1016___
1017}
1018$code.=<<___;
1019 test $num,$num
1020 jz .Ldone_avx
1021
1022 vmovdqu 0x00-0x80($ctx),$A # load context
1023 lea 128(%rsp),%rax
1024 vmovdqu 0x20-0x80($ctx),$B
1025 vmovdqu 0x40-0x80($ctx),$C
1026 vmovdqu 0x60-0x80($ctx),$D
1027 vmovdqu 0x80-0x80($ctx),$E
1028 vmovdqu 0xa0-0x80($ctx),$F
1029 vmovdqu 0xc0-0x80($ctx),$G
1030 vmovdqu 0xe0-0x80($ctx),$H
1031 vmovdqu .Lpbswap(%rip),$Xn
1032 jmp .Loop_avx
1033
1034.align 32
1035.Loop_avx:
1036 vpxor $B,$C,$bxc # magic seed
1037___
1038for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1039$code.=<<___;
1040 vmovdqu `&Xi_off($i)`,$Xi
1041 mov \$3,%ecx
1042 jmp .Loop_16_xx_avx
1043.align 32
1044.Loop_16_xx_avx:
1045___
1046for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1047$code.=<<___;
1048 dec %ecx
1049 jnz .Loop_16_xx_avx
1050
1051 mov \$1,%ecx
1052 lea K256+128(%rip),$Tbl
1053___
1054for($i=0;$i<4;$i++) {
1055 $code.=<<___;
1056 cmp `4*$i`(%rbx),%ecx # examine counters
1057 cmovge $Tbl,@ptr[$i] # cancel input
1058___
1059}
1060$code.=<<___;
1061 vmovdqa (%rbx),$sigma # pull counters
1062 vpxor $t1,$t1,$t1
1063 vmovdqa $sigma,$Xn
1064 vpcmpgtd $t1,$Xn,$Xn # mask value
1065 vpaddd $Xn,$sigma,$sigma # counters--
1066
1067 vmovdqu 0x00-0x80($ctx),$t1
1068 vpand $Xn,$A,$A
1069 vmovdqu 0x20-0x80($ctx),$t2
1070 vpand $Xn,$B,$B
1071 vmovdqu 0x40-0x80($ctx),$t3
1072 vpand $Xn,$C,$C
1073 vmovdqu 0x60-0x80($ctx),$Xi
1074 vpand $Xn,$D,$D
1075 vpaddd $t1,$A,$A
1076 vmovdqu 0x80-0x80($ctx),$t1
1077 vpand $Xn,$E,$E
1078 vpaddd $t2,$B,$B
1079 vmovdqu 0xa0-0x80($ctx),$t2
1080 vpand $Xn,$F,$F
1081 vpaddd $t3,$C,$C
1082 vmovdqu 0xc0-0x80($ctx),$t3
1083 vpand $Xn,$G,$G
1084 vpaddd $Xi,$D,$D
1085 vmovdqu 0xe0-0x80($ctx),$Xi
1086 vpand $Xn,$H,$H
1087 vpaddd $t1,$E,$E
1088 vpaddd $t2,$F,$F
1089 vmovdqu $A,0x00-0x80($ctx)
1090 vpaddd $t3,$G,$G
1091 vmovdqu $B,0x20-0x80($ctx)
1092 vpaddd $Xi,$H,$H
1093 vmovdqu $C,0x40-0x80($ctx)
1094 vmovdqu $D,0x60-0x80($ctx)
1095 vmovdqu $E,0x80-0x80($ctx)
1096 vmovdqu $F,0xa0-0x80($ctx)
1097 vmovdqu $G,0xc0-0x80($ctx)
1098 vmovdqu $H,0xe0-0x80($ctx)
1099
1100 vmovdqu $sigma,(%rbx) # save counters
1101 vmovdqu .Lpbswap(%rip),$Xn
1102 dec $num
1103 jnz .Loop_avx
1104
1105 mov `$REG_SZ*17+8`(%rsp),$num
1106 lea $REG_SZ($ctx),$ctx
0d51cf3c 1107 lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
b7838586
AP
1108 dec $num
1109 jnz .Loop_grande_avx
1110
1111.Ldone_avx:
0d4fb843 1112 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
399976c7 1113.cfi_def_cfa %rax,8
1114 vzeroupper
1115___
1116$code.=<<___ if ($win64);
1117 movaps -0xb8(%rax),%xmm6
1118 movaps -0xa8(%rax),%xmm7
1119 movaps -0x98(%rax),%xmm8
1120 movaps -0x88(%rax),%xmm9
1121 movaps -0x78(%rax),%xmm10
1122 movaps -0x68(%rax),%xmm11
1123 movaps -0x58(%rax),%xmm12
1124 movaps -0x48(%rax),%xmm13
1125 movaps -0x38(%rax),%xmm14
1126 movaps -0x28(%rax),%xmm15
1127___
1128$code.=<<___;
1129 mov -16(%rax),%rbp
399976c7 1130.cfi_restore %rbp
b7838586 1131 mov -8(%rax),%rbx
399976c7 1132.cfi_restore %rbx
b7838586 1133 lea (%rax),%rsp
399976c7 1134.cfi_def_cfa_register %rsp
619b9466 1135.Lepilogue_avx:
b7838586 1136 ret
399976c7 1137.cfi_endproc
1138.size sha256_multi_block_avx,.-sha256_multi_block_avx
1139___
1140 if ($avx>1) {
1141$code =~ s/\`([^\`]*)\`/eval $1/gem;
1142
1143$REG_SZ=32;
1144@ptr=map("%r$_",(12..15,8..11));
1145
1146@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1147($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1148
1149$code.=<<___;
1150.type sha256_multi_block_avx2,\@function,3
1151.align 32
1152sha256_multi_block_avx2:
399976c7 1153.cfi_startproc
1154_avx2_shortcut:
1155 mov %rsp,%rax
399976c7 1156.cfi_def_cfa_register %rax
b7838586 1157 push %rbx
399976c7 1158.cfi_push %rbx
b7838586 1159 push %rbp
399976c7 1160.cfi_push %rbp
b7838586 1161 push %r12
399976c7 1162.cfi_push %r12
b7838586 1163 push %r13
399976c7 1164.cfi_push %r13
b7838586 1165 push %r14
399976c7 1166.cfi_push %r14
b7838586 1167 push %r15
399976c7 1168.cfi_push %r15
1169___
1170$code.=<<___ if ($win64);
1171 lea -0xa8(%rsp),%rsp
1172 movaps %xmm6,(%rsp)
1173 movaps %xmm7,0x10(%rsp)
1174 movaps %xmm8,0x20(%rsp)
1175 movaps %xmm9,0x30(%rsp)
1176 movaps %xmm10,0x40(%rsp)
1177 movaps %xmm11,0x50(%rsp)
1178 movaps %xmm12,-0x78(%rax)
1179 movaps %xmm13,-0x68(%rax)
1180 movaps %xmm14,-0x58(%rax)
1181 movaps %xmm15,-0x48(%rax)
1182___
1183$code.=<<___;
1184 sub \$`$REG_SZ*18`, %rsp
1185 and \$-256,%rsp
1186 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
399976c7 1187.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
619b9466 1188.Lbody_avx2:
1189 lea K256+128(%rip),$Tbl
1190 lea 0x80($ctx),$ctx # size optimization
1191
1192.Loop_grande_avx2:
1193 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1194 xor $num,$num
1195 lea `$REG_SZ*16`(%rsp),%rbx
1196___
1197for($i=0;$i<8;$i++) {
0d51cf3c 1198 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
b7838586 1199 $code.=<<___;
1200 # input pointer
1201 mov `$inp_elm_size*$i+0`($inp),$ptr_reg
1202 # number of blocks
1203 mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1204 cmp $num,%ecx
1205 cmovg %ecx,$num # find maximum
1206 test %ecx,%ecx
1207 mov %ecx,`4*$i`(%rbx) # initialize counters
1208 cmovle $Tbl,@ptr[$i] # cancel input
1209___
1210}
1211$code.=<<___;
1212 vmovdqu 0x00-0x80($ctx),$A # load context
1213 lea 128(%rsp),%rax
1214 vmovdqu 0x20-0x80($ctx),$B
1215 lea 256+128(%rsp),%rbx
1216 vmovdqu 0x40-0x80($ctx),$C
1217 vmovdqu 0x60-0x80($ctx),$D
1218 vmovdqu 0x80-0x80($ctx),$E
1219 vmovdqu 0xa0-0x80($ctx),$F
1220 vmovdqu 0xc0-0x80($ctx),$G
1221 vmovdqu 0xe0-0x80($ctx),$H
1222 vmovdqu .Lpbswap(%rip),$Xn
1223 jmp .Loop_avx2
1224
1225.align 32
1226.Loop_avx2:
1227 vpxor $B,$C,$bxc # magic seed
1228___
1229for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1230$code.=<<___;
1231 vmovdqu `&Xi_off($i)`,$Xi
1232 mov \$3,%ecx
1233 jmp .Loop_16_xx_avx2
1234.align 32
1235.Loop_16_xx_avx2:
1236___
1237for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1238$code.=<<___;
1239 dec %ecx
1240 jnz .Loop_16_xx_avx2
1241
1242 mov \$1,%ecx
1243 lea `$REG_SZ*16`(%rsp),%rbx
1244 lea K256+128(%rip),$Tbl
1245___
1246for($i=0;$i<8;$i++) {
1247 $code.=<<___;
1248 cmp `4*$i`(%rbx),%ecx # examine counters
1249 cmovge $Tbl,@ptr[$i] # cancel input
1250___
1251}
1252$code.=<<___;
1253 vmovdqa (%rbx),$sigma # pull counters
1254 vpxor $t1,$t1,$t1
1255 vmovdqa $sigma,$Xn
1256 vpcmpgtd $t1,$Xn,$Xn # mask value
1257 vpaddd $Xn,$sigma,$sigma # counters--
1258
1259 vmovdqu 0x00-0x80($ctx),$t1
1260 vpand $Xn,$A,$A
1261 vmovdqu 0x20-0x80($ctx),$t2
1262 vpand $Xn,$B,$B
1263 vmovdqu 0x40-0x80($ctx),$t3
1264 vpand $Xn,$C,$C
1265 vmovdqu 0x60-0x80($ctx),$Xi
1266 vpand $Xn,$D,$D
1267 vpaddd $t1,$A,$A
1268 vmovdqu 0x80-0x80($ctx),$t1
1269 vpand $Xn,$E,$E
1270 vpaddd $t2,$B,$B
1271 vmovdqu 0xa0-0x80($ctx),$t2
1272 vpand $Xn,$F,$F
1273 vpaddd $t3,$C,$C
1274 vmovdqu 0xc0-0x80($ctx),$t3
1275 vpand $Xn,$G,$G
1276 vpaddd $Xi,$D,$D
1277 vmovdqu 0xe0-0x80($ctx),$Xi
1278 vpand $Xn,$H,$H
1279 vpaddd $t1,$E,$E
1280 vpaddd $t2,$F,$F
1281 vmovdqu $A,0x00-0x80($ctx)
1282 vpaddd $t3,$G,$G
1283 vmovdqu $B,0x20-0x80($ctx)
1284 vpaddd $Xi,$H,$H
1285 vmovdqu $C,0x40-0x80($ctx)
1286 vmovdqu $D,0x60-0x80($ctx)
1287 vmovdqu $E,0x80-0x80($ctx)
1288 vmovdqu $F,0xa0-0x80($ctx)
1289 vmovdqu $G,0xc0-0x80($ctx)
1290 vmovdqu $H,0xe0-0x80($ctx)
1291
1292 vmovdqu $sigma,(%rbx) # save counters
1293 lea 256+128(%rsp),%rbx
1294 vmovdqu .Lpbswap(%rip),$Xn
1295 dec $num
1296 jnz .Loop_avx2
1297
1298 #mov `$REG_SZ*17+8`(%rsp),$num
1299 #lea $REG_SZ($ctx),$ctx
0d51cf3c 1300 #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
1301 #dec $num
1302 #jnz .Loop_grande_avx2
1303
1304.Ldone_avx2:
0d4fb843 1305 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
399976c7 1306.cfi_def_cfa %rax,8
1307 vzeroupper
1308___
1309$code.=<<___ if ($win64);
1310 movaps -0xd8(%rax),%xmm6
1311 movaps -0xc8(%rax),%xmm7
1312 movaps -0xb8(%rax),%xmm8
1313 movaps -0xa8(%rax),%xmm9
1314 movaps -0x98(%rax),%xmm10
1315 movaps -0x88(%rax),%xmm11
1316 movaps -0x78(%rax),%xmm12
1317 movaps -0x68(%rax),%xmm13
1318 movaps -0x58(%rax),%xmm14
1319 movaps -0x48(%rax),%xmm15
1320___
1321$code.=<<___;
1322 mov -48(%rax),%r15
399976c7 1323.cfi_restore %r15
b7838586 1324 mov -40(%rax),%r14
399976c7 1325.cfi_restore %r14
b7838586 1326 mov -32(%rax),%r13
399976c7 1327.cfi_restore %r13
b7838586 1328 mov -24(%rax),%r12
399976c7 1329.cfi_restore %r12
b7838586 1330 mov -16(%rax),%rbp
399976c7 1331.cfi_restore %rbp
b7838586 1332 mov -8(%rax),%rbx
399976c7 1333.cfi_restore %rbx
b7838586 1334 lea (%rax),%rsp
399976c7 1335.cfi_def_cfa_register %rsp
619b9466 1336.Lepilogue_avx2:
b7838586 1337 ret
399976c7 1338.cfi_endproc
1339.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1340___
1341 } }}}
1342$code.=<<___;
1343.align 256
1344K256:
1345___
1346sub TABLE {
1347 foreach (@_) {
1348 $code.=<<___;
1349 .long $_,$_,$_,$_
1350 .long $_,$_,$_,$_
1351___
1352 }
1353}
1354&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1355 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1356 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1357 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1358 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1359 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1360 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1361 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1362 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1363 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1364 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1365 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1366 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1367 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1368 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1369 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1370$code.=<<___;
1371.Lpbswap:
1372 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1373 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1374K256_shaext:
1375 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1376 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1377 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1378 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1379 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1380 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1381 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1382 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1383 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1384 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1385 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1386 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1387 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1388 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1389 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1390 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1391 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1392___
1393
1394if ($win64) {
1395# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1396# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1397$rec="%rcx";
1398$frame="%rdx";
1399$context="%r8";
1400$disp="%r9";
1401
1402$code.=<<___;
1403.extern __imp_RtlVirtualUnwind
1404.type se_handler,\@abi-omnipotent
1405.align 16
1406se_handler:
1407 push %rsi
1408 push %rdi
1409 push %rbx
1410 push %rbp
1411 push %r12
1412 push %r13
1413 push %r14
1414 push %r15
1415 pushfq
1416 sub \$64,%rsp
1417
1418 mov 120($context),%rax # pull context->Rax
1419 mov 248($context),%rbx # pull context->Rip
1420
1421 mov 8($disp),%rsi # disp->ImageBase
1422 mov 56($disp),%r11 # disp->HandlerData
1423
1424 mov 0(%r11),%r10d # HandlerData[0]
1425 lea (%rsi,%r10),%r10 # end of prologue label
1426 cmp %r10,%rbx # context->Rip<.Lbody
1427 jb .Lin_prologue
1428
1429 mov 152($context),%rax # pull context->Rsp
1430
1431 mov 4(%r11),%r10d # HandlerData[1]
1432 lea (%rsi,%r10),%r10 # epilogue label
1433 cmp %r10,%rbx # context->Rip>=.Lepilogue
1434 jae .Lin_prologue
1435
1436 mov `16*17`(%rax),%rax # pull saved stack pointer
1437
1438 mov -8(%rax),%rbx
1439 mov -16(%rax),%rbp
1440 mov %rbx,144($context) # restore context->Rbx
1441 mov %rbp,160($context) # restore context->Rbp
1442
1443 lea -24-10*16(%rax),%rsi
1444 lea 512($context),%rdi # &context.Xmm6
1445 mov \$20,%ecx
1446 .long 0xa548f3fc # cld; rep movsq
1447
1448.Lin_prologue:
1449 mov 8(%rax),%rdi
1450 mov 16(%rax),%rsi
1451 mov %rax,152($context) # restore context->Rsp
1452 mov %rsi,168($context) # restore context->Rsi
1453 mov %rdi,176($context) # restore context->Rdi
1454
1455 mov 40($disp),%rdi # disp->ContextRecord
1456 mov $context,%rsi # context
1457 mov \$154,%ecx # sizeof(CONTEXT)
1458 .long 0xa548f3fc # cld; rep movsq
1459
1460 mov $disp,%rsi
1461 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1462 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1463 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1464 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1465 mov 40(%rsi),%r10 # disp->ContextRecord
1466 lea 56(%rsi),%r11 # &disp->HandlerData
1467 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1468 mov %r10,32(%rsp) # arg5
1469 mov %r11,40(%rsp) # arg6
1470 mov %r12,48(%rsp) # arg7
1471 mov %rcx,56(%rsp) # arg8, (NULL)
1472 call *__imp_RtlVirtualUnwind(%rip)
1473
1474 mov \$1,%eax # ExceptionContinueSearch
1475 add \$64,%rsp
1476 popfq
1477 pop %r15
1478 pop %r14
1479 pop %r13
1480 pop %r12
1481 pop %rbp
1482 pop %rbx
1483 pop %rdi
1484 pop %rsi
1485 ret
1486.size se_handler,.-se_handler
1487___
1488$code.=<<___ if ($avx>1);
1489.type avx2_handler,\@abi-omnipotent
1490.align 16
1491avx2_handler:
1492 push %rsi
1493 push %rdi
1494 push %rbx
1495 push %rbp
1496 push %r12
1497 push %r13
1498 push %r14
1499 push %r15
1500 pushfq
1501 sub \$64,%rsp
1502
1503 mov 120($context),%rax # pull context->Rax
1504 mov 248($context),%rbx # pull context->Rip
1505
1506 mov 8($disp),%rsi # disp->ImageBase
1507 mov 56($disp),%r11 # disp->HandlerData
1508
1509 mov 0(%r11),%r10d # HandlerData[0]
1510 lea (%rsi,%r10),%r10 # end of prologue label
1511 cmp %r10,%rbx # context->Rip<body label
1512 jb .Lin_prologue
1513
1514 mov 152($context),%rax # pull context->Rsp
1515
1516 mov 4(%r11),%r10d # HandlerData[1]
1517 lea (%rsi,%r10),%r10 # epilogue label
1518 cmp %r10,%rbx # context->Rip>=epilogue label
1519 jae .Lin_prologue
1520
1521 mov `32*17`($context),%rax # pull saved stack pointer
1522
1523 mov -8(%rax),%rbx
1524 mov -16(%rax),%rbp
1525 mov -24(%rax),%r12
1526 mov -32(%rax),%r13
1527 mov -40(%rax),%r14
1528 mov -48(%rax),%r15
1529 mov %rbx,144($context) # restore context->Rbx
1530 mov %rbp,160($context) # restore context->Rbp
1531 mov %r12,216($context) # restore context->R12
1532 mov %r13,224($context) # restore context->R13
1533 mov %r14,232($context) # restore context->R14
1534 mov %r15,240($context) # restore context->R15
1535
1536 lea -56-10*16(%rax),%rsi
1537 lea 512($context),%rdi # &context.Xmm6
1538 mov \$20,%ecx
1539 .long 0xa548f3fc # cld; rep movsq
1540
1541 jmp .Lin_prologue
1542.size avx2_handler,.-avx2_handler
1543___
1544$code.=<<___;
1545.section .pdata
1546.align 4
1547 .rva .LSEH_begin_sha256_multi_block
1548 .rva .LSEH_end_sha256_multi_block
1549 .rva .LSEH_info_sha256_multi_block
1550 .rva .LSEH_begin_sha256_multi_block_shaext
1551 .rva .LSEH_end_sha256_multi_block_shaext
1552 .rva .LSEH_info_sha256_multi_block_shaext
1553___
1554$code.=<<___ if ($avx);
1555 .rva .LSEH_begin_sha256_multi_block_avx
1556 .rva .LSEH_end_sha256_multi_block_avx
1557 .rva .LSEH_info_sha256_multi_block_avx
1558___
1559$code.=<<___ if ($avx>1);
1560 .rva .LSEH_begin_sha256_multi_block_avx2
1561 .rva .LSEH_end_sha256_multi_block_avx2
1562 .rva .LSEH_info_sha256_multi_block_avx2
1563___
1564$code.=<<___;
1565.section .xdata
1566.align 8
1567.LSEH_info_sha256_multi_block:
1568 .byte 9,0,0,0
1569 .rva se_handler
1570 .rva .Lbody,.Lepilogue # HandlerData[]
1571.LSEH_info_sha256_multi_block_shaext:
1572 .byte 9,0,0,0
1573 .rva se_handler
1574 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1575___
1576$code.=<<___ if ($avx);
1577.LSEH_info_sha256_multi_block_avx:
1578 .byte 9,0,0,0
1579 .rva se_handler
1580 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1581___
1582$code.=<<___ if ($avx>1);
1583.LSEH_info_sha256_multi_block_avx2:
1584 .byte 9,0,0,0
1585 .rva avx2_handler
1586 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1587___
1588}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if ($dst>=8);
    $rex|=0x01                  if ($src>=8);
    unshift @opcode,$rex|0x40   if ($rex);
}

sub sha256op38 {
my $instr = shift;
my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd );

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$2,$1);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);   # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".@_[0];
    }
}
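# sha256op38() hand-assembles the SHA-NI instructions as raw .byte sequences,
# so the module builds even with toolchains that do not know these mnemonics;
# anything it does not recognise is passed through unchanged.  Worked example:
# "sha256rnds2 %xmm12,%xmm13" (i.e. $ABEF0,$CDGH0 in the shaext code above)
# evaluates to ".byte 0x45,0x0f,0x38,0xcb,0xec" -- REX 0x45 extends both
# register fields past xmm7, 0x0f,0x38,0xcb is the SHA256RNDS2 opcode, and
# 0xec is the ModR/M byte (mod=11, reg=xmm13, r/m=xmm12).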

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;

    s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo         or

    s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
    s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
    s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
    s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
    s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
    s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";