3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
15 # parallelism, interleaving it with another algorithm would allow to
16 # utilize processor resources better and achieve better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is weaved into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
24 # AES-128/-192/-256+SHA256 this(**)gain
25 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
26 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
27 # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
28 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
30 # (*) there are XOP, AVX1 and AVX2 code paths, meaning that
31 # Westmere is omitted from loop, this is because gain was not
32 # estimated high enough to justify the effort;
33 # (**) these are EVP-free results, results obtained with 'speed
34 # -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
# Pick up the perlasm "flavour" from the command line; a lone argument
# containing a dot is in fact the output file name.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script's directory,
# falling back to the in-tree perlasm/ location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Probe the available assembler to decide how much AVX code may be emitted:
# $avx = 0 (none), 1 (AVX1/XOP), 2 (AVX2).  GNU as is checked first, then
# nasm and ml64 on Win64 builds.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

# Pipe all generated code through the perlasm translator.
open OUT,"| \"$^X\" $xlate $flavour $output";
$func="aesni_cbc_sha256_enc";

# SHA-256 working variables A..H; @ROT is rotated once per round so each
# round body can be written in terms of fixed symbolic names.
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			...);
# Argument registers per the x86_64 (SysV) calling convention; $in0 is
# the 7th argument and arrives on the stack.
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

# Stack-frame slots where the arguments are spilled for the duration of
# the stitched loop (the 16*$SZ bytes below them hold the message schedule).
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
106 .extern OPENSSL_ia32cap_P
108 .type
$func,\
@abi-omnipotent
114 lea OPENSSL_ia32cap_P
(%rip),%r11
116 cmp \
$0,`$win64?"%rcx":"%rdi"`
121 $code.=<<___
if ($shaext);
122 bt \
$61,%r10 # check for SHA
129 test \
$`1<<11`,%r10d # check for XOP
132 $code.=<<___
if ($avx>1);
133 and \
$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
134 cmp \
$`1<<8|1<<5|1<<3`,%r11d
138 and \
$`1<<30`,%eax # mask "Intel CPU" bit
139 and \
$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
141 cmp \
$`1<<28|1<<9|1<<30`,%r10d
148 cmp \
$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
$TABLE:
	# SHA-256 round constants; each row is repeated so the AVX2 path can
	# load the same constant into both 128-bit lanes of a %ymm register.
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	# Byte-swap mask (big-endian load) and the 10/12/14-round CBC tail masks.
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
199 ######################################################################
203 ($iv,$inout,$roundkey,$temp,
204 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
208 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
209 ## &vmovdqu ($inout,($inp));
210 ## &mov ($_inp,$inp);
212 '&vpxor ($inout,$inout,$roundkey);'.
213 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
215 '&vpxor ($inout,$inout,$iv);',
217 '&vaesenc ($inout,$inout,$roundkey);'.
218 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
220 '&vaesenc ($inout,$inout,$roundkey);'.
221 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
223 '&vaesenc ($inout,$inout,$roundkey);'.
224 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
226 '&vaesenc ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
229 '&vaesenc ($inout,$inout,$roundkey);'.
230 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
232 '&vaesenc ($inout,$inout,$roundkey);'.
233 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
235 '&vaesenc ($inout,$inout,$roundkey);'.
236 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
238 '&vaesenc ($inout,$inout,$roundkey);'.
239 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
241 '&vaesenc ($inout,$inout,$roundkey);'.
242 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
244 '&vaesenclast ($temp,$inout,$roundkey);'.
245 ' &vaesenc ($inout,$inout,$roundkey);'.
246 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
248 '&vpand ($iv,$temp,$mask10);'.
249 ' &vaesenc ($inout,$inout,$roundkey);'.
250 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
252 '&vaesenclast ($temp,$inout,$roundkey);'.
253 ' &vaesenc ($inout,$inout,$roundkey);'.
254 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
256 '&vpand ($temp,$temp,$mask12);'.
257 ' &vaesenc ($inout,$inout,$roundkey);'.
258 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
260 '&vpor ($iv,$iv,$temp);'.
261 ' &vaesenclast ($temp,$inout,$roundkey);'.
262 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
264 ## &mov ($inp,$_inp);
265 ## &mov ($out,$_out);
266 ## &vpand ($temp,$temp,$mask14);
267 ## &vpor ($iv,$iv,$temp);
268 ## &vmovdqu ($iv,($out,$inp);
269 ## &lea (inp,16($inp));
273 my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    # A purely numeric last operand is an immediate; prefix it with '$'.
    $arg = "\$$arg" if ($arg*1 eq $arg);
    # Emit "opcode src,...,dst" — perlasm argument order is reversed
    # relative to AT&T syntax, hence the reverse().
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
284 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
286 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
291 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
292 '&xor ($a4,$g)', # f^g
294 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
296 '&and ($a4,$e)', # (f^g)&e
298 @aesni_cbc_block[$aesni_cbc_idx++].
300 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
303 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
304 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
305 '&xor ($a2,$b)', # a^b, b^c in next round
307 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
308 '&add ($h,$a4)', # h+=Ch(e,f,g)
309 '&and ($a3,$a2)', # (b^c)&(a^b)
312 '&add ($h,$a0)', # h+=Sigma1(e)
313 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
315 '&add ($d,$h)', # d+=h
316 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
317 '&add ($h,$a3)', # h+=Maj(a,b,c)
320 '&add ($a1,$h);'. # h+=Sigma0(a)
321 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
326 ######################################################################
330 .type
${func
}_xop
,\
@function,6
334 mov
`($win64?56:8)`(%rsp),$in0 # load 7th parameter
341 mov
%rsp,%r11 # copy %rsp
342 sub \
$`$framesz+$win64*16*10`,%rsp
343 and \
$-64,%rsp # align stack frame
346 sub $inp,$out # re-bias
348 add
$inp,$len # end of input
350 #mov $inp,$_inp # saved later
353 #mov $key,$_key # remains resident in $inp register
359 $code.=<<___
if ($win64);
360 movaps
%xmm6,`$framesz+16*0`(%rsp)
361 movaps
%xmm7,`$framesz+16*1`(%rsp)
362 movaps
%xmm8,`$framesz+16*2`(%rsp)
363 movaps
%xmm9,`$framesz+16*3`(%rsp)
364 movaps
%xmm10,`$framesz+16*4`(%rsp)
365 movaps
%xmm11,`$framesz+16*5`(%rsp)
366 movaps
%xmm12,`$framesz+16*6`(%rsp)
367 movaps
%xmm13,`$framesz+16*7`(%rsp)
368 movaps
%xmm14,`$framesz+16*8`(%rsp)
369 movaps
%xmm15,`$framesz+16*9`(%rsp)
375 mov
$inp,%r12 # borrow $a4
376 lea
0x80($key),$inp # size optimization, reassign
377 lea
$TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
378 mov
0xf0-0x80($inp),%r14d # rounds, borrow $a1
379 mov
$ctx,%r15 # borrow $a2
380 mov
$in0,%rsi # borrow $a3
381 vmovdqu
($ivp),$iv # load IV
393 vmovdqa
0x00(%r13,%r14,8),$mask14
394 vmovdqa
0x10(%r13,%r14,8),$mask12
395 vmovdqa
0x20(%r13,%r14,8),$mask10
396 vmovdqu
0x00-0x80($inp),$roundkey
399 if ($SZ==4) { # SHA256
400 my @X = map("%xmm$_",(0..3));
401 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
406 vmovdqa
$TABLE+`$SZ*2*$rounds`(%rip),$t3
407 vmovdqu
0x00(%rsi,%r12),@X[0]
408 vmovdqu
0x10(%rsi,%r12),@X[1]
409 vmovdqu
0x20(%rsi,%r12),@X[2]
410 vmovdqu
0x30(%rsi,%r12),@X[3]
411 vpshufb
$t3,@X[0],@X[0]
412 lea
$TABLE(%rip),$Tbl
413 vpshufb
$t3,@X[1],@X[1]
414 vpshufb
$t3,@X[2],@X[2]
415 vpaddd
0x00($Tbl),@X[0],$t0
416 vpshufb
$t3,@X[3],@X[3]
417 vpaddd
0x20($Tbl),@X[1],$t1
418 vpaddd
0x40($Tbl),@X[2],$t2
419 vpaddd
0x60($Tbl),@X[3],$t3
420 vmovdqa
$t0,0x00(%rsp)
422 vmovdqa
$t1,0x10(%rsp)
424 vmovdqa
$t2,0x20(%rsp)
426 vmovdqa
$t3,0x30(%rsp)
432 sub \
$-16*2*$SZ,$Tbl # size optimization
433 vmovdqu
(%r12),$inout # $a4
436 sub XOP_256_00_47
() {
440 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
442 &vpalignr
($t0,@X[1],@X[0],$SZ); # X[1..4]
445 &vpalignr
($t3,@X[3],@X[2],$SZ); # X[9..12]
448 &vprotd
($t1,$t0,8*$SZ-$sigma0[1]);
451 &vpsrld
($t0,$t0,$sigma0[2]);
454 &vpaddd
(@X[0],@X[0],$t3); # X[0..3] += X[9..12]
459 &vprotd
($t2,$t1,$sigma0[1]-$sigma0[0]);
462 &vpxor
($t0,$t0,$t1);
467 &vprotd
($t3,@X[3],8*$SZ-$sigma1[1]);
470 &vpxor
($t0,$t0,$t2); # sigma0(X[1..4])
473 &vpsrld
($t2,@X[3],$sigma1[2]);
476 &vpaddd
(@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
479 &vprotd
($t1,$t3,$sigma1[1]-$sigma1[0]);
482 &vpxor
($t3,$t3,$t2);
487 &vpxor
($t3,$t3,$t1); # sigma1(X[14..15])
492 &vpsrldq
($t3,$t3,8);
497 &vpaddd
(@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
502 &vprotd
($t3,@X[0],8*$SZ-$sigma1[1]);
505 &vpsrld
($t2,@X[0],$sigma1[2]);
508 &vprotd
($t1,$t3,$sigma1[1]-$sigma1[0]);
511 &vpxor
($t3,$t3,$t2);
516 &vpxor
($t3,$t3,$t1); # sigma1(X[16..17])
521 &vpslldq
($t3,$t3,8); # 22 instructions
526 &vpaddd
(@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
531 &vpaddd
($t2,@X[0],16*2*$j."($Tbl)");
532 foreach (@insns) { eval; } # remaining instructions
533 &vmovdqa
(16*$j."(%rsp)",$t2);
537 for ($i=0,$j=0; $j<4; $j++) {
538 &XOP_256_00_47
($j,\
&body_00_15
,@X);
539 push(@X,shift(@X)); # rotate(@X)
541 &mov
("%r12",$_inp); # borrow $a4
542 &vpand
($temp,$temp,$mask14);
543 &mov
("%r15",$_out); # borrow $a2
544 &vpor
($iv,$iv,$temp);
545 &vmovdqu
("(%r15,%r12)",$iv); # write output
546 &lea
("%r12","16(%r12)"); # inp++
548 &cmpb
($SZ-1+16*2*$SZ."($Tbl)",0);
549 &jne
(".Lxop_00_47");
551 &vmovdqu
($inout,"(%r12)");
555 for ($i=0; $i<16; ) {
556 foreach(body_00_15
()) { eval; }
560 mov
$_inp,%r12 # borrow $a4
561 mov
$_out,%r13 # borrow $a0
562 mov
$_ctx,%r15 # borrow $a2
563 mov
$_in0,%rsi # borrow $a3
565 vpand
$mask14,$temp,$temp
568 vmovdqu
$iv,(%r13,%r12) # write output
569 lea
16(%r12),%r12 # inp++
595 vmovdqu
$iv,($ivp) # output IV
598 $code.=<<___
if ($win64);
599 movaps
`$framesz+16*0`(%rsp),%xmm6
600 movaps
`$framesz+16*1`(%rsp),%xmm7
601 movaps
`$framesz+16*2`(%rsp),%xmm8
602 movaps
`$framesz+16*3`(%rsp),%xmm9
603 movaps
`$framesz+16*4`(%rsp),%xmm10
604 movaps
`$framesz+16*5`(%rsp),%xmm11
605 movaps
`$framesz+16*6`(%rsp),%xmm12
606 movaps
`$framesz+16*7`(%rsp),%xmm13
607 movaps
`$framesz+16*8`(%rsp),%xmm14
608 movaps
`$framesz+16*9`(%rsp),%xmm15
620 .size
${func
}_xop
,.-${func
}_xop
622 ######################################################################
625 local *ror
= sub { &shrd
(@_[0],@_) };
628 .type
${func
}_avx
,\
@function,6
632 mov
`($win64?56:8)`(%rsp),$in0 # load 7th parameter
639 mov
%rsp,%r11 # copy %rsp
640 sub \
$`$framesz+$win64*16*10`,%rsp
641 and \
$-64,%rsp # align stack frame
644 sub $inp,$out # re-bias
646 add
$inp,$len # end of input
648 #mov $inp,$_inp # saved later
651 #mov $key,$_key # remains resident in $inp register
657 $code.=<<___
if ($win64);
658 movaps
%xmm6,`$framesz+16*0`(%rsp)
659 movaps
%xmm7,`$framesz+16*1`(%rsp)
660 movaps
%xmm8,`$framesz+16*2`(%rsp)
661 movaps
%xmm9,`$framesz+16*3`(%rsp)
662 movaps
%xmm10,`$framesz+16*4`(%rsp)
663 movaps
%xmm11,`$framesz+16*5`(%rsp)
664 movaps
%xmm12,`$framesz+16*6`(%rsp)
665 movaps
%xmm13,`$framesz+16*7`(%rsp)
666 movaps
%xmm14,`$framesz+16*8`(%rsp)
667 movaps
%xmm15,`$framesz+16*9`(%rsp)
673 mov
$inp,%r12 # borrow $a4
674 lea
0x80($key),$inp # size optimization, reassign
675 lea
$TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
676 mov
0xf0-0x80($inp),%r14d # rounds, borrow $a1
677 mov
$ctx,%r15 # borrow $a2
678 mov
$in0,%rsi # borrow $a3
679 vmovdqu
($ivp),$iv # load IV
691 vmovdqa
0x00(%r13,%r14,8),$mask14
692 vmovdqa
0x10(%r13,%r14,8),$mask12
693 vmovdqa
0x20(%r13,%r14,8),$mask10
694 vmovdqu
0x00-0x80($inp),$roundkey
696 if ($SZ==4) { # SHA256
697 my @X = map("%xmm$_",(0..3));
698 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
704 vmovdqa
$TABLE+`$SZ*2*$rounds`(%rip),$t3
705 vmovdqu
0x00(%rsi,%r12),@X[0]
706 vmovdqu
0x10(%rsi,%r12),@X[1]
707 vmovdqu
0x20(%rsi,%r12),@X[2]
708 vmovdqu
0x30(%rsi,%r12),@X[3]
709 vpshufb
$t3,@X[0],@X[0]
710 lea
$TABLE(%rip),$Tbl
711 vpshufb
$t3,@X[1],@X[1]
712 vpshufb
$t3,@X[2],@X[2]
713 vpaddd
0x00($Tbl),@X[0],$t0
714 vpshufb
$t3,@X[3],@X[3]
715 vpaddd
0x20($Tbl),@X[1],$t1
716 vpaddd
0x40($Tbl),@X[2],$t2
717 vpaddd
0x60($Tbl),@X[3],$t3
718 vmovdqa
$t0,0x00(%rsp)
720 vmovdqa
$t1,0x10(%rsp)
722 vmovdqa
$t2,0x20(%rsp)
724 vmovdqa
$t3,0x30(%rsp)
730 sub \
$-16*2*$SZ,$Tbl # size optimization
731 vmovdqu
(%r12),$inout # $a4
734 sub Xupdate_256_AVX
() {
736 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
737 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
738 '&vpsrld ($t2,$t0,$sigma0[0]);',
739 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
740 '&vpsrld ($t3,$t0,$sigma0[2])',
741 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
742 '&vpxor ($t0,$t3,$t2)',
743 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
744 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
745 '&vpxor ($t0,$t0,$t1)',
746 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
747 '&vpxor ($t0,$t0,$t2)',
748 '&vpsrld ($t2,$t3,$sigma1[2]);',
749 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
750 '&vpsrlq ($t3,$t3,$sigma1[0]);',
751 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
752 '&vpxor ($t2,$t2,$t3);',
753 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
754 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
755 '&vpshufd ($t2,$t2,0b10000100)',
756 '&vpsrldq ($t2,$t2,8)',
757 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
758 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
759 '&vpsrld ($t2,$t3,$sigma1[2])',
760 '&vpsrlq ($t3,$t3,$sigma1[0])',
761 '&vpxor ($t2,$t2,$t3);',
762 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
763 '&vpxor ($t2,$t2,$t3)',
764 '&vpshufd ($t2,$t2,0b11101000)',
765 '&vpslldq ($t2,$t2,8)',
766 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
770 sub AVX_256_00_47
() {
774 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
776 foreach (Xupdate_256_AVX
()) { # 29 instructions
782 &vpaddd
($t2,@X[0],16*2*$j."($Tbl)");
783 foreach (@insns) { eval; } # remaining instructions
784 &vmovdqa
(16*$j."(%rsp)",$t2);
788 for ($i=0,$j=0; $j<4; $j++) {
789 &AVX_256_00_47
($j,\
&body_00_15
,@X);
790 push(@X,shift(@X)); # rotate(@X)
792 &mov
("%r12",$_inp); # borrow $a4
793 &vpand
($temp,$temp,$mask14);
794 &mov
("%r15",$_out); # borrow $a2
795 &vpor
($iv,$iv,$temp);
796 &vmovdqu
("(%r15,%r12)",$iv); # write output
797 &lea
("%r12","16(%r12)"); # inp++
799 &cmpb
($SZ-1+16*2*$SZ."($Tbl)",0);
800 &jne
(".Lavx_00_47");
802 &vmovdqu
($inout,"(%r12)");
806 for ($i=0; $i<16; ) {
807 foreach(body_00_15
()) { eval; }
812 mov
$_inp,%r12 # borrow $a4
813 mov
$_out,%r13 # borrow $a0
814 mov
$_ctx,%r15 # borrow $a2
815 mov
$_in0,%rsi # borrow $a3
817 vpand
$mask14,$temp,$temp
820 vmovdqu
$iv,(%r13,%r12) # write output
821 lea
16(%r12),%r12 # inp++
846 vmovdqu
$iv,($ivp) # output IV
849 $code.=<<___
if ($win64);
850 movaps
`$framesz+16*0`(%rsp),%xmm6
851 movaps
`$framesz+16*1`(%rsp),%xmm7
852 movaps
`$framesz+16*2`(%rsp),%xmm8
853 movaps
`$framesz+16*3`(%rsp),%xmm9
854 movaps
`$framesz+16*4`(%rsp),%xmm10
855 movaps
`$framesz+16*5`(%rsp),%xmm11
856 movaps
`$framesz+16*6`(%rsp),%xmm12
857 movaps
`$framesz+16*7`(%rsp),%xmm13
858 movaps
`$framesz+16*8`(%rsp),%xmm14
859 movaps
`$framesz+16*9`(%rsp),%xmm15
871 .size
${func
}_avx
,.-${func
}_avx
875 ######################################################################
878 my $a5=$SZ==4?
"%esi":"%rsi"; # zap $inp
883 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
885 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
887 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
888 '&and ($a4,$e)', # f&e
889 '&rorx ($a0,$e,$Sigma1[2])',
890 '&rorx ($a2,$e,$Sigma1[1])',
892 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
893 '&lea ($h,"($h,$a4)")',
894 '&andn ($a4,$e,$g)', # ~e&g
897 '&rorx ($a1,$e,$Sigma1[0])',
898 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
899 '&xor ($a0,$a1)', # Sigma1(e)
902 '&rorx ($a4,$a,$Sigma0[2])',
903 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
904 '&xor ($a2,$b)', # a^b, b^c in next round
905 '&rorx ($a1,$a,$Sigma0[1])',
907 '&rorx ($a0,$a,$Sigma0[0])',
908 '&lea ($d,"($d,$h)")', # d+=h
909 '&and ($a3,$a2)', # (b^c)&(a^b)
910 @aesni_cbc_block[$aesni_cbc_idx++].
913 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
914 '&xor ($a1,$a0)', # Sigma0(a)
915 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
916 '&mov ($a4,$e)', # copy of f in future
918 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
920 # and at the finish one has to $a+=$a1
924 .type
${func
}_avx2
,\
@function,6
928 mov
`($win64?56:8)`(%rsp),$in0 # load 7th parameter
935 mov
%rsp,%r11 # copy %rsp
936 sub \
$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
937 and \
$-256*$SZ,%rsp # align stack frame
938 add \
$`2*$SZ*($rounds-8)`,%rsp
941 sub $inp,$out # re-bias
943 add
$inp,$len # end of input
945 #mov $inp,$_inp # saved later
946 #mov $out,$_out # kept in $offload
948 #mov $key,$_key # remains resident in $inp register
954 $code.=<<___
if ($win64);
955 movaps
%xmm6,`$framesz+16*0`(%rsp)
956 movaps
%xmm7,`$framesz+16*1`(%rsp)
957 movaps
%xmm8,`$framesz+16*2`(%rsp)
958 movaps
%xmm9,`$framesz+16*3`(%rsp)
959 movaps
%xmm10,`$framesz+16*4`(%rsp)
960 movaps
%xmm11,`$framesz+16*5`(%rsp)
961 movaps
%xmm12,`$framesz+16*6`(%rsp)
962 movaps
%xmm13,`$framesz+16*7`(%rsp)
963 movaps
%xmm14,`$framesz+16*8`(%rsp)
964 movaps
%xmm15,`$framesz+16*9`(%rsp)
970 mov
$inp,%r13 # borrow $a0
971 vpinsrq \
$1,$out,$offload,$offload
972 lea
0x80($key),$inp # size optimization, reassign
973 lea
$TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
974 mov
0xf0-0x80($inp),%r14d # rounds, borrow $a1
975 mov
$ctx,%r15 # borrow $a2
976 mov
$in0,%rsi # borrow $a3
977 vmovdqu
($ivp),$iv # load IV
980 vmovdqa
0x00(%r12,%r14,8),$mask14
981 vmovdqa
0x10(%r12,%r14,8),$mask12
982 vmovdqa
0x20(%r12,%r14,8),$mask10
984 sub \
$-16*$SZ,%r13 # inp++, size optimization
986 lea
(%rsi,%r13),%r12 # borrow $a0
988 cmp $len,%r13 # $_end
990 cmove
%rsp,%r12 # next block or random data
996 vmovdqu
0x00-0x80($inp),$roundkey
998 if ($SZ==4) { # SHA256
999 my @X = map("%ymm$_",(0..3));
1000 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1006 vmovdqa
$TABLE+`$SZ*2*$rounds`(%rip),$t3
1007 vmovdqu
-16*$SZ+0(%rsi,%r13),%xmm0
1008 vmovdqu
-16*$SZ+16(%rsi,%r13),%xmm1
1009 vmovdqu
-16*$SZ+32(%rsi,%r13),%xmm2
1010 vmovdqu
-16*$SZ+48(%rsi,%r13),%xmm3
1012 vinserti128 \
$1,(%r12),@X[0],@X[0]
1013 vinserti128 \
$1,16(%r12),@X[1],@X[1]
1014 vpshufb
$t3,@X[0],@X[0]
1015 vinserti128 \
$1,32(%r12),@X[2],@X[2]
1016 vpshufb
$t3,@X[1],@X[1]
1017 vinserti128 \
$1,48(%r12),@X[3],@X[3]
1019 lea
$TABLE(%rip),$Tbl
1020 vpshufb
$t3,@X[2],@X[2]
1021 lea
-16*$SZ(%r13),%r13
1022 vpaddd
0x00($Tbl),@X[0],$t0
1023 vpshufb
$t3,@X[3],@X[3]
1024 vpaddd
0x20($Tbl),@X[1],$t1
1025 vpaddd
0x40($Tbl),@X[2],$t2
1026 vpaddd
0x60($Tbl),@X[3],$t3
1027 vmovdqa
$t0,0x00(%rsp)
1029 vmovdqa
$t1,0x20(%rsp)
1030 lea
-$PUSH8(%rsp),%rsp
1032 vmovdqa
$t2,0x00(%rsp)
1034 vmovdqa
$t3,0x20(%rsp)
1036 sub \
$-16*2*$SZ,$Tbl # size optimization
1041 vmovdqu
(%r13),$inout
1042 vpinsrq \
$0,%r13,$offload,$offload
1045 sub AVX2_256_00_47
() {
1049 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1050 my $base = "+2*$PUSH8(%rsp)";
1052 &lea
("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1053 foreach (Xupdate_256_AVX
()) { # 29 instructions
1055 eval(shift(@insns));
1056 eval(shift(@insns));
1057 eval(shift(@insns));
1059 &vpaddd
($t2,@X[0],16*2*$j."($Tbl)");
1060 foreach (@insns) { eval; } # remaining instructions
1061 &vmovdqa
((32*$j)%$PUSH8."(%rsp)",$t2);
1064 for ($i=0,$j=0; $j<4; $j++) {
1065 &AVX2_256_00_47
($j,\
&bodyx_00_15
,@X);
1066 push(@X,shift(@X)); # rotate(@X)
1068 &vmovq
("%r13",$offload); # borrow $a0
1069 &vpextrq
("%r15",$offload,1); # borrow $a2
1070 &vpand
($temp,$temp,$mask14);
1071 &vpor
($iv,$iv,$temp);
1072 &vmovdqu
("(%r15,%r13)",$iv); # write output
1073 &lea
("%r13","16(%r13)"); # inp++
1075 &lea
($Tbl,16*2*$SZ."($Tbl)");
1076 &cmpb
(($SZ-1)."($Tbl)",0);
1077 &jne
(".Lavx2_00_47");
1079 &vmovdqu
($inout,"(%r13)");
1080 &vpinsrq
($offload,$offload,"%r13",0);
1083 for ($i=0; $i<16; ) {
1084 my $base=$i<8?
"+$PUSH8(%rsp)":"(%rsp)";
1085 foreach(bodyx_00_15
()) { eval; }
1089 vpextrq \
$1,$offload,%r12 # $_out, borrow $a4
1090 vmovq
$offload,%r13 # $_inp, borrow $a0
1091 mov
`2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1093 lea
`2*$SZ*($rounds-8)`(%rsp),$Tbl
1095 vpand
$mask14,$temp,$temp
1097 vmovdqu
$iv,(%r12,%r13) # write output
1118 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1128 vmovdqu
(%r13),$inout
1129 vpinsrq \
$0,%r13,$offload,$offload
1132 for ($i=0; $i<16; ) {
1133 my $base="+16($Tbl)";
1134 foreach(bodyx_00_15
()) { eval; }
1135 &lea
($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1138 vmovq
$offload,%r13 # borrow $a0
1139 vpextrq \
$1,$offload,%r15 # borrow $a2
1140 vpand
$mask14,$temp,$temp
1142 lea
-$PUSH8($Tbl),$Tbl
1143 vmovdqu
$iv,(%r15,%r13) # write output
1144 lea
16(%r13),%r13 # inp++
1148 mov
`2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1149 lea
16*$SZ(%r13),%r13
1150 mov
`2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1152 lea
`2*$SZ*($rounds-8)`(%rsp),%rsp
1161 lea
(%rsi,%r13),%r12
1167 cmove
%rsp,%r12 # next block or stale data
1183 vmovdqu
$iv,($ivp) # output IV
1186 $code.=<<___
if ($win64);
1187 movaps
`$framesz+16*0`(%rsp),%xmm6
1188 movaps
`$framesz+16*1`(%rsp),%xmm7
1189 movaps
`$framesz+16*2`(%rsp),%xmm8
1190 movaps
`$framesz+16*3`(%rsp),%xmm9
1191 movaps
`$framesz+16*4`(%rsp),%xmm10
1192 movaps
`$framesz+16*5`(%rsp),%xmm11
1193 movaps
`$framesz+16*6`(%rsp),%xmm12
1194 movaps
`$framesz+16*7`(%rsp),%xmm13
1195 movaps
`$framesz+16*8`(%rsp),%xmm14
1196 movaps
`$framesz+16*9`(%rsp),%xmm15
1208 .size
${func
}_avx2
,.-${func
}_avx2
1213 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1215 my ($rounds,$Tbl)=("%r11d","%rbx");
1217 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1218 my @rndkey=("%xmm4","%xmm5");
1222 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1223 my @MSG=map("%xmm$_",(10..13));
1227 my ($n,$k)=($r/10,$r%10);
1230 movups
`16*$n`($in0),$in # load input
1233 $code.=<<___
if ($n);
1234 movups
$iv,`16*($n-1)`($out,$in0) # write output
1238 movups
`32+16*$k-112`($key),$rndkey[1]
1239 aesenc
$rndkey[0],$iv
1246 movups
`32+16*($k+0)-112`($key),$rndkey[1]
1247 aesenc
$rndkey[0],$iv
1248 movups
`32+16*($k+1)-112`($key),$rndkey[0]
1249 aesenc
$rndkey[1],$iv
1251 movups
`32+16*($k+2)-112`($key),$rndkey[1]
1252 aesenc
$rndkey[0],$iv
1253 movups
`32+16*($k+3)-112`($key),$rndkey[0]
1254 aesenc
$rndkey[1],$iv
1256 aesenclast
$rndkey[0],$iv
1257 movups
16-112($key),$rndkey[1] # forward reference
1262 movups
`32+16*$k-112`($key),$rndkey[1]
1263 aesenc
$rndkey[0],$iv
1266 $r++; unshift(@rndkey,pop(@rndkey));
1273 .type
${func
}_shaext
,\
@function,6
1276 mov
`($win64?56:8)`(%rsp),$inp # load 7th argument
1278 $code.=<<___
if ($win64);
1279 lea
`-8-10*16`(%rsp),%rsp
1280 movaps
%xmm6,-8-10*16(%rax)
1281 movaps
%xmm7,-8-9*16(%rax)
1282 movaps
%xmm8,-8-8*16(%rax)
1283 movaps
%xmm9,-8-7*16(%rax)
1284 movaps
%xmm10,-8-6*16(%rax)
1285 movaps
%xmm11,-8-5*16(%rax)
1286 movaps
%xmm12,-8-4*16(%rax)
1287 movaps
%xmm13,-8-3*16(%rax)
1288 movaps
%xmm14,-8-2*16(%rax)
1289 movaps
%xmm15,-8-1*16(%rax)
1293 lea K256
+0x80(%rip),$Tbl
1294 movdqu
($ctx),$ABEF # DCBA
1295 movdqu
16($ctx),$CDGH # HGFE
1296 movdqa
0x200-0x80($Tbl),$TMP # byte swap mask
1298 mov
240($key),$rounds
1300 movups
($key),$rndkey0 # $key[0]
1301 movups
16($key),$rndkey[0] # forward reference
1302 lea
112($key),$key # size optimization
1304 pshufd \
$0x1b,$ABEF,$Wi # ABCD
1305 pshufd \
$0xb1,$ABEF,$ABEF # CDAB
1306 pshufd \
$0x1b,$CDGH,$CDGH # EFGH
1307 movdqa
$TMP,$BSWAP # offload
1308 palignr \
$8,$CDGH,$ABEF # ABEF
1309 punpcklqdq
$Wi,$CDGH # CDGH
1315 movdqu
($inp),@MSG[0]
1316 movdqu
0x10($inp),@MSG[1]
1317 movdqu
0x20($inp),@MSG[2]
1319 movdqu
0x30($inp),@MSG[3]
1321 movdqa
0*32-0x80($Tbl),$Wi
1324 movdqa
$CDGH,$CDGH_SAVE # offload
1325 movdqa
$ABEF,$ABEF_SAVE # offload
1329 sha256rnds2
$ABEF,$CDGH # 0-3
1330 pshufd \
$0x0e,$Wi,$Wi
1334 sha256rnds2
$CDGH,$ABEF
1336 movdqa
1*32-0x80($Tbl),$Wi
1343 sha256rnds2
$ABEF,$CDGH # 4-7
1344 pshufd \
$0x0e,$Wi,$Wi
1348 sha256rnds2
$CDGH,$ABEF
1350 movdqa
2*32-0x80($Tbl),$Wi
1353 sha256msg1
@MSG[1],@MSG[0]
1357 sha256rnds2
$ABEF,$CDGH # 8-11
1358 pshufd \
$0x0e,$Wi,$Wi
1360 palignr \
$4,@MSG[2],$TMP
1365 sha256rnds2
$CDGH,$ABEF
1367 movdqa
3*32-0x80($Tbl),$Wi
1369 sha256msg2
@MSG[3],@MSG[0]
1370 sha256msg1
@MSG[2],@MSG[1]
1374 sha256rnds2
$ABEF,$CDGH # 12-15
1375 pshufd \
$0x0e,$Wi,$Wi
1380 palignr \
$4,@MSG[3],$TMP
1382 sha256rnds2
$CDGH,$ABEF
1384 for($i=4;$i<16-3;$i++) {
1385 &$aesenc() if (($r%10)==0);
1387 movdqa
$i*32-0x80($Tbl),$Wi
1389 sha256msg2
@MSG[0],@MSG[1]
1390 sha256msg1
@MSG[3],@MSG[2]
1394 sha256rnds2
$ABEF,$CDGH # 16-19...
1395 pshufd \
$0x0e,$Wi,$Wi
1397 palignr \
$4,@MSG[0],$TMP
1401 &$aesenc() if ($r==19);
1403 sha256rnds2
$CDGH,$ABEF
1405 push(@MSG,shift(@MSG));
1408 movdqa
13*32-0x80($Tbl),$Wi
1410 sha256msg2
@MSG[0],@MSG[1]
1411 sha256msg1
@MSG[3],@MSG[2]
1415 sha256rnds2
$ABEF,$CDGH # 52-55
1416 pshufd \
$0x0e,$Wi,$Wi
1418 palignr \
$4,@MSG[0],$TMP
1424 sha256rnds2
$CDGH,$ABEF
1426 movdqa
14*32-0x80($Tbl),$Wi
1428 sha256msg2
@MSG[1],@MSG[2]
1433 sha256rnds2
$ABEF,$CDGH # 56-59
1434 pshufd \
$0x0e,$Wi,$Wi
1438 sha256rnds2
$CDGH,$ABEF
1440 movdqa
15*32-0x80($Tbl),$Wi
1446 sha256rnds2
$ABEF,$CDGH # 60-63
1447 pshufd \
$0x0e,$Wi,$Wi
1451 sha256rnds2
$CDGH,$ABEF
1452 #pxor $CDGH,$rndkey0 # black magic
1454 while ($r<40) { &$aesenc(); } # remaining aesenc's
1456 #xorps $CDGH,$rndkey0 # black magic
1457 paddd
$CDGH_SAVE,$CDGH
1458 paddd
$ABEF_SAVE,$ABEF
1461 movups
$iv,48($out,$in0) # write output
1465 pshufd \
$0xb1,$CDGH,$CDGH # DCHG
1466 pshufd \
$0x1b,$ABEF,$TMP # FEBA
1467 pshufd \
$0xb1,$ABEF,$ABEF # BAFE
1468 punpckhqdq
$CDGH,$ABEF # DCBA
1469 palignr \
$8,$TMP,$CDGH # HGFE
1471 movups
$iv,($ivp) # write IV
1473 movdqu
$CDGH,16($ctx)
1475 $code.=<<___
if ($win64);
1476 movaps
0*16(%rsp),%xmm6
1477 movaps
1*16(%rsp),%xmm7
1478 movaps
2*16(%rsp),%xmm8
1479 movaps
3*16(%rsp),%xmm9
1480 movaps
4*16(%rsp),%xmm10
1481 movaps
5*16(%rsp),%xmm11
1482 movaps
6*16(%rsp),%xmm12
1483 movaps
7*16(%rsp),%xmm13
1484 movaps
8*16(%rsp),%xmm14
1485 movaps
9*16(%rsp),%xmm15
1486 lea
8+10*16(%rsp),%rsp
1491 .size
${func
}_shaext
,.-${func
}_shaext
1496 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1497 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1504 $code.=<<___
if ($avx);
1505 .extern __imp_RtlVirtualUnwind
1506 .type se_handler
,\
@abi-omnipotent
1520 mov
120($context),%rax # pull context->Rax
1521 mov
248($context),%rbx # pull context->Rip
1523 mov
8($disp),%rsi # disp->ImageBase
1524 mov
56($disp),%r11 # disp->HanderlData
1526 mov
0(%r11),%r10d # HandlerData[0]
1527 lea
(%rsi,%r10),%r10 # prologue label
1528 cmp %r10,%rbx # context->Rip<prologue label
1531 mov
152($context),%rax # pull context->Rsp
1533 mov
4(%r11),%r10d # HandlerData[1]
1534 lea
(%rsi,%r10),%r10 # epilogue label
1535 cmp %r10,%rbx # context->Rip>=epilogue label
1538 $code.=<<___
if ($shaext);
1539 lea aesni_cbc_sha256_enc_shaext
(%rip),%r10
1544 lea
512($context),%rdi # &context.Xmm6
1546 .long
0xa548f3fc # cld; rep movsq
1547 lea
168(%rax),%rax # adjust stack pointer
1551 $code.=<<___
if ($avx>1);
1552 lea
.Lavx2_shortcut
(%rip),%r10
1553 cmp %r10,%rbx # context->Rip<avx2_shortcut
1557 add \
$`2*$SZ*($rounds-8)`,%rax
1561 mov
%rax,%rsi # put aside Rsp
1562 mov
16*$SZ+7*8(%rax),%rax # pull $_rsp
1571 mov
%rbx,144($context) # restore context->Rbx
1572 mov
%rbp,160($context) # restore context->Rbp
1573 mov
%r12,216($context) # restore context->R12
1574 mov
%r13,224($context) # restore context->R13
1575 mov
%r14,232($context) # restore context->R14
1576 mov
%r15,240($context) # restore context->R15
1578 lea
16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1579 lea
512($context),%rdi # &context.Xmm6
1581 .long
0xa548f3fc # cld; rep movsq
1586 mov
%rax,152($context) # restore context->Rsp
1587 mov
%rsi,168($context) # restore context->Rsi
1588 mov
%rdi,176($context) # restore context->Rdi
1590 mov
40($disp),%rdi # disp->ContextRecord
1591 mov
$context,%rsi # context
1592 mov \
$154,%ecx # sizeof(CONTEXT)
1593 .long
0xa548f3fc # cld; rep movsq
1596 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1597 mov
8(%rsi),%rdx # arg2, disp->ImageBase
1598 mov
0(%rsi),%r8 # arg3, disp->ControlPc
1599 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
1600 mov
40(%rsi),%r10 # disp->ContextRecord
1601 lea
56(%rsi),%r11 # &disp->HandlerData
1602 lea
24(%rsi),%r12 # &disp->EstablisherFrame
1603 mov
%r10,32(%rsp) # arg5
1604 mov
%r11,40(%rsp) # arg6
1605 mov
%r12,48(%rsp) # arg7
1606 mov
%rcx,56(%rsp) # arg8, (NULL)
1607 call
*__imp_RtlVirtualUnwind
(%rip)
1609 mov \
$1,%eax # ExceptionContinueSearch
1621 .size se_handler
,.-se_handler
1624 .rva
.LSEH_begin_
${func
}_xop
1625 .rva
.LSEH_end_
${func
}_xop
1626 .rva
.LSEH_info_
${func
}_xop
1628 .rva
.LSEH_begin_
${func
}_avx
1629 .rva
.LSEH_end_
${func
}_avx
1630 .rva
.LSEH_info_
${func
}_avx
1632 $code.=<<___
if ($avx>1);
1633 .rva
.LSEH_begin_
${func
}_avx2
1634 .rva
.LSEH_end_
${func
}_avx2
1635 .rva
.LSEH_info_
${func
}_avx2
1637 $code.=<<___
if ($shaext);
1638 .rva
.LSEH_begin_
${func
}_shaext
1639 .rva
.LSEH_end_
${func
}_shaext
1640 .rva
.LSEH_info_
${func
}_shaext
1642 $code.=<<___
if ($avx);
1645 .LSEH_info_
${func
}_xop
:
1648 .rva
.Lprologue_xop
,.Lepilogue_xop
# HandlerData[]
1650 .LSEH_info_
${func
}_avx
:
1653 .rva
.Lprologue_avx
,.Lepilogue_avx
# HandlerData[]
1655 $code.=<<___
if ($avx>1);
1656 .LSEH_info_
${func
}_avx2
:
1659 .rva
.Lprologue_avx2
,.Lepilogue_avx2
# HandlerData[]
1661 $code.=<<___
if ($shaext);
1662 .LSEH_info_
${func
}_shaext
:
1665 .rva
.Lprologue_shaext
,.Lepilogue_shaext
# HandlerData[]
1669 ####################################################################
1671 local *opcode
=shift;
1675 $rex|=0x04 if($dst>=8);
1676 $rex|=0x01 if($src>=8);
1677 unshift @opcode,$rex|0x40 if($rex);
1682 "sha256rnds2" => 0xcb,
1683 "sha256msg1" => 0xcc,
1684 "sha256msg2" => 0xcd );
1689 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1690 my @opcode=(0x0f,0x38);
1691 rex
(\
@opcode,$2,$1);
1692 push @opcode,$opcodelet{$instr};
1693 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1694 return ".byte\t".join(',',@opcode);
1696 return $instr."\t".@_[0];
# Expand interpolated `...` expressions embedded in the generated code,
# then rewrite sha256* mnemonics via sha256op38() into explicit .byte
# sequences (for assemblers without SHA-NI support).
1701 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1702 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;