1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows processor
16 # resources to be utilized more fully and better performance to be achieved.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                    AES-128/-192/-256+SHA256    this(**)    gain
25 # Sandy Bridge       5.05/6.05/7.05+11.6         13.0        +28%/36%/43%
26 # Ivy Bridge         5.05/6.05/7.05+10.3         11.6        +32%/41%/50%
27 # Haswell            4.43/5.29/6.19+7.80         8.79        +39%/49%/59%
28 # Bulldozer          5.77/6.89/8.00+13.7         13.7        +42%/50%/58%
29 #
30 # (*) there are XOP, AVX1 and AVX2 code paths, meaning that
31 # Westmere is omitted from the loop; this is because the gain
32 # was not estimated high enough to justify the effort;
33 # (**) these are EVP-free results; results obtained with 'speed
34 # -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
35
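# The "gain" column above is simply the ratio of the summed standalone costs
# to the stitched cost. A minimal sketch of that arithmetic (hypothetical
# helper, never called by this module):
sub _example_stitch_gain {
	my ($aes_cpb,$sha_cpb,$stitch_cpb) = @_;	# cycles per processed byte each
	return sprintf("+%.0f%%",(($aes_cpb+$sha_cpb)/$stitch_cpb-1)*100);
}
# e.g. _example_stitch_gain(5.05,11.6,13.0) gives "+28%", the Sandy Bridge
# AES-128 entry in the table above.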
36 $flavour = shift;
37 $output = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=11);
60 }
61
62 $shaext=$avx; ### set to zero if compiling for 1.0.1
63 $avx=1 if (!$shaext && $avx);
64
65 open OUT,"| \"$^X\" $xlate $flavour $output";
66 *STDOUT=*OUT;
67
68 $func="aesni_cbc_sha256_enc";
69 $TABLE="K256";
70 $SZ=4;
71 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72 "%r8d","%r9d","%r10d","%r11d");
73 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
74 @Sigma0=( 2,13,22);
75 @Sigma1=( 6,11,25);
76 @sigma0=( 7,18, 3);
77 @sigma1=(17,19,10);
78 $rounds=64;
79
80 ########################################################################
81 # void aesni_cbc_sha256_enc(const void *inp,
82 # void *out,
83 # size_t length,
84 # const AES_KEY *key,
85 # unsigned char *iv,
86 # SHA256_CTX *ctx,
87 # const void *in0);
88 ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
89 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
90
91 $Tbl="%rbp";
92
93 $_inp="16*$SZ+0*8(%rsp)";
94 $_out="16*$SZ+1*8(%rsp)";
95 $_end="16*$SZ+2*8(%rsp)";
96 $_key="16*$SZ+3*8(%rsp)";
97 $_ivp="16*$SZ+4*8(%rsp)";
98 $_ctx="16*$SZ+5*8(%rsp)";
99 $_in0="16*$SZ+6*8(%rsp)";
100 $_rsp="16*$SZ+7*8(%rsp)";
101 $framesz=16*$SZ+8*8;
102
103 $code=<<___;
104 .text
105
106 .extern OPENSSL_ia32cap_P
107 .globl $func
108 .type $func,\@abi-omnipotent
109 .align 16
110 $func:
111 ___
112 if ($avx) {
113 $code.=<<___;
114 lea OPENSSL_ia32cap_P(%rip),%r11
115 mov \$1,%eax
116 cmp \$0,`$win64?"%rcx":"%rdi"`
117 je .Lprobe
118 mov 0(%r11),%eax
119 mov 4(%r11),%r10
120 ___
121 $code.=<<___ if ($shaext);
122 bt \$61,%r10 # check for SHA
123 jc ${func}_shaext
124 ___
125 $code.=<<___;
126 mov %r10,%r11
127 shr \$32,%r11
128
129 test \$`1<<11`,%r10d # check for XOP
130 jnz ${func}_xop
131 ___
132 $code.=<<___ if ($avx>1);
133 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
134 cmp \$`1<<8|1<<5|1<<3`,%r11d
135 je ${func}_avx2
136 ___
137 $code.=<<___;
138 and \$`1<<30`,%eax # mask "Intel CPU" bit
139 and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
140 or %eax,%r10d
141 cmp \$`1<<28|1<<9|1<<30`,%r10d
142 je ${func}_avx
143 ud2
144 ___
145 }
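# A minimal Perl sketch of the dispatch emitted above (hypothetical helper,
# never called by the generator); the bit positions mirror the comments in
# the assembly and index into the OPENSSL_ia32cap_P capability words:
sub _example_pick_path {
	my ($w0,$w1,$w2) = @_;				# OPENSSL_ia32cap_P[0..2]
	return "shaext" if ($shaext && ($w2 & (1<<29)));	# SHA extensions
	return "xop"    if ($w1 & (1<<11));			# XOP
	return "avx2"   if ($avx>1 &&
		(($w2 & (1<<8|1<<5|1<<3)) == (1<<8|1<<5|1<<3)));	# BMI2+AVX2+BMI1
	return "avx"    if (($w0 & (1<<30)) &&			# "Intel CPU" bit
		(($w1 & (1<<28|1<<9)) == (1<<28|1<<9)));	# AVX+SSSE3
	return undef;						# ud2 otherwise
}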
146 $code.=<<___;
147 xor %eax,%eax
148 cmp \$0,`$win64?"%rcx":"%rdi"`
149 je .Lprobe
150 ud2
151 .Lprobe:
152 ret
153 .size $func,.-$func
154
155 .align 64
156 .type $TABLE,\@object
157 $TABLE:
158 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
159 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
160 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
161 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
162 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
163 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
164 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
165 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
167 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
168 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
169 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
170 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
171 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
172 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
173 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
174 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
175 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
176 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
177 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
178 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
179 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
180 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
181 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
182 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
183 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
184 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
187 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
188 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
189 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
190
191 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
192 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
193 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
194 .long 0,0,0,0, 0,0,0,0
195 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
196 .align 64
197 ___
198
199 ######################################################################
200 # SIMD code paths
201 #
202 {{{
203 ($iv,$inout,$roundkey,$temp,
204 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
205
206 $aesni_cbc_idx=0;
207 @aesni_cbc_block = (
208 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
209 ## &vmovdqu ($inout,($inp));
210 ## &mov ($_inp,$inp);
211
212 '&vpxor ($inout,$inout,$roundkey);'.
213 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
214
215 '&vpxor ($inout,$inout,$iv);',
216
217 '&vaesenc ($inout,$inout,$roundkey);'.
218 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
219
220 '&vaesenc ($inout,$inout,$roundkey);'.
221 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
222
223 '&vaesenc ($inout,$inout,$roundkey);'.
224 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
225
226 '&vaesenc ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
228
229 '&vaesenc ($inout,$inout,$roundkey);'.
230 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
231
232 '&vaesenc ($inout,$inout,$roundkey);'.
233 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
234
235 '&vaesenc ($inout,$inout,$roundkey);'.
236 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
237
238 '&vaesenc ($inout,$inout,$roundkey);'.
239 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
240
241 '&vaesenc ($inout,$inout,$roundkey);'.
242 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
243
244 '&vaesenclast ($temp,$inout,$roundkey);'.
245 ' &vaesenc ($inout,$inout,$roundkey);'.
246 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
247
248 '&vpand ($iv,$temp,$mask10);'.
249 ' &vaesenc ($inout,$inout,$roundkey);'.
250 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
251
252 '&vaesenclast ($temp,$inout,$roundkey);'.
253 ' &vaesenc ($inout,$inout,$roundkey);'.
254 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
255
256 '&vpand ($temp,$temp,$mask12);'.
257 ' &vaesenc ($inout,$inout,$roundkey);'.
258 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
259
260 '&vpor ($iv,$iv,$temp);'.
261 ' &vaesenclast ($temp,$inout,$roundkey);'.
262 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
263
264 ## &mov ($inp,$_inp);
265 ## &mov ($out,$_out);
266 ## &vpand ($temp,$temp,$mask14);
267 ## &vpor ($iv,$iv,$temp);
268 ## &vmovdqu ($iv,($out,$inp);
269 ## &lea (inp,16($inp));
270 );
271
272 my $a4=$T1;
273 my ($a,$b,$c,$d,$e,$f,$g,$h);
274
275 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
276 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
277 my $arg = pop;
278 $arg = "\$$arg" if ($arg*1 eq $arg);
279 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
280 }
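# For example, "&ror ($a0,$Sigma1[2]-$Sigma1[1])" falls through to this thunk
# and appends "ror\t\$14,%r13d" to $code ($a0 is %r13d and 25-11 is 14):
# numeric arguments get the "$" immediate prefix, register names pass through
# unchanged, and the operand order is reversed into AT&T syntax.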
281
282 sub body_00_15 () {
283 (
284 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
285
286 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
287 '&mov ($a,$a1)',
288 '&mov ($a4,$f)',
289
290 '&xor ($a0,$e)',
291 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
292 '&xor ($a4,$g)', # f^g
293
294 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
295 '&xor ($a1,$a)',
296 '&and ($a4,$e)', # (f^g)&e
297
298 @aesni_cbc_block[$aesni_cbc_idx++].
299 '&xor ($a0,$e)',
300 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
301 '&mov ($a2,$a)',
302
303 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
304 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
305 '&xor ($a2,$b)', # a^b, b^c in next round
306
307 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
308 '&add ($h,$a4)', # h+=Ch(e,f,g)
309 '&and ($a3,$a2)', # (b^c)&(a^b)
310
311 '&xor ($a1,$a)',
312 '&add ($h,$a0)', # h+=Sigma1(e)
313 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
314
315 '&add ($d,$h)', # d+=h
316 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
317 '&add ($h,$a3)', # h+=Maj(a,b,c)
318
319 '&mov ($a0,$d)',
320 '&add ($a1,$h);'. # h+=Sigma0(a)
321 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
322 );
323 }
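# Scalar reference for one SHA-256 round as scheduled by body_00_15 above
# (illustrative sketch only, never called; the helper names are made up):
sub _example_ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub _example_round {
	my ($Xi,$Ki,$a,$b,$c,$d,$e,$f,$g,$h) = @_;	# message word, constant, state
	my $Sigma1 = _example_ror32($e,6)^_example_ror32($e,11)^_example_ror32($e,25);
	my $Sigma0 = _example_ror32($a,2)^_example_ror32($a,13)^_example_ror32($a,22);
	my $Ch  = (($f^$g)&$e)^$g;			# == (e&f)^(~e&g)
	my $Maj = (($a^$b)&($b^$c))^$b;			# == (a&b)^(a&c)^(b&c)
	my $T1  = ($h+$Sigma1+$Ch+$Xi+$Ki) & 0xffffffff;
	my $T2  = ($Sigma0+$Maj) & 0xffffffff;
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}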
324
325 if ($avx) {{
326 ######################################################################
327 # XOP code path
328 #
329 $code.=<<___;
330 .type ${func}_xop,\@function,6
331 .align 64
332 ${func}_xop:
333 .Lxop_shortcut:
334 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
335 push %rbx
336 push %rbp
337 push %r12
338 push %r13
339 push %r14
340 push %r15
341 mov %rsp,%r11 # copy %rsp
342 sub \$`$framesz+$win64*16*10`,%rsp
343 and \$-64,%rsp # align stack frame
344
345 shl \$6,$len
346 sub $inp,$out # re-bias
347 sub $inp,$in0
348 add $inp,$len # end of input
349
350 #mov $inp,$_inp # saved later
351 mov $out,$_out
352 mov $len,$_end
353 #mov $key,$_key # remains resident in $inp register
354 mov $ivp,$_ivp
355 mov $ctx,$_ctx
356 mov $in0,$_in0
357 mov %r11,$_rsp
358 ___
359 $code.=<<___ if ($win64);
360 movaps %xmm6,`$framesz+16*0`(%rsp)
361 movaps %xmm7,`$framesz+16*1`(%rsp)
362 movaps %xmm8,`$framesz+16*2`(%rsp)
363 movaps %xmm9,`$framesz+16*3`(%rsp)
364 movaps %xmm10,`$framesz+16*4`(%rsp)
365 movaps %xmm11,`$framesz+16*5`(%rsp)
366 movaps %xmm12,`$framesz+16*6`(%rsp)
367 movaps %xmm13,`$framesz+16*7`(%rsp)
368 movaps %xmm14,`$framesz+16*8`(%rsp)
369 movaps %xmm15,`$framesz+16*9`(%rsp)
370 ___
371 $code.=<<___;
372 .Lprologue_xop:
373 vzeroall
374
375 mov $inp,%r12 # borrow $a4
376 lea 0x80($key),$inp # size optimization, reassign
377 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
378 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
379 mov $ctx,%r15 # borrow $a2
380 mov $in0,%rsi # borrow $a3
381 vmovdqu ($ivp),$iv # load IV
382 sub \$9,%r14
383
384 mov $SZ*0(%r15),$A
385 mov $SZ*1(%r15),$B
386 mov $SZ*2(%r15),$C
387 mov $SZ*3(%r15),$D
388 mov $SZ*4(%r15),$E
389 mov $SZ*5(%r15),$F
390 mov $SZ*6(%r15),$G
391 mov $SZ*7(%r15),$H
392
393 vmovdqa 0x00(%r13,%r14,8),$mask14
394 vmovdqa 0x10(%r13,%r14,8),$mask12
395 vmovdqa 0x20(%r13,%r14,8),$mask10
396 vmovdqu 0x00-0x80($inp),$roundkey
397 jmp .Lloop_xop
398 ___
399 if ($SZ==4) { # SHA256
400 my @X = map("%xmm$_",(0..3));
401 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
402
403 $code.=<<___;
404 .align 16
405 .Lloop_xop:
406 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
407 vmovdqu 0x00(%rsi,%r12),@X[0]
408 vmovdqu 0x10(%rsi,%r12),@X[1]
409 vmovdqu 0x20(%rsi,%r12),@X[2]
410 vmovdqu 0x30(%rsi,%r12),@X[3]
411 vpshufb $t3,@X[0],@X[0]
412 lea $TABLE(%rip),$Tbl
413 vpshufb $t3,@X[1],@X[1]
414 vpshufb $t3,@X[2],@X[2]
415 vpaddd 0x00($Tbl),@X[0],$t0
416 vpshufb $t3,@X[3],@X[3]
417 vpaddd 0x20($Tbl),@X[1],$t1
418 vpaddd 0x40($Tbl),@X[2],$t2
419 vpaddd 0x60($Tbl),@X[3],$t3
420 vmovdqa $t0,0x00(%rsp)
421 mov $A,$a1
422 vmovdqa $t1,0x10(%rsp)
423 mov $B,$a3
424 vmovdqa $t2,0x20(%rsp)
425 xor $C,$a3 # magic
426 vmovdqa $t3,0x30(%rsp)
427 mov $E,$a0
428 jmp .Lxop_00_47
429
430 .align 16
431 .Lxop_00_47:
432 sub \$-16*2*$SZ,$Tbl # size optimization
433 vmovdqu (%r12),$inout # $a4
434 mov %r12,$_inp # $a4
435 ___
436 sub XOP_256_00_47 () {
437 my $j = shift;
438 my $body = shift;
439 my @X = @_;
440 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
441
442 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
443 eval(shift(@insns));
444 eval(shift(@insns));
445 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
446 eval(shift(@insns));
447 eval(shift(@insns));
448 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
449 eval(shift(@insns));
450 eval(shift(@insns));
451 &vpsrld ($t0,$t0,$sigma0[2]);
452 eval(shift(@insns));
453 eval(shift(@insns));
454 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
455 eval(shift(@insns));
456 eval(shift(@insns));
457 eval(shift(@insns));
458 eval(shift(@insns));
459 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
460 eval(shift(@insns));
461 eval(shift(@insns));
462 &vpxor ($t0,$t0,$t1);
463 eval(shift(@insns));
464 eval(shift(@insns));
465 eval(shift(@insns));
466 eval(shift(@insns));
467 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
468 eval(shift(@insns));
469 eval(shift(@insns));
470 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
471 eval(shift(@insns));
472 eval(shift(@insns));
473 &vpsrld ($t2,@X[3],$sigma1[2]);
474 eval(shift(@insns));
475 eval(shift(@insns));
476 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
477 eval(shift(@insns));
478 eval(shift(@insns));
479 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
480 eval(shift(@insns));
481 eval(shift(@insns));
482 &vpxor ($t3,$t3,$t2);
483 eval(shift(@insns));
484 eval(shift(@insns));
485 eval(shift(@insns));
486 eval(shift(@insns));
487 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
488 eval(shift(@insns));
489 eval(shift(@insns));
490 eval(shift(@insns));
491 eval(shift(@insns));
492 &vpsrldq ($t3,$t3,8);
493 eval(shift(@insns));
494 eval(shift(@insns));
495 eval(shift(@insns));
496 eval(shift(@insns));
497 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
498 eval(shift(@insns));
499 eval(shift(@insns));
500 eval(shift(@insns));
501 eval(shift(@insns));
502 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
503 eval(shift(@insns));
504 eval(shift(@insns));
505 &vpsrld ($t2,@X[0],$sigma1[2]);
506 eval(shift(@insns));
507 eval(shift(@insns));
508 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
509 eval(shift(@insns));
510 eval(shift(@insns));
511 &vpxor ($t3,$t3,$t2);
512 eval(shift(@insns));
513 eval(shift(@insns));
514 eval(shift(@insns));
515 eval(shift(@insns));
516 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
517 eval(shift(@insns));
518 eval(shift(@insns));
519 eval(shift(@insns));
520 eval(shift(@insns));
521 &vpslldq ($t3,$t3,8); # 22 instructions
522 eval(shift(@insns));
523 eval(shift(@insns));
524 eval(shift(@insns));
525 eval(shift(@insns));
526 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
527 eval(shift(@insns));
528 eval(shift(@insns));
529 eval(shift(@insns));
530 eval(shift(@insns));
531 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
532 foreach (@insns) { eval; } # remaining instructions
533 &vmovdqa (16*$j."(%rsp)",$t2);
534 }
535
536 $aesni_cbc_idx=0;
537 for ($i=0,$j=0; $j<4; $j++) {
538 &XOP_256_00_47($j,\&body_00_15,@X);
539 push(@X,shift(@X)); # rotate(@X)
540 }
541 &mov ("%r12",$_inp); # borrow $a4
542 &vpand ($temp,$temp,$mask14);
543 &mov ("%r15",$_out); # borrow $a2
544 &vpor ($iv,$iv,$temp);
545 &vmovdqu ("(%r15,%r12)",$iv); # write output
546 &lea ("%r12","16(%r12)"); # inp++
547
548 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
549 &jne (".Lxop_00_47");
550
551 &vmovdqu ($inout,"(%r12)");
552 &mov ($_inp,"%r12");
553
554 $aesni_cbc_idx=0;
555 for ($i=0; $i<16; ) {
556 foreach(body_00_15()) { eval; }
557 }
558 }
559 $code.=<<___;
560 mov $_inp,%r12 # borrow $a4
561 mov $_out,%r13 # borrow $a0
562 mov $_ctx,%r15 # borrow $a2
563 mov $_in0,%rsi # borrow $a3
564
565 vpand $mask14,$temp,$temp
566 mov $a1,$A
567 vpor $temp,$iv,$iv
568 vmovdqu $iv,(%r13,%r12) # write output
569 lea 16(%r12),%r12 # inp++
570
571 add $SZ*0(%r15),$A
572 add $SZ*1(%r15),$B
573 add $SZ*2(%r15),$C
574 add $SZ*3(%r15),$D
575 add $SZ*4(%r15),$E
576 add $SZ*5(%r15),$F
577 add $SZ*6(%r15),$G
578 add $SZ*7(%r15),$H
579
580 cmp $_end,%r12
581
582 mov $A,$SZ*0(%r15)
583 mov $B,$SZ*1(%r15)
584 mov $C,$SZ*2(%r15)
585 mov $D,$SZ*3(%r15)
586 mov $E,$SZ*4(%r15)
587 mov $F,$SZ*5(%r15)
588 mov $G,$SZ*6(%r15)
589 mov $H,$SZ*7(%r15)
590
591 jb .Lloop_xop
592
593 mov $_ivp,$ivp
594 mov $_rsp,%rsi
595 vmovdqu $iv,($ivp) # output IV
596 vzeroall
597 ___
598 $code.=<<___ if ($win64);
599 movaps `$framesz+16*0`(%rsp),%xmm6
600 movaps `$framesz+16*1`(%rsp),%xmm7
601 movaps `$framesz+16*2`(%rsp),%xmm8
602 movaps `$framesz+16*3`(%rsp),%xmm9
603 movaps `$framesz+16*4`(%rsp),%xmm10
604 movaps `$framesz+16*5`(%rsp),%xmm11
605 movaps `$framesz+16*6`(%rsp),%xmm12
606 movaps `$framesz+16*7`(%rsp),%xmm13
607 movaps `$framesz+16*8`(%rsp),%xmm14
608 movaps `$framesz+16*9`(%rsp),%xmm15
609 ___
610 $code.=<<___;
611 mov (%rsi),%r15
612 mov 8(%rsi),%r14
613 mov 16(%rsi),%r13
614 mov 24(%rsi),%r12
615 mov 32(%rsi),%rbp
616 mov 40(%rsi),%rbx
617 lea 48(%rsi),%rsp
618 .Lepilogue_xop:
619 ret
620 .size ${func}_xop,.-${func}_xop
621 ___
622 ######################################################################
623 # AVX+shrd code path
624 #
625 local *ror = sub { &shrd(@_[0],@_) };
626
627 $code.=<<___;
628 .type ${func}_avx,\@function,6
629 .align 64
630 ${func}_avx:
631 .Lavx_shortcut:
632 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
633 push %rbx
634 push %rbp
635 push %r12
636 push %r13
637 push %r14
638 push %r15
639 mov %rsp,%r11 # copy %rsp
640 sub \$`$framesz+$win64*16*10`,%rsp
641 and \$-64,%rsp # align stack frame
642
643 shl \$6,$len
644 sub $inp,$out # re-bias
645 sub $inp,$in0
646 add $inp,$len # end of input
647
648 #mov $inp,$_inp # saved later
649 mov $out,$_out
650 mov $len,$_end
651 #mov $key,$_key # remains resident in $inp register
652 mov $ivp,$_ivp
653 mov $ctx,$_ctx
654 mov $in0,$_in0
655 mov %r11,$_rsp
656 ___
657 $code.=<<___ if ($win64);
658 movaps %xmm6,`$framesz+16*0`(%rsp)
659 movaps %xmm7,`$framesz+16*1`(%rsp)
660 movaps %xmm8,`$framesz+16*2`(%rsp)
661 movaps %xmm9,`$framesz+16*3`(%rsp)
662 movaps %xmm10,`$framesz+16*4`(%rsp)
663 movaps %xmm11,`$framesz+16*5`(%rsp)
664 movaps %xmm12,`$framesz+16*6`(%rsp)
665 movaps %xmm13,`$framesz+16*7`(%rsp)
666 movaps %xmm14,`$framesz+16*8`(%rsp)
667 movaps %xmm15,`$framesz+16*9`(%rsp)
668 ___
669 $code.=<<___;
670 .Lprologue_avx:
671 vzeroall
672
673 mov $inp,%r12 # borrow $a4
674 lea 0x80($key),$inp # size optimization, reassign
675 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
676 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
677 mov $ctx,%r15 # borrow $a2
678 mov $in0,%rsi # borrow $a3
679 vmovdqu ($ivp),$iv # load IV
680 sub \$9,%r14
681
682 mov $SZ*0(%r15),$A
683 mov $SZ*1(%r15),$B
684 mov $SZ*2(%r15),$C
685 mov $SZ*3(%r15),$D
686 mov $SZ*4(%r15),$E
687 mov $SZ*5(%r15),$F
688 mov $SZ*6(%r15),$G
689 mov $SZ*7(%r15),$H
690
691 vmovdqa 0x00(%r13,%r14,8),$mask14
692 vmovdqa 0x10(%r13,%r14,8),$mask12
693 vmovdqa 0x20(%r13,%r14,8),$mask10
694 vmovdqu 0x00-0x80($inp),$roundkey
695 ___
696 if ($SZ==4) { # SHA256
697 my @X = map("%xmm$_",(0..3));
698 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
699
700 $code.=<<___;
701 jmp .Lloop_avx
702 .align 16
703 .Lloop_avx:
704 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
705 vmovdqu 0x00(%rsi,%r12),@X[0]
706 vmovdqu 0x10(%rsi,%r12),@X[1]
707 vmovdqu 0x20(%rsi,%r12),@X[2]
708 vmovdqu 0x30(%rsi,%r12),@X[3]
709 vpshufb $t3,@X[0],@X[0]
710 lea $TABLE(%rip),$Tbl
711 vpshufb $t3,@X[1],@X[1]
712 vpshufb $t3,@X[2],@X[2]
713 vpaddd 0x00($Tbl),@X[0],$t0
714 vpshufb $t3,@X[3],@X[3]
715 vpaddd 0x20($Tbl),@X[1],$t1
716 vpaddd 0x40($Tbl),@X[2],$t2
717 vpaddd 0x60($Tbl),@X[3],$t3
718 vmovdqa $t0,0x00(%rsp)
719 mov $A,$a1
720 vmovdqa $t1,0x10(%rsp)
721 mov $B,$a3
722 vmovdqa $t2,0x20(%rsp)
723 xor $C,$a3 # magic
724 vmovdqa $t3,0x30(%rsp)
725 mov $E,$a0
726 jmp .Lavx_00_47
727
728 .align 16
729 .Lavx_00_47:
730 sub \$-16*2*$SZ,$Tbl # size optimization
731 vmovdqu (%r12),$inout # $a4
732 mov %r12,$_inp # $a4
733 ___
734 sub Xupdate_256_AVX () {
735 (
736 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
737 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
738 '&vpsrld ($t2,$t0,$sigma0[0]);',
739 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
740 '&vpsrld ($t3,$t0,$sigma0[2])',
741 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
742 '&vpxor ($t0,$t3,$t2)',
743 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
744 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
745 '&vpxor ($t0,$t0,$t1)',
746 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
747 '&vpxor ($t0,$t0,$t2)',
748 '&vpsrld ($t2,$t3,$sigma1[2]);',
749 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
750 '&vpsrlq ($t3,$t3,$sigma1[0]);',
751 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
752 '&vpxor ($t2,$t2,$t3);',
753 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
754 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
755 '&vpshufd ($t2,$t2,0b10000100)',
756 '&vpsrldq ($t2,$t2,8)',
757 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
758 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
759 '&vpsrld ($t2,$t3,$sigma1[2])',
760 '&vpsrlq ($t3,$t3,$sigma1[0])',
761 '&vpxor ($t2,$t2,$t3);',
762 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
763 '&vpxor ($t2,$t2,$t3)',
764 '&vpshufd ($t2,$t2,0b11101000)',
765 '&vpslldq ($t2,$t2,8)',
766 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
767 );
768 }
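# Scalar reference for the message expansion that Xupdate_256_AVX vectorizes
# four words at a time (illustrative only; the sub names are made up):
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
sub _example_sigma0 { my $x=shift;		# ror 7 ^ ror 18 ^ shr 3, per @sigma0
	((($x>>7)|($x<<25)) ^ (($x>>18)|($x<<14)) ^ ($x>>3)) & 0xffffffff }
sub _example_sigma1 { my $x=shift;		# ror 17 ^ ror 19 ^ shr 10, per @sigma1
	((($x>>17)|($x<<15)) ^ (($x>>19)|($x<<13)) ^ ($x>>10)) & 0xffffffff }
sub _example_expand {
	my @W = @_;				# 16 big-endian message words in
	for (my $t=16;$t<64;$t++) {
		$W[$t] = (_example_sigma1($W[$t-2]) + $W[$t-7] +
			  _example_sigma0($W[$t-15]) + $W[$t-16]) & 0xffffffff;
	}
	return @W;				# all 64 schedule words out
}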
769
770 sub AVX_256_00_47 () {
771 my $j = shift;
772 my $body = shift;
773 my @X = @_;
774 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
775
776 foreach (Xupdate_256_AVX()) { # 29 instructions
777 eval;
778 eval(shift(@insns));
779 eval(shift(@insns));
780 eval(shift(@insns));
781 }
782 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
783 foreach (@insns) { eval; } # remaining instructions
784 &vmovdqa (16*$j."(%rsp)",$t2);
785 }
786
787 $aesni_cbc_idx=0;
788 for ($i=0,$j=0; $j<4; $j++) {
789 &AVX_256_00_47($j,\&body_00_15,@X);
790 push(@X,shift(@X)); # rotate(@X)
791 }
792 &mov ("%r12",$_inp); # borrow $a4
793 &vpand ($temp,$temp,$mask14);
794 &mov ("%r15",$_out); # borrow $a2
795 &vpor ($iv,$iv,$temp);
796 &vmovdqu ("(%r15,%r12)",$iv); # write output
797 &lea ("%r12","16(%r12)"); # inp++
798
799 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
800 &jne (".Lavx_00_47");
801
802 &vmovdqu ($inout,"(%r12)");
803 &mov ($_inp,"%r12");
804
805 $aesni_cbc_idx=0;
806 for ($i=0; $i<16; ) {
807 foreach(body_00_15()) { eval; }
808 }
809
810 }
811 $code.=<<___;
812 mov $_inp,%r12 # borrow $a4
813 mov $_out,%r13 # borrow $a0
814 mov $_ctx,%r15 # borrow $a2
815 mov $_in0,%rsi # borrow $a3
816
817 vpand $mask14,$temp,$temp
818 mov $a1,$A
819 vpor $temp,$iv,$iv
820 vmovdqu $iv,(%r13,%r12) # write output
821 lea 16(%r12),%r12 # inp++
822
823 add $SZ*0(%r15),$A
824 add $SZ*1(%r15),$B
825 add $SZ*2(%r15),$C
826 add $SZ*3(%r15),$D
827 add $SZ*4(%r15),$E
828 add $SZ*5(%r15),$F
829 add $SZ*6(%r15),$G
830 add $SZ*7(%r15),$H
831
832 cmp $_end,%r12
833
834 mov $A,$SZ*0(%r15)
835 mov $B,$SZ*1(%r15)
836 mov $C,$SZ*2(%r15)
837 mov $D,$SZ*3(%r15)
838 mov $E,$SZ*4(%r15)
839 mov $F,$SZ*5(%r15)
840 mov $G,$SZ*6(%r15)
841 mov $H,$SZ*7(%r15)
842 jb .Lloop_avx
843
844 mov $_ivp,$ivp
845 mov $_rsp,%rsi
846 vmovdqu $iv,($ivp) # output IV
847 vzeroall
848 ___
849 $code.=<<___ if ($win64);
850 movaps `$framesz+16*0`(%rsp),%xmm6
851 movaps `$framesz+16*1`(%rsp),%xmm7
852 movaps `$framesz+16*2`(%rsp),%xmm8
853 movaps `$framesz+16*3`(%rsp),%xmm9
854 movaps `$framesz+16*4`(%rsp),%xmm10
855 movaps `$framesz+16*5`(%rsp),%xmm11
856 movaps `$framesz+16*6`(%rsp),%xmm12
857 movaps `$framesz+16*7`(%rsp),%xmm13
858 movaps `$framesz+16*8`(%rsp),%xmm14
859 movaps `$framesz+16*9`(%rsp),%xmm15
860 ___
861 $code.=<<___;
862 mov (%rsi),%r15
863 mov 8(%rsi),%r14
864 mov 16(%rsi),%r13
865 mov 24(%rsi),%r12
866 mov 32(%rsi),%rbp
867 mov 40(%rsi),%rbx
868 lea 48(%rsi),%rsp
869 .Lepilogue_avx:
870 ret
871 .size ${func}_avx,.-${func}_avx
872 ___
873
874 if ($avx>1) {{
875 ######################################################################
876 # AVX2+BMI code path
877 #
878 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
879 my $PUSH8=8*2*$SZ;
880 use integer;
881
882 sub bodyx_00_15 () {
883 # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
884 (
885 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
886
887 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
888 '&and ($a4,$e)', # f&e
889 '&rorx ($a0,$e,$Sigma1[2])',
890 '&rorx ($a2,$e,$Sigma1[1])',
891
892 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
893 '&lea ($h,"($h,$a4)")',
894 '&andn ($a4,$e,$g)', # ~e&g
895 '&xor ($a0,$a2)',
896
897 '&rorx ($a1,$e,$Sigma1[0])',
898 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
899 '&xor ($a0,$a1)', # Sigma1(e)
900 '&mov ($a2,$a)',
901
902 '&rorx ($a4,$a,$Sigma0[2])',
903 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
904 '&xor ($a2,$b)', # a^b, b^c in next round
905 '&rorx ($a1,$a,$Sigma0[1])',
906
907 '&rorx ($a0,$a,$Sigma0[0])',
908 '&lea ($d,"($d,$h)")', # d+=h
909 '&and ($a3,$a2)', # (b^c)&(a^b)
910 @aesni_cbc_block[$aesni_cbc_idx++].
911 '&xor ($a1,$a4)',
912
913 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
914 '&xor ($a1,$a0)', # Sigma0(a)
915 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
916 '&mov ($a4,$e)', # copy of f in future
917
918 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
919 );
920 # and at the finish one has to do $a+=$a1
921 }
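# Why the lea-based Ch above is legal (illustrative check, never called): the
# two terms (e&f) and (~e&g) can never have the same bit set, so adding them
# with lea gives exactly the same result as xor-ing them:
sub _example_ch_via_add {
	my ($e,$f,$g) = @_;
	my $t0 = $e & $f;
	my $t1 = ~$e & $g & 0xffffffff;
	die "terms not disjoint" if ($t0 & $t1);	# never happens
	return ($t0+$t1) & 0xffffffff;			# == Ch(e,f,g)
}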
922
923 $code.=<<___;
924 .type ${func}_avx2,\@function,6
925 .align 64
926 ${func}_avx2:
927 .Lavx2_shortcut:
928 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
929 push %rbx
930 push %rbp
931 push %r12
932 push %r13
933 push %r14
934 push %r15
935 mov %rsp,%r11 # copy %rsp
936 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
937 and \$-256*$SZ,%rsp # align stack frame
938 add \$`2*$SZ*($rounds-8)`,%rsp
939
940 shl \$6,$len
941 sub $inp,$out # re-bias
942 sub $inp,$in0
943 add $inp,$len # end of input
944
945 #mov $inp,$_inp # saved later
946 #mov $out,$_out # kept in $offload
947 mov $len,$_end
948 #mov $key,$_key # remains resident in $inp register
949 mov $ivp,$_ivp
950 mov $ctx,$_ctx
951 mov $in0,$_in0
952 mov %r11,$_rsp
953 ___
954 $code.=<<___ if ($win64);
955 movaps %xmm6,`$framesz+16*0`(%rsp)
956 movaps %xmm7,`$framesz+16*1`(%rsp)
957 movaps %xmm8,`$framesz+16*2`(%rsp)
958 movaps %xmm9,`$framesz+16*3`(%rsp)
959 movaps %xmm10,`$framesz+16*4`(%rsp)
960 movaps %xmm11,`$framesz+16*5`(%rsp)
961 movaps %xmm12,`$framesz+16*6`(%rsp)
962 movaps %xmm13,`$framesz+16*7`(%rsp)
963 movaps %xmm14,`$framesz+16*8`(%rsp)
964 movaps %xmm15,`$framesz+16*9`(%rsp)
965 ___
966 $code.=<<___;
967 .Lprologue_avx2:
968 vzeroall
969
970 mov $inp,%r13 # borrow $a0
971 vpinsrq \$1,$out,$offload,$offload
972 lea 0x80($key),$inp # size optimization, reassign
973 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
974 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
975 mov $ctx,%r15 # borrow $a2
976 mov $in0,%rsi # borrow $a3
977 vmovdqu ($ivp),$iv # load IV
978 lea -9(%r14),%r14
979
980 vmovdqa 0x00(%r12,%r14,8),$mask14
981 vmovdqa 0x10(%r12,%r14,8),$mask12
982 vmovdqa 0x20(%r12,%r14,8),$mask10
983
984 sub \$-16*$SZ,%r13 # inp++, size optimization
985 mov $SZ*0(%r15),$A
986 lea (%rsi,%r13),%r12 # borrow $a0
987 mov $SZ*1(%r15),$B
988 cmp $len,%r13 # $_end
989 mov $SZ*2(%r15),$C
990 cmove %rsp,%r12 # next block or random data
991 mov $SZ*3(%r15),$D
992 mov $SZ*4(%r15),$E
993 mov $SZ*5(%r15),$F
994 mov $SZ*6(%r15),$G
995 mov $SZ*7(%r15),$H
996 vmovdqu 0x00-0x80($inp),$roundkey
997 ___
998 if ($SZ==4) { # SHA256
999 my @X = map("%ymm$_",(0..3));
1000 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1001
1002 $code.=<<___;
1003 jmp .Loop_avx2
1004 .align 16
1005 .Loop_avx2:
1006 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1007 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1008 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1009 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1010 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1011
1012 vinserti128 \$1,(%r12),@X[0],@X[0]
1013 vinserti128 \$1,16(%r12),@X[1],@X[1]
1014 vpshufb $t3,@X[0],@X[0]
1015 vinserti128 \$1,32(%r12),@X[2],@X[2]
1016 vpshufb $t3,@X[1],@X[1]
1017 vinserti128 \$1,48(%r12),@X[3],@X[3]
1018
1019 lea $TABLE(%rip),$Tbl
1020 vpshufb $t3,@X[2],@X[2]
1021 lea -16*$SZ(%r13),%r13
1022 vpaddd 0x00($Tbl),@X[0],$t0
1023 vpshufb $t3,@X[3],@X[3]
1024 vpaddd 0x20($Tbl),@X[1],$t1
1025 vpaddd 0x40($Tbl),@X[2],$t2
1026 vpaddd 0x60($Tbl),@X[3],$t3
1027 vmovdqa $t0,0x00(%rsp)
1028 xor $a1,$a1
1029 vmovdqa $t1,0x20(%rsp)
1030 lea -$PUSH8(%rsp),%rsp
1031 mov $B,$a3
1032 vmovdqa $t2,0x00(%rsp)
1033 xor $C,$a3 # magic
1034 vmovdqa $t3,0x20(%rsp)
1035 mov $F,$a4
1036 sub \$-16*2*$SZ,$Tbl # size optimization
1037 jmp .Lavx2_00_47
1038
1039 .align 16
1040 .Lavx2_00_47:
1041 vmovdqu (%r13),$inout
1042 vpinsrq \$0,%r13,$offload,$offload
1043 ___
1044
1045 sub AVX2_256_00_47 () {
1046 my $j = shift;
1047 my $body = shift;
1048 my @X = @_;
1049 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1050 my $base = "+2*$PUSH8(%rsp)";
1051
1052 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1053 foreach (Xupdate_256_AVX()) { # 29 instructions
1054 eval;
1055 eval(shift(@insns));
1056 eval(shift(@insns));
1057 eval(shift(@insns));
1058 }
1059 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1060 foreach (@insns) { eval; } # remaining instructions
1061 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1062 }
1063 $aesni_cbc_idx=0;
1064 for ($i=0,$j=0; $j<4; $j++) {
1065 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1066 push(@X,shift(@X)); # rotate(@X)
1067 }
1068 &vmovq ("%r13",$offload); # borrow $a0
1069 &vpextrq ("%r15",$offload,1); # borrow $a2
1070 &vpand ($temp,$temp,$mask14);
1071 &vpor ($iv,$iv,$temp);
1072 &vmovdqu ("(%r15,%r13)",$iv); # write output
1073 &lea ("%r13","16(%r13)"); # inp++
1074
1075 &lea ($Tbl,16*2*$SZ."($Tbl)");
1076 &cmpb (($SZ-1)."($Tbl)",0);
1077 &jne (".Lavx2_00_47");
1078
1079 &vmovdqu ($inout,"(%r13)");
1080 &vpinsrq ($offload,$offload,"%r13",0);
1081
1082 $aesni_cbc_idx=0;
1083 for ($i=0; $i<16; ) {
1084 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1085 foreach(bodyx_00_15()) { eval; }
1086 }
1087 }
1088 $code.=<<___;
1089 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1090 vmovq $offload,%r13 # $_inp, borrow $a0
1091 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1092 add $a1,$A
1093 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1094
1095 vpand $mask14,$temp,$temp
1096 vpor $temp,$iv,$iv
1097 vmovdqu $iv,(%r12,%r13) # write output
1098 lea 16(%r13),%r13
1099
1100 add $SZ*0(%r15),$A
1101 add $SZ*1(%r15),$B
1102 add $SZ*2(%r15),$C
1103 add $SZ*3(%r15),$D
1104 add $SZ*4(%r15),$E
1105 add $SZ*5(%r15),$F
1106 add $SZ*6(%r15),$G
1107 add $SZ*7(%r15),$H
1108
1109 mov $A,$SZ*0(%r15)
1110 mov $B,$SZ*1(%r15)
1111 mov $C,$SZ*2(%r15)
1112 mov $D,$SZ*3(%r15)
1113 mov $E,$SZ*4(%r15)
1114 mov $F,$SZ*5(%r15)
1115 mov $G,$SZ*6(%r15)
1116 mov $H,$SZ*7(%r15)
1117
1118 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1119 je .Ldone_avx2
1120
1121 xor $a1,$a1
1122 mov $B,$a3
1123 mov $F,$a4
1124 xor $C,$a3 # magic
1125 jmp .Lower_avx2
1126 .align 16
1127 .Lower_avx2:
1128 vmovdqu (%r13),$inout
1129 vpinsrq \$0,%r13,$offload,$offload
1130 ___
1131 $aesni_cbc_idx=0;
1132 for ($i=0; $i<16; ) {
1133 my $base="+16($Tbl)";
1134 foreach(bodyx_00_15()) { eval; }
1135 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1136 }
1137 $code.=<<___;
1138 vmovq $offload,%r13 # borrow $a0
1139 vpextrq \$1,$offload,%r15 # borrow $a2
1140 vpand $mask14,$temp,$temp
1141 vpor $temp,$iv,$iv
1142 lea -$PUSH8($Tbl),$Tbl
1143 vmovdqu $iv,(%r15,%r13) # write output
1144 lea 16(%r13),%r13 # inp++
1145 cmp %rsp,$Tbl
1146 jae .Lower_avx2
1147
1148 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1149 lea 16*$SZ(%r13),%r13
1150 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1151 add $a1,$A
1152 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1153
1154 add $SZ*0(%r15),$A
1155 add $SZ*1(%r15),$B
1156 add $SZ*2(%r15),$C
1157 add $SZ*3(%r15),$D
1158 add $SZ*4(%r15),$E
1159 add $SZ*5(%r15),$F
1160 add $SZ*6(%r15),$G
1161 lea (%rsi,%r13),%r12
1162 add $SZ*7(%r15),$H
1163
1164 cmp $_end,%r13
1165
1166 mov $A,$SZ*0(%r15)
1167 cmove %rsp,%r12 # next block or stale data
1168 mov $B,$SZ*1(%r15)
1169 mov $C,$SZ*2(%r15)
1170 mov $D,$SZ*3(%r15)
1171 mov $E,$SZ*4(%r15)
1172 mov $F,$SZ*5(%r15)
1173 mov $G,$SZ*6(%r15)
1174 mov $H,$SZ*7(%r15)
1175
1176 jbe .Loop_avx2
1177 lea (%rsp),$Tbl
1178
1179 .Ldone_avx2:
1180 lea ($Tbl),%rsp
1181 mov $_ivp,$ivp
1182 mov $_rsp,%rsi
1183 vmovdqu $iv,($ivp) # output IV
1184 vzeroall
1185 ___
1186 $code.=<<___ if ($win64);
1187 movaps `$framesz+16*0`(%rsp),%xmm6
1188 movaps `$framesz+16*1`(%rsp),%xmm7
1189 movaps `$framesz+16*2`(%rsp),%xmm8
1190 movaps `$framesz+16*3`(%rsp),%xmm9
1191 movaps `$framesz+16*4`(%rsp),%xmm10
1192 movaps `$framesz+16*5`(%rsp),%xmm11
1193 movaps `$framesz+16*6`(%rsp),%xmm12
1194 movaps `$framesz+16*7`(%rsp),%xmm13
1195 movaps `$framesz+16*8`(%rsp),%xmm14
1196 movaps `$framesz+16*9`(%rsp),%xmm15
1197 ___
1198 $code.=<<___;
1199 mov (%rsi),%r15
1200 mov 8(%rsi),%r14
1201 mov 16(%rsi),%r13
1202 mov 24(%rsi),%r12
1203 mov 32(%rsi),%rbp
1204 mov 40(%rsi),%rbx
1205 lea 48(%rsi),%rsp
1206 .Lepilogue_avx2:
1207 ret
1208 .size ${func}_avx2,.-${func}_avx2
1209 ___
1210 }}
1211 }}
1212 {{
1213 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1214
1215 my ($rounds,$Tbl)=("%r11d","%rbx");
1216
1217 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1218 my @rndkey=("%xmm4","%xmm5");
1219 my $r=0;
1220 my $sn=0;
1221
1222 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1223 my @MSG=map("%xmm$_",(10..13));
1224
1225 my $aesenc=sub {
1226 use integer;
1227 my ($n,$k)=($r/10,$r%10);
1228 if ($k==0) {
1229 $code.=<<___;
1230 movups `16*$n`($in0),$in # load input
1231 xorps $rndkey0,$in
1232 ___
1233 $code.=<<___ if ($n);
1234 movups $iv,`16*($n-1)`($out,$in0) # write output
1235 ___
1236 $code.=<<___;
1237 xorps $in,$iv
1238 movups `32+16*$k-112`($key),$rndkey[1]
1239 aesenc $rndkey[0],$iv
1240 ___
1241 } elsif ($k==9) {
1242 $sn++;
1243 $code.=<<___;
1244 cmp \$11,$rounds
1245 jb .Laesenclast$sn
1246 movups `32+16*($k+0)-112`($key),$rndkey[1]
1247 aesenc $rndkey[0],$iv
1248 movups `32+16*($k+1)-112`($key),$rndkey[0]
1249 aesenc $rndkey[1],$iv
1250 je .Laesenclast$sn
1251 movups `32+16*($k+2)-112`($key),$rndkey[1]
1252 aesenc $rndkey[0],$iv
1253 movups `32+16*($k+3)-112`($key),$rndkey[0]
1254 aesenc $rndkey[1],$iv
1255 .Laesenclast$sn:
1256 aesenclast $rndkey[0],$iv
1257 movups 16-112($key),$rndkey[1] # forward reference
1258 nop
1259 ___
1260 } else {
1261 $code.=<<___;
1262 movups `32+16*$k-112`($key),$rndkey[1]
1263 aesenc $rndkey[0],$iv
1264 ___
1265 }
1266 $r++; unshift(@rndkey,pop(@rndkey));
1267 };
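# The closure above drips AES rounds one at a time so that a full CBC block
# encryption is spread over the SHA rounds of the same 64-byte chunk. The
# recurrence it implements, as a plain sketch ($cipher is a stand-in coderef
# for one-block AES encryption, not a real API; blocks are 16-byte strings):
sub _example_cbc_encrypt {
	my ($cipher,$iv,@blocks) = @_;
	my @out;
	for my $p (@blocks) {
		$iv = $cipher->($p ^ $iv);	# C[i] = E_k(P[i] ^ C[i-1])
		push @out,$iv;
	}
	return @out;
}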
1268
1269 if ($shaext) {
1270 my $Tbl="%rax";
1271
1272 $code.=<<___;
1273 .type ${func}_shaext,\@function,6
1274 .align 32
1275 ${func}_shaext:
1276 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1277 ___
1278 $code.=<<___ if ($win64);
1279 lea `-8-10*16`(%rsp),%rsp
1280 movaps %xmm6,-8-10*16(%rax)
1281 movaps %xmm7,-8-9*16(%rax)
1282 movaps %xmm8,-8-8*16(%rax)
1283 movaps %xmm9,-8-7*16(%rax)
1284 movaps %xmm10,-8-6*16(%rax)
1285 movaps %xmm11,-8-5*16(%rax)
1286 movaps %xmm12,-8-4*16(%rax)
1287 movaps %xmm13,-8-3*16(%rax)
1288 movaps %xmm14,-8-2*16(%rax)
1289 movaps %xmm15,-8-1*16(%rax)
1290 .Lprologue_shaext:
1291 ___
1292 $code.=<<___;
1293 lea K256+0x80(%rip),$Tbl
1294 movdqu ($ctx),$ABEF # DCBA
1295 movdqu 16($ctx),$CDGH # HGFE
1296 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1297
1298 mov 240($key),$rounds
1299 sub $in0,$out
1300 movups ($key),$rndkey0 # $key[0]
1301 movups 16($key),$rndkey[0] # forward reference
1302 lea 112($key),$key # size optimization
1303
1304 pshufd \$0x1b,$ABEF,$Wi # ABCD
1305 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1306 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1307 movdqa $TMP,$BSWAP # offload
1308 palignr \$8,$CDGH,$ABEF # ABEF
1309 punpcklqdq $Wi,$CDGH # CDGH
1310
1311 jmp .Loop_shaext
1312
1313 .align 16
1314 .Loop_shaext:
1315 movdqu ($inp),@MSG[0]
1316 movdqu 0x10($inp),@MSG[1]
1317 movdqu 0x20($inp),@MSG[2]
1318 pshufb $TMP,@MSG[0]
1319 movdqu 0x30($inp),@MSG[3]
1320
1321 movdqa 0*32-0x80($Tbl),$Wi
1322 paddd @MSG[0],$Wi
1323 pshufb $TMP,@MSG[1]
1324 movdqa $CDGH,$CDGH_SAVE # offload
1325 movdqa $ABEF,$ABEF_SAVE # offload
1326 ___
1327 &$aesenc();
1328 $code.=<<___;
1329 sha256rnds2 $ABEF,$CDGH # 0-3
1330 pshufd \$0x0e,$Wi,$Wi
1331 ___
1332 &$aesenc();
1333 $code.=<<___;
1334 sha256rnds2 $CDGH,$ABEF
1335
1336 movdqa 1*32-0x80($Tbl),$Wi
1337 paddd @MSG[1],$Wi
1338 pshufb $TMP,@MSG[2]
1339 lea 0x40($inp),$inp
1340 ___
1341 &$aesenc();
1342 $code.=<<___;
1343 sha256rnds2 $ABEF,$CDGH # 4-7
1344 pshufd \$0x0e,$Wi,$Wi
1345 ___
1346 &$aesenc();
1347 $code.=<<___;
1348 sha256rnds2 $CDGH,$ABEF
1349
1350 movdqa 2*32-0x80($Tbl),$Wi
1351 paddd @MSG[2],$Wi
1352 pshufb $TMP,@MSG[3]
1353 sha256msg1 @MSG[1],@MSG[0]
1354 ___
1355 &$aesenc();
1356 $code.=<<___;
1357 sha256rnds2 $ABEF,$CDGH # 8-11
1358 pshufd \$0x0e,$Wi,$Wi
1359 movdqa @MSG[3],$TMP
1360 palignr \$4,@MSG[2],$TMP
1361 paddd $TMP,@MSG[0]
1362 ___
1363 &$aesenc();
1364 $code.=<<___;
1365 sha256rnds2 $CDGH,$ABEF
1366
1367 movdqa 3*32-0x80($Tbl),$Wi
1368 paddd @MSG[3],$Wi
1369 sha256msg2 @MSG[3],@MSG[0]
1370 sha256msg1 @MSG[2],@MSG[1]
1371 ___
1372 &$aesenc();
1373 $code.=<<___;
1374 sha256rnds2 $ABEF,$CDGH # 12-15
1375 pshufd \$0x0e,$Wi,$Wi
1376 ___
1377 &$aesenc();
1378 $code.=<<___;
1379 movdqa @MSG[0],$TMP
1380 palignr \$4,@MSG[3],$TMP
1381 paddd $TMP,@MSG[1]
1382 sha256rnds2 $CDGH,$ABEF
1383 ___
1384 for($i=4;$i<16-3;$i++) {
1385 &$aesenc() if (($r%10)==0);
1386 $code.=<<___;
1387 movdqa $i*32-0x80($Tbl),$Wi
1388 paddd @MSG[0],$Wi
1389 sha256msg2 @MSG[0],@MSG[1]
1390 sha256msg1 @MSG[3],@MSG[2]
1391 ___
1392 &$aesenc();
1393 $code.=<<___;
1394 sha256rnds2 $ABEF,$CDGH # 16-19...
1395 pshufd \$0x0e,$Wi,$Wi
1396 movdqa @MSG[1],$TMP
1397 palignr \$4,@MSG[0],$TMP
1398 paddd $TMP,@MSG[2]
1399 ___
1400 &$aesenc();
1401 &$aesenc() if ($r==19);
1402 $code.=<<___;
1403 sha256rnds2 $CDGH,$ABEF
1404 ___
1405 push(@MSG,shift(@MSG));
1406 }
1407 $code.=<<___;
1408 movdqa 13*32-0x80($Tbl),$Wi
1409 paddd @MSG[0],$Wi
1410 sha256msg2 @MSG[0],@MSG[1]
1411 sha256msg1 @MSG[3],@MSG[2]
1412 ___
1413 &$aesenc();
1414 $code.=<<___;
1415 sha256rnds2 $ABEF,$CDGH # 52-55
1416 pshufd \$0x0e,$Wi,$Wi
1417 movdqa @MSG[1],$TMP
1418 palignr \$4,@MSG[0],$TMP
1419 paddd $TMP,@MSG[2]
1420 ___
1421 &$aesenc();
1422 &$aesenc();
1423 $code.=<<___;
1424 sha256rnds2 $CDGH,$ABEF
1425
1426 movdqa 14*32-0x80($Tbl),$Wi
1427 paddd @MSG[1],$Wi
1428 sha256msg2 @MSG[1],@MSG[2]
1429 movdqa $BSWAP,$TMP
1430 ___
1431 &$aesenc();
1432 $code.=<<___;
1433 sha256rnds2 $ABEF,$CDGH # 56-59
1434 pshufd \$0x0e,$Wi,$Wi
1435 ___
1436 &$aesenc();
1437 $code.=<<___;
1438 sha256rnds2 $CDGH,$ABEF
1439
1440 movdqa 15*32-0x80($Tbl),$Wi
1441 paddd @MSG[2],$Wi
1442 ___
1443 &$aesenc();
1444 &$aesenc();
1445 $code.=<<___;
1446 sha256rnds2 $ABEF,$CDGH # 60-63
1447 pshufd \$0x0e,$Wi,$Wi
1448 ___
1449 &$aesenc();
1450 $code.=<<___;
1451 sha256rnds2 $CDGH,$ABEF
1452 #pxor $CDGH,$rndkey0 # black magic
1453 ___
1454 while ($r<40) { &$aesenc(); } # remaining aesenc's
1455 $code.=<<___;
1456 #xorps $CDGH,$rndkey0 # black magic
1457 paddd $CDGH_SAVE,$CDGH
1458 paddd $ABEF_SAVE,$ABEF
1459
1460 dec $len
1461 movups $iv,48($out,$in0) # write output
1462 lea 64($in0),$in0
1463 jnz .Loop_shaext
1464
1465 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1466 pshufd \$0x1b,$ABEF,$TMP # FEBA
1467 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1468 punpckhqdq $CDGH,$ABEF # DCBA
1469 palignr \$8,$TMP,$CDGH # HGFE
1470
1471 movups $iv,($ivp) # write IV
1472 movdqu $ABEF,($ctx)
1473 movdqu $CDGH,16($ctx)
1474 ___
1475 $code.=<<___ if ($win64);
1476 movaps 0*16(%rsp),%xmm6
1477 movaps 1*16(%rsp),%xmm7
1478 movaps 2*16(%rsp),%xmm8
1479 movaps 3*16(%rsp),%xmm9
1480 movaps 4*16(%rsp),%xmm10
1481 movaps 5*16(%rsp),%xmm11
1482 movaps 6*16(%rsp),%xmm12
1483 movaps 7*16(%rsp),%xmm13
1484 movaps 8*16(%rsp),%xmm14
1485 movaps 9*16(%rsp),%xmm15
1486 lea 8+10*16(%rsp),%rsp
1487 .Lepilogue_shaext:
1488 ___
1489 $code.=<<___;
1490 ret
1491 .size ${func}_shaext,.-${func}_shaext
1492 ___
1493 }
1494 }}}}}
1495
1496 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1497 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1498 if ($win64) {
1499 $rec="%rcx";
1500 $frame="%rdx";
1501 $context="%r8";
1502 $disp="%r9";
1503
1504 $code.=<<___ if ($avx);
1505 .extern __imp_RtlVirtualUnwind
1506 .type se_handler,\@abi-omnipotent
1507 .align 16
1508 se_handler:
1509 push %rsi
1510 push %rdi
1511 push %rbx
1512 push %rbp
1513 push %r12
1514 push %r13
1515 push %r14
1516 push %r15
1517 pushfq
1518 sub \$64,%rsp
1519
1520 mov 120($context),%rax # pull context->Rax
1521 mov 248($context),%rbx # pull context->Rip
1522
1523 mov 8($disp),%rsi # disp->ImageBase
1524 mov 56($disp),%r11 # disp->HandlerData
1525
1526 mov 0(%r11),%r10d # HandlerData[0]
1527 lea (%rsi,%r10),%r10 # prologue label
1528 cmp %r10,%rbx # context->Rip<prologue label
1529 jb .Lin_prologue
1530
1531 mov 152($context),%rax # pull context->Rsp
1532
1533 mov 4(%r11),%r10d # HandlerData[1]
1534 lea (%rsi,%r10),%r10 # epilogue label
1535 cmp %r10,%rbx # context->Rip>=epilogue label
1536 jae .Lin_prologue
1537 ___
1538 $code.=<<___ if ($shaext);
1539 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1540 cmp %r10,%rbx
1541 jb .Lnot_in_shaext
1542
1543 lea (%rax),%rsi
1544 lea 512($context),%rdi # &context.Xmm6
1545 mov \$20,%ecx
1546 .long 0xa548f3fc # cld; rep movsq
1547 lea 168(%rax),%rax # adjust stack pointer
1548 jmp .Lin_prologue
1549 .Lnot_in_shaext:
1550 ___
1551 $code.=<<___ if ($avx>1);
1552 lea .Lavx2_shortcut(%rip),%r10
1553 cmp %r10,%rbx # context->Rip<avx2_shortcut
1554 jb .Lnot_in_avx2
1555
1556 and \$-256*$SZ,%rax
1557 add \$`2*$SZ*($rounds-8)`,%rax
1558 .Lnot_in_avx2:
1559 ___
1560 $code.=<<___;
1561 mov %rax,%rsi # put aside Rsp
1562 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1563 lea 48(%rax),%rax
1564
1565 mov -8(%rax),%rbx
1566 mov -16(%rax),%rbp
1567 mov -24(%rax),%r12
1568 mov -32(%rax),%r13
1569 mov -40(%rax),%r14
1570 mov -48(%rax),%r15
1571 mov %rbx,144($context) # restore context->Rbx
1572 mov %rbp,160($context) # restore context->Rbp
1573 mov %r12,216($context) # restore context->R12
1574 mov %r13,224($context) # restore context->R13
1575 mov %r14,232($context) # restore context->R14
1576 mov %r15,240($context) # restore context->R15
1577
1578 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6..Xmm15 save area
1579 lea 512($context),%rdi # &context.Xmm6
1580 mov \$20,%ecx
1581 .long 0xa548f3fc # cld; rep movsq
1582
1583 .Lin_prologue:
1584 mov 8(%rax),%rdi
1585 mov 16(%rax),%rsi
1586 mov %rax,152($context) # restore context->Rsp
1587 mov %rsi,168($context) # restore context->Rsi
1588 mov %rdi,176($context) # restore context->Rdi
1589
1590 mov 40($disp),%rdi # disp->ContextRecord
1591 mov $context,%rsi # context
1592 mov \$154,%ecx # sizeof(CONTEXT)
1593 .long 0xa548f3fc # cld; rep movsq
1594
1595 mov $disp,%rsi
1596 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1597 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1598 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1599 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1600 mov 40(%rsi),%r10 # disp->ContextRecord
1601 lea 56(%rsi),%r11 # &disp->HandlerData
1602 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1603 mov %r10,32(%rsp) # arg5
1604 mov %r11,40(%rsp) # arg6
1605 mov %r12,48(%rsp) # arg7
1606 mov %rcx,56(%rsp) # arg8, (NULL)
1607 call *__imp_RtlVirtualUnwind(%rip)
1608
1609 mov \$1,%eax # ExceptionContinueSearch
1610 add \$64,%rsp
1611 popfq
1612 pop %r15
1613 pop %r14
1614 pop %r13
1615 pop %r12
1616 pop %rbp
1617 pop %rbx
1618 pop %rdi
1619 pop %rsi
1620 ret
1621 .size se_handler,.-se_handler
1622
1623 .section .pdata
1624 .rva .LSEH_begin_${func}_xop
1625 .rva .LSEH_end_${func}_xop
1626 .rva .LSEH_info_${func}_xop
1627
1628 .rva .LSEH_begin_${func}_avx
1629 .rva .LSEH_end_${func}_avx
1630 .rva .LSEH_info_${func}_avx
1631 ___
1632 $code.=<<___ if ($avx>1);
1633 .rva .LSEH_begin_${func}_avx2
1634 .rva .LSEH_end_${func}_avx2
1635 .rva .LSEH_info_${func}_avx2
1636 ___
1637 $code.=<<___ if ($shaext);
1638 .rva .LSEH_begin_${func}_shaext
1639 .rva .LSEH_end_${func}_shaext
1640 .rva .LSEH_info_${func}_shaext
1641 ___
1642 $code.=<<___ if ($avx);
1643 .section .xdata
1644 .align 8
1645 .LSEH_info_${func}_xop:
1646 .byte 9,0,0,0
1647 .rva se_handler
1648 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1649
1650 .LSEH_info_${func}_avx:
1651 .byte 9,0,0,0
1652 .rva se_handler
1653 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1654 ___
1655 $code.=<<___ if ($avx>1);
1656 .LSEH_info_${func}_avx2:
1657 .byte 9,0,0,0
1658 .rva se_handler
1659 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1660 ___
1661 $code.=<<___ if ($shaext);
1662 .LSEH_info_${func}_shaext:
1663 .byte 9,0,0,0
1664 .rva se_handler
1665 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1666 ___
1667 }
1668
1669 ####################################################################
1670 sub rex {
1671 local *opcode=shift;
1672 my ($dst,$src)=@_;
1673 my $rex=0;
1674
1675 $rex|=0x04 if($dst>=8);
1676 $rex|=0x01 if($src>=8);
1677 unshift @opcode,$rex|0x40 if($rex);
1678 }
1679
1680 {
1681 my %opcodelet = (
1682 "sha256rnds2" => 0xcb,
1683 "sha256msg1" => 0xcc,
1684 "sha256msg2" => 0xcd );
1685
1686 sub sha256op38 {
1687 my $instr = shift;
1688
1689 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1690 my @opcode=(0x0f,0x38);
1691 rex(\@opcode,$2,$1);
1692 push @opcode,$opcodelet{$instr};
1693 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1694 return ".byte\t".join(',',@opcode);
1695 } else {
1696 return $instr."\t".@_[0];
1697 }
1698 }
1699 }
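# Worked example of the hard-coding above, applied by the substitution below
# (derived by tracing sha256op38, shown for reference only): with
# $ABEF="%xmm1" and $CDGH="%xmm2", the line
#	sha256rnds2	%xmm1,%xmm2
# is rewritten as
#	.byte	15,56,203,209	# 0x0f,0x38,0xcb + ModR/M 0xd1
# so the module still assembles with toolchains that do not recognize the
# SHA extension mnemonics.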
1700
1701 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1702 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1703 print $code;
1704 close STDOUT;