1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# January 2013
11#
12# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encryption exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm makes it possible
16# to utilize processor resources better and achieve better performance.
17# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18# the AESNI code is woven into them. As SHA256 dominates execution time,
19# stitch performance does not depend on AES key length. Below are
20# performance numbers in cycles per processed byte (lower is better)
21# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22# subroutine:
23#
24#			AES-128/-192/-256+SHA256	this(**)	gain
25# Sandy Bridge	5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
26# Ivy Bridge		5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
27# Haswell		4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
28# Bulldozer		5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
29#
30# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
31#	from the loop because the gain was not estimated to be high
32#	enough to justify the effort;
33# (**) these are EVP-free results; results obtained with 'speed
34#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
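#
# As a quick sanity check on the table above, the "gain" column appears to
# be simply the sequential cost divided by the stitched cost, e.g. for
# AES-128 on Sandy Bridge: (5.05+11.6)/13.0 = 1.28, i.e. +28%, and for
# AES-256 on Haswell: (6.19+7.80)/8.79 = 1.59, i.e. +59%.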
35
36$flavour = shift;
37$output = shift;
38if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45die "can't locate x86_64-xlate.pl";
46
47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
50}
51
52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
55}
56
57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=11);
60}
61
62$shaext=1; ### set to zero if compiling for 1.0.1
63$avx=1 if (!$shaext && $avx);
64
65open OUT,"| \"$^X\" $xlate $flavour $output";
66*STDOUT=*OUT;
67
68$func="aesni_cbc_sha256_enc";
69$TABLE="K256";
70$SZ=4;
71@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72 "%r8d","%r9d","%r10d","%r11d");
73($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
74@Sigma0=( 2,13,22);
75@Sigma1=( 6,11,25);
76@sigma0=( 7,18, 3);
77@sigma1=(17,19,10);
78$rounds=64;
79
80########################################################################
81# void aesni_cbc_sha256_enc(const void *inp,
82# void *out,
83# size_t length,
84# const AES_KEY *key,
85# unsigned char *iv,
86# SHA256_CTX *ctx,
87# const void *in0);
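#
# A minimal caller sketch (hypothetical, for illustration only): "length"
# appears to be a count of 64-byte blocks rather than bytes - note the
# "shl \$6,$len" conversion below - and the routine CBC-encrypts that many
# blocks from inp to out while hashing the same number of blocks at in0
# into ctx:
#
#	/* aes_key, sha_ctx and iv are assumed to be initialized */
#	aesni_cbc_sha256_enc(in, out, blocks, &aes_key, iv, &sha_ctx, in0);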
88($inp, $out, $len, $key, $ivp, $ctx, $in0) =
89("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
90
91$Tbl="%rbp";
92
93$_inp="16*$SZ+0*8(%rsp)";
94$_out="16*$SZ+1*8(%rsp)";
95$_end="16*$SZ+2*8(%rsp)";
96$_key="16*$SZ+3*8(%rsp)";
97$_ivp="16*$SZ+4*8(%rsp)";
98$_ctx="16*$SZ+5*8(%rsp)";
99$_in0="16*$SZ+6*8(%rsp)";
100$_rsp="16*$SZ+7*8(%rsp)";
101$framesz=16*$SZ+8*8;
102
103$code=<<___;
104.text
105
106.extern OPENSSL_ia32cap_P
107.globl $func
108.type $func,\@abi-omnipotent
109.align 16
110$func:
111___
112$code.=<<___ if ($avx);
113 lea OPENSSL_ia32cap_P(%rip),%r11
114 mov \$1,%eax
115 cmp \$0,`$win64?"%rcx":"%rdi"`
116 je .Lprobe
117 mov 0(%r11),%eax
118 mov 4(%r11),%r10
119___
120$code.=<<___ if ($shaext);
121 bt \$61,%r10 # check for SHA
122 jc ${func}_shaext
123___
124$code.=<<___;
125 mov %r10,%r11
126 shr \$32,%r11
127
128 test \$`1<<11`,%r10d # check for XOP
129 jnz ${func}_xop
130___
131$code.=<<___ if ($avx>1);
132 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
133 cmp \$`1<<8|1<<5|1<<3`,%r11d
134 je ${func}_avx2
135___
136$code.=<<___ if ($avx);
137 and \$`1<<30`,%eax # mask "Intel CPU" bit
138 and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
139 or %eax,%r10d
140 cmp \$`1<<28|1<<9|1<<30`,%r10d
141 je ${func}_avx
142 ud2
143___
144$code.=<<___;
145 xor %eax,%eax
146 cmp \$0,`$win64?"%rcx":"%rdi"`
147 je .Lprobe
148 ud2
149.Lprobe:
150 ret
151.size $func,.-$func
152
153.align 64
154.type $TABLE,\@object
155$TABLE:
156 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
157 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
158 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
159 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
162 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
163 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
164 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
165 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
166 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
167 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
170 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
171 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
172 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
173 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
174 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
175 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
176 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
177 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
178 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
179 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
180 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
181 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
182 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
183 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
184 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
185 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
186 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
187 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188
189 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
190 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
191 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
192 .long 0,0,0,0, 0,0,0,0
193 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
194.align 64
195___
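# Each K256 row above is stored twice so that the AVX2 code path can load
# 256-bit vectors carrying the same four constants in both 128-bit lanes;
# the trailing rows appear to be the byte-order shuffle mask and the
# zero/minus-one words from which the key-length-dependent masks
# ($mask10/$mask12/$mask14) are loaded according to the AES round count.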
196
197######################################################################
198# SIMD code paths
199#
200{{{
201($iv,$inout,$roundkey,$temp,
202 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
203
204$aesni_cbc_idx=0;
205@aesni_cbc_block = (
206## &vmovdqu ($roundkey,"0x00-0x80($inp)");
207## &vmovdqu ($inout,($inp));
208## &mov ($_inp,$inp);
209
210 '&vpxor ($inout,$inout,$roundkey);'.
211 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
212
213 '&vpxor ($inout,$inout,$iv);',
214
215 '&vaesenc ($inout,$inout,$roundkey);'.
216 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
217
218 '&vaesenc ($inout,$inout,$roundkey);'.
219 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
220
221 '&vaesenc ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
223
224 '&vaesenc ($inout,$inout,$roundkey);'.
225 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
226
227 '&vaesenc ($inout,$inout,$roundkey);'.
228 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
229
230 '&vaesenc ($inout,$inout,$roundkey);'.
231 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
232
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
235
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
238
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
241
242 '&vaesenclast ($temp,$inout,$roundkey);'.
243 ' &vaesenc ($inout,$inout,$roundkey);'.
244 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
245
246 '&vpand ($iv,$temp,$mask10);'.
247 ' &vaesenc ($inout,$inout,$roundkey);'.
248 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
249
250 '&vaesenclast ($temp,$inout,$roundkey);'.
251 ' &vaesenc ($inout,$inout,$roundkey);'.
252 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
253
254 '&vpand ($temp,$temp,$mask12);'.
255 ' &vaesenc ($inout,$inout,$roundkey);'.
256 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
257
258 '&vpor ($iv,$iv,$temp);'.
259 ' &vaesenclast ($temp,$inout,$roundkey);'.
260 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
261
262## &mov ($inp,$_inp);
263## &mov ($out,$_out);
264## &vpand ($temp,$temp,$mask14);
265## &vpor ($iv,$iv,$temp);
266## &vmovdqu ($iv,"($out,$inp)");
267## &lea ($inp,"16($inp)");
268);
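# Each element of @aesni_cbc_block is one step of a single 16-byte AES-CBC
# encryption. body_00_15() below consumes one element per SHA-256 round via
# $aesni_cbc_idx, so, in effect, every group of 16 SHA rounds carries one
# AES block and a full 64-byte/64-round SHA block carries four of them.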
269
270my $a4=$T1;
271my ($a,$b,$c,$d,$e,$f,$g,$h);
272
273sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
274{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
275 my $arg = pop;
276 $arg = "\$$arg" if ($arg*1 eq $arg);
277 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
278}
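# For example, &ror($a0,14) with $a0="%r13d" appends the line
# "ror $14,%r13d" to $code: a numeric argument gets a '$' prefix and the
# operands are emitted in AT&T order.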
279
280sub body_00_15 () {
281 (
282 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
283
284 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
285 '&mov ($a,$a1)',
286 '&mov ($a4,$f)',
287
288 '&xor ($a0,$e)',
289 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
290 '&xor ($a4,$g)', # f^g
291
292 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
293 '&xor ($a1,$a)',
294 '&and ($a4,$e)', # (f^g)&e
295
296 @aesni_cbc_block[$aesni_cbc_idx++].
297 '&xor ($a0,$e)',
298 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
299 '&mov ($a2,$a)',
300
301 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
302 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
303 '&xor ($a2,$b)', # a^b, b^c in next round
304
305 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
306 '&add ($h,$a4)', # h+=Ch(e,f,g)
307 '&and ($a3,$a2)', # (b^c)&(a^b)
308
309 '&xor ($a1,$a)',
310 '&add ($h,$a0)', # h+=Sigma1(e)
311 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
312
313 '&add ($d,$h)', # d+=h
314 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
315 '&add ($h,$a3)', # h+=Maj(a,b,c)
316
317 '&mov ($a0,$d)',
318 '&add ($a1,$h);'. # h+=Sigma0(a)
319 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
320 );
321}
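# Spelled out, every pass through this list is one SHA-256 round:
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	e' = d + T1,  a' = T1 + Sigma0(a) + Maj(a,b,c)
# with the Sigma0(a) term kept in $a1 and only folded in at the start of
# the following round ('&mov ($a,$a1)' / '&add ($a1,$h)').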
322
323if ($avx) {{
324######################################################################
325# XOP code path
326#
327$code.=<<___;
328.type ${func}_xop,\@function,6
329.align 64
330${func}_xop:
331.Lxop_shortcut:
332 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
333 push %rbx
334 push %rbp
335 push %r12
336 push %r13
337 push %r14
338 push %r15
339 mov %rsp,%r11 # copy %rsp
340 sub \$`$framesz+$win64*16*10`,%rsp
341 and \$-64,%rsp # align stack frame
342
343 shl \$6,$len
344 sub $inp,$out # re-bias
345 sub $inp,$in0
346 add $inp,$len # end of input
347
348 #mov $inp,$_inp # saved later
349 mov $out,$_out
350 mov $len,$_end
351 #mov $key,$_key # remains resident in $inp register
352 mov $ivp,$_ivp
353 mov $ctx,$_ctx
354 mov $in0,$_in0
355 mov %r11,$_rsp
356___
357$code.=<<___ if ($win64);
358 movaps %xmm6,`$framesz+16*0`(%rsp)
359 movaps %xmm7,`$framesz+16*1`(%rsp)
360 movaps %xmm8,`$framesz+16*2`(%rsp)
361 movaps %xmm9,`$framesz+16*3`(%rsp)
362 movaps %xmm10,`$framesz+16*4`(%rsp)
363 movaps %xmm11,`$framesz+16*5`(%rsp)
364 movaps %xmm12,`$framesz+16*6`(%rsp)
365 movaps %xmm13,`$framesz+16*7`(%rsp)
366 movaps %xmm14,`$framesz+16*8`(%rsp)
367 movaps %xmm15,`$framesz+16*9`(%rsp)
368___
369$code.=<<___;
370.Lprologue_xop:
371 vzeroall
372
373 mov $inp,%r12 # borrow $a4
374 lea 0x80($key),$inp # size optimization, reassign
375 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
376 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
377 mov $ctx,%r15 # borrow $a2
378 mov $in0,%rsi # borrow $a3
379 vmovdqu ($ivp),$iv # load IV
380 sub \$9,%r14
381
382 mov $SZ*0(%r15),$A
383 mov $SZ*1(%r15),$B
384 mov $SZ*2(%r15),$C
385 mov $SZ*3(%r15),$D
386 mov $SZ*4(%r15),$E
387 mov $SZ*5(%r15),$F
388 mov $SZ*6(%r15),$G
389 mov $SZ*7(%r15),$H
390
391 vmovdqa 0x00(%r13,%r14,8),$mask14
392 vmovdqa 0x10(%r13,%r14,8),$mask12
393 vmovdqa 0x20(%r13,%r14,8),$mask10
394 vmovdqu 0x00-0x80($inp),$roundkey
395 jmp .Lloop_xop
396___
397 if ($SZ==4) { # SHA256
398 my @X = map("%xmm$_",(0..3));
399 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
400
401$code.=<<___;
402.align 16
403.Lloop_xop:
404 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
405 vmovdqu 0x00(%rsi,%r12),@X[0]
406 vmovdqu 0x10(%rsi,%r12),@X[1]
407 vmovdqu 0x20(%rsi,%r12),@X[2]
408 vmovdqu 0x30(%rsi,%r12),@X[3]
409 vpshufb $t3,@X[0],@X[0]
410 lea $TABLE(%rip),$Tbl
411 vpshufb $t3,@X[1],@X[1]
412 vpshufb $t3,@X[2],@X[2]
413 vpaddd 0x00($Tbl),@X[0],$t0
414 vpshufb $t3,@X[3],@X[3]
415 vpaddd 0x20($Tbl),@X[1],$t1
416 vpaddd 0x40($Tbl),@X[2],$t2
417 vpaddd 0x60($Tbl),@X[3],$t3
418 vmovdqa $t0,0x00(%rsp)
419 mov $A,$a1
420 vmovdqa $t1,0x10(%rsp)
421 mov $B,$a3
422 vmovdqa $t2,0x20(%rsp)
423 xor $C,$a3 # magic
424 vmovdqa $t3,0x30(%rsp)
425 mov $E,$a0
426 jmp .Lxop_00_47
427
428.align 16
429.Lxop_00_47:
430 sub \$-16*2*$SZ,$Tbl # size optimization
431 vmovdqu (%r12),$inout # $a4
432 mov %r12,$_inp # $a4
433___
434sub XOP_256_00_47 () {
435my $j = shift;
436my $body = shift;
437my @X = @_;
438my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
439
440 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
441 eval(shift(@insns));
442 eval(shift(@insns));
443 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
444 eval(shift(@insns));
445 eval(shift(@insns));
446 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &vpsrld ($t0,$t0,$sigma0[2]);
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
453 eval(shift(@insns));
454 eval(shift(@insns));
455 eval(shift(@insns));
456 eval(shift(@insns));
457 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
458 eval(shift(@insns));
459 eval(shift(@insns));
460 &vpxor ($t0,$t0,$t1);
461 eval(shift(@insns));
462 eval(shift(@insns));
463 eval(shift(@insns));
464 eval(shift(@insns));
465 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vpsrld ($t2,@X[3],$sigma1[2]);
472 eval(shift(@insns));
473 eval(shift(@insns));
474 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
475 eval(shift(@insns));
476 eval(shift(@insns));
477 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
478 eval(shift(@insns));
479 eval(shift(@insns));
480 &vpxor ($t3,$t3,$t2);
481 eval(shift(@insns));
482 eval(shift(@insns));
483 eval(shift(@insns));
484 eval(shift(@insns));
485 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
486 eval(shift(@insns));
487 eval(shift(@insns));
488 eval(shift(@insns));
489 eval(shift(@insns));
490 &vpsrldq ($t3,$t3,8);
491 eval(shift(@insns));
492 eval(shift(@insns));
493 eval(shift(@insns));
494 eval(shift(@insns));
495 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
496 eval(shift(@insns));
497 eval(shift(@insns));
498 eval(shift(@insns));
499 eval(shift(@insns));
500 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
501 eval(shift(@insns));
502 eval(shift(@insns));
503 &vpsrld ($t2,@X[0],$sigma1[2]);
504 eval(shift(@insns));
505 eval(shift(@insns));
506 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
507 eval(shift(@insns));
508 eval(shift(@insns));
509 &vpxor ($t3,$t3,$t2);
510 eval(shift(@insns));
511 eval(shift(@insns));
512 eval(shift(@insns));
513 eval(shift(@insns));
514 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
515 eval(shift(@insns));
516 eval(shift(@insns));
517 eval(shift(@insns));
518 eval(shift(@insns));
519 &vpslldq ($t3,$t3,8); # 22 instructions
520 eval(shift(@insns));
521 eval(shift(@insns));
522 eval(shift(@insns));
523 eval(shift(@insns));
524 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
525 eval(shift(@insns));
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
530 foreach (@insns) { eval; } # remaining instructions
531 &vmovdqa (16*$j."(%rsp)",$t2);
532}
533
534 $aesni_cbc_idx=0;
535 for ($i=0,$j=0; $j<4; $j++) {
536 &XOP_256_00_47($j,\&body_00_15,@X);
537 push(@X,shift(@X)); # rotate(@X)
538 }
539 &mov ("%r12",$_inp); # borrow $a4
540 &vpand ($temp,$temp,$mask14);
541 &mov ("%r15",$_out); # borrow $a2
542 &vpor ($iv,$iv,$temp);
543 &vmovdqu ("(%r15,%r12)",$iv); # write output
544 &lea ("%r12","16(%r12)"); # inp++
545
546 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
547 &jne (".Lxop_00_47");
548
549 &vmovdqu ($inout,"(%r12)");
550 &mov ($_inp,"%r12");
551
552 $aesni_cbc_idx=0;
553 for ($i=0; $i<16; ) {
554 foreach(body_00_15()) { eval; }
555 }
556 }
557$code.=<<___;
558 mov $_inp,%r12 # borrow $a4
559 mov $_out,%r13 # borrow $a0
560 mov $_ctx,%r15 # borrow $a2
561 mov $_in0,%rsi # borrow $a3
562
563 vpand $mask14,$temp,$temp
564 mov $a1,$A
565 vpor $temp,$iv,$iv
566 vmovdqu $iv,(%r13,%r12) # write output
567 lea 16(%r12),%r12 # inp++
568
569 add $SZ*0(%r15),$A
570 add $SZ*1(%r15),$B
571 add $SZ*2(%r15),$C
572 add $SZ*3(%r15),$D
573 add $SZ*4(%r15),$E
574 add $SZ*5(%r15),$F
575 add $SZ*6(%r15),$G
576 add $SZ*7(%r15),$H
577
578 cmp $_end,%r12
579
580 mov $A,$SZ*0(%r15)
581 mov $B,$SZ*1(%r15)
582 mov $C,$SZ*2(%r15)
583 mov $D,$SZ*3(%r15)
584 mov $E,$SZ*4(%r15)
585 mov $F,$SZ*5(%r15)
586 mov $G,$SZ*6(%r15)
587 mov $H,$SZ*7(%r15)
588
589 jb .Lloop_xop
590
591 mov $_ivp,$ivp
592 mov $_rsp,%rsi
593 vmovdqu $iv,($ivp) # output IV
594 vzeroall
595___
596$code.=<<___ if ($win64);
597 movaps `$framesz+16*0`(%rsp),%xmm6
598 movaps `$framesz+16*1`(%rsp),%xmm7
599 movaps `$framesz+16*2`(%rsp),%xmm8
600 movaps `$framesz+16*3`(%rsp),%xmm9
601 movaps `$framesz+16*4`(%rsp),%xmm10
602 movaps `$framesz+16*5`(%rsp),%xmm11
603 movaps `$framesz+16*6`(%rsp),%xmm12
604 movaps `$framesz+16*7`(%rsp),%xmm13
605 movaps `$framesz+16*8`(%rsp),%xmm14
606 movaps `$framesz+16*9`(%rsp),%xmm15
607___
608$code.=<<___;
609 mov (%rsi),%r15
610 mov 8(%rsi),%r14
611 mov 16(%rsi),%r13
612 mov 24(%rsi),%r12
613 mov 32(%rsi),%rbp
614 mov 40(%rsi),%rbx
615 lea 48(%rsi),%rsp
616.Lepilogue_xop:
617 ret
618.size ${func}_xop,.-${func}_xop
619___
620######################################################################
621# AVX+shrd code path
622#
623local *ror = sub { &shrd(@_[0],@_) };
624
625$code.=<<___;
626.type ${func}_avx,\@function,6
627.align 64
628${func}_avx:
629.Lavx_shortcut:
630 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
631 push %rbx
632 push %rbp
633 push %r12
634 push %r13
635 push %r14
636 push %r15
637 mov %rsp,%r11 # copy %rsp
638 sub \$`$framesz+$win64*16*10`,%rsp
639 and \$-64,%rsp # align stack frame
640
641 shl \$6,$len
642 sub $inp,$out # re-bias
643 sub $inp,$in0
644 add $inp,$len # end of input
645
646 #mov $inp,$_inp # saved later
647 mov $out,$_out
648 mov $len,$_end
649 #mov $key,$_key # remains resident in $inp register
650 mov $ivp,$_ivp
651 mov $ctx,$_ctx
652 mov $in0,$_in0
653 mov %r11,$_rsp
654___
655$code.=<<___ if ($win64);
656 movaps %xmm6,`$framesz+16*0`(%rsp)
657 movaps %xmm7,`$framesz+16*1`(%rsp)
658 movaps %xmm8,`$framesz+16*2`(%rsp)
659 movaps %xmm9,`$framesz+16*3`(%rsp)
660 movaps %xmm10,`$framesz+16*4`(%rsp)
661 movaps %xmm11,`$framesz+16*5`(%rsp)
662 movaps %xmm12,`$framesz+16*6`(%rsp)
663 movaps %xmm13,`$framesz+16*7`(%rsp)
664 movaps %xmm14,`$framesz+16*8`(%rsp)
665 movaps %xmm15,`$framesz+16*9`(%rsp)
666___
667$code.=<<___;
668.Lprologue_avx:
669 vzeroall
670
671 mov $inp,%r12 # borrow $a4
672 lea 0x80($key),$inp # size optimization, reassign
673 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
674 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
675 mov $ctx,%r15 # borrow $a2
676 mov $in0,%rsi # borrow $a3
677 vmovdqu ($ivp),$iv # load IV
678 sub \$9,%r14
679
680 mov $SZ*0(%r15),$A
681 mov $SZ*1(%r15),$B
682 mov $SZ*2(%r15),$C
683 mov $SZ*3(%r15),$D
684 mov $SZ*4(%r15),$E
685 mov $SZ*5(%r15),$F
686 mov $SZ*6(%r15),$G
687 mov $SZ*7(%r15),$H
688
689 vmovdqa 0x00(%r13,%r14,8),$mask14
690 vmovdqa 0x10(%r13,%r14,8),$mask12
691 vmovdqa 0x20(%r13,%r14,8),$mask10
692 vmovdqu 0x00-0x80($inp),$roundkey
693___
694 if ($SZ==4) { # SHA256
695 my @X = map("%xmm$_",(0..3));
696 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
697
698$code.=<<___;
699 jmp .Lloop_avx
700.align 16
701.Lloop_avx:
702 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
703 vmovdqu 0x00(%rsi,%r12),@X[0]
704 vmovdqu 0x10(%rsi,%r12),@X[1]
705 vmovdqu 0x20(%rsi,%r12),@X[2]
706 vmovdqu 0x30(%rsi,%r12),@X[3]
707 vpshufb $t3,@X[0],@X[0]
708 lea $TABLE(%rip),$Tbl
709 vpshufb $t3,@X[1],@X[1]
710 vpshufb $t3,@X[2],@X[2]
711 vpaddd 0x00($Tbl),@X[0],$t0
712 vpshufb $t3,@X[3],@X[3]
713 vpaddd 0x20($Tbl),@X[1],$t1
714 vpaddd 0x40($Tbl),@X[2],$t2
715 vpaddd 0x60($Tbl),@X[3],$t3
716 vmovdqa $t0,0x00(%rsp)
717 mov $A,$a1
718 vmovdqa $t1,0x10(%rsp)
719 mov $B,$a3
720 vmovdqa $t2,0x20(%rsp)
721 xor $C,$a3 # magic
722 vmovdqa $t3,0x30(%rsp)
723 mov $E,$a0
724 jmp .Lavx_00_47
725
726.align 16
727.Lavx_00_47:
728 sub \$-16*2*$SZ,$Tbl # size optimization
729 vmovdqu (%r12),$inout # $a4
730 mov %r12,$_inp # $a4
731___
732sub Xupdate_256_AVX () {
733 (
734 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
735 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
736 '&vpsrld ($t2,$t0,$sigma0[0]);',
737 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
738 '&vpsrld ($t3,$t0,$sigma0[2])',
739 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
740 '&vpxor ($t0,$t3,$t2)',
741 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
742 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
743 '&vpxor ($t0,$t0,$t1)',
744 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
745 '&vpxor ($t0,$t0,$t2)',
746 '&vpsrld ($t2,$t3,$sigma1[2]);',
747 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
748 '&vpsrlq ($t3,$t3,$sigma1[0]);',
749 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
750 '&vpxor ($t2,$t2,$t3);',
751 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
752 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
753 '&vpshufd ($t2,$t2,0b10000100)',
754 '&vpsrldq ($t2,$t2,8)',
755 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
756 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
757 '&vpsrld ($t2,$t3,$sigma1[2])',
758 '&vpsrlq ($t3,$t3,$sigma1[0])',
759 '&vpxor ($t2,$t2,$t3);',
760 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
761 '&vpxor ($t2,$t2,$t3)',
762 '&vpshufd ($t2,$t2,0b11101000)',
763 '&vpslldq ($t2,$t2,8)',
764 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
765 );
766}
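# This is the standard SHA-256 message schedule, i.e.
#	sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ (x>>3)
#	sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ (x>>10)
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# computed four words at a time; sigma1() is applied to X[14..15] first and
# then to the freshly produced X[16..17], because the last two inputs do
# not exist until the first two outputs are ready.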
767
768sub AVX_256_00_47 () {
769my $j = shift;
770my $body = shift;
771my @X = @_;
772my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
773
774 foreach (Xupdate_256_AVX()) { # 29 instructions
775 eval;
776 eval(shift(@insns));
777 eval(shift(@insns));
778 eval(shift(@insns));
779 }
780 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
781 foreach (@insns) { eval; } # remaining instructions
782 &vmovdqa (16*$j."(%rsp)",$t2);
783}
784
785 $aesni_cbc_idx=0;
786 for ($i=0,$j=0; $j<4; $j++) {
787 &AVX_256_00_47($j,\&body_00_15,@X);
788 push(@X,shift(@X)); # rotate(@X)
789 }
790 &mov ("%r12",$_inp); # borrow $a4
791 &vpand ($temp,$temp,$mask14);
792 &mov ("%r15",$_out); # borrow $a2
793 &vpor ($iv,$iv,$temp);
794 &vmovdqu ("(%r15,%r12)",$iv); # write output
795 &lea ("%r12","16(%r12)"); # inp++
796
797 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
798 &jne (".Lavx_00_47");
799
800 &vmovdqu ($inout,"(%r12)");
801 &mov ($_inp,"%r12");
802
803 $aesni_cbc_idx=0;
804 for ($i=0; $i<16; ) {
805 foreach(body_00_15()) { eval; }
806 }
807
808 }
809$code.=<<___;
810 mov $_inp,%r12 # borrow $a4
811 mov $_out,%r13 # borrow $a0
812 mov $_ctx,%r15 # borrow $a2
813 mov $_in0,%rsi # borrow $a3
814
815 vpand $mask14,$temp,$temp
816 mov $a1,$A
817 vpor $temp,$iv,$iv
818 vmovdqu $iv,(%r13,%r12) # write output
819 lea 16(%r12),%r12 # inp++
820
821 add $SZ*0(%r15),$A
822 add $SZ*1(%r15),$B
823 add $SZ*2(%r15),$C
824 add $SZ*3(%r15),$D
825 add $SZ*4(%r15),$E
826 add $SZ*5(%r15),$F
827 add $SZ*6(%r15),$G
828 add $SZ*7(%r15),$H
829
830 cmp $_end,%r12
831
832 mov $A,$SZ*0(%r15)
833 mov $B,$SZ*1(%r15)
834 mov $C,$SZ*2(%r15)
835 mov $D,$SZ*3(%r15)
836 mov $E,$SZ*4(%r15)
837 mov $F,$SZ*5(%r15)
838 mov $G,$SZ*6(%r15)
839 mov $H,$SZ*7(%r15)
840 jb .Lloop_avx
841
842 mov $_ivp,$ivp
843 mov $_rsp,%rsi
844 vmovdqu $iv,($ivp) # output IV
845 vzeroall
846___
847$code.=<<___ if ($win64);
848 movaps `$framesz+16*0`(%rsp),%xmm6
849 movaps `$framesz+16*1`(%rsp),%xmm7
850 movaps `$framesz+16*2`(%rsp),%xmm8
851 movaps `$framesz+16*3`(%rsp),%xmm9
852 movaps `$framesz+16*4`(%rsp),%xmm10
853 movaps `$framesz+16*5`(%rsp),%xmm11
854 movaps `$framesz+16*6`(%rsp),%xmm12
855 movaps `$framesz+16*7`(%rsp),%xmm13
856 movaps `$framesz+16*8`(%rsp),%xmm14
857 movaps `$framesz+16*9`(%rsp),%xmm15
858___
859$code.=<<___;
860 mov (%rsi),%r15
861 mov 8(%rsi),%r14
862 mov 16(%rsi),%r13
863 mov 24(%rsi),%r12
864 mov 32(%rsi),%rbp
865 mov 40(%rsi),%rbx
866 lea 48(%rsi),%rsp
867.Lepilogue_avx:
868 ret
869.size ${func}_avx,.-${func}_avx
870___
871
872if ($avx>1) {{
873######################################################################
874# AVX2+BMI code path
875#
876my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
877my $PUSH8=8*2*$SZ;
878use integer;
879
880sub bodyx_00_15 () {
881	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
882 (
883 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
884
885 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
886 '&and ($a4,$e)', # f&e
887 '&rorx ($a0,$e,$Sigma1[2])',
888 '&rorx ($a2,$e,$Sigma1[1])',
889
890 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
891 '&lea ($h,"($h,$a4)")',
892 '&andn ($a4,$e,$g)', # ~e&g
893 '&xor ($a0,$a2)',
894
895 '&rorx ($a1,$e,$Sigma1[0])',
896 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
897 '&xor ($a0,$a1)', # Sigma1(e)
898 '&mov ($a2,$a)',
899
900 '&rorx ($a4,$a,$Sigma0[2])',
901 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
902 '&xor ($a2,$b)', # a^b, b^c in next round
903 '&rorx ($a1,$a,$Sigma0[1])',
904
905 '&rorx ($a0,$a,$Sigma0[0])',
906 '&lea ($d,"($d,$h)")', # d+=h
907 '&and ($a3,$a2)', # (b^c)&(a^b)
908 @aesni_cbc_block[$aesni_cbc_idx++].
909 '&xor ($a1,$a4)',
910
911 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
912 '&xor ($a1,$a0)', # Sigma0(a)
913 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
914 '&mov ($a4,$e)', # copy of f in future
915
916 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
917 );
918 # and at the finish one has to $a+=$a1
919}
920
921$code.=<<___;
922.type ${func}_avx2,\@function,6
923.align 64
924${func}_avx2:
925.Lavx2_shortcut:
926 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
927 push %rbx
928 push %rbp
929 push %r12
930 push %r13
931 push %r14
932 push %r15
933 mov %rsp,%r11 # copy %rsp
934 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
935 and \$-256*$SZ,%rsp # align stack frame
936 add \$`2*$SZ*($rounds-8)`,%rsp
937
938 shl \$6,$len
939 sub $inp,$out # re-bias
940 sub $inp,$in0
941 add $inp,$len # end of input
942
943 #mov $inp,$_inp # saved later
944 #mov $out,$_out # kept in $offload
945 mov $len,$_end
946 #mov $key,$_key # remains resident in $inp register
947 mov $ivp,$_ivp
948 mov $ctx,$_ctx
949 mov $in0,$_in0
950 mov %r11,$_rsp
951___
952$code.=<<___ if ($win64);
953 movaps %xmm6,`$framesz+16*0`(%rsp)
954 movaps %xmm7,`$framesz+16*1`(%rsp)
955 movaps %xmm8,`$framesz+16*2`(%rsp)
956 movaps %xmm9,`$framesz+16*3`(%rsp)
957 movaps %xmm10,`$framesz+16*4`(%rsp)
958 movaps %xmm11,`$framesz+16*5`(%rsp)
959 movaps %xmm12,`$framesz+16*6`(%rsp)
960 movaps %xmm13,`$framesz+16*7`(%rsp)
961 movaps %xmm14,`$framesz+16*8`(%rsp)
962 movaps %xmm15,`$framesz+16*9`(%rsp)
963___
964$code.=<<___;
965.Lprologue_avx2:
966 vzeroall
967
968 mov $inp,%r13 # borrow $a0
969 vpinsrq \$1,$out,$offload,$offload
970 lea 0x80($key),$inp # size optimization, reassign
971 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
972 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
973 mov $ctx,%r15 # borrow $a2
974 mov $in0,%rsi # borrow $a3
975 vmovdqu ($ivp),$iv # load IV
976 lea -9(%r14),%r14
977
978 vmovdqa 0x00(%r12,%r14,8),$mask14
979 vmovdqa 0x10(%r12,%r14,8),$mask12
980 vmovdqa 0x20(%r12,%r14,8),$mask10
981
982 sub \$-16*$SZ,%r13 # inp++, size optimization
983 mov $SZ*0(%r15),$A
984 lea (%rsi,%r13),%r12 # borrow $a0
985 mov $SZ*1(%r15),$B
986 cmp $len,%r13 # $_end
987 mov $SZ*2(%r15),$C
988 cmove %rsp,%r12 # next block or random data
989 mov $SZ*3(%r15),$D
990 mov $SZ*4(%r15),$E
991 mov $SZ*5(%r15),$F
992 mov $SZ*6(%r15),$G
993 mov $SZ*7(%r15),$H
994 vmovdqu 0x00-0x80($inp),$roundkey
995___
996 if ($SZ==4) { # SHA256
997 my @X = map("%ymm$_",(0..3));
998 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
999
1000$code.=<<___;
1001 jmp .Loop_avx2
1002.align 16
1003.Loop_avx2:
1004 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1005 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1006 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1007 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1008 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1009
1010 vinserti128 \$1,(%r12),@X[0],@X[0]
1011 vinserti128 \$1,16(%r12),@X[1],@X[1]
1012 vpshufb $t3,@X[0],@X[0]
1013 vinserti128 \$1,32(%r12),@X[2],@X[2]
1014 vpshufb $t3,@X[1],@X[1]
1015 vinserti128 \$1,48(%r12),@X[3],@X[3]
1016
1017 lea $TABLE(%rip),$Tbl
1018 vpshufb $t3,@X[2],@X[2]
1019 lea -16*$SZ(%r13),%r13
1020 vpaddd 0x00($Tbl),@X[0],$t0
1021 vpshufb $t3,@X[3],@X[3]
1022 vpaddd 0x20($Tbl),@X[1],$t1
1023 vpaddd 0x40($Tbl),@X[2],$t2
1024 vpaddd 0x60($Tbl),@X[3],$t3
1025 vmovdqa $t0,0x00(%rsp)
1026 xor $a1,$a1
1027 vmovdqa $t1,0x20(%rsp)
1028 lea -$PUSH8(%rsp),%rsp
1029 mov $B,$a3
1030 vmovdqa $t2,0x00(%rsp)
1031 xor $C,$a3 # magic
1032 vmovdqa $t3,0x20(%rsp)
1033 mov $F,$a4
1034 sub \$-16*2*$SZ,$Tbl # size optimization
1035 jmp .Lavx2_00_47
1036
1037.align 16
1038.Lavx2_00_47:
1039 vmovdqu (%r13),$inout
1040 vpinsrq \$0,%r13,$offload,$offload
1041___
1042
1043sub AVX2_256_00_47 () {
1044my $j = shift;
1045my $body = shift;
1046my @X = @_;
1047my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1048my $base = "+2*$PUSH8(%rsp)";
1049
1050 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1051 foreach (Xupdate_256_AVX()) { # 29 instructions
1052 eval;
1053 eval(shift(@insns));
1054 eval(shift(@insns));
1055 eval(shift(@insns));
1056 }
1057 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1058 foreach (@insns) { eval; } # remaining instructions
1059 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1060}
1061 $aesni_cbc_idx=0;
1062 for ($i=0,$j=0; $j<4; $j++) {
1063 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1064 push(@X,shift(@X)); # rotate(@X)
1065 }
1066 &vmovq ("%r13",$offload); # borrow $a0
1067 &vpextrq ("%r15",$offload,1); # borrow $a2
1068 &vpand ($temp,$temp,$mask14);
1069 &vpor ($iv,$iv,$temp);
1070 &vmovdqu ("(%r15,%r13)",$iv); # write output
1071 &lea ("%r13","16(%r13)"); # inp++
1072
1073 &lea ($Tbl,16*2*$SZ."($Tbl)");
1074 &cmpb (($SZ-1)."($Tbl)",0);
1075 &jne (".Lavx2_00_47");
1076
1077 &vmovdqu ($inout,"(%r13)");
1078 &vpinsrq ($offload,$offload,"%r13",0);
1079
1080 $aesni_cbc_idx=0;
1081 for ($i=0; $i<16; ) {
1082 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1083 foreach(bodyx_00_15()) { eval; }
1084 }
1085 }
1086$code.=<<___;
1087 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1088 vmovq $offload,%r13 # $_inp, borrow $a0
1089 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1090 add $a1,$A
1091 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1092
1093 vpand $mask14,$temp,$temp
1094 vpor $temp,$iv,$iv
1095 vmovdqu $iv,(%r12,%r13) # write output
1096 lea 16(%r13),%r13
1097
1098 add $SZ*0(%r15),$A
1099 add $SZ*1(%r15),$B
1100 add $SZ*2(%r15),$C
1101 add $SZ*3(%r15),$D
1102 add $SZ*4(%r15),$E
1103 add $SZ*5(%r15),$F
1104 add $SZ*6(%r15),$G
1105 add $SZ*7(%r15),$H
1106
1107 mov $A,$SZ*0(%r15)
1108 mov $B,$SZ*1(%r15)
1109 mov $C,$SZ*2(%r15)
1110 mov $D,$SZ*3(%r15)
1111 mov $E,$SZ*4(%r15)
1112 mov $F,$SZ*5(%r15)
1113 mov $G,$SZ*6(%r15)
1114 mov $H,$SZ*7(%r15)
1115
1116 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1117 je .Ldone_avx2
1118
1119 xor $a1,$a1
1120 mov $B,$a3
1121 mov $F,$a4
1122 xor $C,$a3 # magic
1123 jmp .Lower_avx2
1124.align 16
1125.Lower_avx2:
1126 vmovdqu (%r13),$inout
1127 vpinsrq \$0,%r13,$offload,$offload
1128___
1129 $aesni_cbc_idx=0;
1130 for ($i=0; $i<16; ) {
1131 my $base="+16($Tbl)";
1132 foreach(bodyx_00_15()) { eval; }
1133 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1134 }
1135$code.=<<___;
1136 vmovq $offload,%r13 # borrow $a0
1137 vpextrq \$1,$offload,%r15 # borrow $a2
1138 vpand $mask14,$temp,$temp
1139 vpor $temp,$iv,$iv
1140 lea -$PUSH8($Tbl),$Tbl
1141 vmovdqu $iv,(%r15,%r13) # write output
1142 lea 16(%r13),%r13 # inp++
1143 cmp %rsp,$Tbl
1144 jae .Lower_avx2
1145
1146 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1147 lea 16*$SZ(%r13),%r13
1148 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1149 add $a1,$A
1150 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1151
1152 add $SZ*0(%r15),$A
1153 add $SZ*1(%r15),$B
1154 add $SZ*2(%r15),$C
1155 add $SZ*3(%r15),$D
1156 add $SZ*4(%r15),$E
1157 add $SZ*5(%r15),$F
1158 add $SZ*6(%r15),$G
1159 lea (%rsi,%r13),%r12
1160 add $SZ*7(%r15),$H
1161
1162 cmp $_end,%r13
1163
1164 mov $A,$SZ*0(%r15)
1165 cmove %rsp,%r12 # next block or stale data
1166 mov $B,$SZ*1(%r15)
1167 mov $C,$SZ*2(%r15)
1168 mov $D,$SZ*3(%r15)
1169 mov $E,$SZ*4(%r15)
1170 mov $F,$SZ*5(%r15)
1171 mov $G,$SZ*6(%r15)
1172 mov $H,$SZ*7(%r15)
1173
1174 jbe .Loop_avx2
1175 lea (%rsp),$Tbl
1176
1177.Ldone_avx2:
1178 lea ($Tbl),%rsp
1179 mov $_ivp,$ivp
1180 mov $_rsp,%rsi
1181 vmovdqu $iv,($ivp) # output IV
1182 vzeroall
1183___
1184$code.=<<___ if ($win64);
1185 movaps `$framesz+16*0`(%rsp),%xmm6
1186 movaps `$framesz+16*1`(%rsp),%xmm7
1187 movaps `$framesz+16*2`(%rsp),%xmm8
1188 movaps `$framesz+16*3`(%rsp),%xmm9
1189 movaps `$framesz+16*4`(%rsp),%xmm10
1190 movaps `$framesz+16*5`(%rsp),%xmm11
1191 movaps `$framesz+16*6`(%rsp),%xmm12
1192 movaps `$framesz+16*7`(%rsp),%xmm13
1193 movaps `$framesz+16*8`(%rsp),%xmm14
1194 movaps `$framesz+16*9`(%rsp),%xmm15
1195___
1196$code.=<<___;
1197 mov (%rsi),%r15
1198 mov 8(%rsi),%r14
1199 mov 16(%rsi),%r13
1200 mov 24(%rsi),%r12
1201 mov 32(%rsi),%rbp
1202 mov 40(%rsi),%rbx
1203 lea 48(%rsi),%rsp
1204.Lepilogue_avx2:
1205 ret
1206.size ${func}_avx2,.-${func}_avx2
1207___
1208}}
1209}}
1210{{
1211my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1212
1213my ($rounds,$Tbl)=("%r11d","%rbx");
1214
1215my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1216my @rndkey=("%xmm4","%xmm5");
1217my $r=0;
1218my $sn=0;
1219
1220my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1221my @MSG=map("%xmm$_",(10..13));
1222
1223my $aesenc=sub {
1224 use integer;
1225 my ($n,$k)=($r/10,$r%10);
1226 if ($k==0) {
1227 $code.=<<___;
1228 movups `16*$n`($in0),$in # load input
1229 xorps $rndkey0,$in
1230___
1231 $code.=<<___ if ($n);
1232 movups $iv,`16*($n-1)`($out,$in0) # write output
1233___
1234 $code.=<<___;
1235 xorps $in,$iv
1236 movups `32+16*$k-112`($key),$rndkey[1]
1237 aesenc $rndkey[0],$iv
1238___
1239 } elsif ($k==9) {
1240 $sn++;
1241 $code.=<<___;
1242 cmp \$11,$rounds
1243 jb .Laesenclast$sn
1244 movups `32+16*($k+0)-112`($key),$rndkey[1]
1245 aesenc $rndkey[0],$iv
1246 movups `32+16*($k+1)-112`($key),$rndkey[0]
1247 aesenc $rndkey[1],$iv
1248 je .Laesenclast$sn
1249 movups `32+16*($k+2)-112`($key),$rndkey[1]
1250 aesenc $rndkey[0],$iv
1251 movups `32+16*($k+3)-112`($key),$rndkey[0]
1252 aesenc $rndkey[1],$iv
1253.Laesenclast$sn:
1254 aesenclast $rndkey[0],$iv
1255 movups 16-112($key),$rndkey[1] # forward reference
1256 nop
1257___
1258 } else {
1259 $code.=<<___;
1260 movups `32+16*$k-112`($key),$rndkey[1]
1261 aesenc $rndkey[0],$iv
1262___
1263 }
1264 $r++; unshift(@rndkey,pop(@rndkey));
1265};
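# Each invocation of $aesenc->() emits one step of the interleaved CBC
# encryption: $r counts steps, $r/10 selects which 16-byte block of the
# current 64-byte chunk is being processed and $r%10 the AES round within
# it; the $k==9 tail branches on $rounds to append the extra rounds needed
# for 192- and 256-bit keys.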
1266
1267if ($shaext) {
1268my $Tbl="%rax";
1269
1270$code.=<<___;
1271.type ${func}_shaext,\@function,6
1272.align 32
1273${func}_shaext:
1274 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1275___
1276$code.=<<___ if ($win64);
1277 lea `-8-10*16`(%rsp),%rsp
1278 movaps %xmm6,-8-10*16(%rax)
1279 movaps %xmm7,-8-9*16(%rax)
1280 movaps %xmm8,-8-8*16(%rax)
1281 movaps %xmm9,-8-7*16(%rax)
1282 movaps %xmm10,-8-6*16(%rax)
1283 movaps %xmm11,-8-5*16(%rax)
1284 movaps %xmm12,-8-4*16(%rax)
1285 movaps %xmm13,-8-3*16(%rax)
1286 movaps %xmm14,-8-2*16(%rax)
1287 movaps %xmm15,-8-1*16(%rax)
1288.Lprologue_shaext:
1289___
1290$code.=<<___;
1291 lea K256+0x80(%rip),$Tbl
1292 movdqu ($ctx),$ABEF # DCBA
1293 movdqu 16($ctx),$CDGH # HGFE
1294 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1295
1296 mov 240($key),$rounds
1297 sub $in0,$out
1298 movups ($key),$rndkey0 # $key[0]
1299 movups 16($key),$rndkey[0] # forward reference
1300 lea 112($key),$key # size optimization
1301
1302 pshufd \$0x1b,$ABEF,$Wi # ABCD
1303 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1304 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1305 movdqa $TMP,$BSWAP # offload
1306 palignr \$8,$CDGH,$ABEF # ABEF
1307 punpcklqdq $Wi,$CDGH # CDGH
1308
1309 jmp .Loop_shaext
1310
1311.align 16
1312.Loop_shaext:
1313 movdqu ($inp),@MSG[0]
1314 movdqu 0x10($inp),@MSG[1]
1315 movdqu 0x20($inp),@MSG[2]
1316 pshufb $TMP,@MSG[0]
1317 movdqu 0x30($inp),@MSG[3]
1318
1319 movdqa 0*32-0x80($Tbl),$Wi
1320 paddd @MSG[0],$Wi
1321 pshufb $TMP,@MSG[1]
1322 movdqa $CDGH,$CDGH_SAVE # offload
1323 movdqa $ABEF,$ABEF_SAVE # offload
1324___
1325 &$aesenc();
1326$code.=<<___;
1327 sha256rnds2 $ABEF,$CDGH # 0-3
1328 pshufd \$0x0e,$Wi,$Wi
1329___
1330 &$aesenc();
1331$code.=<<___;
1332 sha256rnds2 $CDGH,$ABEF
1333
1334 movdqa 1*32-0x80($Tbl),$Wi
1335 paddd @MSG[1],$Wi
1336 pshufb $TMP,@MSG[2]
1337 lea 0x40($inp),$inp
1338___
1339 &$aesenc();
1340$code.=<<___;
1341 sha256rnds2 $ABEF,$CDGH # 4-7
1342 pshufd \$0x0e,$Wi,$Wi
1343___
1344 &$aesenc();
1345$code.=<<___;
1346 sha256rnds2 $CDGH,$ABEF
1347
1348 movdqa 2*32-0x80($Tbl),$Wi
1349 paddd @MSG[2],$Wi
1350 pshufb $TMP,@MSG[3]
1351 sha256msg1 @MSG[1],@MSG[0]
1352___
1353 &$aesenc();
1354$code.=<<___;
1355 sha256rnds2 $ABEF,$CDGH # 8-11
1356 pshufd \$0x0e,$Wi,$Wi
1357 movdqa @MSG[3],$TMP
1358 palignr \$4,@MSG[2],$TMP
1359 paddd $TMP,@MSG[0]
1360___
1361 &$aesenc();
1362$code.=<<___;
1363 sha256rnds2 $CDGH,$ABEF
1364
1365 movdqa 3*32-0x80($Tbl),$Wi
1366 paddd @MSG[3],$Wi
1367 sha256msg2 @MSG[3],@MSG[0]
1368 sha256msg1 @MSG[2],@MSG[1]
1369___
1370 &$aesenc();
1371$code.=<<___;
1372 sha256rnds2 $ABEF,$CDGH # 12-15
1373 pshufd \$0x0e,$Wi,$Wi
1374___
1375 &$aesenc();
1376$code.=<<___;
1377 movdqa @MSG[0],$TMP
1378 palignr \$4,@MSG[3],$TMP
1379 paddd $TMP,@MSG[1]
1380 sha256rnds2 $CDGH,$ABEF
1381___
1382for($i=4;$i<16-3;$i++) {
1383 &$aesenc() if (($r%10)==0);
1384$code.=<<___;
1385 movdqa $i*32-0x80($Tbl),$Wi
1386 paddd @MSG[0],$Wi
1387 sha256msg2 @MSG[0],@MSG[1]
1388 sha256msg1 @MSG[3],@MSG[2]
1389___
1390 &$aesenc();
1391$code.=<<___;
1392 sha256rnds2 $ABEF,$CDGH # 16-19...
1393 pshufd \$0x0e,$Wi,$Wi
1394 movdqa @MSG[1],$TMP
1395 palignr \$4,@MSG[0],$TMP
1396 paddd $TMP,@MSG[2]
1397___
1398 &$aesenc();
1399 &$aesenc() if ($r==19);
1400$code.=<<___;
1401 sha256rnds2 $CDGH,$ABEF
1402___
1403 push(@MSG,shift(@MSG));
1404}
1405$code.=<<___;
1406 movdqa 13*32-0x80($Tbl),$Wi
1407 paddd @MSG[0],$Wi
1408 sha256msg2 @MSG[0],@MSG[1]
1409 sha256msg1 @MSG[3],@MSG[2]
1410___
1411 &$aesenc();
1412$code.=<<___;
1413 sha256rnds2 $ABEF,$CDGH # 52-55
1414 pshufd \$0x0e,$Wi,$Wi
1415 movdqa @MSG[1],$TMP
1416 palignr \$4,@MSG[0],$TMP
1417 paddd $TMP,@MSG[2]
1418___
1419 &$aesenc();
1420 &$aesenc();
1421$code.=<<___;
1422 sha256rnds2 $CDGH,$ABEF
1423
1424 movdqa 14*32-0x80($Tbl),$Wi
1425 paddd @MSG[1],$Wi
1426 sha256msg2 @MSG[1],@MSG[2]
1427 movdqa $BSWAP,$TMP
1428___
1429 &$aesenc();
1430$code.=<<___;
1431 sha256rnds2 $ABEF,$CDGH # 56-59
1432 pshufd \$0x0e,$Wi,$Wi
1433___
1434 &$aesenc();
1435$code.=<<___;
1436 sha256rnds2 $CDGH,$ABEF
1437
1438 movdqa 15*32-0x80($Tbl),$Wi
1439 paddd @MSG[2],$Wi
1440___
1441 &$aesenc();
1442 &$aesenc();
1443$code.=<<___;
1444 sha256rnds2 $ABEF,$CDGH # 60-63
1445 pshufd \$0x0e,$Wi,$Wi
1446___
1447 &$aesenc();
1448$code.=<<___;
1449 sha256rnds2 $CDGH,$ABEF
1450 #pxor $CDGH,$rndkey0 # black magic
1451___
1452 while ($r<40) { &$aesenc(); } # remaining aesenc's
1453$code.=<<___;
1454 #xorps $CDGH,$rndkey0 # black magic
1455 paddd $CDGH_SAVE,$CDGH
1456 paddd $ABEF_SAVE,$ABEF
1457
1458 dec $len
1459 movups $iv,48($out,$in0) # write output
1460 lea 64($in0),$in0
1461 jnz .Loop_shaext
1462
1463 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1464 pshufd \$0x1b,$ABEF,$TMP # FEBA
1465 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1466 punpckhqdq $CDGH,$ABEF # DCBA
1467 palignr \$8,$TMP,$CDGH # HGFE
1468
1469 movups $iv,($ivp) # write IV
1470 movdqu $ABEF,($ctx)
1471 movdqu $CDGH,16($ctx)
1472___
1473$code.=<<___ if ($win64);
1474 movaps 0*16(%rsp),%xmm6
1475 movaps 1*16(%rsp),%xmm7
1476 movaps 2*16(%rsp),%xmm8
1477 movaps 3*16(%rsp),%xmm9
1478 movaps 4*16(%rsp),%xmm10
1479 movaps 5*16(%rsp),%xmm11
1480 movaps 6*16(%rsp),%xmm12
1481 movaps 7*16(%rsp),%xmm13
1482 movaps 8*16(%rsp),%xmm14
1483 movaps 9*16(%rsp),%xmm15
1484 lea 8+10*16(%rsp),%rsp
1485.Lepilogue_shaext:
1486___
1487$code.=<<___;
1488 ret
1489.size ${func}_shaext,.-${func}_shaext
1490___
1491}
1492}}}}}
1493
1494# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1495# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1496if ($win64) {
1497$rec="%rcx";
1498$frame="%rdx";
1499$context="%r8";
1500$disp="%r9";
1501
1502$code.=<<___ if ($avx);
1503.extern __imp_RtlVirtualUnwind
1504.type se_handler,\@abi-omnipotent
1505.align 16
1506se_handler:
1507 push %rsi
1508 push %rdi
1509 push %rbx
1510 push %rbp
1511 push %r12
1512 push %r13
1513 push %r14
1514 push %r15
1515 pushfq
1516 sub \$64,%rsp
1517
1518 mov 120($context),%rax # pull context->Rax
1519 mov 248($context),%rbx # pull context->Rip
1520
1521 mov 8($disp),%rsi # disp->ImageBase
1522 mov 56($disp),%r11 # disp->HandlerData
1523
1524 mov 0(%r11),%r10d # HandlerData[0]
1525 lea (%rsi,%r10),%r10 # prologue label
1526 cmp %r10,%rbx # context->Rip<prologue label
1527 jb .Lin_prologue
1528
1529 mov 152($context),%rax # pull context->Rsp
1530
1531 mov 4(%r11),%r10d # HandlerData[1]
1532 lea (%rsi,%r10),%r10 # epilogue label
1533 cmp %r10,%rbx # context->Rip>=epilogue label
1534 jae .Lin_prologue
1535___
1536$code.=<<___ if ($shaext);
1537 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1538 cmp %r10,%rbx
1539 jb .Lnot_in_shaext
1540
1541 lea (%rax),%rsi
1542 lea 512($context),%rdi # &context.Xmm6
1543 mov \$20,%ecx
1544 .long 0xa548f3fc # cld; rep movsq
1545 lea 168(%rax),%rax # adjust stack pointer
1546 jmp .Lin_prologue
1547.Lnot_in_shaext:
1548___
1549$code.=<<___ if ($avx>1);
1550 lea .Lavx2_shortcut(%rip),%r10
1551 cmp %r10,%rbx # context->Rip<avx2_shortcut
1552 jb .Lnot_in_avx2
1553
1554 and \$-256*$SZ,%rax
1555 add \$`2*$SZ*($rounds-8)`,%rax
1556.Lnot_in_avx2:
1557___
1558$code.=<<___;
1559 mov %rax,%rsi # put aside Rsp
1560 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1561 lea 48(%rax),%rax
1562
1563 mov -8(%rax),%rbx
1564 mov -16(%rax),%rbp
1565 mov -24(%rax),%r12
1566 mov -32(%rax),%r13
1567 mov -40(%rax),%r14
1568 mov -48(%rax),%r15
1569 mov %rbx,144($context) # restore context->Rbx
1570 mov %rbp,160($context) # restore context->Rbp
1571 mov %r12,216($context) # restore context->R12
1572 mov %r13,224($context) # restore context->R13
1573 mov %r14,232($context) # restore context->R14
1574 mov %r15,240($context) # restore context->R15
1575
1576 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1577 lea 512($context),%rdi # &context.Xmm6
1578 mov \$20,%ecx
1579 .long 0xa548f3fc # cld; rep movsq
1580
1581.Lin_prologue:
1582 mov 8(%rax),%rdi
1583 mov 16(%rax),%rsi
1584 mov %rax,152($context) # restore context->Rsp
1585 mov %rsi,168($context) # restore context->Rsi
1586 mov %rdi,176($context) # restore context->Rdi
1587
1588 mov 40($disp),%rdi # disp->ContextRecord
1589 mov $context,%rsi # context
1590 mov \$154,%ecx # sizeof(CONTEXT)
1591 .long 0xa548f3fc # cld; rep movsq
1592
1593 mov $disp,%rsi
1594 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1595 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1596 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1597 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1598 mov 40(%rsi),%r10 # disp->ContextRecord
1599 lea 56(%rsi),%r11 # &disp->HandlerData
1600 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1601 mov %r10,32(%rsp) # arg5
1602 mov %r11,40(%rsp) # arg6
1603 mov %r12,48(%rsp) # arg7
1604 mov %rcx,56(%rsp) # arg8, (NULL)
1605 call *__imp_RtlVirtualUnwind(%rip)
1606
1607 mov \$1,%eax # ExceptionContinueSearch
1608 add \$64,%rsp
1609 popfq
1610 pop %r15
1611 pop %r14
1612 pop %r13
1613 pop %r12
1614 pop %rbp
1615 pop %rbx
1616 pop %rdi
1617 pop %rsi
1618 ret
1619.size se_handler,.-se_handler
1620
1621.section .pdata
1622 .rva .LSEH_begin_${func}_xop
1623 .rva .LSEH_end_${func}_xop
1624 .rva .LSEH_info_${func}_xop
1625
1626 .rva .LSEH_begin_${func}_avx
1627 .rva .LSEH_end_${func}_avx
1628 .rva .LSEH_info_${func}_avx
1629___
1630$code.=<<___ if ($avx>1);
1631 .rva .LSEH_begin_${func}_avx2
1632 .rva .LSEH_end_${func}_avx2
1633 .rva .LSEH_info_${func}_avx2
1634___
1635$code.=<<___ if ($shaext);
1636 .rva .LSEH_begin_${func}_shaext
1637 .rva .LSEH_end_${func}_shaext
1638 .rva .LSEH_info_${func}_shaext
1639___
1640$code.=<<___ if ($avx);
1641.section .xdata
1642.align 8
1643.LSEH_info_${func}_xop:
1644 .byte 9,0,0,0
1645 .rva se_handler
1646 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1647
1648.LSEH_info_${func}_avx:
1649 .byte 9,0,0,0
1650 .rva se_handler
1651 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1652___
1653$code.=<<___ if ($avx>1);
1654.LSEH_info_${func}_avx2:
1655 .byte 9,0,0,0
1656 .rva se_handler
1657 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1658___
1659$code.=<<___ if ($shaext);
1660.LSEH_info_${func}_shaext:
1661 .byte 9,0,0,0
1662 .rva se_handler
1663 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1664___
1665}
1666
1667####################################################################
1668sub rex {
1669 local *opcode=shift;
1670 my ($dst,$src)=@_;
1671 my $rex=0;
1672
1673 $rex|=0x04 if($dst>=8);
1674 $rex|=0x01 if($src>=8);
1675 unshift @opcode,$rex|0x40 if($rex);
1676}
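# For instance, rex(\@opcode,9,2) would prepend 0x44 (REX.R) so that a
# destination of %xmm9 can be encoded; with both register numbers below 8
# no prefix is emitted at all.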
1677
1678{
1679 my %opcodelet = (
1680 "sha256rnds2" => 0xcb,
1681 "sha256msg1" => 0xcc,
1682 "sha256msg2" => 0xcd );
1683
1684 sub sha256op38 {
1685 my $instr = shift;
1686
1687 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
1688 my @opcode=(0x0f,0x38);
1689 rex(\@opcode,$2,$1);
1690 push @opcode,$opcodelet{$instr};
1691 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1692 return ".byte\t".join(',',@opcode);
1693 } else {
1694 return $instr."\t".@_[0];
1695 }
1696 }
1697}
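# A worked example of the fallback encoding: "sha256rnds2 %xmm1,%xmm2"
# (both registers below 8, so no REX prefix) is emitted as
# ".byte 0x0f,0x38,0xcb,0xd1", where 0xd1 = 0xc0|1|(2<<3) is the ModR/M
# byte; assemblers too old to know the SHA extension still accept this.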
1698
1699$code =~ s/\`([^\`]*)\`/eval $1/gem;
1700$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1701print $code;
1702close STDOUT;