#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use
# of processor resources and achieves better performance. SHA256
# instruction sequences(*) are taken from sha512-x86_64.pl and the
# AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte (lower is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the
# stitched subroutine:
#
#                  AES-128/-192/-256+SHA256   this(**)      gain
# Sandy Bridge     5.05/6.05/7.05+11.6        13.0          +28%/36%/43%
# Ivy Bridge       5.05/6.05/7.05+10.3        11.6          +32%/41%/50%
# Haswell          4.43/5.29/6.19+7.80        8.79          +39%/49%/59%
# Skylake          2.62/3.14/3.62+7.70        8.10          +27%/34%/40%
# Bulldozer        5.77/6.89/8.00+13.7        13.7          +42%/50%/58%
# Ryzen(***)       2.71/-/3.71+2.05           2.74/-/3.73   +74%/-/54%
# Goldmont(***)    3.82/-/5.35+4.16           4.73/-/5.94   +69%/-/60%
#
# (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#       from the loop because the estimated gain was not high enough
#       to justify the effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***) these are SHAEXT results;
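#
# As a sanity check, the gain column follows from the other two: e.g.
# for AES-128 on Sandy Bridge, standalone processing costs
# 5.05+11.6 = 16.65 cycles per byte sequentially versus 13.0 stitched,
# and 16.65/13.0 is ~1.28, i.e. the quoted +28%.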

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1

$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
$framesz=16*$SZ+8*8;
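#
# With $SZ==4 the frame is laid out as (offsets from the aligned %rsp):
# bytes 0-63 hold the 16-word message schedule X[], bytes 64-119 the
# seven spilled pointer/length arguments in the order listed above,
# and bytes 120-127 the caller's %rsp, i.e. $framesz = 16*4+8*8 = 128
# bytes (plus ten XMM save slots on WIN64).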

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@abi-omnipotent
.align	16
$func:
.cfi_startproc
___
						if ($avx) {
$code.=<<___;
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	\$1,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10
___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext
___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	${func}_avx2
___
$code.=<<___;
	and	\$`1<<28`,%r10d			# check for AVX
	jnz	${func}_avx
	ud2
___
						}
$code.=<<___;
	xor	%eax,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	ud2
.Lprobe:
	ret
.cfi_endproc
.size	$func,.-$func

.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
);
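#
# The three vaesenclast's above exist because the AES key length is
# not known when this sequence is scheduled: a candidate final round
# is computed at each point where a 10-, 12- or 14-round schedule
# could end, and $mask10/$mask12/$mask14 (loaded according to the
# actual round count) let the vpand/vpor steps merge exactly one
# candidate into the next CBC vector $iv.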

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
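#
# AUTOLOAD turns any otherwise-undefined &opcode(...) call into a line
# of AT&T assembly: for example &ror($a0,14) appends "\tror\t\$14,%r13d\n"
# to $code -- the numeric argument gains a '$' prefix and the operand
# list is reversed into source,destination order.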

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&xor	($a0,$e)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
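#
# One pass through the list above is a textbook SHA-256 round,
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# followed by the usual a..h rotation, with Ch(e,f,g) computed as
# ((f^g)&e)^g and Maj(a,b,c) as ((b^c)&(a^b))^b so that the b^c term
# can be reused as the next round's a^b (the "magic" xor at loop
# setup).  One fragment of the AES-CBC schedule is spliced in per
# round via @aesni_cbc_block.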

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type	${func}_xop,\@function,6
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
	jmp	.Lloop_xop
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_xop

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	${func}_avx,\@function,6
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
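#
# The sequence above computes the SHA-256 message schedule
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# for four consecutive i at a time, with sigma0(x) = ROTR^7(x) ^
# ROTR^18(x) ^ SHR^3(x) and sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^
# SHR^10(x), i.e. the @sigma0=(7,18,3) and @sigma1=(17,19,10)
# constants defined at the top; the rotates are synthesized from
# shift/shift/xor because AVX1 has no vector rotate instruction.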

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)
	jb	.Lloop_avx

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

						if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}
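#
# This variant leans on BMI2's rorx, a three-operand rotate that
# neither destroys its source nor touches the flags, so the Sigma
# rotations need no preparatory mov's.  The price is that h+=Sigma0(a)
# cannot be folded into the same round: it is carried over in $a1 and
# added one round late ("from the past"), which is why $a1 must be
# zero on entry and a final $a+=$a1 is required after the loop.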

$code.=<<___;
.type	${func}_avx2,\@function,6
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroall

	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	lea	-9(%r14),%r14

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13		# inp++, size optimization
	mov	$SZ*0(%r15),$A
	lea	(%rsi,%r13),%r12	# borrow $a0
	mov	$SZ*1(%r15),$B
	cmp	$len,%r13		# $_end
	mov	$SZ*2(%r15),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rsi as frame pointer
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rsi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq		("%r13",$offload);	# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand		($temp,$temp,$mask14);
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea		("%r13","16(%r13)");	# inp++

	&lea		($Tbl,16*2*$SZ."($Tbl)");
	&cmpb		(($SZ-1)."($Tbl)",0);
	&jne		(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	vpextrq	\$1,$offload,%r12	# $_out, borrow $a4
	vmovq	$offload,%r13		# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r12,%r13)		# write output
	lea	16(%r13),%r13

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	cmp	`$PUSH8+2*8`($Tbl),%r13	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	mov	$F,$a4
	xor	$C,$a3			# magic
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
$code.=<<___;
	vmovq	$offload,%r13		# borrow $a0
	vpextrq	\$1,$offload,%r15	# borrow $a2
	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)		# write output
	lea	16(%r13),%r13		# inp++
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	lea	(%rsi,%r13),%r12
	add	$SZ*7(%r15),$H

	cmp	$_end,%r13

	mov	$A,$SZ*0(%r15)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+7*8`,deref,+8

.Ldone_avx2:
	mov	16*$SZ+4*8($Tbl),$ivp
	mov	16*$SZ+7*8($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`($Tbl),%xmm6
	movaps	`$framesz+16*1`($Tbl),%xmm7
	movaps	`$framesz+16*2`($Tbl),%xmm8
	movaps	`$framesz+16*3`($Tbl),%xmm9
	movaps	`$framesz+16*4`($Tbl),%xmm10
	movaps	`$framesz+16*5`($Tbl),%xmm11
	movaps	`$framesz+16*6`($Tbl),%xmm12
	movaps	`$framesz+16*7`($Tbl),%xmm13
	movaps	`$framesz+16*8`($Tbl),%xmm14
	movaps	`$framesz+16*9`($Tbl),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
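#
# Each invocation of the closure above emits exactly one AES round:
# $r counts rounds globally, $n=$r/10 is the 64-byte block being
# encrypted and $k=$r%10 the round within that block.  At $k==0 the
# next plaintext block is loaded and folded into the CBC chain; at
# $k==9 the 'cmp \$11,$rounds' ladder runs the two or four extra
# rounds needed by 192-/256-bit keys before aesenclast.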

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	mov		240($key),$rounds
	sub		$in0,$out
	movups		($key),$rndkey0		# $key[0]
	movups		($ivp),$iv		# load IV
	movups		16($key),$rndkey[0]	# forward reference
	lea		112($key),$key		# size optimization

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	pshufb		$TMP,@MSG[0]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	pshufb		$TMP,@MSG[1]
	movdqa		$CDGH,$CDGH_SAVE	# offload
	movdqa		$ABEF,$ABEF_SAVE	# offload
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	lea		0x40($inp),$inp
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	paddd		$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	paddd		$TMP,@MSG[1]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc()	if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor		$CDGH,$rndkey0		# black magic
___
	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	#xorps		$CDGH,$rndkey0		# black magic
	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF

	dec		$len
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$ABEF,($ctx)
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
  }
}
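#
# For example, "sha256rnds2 %xmm0,%xmm2" is rewritten by sha256op38()
# as ".byte 0x0f,0x38,0xcb,0xd0": the 0f 38 cb opcode followed by
# ModR/M 0xd0 (mod=11, reg=xmm2, r/m=xmm0), with a REX prefix
# prepended by rex() whenever either register index is 8 or above.
# This keeps the module buildable with assemblers that predate the
# SHA extensions.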

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
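#
# The first substitution evaluates every backticked expression in
# $code as Perl, so that e.g. the stack slot "`16*$SZ+7*8`(%rsp)"
# is emitted as "120(%rsp)" when $SZ==4; the second routes every
# sha256* mnemonic through sha256op38() above.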
print $code;
close STDOUT or die "error closing STDOUT: $!";