]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
b7838586 AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # Multi-buffer AES-NI procedures process several independent buffers | |
18 | # in parallel by interleaving independent instructions. | |
19 | # | |
20 | # Cycles per byte for interleave factor 4: | |
21 | # | |
22 | # asymptotic measured | |
23 | # --------------------------- | |
24 | # Westmere 5.00/4=1.25 5.13/4=1.28 | |
3847d15d AP |
25 | # Atom 15.0/4=3.75 ?15.7/4=3.93 |
26 | # Sandy Bridge 5.06/4=1.27 5.18/4=1.29 | |
b7838586 AP |
27 | # Ivy Bridge 5.06/4=1.27 5.14/4=1.29 |
28 | # Haswell 4.44/4=1.11 4.44/4=1.11 | |
29 | # Bulldozer 5.75/4=1.44 5.76/4=1.44 | |
30 | # | |
31 | # Cycles per byte for interleave factor 8 (not implemented for | |
32 | # pre-AVX processors, where higher interleave factor incidentally | |
33 | # doesn't result in improvement): | |
34 | # | |
35 | # asymptotic measured | |
36 | # --------------------------- | |
3847d15d AP |
37 | # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) |
38 | # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) | |
b7838586 AP |
39 | # Haswell 5.00/8=0.63 5.00/8=0.63 |
40 | # Bulldozer 5.75/8=0.72 5.77/8=0.72 | |
41 | # | |
42 | # (*) Sandy/Ivy Bridge are known to handle high interleave factors | |
43 | # suboptimally; | |
44 | ||
# Command line is "[flavour] output".  When the first argument contains a
# dot it is taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) {
	($output, $flavour) = ($flavour, undef);
}

# Win64 calling convention is selected by a nasm/masm/mingw64 flavour or
# by an output file ending in ".asm".
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
$xlate = "${dir}x86_64-xlate.pl";
if (! -f $xlate) {
	$xlate = "${dir}../../perlasm/x86_64-xlate.pl";
	die "can't locate x86_64-xlate.pl" unless -f $xlate;
}
55 | ||
# Probe the toolchain to decide whether AVX code may be emitted.
# Every probe leaves $avx at 0, 1 or 2 (sum of two version comparisons);
# the code visible in this file only tests $avx for truth.
# NOTE(review): presumably 1 = AVX and 2 = AVX2 capable assembler, as in
# sibling perlasm modules -- confirm before relying on the distinction.
$avx=0;

# GNU assembler, queried through the compiler driver.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, only consulted for Win64 builds that asked for it.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64, only consulted for Win64 builds that asked for it.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang / LLVM integrated assembler.  The major version may have more
# than one digit (10.0, 15.0, ...) and the banner may carry a vendor
# prefix ("Apple clang version", "Ubuntu clang version"), so neither a
# single-digit major nor a start-of-string anchor can be required;
# otherwise modern clang would silently lose the AVX code paths.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
76 | ||
# Pipe everything printed to STDOUT through the perlasm translator, which
# writes the final assembly to $output.  Abort if the pipe cannot be
# started: carrying on would silently generate nothing.
open(OUT, '|-', "\"$^X\" \"$xlate\" $flavour \"$output\"")
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
79 | ||
# Prototype of the generated entry point:
#
# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#
# Argument registers, in declaration order.
($inp,$key,$num) = ("%rdi","%rsi","%edx");

# Per-lane input and output pointers for the 4x code path.
@inptr  = qw(%r8 %r9 %r10 %r11);
@outptr = qw(%r12 %r13 %r14 %r15);

# SSE registers: two alternating round keys, four cipher states, four
# input blocks, then the lane counters, a compare mask and the register
# that holds the 0-round key.
$rndkey0 = "%xmm0";
$rndkey1 = "%xmm1";
@out = qw(%xmm2 %xmm3 %xmm4 %xmm5);
@inp = qw(%xmm6 %xmm7 %xmm8 %xmm9);
$counters = "%xmm10";
$mask     = "%xmm11";
$zero     = "%xmm12";

# General-purpose scratch registers.
$rounds = "%eax";
$one    = "%ecx";
$sink   = "%rbp";
$offset = "%rbx";
98 | ||
99 | $code.=<<___; | |
100 | .text | |
101 | ||
102 | .extern OPENSSL_ia32cap_P | |
103 | ||
104 | .globl aesni_multi_cbc_encrypt | |
105 | .type aesni_multi_cbc_encrypt,\@function,3 | |
106 | .align 32 | |
107 | aesni_multi_cbc_encrypt: | |
108 | ___ | |
109 | $code.=<<___ if ($avx); | |
110 | cmp \$2,$num | |
111 | jb .Lenc_non_avx | |
112 | mov OPENSSL_ia32cap_P+4(%rip),%ecx | |
113 | test \$`1<<28`,%ecx # AVX bit | |
114 | jnz _avx_cbc_enc_shortcut | |
115 | jmp .Lenc_non_avx | |
116 | .align 16 | |
117 | .Lenc_non_avx: | |
118 | ___ | |
119 | $code.=<<___; | |
120 | mov %rsp,%rax | |
121 | push %rbx | |
122 | push %rbp | |
123 | push %r12 | |
124 | push %r13 | |
125 | push %r14 | |
126 | push %r15 | |
127 | ___ | |
128 | $code.=<<___ if ($win64); | |
e2eabed1 | 129 | lea -0xa8(%rsp),%rsp |
b7838586 AP |
130 | movaps %xmm6,(%rsp) |
131 | movaps %xmm7,0x10(%rsp) | |
132 | movaps %xmm8,0x20(%rsp) | |
133 | movaps %xmm9,0x30(%rsp) | |
134 | movaps %xmm10,0x40(%rsp) | |
135 | movaps %xmm11,0x50(%rsp) | |
136 | movaps %xmm12,0x60(%rsp) | |
e2eabed1 AP |
137 | movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler |
138 | movaps %xmm14,-0x58(%rax) | |
139 | movaps %xmm15,-0x48(%rax) | |
b7838586 AP |
140 | ___ |
141 | $code.=<<___; | |
142 | # stack layout | |
143 | # | |
144 | # +0 output sink | |
145 | # +16 input sink [original %rsp and $num] | |
146 | # +32 counters | |
147 | ||
148 | sub \$48,%rsp | |
149 | and \$-64,%rsp | |
150 | mov %rax,16(%rsp) # original %rsp | |
151 | ||
152 | .Lenc4x_body: | |
153 | movdqu ($key),$zero # 0-round key | |
154 | lea 0x78($key),$key # size optimization | |
155 | lea 40*2($inp),$inp | |
156 | ||
157 | .Lenc4x_loop_grande: | |
158 | mov $num,24(%rsp) # original $num | |
159 | xor $num,$num | |
160 | ___ | |
# Per-lane setup for the 4x encrypt path, emitted once per lane (0..3).
# For lane $i it reads one 40-byte descriptor (input pointer at +0,
# output pointer at +8, block count at +16, IV at +24), keeps the
# largest block count seen in $num, stores the lane's count in the
# counter slot at 32+4*$i(%rsp) and points the input at %rsp when the
# count is not positive.  Offsets carry a -40*2 bias because $inp was
# advanced by 40*2 just before the loop.
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Prime the 4x CBC-encrypt pipeline and emit the first AES round of
# .Loop_enc4x.
#
# Before the loop: bail out to .Lenc4x_done when no lane has work
# ($num == 0), load the first two round keys and the round count, and
# fold the 0-round key and the first input block of each lane into the
# IVs held in @out.
#
# Inside the loop head: $offset advances by one block, $sink is set to
# 16(%rsp) minus $offset so that ($sink,$offset) addresses the stack
# sink, and the next input block of every lane is prefetched between
# the aesenc instructions.  All four lanes are prefetched; lane 3 was
# previously missed because lane 2 was named twice.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	pxor	@inp[2],@out[2]
	pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc	$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc	$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0x30-0x78($key),$rndkey1
___
# Rounds 2..5 of the 4x encrypt loop, emitted once per lane index so that
# each round also carries the end-of-data check for one lane.
# The round key register alternates: $rndkey0 for even $i, $rndkey1 for
# odd $i; after use it is reloaded with the key at 0x40+16*$i.
# The constant 1 in $one is compared with the lane's counter at
# 32+4*$i(%rsp): when 1 >= counter the input pointer is replaced by
# $sink, and when 1 > counter the output pointer is replaced as well.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	cmp	`32+4*$i`(%rsp),$one
	aesenc	$rndkey,@out[0]
	aesenc	$rndkey,@out[1]
	aesenc	$rndkey,@out[2]
	cmovge	$sink,@inptr[$i]		# cancel input
	cmovg	$sink,@outptr[$i]		# sink output
	aesenc	$rndkey,@out[3]
	movups	`0x40+16*$i-0x78`($key),$rndkey
___
}
227 | $code.=<<___; | |
228 | movdqa $counters,$mask | |
229 | aesenc $rndkey0,@out[0] | |
3847d15d AP |
230 | prefetcht0 15(@outptr[0],$offset) # prefetch output |
231 | prefetcht0 15(@outptr[1],$offset) | |
b7838586 | 232 | aesenc $rndkey0,@out[1] |
3847d15d AP |
233 | prefetcht0 15(@outptr[2],$offset) |
234 | prefetcht0 15(@outptr[3],$offset) | |
b7838586 AP |
235 | aesenc $rndkey0,@out[2] |
236 | aesenc $rndkey0,@out[3] | |
237 | movups 0x80-0x78($key),$rndkey0 | |
238 | pxor $zero,$zero | |
239 | ||
240 | aesenc $rndkey1,@out[0] | |
241 | pcmpgtd $zero,$mask | |
242 | movdqu -0x78($key),$zero # reload 0-round key | |
243 | aesenc $rndkey1,@out[1] | |
244 | paddd $mask,$counters # decrement counters | |
245 | movdqa $counters,32(%rsp) # update counters | |
246 | aesenc $rndkey1,@out[2] | |
247 | aesenc $rndkey1,@out[3] | |
248 | movups 0x90-0x78($key),$rndkey1 | |
249 | ||
250 | cmp \$11,$rounds | |
251 | ||
252 | aesenc $rndkey0,@out[0] | |
253 | aesenc $rndkey0,@out[1] | |
254 | aesenc $rndkey0,@out[2] | |
255 | aesenc $rndkey0,@out[3] | |
256 | movups 0xa0-0x78($key),$rndkey0 | |
257 | ||
258 | jb .Lenc4x_tail | |
259 | ||
260 | aesenc $rndkey1,@out[0] | |
261 | aesenc $rndkey1,@out[1] | |
262 | aesenc $rndkey1,@out[2] | |
263 | aesenc $rndkey1,@out[3] | |
264 | movups 0xb0-0x78($key),$rndkey1 | |
265 | ||
266 | aesenc $rndkey0,@out[0] | |
267 | aesenc $rndkey0,@out[1] | |
268 | aesenc $rndkey0,@out[2] | |
269 | aesenc $rndkey0,@out[3] | |
270 | movups 0xc0-0x78($key),$rndkey0 | |
271 | ||
272 | je .Lenc4x_tail | |
273 | ||
274 | aesenc $rndkey1,@out[0] | |
275 | aesenc $rndkey1,@out[1] | |
276 | aesenc $rndkey1,@out[2] | |
277 | aesenc $rndkey1,@out[3] | |
278 | movups 0xd0-0x78($key),$rndkey1 | |
279 | ||
280 | aesenc $rndkey0,@out[0] | |
281 | aesenc $rndkey0,@out[1] | |
282 | aesenc $rndkey0,@out[2] | |
283 | aesenc $rndkey0,@out[3] | |
284 | movups 0xe0-0x78($key),$rndkey0 | |
3847d15d | 285 | jmp .Lenc4x_tail |
b7838586 | 286 | |
3847d15d | 287 | .align 32 |
b7838586 AP |
288 | .Lenc4x_tail: |
289 | aesenc $rndkey1,@out[0] | |
290 | aesenc $rndkey1,@out[1] | |
291 | aesenc $rndkey1,@out[2] | |
b7838586 | 292 | aesenc $rndkey1,@out[3] |
3847d15d | 293 | movdqu (@inptr[0],$offset),@inp[0] |
b7838586 AP |
294 | movdqu 0x10-0x78($key),$rndkey1 |
295 | ||
296 | aesenclast $rndkey0,@out[0] | |
297 | movdqu (@inptr[1],$offset),@inp[1] | |
298 | pxor $zero,@inp[0] | |
299 | aesenclast $rndkey0,@out[1] | |
300 | movdqu (@inptr[2],$offset),@inp[2] | |
301 | pxor $zero,@inp[1] | |
302 | aesenclast $rndkey0,@out[2] | |
303 | movdqu (@inptr[3],$offset),@inp[3] | |
304 | pxor $zero,@inp[2] | |
305 | aesenclast $rndkey0,@out[3] | |
306 | movdqu 0x20-0x78($key),$rndkey0 | |
307 | pxor $zero,@inp[3] | |
308 | ||
309 | movups @out[0],-16(@outptr[0],$offset) | |
310 | pxor @inp[0],@out[0] | |
311 | movups @out[1],-16(@outptr[1],$offset) | |
312 | pxor @inp[1],@out[1] | |
313 | movups @out[2],-16(@outptr[2],$offset) | |
314 | pxor @inp[2],@out[2] | |
315 | movups @out[3],-16(@outptr[3],$offset) | |
316 | pxor @inp[3],@out[3] | |
317 | ||
318 | dec $num | |
319 | jnz .Loop_enc4x | |
320 | ||
321 | mov 16(%rsp),%rax # original %rsp | |
322 | mov 24(%rsp),$num | |
323 | ||
324 | #pxor @inp[0],@out[0] | |
325 | #pxor @inp[1],@out[1] | |
326 | #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! | |
327 | #pxor @inp[2],@out[2] | |
328 | #movdqu @out[1],`40*1+24-40*2`($inp) | |
329 | #pxor @inp[3],@out[3] | |
330 | #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller | |
331 | #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... | |
332 | ||
333 | lea `40*4`($inp),$inp | |
334 | dec $num | |
335 | jnz .Lenc4x_loop_grande | |
336 | ||
337 | .Lenc4x_done: | |
338 | ___ | |
339 | $code.=<<___ if ($win64); | |
e2eabed1 AP |
340 | movaps -0xd8(%rax),%xmm6 |
341 | movaps -0xc8(%rax),%xmm7 | |
342 | movaps -0xb8(%rax),%xmm8 | |
343 | movaps -0xa8(%rax),%xmm9 | |
344 | movaps -0x98(%rax),%xmm10 | |
345 | movaps -0x88(%rax),%xmm11 | |
346 | movaps -0x78(%rax),%xmm12 | |
347 | #movaps -0x68(%rax),%xmm13 | |
348 | #movaps -0x58(%rax),%xmm14 | |
349 | #movaps -0x48(%rax),%xmm15 | |
b7838586 AP |
350 | ___ |
351 | $code.=<<___; | |
352 | mov -48(%rax),%r15 | |
353 | mov -40(%rax),%r14 | |
354 | mov -32(%rax),%r13 | |
355 | mov -24(%rax),%r12 | |
356 | mov -16(%rax),%rbp | |
357 | mov -8(%rax),%rbx | |
358 | lea (%rax),%rsp | |
e2eabed1 | 359 | .Lenc4x_epilogue: |
b7838586 AP |
360 | ret |
361 | .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt | |
362 | ||
363 | .globl aesni_multi_cbc_decrypt | |
364 | .type aesni_multi_cbc_decrypt,\@function,3 | |
365 | .align 32 | |
366 | aesni_multi_cbc_decrypt: | |
367 | ___ | |
368 | $code.=<<___ if ($avx); | |
369 | cmp \$2,$num | |
370 | jb .Ldec_non_avx | |
371 | mov OPENSSL_ia32cap_P+4(%rip),%ecx | |
372 | test \$`1<<28`,%ecx # AVX bit | |
373 | jnz _avx_cbc_dec_shortcut | |
374 | jmp .Ldec_non_avx | |
375 | .align 16 | |
376 | .Ldec_non_avx: | |
377 | ___ | |
378 | $code.=<<___; | |
379 | mov %rsp,%rax | |
380 | push %rbx | |
381 | push %rbp | |
382 | push %r12 | |
383 | push %r13 | |
384 | push %r14 | |
385 | push %r15 | |
386 | ___ | |
387 | $code.=<<___ if ($win64); | |
e2eabed1 | 388 | lea -0xa8(%rsp),%rsp |
b7838586 AP |
389 | movaps %xmm6,(%rsp) |
390 | movaps %xmm7,0x10(%rsp) | |
391 | movaps %xmm8,0x20(%rsp) | |
392 | movaps %xmm9,0x30(%rsp) | |
393 | movaps %xmm10,0x40(%rsp) | |
394 | movaps %xmm11,0x50(%rsp) | |
395 | movaps %xmm12,0x60(%rsp) | |
e2eabed1 AP |
396 | movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler |
397 | movaps %xmm14,-0x58(%rax) | |
398 | movaps %xmm15,-0x48(%rax) | |
b7838586 AP |
399 | ___ |
400 | $code.=<<___; | |
401 | # stack layout | |
402 | # | |
403 | # +0 output sink | |
404 | # +16 input sink [original %rsp and $num] | |
405 | # +32 counters | |
406 | ||
407 | sub \$48,%rsp | |
408 | and \$-64,%rsp | |
409 | mov %rax,16(%rsp) # original %rsp | |
410 | ||
411 | .Ldec4x_body: | |
412 | movdqu ($key),$zero # 0-round key | |
413 | lea 0x78($key),$key # size optimization | |
414 | lea 40*2($inp),$inp | |
415 | ||
416 | .Ldec4x_loop_grande: | |
417 | mov $num,24(%rsp) # original $num | |
418 | xor $num,$num | |
419 | ___ | |
420 | for($i=0;$i<4;$i++) { | |
421 | $code.=<<___; | |
422 | mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks | |
423 | mov `40*$i+0-40*2`($inp),@inptr[$i] | |
424 | cmp $num,$one | |
425 | mov `40*$i+8-40*2`($inp),@outptr[$i] | |
426 | cmovg $one,$num # find maximum | |
427 | test $one,$one | |
428 | movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV | |
429 | mov $one,`32+4*$i`(%rsp) # initialize counters | |
430 | cmovle %rsp,@inptr[$i] # cancel input | |
431 | ___ | |
432 | } | |
433 | $code.=<<___; | |
434 | test $num,$num | |
435 | jz .Ldec4x_done | |
436 | ||
437 | movups 0x10-0x78($key),$rndkey1 | |
438 | movups 0x20-0x78($key),$rndkey0 | |
439 | mov 0xf0-0x78($key),$rounds | |
440 | movdqu (@inptr[0]),@out[0] # load inputs | |
441 | movdqu (@inptr[1]),@out[1] | |
442 | pxor $zero,@out[0] | |
443 | movdqu (@inptr[2]),@out[2] | |
444 | pxor $zero,@out[1] | |
445 | movdqu (@inptr[3]),@out[3] | |
446 | pxor $zero,@out[2] | |
447 | pxor $zero,@out[3] | |
448 | movdqa 32(%rsp),$counters # load counters | |
449 | xor $offset,$offset | |
450 | jmp .Loop_dec4x | |
451 | ||
452 | .align 32 | |
453 | .Loop_dec4x: | |
454 | add \$16,$offset | |
455 | lea 16(%rsp),$sink # sink pointer | |
456 | mov \$1,$one # constant of 1 | |
457 | sub $offset,$sink | |
458 | ||
459 | aesdec $rndkey1,@out[0] | |
3847d15d AP |
460 | prefetcht0 31(@inptr[0],$offset) # prefetch input |
461 | prefetcht0 31(@inptr[1],$offset) | |
b7838586 | 462 | aesdec $rndkey1,@out[1] |
3847d15d AP |
463 | prefetcht0 31(@inptr[2],$offset) |
464 | prefetcht0 31(@inptr[3],$offset) | |
b7838586 AP |
465 | aesdec $rndkey1,@out[2] |
466 | aesdec $rndkey1,@out[3] | |
467 | movups 0x30-0x78($key),$rndkey1 | |
468 | ___ | |
469 | for($i=0;$i<4;$i++) { | |
470 | my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; | |
471 | $code.=<<___; | |
472 | cmp `32+4*$i`(%rsp),$one | |
473 | aesdec $rndkey,@out[0] | |
474 | aesdec $rndkey,@out[1] | |
b7838586 | 475 | aesdec $rndkey,@out[2] |
e2eabed1 | 476 | cmovge $sink,@inptr[$i] # cancel input |
b7838586 AP |
477 | cmovg $sink,@outptr[$i] # sink output |
478 | aesdec $rndkey,@out[3] | |
479 | movups `0x40+16*$i-0x78`($key),$rndkey | |
480 | ___ | |
481 | } | |
482 | $code.=<<___; | |
483 | movdqa $counters,$mask | |
484 | aesdec $rndkey0,@out[0] | |
3847d15d AP |
485 | prefetcht0 15(@outptr[0],$offset) # prefetch output |
486 | prefetcht0 15(@outptr[1],$offset) | |
b7838586 | 487 | aesdec $rndkey0,@out[1] |
3847d15d AP |
488 | prefetcht0 15(@outptr[2],$offset) |
489 | prefetcht0 15(@outptr[3],$offset) | |
b7838586 AP |
490 | aesdec $rndkey0,@out[2] |
491 | aesdec $rndkey0,@out[3] | |
492 | movups 0x80-0x78($key),$rndkey0 | |
493 | pxor $zero,$zero | |
494 | ||
495 | aesdec $rndkey1,@out[0] | |
496 | pcmpgtd $zero,$mask | |
497 | movdqu -0x78($key),$zero # reload 0-round key | |
498 | aesdec $rndkey1,@out[1] | |
499 | paddd $mask,$counters # decrement counters | |
500 | movdqa $counters,32(%rsp) # update counters | |
501 | aesdec $rndkey1,@out[2] | |
502 | aesdec $rndkey1,@out[3] | |
503 | movups 0x90-0x78($key),$rndkey1 | |
504 | ||
505 | cmp \$11,$rounds | |
506 | ||
507 | aesdec $rndkey0,@out[0] | |
508 | aesdec $rndkey0,@out[1] | |
509 | aesdec $rndkey0,@out[2] | |
510 | aesdec $rndkey0,@out[3] | |
511 | movups 0xa0-0x78($key),$rndkey0 | |
512 | ||
513 | jb .Ldec4x_tail | |
514 | ||
515 | aesdec $rndkey1,@out[0] | |
516 | aesdec $rndkey1,@out[1] | |
517 | aesdec $rndkey1,@out[2] | |
518 | aesdec $rndkey1,@out[3] | |
519 | movups 0xb0-0x78($key),$rndkey1 | |
520 | ||
521 | aesdec $rndkey0,@out[0] | |
522 | aesdec $rndkey0,@out[1] | |
523 | aesdec $rndkey0,@out[2] | |
524 | aesdec $rndkey0,@out[3] | |
525 | movups 0xc0-0x78($key),$rndkey0 | |
526 | ||
527 | je .Ldec4x_tail | |
528 | ||
529 | aesdec $rndkey1,@out[0] | |
530 | aesdec $rndkey1,@out[1] | |
531 | aesdec $rndkey1,@out[2] | |
532 | aesdec $rndkey1,@out[3] | |
533 | movups 0xd0-0x78($key),$rndkey1 | |
534 | ||
535 | aesdec $rndkey0,@out[0] | |
536 | aesdec $rndkey0,@out[1] | |
537 | aesdec $rndkey0,@out[2] | |
538 | aesdec $rndkey0,@out[3] | |
539 | movups 0xe0-0x78($key),$rndkey0 | |
3847d15d | 540 | jmp .Ldec4x_tail |
b7838586 | 541 | |
3847d15d | 542 | .align 32 |
b7838586 AP |
543 | .Ldec4x_tail: |
544 | aesdec $rndkey1,@out[0] | |
545 | aesdec $rndkey1,@out[1] | |
546 | aesdec $rndkey1,@out[2] | |
547 | pxor $rndkey0,@inp[0] | |
548 | pxor $rndkey0,@inp[1] | |
549 | aesdec $rndkey1,@out[3] | |
550 | movdqu 0x10-0x78($key),$rndkey1 | |
551 | pxor $rndkey0,@inp[2] | |
552 | pxor $rndkey0,@inp[3] | |
553 | movdqu 0x20-0x78($key),$rndkey0 | |
554 | ||
555 | aesdeclast @inp[0],@out[0] | |
b7838586 | 556 | aesdeclast @inp[1],@out[1] |
3847d15d | 557 | movdqu -16(@inptr[0],$offset),@inp[0] # load next IV |
b7838586 AP |
558 | movdqu -16(@inptr[1],$offset),@inp[1] |
559 | aesdeclast @inp[2],@out[2] | |
b7838586 | 560 | aesdeclast @inp[3],@out[3] |
3847d15d | 561 | movdqu -16(@inptr[2],$offset),@inp[2] |
b7838586 AP |
562 | movdqu -16(@inptr[3],$offset),@inp[3] |
563 | ||
564 | movups @out[0],-16(@outptr[0],$offset) | |
565 | movdqu (@inptr[0],$offset),@out[0] | |
566 | movups @out[1],-16(@outptr[1],$offset) | |
567 | movdqu (@inptr[1],$offset),@out[1] | |
568 | pxor $zero,@out[0] | |
569 | movups @out[2],-16(@outptr[2],$offset) | |
570 | movdqu (@inptr[2],$offset),@out[2] | |
571 | pxor $zero,@out[1] | |
572 | movups @out[3],-16(@outptr[3],$offset) | |
573 | movdqu (@inptr[3],$offset),@out[3] | |
574 | pxor $zero,@out[2] | |
575 | pxor $zero,@out[3] | |
576 | ||
577 | dec $num | |
578 | jnz .Loop_dec4x | |
579 | ||
580 | mov 16(%rsp),%rax # original %rsp | |
581 | mov 24(%rsp),$num | |
582 | ||
583 | lea `40*4`($inp),$inp | |
584 | dec $num | |
585 | jnz .Ldec4x_loop_grande | |
586 | ||
587 | .Ldec4x_done: | |
588 | ___ | |
589 | $code.=<<___ if ($win64); | |
e2eabed1 AP |
590 | movaps -0xd8(%rax),%xmm6 |
591 | movaps -0xc8(%rax),%xmm7 | |
592 | movaps -0xb8(%rax),%xmm8 | |
593 | movaps -0xa8(%rax),%xmm9 | |
594 | movaps -0x98(%rax),%xmm10 | |
595 | movaps -0x88(%rax),%xmm11 | |
596 | movaps -0x78(%rax),%xmm12 | |
597 | #movaps -0x68(%rax),%xmm13 | |
598 | #movaps -0x58(%rax),%xmm14 | |
599 | #movaps -0x48(%rax),%xmm15 | |
b7838586 AP |
600 | ___ |
601 | $code.=<<___; | |
602 | mov -48(%rax),%r15 | |
603 | mov -40(%rax),%r14 | |
604 | mov -32(%rax),%r13 | |
605 | mov -24(%rax),%r12 | |
606 | mov -16(%rax),%rbp | |
607 | mov -8(%rax),%rbx | |
608 | lea (%rax),%rsp | |
e2eabed1 | 609 | .Ldec4x_epilogue: |
b7838586 AP |
610 | ret |
611 | .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt | |
612 | ___ | |
613 | ||
614 | if ($avx) {{{ | |
615 | my @ptr=map("%r$_",(8..15)); | |
616 | my $offload=$sink; | |
617 | ||
618 | my @out=map("%xmm$_",(2..9)); | |
619 | my @inp=map("%xmm$_",(10..13)); | |
620 | my ($counters,$zero)=("%xmm14","%xmm15"); | |
621 | ||
622 | $code.=<<___; | |
623 | .type aesni_multi_cbc_encrypt_avx,\@function,3 | |
624 | .align 32 | |
625 | aesni_multi_cbc_encrypt_avx: | |
626 | _avx_cbc_enc_shortcut: | |
627 | mov %rsp,%rax | |
628 | push %rbx | |
629 | push %rbp | |
630 | push %r12 | |
631 | push %r13 | |
632 | push %r14 | |
633 | push %r15 | |
634 | ___ | |
635 | $code.=<<___ if ($win64); | |
636 | lea -0xa8(%rsp),%rsp | |
637 | movaps %xmm6,(%rsp) | |
638 | movaps %xmm7,0x10(%rsp) | |
639 | movaps %xmm8,0x20(%rsp) | |
640 | movaps %xmm9,0x30(%rsp) | |
641 | movaps %xmm10,0x40(%rsp) | |
642 | movaps %xmm11,0x50(%rsp) | |
643 | movaps %xmm12,-0x78(%rax) | |
644 | movaps %xmm13,-0x68(%rax) | |
645 | movaps %xmm14,-0x58(%rax) | |
646 | movaps %xmm15,-0x48(%rax) | |
647 | ___ | |
648 | $code.=<<___; | |
649 | # stack layout | |
650 | # | |
651 | # +0 output sink | |
652 | # +16 input sink [original %rsp and $num] | |
653 | # +32 counters | |
654 | # +64 distances between inputs and outputs | |
655 | # +128 off-load area for @inp[0..3] | |
656 | ||
657 | sub \$192,%rsp | |
658 | and \$-128,%rsp | |
659 | mov %rax,16(%rsp) # original %rsp | |
660 | ||
661 | .Lenc8x_body: | |
662 | vzeroupper | |
663 | vmovdqu ($key),$zero # 0-round key | |
664 | lea 0x78($key),$key # size optimization | |
665 | lea 40*4($inp),$inp | |
666 | shr \$1,$num | |
667 | ||
668 | .Lenc8x_loop_grande: | |
669 | #mov $num,24(%rsp) # original $num | |
670 | xor $num,$num | |
671 | ___ | |
# Per-lane setup for the 8x (AVX) encrypt path, emitted once per lane
# (0..7).  Only the input pointer of each lane is kept in a register
# (@ptr); the output pointer is loaded into a scratch register, turned
# into the distance "output - input" and stored at 64+8*$i(%rsp).
# Lane 0 uses $offset as the scratch register, every other lane uses
# $offload.  Descriptor offsets carry a -40*4 bias because $inp was
# advanced by 40*4 before the loop.
for($i=0;$i<8;$i++) {
    my $temp = $i ? $offload : $offset;
$code.=<<___;
	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
	cmp	$num,$one
	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
	cmovg	$one,$num			# find maximum
	test	$one,$one
	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
	sub	@ptr[$i],$temp			# distance between input and output
	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
___
}
688 | $code.=<<___; | |
689 | test $num,$num | |
690 | jz .Lenc8x_done | |
691 | ||
692 | vmovups 0x10-0x78($key),$rndkey1 | |
693 | vmovups 0x20-0x78($key),$rndkey0 | |
694 | mov 0xf0-0x78($key),$rounds | |
695 | ||
696 | vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round | |
697 | lea 128(%rsp),$offload # offload area | |
698 | vpxor (@ptr[1]),$zero,@inp[1] | |
699 | vpxor (@ptr[2]),$zero,@inp[2] | |
700 | vpxor (@ptr[3]),$zero,@inp[3] | |
701 | vpxor @inp[0],@out[0],@out[0] | |
702 | vpxor (@ptr[4]),$zero,@inp[0] | |
703 | vpxor @inp[1],@out[1],@out[1] | |
704 | vpxor (@ptr[5]),$zero,@inp[1] | |
705 | vpxor @inp[2],@out[2],@out[2] | |
706 | vpxor (@ptr[6]),$zero,@inp[2] | |
707 | vpxor @inp[3],@out[3],@out[3] | |
708 | vpxor (@ptr[7]),$zero,@inp[3] | |
709 | vpxor @inp[0],@out[4],@out[4] | |
710 | mov \$1,$one # constant of 1 | |
711 | vpxor @inp[1],@out[5],@out[5] | |
712 | vpxor @inp[2],@out[6],@out[6] | |
713 | vpxor @inp[3],@out[7],@out[7] | |
714 | jmp .Loop_enc8x | |
715 | ||
716 | .align 32 | |
717 | .Loop_enc8x: | |
718 | ___ | |
# Rounds 1..8 of the 8x (AVX) encrypt loop, emitted once per lane index
# so that every round also services one lane:
#   - compare the constant 1 in $one with the lane counter at
#     32+4*$i(%rsp);
#   - for lanes 1..7 fetch the lane's input/output distance from
#     64+8*$i(%rsp) (lane 0's distance is already in $offset);
#   - prefetch the lane's input, and from $i == 2 onwards the output of
#     lane $i-2, whose pointer has been switched to output by then;
#   - redirect exhausted lanes to %rsp, store the adjusted distance back,
#     load the next input block xor-ed with the 0-round key, and switch
#     @ptr[$i] to the output position;
#   - the first four input blocks are parked in the off-load area since
#     @inp has only four registers and is reused for lanes 4..7.
# The round key register alternates, starting with $rndkey1 for $i == 0.
#
# The loop variable is deliberately not lexical: the code emitted right
# after this loop uses @ptr[$i-2] and @ptr[$i-1] and relies on $i == 8.
#
# Stack offsets are folded with backticks like everywhere else in this
# file, and the last statement is properly terminated.
for($i=0;$i<8;$i++) {
    my $rndkey=($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
	vaesenc		$rndkey,@out[0],@out[0]
	cmp		`32+4*$i`(%rsp),$one
___
$code.=<<___ if ($i);
	mov		`64+8*$i`(%rsp),$offset
___
$code.=<<___;
	vaesenc		$rndkey,@out[1],@out[1]
	prefetcht0	31(@ptr[$i])			# prefetch input
	vaesenc		$rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
	prefetcht0	15(@ptr[$i-2])			# prefetch output
___
$code.=<<___;
	vaesenc		$rndkey,@out[3],@out[3]
	lea		(@ptr[$i],$offset),$offset
	cmovge		%rsp,@ptr[$i]			# cancel input
	vaesenc		$rndkey,@out[4],@out[4]
	cmovg		%rsp,$offset			# sink output
	vaesenc		$rndkey,@out[5],@out[5]
	sub		@ptr[$i],$offset
	vaesenc		$rndkey,@out[6],@out[6]
	vpxor		16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
	mov		$offset,`64+8*$i`(%rsp)
	vaesenc		$rndkey,@out[7],@out[7]
	vmovups		`16*(3+$i)-0x78`($key),$rndkey
	lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
___
$code.=<<___ if ($i<4);
	vmovdqu		@inp[$i%4],`16*$i`($offload)	# off-load
___
}
755 | $code.=<<___; | |
756 | vmovdqu 32(%rsp),$counters | |
3847d15d AP |
757 | prefetcht0 15(@ptr[$i-2]) # prefetch output |
758 | prefetcht0 15(@ptr[$i-1]) | |
b7838586 AP |
759 | cmp \$11,$rounds |
760 | jb .Lenc8x_tail | |
761 | ||
762 | vaesenc $rndkey1,@out[0],@out[0] | |
763 | vaesenc $rndkey1,@out[1],@out[1] | |
764 | vaesenc $rndkey1,@out[2],@out[2] | |
765 | vaesenc $rndkey1,@out[3],@out[3] | |
766 | vaesenc $rndkey1,@out[4],@out[4] | |
767 | vaesenc $rndkey1,@out[5],@out[5] | |
768 | vaesenc $rndkey1,@out[6],@out[6] | |
769 | vaesenc $rndkey1,@out[7],@out[7] | |
770 | vmovups 0xb0-0x78($key),$rndkey1 | |
771 | ||
772 | vaesenc $rndkey0,@out[0],@out[0] | |
773 | vaesenc $rndkey0,@out[1],@out[1] | |
774 | vaesenc $rndkey0,@out[2],@out[2] | |
775 | vaesenc $rndkey0,@out[3],@out[3] | |
776 | vaesenc $rndkey0,@out[4],@out[4] | |
777 | vaesenc $rndkey0,@out[5],@out[5] | |
778 | vaesenc $rndkey0,@out[6],@out[6] | |
779 | vaesenc $rndkey0,@out[7],@out[7] | |
780 | vmovups 0xc0-0x78($key),$rndkey0 | |
781 | je .Lenc8x_tail | |
782 | ||
783 | vaesenc $rndkey1,@out[0],@out[0] | |
784 | vaesenc $rndkey1,@out[1],@out[1] | |
785 | vaesenc $rndkey1,@out[2],@out[2] | |
786 | vaesenc $rndkey1,@out[3],@out[3] | |
787 | vaesenc $rndkey1,@out[4],@out[4] | |
788 | vaesenc $rndkey1,@out[5],@out[5] | |
789 | vaesenc $rndkey1,@out[6],@out[6] | |
790 | vaesenc $rndkey1,@out[7],@out[7] | |
791 | vmovups 0xd0-0x78($key),$rndkey1 | |
792 | ||
793 | vaesenc $rndkey0,@out[0],@out[0] | |
794 | vaesenc $rndkey0,@out[1],@out[1] | |
795 | vaesenc $rndkey0,@out[2],@out[2] | |
796 | vaesenc $rndkey0,@out[3],@out[3] | |
797 | vaesenc $rndkey0,@out[4],@out[4] | |
798 | vaesenc $rndkey0,@out[5],@out[5] | |
799 | vaesenc $rndkey0,@out[6],@out[6] | |
800 | vaesenc $rndkey0,@out[7],@out[7] | |
801 | vmovups 0xe0-0x78($key),$rndkey0 | |
802 | ||
803 | .Lenc8x_tail: | |
804 | vaesenc $rndkey1,@out[0],@out[0] | |
805 | vpxor $zero,$zero,$zero | |
806 | vaesenc $rndkey1,@out[1],@out[1] | |
807 | vaesenc $rndkey1,@out[2],@out[2] | |
808 | vpcmpgtd $zero,$counters,$zero | |
809 | vaesenc $rndkey1,@out[3],@out[3] | |
810 | vaesenc $rndkey1,@out[4],@out[4] | |
811 | vpaddd $counters,$zero,$zero # decrement counters | |
812 | vmovdqu 48(%rsp),$counters | |
813 | vaesenc $rndkey1,@out[5],@out[5] | |
814 | mov 64(%rsp),$offset # pre-load 1st offset | |
815 | vaesenc $rndkey1,@out[6],@out[6] | |
816 | vaesenc $rndkey1,@out[7],@out[7] | |
817 | vmovups 0x10-0x78($key),$rndkey1 | |
818 | ||
819 | vaesenclast $rndkey0,@out[0],@out[0] | |
820 | vmovdqa $zero,32(%rsp) # update counters | |
821 | vpxor $zero,$zero,$zero | |
822 | vaesenclast $rndkey0,@out[1],@out[1] | |
823 | vaesenclast $rndkey0,@out[2],@out[2] | |
824 | vpcmpgtd $zero,$counters,$zero | |
825 | vaesenclast $rndkey0,@out[3],@out[3] | |
826 | vaesenclast $rndkey0,@out[4],@out[4] | |
827 | vpaddd $zero,$counters,$counters # decrement counters | |
828 | vmovdqu -0x78($key),$zero # 0-round | |
829 | vaesenclast $rndkey0,@out[5],@out[5] | |
830 | vaesenclast $rndkey0,@out[6],@out[6] | |
831 | vmovdqa $counters,48(%rsp) # update counters | |
832 | vaesenclast $rndkey0,@out[7],@out[7] | |
833 | vmovups 0x20-0x78($key),$rndkey0 | |
834 | ||
835 | vmovups @out[0],-16(@ptr[0]) # write output | |
836 | sub $offset,@ptr[0] # switch to input | |
837 | vpxor 0x00($offload),@out[0],@out[0] | |
838 | vmovups @out[1],-16(@ptr[1]) | |
839 | sub `64+1*8`(%rsp),@ptr[1] | |
840 | vpxor 0x10($offload),@out[1],@out[1] | |
841 | vmovups @out[2],-16(@ptr[2]) | |
842 | sub `64+2*8`(%rsp),@ptr[2] | |
843 | vpxor 0x20($offload),@out[2],@out[2] | |
844 | vmovups @out[3],-16(@ptr[3]) | |
845 | sub `64+3*8`(%rsp),@ptr[3] | |
846 | vpxor 0x30($offload),@out[3],@out[3] | |
847 | vmovups @out[4],-16(@ptr[4]) | |
848 | sub `64+4*8`(%rsp),@ptr[4] | |
849 | vpxor @inp[0],@out[4],@out[4] | |
850 | vmovups @out[5],-16(@ptr[5]) | |
851 | sub `64+5*8`(%rsp),@ptr[5] | |
852 | vpxor @inp[1],@out[5],@out[5] | |
853 | vmovups @out[6],-16(@ptr[6]) | |
854 | sub `64+6*8`(%rsp),@ptr[6] | |
855 | vpxor @inp[2],@out[6],@out[6] | |
856 | vmovups @out[7],-16(@ptr[7]) | |
857 | sub `64+7*8`(%rsp),@ptr[7] | |
858 | vpxor @inp[3],@out[7],@out[7] | |
859 | ||
860 | dec $num | |
861 | jnz .Loop_enc8x | |
862 | ||
863 | mov 16(%rsp),%rax # original %rsp | |
864 | #mov 24(%rsp),$num | |
865 | #lea `40*8`($inp),$inp | |
866 | #dec $num | |
867 | #jnz .Lenc8x_loop_grande | |
868 | ||
869 | .Lenc8x_done: | |
870 | vzeroupper | |
871 | ___ | |
872 | $code.=<<___ if ($win64); | |
873 | movaps -0xd8(%rax),%xmm6 | |
874 | movaps -0xc8(%rax),%xmm7 | |
875 | movaps -0xb8(%rax),%xmm8 | |
876 | movaps -0xa8(%rax),%xmm9 | |
877 | movaps -0x98(%rax),%xmm10 | |
878 | movaps -0x88(%rax),%xmm11 | |
879 | movaps -0x78(%rax),%xmm12 | |
880 | movaps -0x68(%rax),%xmm13 | |
881 | movaps -0x58(%rax),%xmm14 | |
882 | movaps -0x48(%rax),%xmm15 | |
883 | ___ | |
884 | $code.=<<___; | |
885 | mov -48(%rax),%r15 | |
886 | mov -40(%rax),%r14 | |
887 | mov -32(%rax),%r13 | |
888 | mov -24(%rax),%r12 | |
889 | mov -16(%rax),%rbp | |
890 | mov -8(%rax),%rbx | |
891 | lea (%rax),%rsp | |
e2eabed1 | 892 | .Lenc8x_epilogue: |
b7838586 AP |
893 | ret |
894 | .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx | |
895 | ||
896 | .type aesni_multi_cbc_decrypt_avx,\@function,3 | |
897 | .align 32 | |
898 | aesni_multi_cbc_decrypt_avx: | |
899 | _avx_cbc_dec_shortcut: | |
900 | mov %rsp,%rax | |
901 | push %rbx | |
902 | push %rbp | |
903 | push %r12 | |
904 | push %r13 | |
905 | push %r14 | |
906 | push %r15 | |
907 | ___ | |
908 | $code.=<<___ if ($win64); | |
909 | lea -0xa8(%rsp),%rsp | |
910 | movaps %xmm6,(%rsp) | |
911 | movaps %xmm7,0x10(%rsp) | |
912 | movaps %xmm8,0x20(%rsp) | |
913 | movaps %xmm9,0x30(%rsp) | |
914 | movaps %xmm10,0x40(%rsp) | |
915 | movaps %xmm11,0x50(%rsp) | |
916 | movaps %xmm12,-0x78(%rax) | |
917 | movaps %xmm13,-0x68(%rax) | |
918 | movaps %xmm14,-0x58(%rax) | |
919 | movaps %xmm15,-0x48(%rax) | |
920 | ___ | |
921 | $code.=<<___; | |
922 | # stack layout | |
923 | # | |
924 | # +0 output sink | |
925 | # +16 input sink [original %rsp and $num] | |
926 | # +32 counters | |
927 | # +64 distances between inputs and outputs | |
928 | # +128 off-load area for @inp[0..3] | |
929 | # +192 IV/input offload | |
930 | ||
931 | sub \$256,%rsp | |
932 | and \$-256,%rsp | |
933 | sub \$192,%rsp | |
934 | mov %rax,16(%rsp) # original %rsp | |
935 | ||
936 | .Ldec8x_body: | |
937 | vzeroupper | |
938 | vmovdqu ($key),$zero # 0-round key | |
939 | lea 0x78($key),$key # size optimization | |
940 | lea 40*4($inp),$inp | |
941 | shr \$1,$num | |
942 | ||
943 | .Ldec8x_loop_grande: | |
944 | #mov $num,24(%rsp) # original $num | |
945 | xor $num,$num | |
946 | ___ | |
947 | for($i=0;$i<8;$i++) { | |
948 | my $temp = $i ? $offload : $offset; | |
949 | $code.=<<___; | |
950 | mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks | |
951 | mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer | |
952 | cmp $num,$one | |
953 | mov `40*$i+8-40*4`($inp),$temp # output pointer | |
954 | cmovg $one,$num # find maximum | |
955 | test $one,$one | |
956 | vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV | |
957 | mov $one,`32+4*$i`(%rsp) # initialize counters | |
958 | cmovle %rsp,@ptr[$i] # cancel input | |
959 | sub @ptr[$i],$temp # distance between input and output | |
960 | mov $temp,`64+8*$i`(%rsp) # initialize distances | |
961 | vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV | |
962 | ___ | |
963 | } | |
964 | $code.=<<___; | |
965 | test $num,$num | |
966 | jz .Ldec8x_done | |
967 | ||
968 | vmovups 0x10-0x78($key),$rndkey1 | |
969 | vmovups 0x20-0x78($key),$rndkey0 | |
970 | mov 0xf0-0x78($key),$rounds | |
971 | lea 192+128(%rsp),$offload # offload area | |
972 | ||
973 | vmovdqu (@ptr[0]),@out[0] # load inputs | |
974 | vmovdqu (@ptr[1]),@out[1] | |
975 | vmovdqu (@ptr[2]),@out[2] | |
976 | vmovdqu (@ptr[3]),@out[3] | |
977 | vmovdqu (@ptr[4]),@out[4] | |
978 | vmovdqu (@ptr[5]),@out[5] | |
979 | vmovdqu (@ptr[6]),@out[6] | |
980 | vmovdqu (@ptr[7]),@out[7] | |
981 | vmovdqu @out[0],0x00($offload) # offload inputs | |
982 | vpxor $zero,@out[0],@out[0] # xor inputs with 0-round | |
983 | vmovdqu @out[1],0x10($offload) | |
984 | vpxor $zero,@out[1],@out[1] | |
985 | vmovdqu @out[2],0x20($offload) | |
986 | vpxor $zero,@out[2],@out[2] | |
987 | vmovdqu @out[3],0x30($offload) | |
988 | vpxor $zero,@out[3],@out[3] | |
989 | vmovdqu @out[4],0x40($offload) | |
990 | vpxor $zero,@out[4],@out[4] | |
991 | vmovdqu @out[5],0x50($offload) | |
992 | vpxor $zero,@out[5],@out[5] | |
993 | vmovdqu @out[6],0x60($offload) | |
994 | vpxor $zero,@out[6],@out[6] | |
995 | vmovdqu @out[7],0x70($offload) | |
996 | vpxor $zero,@out[7],@out[7] | |
997 | xor \$0x80,$offload | |
998 | mov \$1,$one # constant of 1 | |
999 | jmp .Loop_dec8x | |
1000 | ||
1001 | .align 32 | |
1002 | .Loop_dec8x: | |
1003 | ___ | |
1004 | for($i=0;$i<8;$i++) { | |
1005 | my $rndkey=($i&1)?$rndkey0:$rndkey1; | |
1006 | $code.=<<___; | |
1007 | vaesdec $rndkey,@out[0],@out[0] | |
1008 | cmp 32+4*$i(%rsp),$one | |
1009 | ___ | |
1010 | $code.=<<___ if ($i); | |
1011 | mov 64+8*$i(%rsp),$offset | |
1012 | ___ | |
1013 | $code.=<<___; | |
1014 | vaesdec $rndkey,@out[1],@out[1] | |
3847d15d | 1015 | prefetcht0 31(@ptr[$i]) # prefetch input |
b7838586 | 1016 | vaesdec $rndkey,@out[2],@out[2] |
3847d15d AP |
1017 | ___ |
1018 | $code.=<<___ if ($i>1); | |
1019 | prefetcht0 15(@ptr[$i-2]) # prefetch output | |
1020 | ___ | |
1021 | $code.=<<___; | |
b7838586 AP |
1022 | vaesdec $rndkey,@out[3],@out[3] |
1023 | lea (@ptr[$i],$offset),$offset | |
1024 | cmovge %rsp,@ptr[$i] # cancel input | |
1025 | vaesdec $rndkey,@out[4],@out[4] | |
1026 | cmovg %rsp,$offset # sink output | |
1027 | vaesdec $rndkey,@out[5],@out[5] | |
1028 | sub @ptr[$i],$offset | |
1029 | vaesdec $rndkey,@out[6],@out[6] | |
1030 | vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input | |
1031 | mov $offset,64+8*$i(%rsp) | |
1032 | vaesdec $rndkey,@out[7],@out[7] | |
1033 | vmovups `16*(3+$i)-0x78`($key),$rndkey | |
1034 | lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output | |
1035 | ___ | |
1036 | $code.=<<___ if ($i<4); | |
1037 | vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load | |
1038 | ___ | |
1039 | } | |
1040 | $code.=<<___; | |
1041 | vmovdqu 32(%rsp),$counters | |
3847d15d AP |
1042 | prefetcht0 15(@ptr[$i-2]) # prefetch output |
1043 | prefetcht0 15(@ptr[$i-1]) | |
b7838586 AP |
1044 | cmp \$11,$rounds |
1045 | jb .Ldec8x_tail | |
1046 | ||
1047 | vaesdec $rndkey1,@out[0],@out[0] | |
1048 | vaesdec $rndkey1,@out[1],@out[1] | |
1049 | vaesdec $rndkey1,@out[2],@out[2] | |
1050 | vaesdec $rndkey1,@out[3],@out[3] | |
1051 | vaesdec $rndkey1,@out[4],@out[4] | |
1052 | vaesdec $rndkey1,@out[5],@out[5] | |
1053 | vaesdec $rndkey1,@out[6],@out[6] | |
1054 | vaesdec $rndkey1,@out[7],@out[7] | |
1055 | vmovups 0xb0-0x78($key),$rndkey1 | |
1056 | ||
1057 | vaesdec $rndkey0,@out[0],@out[0] | |
1058 | vaesdec $rndkey0,@out[1],@out[1] | |
1059 | vaesdec $rndkey0,@out[2],@out[2] | |
1060 | vaesdec $rndkey0,@out[3],@out[3] | |
1061 | vaesdec $rndkey0,@out[4],@out[4] | |
1062 | vaesdec $rndkey0,@out[5],@out[5] | |
1063 | vaesdec $rndkey0,@out[6],@out[6] | |
1064 | vaesdec $rndkey0,@out[7],@out[7] | |
1065 | vmovups 0xc0-0x78($key),$rndkey0 | |
1066 | je .Ldec8x_tail | |
1067 | ||
1068 | vaesdec $rndkey1,@out[0],@out[0] | |
1069 | vaesdec $rndkey1,@out[1],@out[1] | |
1070 | vaesdec $rndkey1,@out[2],@out[2] | |
1071 | vaesdec $rndkey1,@out[3],@out[3] | |
1072 | vaesdec $rndkey1,@out[4],@out[4] | |
1073 | vaesdec $rndkey1,@out[5],@out[5] | |
1074 | vaesdec $rndkey1,@out[6],@out[6] | |
1075 | vaesdec $rndkey1,@out[7],@out[7] | |
1076 | vmovups 0xd0-0x78($key),$rndkey1 | |
1077 | ||
1078 | vaesdec $rndkey0,@out[0],@out[0] | |
1079 | vaesdec $rndkey0,@out[1],@out[1] | |
1080 | vaesdec $rndkey0,@out[2],@out[2] | |
1081 | vaesdec $rndkey0,@out[3],@out[3] | |
1082 | vaesdec $rndkey0,@out[4],@out[4] | |
1083 | vaesdec $rndkey0,@out[5],@out[5] | |
1084 | vaesdec $rndkey0,@out[6],@out[6] | |
1085 | vaesdec $rndkey0,@out[7],@out[7] | |
1086 | vmovups 0xe0-0x78($key),$rndkey0 | |
1087 | ||
1088 | .Ldec8x_tail: | |
1089 | vaesdec $rndkey1,@out[0],@out[0] | |
1090 | vpxor $zero,$zero,$zero | |
1091 | vaesdec $rndkey1,@out[1],@out[1] | |
1092 | vaesdec $rndkey1,@out[2],@out[2] | |
1093 | vpcmpgtd $zero,$counters,$zero | |
1094 | vaesdec $rndkey1,@out[3],@out[3] | |
1095 | vaesdec $rndkey1,@out[4],@out[4] | |
1096 | vpaddd $counters,$zero,$zero # decrement counters | |
1097 | vmovdqu 48(%rsp),$counters | |
1098 | vaesdec $rndkey1,@out[5],@out[5] | |
1099 | mov 64(%rsp),$offset # pre-load 1st offset | |
1100 | vaesdec $rndkey1,@out[6],@out[6] | |
1101 | vaesdec $rndkey1,@out[7],@out[7] | |
1102 | vmovups 0x10-0x78($key),$rndkey1 | |
1103 | ||
1104 | vaesdeclast $rndkey0,@out[0],@out[0] | |
1105 | vmovdqa $zero,32(%rsp) # update counters | |
1106 | vpxor $zero,$zero,$zero | |
1107 | vaesdeclast $rndkey0,@out[1],@out[1] | |
1108 | vpxor 0x00($offload),@out[0],@out[0] # xor with IV | |
1109 | vaesdeclast $rndkey0,@out[2],@out[2] | |
1110 | vpxor 0x10($offload),@out[1],@out[1] | |
1111 | vpcmpgtd $zero,$counters,$zero | |
1112 | vaesdeclast $rndkey0,@out[3],@out[3] | |
1113 | vpxor 0x20($offload),@out[2],@out[2] | |
1114 | vaesdeclast $rndkey0,@out[4],@out[4] | |
1115 | vpxor 0x30($offload),@out[3],@out[3] | |
1116 | vpaddd $zero,$counters,$counters # decrement counters | |
1117 | vmovdqu -0x78($key),$zero # 0-round | |
1118 | vaesdeclast $rndkey0,@out[5],@out[5] | |
1119 | vpxor 0x40($offload),@out[4],@out[4] | |
1120 | vaesdeclast $rndkey0,@out[6],@out[6] | |
1121 | vpxor 0x50($offload),@out[5],@out[5] | |
1122 | vmovdqa $counters,48(%rsp) # update counters | |
1123 | vaesdeclast $rndkey0,@out[7],@out[7] | |
1124 | vpxor 0x60($offload),@out[6],@out[6] | |
1125 | vmovups 0x20-0x78($key),$rndkey0 | |
1126 | ||
1127 | vmovups @out[0],-16(@ptr[0]) # write output | |
1128 | sub $offset,@ptr[0] # switch to input | |
1129 | vmovdqu 128+0(%rsp),@out[0] | |
1130 | vpxor 0x70($offload),@out[7],@out[7] | |
1131 | vmovups @out[1],-16(@ptr[1]) | |
1132 | sub `64+1*8`(%rsp),@ptr[1] | |
1133 | vmovdqu @out[0],0x00($offload) | |
1134 | vpxor $zero,@out[0],@out[0] | |
1135 | vmovdqu 128+16(%rsp),@out[1] | |
1136 | vmovups @out[2],-16(@ptr[2]) | |
1137 | sub `64+2*8`(%rsp),@ptr[2] | |
1138 | vmovdqu @out[1],0x10($offload) | |
1139 | vpxor $zero,@out[1],@out[1] | |
1140 | vmovdqu 128+32(%rsp),@out[2] | |
1141 | vmovups @out[3],-16(@ptr[3]) | |
1142 | sub `64+3*8`(%rsp),@ptr[3] | |
1143 | vmovdqu @out[2],0x20($offload) | |
1144 | vpxor $zero,@out[2],@out[2] | |
1145 | vmovdqu 128+48(%rsp),@out[3] | |
1146 | vmovups @out[4],-16(@ptr[4]) | |
1147 | sub `64+4*8`(%rsp),@ptr[4] | |
1148 | vmovdqu @out[3],0x30($offload) | |
1149 | vpxor $zero,@out[3],@out[3] | |
1150 | vmovdqu @inp[0],0x40($offload) | |
1151 | vpxor @inp[0],$zero,@out[4] | |
1152 | vmovups @out[5],-16(@ptr[5]) | |
1153 | sub `64+5*8`(%rsp),@ptr[5] | |
1154 | vmovdqu @inp[1],0x50($offload) | |
1155 | vpxor @inp[1],$zero,@out[5] | |
1156 | vmovups @out[6],-16(@ptr[6]) | |
1157 | sub `64+6*8`(%rsp),@ptr[6] | |
1158 | vmovdqu @inp[2],0x60($offload) | |
1159 | vpxor @inp[2],$zero,@out[6] | |
1160 | vmovups @out[7],-16(@ptr[7]) | |
1161 | sub `64+7*8`(%rsp),@ptr[7] | |
1162 | vmovdqu @inp[3],0x70($offload) | |
1163 | vpxor @inp[3],$zero,@out[7] | |
1164 | ||
1165 | xor \$128,$offload | |
1166 | dec $num | |
1167 | jnz .Loop_dec8x | |
1168 | ||
1169 | mov 16(%rsp),%rax # original %rsp | |
1170 | #mov 24(%rsp),$num | |
1171 | #lea `40*8`($inp),$inp | |
1172 | #dec $num | |
1173 | #jnz .Ldec8x_loop_grande | |
1174 | ||
1175 | .Ldec8x_done: | |
1176 | vzeroupper | |
1177 | ___ | |
1178 | $code.=<<___ if ($win64); | |
1179 | movaps -0xd8(%rax),%xmm6 | |
1180 | movaps -0xc8(%rax),%xmm7 | |
1181 | movaps -0xb8(%rax),%xmm8 | |
1182 | movaps -0xa8(%rax),%xmm9 | |
1183 | movaps -0x98(%rax),%xmm10 | |
1184 | movaps -0x88(%rax),%xmm11 | |
1185 | movaps -0x78(%rax),%xmm12 | |
1186 | movaps -0x68(%rax),%xmm13 | |
1187 | movaps -0x58(%rax),%xmm14 | |
1188 | movaps -0x48(%rax),%xmm15 | |
1189 | ___ | |
1190 | $code.=<<___; | |
1191 | mov -48(%rax),%r15 | |
1192 | mov -40(%rax),%r14 | |
1193 | mov -32(%rax),%r13 | |
1194 | mov -24(%rax),%r12 | |
1195 | mov -16(%rax),%rbp | |
1196 | mov -8(%rax),%rbx | |
1197 | lea (%rax),%rsp | |
e2eabed1 | 1198 | .Ldec8x_epilogue: |
b7838586 AP |
1199 | ret |
1200 | .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx | |
1201 | ___ | |
1202 | }}} | |
1203 | ||
e2eabed1 AP |
# Win64 structured-exception-handling support, emitted only when $win64
# is set: one handler (se_handler) referenced by every routine's .xdata
# record, plus the .pdata/.xdata tables themselves.
#
# se_handler compares context->Rip with the two labels passed through
# HandlerData[] (body label, epilogue label).  If Rip lies between them
# it fetches the stack pointer value stored at 16(%rsp) of the faulting
# frame, reloads %rbx, %rbp and %r12-%r15 from just below that value and
# copies 20 quadwords starting at -56-10*16 from it to context+512
# (commented as &context.Xmm6).  On every path it then writes
# context->Rsp/Rsi/Rdi, copies the CONTEXT to disp->ContextRecord, calls
# RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
1204 | if ($win64) { |
1205 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
1206 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
# Registers in which the four handler arguments listed above arrive.
1207 | $rec="%rcx"; | |
1208 | $frame="%rdx"; | |
1209 | $context="%r8"; | |
1210 | $disp="%r9"; | |
1211 | ||
# Handler body followed by the .pdata entries of the non-AVX routines.
1212 | $code.=<<___; | |
1213 | .extern __imp_RtlVirtualUnwind | |
1214 | .type se_handler,\@abi-omnipotent | |
1215 | .align 16 | |
1216 | se_handler: | |
1217 | push %rsi | |
1218 | push %rdi | |
1219 | push %rbx | |
1220 | push %rbp | |
1221 | push %r12 | |
1222 | push %r13 | |
1223 | push %r14 | |
1224 | push %r15 | |
1225 | pushfq | |
1226 | sub \$64,%rsp | |
1227 | ||
1228 | mov 120($context),%rax # pull context->Rax | |
1229 | mov 248($context),%rbx # pull context->Rip | |
1230 | ||
1231 | mov 8($disp),%rsi # disp->ImageBase | |
1232 | mov 56($disp),%r11 # disp->HandlerData | |
1233 | ||
1234 | mov 0(%r11),%r10d # HandlerData[0] | |
1235 | lea (%rsi,%r10),%r10 # prologue label | |
1236 | cmp %r10,%rbx # context->Rip<.Lprologue | |
1237 | jb .Lin_prologue | |
1238 | ||
1239 | mov 152($context),%rax # pull context->Rsp | |
1240 | ||
1241 | mov 4(%r11),%r10d # HandlerData[1] | |
1242 | lea (%rsi,%r10),%r10 # epilogue label | |
1243 | cmp %r10,%rbx # context->Rip>=.Lepilogue | |
1244 | jae .Lin_prologue | |
1245 | ||
1246 | mov 16(%rax),%rax # pull saved stack pointer | |
1247 | ||
1248 | mov -8(%rax),%rbx | |
1249 | mov -16(%rax),%rbp | |
1250 | mov -24(%rax),%r12 | |
1251 | mov -32(%rax),%r13 | |
1252 | mov -40(%rax),%r14 | |
1253 | mov -48(%rax),%r15 | |
1254 | mov %rbx,144($context) # restore context->Rbx | |
1255 | mov %rbp,160($context) # restore context->Rbp | |
1256 | mov %r12,216($context) # restore cotnext->R12 | |
1257 | mov %r13,224($context) # restore cotnext->R13 | |
1258 | mov %r14,232($context) # restore cotnext->R14 | |
1259 | mov %r15,240($context) # restore cotnext->R15 | |
1260 | ||
1261 | lea -56-10*16(%rax),%rsi | |
1262 | lea 512($context),%rdi # &context.Xmm6 | |
1263 | mov \$20,%ecx | |
1264 | .long 0xa548f3fc # cld; rep movsq | |
1265 | ||
1266 | .Lin_prologue: | |
1267 | mov 8(%rax),%rdi | |
1268 | mov 16(%rax),%rsi | |
1269 | mov %rax,152($context) # restore context->Rsp | |
1270 | mov %rsi,168($context) # restore context->Rsi | |
1271 | mov %rdi,176($context) # restore context->Rdi | |
1272 | ||
1273 | mov 40($disp),%rdi # disp->ContextRecord | |
1274 | mov $context,%rsi # context | |
1275 | mov \$154,%ecx # sizeof(CONTEXT) | |
1276 | .long 0xa548f3fc # cld; rep movsq | |
1277 | ||
1278 | mov $disp,%rsi | |
1279 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
1280 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
1281 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
1282 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
1283 | mov 40(%rsi),%r10 # disp->ContextRecord | |
1284 | lea 56(%rsi),%r11 # &disp->HandlerData | |
1285 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
1286 | mov %r10,32(%rsp) # arg5 | |
1287 | mov %r11,40(%rsp) # arg6 | |
1288 | mov %r12,48(%rsp) # arg7 | |
1289 | mov %rcx,56(%rsp) # arg8, (NULL) | |
1290 | call *__imp_RtlVirtualUnwind(%rip) | |
1291 | ||
1292 | mov \$1,%eax # ExceptionContinueSearch | |
1293 | add \$64,%rsp | |
1294 | popfq | |
1295 | pop %r15 | |
1296 | pop %r14 | |
1297 | pop %r13 | |
1298 | pop %r12 | |
1299 | pop %rbp | |
1300 | pop %rbx | |
1301 | pop %rdi | |
1302 | pop %rsi | |
1303 | ret | |
1304 | .size se_handler,.-se_handler | |
1305 | ||
1306 | .section .pdata | |
1307 | .align 4 | |
1308 | .rva .LSEH_begin_aesni_multi_cbc_encrypt | |
1309 | .rva .LSEH_end_aesni_multi_cbc_encrypt | |
1310 | .rva .LSEH_info_aesni_multi_cbc_encrypt | |
1311 | .rva .LSEH_begin_aesni_multi_cbc_decrypt | |
1312 | .rva .LSEH_end_aesni_multi_cbc_decrypt | |
1313 | .rva .LSEH_info_aesni_multi_cbc_decrypt | |
1314 | ___ | |
# .pdata entries for the *_avx routines are appended only when $avx is set.
1315 | $code.=<<___ if ($avx); | |
1316 | .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx | |
1317 | .rva .LSEH_end_aesni_multi_cbc_encrypt_avx | |
1318 | .rva .LSEH_info_aesni_multi_cbc_encrypt_avx | |
1319 | .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx | |
1320 | .rva .LSEH_end_aesni_multi_cbc_decrypt_avx | |
1321 | .rva .LSEH_info_aesni_multi_cbc_decrypt_avx | |
1322 | ___ | |
# .xdata records: each names se_handler and supplies the body/epilogue
# label pair that the handler reads back as HandlerData[0] and [1].
1323 | $code.=<<___; | |
1324 | .section .xdata | |
1325 | .align 8 | |
1326 | .LSEH_info_aesni_multi_cbc_encrypt: | |
1327 | .byte 9,0,0,0 | |
1328 | .rva se_handler | |
1329 | .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[] | |
1330 | .LSEH_info_aesni_multi_cbc_decrypt: | |
1331 | .byte 9,0,0,0 | |
1332 | .rva se_handler | |
1333 | .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[] | |
1334 | ___ | |
# Matching .xdata records for the *_avx routines, again only when $avx is set.
1335 | $code.=<<___ if ($avx); | |
1336 | .LSEH_info_aesni_multi_cbc_encrypt_avx: | |
1337 | .byte 9,0,0,0 | |
1338 | .rva se_handler | |
1339 | .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[] | |
1340 | .LSEH_info_aesni_multi_cbc_decrypt_avx: | |
1341 | .byte 9,0,0,0 | |
1342 | .rva se_handler | |
1343 | .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[] | |
1344 | ___ | |
1345 | } | |
1346 | #################################################################### | |
1347 | ||
b7838586 AP |
# Append a REX prefix byte to an opcode list when extended registers
# are involved.
#
#   rex(\@bytes, $dst, $src)
#
# \@bytes - reference to the opcode byte list, modified in place
# $dst    - register number encoded in ModR/M.reg (sets REX.R, 0x04, if >= 8)
# $src    - register number encoded in ModR/M.rm  (sets REX.B, 0x01, if >= 8)
#
# Nothing is appended when both register numbers are below 8.
sub rex {
    my ($bytes,$dst,$src)=@_;
    my $prefix=($dst>=8 ? 0x04 : 0)|($src>=8 ? 0x01 : 0);

    push @$bytes,$prefix|0x40 if($prefix);
}
1357 | ||
# Hand-assemble one AES-NI instruction so that the generated file can be
# built by assemblers that do not know the mnemonics.
#
#   aesni($line)
#
# $line is the text of a single instruction.  Three operand forms are
# recognised:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns a ".byte\t..." directive (decimal, comma separated) holding the
# encoding.  Anything that is not recognised - including an "aes*"
# mnemonic missing from the relevant table - is returned unchanged so the
# assembler gets to see it; it is never dropped from the output.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);			# every encoding starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;	# 0x../0.. immediates -> number
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: hand the text back untouched instead of
	# returning undef, which the caller's s///e would turn into an
	# empty line and thereby silently delete the instruction.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Same pass-through policy as above (e.g. aesimc with a memory
	# operand is not in this table).
	return $line if (!defined($opcodelet{$mnemonic}));
	push @opcode,0x44 if ($dst>=8);		# prefix for %xmm8-%xmm15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;	# one-byte displacement
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1397 | ||
# Final pass over the accumulated assembly text, then emit it.
#
# 1. Replace every `...` back-quoted expression with the result of
#    evaluating it as Perl (used for constant offset arithmetic).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# 2. Replace legacy-encoded aes* instructions that take an %xmm operand
#    with .byte sequences via aesni(); anything after the last %xmm
#    operand on the line is discarded.  The \b anchor keeps VEX forms
#    such as vaesenc/vaesdec out of this substitution.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# Buffered write errors only surface at close; fail loudly rather than
# leave a truncated assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";