]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-mb-x86_64.pl
x86_64 assembly pack: tolerate spaces in source directory name.
[thirdparty/openssl.git] / crypto / aes / asm / aesni-mb-x86_64.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
b7838586
AP
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Multi-buffer AES-NI procedures process several independent buffers
18# in parallel by interleaving independent instructions.
19#
20# Cycles per byte for interleave factor 4:
21#
22# asymptotic measured
23# ---------------------------
24# Westmere 5.00/4=1.25 5.13/4=1.28
3847d15d
AP
25# Atom 15.0/4=3.75 ?15.7/4=3.93
26# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
b7838586
AP
27# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
28# Haswell 4.44/4=1.11 4.44/4=1.11
29# Bulldozer 5.75/4=1.44 5.76/4=1.44
30#
31# Cycles per byte for interleave factor 8 (not implemented for
32# pre-AVX processors, where higher interleave factor incidentally
33# doesn't result in improvement):
34#
35# asymptotic measured
36# ---------------------------
3847d15d
AP
37# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
38# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
b7838586
AP
39# Haswell 5.00/8=0.63 5.00/8=0.63
40# Bulldozer 5.75/8=0.72 5.77/8=0.72
41#
42# (*) Sandy/Ivy Bridge are known to handle high interleave factors
43# suboptimally.
44
# Command line is "[flavour] output".  If the first argument contains a dot
# it is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up as 0, 1 or 2 depending on the version reported by the first
# tool that is recognised below.  The code generation in this file only tests
# it for truth: any non-zero value enables the AVX (8-lane) code paths.
$avx=0;

# GNU assembler, queried through the C compiler driver.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, only consulted for Win64 builds.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64, only consulted for Win64 builds.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM integrated assembler.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the pipe cannot be set up instead of silently producing
# no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
79
# C prototype of the generated entry point:
#
# void aesni_multi_cbc_encrypt (
#	struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
#	const AES_KEY *key,
#	int num);	/* 1 or 2 */
#
# Register allocation for the 4-lane code path.
($inp, $key, $num) = ("%rdi", "%rsi", "%edx");	# 1st arg, 2nd arg, num

@inptr  = map { "%r$_" }  8 .. 11;	# one input pointer per lane
@outptr = map { "%r$_" } 12 .. 15;	# one output pointer per lane

$rndkey0 = "%xmm0";
$rndkey1 = "%xmm1";
@out = map { "%xmm$_" } 2 .. 5;		# cipher state per lane
@inp = map { "%xmm$_" } 6 .. 9;		# input block / IV per lane
$counters = "%xmm10";
$mask     = "%xmm11";
$zero     = "%xmm12";

$rounds = "%eax";
$one    = "%ecx";
$sink   = "%rbp";
$offset = "%rbx";
98
# Emit the section header, the OPENSSL_ia32cap_P import and the entry label
# of aesni_multi_cbc_encrypt (declared with 3 arguments).
99$code.=<<___;
100.text
101
102.extern OPENSSL_ia32cap_P
103
104.globl aesni_multi_cbc_encrypt
105.type aesni_multi_cbc_encrypt,\@function,3
106.align 32
107aesni_multi_cbc_encrypt:
108___
# Emitted only when $avx is non-zero: run-time dispatch.  Calls with
# $num >= 2 on a CPU whose AVX capability bit (bit 28 of the second
# OPENSSL_ia32cap_P word) is set jump to _avx_cbc_enc_shortcut; everything
# else continues at .Lenc_non_avx below.
109$code.=<<___ if ($avx);
110 cmp \$2,$num
111 jb .Lenc_non_avx
112 mov OPENSSL_ia32cap_P+4(%rip),%ecx
113 test \$`1<<28`,%ecx # AVX bit
114 jnz _avx_cbc_enc_shortcut
115 jmp .Lenc_non_avx
116.align 16
117.Lenc_non_avx:
118___
# Prologue: keep the incoming %rsp in %rax and push %rbx, %rbp, %r12-%r15.
# The epilogue restores them through fixed negative offsets from %rax.
119$code.=<<___;
120 mov %rsp,%rax
121 push %rbx
122 push %rbp
123 push %r12
124 push %r13
125 push %r14
126 push %r15
127___
# Win64 only: reserve 0xa8 bytes and spill %xmm6-%xmm15.  Per the inline
# note, %xmm13-%xmm15 are not used by this path and are saved only so the
# frame can share se_handler.
128$code.=<<___ if ($win64);
e2eabed1 129 lea -0xa8(%rsp),%rsp
b7838586
AP
130 movaps %xmm6,(%rsp)
131 movaps %xmm7,0x10(%rsp)
132 movaps %xmm8,0x20(%rsp)
133 movaps %xmm9,0x30(%rsp)
134 movaps %xmm10,0x40(%rsp)
135 movaps %xmm11,0x50(%rsp)
136 movaps %xmm12,0x60(%rsp)
e2eabed1
AP
137 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
138 movaps %xmm14,-0x58(%rax)
139 movaps %xmm15,-0x48(%rax)
b7838586
AP
140___
# Allocate the 64-byte-aligned scratch frame described in the heredoc and
# stash the original %rsp in it.  Then load the round-0 key into $zero and
# bias $key by 0x78 and $inp by 2*40 bytes; all later displacements subtract
# the same biases.  .Lenc4x_loop_grande is the outer loop: it saves $num in
# the frame and clears it so the descriptor loop below can compute a maximum.
141$code.=<<___;
142 # stack layout
143 #
144 # +0 output sink
145 # +16 input sink [original %rsp and $num]
146 # +32 counters
147
148 sub \$48,%rsp
149 and \$-64,%rsp
150 mov %rax,16(%rsp) # original %rsp
151
152.Lenc4x_body:
153 movdqu ($key),$zero # 0-round key
154 lea 0x78($key),$key # size optimization
155 lea 40*2($inp),$inp
156
157.Lenc4x_loop_grande:
158 mov $num,24(%rsp) # original $num
159 xor $num,$num
160___
# Read the four 40-byte lane descriptors: input pointer at +0, output
# pointer at +8, block count at +16, IV at +24 (loaded into @out[$i]).
# $num accumulates the largest block count, each count is stored as the
# lane's counter at 32+4*$i(%rsp), and a lane whose count is <= 0 has its
# input pointer redirected to %rsp.
161for($i=0;$i<4;$i++) {
162 $code.=<<___;
163 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
164 mov `40*$i+0-40*2`($inp),@inptr[$i]
165 cmp $num,$one
166 mov `40*$i+8-40*2`($inp),@outptr[$i]
167 cmovg $one,$num # find maximum
168 test $one,$one
169 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
170 mov $one,`32+4*$i`(%rsp) # initialize counters
171 cmovle %rsp,@inptr[$i] # cancel input
172___
173}
# Head of the 4-lane encrypt loop.
#
# If no lane has any blocks ($num is zero) jump straight to .Lenc4x_done.
# Otherwise fold the round-0 key ($zero) and the first input block of every
# lane into the IVs held in @out[0..3], load the per-lane counters and enter
# .Loop_enc4x.
#
# Each iteration advances $offset by 16 and sets $sink to 16(%rsp)-$offset,
# so a pointer replaced by $sink and indexed with $offset lands on the
# "input sink" slot of the frame.  The first aesenc round is interleaved
# with prefetches of the next input block of all four lanes; lane 3 is
# prefetched through @inptr[3], mirroring .Loop_dec4x.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	pxor	@inp[2],@out[2]
	pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Emit four more AES rounds, one per loop iteration.  Iteration $i runs one
# aesenc round over all four lanes with $rndkey0 (even $i) or $rndkey1
# (odd $i), then reloads that register with the round key at
# 0x40+16*$i-0x78($key).  Interleaved with the round is the end-of-stream
# handling for lane $i: the constant 1 in $one is compared with the lane's
# counter at 32+4*$i(%rsp); if the counter is <= 1 the lane's input pointer
# is replaced by $sink, and if it is < 1 the output pointer is replaced too.
214for($i=0;$i<4;$i++) {
215my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
216$code.=<<___;
217 cmp `32+4*$i`(%rsp),$one
218 aesenc $rndkey,@out[0]
219 aesenc $rndkey,@out[1]
b7838586 220 aesenc $rndkey,@out[2]
3847d15d 221 cmovge $sink,@inptr[$i] # cancel input
b7838586
AP
222 cmovg $sink,@outptr[$i] # sink output
223 aesenc $rndkey,@out[3]
224 movups `0x40+16*$i-0x78`($key),$rndkey
225___
226}
# Remainder of the .Loop_enc4x body and the loop control.
#
# - Output prefetches are interleaved with the next round.
# - Counter update: $mask is set to all-ones in every dword where the
#   counter is > 0 (pcmpgtd against a zeroed $zero) and added to $counters,
#   i.e. every positive counter is decremented by one; the result is written
#   back to 32(%rsp).  $zero is reloaded with the round-0 key afterwards.
# - $rounds is compared with 11 once; the flags are consumed later by jb
#   (below 11: go to the tail) and je (equal: go to the tail after two more
#   rounds).  Otherwise four more rounds are run before the tail.
# - .Lenc4x_tail runs the last aesenc and aesenclast, loads the next input
#   block of each lane and xors it with the round-0 key, stores the finished
#   block at -16(@outptr,$offset), and xors it into the pre-whitened next
#   input, which is the CBC chaining step.  $rndkey1/$rndkey0 are reloaded
#   with the keys at 0x10/0x20 for the next iteration.
# - The inner loop runs $num times.  Afterwards the original %rsp is
#   reloaded into %rax and the saved $num into $num, $inp advances by four
#   descriptors (40*4 bytes) and the outer loop repeats while --$num != 0.
# - The commented-out instructions would write the final IVs back to the
#   descriptors; per the inline notes this is left to the caller.
227$code.=<<___;
228 movdqa $counters,$mask
229 aesenc $rndkey0,@out[0]
3847d15d
AP
230 prefetcht0 15(@outptr[0],$offset) # prefetch output
231 prefetcht0 15(@outptr[1],$offset)
b7838586 232 aesenc $rndkey0,@out[1]
3847d15d
AP
233 prefetcht0 15(@outptr[2],$offset)
234 prefetcht0 15(@outptr[3],$offset)
b7838586
AP
235 aesenc $rndkey0,@out[2]
236 aesenc $rndkey0,@out[3]
237 movups 0x80-0x78($key),$rndkey0
238 pxor $zero,$zero
239
240 aesenc $rndkey1,@out[0]
241 pcmpgtd $zero,$mask
242 movdqu -0x78($key),$zero # reload 0-round key
243 aesenc $rndkey1,@out[1]
244 paddd $mask,$counters # decrement counters
245 movdqa $counters,32(%rsp) # update counters
246 aesenc $rndkey1,@out[2]
247 aesenc $rndkey1,@out[3]
248 movups 0x90-0x78($key),$rndkey1
249
250 cmp \$11,$rounds
251
252 aesenc $rndkey0,@out[0]
253 aesenc $rndkey0,@out[1]
254 aesenc $rndkey0,@out[2]
255 aesenc $rndkey0,@out[3]
256 movups 0xa0-0x78($key),$rndkey0
257
258 jb .Lenc4x_tail
259
260 aesenc $rndkey1,@out[0]
261 aesenc $rndkey1,@out[1]
262 aesenc $rndkey1,@out[2]
263 aesenc $rndkey1,@out[3]
264 movups 0xb0-0x78($key),$rndkey1
265
266 aesenc $rndkey0,@out[0]
267 aesenc $rndkey0,@out[1]
268 aesenc $rndkey0,@out[2]
269 aesenc $rndkey0,@out[3]
270 movups 0xc0-0x78($key),$rndkey0
271
272 je .Lenc4x_tail
273
274 aesenc $rndkey1,@out[0]
275 aesenc $rndkey1,@out[1]
276 aesenc $rndkey1,@out[2]
277 aesenc $rndkey1,@out[3]
278 movups 0xd0-0x78($key),$rndkey1
279
280 aesenc $rndkey0,@out[0]
281 aesenc $rndkey0,@out[1]
282 aesenc $rndkey0,@out[2]
283 aesenc $rndkey0,@out[3]
284 movups 0xe0-0x78($key),$rndkey0
3847d15d 285 jmp .Lenc4x_tail
b7838586 286
3847d15d 287.align 32
b7838586
AP
288.Lenc4x_tail:
289 aesenc $rndkey1,@out[0]
290 aesenc $rndkey1,@out[1]
291 aesenc $rndkey1,@out[2]
b7838586 292 aesenc $rndkey1,@out[3]
3847d15d 293 movdqu (@inptr[0],$offset),@inp[0]
b7838586
AP
294 movdqu 0x10-0x78($key),$rndkey1
295
296 aesenclast $rndkey0,@out[0]
297 movdqu (@inptr[1],$offset),@inp[1]
298 pxor $zero,@inp[0]
299 aesenclast $rndkey0,@out[1]
300 movdqu (@inptr[2],$offset),@inp[2]
301 pxor $zero,@inp[1]
302 aesenclast $rndkey0,@out[2]
303 movdqu (@inptr[3],$offset),@inp[3]
304 pxor $zero,@inp[2]
305 aesenclast $rndkey0,@out[3]
306 movdqu 0x20-0x78($key),$rndkey0
307 pxor $zero,@inp[3]
308
309 movups @out[0],-16(@outptr[0],$offset)
310 pxor @inp[0],@out[0]
311 movups @out[1],-16(@outptr[1],$offset)
312 pxor @inp[1],@out[1]
313 movups @out[2],-16(@outptr[2],$offset)
314 pxor @inp[2],@out[2]
315 movups @out[3],-16(@outptr[3],$offset)
316 pxor @inp[3],@out[3]
317
318 dec $num
319 jnz .Loop_enc4x
320
321 mov 16(%rsp),%rax # original %rsp
322 mov 24(%rsp),$num
323
324 #pxor @inp[0],@out[0]
325 #pxor @inp[1],@out[1]
326 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
327 #pxor @inp[2],@out[2]
328 #movdqu @out[1],`40*1+24-40*2`($inp)
329 #pxor @inp[3],@out[3]
330 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
331 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
332
333 lea `40*4`($inp),$inp
334 dec $num
335 jnz .Lenc4x_loop_grande
336
337.Lenc4x_done:
338___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp held in %rax.  The reloads of %xmm13-%xmm15 are
# commented out in the emitted code.
339$code.=<<___ if ($win64);
e2eabed1
AP
340 movaps -0xd8(%rax),%xmm6
341 movaps -0xc8(%rax),%xmm7
342 movaps -0xb8(%rax),%xmm8
343 movaps -0xa8(%rax),%xmm9
344 movaps -0x98(%rax),%xmm10
345 movaps -0x88(%rax),%xmm11
346 movaps -0x78(%rax),%xmm12
347 #movaps -0x68(%rax),%xmm13
348 #movaps -0x58(%rax),%xmm14
349 #movaps -0x48(%rax),%xmm15
b7838586
AP
350___
# Epilogue of aesni_multi_cbc_encrypt: reload %r15..%rbx from the slots the
# prologue pushed them to (-48..-8 relative to the original %rsp in %rax),
# restore %rsp and return.  The same heredoc then opens
# aesni_multi_cbc_decrypt (declared with 3 arguments).
351$code.=<<___;
352 mov -48(%rax),%r15
353 mov -40(%rax),%r14
354 mov -32(%rax),%r13
355 mov -24(%rax),%r12
356 mov -16(%rax),%rbp
357 mov -8(%rax),%rbx
358 lea (%rax),%rsp
e2eabed1 359.Lenc4x_epilogue:
b7838586
AP
360 ret
361.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
362
363.globl aesni_multi_cbc_decrypt
364.type aesni_multi_cbc_decrypt,\@function,3
365.align 32
366aesni_multi_cbc_decrypt:
367___
# Emitted only when $avx is non-zero: run-time dispatch for decryption.
# Calls with $num >= 2 on a CPU whose AVX capability bit (bit 28 of the
# second OPENSSL_ia32cap_P word) is set jump to _avx_cbc_dec_shortcut;
# everything else continues at .Ldec_non_avx below.
368$code.=<<___ if ($avx);
369 cmp \$2,$num
370 jb .Ldec_non_avx
371 mov OPENSSL_ia32cap_P+4(%rip),%ecx
372 test \$`1<<28`,%ecx # AVX bit
373 jnz _avx_cbc_dec_shortcut
374 jmp .Ldec_non_avx
375.align 16
376.Ldec_non_avx:
377___
# Prologue: keep the incoming %rsp in %rax and push %rbx, %rbp, %r12-%r15.
378$code.=<<___;
379 mov %rsp,%rax
380 push %rbx
381 push %rbp
382 push %r12
383 push %r13
384 push %r14
385 push %r15
386___
# Win64 only: reserve 0xa8 bytes and spill %xmm6-%xmm15.  Per the inline
# note, %xmm13-%xmm15 are not used by this path and are saved only so the
# frame can share se_handler.
387$code.=<<___ if ($win64);
e2eabed1 388 lea -0xa8(%rsp),%rsp
b7838586
AP
389 movaps %xmm6,(%rsp)
390 movaps %xmm7,0x10(%rsp)
391 movaps %xmm8,0x20(%rsp)
392 movaps %xmm9,0x30(%rsp)
393 movaps %xmm10,0x40(%rsp)
394 movaps %xmm11,0x50(%rsp)
395 movaps %xmm12,0x60(%rsp)
e2eabed1
AP
396 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
397 movaps %xmm14,-0x58(%rax)
398 movaps %xmm15,-0x48(%rax)
b7838586
AP
399___
# Allocate the 64-byte-aligned scratch frame described in the heredoc and
# stash the original %rsp in it.  Then load the round-0 key into $zero and
# bias $key by 0x78 and $inp by 2*40 bytes; all later displacements subtract
# the same biases.  .Ldec4x_loop_grande is the outer loop: it saves $num in
# the frame and clears it so the descriptor loop below can compute a maximum.
400$code.=<<___;
401 # stack layout
402 #
403 # +0 output sink
404 # +16 input sink [original %rsp and $num]
405 # +32 counters
406
407 sub \$48,%rsp
408 and \$-64,%rsp
409 mov %rax,16(%rsp) # original %rsp
410
411.Ldec4x_body:
412 movdqu ($key),$zero # 0-round key
413 lea 0x78($key),$key # size optimization
414 lea 40*2($inp),$inp
415
416.Ldec4x_loop_grande:
417 mov $num,24(%rsp) # original $num
418 xor $num,$num
419___
# Read the four 40-byte lane descriptors: input pointer at +0, output
# pointer at +8, block count at +16, IV at +24.  Unlike the encrypt path the
# IV is loaded into @inp[$i], because @out[$i] will hold the ciphertext being
# decrypted.  $num accumulates the largest block count, each count is stored
# as the lane's counter at 32+4*$i(%rsp), and a lane whose count is <= 0 has
# its input pointer redirected to %rsp.
420for($i=0;$i<4;$i++) {
421 $code.=<<___;
422 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
423 mov `40*$i+0-40*2`($inp),@inptr[$i]
424 cmp $num,$one
425 mov `40*$i+8-40*2`($inp),@outptr[$i]
426 cmovg $one,$num # find maximum
427 test $one,$one
428 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
429 mov $one,`32+4*$i`(%rsp) # initialize counters
430 cmovle %rsp,@inptr[$i] # cancel input
431___
432}
# Head of the 4-lane decrypt loop.
#
# If no lane has any blocks ($num is zero) jump straight to .Ldec4x_done.
# Otherwise load the first ciphertext block of every lane into @out[0..3],
# xor it with the round-0 key ($zero), load the per-lane counters and enter
# .Loop_dec4x.
#
# Each iteration advances $offset by 16 and sets $sink to 16(%rsp)-$offset,
# so a pointer replaced by $sink and indexed with $offset lands on the
# "input sink" slot of the frame.  The first aesdec round is interleaved
# with prefetches of the next input block of all four lanes.
433$code.=<<___;
434 test $num,$num
435 jz .Ldec4x_done
436
437 movups 0x10-0x78($key),$rndkey1
438 movups 0x20-0x78($key),$rndkey0
439 mov 0xf0-0x78($key),$rounds
440 movdqu (@inptr[0]),@out[0] # load inputs
441 movdqu (@inptr[1]),@out[1]
442 pxor $zero,@out[0]
443 movdqu (@inptr[2]),@out[2]
444 pxor $zero,@out[1]
445 movdqu (@inptr[3]),@out[3]
446 pxor $zero,@out[2]
447 pxor $zero,@out[3]
448 movdqa 32(%rsp),$counters # load counters
449 xor $offset,$offset
450 jmp .Loop_dec4x
451
452.align 32
453.Loop_dec4x:
454 add \$16,$offset
455 lea 16(%rsp),$sink # sink pointer
456 mov \$1,$one # constant of 1
457 sub $offset,$sink
458
459 aesdec $rndkey1,@out[0]
3847d15d
AP
460 prefetcht0 31(@inptr[0],$offset) # prefetch input
461 prefetcht0 31(@inptr[1],$offset)
b7838586 462 aesdec $rndkey1,@out[1]
3847d15d
AP
463 prefetcht0 31(@inptr[2],$offset)
464 prefetcht0 31(@inptr[3],$offset)
b7838586
AP
465 aesdec $rndkey1,@out[2]
466 aesdec $rndkey1,@out[3]
467 movups 0x30-0x78($key),$rndkey1
468___
# Emit four more AES rounds, one per loop iteration.  Iteration $i runs one
# aesdec round over all four lanes with $rndkey0 (even $i) or $rndkey1
# (odd $i), then reloads that register with the round key at
# 0x40+16*$i-0x78($key).  Interleaved with the round is the end-of-stream
# handling for lane $i: the constant 1 in $one is compared with the lane's
# counter at 32+4*$i(%rsp); if the counter is <= 1 the lane's input pointer
# is replaced by $sink, and if it is < 1 the output pointer is replaced too.
469for($i=0;$i<4;$i++) {
470my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
471$code.=<<___;
472 cmp `32+4*$i`(%rsp),$one
473 aesdec $rndkey,@out[0]
474 aesdec $rndkey,@out[1]
b7838586 475 aesdec $rndkey,@out[2]
e2eabed1 476 cmovge $sink,@inptr[$i] # cancel input
b7838586
AP
477 cmovg $sink,@outptr[$i] # sink output
478 aesdec $rndkey,@out[3]
479 movups `0x40+16*$i-0x78`($key),$rndkey
480___
481}
# Remainder of the .Loop_dec4x body and the loop control.
#
# - Output prefetches are interleaved with the next round.
# - Counter update: $mask is set to all-ones in every dword where the
#   counter is > 0 (pcmpgtd against a zeroed $zero) and added to $counters,
#   i.e. every positive counter is decremented by one; the result is written
#   back to 32(%rsp).  $zero is reloaded with the round-0 key afterwards.
# - $rounds is compared with 11 once; the flags are consumed later by jb
#   (below 11: go to the tail) and je (equal: go to the tail after two more
#   rounds).  Otherwise four more rounds are run before the tail.
# - .Ldec4x_tail xors the last round key ($rndkey0) into the IVs held in
#   @inp[0..3] and uses the result as the aesdeclast key operand, so the
#   final round and the CBC xor with the IV happen in one instruction.
#   The ciphertext block just consumed, at -16(@inptr,$offset), becomes the
#   next IV; the plaintext is stored at -16(@outptr,$offset); and the next
#   ciphertext block is loaded into @out and xored with the round-0 key.
# - The inner loop runs $num times.  Afterwards the original %rsp is
#   reloaded into %rax and the saved $num into $num, $inp advances by four
#   descriptors (40*4 bytes) and the outer loop repeats while --$num != 0.
482$code.=<<___;
483 movdqa $counters,$mask
484 aesdec $rndkey0,@out[0]
3847d15d
AP
485 prefetcht0 15(@outptr[0],$offset) # prefetch output
486 prefetcht0 15(@outptr[1],$offset)
b7838586 487 aesdec $rndkey0,@out[1]
3847d15d
AP
488 prefetcht0 15(@outptr[2],$offset)
489 prefetcht0 15(@outptr[3],$offset)
b7838586
AP
490 aesdec $rndkey0,@out[2]
491 aesdec $rndkey0,@out[3]
492 movups 0x80-0x78($key),$rndkey0
493 pxor $zero,$zero
494
495 aesdec $rndkey1,@out[0]
496 pcmpgtd $zero,$mask
497 movdqu -0x78($key),$zero # reload 0-round key
498 aesdec $rndkey1,@out[1]
499 paddd $mask,$counters # decrement counters
500 movdqa $counters,32(%rsp) # update counters
501 aesdec $rndkey1,@out[2]
502 aesdec $rndkey1,@out[3]
503 movups 0x90-0x78($key),$rndkey1
504
505 cmp \$11,$rounds
506
507 aesdec $rndkey0,@out[0]
508 aesdec $rndkey0,@out[1]
509 aesdec $rndkey0,@out[2]
510 aesdec $rndkey0,@out[3]
511 movups 0xa0-0x78($key),$rndkey0
512
513 jb .Ldec4x_tail
514
515 aesdec $rndkey1,@out[0]
516 aesdec $rndkey1,@out[1]
517 aesdec $rndkey1,@out[2]
518 aesdec $rndkey1,@out[3]
519 movups 0xb0-0x78($key),$rndkey1
520
521 aesdec $rndkey0,@out[0]
522 aesdec $rndkey0,@out[1]
523 aesdec $rndkey0,@out[2]
524 aesdec $rndkey0,@out[3]
525 movups 0xc0-0x78($key),$rndkey0
526
527 je .Ldec4x_tail
528
529 aesdec $rndkey1,@out[0]
530 aesdec $rndkey1,@out[1]
531 aesdec $rndkey1,@out[2]
532 aesdec $rndkey1,@out[3]
533 movups 0xd0-0x78($key),$rndkey1
534
535 aesdec $rndkey0,@out[0]
536 aesdec $rndkey0,@out[1]
537 aesdec $rndkey0,@out[2]
538 aesdec $rndkey0,@out[3]
539 movups 0xe0-0x78($key),$rndkey0
3847d15d 540 jmp .Ldec4x_tail
b7838586 541
3847d15d 542.align 32
b7838586
AP
543.Ldec4x_tail:
544 aesdec $rndkey1,@out[0]
545 aesdec $rndkey1,@out[1]
546 aesdec $rndkey1,@out[2]
547 pxor $rndkey0,@inp[0]
548 pxor $rndkey0,@inp[1]
549 aesdec $rndkey1,@out[3]
550 movdqu 0x10-0x78($key),$rndkey1
551 pxor $rndkey0,@inp[2]
552 pxor $rndkey0,@inp[3]
553 movdqu 0x20-0x78($key),$rndkey0
554
555 aesdeclast @inp[0],@out[0]
b7838586 556 aesdeclast @inp[1],@out[1]
3847d15d 557 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
b7838586
AP
558 movdqu -16(@inptr[1],$offset),@inp[1]
559 aesdeclast @inp[2],@out[2]
b7838586 560 aesdeclast @inp[3],@out[3]
3847d15d 561 movdqu -16(@inptr[2],$offset),@inp[2]
b7838586
AP
562 movdqu -16(@inptr[3],$offset),@inp[3]
563
564 movups @out[0],-16(@outptr[0],$offset)
565 movdqu (@inptr[0],$offset),@out[0]
566 movups @out[1],-16(@outptr[1],$offset)
567 movdqu (@inptr[1],$offset),@out[1]
568 pxor $zero,@out[0]
569 movups @out[2],-16(@outptr[2],$offset)
570 movdqu (@inptr[2],$offset),@out[2]
571 pxor $zero,@out[1]
572 movups @out[3],-16(@outptr[3],$offset)
573 movdqu (@inptr[3],$offset),@out[3]
574 pxor $zero,@out[2]
575 pxor $zero,@out[3]
576
577 dec $num
578 jnz .Loop_dec4x
579
580 mov 16(%rsp),%rax # original %rsp
581 mov 24(%rsp),$num
582
583 lea `40*4`($inp),$inp
584 dec $num
585 jnz .Ldec4x_loop_grande
586
587.Ldec4x_done:
588___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp held in %rax.  The reloads of %xmm13-%xmm15 are
# commented out in the emitted code.
589$code.=<<___ if ($win64);
e2eabed1
AP
590 movaps -0xd8(%rax),%xmm6
591 movaps -0xc8(%rax),%xmm7
592 movaps -0xb8(%rax),%xmm8
593 movaps -0xa8(%rax),%xmm9
594 movaps -0x98(%rax),%xmm10
595 movaps -0x88(%rax),%xmm11
596 movaps -0x78(%rax),%xmm12
597 #movaps -0x68(%rax),%xmm13
598 #movaps -0x58(%rax),%xmm14
599 #movaps -0x48(%rax),%xmm15
b7838586
AP
600___
# Epilogue of aesni_multi_cbc_decrypt: reload %r15..%rbx from the slots the
# prologue pushed them to (-48..-8 relative to the original %rsp in %rax),
# restore %rsp and return.
601$code.=<<___;
602 mov -48(%rax),%r15
603 mov -40(%rax),%r14
604 mov -32(%rax),%r13
605 mov -24(%rax),%r12
606 mov -16(%rax),%rbp
607 mov -8(%rax),%rbx
608 lea (%rax),%rsp
e2eabed1 609.Ldec4x_epilogue:
b7838586
AP
610 ret
611.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
612___
613
614 if ($avx) {{{
615my @ptr=map("%r$_",(8..15));
616my $offload=$sink;
617
618my @out=map("%xmm$_",(2..9));
619my @inp=map("%xmm$_",(10..13));
620my ($counters,$zero)=("%xmm14","%xmm15");
621
622$code.=<<___;
623.type aesni_multi_cbc_encrypt_avx,\@function,3
624.align 32
625aesni_multi_cbc_encrypt_avx:
626_avx_cbc_enc_shortcut:
627 mov %rsp,%rax
628 push %rbx
629 push %rbp
630 push %r12
631 push %r13
632 push %r14
633 push %r15
634___
635$code.=<<___ if ($win64);
636 lea -0xa8(%rsp),%rsp
637 movaps %xmm6,(%rsp)
638 movaps %xmm7,0x10(%rsp)
639 movaps %xmm8,0x20(%rsp)
640 movaps %xmm9,0x30(%rsp)
641 movaps %xmm10,0x40(%rsp)
642 movaps %xmm11,0x50(%rsp)
643 movaps %xmm12,-0x78(%rax)
644 movaps %xmm13,-0x68(%rax)
645 movaps %xmm14,-0x58(%rax)
646 movaps %xmm15,-0x48(%rax)
647___
648$code.=<<___;
649 # stack layout
650 #
651 # +0 output sink
652 # +16 input sink [original %rsp and $num]
653 # +32 counters
654 # +64 distances between inputs and outputs
655 # +128 off-load area for @inp[0..3]
656
657 sub \$192,%rsp
658 and \$-128,%rsp
659 mov %rax,16(%rsp) # original %rsp
660
661.Lenc8x_body:
662 vzeroupper
663 vmovdqu ($key),$zero # 0-round key
664 lea 0x78($key),$key # size optimization
665 lea 40*4($inp),$inp
666 shr \$1,$num
667
668.Lenc8x_loop_grande:
669 #mov $num,24(%rsp) # original $num
670 xor $num,$num
671___
672for($i=0;$i<8;$i++) {
673 my $temp = $i ? $offload : $offset;
674 $code.=<<___;
675 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
676 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
677 cmp $num,$one
678 mov `40*$i+8-40*4`($inp),$temp # output pointer
679 cmovg $one,$num # find maximum
680 test $one,$one
681 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
682 mov $one,`32+4*$i`(%rsp) # initialize counters
683 cmovle %rsp,@ptr[$i] # cancel input
684 sub @ptr[$i],$temp # distance between input and output
685 mov $temp,`64+8*$i`(%rsp) # initialize distances
686___
687}
688$code.=<<___;
689 test $num,$num
690 jz .Lenc8x_done
691
692 vmovups 0x10-0x78($key),$rndkey1
693 vmovups 0x20-0x78($key),$rndkey0
694 mov 0xf0-0x78($key),$rounds
695
696 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
697 lea 128(%rsp),$offload # offload area
698 vpxor (@ptr[1]),$zero,@inp[1]
699 vpxor (@ptr[2]),$zero,@inp[2]
700 vpxor (@ptr[3]),$zero,@inp[3]
701 vpxor @inp[0],@out[0],@out[0]
702 vpxor (@ptr[4]),$zero,@inp[0]
703 vpxor @inp[1],@out[1],@out[1]
704 vpxor (@ptr[5]),$zero,@inp[1]
705 vpxor @inp[2],@out[2],@out[2]
706 vpxor (@ptr[6]),$zero,@inp[2]
707 vpxor @inp[3],@out[3],@out[3]
708 vpxor (@ptr[7]),$zero,@inp[3]
709 vpxor @inp[0],@out[4],@out[4]
710 mov \$1,$one # constant of 1
711 vpxor @inp[1],@out[5],@out[5]
712 vpxor @inp[2],@out[6],@out[6]
713 vpxor @inp[3],@out[7],@out[7]
714 jmp .Loop_enc8x
715
716.align 32
717.Loop_enc8x:
718___
719for($i=0;$i<8;$i++) {
720my $rndkey=($i&1)?$rndkey0:$rndkey1;
721$code.=<<___;
722 vaesenc $rndkey,@out[0],@out[0]
723 cmp 32+4*$i(%rsp),$one
724___
725$code.=<<___ if ($i);
726 mov 64+8*$i(%rsp),$offset
727___
728$code.=<<___;
729 vaesenc $rndkey,@out[1],@out[1]
3847d15d 730 prefetcht0 31(@ptr[$i]) # prefetch input
b7838586 731 vaesenc $rndkey,@out[2],@out[2]
3847d15d
AP
732___
733$code.=<<___ if ($i>1);
734 prefetcht0 15(@ptr[$i-2]) # prefetch output
735___
736$code.=<<___;
b7838586
AP
737 vaesenc $rndkey,@out[3],@out[3]
738 lea (@ptr[$i],$offset),$offset
739 cmovge %rsp,@ptr[$i] # cancel input
740 vaesenc $rndkey,@out[4],@out[4]
741 cmovg %rsp,$offset # sink output
742 vaesenc $rndkey,@out[5],@out[5]
743 sub @ptr[$i],$offset
744 vaesenc $rndkey,@out[6],@out[6]
745 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
746 mov $offset,64+8*$i(%rsp)
747 vaesenc $rndkey,@out[7],@out[7]
748 vmovups `16*(3+$i)-0x78`($key),$rndkey
749 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
750___
751$code.=<<___ if ($i<4)
752 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
753___
754}
755$code.=<<___;
756 vmovdqu 32(%rsp),$counters
3847d15d
AP
757 prefetcht0 15(@ptr[$i-2]) # prefetch output
758 prefetcht0 15(@ptr[$i-1])
b7838586
AP
759 cmp \$11,$rounds
760 jb .Lenc8x_tail
761
762 vaesenc $rndkey1,@out[0],@out[0]
763 vaesenc $rndkey1,@out[1],@out[1]
764 vaesenc $rndkey1,@out[2],@out[2]
765 vaesenc $rndkey1,@out[3],@out[3]
766 vaesenc $rndkey1,@out[4],@out[4]
767 vaesenc $rndkey1,@out[5],@out[5]
768 vaesenc $rndkey1,@out[6],@out[6]
769 vaesenc $rndkey1,@out[7],@out[7]
770 vmovups 0xb0-0x78($key),$rndkey1
771
772 vaesenc $rndkey0,@out[0],@out[0]
773 vaesenc $rndkey0,@out[1],@out[1]
774 vaesenc $rndkey0,@out[2],@out[2]
775 vaesenc $rndkey0,@out[3],@out[3]
776 vaesenc $rndkey0,@out[4],@out[4]
777 vaesenc $rndkey0,@out[5],@out[5]
778 vaesenc $rndkey0,@out[6],@out[6]
779 vaesenc $rndkey0,@out[7],@out[7]
780 vmovups 0xc0-0x78($key),$rndkey0
781 je .Lenc8x_tail
782
783 vaesenc $rndkey1,@out[0],@out[0]
784 vaesenc $rndkey1,@out[1],@out[1]
785 vaesenc $rndkey1,@out[2],@out[2]
786 vaesenc $rndkey1,@out[3],@out[3]
787 vaesenc $rndkey1,@out[4],@out[4]
788 vaesenc $rndkey1,@out[5],@out[5]
789 vaesenc $rndkey1,@out[6],@out[6]
790 vaesenc $rndkey1,@out[7],@out[7]
791 vmovups 0xd0-0x78($key),$rndkey1
792
793 vaesenc $rndkey0,@out[0],@out[0]
794 vaesenc $rndkey0,@out[1],@out[1]
795 vaesenc $rndkey0,@out[2],@out[2]
796 vaesenc $rndkey0,@out[3],@out[3]
797 vaesenc $rndkey0,@out[4],@out[4]
798 vaesenc $rndkey0,@out[5],@out[5]
799 vaesenc $rndkey0,@out[6],@out[6]
800 vaesenc $rndkey0,@out[7],@out[7]
801 vmovups 0xe0-0x78($key),$rndkey0
802
803.Lenc8x_tail:
804 vaesenc $rndkey1,@out[0],@out[0]
805 vpxor $zero,$zero,$zero
806 vaesenc $rndkey1,@out[1],@out[1]
807 vaesenc $rndkey1,@out[2],@out[2]
808 vpcmpgtd $zero,$counters,$zero
809 vaesenc $rndkey1,@out[3],@out[3]
810 vaesenc $rndkey1,@out[4],@out[4]
811 vpaddd $counters,$zero,$zero # decrement counters
812 vmovdqu 48(%rsp),$counters
813 vaesenc $rndkey1,@out[5],@out[5]
814 mov 64(%rsp),$offset # pre-load 1st offset
815 vaesenc $rndkey1,@out[6],@out[6]
816 vaesenc $rndkey1,@out[7],@out[7]
817 vmovups 0x10-0x78($key),$rndkey1
818
819 vaesenclast $rndkey0,@out[0],@out[0]
820 vmovdqa $zero,32(%rsp) # update counters
821 vpxor $zero,$zero,$zero
822 vaesenclast $rndkey0,@out[1],@out[1]
823 vaesenclast $rndkey0,@out[2],@out[2]
824 vpcmpgtd $zero,$counters,$zero
825 vaesenclast $rndkey0,@out[3],@out[3]
826 vaesenclast $rndkey0,@out[4],@out[4]
827 vpaddd $zero,$counters,$counters # decrement counters
828 vmovdqu -0x78($key),$zero # 0-round
829 vaesenclast $rndkey0,@out[5],@out[5]
830 vaesenclast $rndkey0,@out[6],@out[6]
831 vmovdqa $counters,48(%rsp) # update counters
832 vaesenclast $rndkey0,@out[7],@out[7]
833 vmovups 0x20-0x78($key),$rndkey0
834
835 vmovups @out[0],-16(@ptr[0]) # write output
836 sub $offset,@ptr[0] # switch to input
837 vpxor 0x00($offload),@out[0],@out[0]
838 vmovups @out[1],-16(@ptr[1])
839 sub `64+1*8`(%rsp),@ptr[1]
840 vpxor 0x10($offload),@out[1],@out[1]
841 vmovups @out[2],-16(@ptr[2])
842 sub `64+2*8`(%rsp),@ptr[2]
843 vpxor 0x20($offload),@out[2],@out[2]
844 vmovups @out[3],-16(@ptr[3])
845 sub `64+3*8`(%rsp),@ptr[3]
846 vpxor 0x30($offload),@out[3],@out[3]
847 vmovups @out[4],-16(@ptr[4])
848 sub `64+4*8`(%rsp),@ptr[4]
849 vpxor @inp[0],@out[4],@out[4]
850 vmovups @out[5],-16(@ptr[5])
851 sub `64+5*8`(%rsp),@ptr[5]
852 vpxor @inp[1],@out[5],@out[5]
853 vmovups @out[6],-16(@ptr[6])
854 sub `64+6*8`(%rsp),@ptr[6]
855 vpxor @inp[2],@out[6],@out[6]
856 vmovups @out[7],-16(@ptr[7])
857 sub `64+7*8`(%rsp),@ptr[7]
858 vpxor @inp[3],@out[7],@out[7]
859
860 dec $num
861 jnz .Loop_enc8x
862
863 mov 16(%rsp),%rax # original %rsp
864 #mov 24(%rsp),$num
865 #lea `40*8`($inp),$inp
866 #dec $num
867 #jnz .Lenc8x_loop_grande
868
869.Lenc8x_done:
870 vzeroupper
871___
872$code.=<<___ if ($win64);
873 movaps -0xd8(%rax),%xmm6
874 movaps -0xc8(%rax),%xmm7
875 movaps -0xb8(%rax),%xmm8
876 movaps -0xa8(%rax),%xmm9
877 movaps -0x98(%rax),%xmm10
878 movaps -0x88(%rax),%xmm11
879 movaps -0x78(%rax),%xmm12
880 movaps -0x68(%rax),%xmm13
881 movaps -0x58(%rax),%xmm14
882 movaps -0x48(%rax),%xmm15
883___
884$code.=<<___;
885 mov -48(%rax),%r15
886 mov -40(%rax),%r14
887 mov -32(%rax),%r13
888 mov -24(%rax),%r12
889 mov -16(%rax),%rbp
890 mov -8(%rax),%rbx
891 lea (%rax),%rsp
e2eabed1 892.Lenc8x_epilogue:
b7838586
AP
893 ret
894.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
895
896.type aesni_multi_cbc_decrypt_avx,\@function,3
897.align 32
898aesni_multi_cbc_decrypt_avx:
899_avx_cbc_dec_shortcut:
900 mov %rsp,%rax
901 push %rbx
902 push %rbp
903 push %r12
904 push %r13
905 push %r14
906 push %r15
907___
908$code.=<<___ if ($win64);
909 lea -0xa8(%rsp),%rsp
910 movaps %xmm6,(%rsp)
911 movaps %xmm7,0x10(%rsp)
912 movaps %xmm8,0x20(%rsp)
913 movaps %xmm9,0x30(%rsp)
914 movaps %xmm10,0x40(%rsp)
915 movaps %xmm11,0x50(%rsp)
916 movaps %xmm12,-0x78(%rax)
917 movaps %xmm13,-0x68(%rax)
918 movaps %xmm14,-0x58(%rax)
919 movaps %xmm15,-0x48(%rax)
920___
921$code.=<<___;
922 # stack layout
923 #
924 # +0 output sink
925 # +16 input sink [original %rsp and $num]
926 # +32 counters
927 # +64 distances between inputs and outputs
928 # +128 off-load area for @inp[0..3]
929 # +192 IV/input offload
930
931 sub \$256,%rsp
932 and \$-256,%rsp
933 sub \$192,%rsp
934 mov %rax,16(%rsp) # original %rsp
935
936.Ldec8x_body:
937 vzeroupper
938 vmovdqu ($key),$zero # 0-round key
939 lea 0x78($key),$key # size optimization
940 lea 40*4($inp),$inp
941 shr \$1,$num
942
943.Ldec8x_loop_grande:
944 #mov $num,24(%rsp) # original $num
945 xor $num,$num
946___
947for($i=0;$i<8;$i++) {
948 my $temp = $i ? $offload : $offset;
949 $code.=<<___;
950 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
951 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
952 cmp $num,$one
953 mov `40*$i+8-40*4`($inp),$temp # output pointer
954 cmovg $one,$num # find maximum
955 test $one,$one
956 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
957 mov $one,`32+4*$i`(%rsp) # initialize counters
958 cmovle %rsp,@ptr[$i] # cancel input
959 sub @ptr[$i],$temp # distance between input and output
960 mov $temp,`64+8*$i`(%rsp) # initialize distances
961 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
962___
963}
964$code.=<<___;
965 test $num,$num
966 jz .Ldec8x_done
967
968 vmovups 0x10-0x78($key),$rndkey1
969 vmovups 0x20-0x78($key),$rndkey0
970 mov 0xf0-0x78($key),$rounds
971 lea 192+128(%rsp),$offload # offload area
972
973 vmovdqu (@ptr[0]),@out[0] # load inputs
974 vmovdqu (@ptr[1]),@out[1]
975 vmovdqu (@ptr[2]),@out[2]
976 vmovdqu (@ptr[3]),@out[3]
977 vmovdqu (@ptr[4]),@out[4]
978 vmovdqu (@ptr[5]),@out[5]
979 vmovdqu (@ptr[6]),@out[6]
980 vmovdqu (@ptr[7]),@out[7]
981 vmovdqu @out[0],0x00($offload) # offload inputs
982 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
983 vmovdqu @out[1],0x10($offload)
984 vpxor $zero,@out[1],@out[1]
985 vmovdqu @out[2],0x20($offload)
986 vpxor $zero,@out[2],@out[2]
987 vmovdqu @out[3],0x30($offload)
988 vpxor $zero,@out[3],@out[3]
989 vmovdqu @out[4],0x40($offload)
990 vpxor $zero,@out[4],@out[4]
991 vmovdqu @out[5],0x50($offload)
992 vpxor $zero,@out[5],@out[5]
993 vmovdqu @out[6],0x60($offload)
994 vpxor $zero,@out[6],@out[6]
995 vmovdqu @out[7],0x70($offload)
996 vpxor $zero,@out[7],@out[7]
997 xor \$0x80,$offload
998 mov \$1,$one # constant of 1
999 jmp .Loop_dec8x
1000
1001.align 32
1002.Loop_dec8x:
1003___
1004for($i=0;$i<8;$i++) {
1005my $rndkey=($i&1)?$rndkey0:$rndkey1;
1006$code.=<<___;
1007 vaesdec $rndkey,@out[0],@out[0]
1008 cmp 32+4*$i(%rsp),$one
1009___
1010$code.=<<___ if ($i);
1011 mov 64+8*$i(%rsp),$offset
1012___
1013$code.=<<___;
1014 vaesdec $rndkey,@out[1],@out[1]
3847d15d 1015 prefetcht0 31(@ptr[$i]) # prefetch input
b7838586 1016 vaesdec $rndkey,@out[2],@out[2]
3847d15d
AP
1017___
1018$code.=<<___ if ($i>1);
1019 prefetcht0 15(@ptr[$i-2]) # prefetch output
1020___
1021$code.=<<___;
b7838586
AP
1022 vaesdec $rndkey,@out[3],@out[3]
1023 lea (@ptr[$i],$offset),$offset
1024 cmovge %rsp,@ptr[$i] # cancel input
1025 vaesdec $rndkey,@out[4],@out[4]
1026 cmovg %rsp,$offset # sink output
1027 vaesdec $rndkey,@out[5],@out[5]
1028 sub @ptr[$i],$offset
1029 vaesdec $rndkey,@out[6],@out[6]
1030 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1031 mov $offset,64+8*$i(%rsp)
1032 vaesdec $rndkey,@out[7],@out[7]
1033 vmovups `16*(3+$i)-0x78`($key),$rndkey
1034 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1035___
1036$code.=<<___ if ($i<4);
1037 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1038___
1039}
1040$code.=<<___;
1041 vmovdqu 32(%rsp),$counters
3847d15d
AP
1042 prefetcht0 15(@ptr[$i-2]) # prefetch output
1043 prefetcht0 15(@ptr[$i-1])
b7838586
AP
1044 cmp \$11,$rounds
1045 jb .Ldec8x_tail
1046
1047 vaesdec $rndkey1,@out[0],@out[0]
1048 vaesdec $rndkey1,@out[1],@out[1]
1049 vaesdec $rndkey1,@out[2],@out[2]
1050 vaesdec $rndkey1,@out[3],@out[3]
1051 vaesdec $rndkey1,@out[4],@out[4]
1052 vaesdec $rndkey1,@out[5],@out[5]
1053 vaesdec $rndkey1,@out[6],@out[6]
1054 vaesdec $rndkey1,@out[7],@out[7]
1055 vmovups 0xb0-0x78($key),$rndkey1
1056
1057 vaesdec $rndkey0,@out[0],@out[0]
1058 vaesdec $rndkey0,@out[1],@out[1]
1059 vaesdec $rndkey0,@out[2],@out[2]
1060 vaesdec $rndkey0,@out[3],@out[3]
1061 vaesdec $rndkey0,@out[4],@out[4]
1062 vaesdec $rndkey0,@out[5],@out[5]
1063 vaesdec $rndkey0,@out[6],@out[6]
1064 vaesdec $rndkey0,@out[7],@out[7]
1065 vmovups 0xc0-0x78($key),$rndkey0
1066 je .Ldec8x_tail
1067
1068 vaesdec $rndkey1,@out[0],@out[0]
1069 vaesdec $rndkey1,@out[1],@out[1]
1070 vaesdec $rndkey1,@out[2],@out[2]
1071 vaesdec $rndkey1,@out[3],@out[3]
1072 vaesdec $rndkey1,@out[4],@out[4]
1073 vaesdec $rndkey1,@out[5],@out[5]
1074 vaesdec $rndkey1,@out[6],@out[6]
1075 vaesdec $rndkey1,@out[7],@out[7]
1076 vmovups 0xd0-0x78($key),$rndkey1
1077
1078 vaesdec $rndkey0,@out[0],@out[0]
1079 vaesdec $rndkey0,@out[1],@out[1]
1080 vaesdec $rndkey0,@out[2],@out[2]
1081 vaesdec $rndkey0,@out[3],@out[3]
1082 vaesdec $rndkey0,@out[4],@out[4]
1083 vaesdec $rndkey0,@out[5],@out[5]
1084 vaesdec $rndkey0,@out[6],@out[6]
1085 vaesdec $rndkey0,@out[7],@out[7]
1086 vmovups 0xe0-0x78($key),$rndkey0
1087
1088.Ldec8x_tail:
1089 vaesdec $rndkey1,@out[0],@out[0]
1090 vpxor $zero,$zero,$zero
1091 vaesdec $rndkey1,@out[1],@out[1]
1092 vaesdec $rndkey1,@out[2],@out[2]
1093 vpcmpgtd $zero,$counters,$zero
1094 vaesdec $rndkey1,@out[3],@out[3]
1095 vaesdec $rndkey1,@out[4],@out[4]
1096 vpaddd $counters,$zero,$zero # decrement counters
1097 vmovdqu 48(%rsp),$counters
1098 vaesdec $rndkey1,@out[5],@out[5]
1099 mov 64(%rsp),$offset # pre-load 1st offset
1100 vaesdec $rndkey1,@out[6],@out[6]
1101 vaesdec $rndkey1,@out[7],@out[7]
1102 vmovups 0x10-0x78($key),$rndkey1
1103
1104 vaesdeclast $rndkey0,@out[0],@out[0]
1105 vmovdqa $zero,32(%rsp) # update counters
1106 vpxor $zero,$zero,$zero
1107 vaesdeclast $rndkey0,@out[1],@out[1]
1108 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1109 vaesdeclast $rndkey0,@out[2],@out[2]
1110 vpxor 0x10($offload),@out[1],@out[1]
1111 vpcmpgtd $zero,$counters,$zero
1112 vaesdeclast $rndkey0,@out[3],@out[3]
1113 vpxor 0x20($offload),@out[2],@out[2]
1114 vaesdeclast $rndkey0,@out[4],@out[4]
1115 vpxor 0x30($offload),@out[3],@out[3]
1116 vpaddd $zero,$counters,$counters # decrement counters
1117 vmovdqu -0x78($key),$zero # 0-round
1118 vaesdeclast $rndkey0,@out[5],@out[5]
1119 vpxor 0x40($offload),@out[4],@out[4]
1120 vaesdeclast $rndkey0,@out[6],@out[6]
1121 vpxor 0x50($offload),@out[5],@out[5]
1122 vmovdqa $counters,48(%rsp) # update counters
1123 vaesdeclast $rndkey0,@out[7],@out[7]
1124 vpxor 0x60($offload),@out[6],@out[6]
1125 vmovups 0x20-0x78($key),$rndkey0
1126
1127 vmovups @out[0],-16(@ptr[0]) # write output
1128 sub $offset,@ptr[0] # switch to input
1129 vmovdqu 128+0(%rsp),@out[0]
1130 vpxor 0x70($offload),@out[7],@out[7]
1131 vmovups @out[1],-16(@ptr[1])
1132 sub `64+1*8`(%rsp),@ptr[1]
1133 vmovdqu @out[0],0x00($offload)
1134 vpxor $zero,@out[0],@out[0]
1135 vmovdqu 128+16(%rsp),@out[1]
1136 vmovups @out[2],-16(@ptr[2])
1137 sub `64+2*8`(%rsp),@ptr[2]
1138 vmovdqu @out[1],0x10($offload)
1139 vpxor $zero,@out[1],@out[1]
1140 vmovdqu 128+32(%rsp),@out[2]
1141 vmovups @out[3],-16(@ptr[3])
1142 sub `64+3*8`(%rsp),@ptr[3]
1143 vmovdqu @out[2],0x20($offload)
1144 vpxor $zero,@out[2],@out[2]
1145 vmovdqu 128+48(%rsp),@out[3]
1146 vmovups @out[4],-16(@ptr[4])
1147 sub `64+4*8`(%rsp),@ptr[4]
1148 vmovdqu @out[3],0x30($offload)
1149 vpxor $zero,@out[3],@out[3]
1150 vmovdqu @inp[0],0x40($offload)
1151 vpxor @inp[0],$zero,@out[4]
1152 vmovups @out[5],-16(@ptr[5])
1153 sub `64+5*8`(%rsp),@ptr[5]
1154 vmovdqu @inp[1],0x50($offload)
1155 vpxor @inp[1],$zero,@out[5]
1156 vmovups @out[6],-16(@ptr[6])
1157 sub `64+6*8`(%rsp),@ptr[6]
1158 vmovdqu @inp[2],0x60($offload)
1159 vpxor @inp[2],$zero,@out[6]
1160 vmovups @out[7],-16(@ptr[7])
1161 sub `64+7*8`(%rsp),@ptr[7]
1162 vmovdqu @inp[3],0x70($offload)
1163 vpxor @inp[3],$zero,@out[7]
1164
1165 xor \$128,$offload
1166 dec $num
1167 jnz .Loop_dec8x
1168
1169 mov 16(%rsp),%rax # original %rsp
1170 #mov 24(%rsp),$num
1171 #lea `40*8`($inp),$inp
1172 #dec $num
1173 #jnz .Ldec8x_loop_grande
1174
1175.Ldec8x_done:
1176 vzeroupper
1177___
1178$code.=<<___ if ($win64);
1179 movaps -0xd8(%rax),%xmm6
1180 movaps -0xc8(%rax),%xmm7
1181 movaps -0xb8(%rax),%xmm8
1182 movaps -0xa8(%rax),%xmm9
1183 movaps -0x98(%rax),%xmm10
1184 movaps -0x88(%rax),%xmm11
1185 movaps -0x78(%rax),%xmm12
1186 movaps -0x68(%rax),%xmm13
1187 movaps -0x58(%rax),%xmm14
1188 movaps -0x48(%rax),%xmm15
1189___
1190$code.=<<___;
1191 mov -48(%rax),%r15
1192 mov -40(%rax),%r14
1193 mov -32(%rax),%r13
1194 mov -24(%rax),%r12
1195 mov -16(%rax),%rbp
1196 mov -8(%rax),%rbx
1197 lea (%rax),%rsp
e2eabed1 1198.Ldec8x_epilogue:
b7838586
AP
1199 ret
1200.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1201___
1202 }}}
1203
e2eabed1
AP
# Win64-only structured-exception-handling support.  Nothing in this
# block is emitted unless $win64 is true.
#
# The first heredoc emits se_handler, a language-specific handler whose
# four arguments arrive in the registers named by $rec, $frame, $context
# and $disp below.  The handler:
#   - reads two image-relative label offsets from disp->HandlerData
#     (the ".L*_body" and ".L*_epilogue" pairs listed in .xdata below)
#     and compares context->Rip against them;
#   - when Rip lies between the two labels, fetches the saved stack
#     pointer from 16(%rax), copies the saved %rbx, %rbp and %r12-%r15
#     back into the CONTEXT record, and copies 20 quadwords starting at
#     -56-10*16 from that pointer to offset 512 of the CONTEXT record;
#   - in every case stores Rsp/Rsi/Rdi into the CONTEXT record, copies
#     the record to disp->ContextRecord, calls RtlVirtualUnwind and
#     returns 1 (ExceptionContinueSearch).
#
# The same heredoc then opens the .pdata section with one
# begin/end/info triplet per non-AVX entry point.
1204if ($win64) {
1205# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1206# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1207$rec="%rcx";
1208$frame="%rdx";
1209$context="%r8";
1210$disp="%r9";
1211
1212$code.=<<___;
1213.extern __imp_RtlVirtualUnwind
1214.type se_handler,\@abi-omnipotent
1215.align 16
1216se_handler:
1217 push %rsi
1218 push %rdi
1219 push %rbx
1220 push %rbp
1221 push %r12
1222 push %r13
1223 push %r14
1224 push %r15
1225 pushfq
1226 sub \$64,%rsp
1227
1228 mov 120($context),%rax # pull context->Rax
1229 mov 248($context),%rbx # pull context->Rip
1230
1231 mov 8($disp),%rsi # disp->ImageBase
1232 mov 56($disp),%r11 # disp->HandlerData
1233
1234 mov 0(%r11),%r10d # HandlerData[0]
1235 lea (%rsi,%r10),%r10 # prologue label
1236 cmp %r10,%rbx # context->Rip<.Lprologue
1237 jb .Lin_prologue
1238
1239 mov 152($context),%rax # pull context->Rsp
1240
1241 mov 4(%r11),%r10d # HandlerData[1]
1242 lea (%rsi,%r10),%r10 # epilogue label
1243 cmp %r10,%rbx # context->Rip>=.Lepilogue
1244 jae .Lin_prologue
1245
1246 mov 16(%rax),%rax # pull saved stack pointer
1247
1248 mov -8(%rax),%rbx
1249 mov -16(%rax),%rbp
1250 mov -24(%rax),%r12
1251 mov -32(%rax),%r13
1252 mov -40(%rax),%r14
1253 mov -48(%rax),%r15
1254 mov %rbx,144($context) # restore context->Rbx
1255 mov %rbp,160($context) # restore context->Rbp
1256 mov %r12,216($context) # restore cotnext->R12
1257 mov %r13,224($context) # restore cotnext->R13
1258 mov %r14,232($context) # restore cotnext->R14
1259 mov %r15,240($context) # restore cotnext->R15
1260
1261 lea -56-10*16(%rax),%rsi
1262 lea 512($context),%rdi # &context.Xmm6
1263 mov \$20,%ecx
1264 .long 0xa548f3fc # cld; rep movsq
1265
1266.Lin_prologue:
1267 mov 8(%rax),%rdi
1268 mov 16(%rax),%rsi
1269 mov %rax,152($context) # restore context->Rsp
1270 mov %rsi,168($context) # restore context->Rsi
1271 mov %rdi,176($context) # restore context->Rdi
1272
1273 mov 40($disp),%rdi # disp->ContextRecord
1274 mov $context,%rsi # context
1275 mov \$154,%ecx # sizeof(CONTEXT)
1276 .long 0xa548f3fc # cld; rep movsq
1277
1278 mov $disp,%rsi
1279 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1280 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1281 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1282 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1283 mov 40(%rsi),%r10 # disp->ContextRecord
1284 lea 56(%rsi),%r11 # &disp->HandlerData
1285 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1286 mov %r10,32(%rsp) # arg5
1287 mov %r11,40(%rsp) # arg6
1288 mov %r12,48(%rsp) # arg7
1289 mov %rcx,56(%rsp) # arg8, (NULL)
1290 call *__imp_RtlVirtualUnwind(%rip)
1291
1292 mov \$1,%eax # ExceptionContinueSearch
1293 add \$64,%rsp
1294 popfq
1295 pop %r15
1296 pop %r14
1297 pop %r13
1298 pop %r12
1299 pop %rbp
1300 pop %rbx
1301 pop %rdi
1302 pop %rsi
1303 ret
1304.size se_handler,.-se_handler
1305
1306.section .pdata
1307.align 4
1308 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1309 .rva .LSEH_end_aesni_multi_cbc_encrypt
1310 .rva .LSEH_info_aesni_multi_cbc_encrypt
1311 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1312 .rva .LSEH_end_aesni_multi_cbc_decrypt
1313 .rva .LSEH_info_aesni_multi_cbc_decrypt
1314___
# .pdata triplets for the *_avx entry points, emitted only when $avx is
# true.
1315$code.=<<___ if ($avx);
1316 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1317 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1318 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1319 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1320 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1321 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
1322___
# .xdata records referenced by the .pdata entries above.  Each record
# names se_handler and supplies the body/epilogue label pair that the
# handler reads as HandlerData[0] and HandlerData[1].
1323$code.=<<___;
1324.section .xdata
1325.align 8
1326.LSEH_info_aesni_multi_cbc_encrypt:
1327 .byte 9,0,0,0
1328 .rva se_handler
1329 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1330.LSEH_info_aesni_multi_cbc_decrypt:
1331 .byte 9,0,0,0
1332 .rva se_handler
1333 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1334___
# Matching .xdata records for the *_avx entry points, again only when
# $avx is true.
1335$code.=<<___ if ($avx);
1336.LSEH_info_aesni_multi_cbc_encrypt_avx:
1337 .byte 9,0,0,0
1338 .rva se_handler
1339 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1340.LSEH_info_aesni_multi_cbc_decrypt_avx:
1341 .byte 9,0,0,0
1342 .rva se_handler
1343 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1344___
1345}
1346####################################################################
1347
b7838586
AP
# Append a REX prefix byte to an opcode list when a register number of 8
# or above has to be encoded.
#
# Arguments:
#   1. reference to the opcode array, which is extended in place;
#   2. register number placed in the ModR/M "reg" field by the callers
#      (contributes REX.R, 0x04, when >= 8);
#   3. register number placed in the ModR/M "r/m" field by the callers
#      (contributes REX.B, 0x01, when >= 8).
#
# Nothing is appended when both register numbers are below 8.
sub rex {
  my ($bytes,$reg,$rm)=@_;
  my $bits=($reg>=8 ? 0x04 : 0)|($rm>=8 ? 0x01 : 0);

  push @{$bytes},0x40|$bits if($bits);
}
1357
# Translate one AES-NI instruction written in AT&T syntax into a raw
# ".byte" directive, so the output assembles even when the assembler
# does not know the AES-NI mnemonics.
#
# Argument: the instruction text (mnemonic and operands).
# Returns:  ".byte\t<b0>,<b1>,..." (decimal values) for the forms listed
#           below; in every other case the text is returned unchanged so
#           that the assembler sees the original instruction.
#
# Recognised forms:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aesimc|aesenc|aesenclast|aesdec|aesdeclast  %xmmS,%xmmD
#   aesenc|aesenclast|aesdec|aesdeclast         disp(%rsp),%xmmD
#
# Capture groups are copied into lexicals straight away: the later
# /^0/ test on the immediate/offset is itself a pattern match and would
# otherwise reset $1..$4.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# prefix common to all AES-NI opcodes

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# imm8 (hex/octal via oct)
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the text through rather than returning
	# undef, which would make the caller's s///e delete the line.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));
	# An omitted offset encodes as displacement 0.
	my $disp=($off eq "")?0:($off=~/^0/?oct($off):$off);
	# The encoding below carries a sign-extended 8-bit displacement;
	# a larger offset would wrap to a negative one, so leave such
	# instructions for the assembler to encode.
	return $line if ($disp>0x7f);
	push @opcode,0x44 if ($dst>=8);			# REX.R
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;		# ModR/M, SIB (%rsp base)
	push @opcode,$disp&0xff;			# disp8
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1397
# Post-process the accumulated assembly text, then write it out.
#
# First pass:  every `...` span is evaluated as a Perl expression and
#              replaced by its value (inline constant arithmetic).
# Second pass: each line fragment from a word-initial "aes" up to its
#              last %xmmN operand is handed to aesni(); the rest of the
#              line is dropped.  The \b anchor keeps "vaes..." mnemonics
#              out of this pass.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# Buffered output is only flushed here, so a write failure (full disk,
# closed pipe) shows up as a failed close.  Abort instead of leaving a
# truncated assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";