1 #! /usr/bin/env perl
2 # Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20 # details].
21 #
22 # Performance.
23 #
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
31 # decryption.
32 #
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
40 #
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
50 #
51 # Looking at the results for 8-KB buffer.
52 #
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
65 # limit.
66 #
67 # Looking at how results vary with buffer size.
68 #
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is the "slowest"-changing one,
72 # with the "256-byte" result being 87% of "8-KB." This is because CTR
73 # mode's overhead is the most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
76 #
77 # Results for 192- and 256-bit keys.
78 #
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87 # observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89 # January 2011
90 #
91 # While the Westmere processor features 6-cycle latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
96 #
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
116 #
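# To make that arithmetic concrete (a back-of-the-envelope check, not a
# measurement): the serial limit is latency * rounds / 16 bytes, so with
# a 128-bit key (10 rounds)
#
#	Westmere:	6 * 10 / 16 = 3.75 cycles per byte
#	Sandy Bridge:	8 * 10 / 16 = 5.00 cycles per byte
#
# which is where the 3.75 and 5.00 figures above come from.
#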
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
130 #
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
134 #
135 # "as if" interleave factor 4.7x 5.8x 6.0x
136 #
137 # Further data for other parallelizable modes:
138 #
139 # CBC decrypt 1.16 0.93 0.74
140 # CTR 1.14 0.91 0.74
141 #
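# For reference, the "theoretical asymptotic limit" row above is plain
# arithmetic, latency * rounds / (16 bytes * interleave factor): for
# Sandy Bridge with a 128-bit key 8*10/(16*3)=1.67, 8*10/(16*6)=0.83 and
# 8*10/(16*8)=0.625, while Westmere's 1.25 is 6*10/(16*3), 3x being the
# largest factor its issue rate can sustain.
#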
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim a disproportionately small amount
149 # of additional cycles, but in the 8x case the number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
154 #
155 # Higher interleave factors do have a negative impact on Westmere
156 # performance. While for ECB mode it's a negligible ~1.5%, other
157 # parallelizable modes perform ~5% worse, which is outweighed by a ~25%
158 # improvement on Sandy Bridge. To balance the regression on Westmere,
159 # CTR mode was implemented with a 6x aesenc interleave factor.
160
161 # April 2011
162 #
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167 # November 2015
168 #
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170 # chosen to be 6x.
171
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
175 #
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Knights L 2.54/0.77 0.78 0.85 - 1.50
183 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
184 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
185 # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
186 #
187 # (*) Atom Silvermont ECB result is suboptimal because of penalties
188 # incurred by operations on %xmm8-15. As ECB is not considered
189 # critical, nothing was done to mitigate the problem.
190
191 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
194
195 # $output is the last argument if it looks like a file (it has an extension)
196 # $flavour is the first argument if it doesn't look like a file
197 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
198 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
199
200 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
201
202 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
203 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
204 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
205 die "can't locate x86_64-xlate.pl";
206
207 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
208 or die "can't call $xlate: $!";
209 *STDOUT=*OUT;
210
211 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
212 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
213 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
214
215 $code=".text\n";
216 $code.=".extern OPENSSL_ia32cap_P\n";
217
218 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
219 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
220 $inp="%rdi";
221 $out="%rsi";
222 $len="%rdx";
223 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
224 $ivp="%r8"; # cbc, ctr, ...
225
226 $rnds_="%r10d"; # backup copy for $rounds
227 $key_="%r11"; # backup copy for $key
228
229 # %xmm register layout
230 $rndkey0="%xmm0"; $rndkey1="%xmm1";
231 $inout0="%xmm2"; $inout1="%xmm3";
232 $inout2="%xmm4"; $inout3="%xmm5";
233 $inout4="%xmm6"; $inout5="%xmm7";
234 $inout6="%xmm8"; $inout7="%xmm9";
235
236 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
237 $in0="%xmm8"; $iv="%xmm9";
238 \f
239 # Inline version of internal aesni_[en|de]crypt1.
240 #
241 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
242 # cycles which take care of loop variables...
243 { my $sn;
244 sub aesni_generate1 {
245 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
246 ++$sn;
247 $code.=<<___;
248 $movkey ($key),$rndkey0
249 $movkey 16($key),$rndkey1
250 ___
251 $code.=<<___ if (defined($ivec));
252 xorps $rndkey0,$ivec
253 lea 32($key),$key
254 xorps $ivec,$inout
255 ___
256 $code.=<<___ if (!defined($ivec));
257 lea 32($key),$key
258 xorps $rndkey0,$inout
259 ___
260 $code.=<<___;
261 .Loop_${p}1_$sn:
262 aes${p} $rndkey1,$inout
263 dec $rounds
264 $movkey ($key),$rndkey1
265 lea 16($key),$key
266 jnz .Loop_${p}1_$sn # loop body is 16 bytes
267 aes${p}last $rndkey1,$inout
268 ___
269 }}
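# In C terms the folded loop above computes the following (a sketch with
# SSE intrinsics, ignoring the optional $ivec xor; R is the value loaded
# from 240($key) and rk[] the round-key array it points at):
#
#	__m128i b = _mm_xor_si128(block, rk[0]);
#	for (int i = 1; i <= R; i++)
#		b = _mm_aesenc_si128(b, rk[i]);
#	b = _mm_aesenclast_si128(b, rk[R+1]);
#
# and analogously with _mm_aesdec_si128/_mm_aesdeclast_si128 for the
# "dec" flavour, which walks the inverse key schedule the same way.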
270 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
271 #
272 { my ($inp,$out,$key) = @_4args;
273
274 $code.=<<___;
275 .globl ${PREFIX}_encrypt
276 .type ${PREFIX}_encrypt,\@abi-omnipotent
277 .align 16
278 ${PREFIX}_encrypt:
279 .cfi_startproc
280 endbranch
281 movups ($inp),$inout0 # load input
282 mov 240($key),$rounds # key->rounds
283 ___
284 &aesni_generate1("enc",$key,$rounds);
285 $code.=<<___;
286 pxor $rndkey0,$rndkey0 # clear register bank
287 pxor $rndkey1,$rndkey1
288 movups $inout0,($out) # output
289 pxor $inout0,$inout0
290 ret
291 .cfi_endproc
292 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
293
294 .globl ${PREFIX}_decrypt
295 .type ${PREFIX}_decrypt,\@abi-omnipotent
296 .align 16
297 ${PREFIX}_decrypt:
298 .cfi_startproc
299 endbranch
300 movups ($inp),$inout0 # load input
301 mov 240($key),$rounds # key->rounds
302 ___
303 &aesni_generate1("dec",$key,$rounds);
304 $code.=<<___;
305 pxor $rndkey0,$rndkey0 # clear register bank
306 pxor $rndkey1,$rndkey1
307 movups $inout0,($out) # output
308 pxor $inout0,$inout0
309 ret
310 .cfi_endproc
311 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
312 ___
313 }
314 \f
315 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
316 # factor. Why were 3x subroutines originally used in loops? Even though
317 # aes[enc|dec] latency was originally 6, it could be scheduled only
318 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
319 # utilization, i.e. when the subroutine's throughput is virtually the
320 # same as that of the non-interleaved subroutine [for up to 3 input
321 # blocks]. This is why it originally made no sense to implement a 2x
322 # subroutine. But times change and it became appropriate to spend an
323 # extra 192 bytes on a 2x subroutine on Atom Silvermont's account. For
324 # processors that can schedule aes[enc|dec] every cycle the optimal
325 # interleave factor equals the corresponding instruction's latency. 8x
326 # is optimal for * Bridge and "super-optimal" for other Intel CPUs...
327
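# To see how the twisted-$rounds idiom below walks the key schedule,
# take _aesni_encrypt2 with a 128-bit key, for which 240($key) ends up
# holding 9 (these loops only come out even for such odd values):
#
#	shl \$4,$rounds			-> %rax = 144
#	lea 32($key,$rounds),$key	-> $key now points 176 bytes in,
#					   just past the last round key
#	neg %rax; add \$16,%rax		-> %rax = -128
#
# Each pass consumes two round keys and advances %rax by 32, so
# ($key,%rax) visits offsets 48,80,112,144 and -16($key,%rax) visits
# 64,96,128,160; the pass that picks up the last pair leaves %rax at
# zero, and jnz falls through with both keys pre-loaded for the tail.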
328 sub aesni_generate2 {
329 my $dir=shift;
330 # As already mentioned it takes in $key and $rounds, which are *not*
331 # preserved. $inout[0-1] is cipher/clear text...
332 $code.=<<___;
333 .type _aesni_${dir}rypt2,\@abi-omnipotent
334 .align 16
335 _aesni_${dir}rypt2:
336 .cfi_startproc
337 $movkey ($key),$rndkey0
338 shl \$4,$rounds
339 $movkey 16($key),$rndkey1
340 xorps $rndkey0,$inout0
341 xorps $rndkey0,$inout1
342 $movkey 32($key),$rndkey0
343 lea 32($key,$rounds),$key
344 neg %rax # $rounds
345 add \$16,%rax
346
347 .L${dir}_loop2:
348 aes${dir} $rndkey1,$inout0
349 aes${dir} $rndkey1,$inout1
350 $movkey ($key,%rax),$rndkey1
351 add \$32,%rax
352 aes${dir} $rndkey0,$inout0
353 aes${dir} $rndkey0,$inout1
354 $movkey -16($key,%rax),$rndkey0
355 jnz .L${dir}_loop2
356
357 aes${dir} $rndkey1,$inout0
358 aes${dir} $rndkey1,$inout1
359 aes${dir}last $rndkey0,$inout0
360 aes${dir}last $rndkey0,$inout1
361 ret
362 .cfi_endproc
363 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
364 ___
365 }
366 sub aesni_generate3 {
367 my $dir=shift;
368 # As already mentioned it takes in $key and $rounds, which are *not*
369 # preserved. $inout[0-2] is cipher/clear text...
370 $code.=<<___;
371 .type _aesni_${dir}rypt3,\@abi-omnipotent
372 .align 16
373 _aesni_${dir}rypt3:
374 .cfi_startproc
375 $movkey ($key),$rndkey0
376 shl \$4,$rounds
377 $movkey 16($key),$rndkey1
378 xorps $rndkey0,$inout0
379 xorps $rndkey0,$inout1
380 xorps $rndkey0,$inout2
381 $movkey 32($key),$rndkey0
382 lea 32($key,$rounds),$key
383 neg %rax # $rounds
384 add \$16,%rax
385
386 .L${dir}_loop3:
387 aes${dir} $rndkey1,$inout0
388 aes${dir} $rndkey1,$inout1
389 aes${dir} $rndkey1,$inout2
390 $movkey ($key,%rax),$rndkey1
391 add \$32,%rax
392 aes${dir} $rndkey0,$inout0
393 aes${dir} $rndkey0,$inout1
394 aes${dir} $rndkey0,$inout2
395 $movkey -16($key,%rax),$rndkey0
396 jnz .L${dir}_loop3
397
398 aes${dir} $rndkey1,$inout0
399 aes${dir} $rndkey1,$inout1
400 aes${dir} $rndkey1,$inout2
401 aes${dir}last $rndkey0,$inout0
402 aes${dir}last $rndkey0,$inout1
403 aes${dir}last $rndkey0,$inout2
404 ret
405 .cfi_endproc
406 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
407 ___
408 }
409 # 4x interleave is implemented to improve small-block performance,
410 # most notably [and naturally] 4-block performance, by ~30%. One can
411 # argue that 5x should have been implemented as well, but the
412 # improvement would be <20%, so it's not worth it...
413 sub aesni_generate4 {
414 my $dir=shift;
415 # As already mentioned it takes in $key and $rounds, which are *not*
416 # preserved. $inout[0-3] is cipher/clear text...
417 $code.=<<___;
418 .type _aesni_${dir}rypt4,\@abi-omnipotent
419 .align 16
420 _aesni_${dir}rypt4:
421 .cfi_startproc
422 $movkey ($key),$rndkey0
423 shl \$4,$rounds
424 $movkey 16($key),$rndkey1
425 xorps $rndkey0,$inout0
426 xorps $rndkey0,$inout1
427 xorps $rndkey0,$inout2
428 xorps $rndkey0,$inout3
429 $movkey 32($key),$rndkey0
430 lea 32($key,$rounds),$key
431 neg %rax # $rounds
432 .byte 0x0f,0x1f,0x00
433 add \$16,%rax
434
435 .L${dir}_loop4:
436 aes${dir} $rndkey1,$inout0
437 aes${dir} $rndkey1,$inout1
438 aes${dir} $rndkey1,$inout2
439 aes${dir} $rndkey1,$inout3
440 $movkey ($key,%rax),$rndkey1
441 add \$32,%rax
442 aes${dir} $rndkey0,$inout0
443 aes${dir} $rndkey0,$inout1
444 aes${dir} $rndkey0,$inout2
445 aes${dir} $rndkey0,$inout3
446 $movkey -16($key,%rax),$rndkey0
447 jnz .L${dir}_loop4
448
449 aes${dir} $rndkey1,$inout0
450 aes${dir} $rndkey1,$inout1
451 aes${dir} $rndkey1,$inout2
452 aes${dir} $rndkey1,$inout3
453 aes${dir}last $rndkey0,$inout0
454 aes${dir}last $rndkey0,$inout1
455 aes${dir}last $rndkey0,$inout2
456 aes${dir}last $rndkey0,$inout3
457 ret
458 .cfi_endproc
459 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
460 ___
461 }
462 sub aesni_generate6 {
463 my $dir=shift;
464 # As already mentioned it takes in $key and $rounds, which are *not*
465 # preserved. $inout[0-5] is cipher/clear text...
466 $code.=<<___;
467 .type _aesni_${dir}rypt6,\@abi-omnipotent
468 .align 16
469 _aesni_${dir}rypt6:
470 .cfi_startproc
471 $movkey ($key),$rndkey0
472 shl \$4,$rounds
473 $movkey 16($key),$rndkey1
474 xorps $rndkey0,$inout0
475 pxor $rndkey0,$inout1
476 pxor $rndkey0,$inout2
477 aes${dir} $rndkey1,$inout0
478 lea 32($key,$rounds),$key
479 neg %rax # $rounds
480 aes${dir} $rndkey1,$inout1
481 pxor $rndkey0,$inout3
482 pxor $rndkey0,$inout4
483 aes${dir} $rndkey1,$inout2
484 pxor $rndkey0,$inout5
485 $movkey ($key,%rax),$rndkey0
486 add \$16,%rax
487 jmp .L${dir}_loop6_enter
488 .align 16
489 .L${dir}_loop6:
490 aes${dir} $rndkey1,$inout0
491 aes${dir} $rndkey1,$inout1
492 aes${dir} $rndkey1,$inout2
493 .L${dir}_loop6_enter:
494 aes${dir} $rndkey1,$inout3
495 aes${dir} $rndkey1,$inout4
496 aes${dir} $rndkey1,$inout5
497 $movkey ($key,%rax),$rndkey1
498 add \$32,%rax
499 aes${dir} $rndkey0,$inout0
500 aes${dir} $rndkey0,$inout1
501 aes${dir} $rndkey0,$inout2
502 aes${dir} $rndkey0,$inout3
503 aes${dir} $rndkey0,$inout4
504 aes${dir} $rndkey0,$inout5
505 $movkey -16($key,%rax),$rndkey0
506 jnz .L${dir}_loop6
507
508 aes${dir} $rndkey1,$inout0
509 aes${dir} $rndkey1,$inout1
510 aes${dir} $rndkey1,$inout2
511 aes${dir} $rndkey1,$inout3
512 aes${dir} $rndkey1,$inout4
513 aes${dir} $rndkey1,$inout5
514 aes${dir}last $rndkey0,$inout0
515 aes${dir}last $rndkey0,$inout1
516 aes${dir}last $rndkey0,$inout2
517 aes${dir}last $rndkey0,$inout3
518 aes${dir}last $rndkey0,$inout4
519 aes${dir}last $rndkey0,$inout5
520 ret
521 .cfi_endproc
522 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
523 ___
524 }
525 sub aesni_generate8 {
526 my $dir=shift;
527 # As already mentioned it takes in $key and $rounds, which are *not*
528 # preserved. $inout[0-7] is cipher/clear text...
529 $code.=<<___;
530 .type _aesni_${dir}rypt8,\@abi-omnipotent
531 .align 16
532 _aesni_${dir}rypt8:
533 .cfi_startproc
534 $movkey ($key),$rndkey0
535 shl \$4,$rounds
536 $movkey 16($key),$rndkey1
537 xorps $rndkey0,$inout0
538 xorps $rndkey0,$inout1
539 pxor $rndkey0,$inout2
540 pxor $rndkey0,$inout3
541 pxor $rndkey0,$inout4
542 lea 32($key,$rounds),$key
543 neg %rax # $rounds
544 aes${dir} $rndkey1,$inout0
545 pxor $rndkey0,$inout5
546 pxor $rndkey0,$inout6
547 aes${dir} $rndkey1,$inout1
548 pxor $rndkey0,$inout7
549 $movkey ($key,%rax),$rndkey0
550 add \$16,%rax
551 jmp .L${dir}_loop8_inner
552 .align 16
553 .L${dir}_loop8:
554 aes${dir} $rndkey1,$inout0
555 aes${dir} $rndkey1,$inout1
556 .L${dir}_loop8_inner:
557 aes${dir} $rndkey1,$inout2
558 aes${dir} $rndkey1,$inout3
559 aes${dir} $rndkey1,$inout4
560 aes${dir} $rndkey1,$inout5
561 aes${dir} $rndkey1,$inout6
562 aes${dir} $rndkey1,$inout7
563 .L${dir}_loop8_enter:
564 $movkey ($key,%rax),$rndkey1
565 add \$32,%rax
566 aes${dir} $rndkey0,$inout0
567 aes${dir} $rndkey0,$inout1
568 aes${dir} $rndkey0,$inout2
569 aes${dir} $rndkey0,$inout3
570 aes${dir} $rndkey0,$inout4
571 aes${dir} $rndkey0,$inout5
572 aes${dir} $rndkey0,$inout6
573 aes${dir} $rndkey0,$inout7
574 $movkey -16($key,%rax),$rndkey0
575 jnz .L${dir}_loop8
576
577 aes${dir} $rndkey1,$inout0
578 aes${dir} $rndkey1,$inout1
579 aes${dir} $rndkey1,$inout2
580 aes${dir} $rndkey1,$inout3
581 aes${dir} $rndkey1,$inout4
582 aes${dir} $rndkey1,$inout5
583 aes${dir} $rndkey1,$inout6
584 aes${dir} $rndkey1,$inout7
585 aes${dir}last $rndkey0,$inout0
586 aes${dir}last $rndkey0,$inout1
587 aes${dir}last $rndkey0,$inout2
588 aes${dir}last $rndkey0,$inout3
589 aes${dir}last $rndkey0,$inout4
590 aes${dir}last $rndkey0,$inout5
591 aes${dir}last $rndkey0,$inout6
592 aes${dir}last $rndkey0,$inout7
593 ret
594 .cfi_endproc
595 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
596 ___
597 }
598 &aesni_generate2("enc") if ($PREFIX eq "aesni");
599 &aesni_generate2("dec");
600 &aesni_generate3("enc") if ($PREFIX eq "aesni");
601 &aesni_generate3("dec");
602 &aesni_generate4("enc") if ($PREFIX eq "aesni");
603 &aesni_generate4("dec");
604 &aesni_generate6("enc") if ($PREFIX eq "aesni");
605 &aesni_generate6("dec");
606 &aesni_generate8("enc") if ($PREFIX eq "aesni");
607 &aesni_generate8("dec");
608 \f
609 if ($PREFIX eq "aesni") {
610 ########################################################################
611 # void aesni_ecb_encrypt (const void *in, void *out,
612 # size_t length, const AES_KEY *key,
613 # int enc);
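# An illustrative call from C, matching the prototype above (the
# caller-side names are of course hypothetical):
#
#	aesni_ecb_encrypt(in, out, len, &ks, 1);	/* encrypt */
#	aesni_ecb_encrypt(in, out, len, &ks, 0);	/* decrypt */
#
# len is rounded down to a multiple of 16 internally (the "and \$-16"
# below), and &ks is expected to come from the module's matching
# set_[en|de]crypt_key routine.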
614 $code.=<<___;
615 .globl aesni_ecb_encrypt
616 .type aesni_ecb_encrypt,\@function,5
617 .align 16
618 aesni_ecb_encrypt:
619 .cfi_startproc
620 endbranch
621 ___
622 $code.=<<___ if ($win64);
623 lea -0x58(%rsp),%rsp
624 movaps %xmm6,(%rsp) # offload $inout4..7
625 movaps %xmm7,0x10(%rsp)
626 movaps %xmm8,0x20(%rsp)
627 movaps %xmm9,0x30(%rsp)
628 .Lecb_enc_body:
629 ___
630 $code.=<<___;
631 and \$-16,$len # if ($len<16)
632 jz .Lecb_ret # return
633
634 mov 240($key),$rounds # key->rounds
635 $movkey ($key),$rndkey0
636 mov $key,$key_ # backup $key
637 mov $rounds,$rnds_ # backup $rounds
638 test %r8d,%r8d # 5th argument
639 jz .Lecb_decrypt
640 #--------------------------- ECB ENCRYPT ------------------------------#
641 cmp \$0x80,$len # if ($len<8*16)
642 jb .Lecb_enc_tail # short input
643
644 movdqu ($inp),$inout0 # load 8 input blocks
645 movdqu 0x10($inp),$inout1
646 movdqu 0x20($inp),$inout2
647 movdqu 0x30($inp),$inout3
648 movdqu 0x40($inp),$inout4
649 movdqu 0x50($inp),$inout5
650 movdqu 0x60($inp),$inout6
651 movdqu 0x70($inp),$inout7
652 lea 0x80($inp),$inp # $inp+=8*16
653 sub \$0x80,$len # $len-=8*16 (can be zero)
654 jmp .Lecb_enc_loop8_enter
655 .align 16
656 .Lecb_enc_loop8:
657 movups $inout0,($out) # store 8 output blocks
658 mov $key_,$key # restore $key
659 movdqu ($inp),$inout0 # load 8 input blocks
660 mov $rnds_,$rounds # restore $rounds
661 movups $inout1,0x10($out)
662 movdqu 0x10($inp),$inout1
663 movups $inout2,0x20($out)
664 movdqu 0x20($inp),$inout2
665 movups $inout3,0x30($out)
666 movdqu 0x30($inp),$inout3
667 movups $inout4,0x40($out)
668 movdqu 0x40($inp),$inout4
669 movups $inout5,0x50($out)
670 movdqu 0x50($inp),$inout5
671 movups $inout6,0x60($out)
672 movdqu 0x60($inp),$inout6
673 movups $inout7,0x70($out)
674 lea 0x80($out),$out # $out+=8*16
675 movdqu 0x70($inp),$inout7
676 lea 0x80($inp),$inp # $inp+=8*16
677 .Lecb_enc_loop8_enter:
678
679 call _aesni_encrypt8
680
681 sub \$0x80,$len
682 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
683
684 movups $inout0,($out) # store 8 output blocks
685 mov $key_,$key # restore $key
686 movups $inout1,0x10($out)
687 mov $rnds_,$rounds # restore $rounds
688 movups $inout2,0x20($out)
689 movups $inout3,0x30($out)
690 movups $inout4,0x40($out)
691 movups $inout5,0x50($out)
692 movups $inout6,0x60($out)
693 movups $inout7,0x70($out)
694 lea 0x80($out),$out # $out+=8*16
695 add \$0x80,$len # restore real remaining $len
696 jz .Lecb_ret # done if ($len==0)
697
698 .Lecb_enc_tail: # $len is less than 8*16
699 movups ($inp),$inout0
700 cmp \$0x20,$len
701 jb .Lecb_enc_one
702 movups 0x10($inp),$inout1
703 je .Lecb_enc_two
704 movups 0x20($inp),$inout2
705 cmp \$0x40,$len
706 jb .Lecb_enc_three
707 movups 0x30($inp),$inout3
708 je .Lecb_enc_four
709 movups 0x40($inp),$inout4
710 cmp \$0x60,$len
711 jb .Lecb_enc_five
712 movups 0x50($inp),$inout5
713 je .Lecb_enc_six
714 movdqu 0x60($inp),$inout6
715 xorps $inout7,$inout7
716 call _aesni_encrypt8
717 movups $inout0,($out) # store 7 output blocks
718 movups $inout1,0x10($out)
719 movups $inout2,0x20($out)
720 movups $inout3,0x30($out)
721 movups $inout4,0x40($out)
722 movups $inout5,0x50($out)
723 movups $inout6,0x60($out)
724 jmp .Lecb_ret
725 .align 16
726 .Lecb_enc_one:
727 ___
728 &aesni_generate1("enc",$key,$rounds);
729 $code.=<<___;
730 movups $inout0,($out) # store one output block
731 jmp .Lecb_ret
732 .align 16
733 .Lecb_enc_two:
734 call _aesni_encrypt2
735 movups $inout0,($out) # store 2 output blocks
736 movups $inout1,0x10($out)
737 jmp .Lecb_ret
738 .align 16
739 .Lecb_enc_three:
740 call _aesni_encrypt3
741 movups $inout0,($out) # store 3 output blocks
742 movups $inout1,0x10($out)
743 movups $inout2,0x20($out)
744 jmp .Lecb_ret
745 .align 16
746 .Lecb_enc_four:
747 call _aesni_encrypt4
748 movups $inout0,($out) # store 4 output blocks
749 movups $inout1,0x10($out)
750 movups $inout2,0x20($out)
751 movups $inout3,0x30($out)
752 jmp .Lecb_ret
753 .align 16
754 .Lecb_enc_five:
755 xorps $inout5,$inout5
756 call _aesni_encrypt6
757 movups $inout0,($out) # store 5 output blocks
758 movups $inout1,0x10($out)
759 movups $inout2,0x20($out)
760 movups $inout3,0x30($out)
761 movups $inout4,0x40($out)
762 jmp .Lecb_ret
763 .align 16
764 .Lecb_enc_six:
765 call _aesni_encrypt6
766 movups $inout0,($out) # store 6 output blocks
767 movups $inout1,0x10($out)
768 movups $inout2,0x20($out)
769 movups $inout3,0x30($out)
770 movups $inout4,0x40($out)
771 movups $inout5,0x50($out)
772 jmp .Lecb_ret
773 \f#--------------------------- ECB DECRYPT ------------------------------#
774 .align 16
775 .Lecb_decrypt:
776 cmp \$0x80,$len # if ($len<8*16)
777 jb .Lecb_dec_tail # short input
778
779 movdqu ($inp),$inout0 # load 8 input blocks
780 movdqu 0x10($inp),$inout1
781 movdqu 0x20($inp),$inout2
782 movdqu 0x30($inp),$inout3
783 movdqu 0x40($inp),$inout4
784 movdqu 0x50($inp),$inout5
785 movdqu 0x60($inp),$inout6
786 movdqu 0x70($inp),$inout7
787 lea 0x80($inp),$inp # $inp+=8*16
788 sub \$0x80,$len # $len-=8*16 (can be zero)
789 jmp .Lecb_dec_loop8_enter
790 .align 16
791 .Lecb_dec_loop8:
792 movups $inout0,($out) # store 8 output blocks
793 mov $key_,$key # restore $key
794 movdqu ($inp),$inout0 # load 8 input blocks
795 mov $rnds_,$rounds # restore $rounds
796 movups $inout1,0x10($out)
797 movdqu 0x10($inp),$inout1
798 movups $inout2,0x20($out)
799 movdqu 0x20($inp),$inout2
800 movups $inout3,0x30($out)
801 movdqu 0x30($inp),$inout3
802 movups $inout4,0x40($out)
803 movdqu 0x40($inp),$inout4
804 movups $inout5,0x50($out)
805 movdqu 0x50($inp),$inout5
806 movups $inout6,0x60($out)
807 movdqu 0x60($inp),$inout6
808 movups $inout7,0x70($out)
809 lea 0x80($out),$out # $out+=8*16
810 movdqu 0x70($inp),$inout7
811 lea 0x80($inp),$inp # $inp+=8*16
812 .Lecb_dec_loop8_enter:
813
814 call _aesni_decrypt8
815
816 $movkey ($key_),$rndkey0
817 sub \$0x80,$len
818 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
819
820 movups $inout0,($out) # store 8 output blocks
821 pxor $inout0,$inout0 # clear register bank
822 mov $key_,$key # restore $key
823 movups $inout1,0x10($out)
824 pxor $inout1,$inout1
825 mov $rnds_,$rounds # restore $rounds
826 movups $inout2,0x20($out)
827 pxor $inout2,$inout2
828 movups $inout3,0x30($out)
829 pxor $inout3,$inout3
830 movups $inout4,0x40($out)
831 pxor $inout4,$inout4
832 movups $inout5,0x50($out)
833 pxor $inout5,$inout5
834 movups $inout6,0x60($out)
835 pxor $inout6,$inout6
836 movups $inout7,0x70($out)
837 pxor $inout7,$inout7
838 lea 0x80($out),$out # $out+=8*16
839 add \$0x80,$len # restore real remaining $len
840 jz .Lecb_ret # done if ($len==0)
841
842 .Lecb_dec_tail:
843 movups ($inp),$inout0
844 cmp \$0x20,$len
845 jb .Lecb_dec_one
846 movups 0x10($inp),$inout1
847 je .Lecb_dec_two
848 movups 0x20($inp),$inout2
849 cmp \$0x40,$len
850 jb .Lecb_dec_three
851 movups 0x30($inp),$inout3
852 je .Lecb_dec_four
853 movups 0x40($inp),$inout4
854 cmp \$0x60,$len
855 jb .Lecb_dec_five
856 movups 0x50($inp),$inout5
857 je .Lecb_dec_six
858 movups 0x60($inp),$inout6
859 $movkey ($key),$rndkey0
860 xorps $inout7,$inout7
861 call _aesni_decrypt8
862 movups $inout0,($out) # store 7 output blocks
863 pxor $inout0,$inout0 # clear register bank
864 movups $inout1,0x10($out)
865 pxor $inout1,$inout1
866 movups $inout2,0x20($out)
867 pxor $inout2,$inout2
868 movups $inout3,0x30($out)
869 pxor $inout3,$inout3
870 movups $inout4,0x40($out)
871 pxor $inout4,$inout4
872 movups $inout5,0x50($out)
873 pxor $inout5,$inout5
874 movups $inout6,0x60($out)
875 pxor $inout6,$inout6
876 pxor $inout7,$inout7
877 jmp .Lecb_ret
878 .align 16
879 .Lecb_dec_one:
880 ___
881 &aesni_generate1("dec",$key,$rounds);
882 $code.=<<___;
883 movups $inout0,($out) # store one output block
884 pxor $inout0,$inout0 # clear register bank
885 jmp .Lecb_ret
886 .align 16
887 .Lecb_dec_two:
888 call _aesni_decrypt2
889 movups $inout0,($out) # store 2 output blocks
890 pxor $inout0,$inout0 # clear register bank
891 movups $inout1,0x10($out)
892 pxor $inout1,$inout1
893 jmp .Lecb_ret
894 .align 16
895 .Lecb_dec_three:
896 call _aesni_decrypt3
897 movups $inout0,($out) # store 3 output blocks
898 pxor $inout0,$inout0 # clear register bank
899 movups $inout1,0x10($out)
900 pxor $inout1,$inout1
901 movups $inout2,0x20($out)
902 pxor $inout2,$inout2
903 jmp .Lecb_ret
904 .align 16
905 .Lecb_dec_four:
906 call _aesni_decrypt4
907 movups $inout0,($out) # store 4 output blocks
908 pxor $inout0,$inout0 # clear register bank
909 movups $inout1,0x10($out)
910 pxor $inout1,$inout1
911 movups $inout2,0x20($out)
912 pxor $inout2,$inout2
913 movups $inout3,0x30($out)
914 pxor $inout3,$inout3
915 jmp .Lecb_ret
916 .align 16
917 .Lecb_dec_five:
918 xorps $inout5,$inout5
919 call _aesni_decrypt6
920 movups $inout0,($out) # store 5 output blocks
921 pxor $inout0,$inout0 # clear register bank
922 movups $inout1,0x10($out)
923 pxor $inout1,$inout1
924 movups $inout2,0x20($out)
925 pxor $inout2,$inout2
926 movups $inout3,0x30($out)
927 pxor $inout3,$inout3
928 movups $inout4,0x40($out)
929 pxor $inout4,$inout4
930 pxor $inout5,$inout5
931 jmp .Lecb_ret
932 .align 16
933 .Lecb_dec_six:
934 call _aesni_decrypt6
935 movups $inout0,($out) # store 6 output blocks
936 pxor $inout0,$inout0 # clear register bank
937 movups $inout1,0x10($out)
938 pxor $inout1,$inout1
939 movups $inout2,0x20($out)
940 pxor $inout2,$inout2
941 movups $inout3,0x30($out)
942 pxor $inout3,$inout3
943 movups $inout4,0x40($out)
944 pxor $inout4,$inout4
945 movups $inout5,0x50($out)
946 pxor $inout5,$inout5
947
948 .Lecb_ret:
949 xorps $rndkey0,$rndkey0 # %xmm0
950 pxor $rndkey1,$rndkey1
951 ___
952 $code.=<<___ if ($win64);
953 movaps (%rsp),%xmm6
954 movaps %xmm0,(%rsp) # clear stack
955 movaps 0x10(%rsp),%xmm7
956 movaps %xmm0,0x10(%rsp)
957 movaps 0x20(%rsp),%xmm8
958 movaps %xmm0,0x20(%rsp)
959 movaps 0x30(%rsp),%xmm9
960 movaps %xmm0,0x30(%rsp)
961 lea 0x58(%rsp),%rsp
962 .Lecb_enc_ret:
963 ___
964 $code.=<<___;
965 ret
966 .cfi_endproc
967 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
968 ___
969 \f
970 {
971 ######################################################################
972 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
973 # size_t blocks, const AES_KEY *key,
974 # const char *ivec,char *cmac);
975 #
976 # Handles only complete blocks, operates on 64-bit counter and
977 # does not update *ivec! Nor does it finalize CMAC value
978 # (see engine/eng_aesni.c for details)
979 #
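# Per block the encrypt direction amounts to (pseudo-C sketch, with E()
# standing for one raw AES block encryption under $key):
#
#	cmac   = E(cmac ^ in[i]);		/* CBC-MAC leg		*/
#	out[i] = in[i] ^ E(ctr_block);		/* CTR leg, interleaved	*/
#	ctr_block++;				/* low 64 bits only	*/
#
# with the first counter block being *ivec itself; the decrypt path is
# the mirror image and feeds the recovered plaintext into the CBC-MAC.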
980 {
981 my $cmac="%r9"; # 6th argument
982
983 my $increment="%xmm9";
984 my $iv="%xmm6";
985 my $bswap_mask="%xmm7";
986
987 $code.=<<___;
988 .globl aesni_ccm64_encrypt_blocks
989 .type aesni_ccm64_encrypt_blocks,\@function,6
990 .align 16
991 aesni_ccm64_encrypt_blocks:
992 .cfi_startproc
993 endbranch
994 ___
995 $code.=<<___ if ($win64);
996 lea -0x58(%rsp),%rsp
997 movaps %xmm6,(%rsp) # $iv
998 movaps %xmm7,0x10(%rsp) # $bswap_mask
999 movaps %xmm8,0x20(%rsp) # $in0
1000 movaps %xmm9,0x30(%rsp) # $increment
1001 .Lccm64_enc_body:
1002 ___
1003 $code.=<<___;
1004 mov 240($key),$rounds # key->rounds
1005 movdqu ($ivp),$iv
1006 movdqa .Lincrement64(%rip),$increment
1007 movdqa .Lbswap_mask(%rip),$bswap_mask
1008
1009 shl \$4,$rounds
1010 mov \$16,$rnds_
1011 lea 0($key),$key_
1012 movdqu ($cmac),$inout1
1013 movdqa $iv,$inout0
1014 lea 32($key,$rounds),$key # end of key schedule
1015 pshufb $bswap_mask,$iv
1016 sub %rax,%r10 # twisted $rounds
1017 jmp .Lccm64_enc_outer
1018 .align 16
1019 .Lccm64_enc_outer:
1020 $movkey ($key_),$rndkey0
1021 mov %r10,%rax
1022 movups ($inp),$in0 # load inp
1023
1024 xorps $rndkey0,$inout0 # counter
1025 $movkey 16($key_),$rndkey1
1026 xorps $in0,$rndkey0
1027 xorps $rndkey0,$inout1 # cmac^=inp
1028 $movkey 32($key_),$rndkey0
1029
1030 .Lccm64_enc2_loop:
1031 aesenc $rndkey1,$inout0
1032 aesenc $rndkey1,$inout1
1033 $movkey ($key,%rax),$rndkey1
1034 add \$32,%rax
1035 aesenc $rndkey0,$inout0
1036 aesenc $rndkey0,$inout1
1037 $movkey -16($key,%rax),$rndkey0
1038 jnz .Lccm64_enc2_loop
1039 aesenc $rndkey1,$inout0
1040 aesenc $rndkey1,$inout1
1041 paddq $increment,$iv
1042 dec $len # $len-- ($len is in blocks)
1043 aesenclast $rndkey0,$inout0
1044 aesenclast $rndkey0,$inout1
1045
1046 lea 16($inp),$inp
1047 xorps $inout0,$in0 # inp ^= E(iv)
1048 movdqa $iv,$inout0
1049 movups $in0,($out) # save output
1050 pshufb $bswap_mask,$inout0
1051 lea 16($out),$out # $out+=16
1052 jnz .Lccm64_enc_outer # loop if ($len!=0)
1053
1054 pxor $rndkey0,$rndkey0 # clear register bank
1055 pxor $rndkey1,$rndkey1
1056 pxor $inout0,$inout0
1057 movups $inout1,($cmac) # store resulting mac
1058 pxor $inout1,$inout1
1059 pxor $in0,$in0
1060 pxor $iv,$iv
1061 ___
1062 $code.=<<___ if ($win64);
1063 movaps (%rsp),%xmm6
1064 movaps %xmm0,(%rsp) # clear stack
1065 movaps 0x10(%rsp),%xmm7
1066 movaps %xmm0,0x10(%rsp)
1067 movaps 0x20(%rsp),%xmm8
1068 movaps %xmm0,0x20(%rsp)
1069 movaps 0x30(%rsp),%xmm9
1070 movaps %xmm0,0x30(%rsp)
1071 lea 0x58(%rsp),%rsp
1072 .Lccm64_enc_ret:
1073 ___
1074 $code.=<<___;
1075 ret
1076 .cfi_endproc
1077 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1078 ___
1079 ######################################################################
1080 $code.=<<___;
1081 .globl aesni_ccm64_decrypt_blocks
1082 .type aesni_ccm64_decrypt_blocks,\@function,6
1083 .align 16
1084 aesni_ccm64_decrypt_blocks:
1085 .cfi_startproc
1086 endbranch
1087 ___
1088 $code.=<<___ if ($win64);
1089 lea -0x58(%rsp),%rsp
1090 movaps %xmm6,(%rsp) # $iv
1091 movaps %xmm7,0x10(%rsp) # $bswap_mask
1092 movaps %xmm8,0x20(%rsp) # $in8
1093 movaps %xmm9,0x30(%rsp) # $increment
1094 .Lccm64_dec_body:
1095 ___
1096 $code.=<<___;
1097 mov 240($key),$rounds # key->rounds
1098 movups ($ivp),$iv
1099 movdqu ($cmac),$inout1
1100 movdqa .Lincrement64(%rip),$increment
1101 movdqa .Lbswap_mask(%rip),$bswap_mask
1102
1103 movaps $iv,$inout0
1104 mov $rounds,$rnds_
1105 mov $key,$key_
1106 pshufb $bswap_mask,$iv
1107 ___
1108 &aesni_generate1("enc",$key,$rounds);
1109 $code.=<<___;
1110 shl \$4,$rnds_
1111 mov \$16,$rounds
1112 movups ($inp),$in0 # load inp
1113 paddq $increment,$iv
1114 lea 16($inp),$inp # $inp+=16
1115 sub %r10,%rax # twisted $rounds
1116 lea 32($key_,$rnds_),$key # end of key schedule
1117 mov %rax,%r10
1118 jmp .Lccm64_dec_outer
1119 .align 16
1120 .Lccm64_dec_outer:
1121 xorps $inout0,$in0 # inp ^= E(iv)
1122 movdqa $iv,$inout0
1123 movups $in0,($out) # save output
1124 lea 16($out),$out # $out+=16
1125 pshufb $bswap_mask,$inout0
1126
1127 sub \$1,$len # $len-- ($len is in blocks)
1128 jz .Lccm64_dec_break # if ($len==0) break
1129
1130 $movkey ($key_),$rndkey0
1131 mov %r10,%rax
1132 $movkey 16($key_),$rndkey1
1133 xorps $rndkey0,$in0
1134 xorps $rndkey0,$inout0
1135 xorps $in0,$inout1 # cmac^=out
1136 $movkey 32($key_),$rndkey0
1137 jmp .Lccm64_dec2_loop
1138 .align 16
1139 .Lccm64_dec2_loop:
1140 aesenc $rndkey1,$inout0
1141 aesenc $rndkey1,$inout1
1142 $movkey ($key,%rax),$rndkey1
1143 add \$32,%rax
1144 aesenc $rndkey0,$inout0
1145 aesenc $rndkey0,$inout1
1146 $movkey -16($key,%rax),$rndkey0
1147 jnz .Lccm64_dec2_loop
1148 movups ($inp),$in0 # load input
1149 paddq $increment,$iv
1150 aesenc $rndkey1,$inout0
1151 aesenc $rndkey1,$inout1
1152 aesenclast $rndkey0,$inout0
1153 aesenclast $rndkey0,$inout1
1154 lea 16($inp),$inp # $inp+=16
1155 jmp .Lccm64_dec_outer
1156
1157 .align 16
1158 .Lccm64_dec_break:
1159 #xorps $in0,$inout1 # cmac^=out
1160 mov 240($key_),$rounds
1161 ___
1162 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1163 $code.=<<___;
1164 pxor $rndkey0,$rndkey0 # clear register bank
1165 pxor $rndkey1,$rndkey1
1166 pxor $inout0,$inout0
1167 movups $inout1,($cmac) # store resulting mac
1168 pxor $inout1,$inout1
1169 pxor $in0,$in0
1170 pxor $iv,$iv
1171 ___
1172 $code.=<<___ if ($win64);
1173 movaps (%rsp),%xmm6
1174 movaps %xmm0,(%rsp) # clear stack
1175 movaps 0x10(%rsp),%xmm7
1176 movaps %xmm0,0x10(%rsp)
1177 movaps 0x20(%rsp),%xmm8
1178 movaps %xmm0,0x20(%rsp)
1179 movaps 0x30(%rsp),%xmm9
1180 movaps %xmm0,0x30(%rsp)
1181 lea 0x58(%rsp),%rsp
1182 .Lccm64_dec_ret:
1183 ___
1184 $code.=<<___;
1185 ret
1186 .cfi_endproc
1187 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1188 ___
1189 }\f
1190 ######################################################################
1191 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1192 # size_t blocks, const AES_KEY *key,
1193 # const char *ivec);
1194 #
1195 # Handles only complete blocks, operates on 32-bit counter and
1196 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1197 #
1198 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1199 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1200 # Keywords are full unroll and modulo-schedule counter calculations
1201 # with zero-round key xor.
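# In other words, the stack holds ctr_block ^ round[0] rather than raw
# counter values, and the final round is performed as
#
#	out[i] = aesenclast(state, in[i] ^ round[last])
#
# so neither the initial nor the final xor costs a separate pass over
# the data (see the "input^=round[last]" comments further down).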
1202 {
1203 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1204 my ($key0,$ctr)=("%ebp","${ivp}d");
1205 my $frame_size = 0x80 + ($win64?160:0);
1206
1207 $code.=<<___;
1208 .globl aesni_ctr32_encrypt_blocks
1209 .type aesni_ctr32_encrypt_blocks,\@function,5
1210 .align 16
1211 aesni_ctr32_encrypt_blocks:
1212 .cfi_startproc
1213 endbranch
1214 cmp \$1,$len
1215 jne .Lctr32_bulk
1216
1217 # handle single block without allocating stack frame,
1218 # useful when handling edges
1219 movups ($ivp),$inout0
1220 movups ($inp),$inout1
1221 mov 240($key),%edx # key->rounds
1222 ___
1223 &aesni_generate1("enc",$key,"%edx");
1224 $code.=<<___;
1225 pxor $rndkey0,$rndkey0 # clear register bank
1226 pxor $rndkey1,$rndkey1
1227 xorps $inout1,$inout0
1228 pxor $inout1,$inout1
1229 movups $inout0,($out)
1230 xorps $inout0,$inout0
1231 jmp .Lctr32_epilogue
1232
1233 .align 16
1234 .Lctr32_bulk:
1235 lea (%rsp),$key_ # use $key_ as frame pointer
1236 .cfi_def_cfa_register $key_
1237 push %rbp
1238 .cfi_push %rbp
1239 sub \$$frame_size,%rsp
1240 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1241 ___
1242 $code.=<<___ if ($win64);
1243 movaps %xmm6,-0xa8($key_) # offload everything
1244 movaps %xmm7,-0x98($key_)
1245 movaps %xmm8,-0x88($key_)
1246 movaps %xmm9,-0x78($key_)
1247 movaps %xmm10,-0x68($key_)
1248 movaps %xmm11,-0x58($key_)
1249 movaps %xmm12,-0x48($key_)
1250 movaps %xmm13,-0x38($key_)
1251 movaps %xmm14,-0x28($key_)
1252 movaps %xmm15,-0x18($key_)
1253 .Lctr32_body:
1254 ___
1255 $code.=<<___;
1256
1257 # 8 16-byte words on top of stack are counter values
1258 # xor-ed with zero-round key
1259
1260 movdqu ($ivp),$inout0
1261 movdqu ($key),$rndkey0
1262 mov 12($ivp),$ctr # counter LSB
1263 pxor $rndkey0,$inout0
1264 mov 12($key),$key0 # 0-round key LSB
1265 movdqa $inout0,0x00(%rsp) # populate counter block
1266 bswap $ctr
1267 movdqa $inout0,$inout1
1268 movdqa $inout0,$inout2
1269 movdqa $inout0,$inout3
1270 movdqa $inout0,0x40(%rsp)
1271 movdqa $inout0,0x50(%rsp)
1272 movdqa $inout0,0x60(%rsp)
1273 mov %rdx,%r10 # about to borrow %rdx
1274 movdqa $inout0,0x70(%rsp)
1275
1276 lea 1($ctr),%rax
1277 lea 2($ctr),%rdx
1278 bswap %eax
1279 bswap %edx
1280 xor $key0,%eax
1281 xor $key0,%edx
1282 pinsrd \$3,%eax,$inout1
1283 lea 3($ctr),%rax
1284 movdqa $inout1,0x10(%rsp)
1285 pinsrd \$3,%edx,$inout2
1286 bswap %eax
1287 mov %r10,%rdx # restore %rdx
1288 lea 4($ctr),%r10
1289 movdqa $inout2,0x20(%rsp)
1290 xor $key0,%eax
1291 bswap %r10d
1292 pinsrd \$3,%eax,$inout3
1293 xor $key0,%r10d
1294 movdqa $inout3,0x30(%rsp)
1295 lea 5($ctr),%r9
1296 mov %r10d,0x40+12(%rsp)
1297 bswap %r9d
1298 lea 6($ctr),%r10
1299 mov 240($key),$rounds # key->rounds
1300 xor $key0,%r9d
1301 bswap %r10d
1302 mov %r9d,0x50+12(%rsp)
1303 xor $key0,%r10d
1304 lea 7($ctr),%r9
1305 mov %r10d,0x60+12(%rsp)
1306 bswap %r9d
1307 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1308 xor $key0,%r9d
1309 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1310 mov %r9d,0x70+12(%rsp)
1311
1312 $movkey 0x10($key),$rndkey1
1313
1314 movdqa 0x40(%rsp),$inout4
1315 movdqa 0x50(%rsp),$inout5
1316
1317 cmp \$8,$len # $len is in blocks
1318 jb .Lctr32_tail # short input if ($len<8)
1319
1320 sub \$6,$len # $len is biased by -6
1321 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1322 je .Lctr32_6x # [which denotes Atom Silvermont]
1323
1324 lea 0x80($key),$key # size optimization
1325 sub \$2,$len # $len is biased by -8
1326 jmp .Lctr32_loop8
1327
1328 .align 16
1329 .Lctr32_6x:
1330 shl \$4,$rounds
1331 mov \$48,$rnds_
1332 bswap $key0
1333 lea 32($key,$rounds),$key # end of key schedule
1334 sub %rax,%r10 # twisted $rounds
1335 jmp .Lctr32_loop6
1336
1337 .align 16
1338 .Lctr32_loop6:
1339 add \$6,$ctr # next counter value
1340 $movkey -48($key,$rnds_),$rndkey0
1341 aesenc $rndkey1,$inout0
1342 mov $ctr,%eax
1343 xor $key0,%eax
1344 aesenc $rndkey1,$inout1
1345 movbe %eax,`0x00+12`(%rsp) # store next counter value
1346 lea 1($ctr),%eax
1347 aesenc $rndkey1,$inout2
1348 xor $key0,%eax
1349 movbe %eax,`0x10+12`(%rsp)
1350 aesenc $rndkey1,$inout3
1351 lea 2($ctr),%eax
1352 xor $key0,%eax
1353 aesenc $rndkey1,$inout4
1354 movbe %eax,`0x20+12`(%rsp)
1355 lea 3($ctr),%eax
1356 aesenc $rndkey1,$inout5
1357 $movkey -32($key,$rnds_),$rndkey1
1358 xor $key0,%eax
1359
1360 aesenc $rndkey0,$inout0
1361 movbe %eax,`0x30+12`(%rsp)
1362 lea 4($ctr),%eax
1363 aesenc $rndkey0,$inout1
1364 xor $key0,%eax
1365 movbe %eax,`0x40+12`(%rsp)
1366 aesenc $rndkey0,$inout2
1367 lea 5($ctr),%eax
1368 xor $key0,%eax
1369 aesenc $rndkey0,$inout3
1370 movbe %eax,`0x50+12`(%rsp)
1371 mov %r10,%rax # mov $rnds_,$rounds
1372 aesenc $rndkey0,$inout4
1373 aesenc $rndkey0,$inout5
1374 $movkey -16($key,$rnds_),$rndkey0
1375
1376 call .Lenc_loop6
1377
1378 movdqu ($inp),$inout6 # load 6 input blocks
1379 movdqu 0x10($inp),$inout7
1380 movdqu 0x20($inp),$in0
1381 movdqu 0x30($inp),$in1
1382 movdqu 0x40($inp),$in2
1383 movdqu 0x50($inp),$in3
1384 lea 0x60($inp),$inp # $inp+=6*16
1385 $movkey -64($key,$rnds_),$rndkey1
1386 pxor $inout0,$inout6 # inp^=E(ctr)
1387 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1388 pxor $inout1,$inout7
1389 movaps 0x10(%rsp),$inout1
1390 pxor $inout2,$in0
1391 movaps 0x20(%rsp),$inout2
1392 pxor $inout3,$in1
1393 movaps 0x30(%rsp),$inout3
1394 pxor $inout4,$in2
1395 movaps 0x40(%rsp),$inout4
1396 pxor $inout5,$in3
1397 movaps 0x50(%rsp),$inout5
1398 movdqu $inout6,($out) # store 6 output blocks
1399 movdqu $inout7,0x10($out)
1400 movdqu $in0,0x20($out)
1401 movdqu $in1,0x30($out)
1402 movdqu $in2,0x40($out)
1403 movdqu $in3,0x50($out)
1404 lea 0x60($out),$out # $out+=6*16
1405
1406 sub \$6,$len
1407 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1408
1409 add \$6,$len # restore real remaining $len
1410 jz .Lctr32_done # done if ($len==0)
1411
1412 lea -48($rnds_),$rounds
1413 lea -80($key,$rnds_),$key # restore $key
1414 neg $rounds
1415 shr \$4,$rounds # restore $rounds
1416 jmp .Lctr32_tail
1417
1418 .align 32
1419 .Lctr32_loop8:
1420 add \$8,$ctr # next counter value
1421 movdqa 0x60(%rsp),$inout6
1422 aesenc $rndkey1,$inout0
1423 mov $ctr,%r9d
1424 movdqa 0x70(%rsp),$inout7
1425 aesenc $rndkey1,$inout1
1426 bswap %r9d
1427 $movkey 0x20-0x80($key),$rndkey0
1428 aesenc $rndkey1,$inout2
1429 xor $key0,%r9d
1430 nop
1431 aesenc $rndkey1,$inout3
1432 mov %r9d,0x00+12(%rsp) # store next counter value
1433 lea 1($ctr),%r9
1434 aesenc $rndkey1,$inout4
1435 aesenc $rndkey1,$inout5
1436 aesenc $rndkey1,$inout6
1437 aesenc $rndkey1,$inout7
1438 $movkey 0x30-0x80($key),$rndkey1
1439 ___
1440 for($i=2;$i<8;$i++) {
1441 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1442 $code.=<<___;
1443 bswap %r9d
1444 aesenc $rndkeyx,$inout0
1445 aesenc $rndkeyx,$inout1
1446 xor $key0,%r9d
1447 .byte 0x66,0x90
1448 aesenc $rndkeyx,$inout2
1449 aesenc $rndkeyx,$inout3
1450 mov %r9d,`0x10*($i-1)`+12(%rsp)
1451 lea $i($ctr),%r9
1452 aesenc $rndkeyx,$inout4
1453 aesenc $rndkeyx,$inout5
1454 aesenc $rndkeyx,$inout6
1455 aesenc $rndkeyx,$inout7
1456 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1457 ___
1458 }
1459 $code.=<<___;
1460 bswap %r9d
1461 aesenc $rndkey0,$inout0
1462 aesenc $rndkey0,$inout1
1463 aesenc $rndkey0,$inout2
1464 xor $key0,%r9d
1465 movdqu 0x00($inp),$in0 # start loading input
1466 aesenc $rndkey0,$inout3
1467 mov %r9d,0x70+12(%rsp)
1468 cmp \$11,$rounds
1469 aesenc $rndkey0,$inout4
1470 aesenc $rndkey0,$inout5
1471 aesenc $rndkey0,$inout6
1472 aesenc $rndkey0,$inout7
1473 $movkey 0xa0-0x80($key),$rndkey0
1474
1475 jb .Lctr32_enc_done
1476
1477 aesenc $rndkey1,$inout0
1478 aesenc $rndkey1,$inout1
1479 aesenc $rndkey1,$inout2
1480 aesenc $rndkey1,$inout3
1481 aesenc $rndkey1,$inout4
1482 aesenc $rndkey1,$inout5
1483 aesenc $rndkey1,$inout6
1484 aesenc $rndkey1,$inout7
1485 $movkey 0xb0-0x80($key),$rndkey1
1486
1487 aesenc $rndkey0,$inout0
1488 aesenc $rndkey0,$inout1
1489 aesenc $rndkey0,$inout2
1490 aesenc $rndkey0,$inout3
1491 aesenc $rndkey0,$inout4
1492 aesenc $rndkey0,$inout5
1493 aesenc $rndkey0,$inout6
1494 aesenc $rndkey0,$inout7
1495 $movkey 0xc0-0x80($key),$rndkey0
1496 je .Lctr32_enc_done
1497
1498 aesenc $rndkey1,$inout0
1499 aesenc $rndkey1,$inout1
1500 aesenc $rndkey1,$inout2
1501 aesenc $rndkey1,$inout3
1502 aesenc $rndkey1,$inout4
1503 aesenc $rndkey1,$inout5
1504 aesenc $rndkey1,$inout6
1505 aesenc $rndkey1,$inout7
1506 $movkey 0xd0-0x80($key),$rndkey1
1507
1508 aesenc $rndkey0,$inout0
1509 aesenc $rndkey0,$inout1
1510 aesenc $rndkey0,$inout2
1511 aesenc $rndkey0,$inout3
1512 aesenc $rndkey0,$inout4
1513 aesenc $rndkey0,$inout5
1514 aesenc $rndkey0,$inout6
1515 aesenc $rndkey0,$inout7
1516 $movkey 0xe0-0x80($key),$rndkey0
1517 jmp .Lctr32_enc_done
1518
1519 .align 16
1520 .Lctr32_enc_done:
1521 movdqu 0x10($inp),$in1
1522 pxor $rndkey0,$in0 # input^=round[last]
1523 movdqu 0x20($inp),$in2
1524 pxor $rndkey0,$in1
1525 movdqu 0x30($inp),$in3
1526 pxor $rndkey0,$in2
1527 movdqu 0x40($inp),$in4
1528 pxor $rndkey0,$in3
1529 movdqu 0x50($inp),$in5
1530 pxor $rndkey0,$in4
1531 pxor $rndkey0,$in5
1532 aesenc $rndkey1,$inout0
1533 aesenc $rndkey1,$inout1
1534 aesenc $rndkey1,$inout2
1535 aesenc $rndkey1,$inout3
1536 aesenc $rndkey1,$inout4
1537 aesenc $rndkey1,$inout5
1538 aesenc $rndkey1,$inout6
1539 aesenc $rndkey1,$inout7
1540 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1541 lea 0x80($inp),$inp # $inp+=8*16
1542
1543 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1544 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1545 movdqu 0x70-0x80($inp),$in0
1546 aesenclast $in1,$inout1
1547 pxor $rndkey0,$in0
1548 movdqa 0x00(%rsp),$in1 # load next counter block
1549 aesenclast $in2,$inout2
1550 aesenclast $in3,$inout3
1551 movdqa 0x10(%rsp),$in2
1552 movdqa 0x20(%rsp),$in3
1553 aesenclast $in4,$inout4
1554 aesenclast $in5,$inout5
1555 movdqa 0x30(%rsp),$in4
1556 movdqa 0x40(%rsp),$in5
1557 aesenclast $rndkey1,$inout6
1558 movdqa 0x50(%rsp),$rndkey0
1559 $movkey 0x10-0x80($key),$rndkey1	# real 1st-round key
1560 aesenclast $in0,$inout7
1561
1562 movups $inout0,($out) # store 8 output blocks
1563 movdqa $in1,$inout0
1564 movups $inout1,0x10($out)
1565 movdqa $in2,$inout1
1566 movups $inout2,0x20($out)
1567 movdqa $in3,$inout2
1568 movups $inout3,0x30($out)
1569 movdqa $in4,$inout3
1570 movups $inout4,0x40($out)
1571 movdqa $in5,$inout4
1572 movups $inout5,0x50($out)
1573 movdqa $rndkey0,$inout5
1574 movups $inout6,0x60($out)
1575 movups $inout7,0x70($out)
1576 lea 0x80($out),$out # $out+=8*16
1577
1578 sub \$8,$len
1579 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1580
1581 add \$8,$len # restore real remaining $len
1582 jz .Lctr32_done # done if ($len==0)
1583 lea -0x80($key),$key
1584
1585 .Lctr32_tail:
1586 # note that at this point $inout0..5 are populated with
1587 # counter values xor-ed with 0-round key
1588 lea 16($key),$key
1589 cmp \$4,$len
1590 jb .Lctr32_loop3
1591 je .Lctr32_loop4
1592
1593 # if ($len>4) compute 7 E(counter)
1594 shl \$4,$rounds
1595 movdqa 0x60(%rsp),$inout6
1596 pxor $inout7,$inout7
1597
1598 $movkey 16($key),$rndkey0
1599 aesenc $rndkey1,$inout0
1600 aesenc $rndkey1,$inout1
1601 lea 32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
1602 neg %rax
1603 aesenc $rndkey1,$inout2
1604 add \$16,%rax # prepare for .Lenc_loop8_enter
1605 movups ($inp),$in0
1606 aesenc $rndkey1,$inout3
1607 aesenc $rndkey1,$inout4
1608 movups 0x10($inp),$in1 # pre-load input
1609 movups 0x20($inp),$in2
1610 aesenc $rndkey1,$inout5
1611 aesenc $rndkey1,$inout6
1612
1613 call .Lenc_loop8_enter
1614
1615 movdqu 0x30($inp),$in3
1616 pxor $in0,$inout0
1617 movdqu 0x40($inp),$in0
1618 pxor $in1,$inout1
1619 movdqu $inout0,($out) # store output
1620 pxor $in2,$inout2
1621 movdqu $inout1,0x10($out)
1622 pxor $in3,$inout3
1623 movdqu $inout2,0x20($out)
1624 pxor $in0,$inout4
1625 movdqu $inout3,0x30($out)
1626 movdqu $inout4,0x40($out)
1627 cmp \$6,$len
1628 jb .Lctr32_done # $len was 5, stop store
1629
1630 movups 0x50($inp),$in1
1631 xorps $in1,$inout5
1632 movups $inout5,0x50($out)
1633 je .Lctr32_done # $len was 6, stop store
1634
1635 movups 0x60($inp),$in2
1636 xorps $in2,$inout6
1637 movups $inout6,0x60($out)
1638 jmp .Lctr32_done # $len was 7, stop store
1639
1640 .align 32
1641 .Lctr32_loop4:
1642 aesenc $rndkey1,$inout0
1643 lea 16($key),$key
1644 dec $rounds
1645 aesenc $rndkey1,$inout1
1646 aesenc $rndkey1,$inout2
1647 aesenc $rndkey1,$inout3
1648 $movkey ($key),$rndkey1
1649 jnz .Lctr32_loop4
1650 aesenclast $rndkey1,$inout0
1651 aesenclast $rndkey1,$inout1
1652 movups ($inp),$in0 # load input
1653 movups 0x10($inp),$in1
1654 aesenclast $rndkey1,$inout2
1655 aesenclast $rndkey1,$inout3
1656 movups 0x20($inp),$in2
1657 movups 0x30($inp),$in3
1658
1659 xorps $in0,$inout0
1660 movups $inout0,($out) # store output
1661 xorps $in1,$inout1
1662 movups $inout1,0x10($out)
1663 pxor $in2,$inout2
1664 movdqu $inout2,0x20($out)
1665 pxor $in3,$inout3
1666 movdqu $inout3,0x30($out)
1667 jmp .Lctr32_done # $len was 4, stop store
1668
1669 .align 32
1670 .Lctr32_loop3:
1671 aesenc $rndkey1,$inout0
1672 lea 16($key),$key
1673 dec $rounds
1674 aesenc $rndkey1,$inout1
1675 aesenc $rndkey1,$inout2
1676 $movkey ($key),$rndkey1
1677 jnz .Lctr32_loop3
1678 aesenclast $rndkey1,$inout0
1679 aesenclast $rndkey1,$inout1
1680 aesenclast $rndkey1,$inout2
1681
1682 movups ($inp),$in0 # load input
1683 xorps $in0,$inout0
1684 movups $inout0,($out) # store output
1685 cmp \$2,$len
1686 jb .Lctr32_done # $len was 1, stop store
1687
1688 movups 0x10($inp),$in1
1689 xorps $in1,$inout1
1690 movups $inout1,0x10($out)
1691 je .Lctr32_done # $len was 2, stop store
1692
1693 movups 0x20($inp),$in2
1694 xorps $in2,$inout2
1695 movups $inout2,0x20($out) # $len was 3, stop store
1696
1697 .Lctr32_done:
1698 xorps %xmm0,%xmm0 # clear register bank
1699 xor $key0,$key0
1700 pxor %xmm1,%xmm1
1701 pxor %xmm2,%xmm2
1702 pxor %xmm3,%xmm3
1703 pxor %xmm4,%xmm4
1704 pxor %xmm5,%xmm5
1705 ___
1706 $code.=<<___ if (!$win64);
1707 pxor %xmm6,%xmm6
1708 pxor %xmm7,%xmm7
1709 movaps %xmm0,0x00(%rsp) # clear stack
1710 pxor %xmm8,%xmm8
1711 movaps %xmm0,0x10(%rsp)
1712 pxor %xmm9,%xmm9
1713 movaps %xmm0,0x20(%rsp)
1714 pxor %xmm10,%xmm10
1715 movaps %xmm0,0x30(%rsp)
1716 pxor %xmm11,%xmm11
1717 movaps %xmm0,0x40(%rsp)
1718 pxor %xmm12,%xmm12
1719 movaps %xmm0,0x50(%rsp)
1720 pxor %xmm13,%xmm13
1721 movaps %xmm0,0x60(%rsp)
1722 pxor %xmm14,%xmm14
1723 movaps %xmm0,0x70(%rsp)
1724 pxor %xmm15,%xmm15
1725 ___
1726 $code.=<<___ if ($win64);
1727 movaps -0xa8($key_),%xmm6
1728 movaps %xmm0,-0xa8($key_) # clear stack
1729 movaps -0x98($key_),%xmm7
1730 movaps %xmm0,-0x98($key_)
1731 movaps -0x88($key_),%xmm8
1732 movaps %xmm0,-0x88($key_)
1733 movaps -0x78($key_),%xmm9
1734 movaps %xmm0,-0x78($key_)
1735 movaps -0x68($key_),%xmm10
1736 movaps %xmm0,-0x68($key_)
1737 movaps -0x58($key_),%xmm11
1738 movaps %xmm0,-0x58($key_)
1739 movaps -0x48($key_),%xmm12
1740 movaps %xmm0,-0x48($key_)
1741 movaps -0x38($key_),%xmm13
1742 movaps %xmm0,-0x38($key_)
1743 movaps -0x28($key_),%xmm14
1744 movaps %xmm0,-0x28($key_)
1745 movaps -0x18($key_),%xmm15
1746 movaps %xmm0,-0x18($key_)
1747 movaps %xmm0,0x00(%rsp)
1748 movaps %xmm0,0x10(%rsp)
1749 movaps %xmm0,0x20(%rsp)
1750 movaps %xmm0,0x30(%rsp)
1751 movaps %xmm0,0x40(%rsp)
1752 movaps %xmm0,0x50(%rsp)
1753 movaps %xmm0,0x60(%rsp)
1754 movaps %xmm0,0x70(%rsp)
1755 ___
1756 $code.=<<___;
1757 mov -8($key_),%rbp
1758 .cfi_restore %rbp
1759 lea ($key_),%rsp
1760 .cfi_def_cfa_register %rsp
1761 .Lctr32_epilogue:
1762 ret
1763 .cfi_endproc
1764 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1765 ___
1766 }
1767 \f
1768 ######################################################################
1769 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1770 # const AES_KEY *key1, const AES_KEY *key2
1771 # const unsigned char iv[16]);
1772 #
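# The construction is the usual XTS one (pseudo-C sketch; E() is a raw
# AES block operation, P[j]/C[j] the j-th plaintext/ciphertext block of
# the sector):
#
#	T[0]   = E(key2, iv);			/* tweak from the IV	*/
#	T[j+1] = T[j] * x  in GF(2^128);	/* .Lxts_magic step	*/
#	C[j]   = E(key1, P[j] ^ T[j]) ^ T[j];
#
# key2 is only ever used to encrypt the tweak; key1 does the data.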
1773 {
1774 my @tweak=map("%xmm$_",(10..15));
1775 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1776 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1777 my $frame_size = 0x70 + ($win64?160:0);
1778 my $key_ = "%rbp"; # override so that we can use %r11 as FP
1779
1780 $code.=<<___;
1781 .globl aesni_xts_encrypt
1782 .type aesni_xts_encrypt,\@function,6
1783 .align 16
1784 aesni_xts_encrypt:
1785 .cfi_startproc
1786 endbranch
1787 lea (%rsp),%r11 # frame pointer
1788 .cfi_def_cfa_register %r11
1789 push %rbp
1790 .cfi_push %rbp
1791 sub \$$frame_size,%rsp
1792 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1793 ___
1794 $code.=<<___ if ($win64);
1795 movaps %xmm6,-0xa8(%r11) # offload everything
1796 movaps %xmm7,-0x98(%r11)
1797 movaps %xmm8,-0x88(%r11)
1798 movaps %xmm9,-0x78(%r11)
1799 movaps %xmm10,-0x68(%r11)
1800 movaps %xmm11,-0x58(%r11)
1801 movaps %xmm12,-0x48(%r11)
1802 movaps %xmm13,-0x38(%r11)
1803 movaps %xmm14,-0x28(%r11)
1804 movaps %xmm15,-0x18(%r11)
1805 .Lxts_enc_body:
1806 ___
1807 $code.=<<___;
1808 movups ($ivp),$inout0 # load clear-text tweak
1809 mov 240($key2),$rounds # key2->rounds
1810 mov 240($key),$rnds_ # key1->rounds
1811 ___
1812 # generate the tweak
1813 &aesni_generate1("enc",$key2,$rounds,$inout0);
1814 $code.=<<___;
1815 $movkey ($key),$rndkey0 # zero round key
1816 mov $key,$key_ # backup $key
1817 mov $rnds_,$rounds # backup $rounds
1818 shl \$4,$rnds_
1819 mov $len,$len_ # backup $len
1820 and \$-16,$len
1821
1822 $movkey 16($key,$rnds_),$rndkey1 # last round key
1823
1824 movdqa .Lxts_magic(%rip),$twmask
1825 movdqa $inout0,@tweak[5]
1826 pshufd \$0x5f,$inout0,$twres
1827 pxor $rndkey0,$rndkey1
1828 ___
1829 # alternative tweak calculation algorithm is based on suggestions
1830 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1831 # and should help in the future...
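# Scalar model (illustrative only) of one tweak doubling as the unrolled loop
# below computes it: pshufd 0x5f keeps a copy with the top dword (hence the
# sign bit) of each 64-bit half replicated into 32-bit lanes, psrad 31
# broadcasts those sign bits into masks, pand against .Lxts_magic selects
# 0x87 for the carry out of bit 127 (the GF(2^128) reduction
# x^128 = x^7+x^2+x+1) and 1 for the carry from the low half into the high
# half, paddq shifts both halves left by one, and pxor folds the masks back
# in.  The code paddd's the pre-broadcast copy so that the right bit sits in
# the sign position on every iteration; per step the effect is exactly the
# function below, which is the doubling used by xts_ref_encrypt_blocks() in
# the sketch above.  Assumes a 64-bit perl.
sub xts_ref_double_step {
    my ($lo, $hi) = @_;                           # little-endian 64-bit halves
    my $mask_lo = (($hi >> 63) & 1) ? 0x87 : 0;   # reduction constant from .Lxts_magic
    my $mask_hi = (($lo >> 63) & 1) ? 1    : 0;   # inter-half carry from .Lxts_magic
    $lo = (($lo & 0x7fffffffffffffff) << 1) ^ $mask_lo;
    $hi = (($hi & 0x7fffffffffffffff) << 1) ^ $mask_hi;
    return ($lo, $hi);
}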
1832 for ($i=0;$i<4;$i++) {
1833 $code.=<<___;
1834 movdqa $twres,$twtmp
1835 paddd $twres,$twres
1836 movdqa @tweak[5],@tweak[$i]
1837 psrad \$31,$twtmp # broadcast upper bits
1838 paddq @tweak[5],@tweak[5]
1839 pand $twmask,$twtmp
1840 pxor $rndkey0,@tweak[$i]
1841 pxor $twtmp,@tweak[5]
1842 ___
1843 }
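# Illustrative identity (not used by the generator) behind the pxor with
# $rndkey0 in the loop above and the round[0]^round[last] value saved just
# below: keeping T_i ^ round[0] in @tweak[] lets the tweak XOR absorb the
# first AddRoundKey, and XORing that value with round[0]^round[last] yields
# T_i ^ round[last], which aesenclast consumes straight from the stack so the
# last AddRoundKey and the output tweak XOR also collapse into one operation.
sub xts_ref_premix {                              # all arguments are 16-byte strings
    my ($tweak, $round0, $roundlast) = @_;
    my $pre  = $tweak ^ $round0;                  # what @tweak[0..4] hold
    my $post = $pre ^ ($round0 ^ $roundlast);     # == $tweak ^ $roundlast, kept at (%rsp)
    return ($pre, $post);
}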
1844 $code.=<<___;
1845 movdqa @tweak[5],@tweak[4]
1846 psrad \$31,$twres
1847 paddq @tweak[5],@tweak[5]
1848 pand $twmask,$twres
1849 pxor $rndkey0,@tweak[4]
1850 pxor $twres,@tweak[5]
1851 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1852
1853 sub \$16*6,$len
1854 jc .Lxts_enc_short # if $len-=6*16 borrowed
1855
1856 mov \$16+96,$rounds
1857 lea 32($key_,$rnds_),$key # end of key schedule
1858 sub %r10,%rax # twisted $rounds
1859 $movkey 16($key_),$rndkey1
1860 mov %rax,%r10 # backup twisted $rounds
1861 lea .Lxts_magic(%rip),%r8
1862 jmp .Lxts_enc_grandloop
1863
1864 .align 32
1865 .Lxts_enc_grandloop:
1866 movdqu `16*0`($inp),$inout0 # load input
1867 movdqa $rndkey0,$twmask
1868 movdqu `16*1`($inp),$inout1
1869 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1870 movdqu `16*2`($inp),$inout2
1871 pxor @tweak[1],$inout1
1872 aesenc $rndkey1,$inout0
1873 movdqu `16*3`($inp),$inout3
1874 pxor @tweak[2],$inout2
1875 aesenc $rndkey1,$inout1
1876 movdqu `16*4`($inp),$inout4
1877 pxor @tweak[3],$inout3
1878 aesenc $rndkey1,$inout2
1879 movdqu `16*5`($inp),$inout5
1880 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1881 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1882 pxor @tweak[4],$inout4
1883 aesenc $rndkey1,$inout3
1884 $movkey 32($key_),$rndkey0
1885 lea `16*6`($inp),$inp
1886 pxor $twmask,$inout5
1887
1888 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1889 aesenc $rndkey1,$inout4
1890 pxor $twres,@tweak[1]
1891 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1892 aesenc $rndkey1,$inout5
1893 $movkey 48($key_),$rndkey1
1894 pxor $twres,@tweak[2]
1895
1896 aesenc $rndkey0,$inout0
1897 pxor $twres,@tweak[3]
1898 movdqa @tweak[1],`16*1`(%rsp)
1899 aesenc $rndkey0,$inout1
1900 pxor $twres,@tweak[4]
1901 movdqa @tweak[2],`16*2`(%rsp)
1902 aesenc $rndkey0,$inout2
1903 aesenc $rndkey0,$inout3
1904 pxor $twres,$twmask
1905 movdqa @tweak[4],`16*4`(%rsp)
1906 aesenc $rndkey0,$inout4
1907 aesenc $rndkey0,$inout5
1908 $movkey 64($key_),$rndkey0
1909 movdqa $twmask,`16*5`(%rsp)
1910 pshufd \$0x5f,@tweak[5],$twres
1911 jmp .Lxts_enc_loop6
1912 .align 32
1913 .Lxts_enc_loop6:
1914 aesenc $rndkey1,$inout0
1915 aesenc $rndkey1,$inout1
1916 aesenc $rndkey1,$inout2
1917 aesenc $rndkey1,$inout3
1918 aesenc $rndkey1,$inout4
1919 aesenc $rndkey1,$inout5
1920 $movkey -64($key,%rax),$rndkey1
1921 add \$32,%rax
1922
1923 aesenc $rndkey0,$inout0
1924 aesenc $rndkey0,$inout1
1925 aesenc $rndkey0,$inout2
1926 aesenc $rndkey0,$inout3
1927 aesenc $rndkey0,$inout4
1928 aesenc $rndkey0,$inout5
1929 $movkey -80($key,%rax),$rndkey0
1930 jnz .Lxts_enc_loop6
1931
1932 movdqa (%r8),$twmask # start calculating next tweak
1933 movdqa $twres,$twtmp
1934 paddd $twres,$twres
1935 aesenc $rndkey1,$inout0
1936 paddq @tweak[5],@tweak[5]
1937 psrad \$31,$twtmp
1938 aesenc $rndkey1,$inout1
1939 pand $twmask,$twtmp
1940 $movkey ($key_),@tweak[0] # load round[0]
1941 aesenc $rndkey1,$inout2
1942 aesenc $rndkey1,$inout3
1943 aesenc $rndkey1,$inout4
1944 pxor $twtmp,@tweak[5]
1945 movaps @tweak[0],@tweak[1] # copy round[0]
1946 aesenc $rndkey1,$inout5
1947 $movkey -64($key),$rndkey1
1948
1949 movdqa $twres,$twtmp
1950 aesenc $rndkey0,$inout0
1951 paddd $twres,$twres
1952 pxor @tweak[5],@tweak[0]
1953 aesenc $rndkey0,$inout1
1954 psrad \$31,$twtmp
1955 paddq @tweak[5],@tweak[5]
1956 aesenc $rndkey0,$inout2
1957 aesenc $rndkey0,$inout3
1958 pand $twmask,$twtmp
1959 movaps @tweak[1],@tweak[2]
1960 aesenc $rndkey0,$inout4
1961 pxor $twtmp,@tweak[5]
1962 movdqa $twres,$twtmp
1963 aesenc $rndkey0,$inout5
1964 $movkey -48($key),$rndkey0
1965
1966 paddd $twres,$twres
1967 aesenc $rndkey1,$inout0
1968 pxor @tweak[5],@tweak[1]
1969 psrad \$31,$twtmp
1970 aesenc $rndkey1,$inout1
1971 paddq @tweak[5],@tweak[5]
1972 pand $twmask,$twtmp
1973 aesenc $rndkey1,$inout2
1974 aesenc $rndkey1,$inout3
1975 movdqa @tweak[3],`16*3`(%rsp)
1976 pxor $twtmp,@tweak[5]
1977 aesenc $rndkey1,$inout4
1978 movaps @tweak[2],@tweak[3]
1979 movdqa $twres,$twtmp
1980 aesenc $rndkey1,$inout5
1981 $movkey -32($key),$rndkey1
1982
1983 paddd $twres,$twres
1984 aesenc $rndkey0,$inout0
1985 pxor @tweak[5],@tweak[2]
1986 psrad \$31,$twtmp
1987 aesenc $rndkey0,$inout1
1988 paddq @tweak[5],@tweak[5]
1989 pand $twmask,$twtmp
1990 aesenc $rndkey0,$inout2
1991 aesenc $rndkey0,$inout3
1992 aesenc $rndkey0,$inout4
1993 pxor $twtmp,@tweak[5]
1994 movaps @tweak[3],@tweak[4]
1995 aesenc $rndkey0,$inout5
1996
1997 movdqa $twres,$rndkey0
1998 paddd $twres,$twres
1999 aesenc $rndkey1,$inout0
2000 pxor @tweak[5],@tweak[3]
2001 psrad \$31,$rndkey0
2002 aesenc $rndkey1,$inout1
2003 paddq @tweak[5],@tweak[5]
2004 pand $twmask,$rndkey0
2005 aesenc $rndkey1,$inout2
2006 aesenc $rndkey1,$inout3
2007 pxor $rndkey0,@tweak[5]
2008 $movkey ($key_),$rndkey0
2009 aesenc $rndkey1,$inout4
2010 aesenc $rndkey1,$inout5
2011 $movkey 16($key_),$rndkey1
2012
2013 pxor @tweak[5],@tweak[4]
2014 aesenclast `16*0`(%rsp),$inout0
2015 psrad \$31,$twres
2016 paddq @tweak[5],@tweak[5]
2017 aesenclast `16*1`(%rsp),$inout1
2018 aesenclast `16*2`(%rsp),$inout2
2019 pand $twmask,$twres
2020 mov %r10,%rax # restore $rounds
2021 aesenclast `16*3`(%rsp),$inout3
2022 aesenclast `16*4`(%rsp),$inout4
2023 aesenclast `16*5`(%rsp),$inout5
2024 pxor $twres,@tweak[5]
2025
2026 lea `16*6`($out),$out # $out+=6*16
2027 movups $inout0,`-16*6`($out) # store 6 output blocks
2028 movups $inout1,`-16*5`($out)
2029 movups $inout2,`-16*4`($out)
2030 movups $inout3,`-16*3`($out)
2031 movups $inout4,`-16*2`($out)
2032 movups $inout5,`-16*1`($out)
2033 sub \$16*6,$len
2034 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
2035
2036 mov \$16+96,$rounds
2037 sub $rnds_,$rounds
2038 mov $key_,$key # restore $key
2039 shr \$4,$rounds # restore original value
2040
2041 .Lxts_enc_short:
2042 # at this point @tweak[0..5] are populated with tweak values
2043 mov $rounds,$rnds_ # backup $rounds
2044 pxor $rndkey0,@tweak[0]
2045 add \$16*6,$len # restore real remaining $len
2046 jz .Lxts_enc_done # done if ($len==0)
2047
2048 pxor $rndkey0,@tweak[1]
2049 cmp \$0x20,$len
2050 jb .Lxts_enc_one # $len is 1*16
2051 pxor $rndkey0,@tweak[2]
2052 je .Lxts_enc_two # $len is 2*16
2053
2054 pxor $rndkey0,@tweak[3]
2055 cmp \$0x40,$len
2056 jb .Lxts_enc_three # $len is 3*16
2057 pxor $rndkey0,@tweak[4]
2058 je .Lxts_enc_four # $len is 4*16
2059
2060 movdqu ($inp),$inout0 # $len is 5*16
2061 movdqu 16*1($inp),$inout1
2062 movdqu 16*2($inp),$inout2
2063 pxor @tweak[0],$inout0
2064 movdqu 16*3($inp),$inout3
2065 pxor @tweak[1],$inout1
2066 movdqu 16*4($inp),$inout4
2067 lea 16*5($inp),$inp # $inp+=5*16
2068 pxor @tweak[2],$inout2
2069 pxor @tweak[3],$inout3
2070 pxor @tweak[4],$inout4
2071 pxor $inout5,$inout5
2072
2073 call _aesni_encrypt6
2074
2075 xorps @tweak[0],$inout0
2076 movdqa @tweak[5],@tweak[0]
2077 xorps @tweak[1],$inout1
2078 xorps @tweak[2],$inout2
2079 movdqu $inout0,($out) # store 5 output blocks
2080 xorps @tweak[3],$inout3
2081 movdqu $inout1,16*1($out)
2082 xorps @tweak[4],$inout4
2083 movdqu $inout2,16*2($out)
2084 movdqu $inout3,16*3($out)
2085 movdqu $inout4,16*4($out)
2086 lea 16*5($out),$out # $out+=5*16
2087 jmp .Lxts_enc_done
2088
2089 .align 16
2090 .Lxts_enc_one:
2091 movups ($inp),$inout0
2092 lea 16*1($inp),$inp # inp+=1*16
2093 xorps @tweak[0],$inout0
2094 ___
2095 &aesni_generate1("enc",$key,$rounds);
2096 $code.=<<___;
2097 xorps @tweak[0],$inout0
2098 movdqa @tweak[1],@tweak[0]
2099 movups $inout0,($out) # store one output block
2100 lea 16*1($out),$out # $out+=1*16
2101 jmp .Lxts_enc_done
2102
2103 .align 16
2104 .Lxts_enc_two:
2105 movups ($inp),$inout0
2106 movups 16($inp),$inout1
2107 lea 32($inp),$inp # $inp+=2*16
2108 xorps @tweak[0],$inout0
2109 xorps @tweak[1],$inout1
2110
2111 call _aesni_encrypt2
2112
2113 xorps @tweak[0],$inout0
2114 movdqa @tweak[2],@tweak[0]
2115 xorps @tweak[1],$inout1
2116 movups $inout0,($out) # store 2 output blocks
2117 movups $inout1,16*1($out)
2118 lea 16*2($out),$out # $out+=2*16
2119 jmp .Lxts_enc_done
2120
2121 .align 16
2122 .Lxts_enc_three:
2123 movups ($inp),$inout0
2124 movups 16*1($inp),$inout1
2125 movups 16*2($inp),$inout2
2126 lea 16*3($inp),$inp # $inp+=3*16
2127 xorps @tweak[0],$inout0
2128 xorps @tweak[1],$inout1
2129 xorps @tweak[2],$inout2
2130
2131 call _aesni_encrypt3
2132
2133 xorps @tweak[0],$inout0
2134 movdqa @tweak[3],@tweak[0]
2135 xorps @tweak[1],$inout1
2136 xorps @tweak[2],$inout2
2137 movups $inout0,($out) # store 3 output blocks
2138 movups $inout1,16*1($out)
2139 movups $inout2,16*2($out)
2140 lea 16*3($out),$out # $out+=3*16
2141 jmp .Lxts_enc_done
2142
2143 .align 16
2144 .Lxts_enc_four:
2145 movups ($inp),$inout0
2146 movups 16*1($inp),$inout1
2147 movups 16*2($inp),$inout2
2148 xorps @tweak[0],$inout0
2149 movups 16*3($inp),$inout3
2150 lea 16*4($inp),$inp # $inp+=4*16
2151 xorps @tweak[1],$inout1
2152 xorps @tweak[2],$inout2
2153 xorps @tweak[3],$inout3
2154
2155 call _aesni_encrypt4
2156
2157 pxor @tweak[0],$inout0
2158 movdqa @tweak[4],@tweak[0]
2159 pxor @tweak[1],$inout1
2160 pxor @tweak[2],$inout2
2161 movdqu $inout0,($out) # store 4 output blocks
2162 pxor @tweak[3],$inout3
2163 movdqu $inout1,16*1($out)
2164 movdqu $inout2,16*2($out)
2165 movdqu $inout3,16*3($out)
2166 lea 16*4($out),$out # $out+=4*16
2167 jmp .Lxts_enc_done
2168
2169 .align 16
2170 .Lxts_enc_done:
2171 and \$15,$len_ # see if $len%16 is 0
2172 jz .Lxts_enc_ret
2173 mov $len_,$len
2174
2175 .Lxts_enc_steal:
2176 movzb ($inp),%eax # borrow $rounds ...
2177 movzb -16($out),%ecx # ... and $key
2178 lea 1($inp),$inp
2179 mov %al,-16($out)
2180 mov %cl,0($out)
2181 lea 1($out),$out
2182 sub \$1,$len
2183 jnz .Lxts_enc_steal
2184
2185 sub $len_,$out # rewind $out
2186 mov $key_,$key # restore $key
2187 mov $rnds_,$rounds # restore $rounds
2188
2189 movups -16($out),$inout0
2190 xorps @tweak[0],$inout0
2191 ___
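# Reference sketch (illustrative only) of the stealing performed by
# .Lxts_enc_steal above: the byte-swap loop rebuilds, in place at $out-16,
# the block P_m || C_{m-1}[b..15] while moving C_{m-1}[0..b-1] into the final
# partial position, and the single-block encryption generated just below
# re-encrypts that block with the next tweak.  $enc1 is a caller-supplied
# coderef for one-block AES with key1, $tweak is the tweak following the one
# used for the last full block, and $last_cipher is that block's ciphertext.
sub xts_ref_steal_encrypt {
    my ($enc1, $tweak, $last_cipher, $plain_tail) = @_;
    my $b = length $plain_tail;                          # 1..15 bytes
    my $block = $plain_tail . substr($last_cipher, $b);  # P_m || stolen ciphertext
    my $final = $enc1->($block ^ $tweak) ^ $tweak;       # replaces C_{m-1}
    return ($final, substr($last_cipher, 0, $b));        # (new full block, partial C_m)
}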
2192 &aesni_generate1("enc",$key,$rounds);
2193 $code.=<<___;
2194 xorps @tweak[0],$inout0
2195 movups $inout0,-16($out)
2196
2197 .Lxts_enc_ret:
2198 xorps %xmm0,%xmm0 # clear register bank
2199 pxor %xmm1,%xmm1
2200 pxor %xmm2,%xmm2
2201 pxor %xmm3,%xmm3
2202 pxor %xmm4,%xmm4
2203 pxor %xmm5,%xmm5
2204 ___
2205 $code.=<<___ if (!$win64);
2206 pxor %xmm6,%xmm6
2207 pxor %xmm7,%xmm7
2208 movaps %xmm0,0x00(%rsp) # clear stack
2209 pxor %xmm8,%xmm8
2210 movaps %xmm0,0x10(%rsp)
2211 pxor %xmm9,%xmm9
2212 movaps %xmm0,0x20(%rsp)
2213 pxor %xmm10,%xmm10
2214 movaps %xmm0,0x30(%rsp)
2215 pxor %xmm11,%xmm11
2216 movaps %xmm0,0x40(%rsp)
2217 pxor %xmm12,%xmm12
2218 movaps %xmm0,0x50(%rsp)
2219 pxor %xmm13,%xmm13
2220 movaps %xmm0,0x60(%rsp)
2221 pxor %xmm14,%xmm14
2222 pxor %xmm15,%xmm15
2223 ___
2224 $code.=<<___ if ($win64);
2225 movaps -0xa8(%r11),%xmm6
2226 movaps %xmm0,-0xa8(%r11) # clear stack
2227 movaps -0x98(%r11),%xmm7
2228 movaps %xmm0,-0x98(%r11)
2229 movaps -0x88(%r11),%xmm8
2230 movaps %xmm0,-0x88(%r11)
2231 movaps -0x78(%r11),%xmm9
2232 movaps %xmm0,-0x78(%r11)
2233 movaps -0x68(%r11),%xmm10
2234 movaps %xmm0,-0x68(%r11)
2235 movaps -0x58(%r11),%xmm11
2236 movaps %xmm0,-0x58(%r11)
2237 movaps -0x48(%r11),%xmm12
2238 movaps %xmm0,-0x48(%r11)
2239 movaps -0x38(%r11),%xmm13
2240 movaps %xmm0,-0x38(%r11)
2241 movaps -0x28(%r11),%xmm14
2242 movaps %xmm0,-0x28(%r11)
2243 movaps -0x18(%r11),%xmm15
2244 movaps %xmm0,-0x18(%r11)
2245 movaps %xmm0,0x00(%rsp)
2246 movaps %xmm0,0x10(%rsp)
2247 movaps %xmm0,0x20(%rsp)
2248 movaps %xmm0,0x30(%rsp)
2249 movaps %xmm0,0x40(%rsp)
2250 movaps %xmm0,0x50(%rsp)
2251 movaps %xmm0,0x60(%rsp)
2252 ___
2253 $code.=<<___;
2254 mov -8(%r11),%rbp
2255 .cfi_restore %rbp
2256 lea (%r11),%rsp
2257 .cfi_def_cfa_register %rsp
2258 .Lxts_enc_epilogue:
2259 ret
2260 .cfi_endproc
2261 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2262 ___
2263
2264 $code.=<<___;
2265 .globl aesni_xts_decrypt
2266 .type aesni_xts_decrypt,\@function,6
2267 .align 16
2268 aesni_xts_decrypt:
2269 .cfi_startproc
2270 endbranch
2271 lea (%rsp),%r11 # frame pointer
2272 .cfi_def_cfa_register %r11
2273 push %rbp
2274 .cfi_push %rbp
2275 sub \$$frame_size,%rsp
2276 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2277 ___
2278 $code.=<<___ if ($win64);
2279 movaps %xmm6,-0xa8(%r11) # offload everything
2280 movaps %xmm7,-0x98(%r11)
2281 movaps %xmm8,-0x88(%r11)
2282 movaps %xmm9,-0x78(%r11)
2283 movaps %xmm10,-0x68(%r11)
2284 movaps %xmm11,-0x58(%r11)
2285 movaps %xmm12,-0x48(%r11)
2286 movaps %xmm13,-0x38(%r11)
2287 movaps %xmm14,-0x28(%r11)
2288 movaps %xmm15,-0x18(%r11)
2289 .Lxts_dec_body:
2290 ___
2291 $code.=<<___;
2292 movups ($ivp),$inout0 # load clear-text tweak
2293 mov 240($key2),$rounds # key2->rounds
2294 mov 240($key),$rnds_ # key1->rounds
2295 ___
2296 # generate the tweak
2297 &aesni_generate1("enc",$key2,$rounds,$inout0);
2298 $code.=<<___;
2299 xor %eax,%eax # if ($len%16) len-=16;
2300 test \$15,$len
2301 setnz %al
2302 shl \$4,%rax
2303 sub %rax,$len
2304
2305 $movkey ($key),$rndkey0 # zero round key
2306 mov $key,$key_ # backup $key
2307 mov $rnds_,$rounds # backup $rounds
2308 shl \$4,$rnds_
2309 mov $len,$len_ # backup $len
2310 and \$-16,$len
2311
2312 $movkey 16($key,$rnds_),$rndkey1 # last round key
2313
2314 movdqa .Lxts_magic(%rip),$twmask
2315 movdqa $inout0,@tweak[5]
2316 pshufd \$0x5f,$inout0,$twres
2317 pxor $rndkey0,$rndkey1
2318 ___
2319 for ($i=0;$i<4;$i++) {
2320 $code.=<<___;
2321 movdqa $twres,$twtmp
2322 paddd $twres,$twres
2323 movdqa @tweak[5],@tweak[$i]
2324 psrad \$31,$twtmp # broadcast upper bits
2325 paddq @tweak[5],@tweak[5]
2326 pand $twmask,$twtmp
2327 pxor $rndkey0,@tweak[$i]
2328 pxor $twtmp,@tweak[5]
2329 ___
2330 }
2331 $code.=<<___;
2332 movdqa @tweak[5],@tweak[4]
2333 psrad \$31,$twres
2334 paddq @tweak[5],@tweak[5]
2335 pand $twmask,$twres
2336 pxor $rndkey0,@tweak[4]
2337 pxor $twres,@tweak[5]
2338 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2339
2340 sub \$16*6,$len
2341 jc .Lxts_dec_short # if $len-=6*16 borrowed
2342
2343 mov \$16+96,$rounds
2344 lea 32($key_,$rnds_),$key # end of key schedule
2345 sub %r10,%rax # twisted $rounds
2346 $movkey 16($key_),$rndkey1
2347 mov %rax,%r10 # backup twisted $rounds
2348 lea .Lxts_magic(%rip),%r8
2349 jmp .Lxts_dec_grandloop
2350
2351 .align 32
2352 .Lxts_dec_grandloop:
2353 movdqu `16*0`($inp),$inout0 # load input
2354 movdqa $rndkey0,$twmask
2355 movdqu `16*1`($inp),$inout1
2356 pxor @tweak[0],$inout0 # input^=tweak^round[0]
2357 movdqu `16*2`($inp),$inout2
2358 pxor @tweak[1],$inout1
2359 aesdec $rndkey1,$inout0
2360 movdqu `16*3`($inp),$inout3
2361 pxor @tweak[2],$inout2
2362 aesdec $rndkey1,$inout1
2363 movdqu `16*4`($inp),$inout4
2364 pxor @tweak[3],$inout3
2365 aesdec $rndkey1,$inout2
2366 movdqu `16*5`($inp),$inout5
2367 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2368 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2369 pxor @tweak[4],$inout4
2370 aesdec $rndkey1,$inout3
2371 $movkey 32($key_),$rndkey0
2372 lea `16*6`($inp),$inp
2373 pxor $twmask,$inout5
2374
2375 pxor $twres,@tweak[0] # calculate tweaks^round[last]
2376 aesdec $rndkey1,$inout4
2377 pxor $twres,@tweak[1]
2378 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2379 aesdec $rndkey1,$inout5
2380 $movkey 48($key_),$rndkey1
2381 pxor $twres,@tweak[2]
2382
2383 aesdec $rndkey0,$inout0
2384 pxor $twres,@tweak[3]
2385 movdqa @tweak[1],`16*1`(%rsp)
2386 aesdec $rndkey0,$inout1
2387 pxor $twres,@tweak[4]
2388 movdqa @tweak[2],`16*2`(%rsp)
2389 aesdec $rndkey0,$inout2
2390 aesdec $rndkey0,$inout3
2391 pxor $twres,$twmask
2392 movdqa @tweak[4],`16*4`(%rsp)
2393 aesdec $rndkey0,$inout4
2394 aesdec $rndkey0,$inout5
2395 $movkey 64($key_),$rndkey0
2396 movdqa $twmask,`16*5`(%rsp)
2397 pshufd \$0x5f,@tweak[5],$twres
2398 jmp .Lxts_dec_loop6
2399 .align 32
2400 .Lxts_dec_loop6:
2401 aesdec $rndkey1,$inout0
2402 aesdec $rndkey1,$inout1
2403 aesdec $rndkey1,$inout2
2404 aesdec $rndkey1,$inout3
2405 aesdec $rndkey1,$inout4
2406 aesdec $rndkey1,$inout5
2407 $movkey -64($key,%rax),$rndkey1
2408 add \$32,%rax
2409
2410 aesdec $rndkey0,$inout0
2411 aesdec $rndkey0,$inout1
2412 aesdec $rndkey0,$inout2
2413 aesdec $rndkey0,$inout3
2414 aesdec $rndkey0,$inout4
2415 aesdec $rndkey0,$inout5
2416 $movkey -80($key,%rax),$rndkey0
2417 jnz .Lxts_dec_loop6
2418
2419 movdqa (%r8),$twmask # start calculating next tweak
2420 movdqa $twres,$twtmp
2421 paddd $twres,$twres
2422 aesdec $rndkey1,$inout0
2423 paddq @tweak[5],@tweak[5]
2424 psrad \$31,$twtmp
2425 aesdec $rndkey1,$inout1
2426 pand $twmask,$twtmp
2427 $movkey ($key_),@tweak[0] # load round[0]
2428 aesdec $rndkey1,$inout2
2429 aesdec $rndkey1,$inout3
2430 aesdec $rndkey1,$inout4
2431 pxor $twtmp,@tweak[5]
2432 movaps @tweak[0],@tweak[1] # copy round[0]
2433 aesdec $rndkey1,$inout5
2434 $movkey -64($key),$rndkey1
2435
2436 movdqa $twres,$twtmp
2437 aesdec $rndkey0,$inout0
2438 paddd $twres,$twres
2439 pxor @tweak[5],@tweak[0]
2440 aesdec $rndkey0,$inout1
2441 psrad \$31,$twtmp
2442 paddq @tweak[5],@tweak[5]
2443 aesdec $rndkey0,$inout2
2444 aesdec $rndkey0,$inout3
2445 pand $twmask,$twtmp
2446 movaps @tweak[1],@tweak[2]
2447 aesdec $rndkey0,$inout4
2448 pxor $twtmp,@tweak[5]
2449 movdqa $twres,$twtmp
2450 aesdec $rndkey0,$inout5
2451 $movkey -48($key),$rndkey0
2452
2453 paddd $twres,$twres
2454 aesdec $rndkey1,$inout0
2455 pxor @tweak[5],@tweak[1]
2456 psrad \$31,$twtmp
2457 aesdec $rndkey1,$inout1
2458 paddq @tweak[5],@tweak[5]
2459 pand $twmask,$twtmp
2460 aesdec $rndkey1,$inout2
2461 aesdec $rndkey1,$inout3
2462 movdqa @tweak[3],`16*3`(%rsp)
2463 pxor $twtmp,@tweak[5]
2464 aesdec $rndkey1,$inout4
2465 movaps @tweak[2],@tweak[3]
2466 movdqa $twres,$twtmp
2467 aesdec $rndkey1,$inout5
2468 $movkey -32($key),$rndkey1
2469
2470 paddd $twres,$twres
2471 aesdec $rndkey0,$inout0
2472 pxor @tweak[5],@tweak[2]
2473 psrad \$31,$twtmp
2474 aesdec $rndkey0,$inout1
2475 paddq @tweak[5],@tweak[5]
2476 pand $twmask,$twtmp
2477 aesdec $rndkey0,$inout2
2478 aesdec $rndkey0,$inout3
2479 aesdec $rndkey0,$inout4
2480 pxor $twtmp,@tweak[5]
2481 movaps @tweak[3],@tweak[4]
2482 aesdec $rndkey0,$inout5
2483
2484 movdqa $twres,$rndkey0
2485 paddd $twres,$twres
2486 aesdec $rndkey1,$inout0
2487 pxor @tweak[5],@tweak[3]
2488 psrad \$31,$rndkey0
2489 aesdec $rndkey1,$inout1
2490 paddq @tweak[5],@tweak[5]
2491 pand $twmask,$rndkey0
2492 aesdec $rndkey1,$inout2
2493 aesdec $rndkey1,$inout3
2494 pxor $rndkey0,@tweak[5]
2495 $movkey ($key_),$rndkey0
2496 aesdec $rndkey1,$inout4
2497 aesdec $rndkey1,$inout5
2498 $movkey 16($key_),$rndkey1
2499
2500 pxor @tweak[5],@tweak[4]
2501 aesdeclast `16*0`(%rsp),$inout0
2502 psrad \$31,$twres
2503 paddq @tweak[5],@tweak[5]
2504 aesdeclast `16*1`(%rsp),$inout1
2505 aesdeclast `16*2`(%rsp),$inout2
2506 pand $twmask,$twres
2507 mov %r10,%rax # restore $rounds
2508 aesdeclast `16*3`(%rsp),$inout3
2509 aesdeclast `16*4`(%rsp),$inout4
2510 aesdeclast `16*5`(%rsp),$inout5
2511 pxor $twres,@tweak[5]
2512
2513 lea `16*6`($out),$out # $out+=6*16
2514 movups $inout0,`-16*6`($out) # store 6 output blocks
2515 movups $inout1,`-16*5`($out)
2516 movups $inout2,`-16*4`($out)
2517 movups $inout3,`-16*3`($out)
2518 movups $inout4,`-16*2`($out)
2519 movups $inout5,`-16*1`($out)
2520 sub \$16*6,$len
2521 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2522
2523 mov \$16+96,$rounds
2524 sub $rnds_,$rounds
2525 mov $key_,$key # restore $key
2526 shr \$4,$rounds # restore original value
2527
2528 .Lxts_dec_short:
2529 # at this point @tweak[0..5] are populated with tweak values
2530 mov $rounds,$rnds_ # backup $rounds
2531 pxor $rndkey0,@tweak[0]
2532 pxor $rndkey0,@tweak[1]
2533 add \$16*6,$len # restore real remaining $len
2534 jz .Lxts_dec_done # done if ($len==0)
2535
2536 pxor $rndkey0,@tweak[2]
2537 cmp \$0x20,$len
2538 jb .Lxts_dec_one # $len is 1*16
2539 pxor $rndkey0,@tweak[3]
2540 je .Lxts_dec_two # $len is 2*16
2541
2542 pxor $rndkey0,@tweak[4]
2543 cmp \$0x40,$len
2544 jb .Lxts_dec_three # $len is 3*16
2545 je .Lxts_dec_four # $len is 4*16
2546
2547 movdqu ($inp),$inout0 # $len is 5*16
2548 movdqu 16*1($inp),$inout1
2549 movdqu 16*2($inp),$inout2
2550 pxor @tweak[0],$inout0
2551 movdqu 16*3($inp),$inout3
2552 pxor @tweak[1],$inout1
2553 movdqu 16*4($inp),$inout4
2554 lea 16*5($inp),$inp # $inp+=5*16
2555 pxor @tweak[2],$inout2
2556 pxor @tweak[3],$inout3
2557 pxor @tweak[4],$inout4
2558
2559 call _aesni_decrypt6
2560
2561 xorps @tweak[0],$inout0
2562 xorps @tweak[1],$inout1
2563 xorps @tweak[2],$inout2
2564 movdqu $inout0,($out) # store 5 output blocks
2565 xorps @tweak[3],$inout3
2566 movdqu $inout1,16*1($out)
2567 xorps @tweak[4],$inout4
2568 movdqu $inout2,16*2($out)
2569 pxor $twtmp,$twtmp
2570 movdqu $inout3,16*3($out)
2571 pcmpgtd @tweak[5],$twtmp
2572 movdqu $inout4,16*4($out)
2573 lea 16*5($out),$out # $out+=5*16
2574 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2575 and \$15,$len_
2576 jz .Lxts_dec_ret
2577
2578 movdqa @tweak[5],@tweak[0]
2579 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2580 pand $twmask,@tweak[1] # isolate carry and residue
2581 pxor @tweak[5],@tweak[1]
2582 jmp .Lxts_dec_done2
2583
2584 .align 16
2585 .Lxts_dec_one:
2586 movups ($inp),$inout0
2587 lea 16*1($inp),$inp # $inp+=1*16
2588 xorps @tweak[0],$inout0
2589 ___
2590 &aesni_generate1("dec",$key,$rounds);
2591 $code.=<<___;
2592 xorps @tweak[0],$inout0
2593 movdqa @tweak[1],@tweak[0]
2594 movups $inout0,($out) # store one output block
2595 movdqa @tweak[2],@tweak[1]
2596 lea 16*1($out),$out # $out+=1*16
2597 jmp .Lxts_dec_done
2598
2599 .align 16
2600 .Lxts_dec_two:
2601 movups ($inp),$inout0
2602 movups 16($inp),$inout1
2603 lea 32($inp),$inp # $inp+=2*16
2604 xorps @tweak[0],$inout0
2605 xorps @tweak[1],$inout1
2606
2607 call _aesni_decrypt2
2608
2609 xorps @tweak[0],$inout0
2610 movdqa @tweak[2],@tweak[0]
2611 xorps @tweak[1],$inout1
2612 movdqa @tweak[3],@tweak[1]
2613 movups $inout0,($out) # store 2 output blocks
2614 movups $inout1,16*1($out)
2615 lea 16*2($out),$out # $out+=2*16
2616 jmp .Lxts_dec_done
2617
2618 .align 16
2619 .Lxts_dec_three:
2620 movups ($inp),$inout0
2621 movups 16*1($inp),$inout1
2622 movups 16*2($inp),$inout2
2623 lea 16*3($inp),$inp # $inp+=3*16
2624 xorps @tweak[0],$inout0
2625 xorps @tweak[1],$inout1
2626 xorps @tweak[2],$inout2
2627
2628 call _aesni_decrypt3
2629
2630 xorps @tweak[0],$inout0
2631 movdqa @tweak[3],@tweak[0]
2632 xorps @tweak[1],$inout1
2633 movdqa @tweak[4],@tweak[1]
2634 xorps @tweak[2],$inout2
2635 movups $inout0,($out) # store 3 output blocks
2636 movups $inout1,16*1($out)
2637 movups $inout2,16*2($out)
2638 lea 16*3($out),$out # $out+=3*16
2639 jmp .Lxts_dec_done
2640
2641 .align 16
2642 .Lxts_dec_four:
2643 movups ($inp),$inout0
2644 movups 16*1($inp),$inout1
2645 movups 16*2($inp),$inout2
2646 xorps @tweak[0],$inout0
2647 movups 16*3($inp),$inout3
2648 lea 16*4($inp),$inp # $inp+=4*16
2649 xorps @tweak[1],$inout1
2650 xorps @tweak[2],$inout2
2651 xorps @tweak[3],$inout3
2652
2653 call _aesni_decrypt4
2654
2655 pxor @tweak[0],$inout0
2656 movdqa @tweak[4],@tweak[0]
2657 pxor @tweak[1],$inout1
2658 movdqa @tweak[5],@tweak[1]
2659 pxor @tweak[2],$inout2
2660 movdqu $inout0,($out) # store 4 output blocks
2661 pxor @tweak[3],$inout3
2662 movdqu $inout1,16*1($out)
2663 movdqu $inout2,16*2($out)
2664 movdqu $inout3,16*3($out)
2665 lea 16*4($out),$out # $out+=4*16
2666 jmp .Lxts_dec_done
2667
2668 .align 16
2669 .Lxts_dec_done:
2670 and \$15,$len_ # see if $len%16 is 0
2671 jz .Lxts_dec_ret
2672 .Lxts_dec_done2:
2673 mov $len_,$len
2674 mov $key_,$key # restore $key
2675 mov $rnds_,$rounds # restore $rounds
2676
2677 movups ($inp),$inout0
2678 xorps @tweak[1],$inout0
2679 ___
2680 &aesni_generate1("dec",$key,$rounds);
2681 $code.=<<___;
2682 xorps @tweak[1],$inout0
2683 movups $inout0,($out)
2684
2685 .Lxts_dec_steal:
2686 movzb 16($inp),%eax # borrow $rounds ...
2687 movzb ($out),%ecx # ... and $key
2688 lea 1($inp),$inp
2689 mov %al,($out)
2690 mov %cl,16($out)
2691 lea 1($out),$out
2692 sub \$1,$len
2693 jnz .Lxts_dec_steal
2694
2695 sub $len_,$out # rewind $out
2696 mov $key_,$key # restore $key
2697 mov $rnds_,$rounds # restore $rounds
2698
2699 movups ($out),$inout0
2700 xorps @tweak[0],$inout0
2701 ___
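# Reference sketch (illustrative only) of the decrypt-side stealing done at
# .Lxts_dec_done2/.Lxts_dec_steal above and finished by the single-block
# decryption generated just below: the last full ciphertext block is first
# decrypted with the *next* tweak, its tail is swapped with the partial
# ciphertext block, and the recombined block is then decrypted with the
# previous tweak.  $dec1 is a caller-supplied coderef for one-block AES
# decryption with key1; all blocks and tweaks are 16-byte strings.
sub xts_ref_steal_decrypt {
    my ($dec1, $tweak_prev, $tweak_next, $last_cipher, $cipher_tail) = @_;
    my $b  = length $cipher_tail;                               # 1..15 bytes
    my $pp = $dec1->($last_cipher ^ $tweak_next) ^ $tweak_next;
    my $plain_tail = substr($pp, 0, $b);                        # P_m
    my $cc = $cipher_tail . substr($pp, $b);                    # C_m || stolen bytes
    my $plain_last = $dec1->($cc ^ $tweak_prev) ^ $tweak_prev;  # P_{m-1}
    return ($plain_last, $plain_tail);
}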
2702 &aesni_generate1("dec",$key,$rounds);
2703 $code.=<<___;
2704 xorps @tweak[0],$inout0
2705 movups $inout0,($out)
2706
2707 .Lxts_dec_ret:
2708 xorps %xmm0,%xmm0 # clear register bank
2709 pxor %xmm1,%xmm1
2710 pxor %xmm2,%xmm2
2711 pxor %xmm3,%xmm3
2712 pxor %xmm4,%xmm4
2713 pxor %xmm5,%xmm5
2714 ___
2715 $code.=<<___ if (!$win64);
2716 pxor %xmm6,%xmm6
2717 pxor %xmm7,%xmm7
2718 movaps %xmm0,0x00(%rsp) # clear stack
2719 pxor %xmm8,%xmm8
2720 movaps %xmm0,0x10(%rsp)
2721 pxor %xmm9,%xmm9
2722 movaps %xmm0,0x20(%rsp)
2723 pxor %xmm10,%xmm10
2724 movaps %xmm0,0x30(%rsp)
2725 pxor %xmm11,%xmm11
2726 movaps %xmm0,0x40(%rsp)
2727 pxor %xmm12,%xmm12
2728 movaps %xmm0,0x50(%rsp)
2729 pxor %xmm13,%xmm13
2730 movaps %xmm0,0x60(%rsp)
2731 pxor %xmm14,%xmm14
2732 pxor %xmm15,%xmm15
2733 ___
2734 $code.=<<___ if ($win64);
2735 movaps -0xa8(%r11),%xmm6
2736 movaps %xmm0,-0xa8(%r11) # clear stack
2737 movaps -0x98(%r11),%xmm7
2738 movaps %xmm0,-0x98(%r11)
2739 movaps -0x88(%r11),%xmm8
2740 movaps %xmm0,-0x88(%r11)
2741 movaps -0x78(%r11),%xmm9
2742 movaps %xmm0,-0x78(%r11)
2743 movaps -0x68(%r11),%xmm10
2744 movaps %xmm0,-0x68(%r11)
2745 movaps -0x58(%r11),%xmm11
2746 movaps %xmm0,-0x58(%r11)
2747 movaps -0x48(%r11),%xmm12
2748 movaps %xmm0,-0x48(%r11)
2749 movaps -0x38(%r11),%xmm13
2750 movaps %xmm0,-0x38(%r11)
2751 movaps -0x28(%r11),%xmm14
2752 movaps %xmm0,-0x28(%r11)
2753 movaps -0x18(%r11),%xmm15
2754 movaps %xmm0,-0x18(%r11)
2755 movaps %xmm0,0x00(%rsp)
2756 movaps %xmm0,0x10(%rsp)
2757 movaps %xmm0,0x20(%rsp)
2758 movaps %xmm0,0x30(%rsp)
2759 movaps %xmm0,0x40(%rsp)
2760 movaps %xmm0,0x50(%rsp)
2761 movaps %xmm0,0x60(%rsp)
2762 ___
2763 $code.=<<___;
2764 mov -8(%r11),%rbp
2765 .cfi_restore %rbp
2766 lea (%r11),%rsp
2767 .cfi_def_cfa_register %rsp
2768 .Lxts_dec_epilogue:
2769 ret
2770 .cfi_endproc
2771 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2772 ___
2773 }
2774 \f
2775 ######################################################################
2776 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2777 # const AES_KEY *key, unsigned int start_block_num,
2778 # unsigned char offset_i[16], const unsigned char L_[][16],
2779 # unsigned char checksum[16]);
2780 #
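# Reference sketch (illustrative only, never called by this generator) of the
# OCB bookkeeping the routines below implement: offset_i = offset_{i-1} ^
# L_[ntz(i)], the checksum accumulates the plaintext, and each block is
# processed as E(P_i ^ offset_i) ^ offset_i, with the same round[0]/round[last]
# premix trick applied to the offsets as in the XTS code above.  $encrypt is
# assumed to be a caller-supplied coderef for one-block AES; $L is the
# caller's L_[] table of 16-byte strings; $offset and $checksum are 16-byte
# strings; the decrypt path is identical except that the checksum accumulates
# the decrypted plaintext.  Block numbers start at 1, so ntz() is well
# defined, and every odd-numbered block uses L_[0] -- which is why the code
# below aligns its 6-block groups to start on an odd block number and only
# runs bsf for the three even members of each group.
sub ocb_ref_ntz { my $i = shift; my $z = 0; $z++, $i >>= 1 until $i & 1; $z }
sub ocb_ref_encrypt_blocks {
    my ($encrypt, $L, $block_num, $offset, $checksum, @plain) = @_;
    my @cipher;
    for my $p (@plain) {
        $offset   ^= $L->[ocb_ref_ntz($block_num++)];  # offset_i = offset_{i-1} ^ L_[ntz(i)]
        $checksum ^= $p;                               # checksum over the plaintext
        push @cipher, $encrypt->($p ^ $offset) ^ $offset;
    }
    return ($offset, $checksum, @cipher);              # caller stores offset_i and checksum back
}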
2781 {
2782 my @offset=map("%xmm$_",(10..15));
2783 my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2784 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2785 my ($L_p,$checksum_p) = ("%rbx","%rbp");
2786 my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2787 my $seventh_arg = $win64 ? 56 : 8;
2788 my $blocks = $len;
2789
2790 $code.=<<___;
2791 .globl aesni_ocb_encrypt
2792 .type aesni_ocb_encrypt,\@function,6
2793 .align 32
2794 aesni_ocb_encrypt:
2795 .cfi_startproc
2796 endbranch
2797 lea (%rsp),%rax
2798 push %rbx
2799 .cfi_push %rbx
2800 push %rbp
2801 .cfi_push %rbp
2802 push %r12
2803 .cfi_push %r12
2804 push %r13
2805 .cfi_push %r13
2806 push %r14
2807 .cfi_push %r14
2808 ___
2809 $code.=<<___ if ($win64);
2810 lea -0xa0(%rsp),%rsp
2811 movaps %xmm6,0x00(%rsp) # offload everything
2812 movaps %xmm7,0x10(%rsp)
2813 movaps %xmm8,0x20(%rsp)
2814 movaps %xmm9,0x30(%rsp)
2815 movaps %xmm10,0x40(%rsp)
2816 movaps %xmm11,0x50(%rsp)
2817 movaps %xmm12,0x60(%rsp)
2818 movaps %xmm13,0x70(%rsp)
2819 movaps %xmm14,0x80(%rsp)
2820 movaps %xmm15,0x90(%rsp)
2821 .Locb_enc_body:
2822 ___
2823 $code.=<<___;
2824 mov $seventh_arg(%rax),$L_p # 7th argument
2825 mov $seventh_arg+8(%rax),$checksum_p # 8th argument
2826
2827 mov 240($key),$rnds_
2828 mov $key,$key_
2829 shl \$4,$rnds_
2830 $movkey ($key),$rndkey0l # round[0]
2831 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2832
2833 movdqu ($offset_p),@offset[5] # load last offset_i
2834 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2835 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2836
2837 mov \$16+32,$rounds
2838 lea 32($key_,$rnds_),$key
2839 $movkey 16($key_),$rndkey1 # round[1]
2840 sub %r10,%rax # twisted $rounds
2841 mov %rax,%r10 # backup twisted $rounds
2842
2843 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2844 movdqu ($checksum_p),$checksum # load checksum
2845
2846 test \$1,$block_num # is first block number odd?
2847 jnz .Locb_enc_odd
2848
2849 bsf $block_num,$i1
2850 add \$1,$block_num
2851 shl \$4,$i1
2852 movdqu ($L_p,$i1),$inout5 # borrow
2853 movdqu ($inp),$inout0
2854 lea 16($inp),$inp
2855
2856 call __ocb_encrypt1
2857
2858 movdqa $inout5,@offset[5]
2859 movups $inout0,($out)
2860 lea 16($out),$out
2861 sub \$1,$blocks
2862 jz .Locb_enc_done
2863
2864 .Locb_enc_odd:
2865 lea 1($block_num),$i1 # even-numbered blocks
2866 lea 3($block_num),$i3
2867 lea 5($block_num),$i5
2868 lea 6($block_num),$block_num
2869 bsf $i1,$i1 # ntz(block)
2870 bsf $i3,$i3
2871 bsf $i5,$i5
2872 shl \$4,$i1 # ntz(block) -> table offset
2873 shl \$4,$i3
2874 shl \$4,$i5
2875
2876 sub \$6,$blocks
2877 jc .Locb_enc_short
2878 jmp .Locb_enc_grandloop
2879
2880 .align 32
2881 .Locb_enc_grandloop:
2882 movdqu `16*0`($inp),$inout0 # load input
2883 movdqu `16*1`($inp),$inout1
2884 movdqu `16*2`($inp),$inout2
2885 movdqu `16*3`($inp),$inout3
2886 movdqu `16*4`($inp),$inout4
2887 movdqu `16*5`($inp),$inout5
2888 lea `16*6`($inp),$inp
2889
2890 call __ocb_encrypt6
2891
2892 movups $inout0,`16*0`($out) # store output
2893 movups $inout1,`16*1`($out)
2894 movups $inout2,`16*2`($out)
2895 movups $inout3,`16*3`($out)
2896 movups $inout4,`16*4`($out)
2897 movups $inout5,`16*5`($out)
2898 lea `16*6`($out),$out
2899 sub \$6,$blocks
2900 jnc .Locb_enc_grandloop
2901
2902 .Locb_enc_short:
2903 add \$6,$blocks
2904 jz .Locb_enc_done
2905
2906 movdqu `16*0`($inp),$inout0
2907 cmp \$2,$blocks
2908 jb .Locb_enc_one
2909 movdqu `16*1`($inp),$inout1
2910 je .Locb_enc_two
2911
2912 movdqu `16*2`($inp),$inout2
2913 cmp \$4,$blocks
2914 jb .Locb_enc_three
2915 movdqu `16*3`($inp),$inout3
2916 je .Locb_enc_four
2917
2918 movdqu `16*4`($inp),$inout4
2919 pxor $inout5,$inout5
2920
2921 call __ocb_encrypt6
2922
2923 movdqa @offset[4],@offset[5]
2924 movups $inout0,`16*0`($out)
2925 movups $inout1,`16*1`($out)
2926 movups $inout2,`16*2`($out)
2927 movups $inout3,`16*3`($out)
2928 movups $inout4,`16*4`($out)
2929
2930 jmp .Locb_enc_done
2931
2932 .align 16
2933 .Locb_enc_one:
2934 movdqa @offset[0],$inout5 # borrow
2935
2936 call __ocb_encrypt1
2937
2938 movdqa $inout5,@offset[5]
2939 movups $inout0,`16*0`($out)
2940 jmp .Locb_enc_done
2941
2942 .align 16
2943 .Locb_enc_two:
2944 pxor $inout2,$inout2
2945 pxor $inout3,$inout3
2946
2947 call __ocb_encrypt4
2948
2949 movdqa @offset[1],@offset[5]
2950 movups $inout0,`16*0`($out)
2951 movups $inout1,`16*1`($out)
2952
2953 jmp .Locb_enc_done
2954
2955 .align 16
2956 .Locb_enc_three:
2957 pxor $inout3,$inout3
2958
2959 call __ocb_encrypt4
2960
2961 movdqa @offset[2],@offset[5]
2962 movups $inout0,`16*0`($out)
2963 movups $inout1,`16*1`($out)
2964 movups $inout2,`16*2`($out)
2965
2966 jmp .Locb_enc_done
2967
2968 .align 16
2969 .Locb_enc_four:
2970 call __ocb_encrypt4
2971
2972 movdqa @offset[3],@offset[5]
2973 movups $inout0,`16*0`($out)
2974 movups $inout1,`16*1`($out)
2975 movups $inout2,`16*2`($out)
2976 movups $inout3,`16*3`($out)
2977
2978 .Locb_enc_done:
2979 pxor $rndkey0,@offset[5] # "remove" round[last]
2980 movdqu $checksum,($checksum_p) # store checksum
2981 movdqu @offset[5],($offset_p) # store last offset_i
2982
2983 xorps %xmm0,%xmm0 # clear register bank
2984 pxor %xmm1,%xmm1
2985 pxor %xmm2,%xmm2
2986 pxor %xmm3,%xmm3
2987 pxor %xmm4,%xmm4
2988 pxor %xmm5,%xmm5
2989 ___
2990 $code.=<<___ if (!$win64);
2991 pxor %xmm6,%xmm6
2992 pxor %xmm7,%xmm7
2993 pxor %xmm8,%xmm8
2994 pxor %xmm9,%xmm9
2995 pxor %xmm10,%xmm10
2996 pxor %xmm11,%xmm11
2997 pxor %xmm12,%xmm12
2998 pxor %xmm13,%xmm13
2999 pxor %xmm14,%xmm14
3000 pxor %xmm15,%xmm15
3001 lea 0x28(%rsp),%rax
3002 .cfi_def_cfa %rax,8
3003 ___
3004 $code.=<<___ if ($win64);
3005 movaps 0x00(%rsp),%xmm6
3006 movaps %xmm0,0x00(%rsp) # clear stack
3007 movaps 0x10(%rsp),%xmm7
3008 movaps %xmm0,0x10(%rsp)
3009 movaps 0x20(%rsp),%xmm8
3010 movaps %xmm0,0x20(%rsp)
3011 movaps 0x30(%rsp),%xmm9
3012 movaps %xmm0,0x30(%rsp)
3013 movaps 0x40(%rsp),%xmm10
3014 movaps %xmm0,0x40(%rsp)
3015 movaps 0x50(%rsp),%xmm11
3016 movaps %xmm0,0x50(%rsp)
3017 movaps 0x60(%rsp),%xmm12
3018 movaps %xmm0,0x60(%rsp)
3019 movaps 0x70(%rsp),%xmm13
3020 movaps %xmm0,0x70(%rsp)
3021 movaps 0x80(%rsp),%xmm14
3022 movaps %xmm0,0x80(%rsp)
3023 movaps 0x90(%rsp),%xmm15
3024 movaps %xmm0,0x90(%rsp)
3025 lea 0xa0+0x28(%rsp),%rax
3026 .Locb_enc_pop:
3027 ___
3028 $code.=<<___;
3029 mov -40(%rax),%r14
3030 .cfi_restore %r14
3031 mov -32(%rax),%r13
3032 .cfi_restore %r13
3033 mov -24(%rax),%r12
3034 .cfi_restore %r12
3035 mov -16(%rax),%rbp
3036 .cfi_restore %rbp
3037 mov -8(%rax),%rbx
3038 .cfi_restore %rbx
3039 lea (%rax),%rsp
3040 .cfi_def_cfa_register %rsp
3041 .Locb_enc_epilogue:
3042 ret
3043 .cfi_endproc
3044 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
3045
3046 .type __ocb_encrypt6,\@abi-omnipotent
3047 .align 32
3048 __ocb_encrypt6:
3049 .cfi_startproc
3050 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3051 movdqu ($L_p,$i1),@offset[1]
3052 movdqa @offset[0],@offset[2]
3053 movdqu ($L_p,$i3),@offset[3]
3054 movdqa @offset[0],@offset[4]
3055 pxor @offset[5],@offset[0]
3056 movdqu ($L_p,$i5),@offset[5]
3057 pxor @offset[0],@offset[1]
3058 pxor $inout0,$checksum # accumulate checksum
3059 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3060 pxor @offset[1],@offset[2]
3061 pxor $inout1,$checksum
3062 pxor @offset[1],$inout1
3063 pxor @offset[2],@offset[3]
3064 pxor $inout2,$checksum
3065 pxor @offset[2],$inout2
3066 pxor @offset[3],@offset[4]
3067 pxor $inout3,$checksum
3068 pxor @offset[3],$inout3
3069 pxor @offset[4],@offset[5]
3070 pxor $inout4,$checksum
3071 pxor @offset[4],$inout4
3072 pxor $inout5,$checksum
3073 pxor @offset[5],$inout5
3074 $movkey 32($key_),$rndkey0
3075
3076 lea 1($block_num),$i1 # even-numbered blocks
3077 lea 3($block_num),$i3
3078 lea 5($block_num),$i5
3079 add \$6,$block_num
3080 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3081 bsf $i1,$i1 # ntz(block)
3082 bsf $i3,$i3
3083 bsf $i5,$i5
3084
3085 aesenc $rndkey1,$inout0
3086 aesenc $rndkey1,$inout1
3087 aesenc $rndkey1,$inout2
3088 aesenc $rndkey1,$inout3
3089 pxor $rndkey0l,@offset[1]
3090 pxor $rndkey0l,@offset[2]
3091 aesenc $rndkey1,$inout4
3092 pxor $rndkey0l,@offset[3]
3093 pxor $rndkey0l,@offset[4]
3094 aesenc $rndkey1,$inout5
3095 $movkey 48($key_),$rndkey1
3096 pxor $rndkey0l,@offset[5]
3097
3098 aesenc $rndkey0,$inout0
3099 aesenc $rndkey0,$inout1
3100 aesenc $rndkey0,$inout2
3101 aesenc $rndkey0,$inout3
3102 aesenc $rndkey0,$inout4
3103 aesenc $rndkey0,$inout5
3104 $movkey 64($key_),$rndkey0
3105 shl \$4,$i1 # ntz(block) -> table offset
3106 shl \$4,$i3
3107 jmp .Locb_enc_loop6
3108
3109 .align 32
3110 .Locb_enc_loop6:
3111 aesenc $rndkey1,$inout0
3112 aesenc $rndkey1,$inout1
3113 aesenc $rndkey1,$inout2
3114 aesenc $rndkey1,$inout3
3115 aesenc $rndkey1,$inout4
3116 aesenc $rndkey1,$inout5
3117 $movkey ($key,%rax),$rndkey1
3118 add \$32,%rax
3119
3120 aesenc $rndkey0,$inout0
3121 aesenc $rndkey0,$inout1
3122 aesenc $rndkey0,$inout2
3123 aesenc $rndkey0,$inout3
3124 aesenc $rndkey0,$inout4
3125 aesenc $rndkey0,$inout5
3126 $movkey -16($key,%rax),$rndkey0
3127 jnz .Locb_enc_loop6
3128
3129 aesenc $rndkey1,$inout0
3130 aesenc $rndkey1,$inout1
3131 aesenc $rndkey1,$inout2
3132 aesenc $rndkey1,$inout3
3133 aesenc $rndkey1,$inout4
3134 aesenc $rndkey1,$inout5
3135 $movkey 16($key_),$rndkey1
3136 shl \$4,$i5
3137
3138 aesenclast @offset[0],$inout0
3139 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3140 mov %r10,%rax # restore twisted rounds
3141 aesenclast @offset[1],$inout1
3142 aesenclast @offset[2],$inout2
3143 aesenclast @offset[3],$inout3
3144 aesenclast @offset[4],$inout4
3145 aesenclast @offset[5],$inout5
3146 ret
3147 .cfi_endproc
3148 .size __ocb_encrypt6,.-__ocb_encrypt6
3149
3150 .type __ocb_encrypt4,\@abi-omnipotent
3151 .align 32
3152 __ocb_encrypt4:
3153 .cfi_startproc
3154 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3155 movdqu ($L_p,$i1),@offset[1]
3156 movdqa @offset[0],@offset[2]
3157 movdqu ($L_p,$i3),@offset[3]
3158 pxor @offset[5],@offset[0]
3159 pxor @offset[0],@offset[1]
3160 pxor $inout0,$checksum # accumulate checksum
3161 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3162 pxor @offset[1],@offset[2]
3163 pxor $inout1,$checksum
3164 pxor @offset[1],$inout1
3165 pxor @offset[2],@offset[3]
3166 pxor $inout2,$checksum
3167 pxor @offset[2],$inout2
3168 pxor $inout3,$checksum
3169 pxor @offset[3],$inout3
3170 $movkey 32($key_),$rndkey0
3171
3172 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3173 pxor $rndkey0l,@offset[1]
3174 pxor $rndkey0l,@offset[2]
3175 pxor $rndkey0l,@offset[3]
3176
3177 aesenc $rndkey1,$inout0
3178 aesenc $rndkey1,$inout1
3179 aesenc $rndkey1,$inout2
3180 aesenc $rndkey1,$inout3
3181 $movkey 48($key_),$rndkey1
3182
3183 aesenc $rndkey0,$inout0
3184 aesenc $rndkey0,$inout1
3185 aesenc $rndkey0,$inout2
3186 aesenc $rndkey0,$inout3
3187 $movkey 64($key_),$rndkey0
3188 jmp .Locb_enc_loop4
3189
3190 .align 32
3191 .Locb_enc_loop4:
3192 aesenc $rndkey1,$inout0
3193 aesenc $rndkey1,$inout1
3194 aesenc $rndkey1,$inout2
3195 aesenc $rndkey1,$inout3
3196 $movkey ($key,%rax),$rndkey1
3197 add \$32,%rax
3198
3199 aesenc $rndkey0,$inout0
3200 aesenc $rndkey0,$inout1
3201 aesenc $rndkey0,$inout2
3202 aesenc $rndkey0,$inout3
3203 $movkey -16($key,%rax),$rndkey0
3204 jnz .Locb_enc_loop4
3205
3206 aesenc $rndkey1,$inout0
3207 aesenc $rndkey1,$inout1
3208 aesenc $rndkey1,$inout2
3209 aesenc $rndkey1,$inout3
3210 $movkey 16($key_),$rndkey1
3211 mov %r10,%rax # restore twisted rounds
3212
3213 aesenclast @offset[0],$inout0
3214 aesenclast @offset[1],$inout1
3215 aesenclast @offset[2],$inout2
3216 aesenclast @offset[3],$inout3
3217 ret
3218 .cfi_endproc
3219 .size __ocb_encrypt4,.-__ocb_encrypt4
3220
3221 .type __ocb_encrypt1,\@abi-omnipotent
3222 .align 32
3223 __ocb_encrypt1:
3224 .cfi_startproc
3225 pxor @offset[5],$inout5 # offset_i
3226 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3227 pxor $inout0,$checksum # accumulate checksum
3228 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3229 $movkey 32($key_),$rndkey0
3230
3231 aesenc $rndkey1,$inout0
3232 $movkey 48($key_),$rndkey1
3233 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3234
3235 aesenc $rndkey0,$inout0
3236 $movkey 64($key_),$rndkey0
3237 jmp .Locb_enc_loop1
3238
3239 .align 32
3240 .Locb_enc_loop1:
3241 aesenc $rndkey1,$inout0
3242 $movkey ($key,%rax),$rndkey1
3243 add \$32,%rax
3244
3245 aesenc $rndkey0,$inout0
3246 $movkey -16($key,%rax),$rndkey0
3247 jnz .Locb_enc_loop1
3248
3249 aesenc $rndkey1,$inout0
3250 $movkey 16($key_),$rndkey1 # redundant in tail
3251 mov %r10,%rax # restore twisted rounds
3252
3253 aesenclast $inout5,$inout0
3254 ret
3255 .cfi_endproc
3256 .size __ocb_encrypt1,.-__ocb_encrypt1
3257
3258 .globl aesni_ocb_decrypt
3259 .type aesni_ocb_decrypt,\@function,6
3260 .align 32
3261 aesni_ocb_decrypt:
3262 .cfi_startproc
3263 endbranch
3264 lea (%rsp),%rax
3265 push %rbx
3266 .cfi_push %rbx
3267 push %rbp
3268 .cfi_push %rbp
3269 push %r12
3270 .cfi_push %r12
3271 push %r13
3272 .cfi_push %r13
3273 push %r14
3274 .cfi_push %r14
3275 ___
3276 $code.=<<___ if ($win64);
3277 lea -0xa0(%rsp),%rsp
3278 movaps %xmm6,0x00(%rsp) # offload everything
3279 movaps %xmm7,0x10(%rsp)
3280 movaps %xmm8,0x20(%rsp)
3281 movaps %xmm9,0x30(%rsp)
3282 movaps %xmm10,0x40(%rsp)
3283 movaps %xmm11,0x50(%rsp)
3284 movaps %xmm12,0x60(%rsp)
3285 movaps %xmm13,0x70(%rsp)
3286 movaps %xmm14,0x80(%rsp)
3287 movaps %xmm15,0x90(%rsp)
3288 .Locb_dec_body:
3289 ___
3290 $code.=<<___;
3291 mov $seventh_arg(%rax),$L_p # 7th argument
3292 mov $seventh_arg+8(%rax),$checksum_p # 8th argument
3293
3294 mov 240($key),$rnds_
3295 mov $key,$key_
3296 shl \$4,$rnds_
3297 $movkey ($key),$rndkey0l # round[0]
3298 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3299
3300 movdqu ($offset_p),@offset[5] # load last offset_i
3301 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3302 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3303
3304 mov \$16+32,$rounds
3305 lea 32($key_,$rnds_),$key
3306 $movkey 16($key_),$rndkey1 # round[1]
3307 sub %r10,%rax # twisted $rounds
3308 mov %rax,%r10 # backup twisted $rounds
3309
3310 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3311 movdqu ($checksum_p),$checksum # load checksum
3312
3313 test \$1,$block_num # is first block number odd?
3314 jnz .Locb_dec_odd
3315
3316 bsf $block_num,$i1
3317 add \$1,$block_num
3318 shl \$4,$i1
3319 movdqu ($L_p,$i1),$inout5 # borrow
3320 movdqu ($inp),$inout0
3321 lea 16($inp),$inp
3322
3323 call __ocb_decrypt1
3324
3325 movdqa $inout5,@offset[5]
3326 movups $inout0,($out)
3327 xorps $inout0,$checksum # accumulate checksum
3328 lea 16($out),$out
3329 sub \$1,$blocks
3330 jz .Locb_dec_done
3331
3332 .Locb_dec_odd:
3333 lea 1($block_num),$i1 # even-numbered blocks
3334 lea 3($block_num),$i3
3335 lea 5($block_num),$i5
3336 lea 6($block_num),$block_num
3337 bsf $i1,$i1 # ntz(block)
3338 bsf $i3,$i3
3339 bsf $i5,$i5
3340 shl \$4,$i1 # ntz(block) -> table offset
3341 shl \$4,$i3
3342 shl \$4,$i5
3343
3344 sub \$6,$blocks
3345 jc .Locb_dec_short
3346 jmp .Locb_dec_grandloop
3347
3348 .align 32
3349 .Locb_dec_grandloop:
3350 movdqu `16*0`($inp),$inout0 # load input
3351 movdqu `16*1`($inp),$inout1
3352 movdqu `16*2`($inp),$inout2
3353 movdqu `16*3`($inp),$inout3
3354 movdqu `16*4`($inp),$inout4
3355 movdqu `16*5`($inp),$inout5
3356 lea `16*6`($inp),$inp
3357
3358 call __ocb_decrypt6
3359
3360 movups $inout0,`16*0`($out) # store output
3361 pxor $inout0,$checksum # accumulate checksum
3362 movups $inout1,`16*1`($out)
3363 pxor $inout1,$checksum
3364 movups $inout2,`16*2`($out)
3365 pxor $inout2,$checksum
3366 movups $inout3,`16*3`($out)
3367 pxor $inout3,$checksum
3368 movups $inout4,`16*4`($out)
3369 pxor $inout4,$checksum
3370 movups $inout5,`16*5`($out)
3371 pxor $inout5,$checksum
3372 lea `16*6`($out),$out
3373 sub \$6,$blocks
3374 jnc .Locb_dec_grandloop
3375
3376 .Locb_dec_short:
3377 add \$6,$blocks
3378 jz .Locb_dec_done
3379
3380 movdqu `16*0`($inp),$inout0
3381 cmp \$2,$blocks
3382 jb .Locb_dec_one
3383 movdqu `16*1`($inp),$inout1
3384 je .Locb_dec_two
3385
3386 movdqu `16*2`($inp),$inout2
3387 cmp \$4,$blocks
3388 jb .Locb_dec_three
3389 movdqu `16*3`($inp),$inout3
3390 je .Locb_dec_four
3391
3392 movdqu `16*4`($inp),$inout4
3393 pxor $inout5,$inout5
3394
3395 call __ocb_decrypt6
3396
3397 movdqa @offset[4],@offset[5]
3398 movups $inout0,`16*0`($out) # store output
3399 pxor $inout0,$checksum # accumulate checksum
3400 movups $inout1,`16*1`($out)
3401 pxor $inout1,$checksum
3402 movups $inout2,`16*2`($out)
3403 pxor $inout2,$checksum
3404 movups $inout3,`16*3`($out)
3405 pxor $inout3,$checksum
3406 movups $inout4,`16*4`($out)
3407 pxor $inout4,$checksum
3408
3409 jmp .Locb_dec_done
3410
3411 .align 16
3412 .Locb_dec_one:
3413 movdqa @offset[0],$inout5 # borrow
3414
3415 call __ocb_decrypt1
3416
3417 movdqa $inout5,@offset[5]
3418 movups $inout0,`16*0`($out) # store output
3419 xorps $inout0,$checksum # accumulate checksum
3420 jmp .Locb_dec_done
3421
3422 .align 16
3423 .Locb_dec_two:
3424 pxor $inout2,$inout2
3425 pxor $inout3,$inout3
3426
3427 call __ocb_decrypt4
3428
3429 movdqa @offset[1],@offset[5]
3430 movups $inout0,`16*0`($out) # store output
3431 xorps $inout0,$checksum # accumulate checksum
3432 movups $inout1,`16*1`($out)
3433 xorps $inout1,$checksum
3434
3435 jmp .Locb_dec_done
3436
3437 .align 16
3438 .Locb_dec_three:
3439 pxor $inout3,$inout3
3440
3441 call __ocb_decrypt4
3442
3443 movdqa @offset[2],@offset[5]
3444 movups $inout0,`16*0`($out) # store output
3445 xorps $inout0,$checksum # accumulate checksum
3446 movups $inout1,`16*1`($out)
3447 xorps $inout1,$checksum
3448 movups $inout2,`16*2`($out)
3449 xorps $inout2,$checksum
3450
3451 jmp .Locb_dec_done
3452
3453 .align 16
3454 .Locb_dec_four:
3455 call __ocb_decrypt4
3456
3457 movdqa @offset[3],@offset[5]
3458 movups $inout0,`16*0`($out) # store output
3459 pxor $inout0,$checksum # accumulate checksum
3460 movups $inout1,`16*1`($out)
3461 pxor $inout1,$checksum
3462 movups $inout2,`16*2`($out)
3463 pxor $inout2,$checksum
3464 movups $inout3,`16*3`($out)
3465 pxor $inout3,$checksum
3466
3467 .Locb_dec_done:
3468 pxor $rndkey0,@offset[5] # "remove" round[last]
3469 movdqu $checksum,($checksum_p) # store checksum
3470 movdqu @offset[5],($offset_p) # store last offset_i
3471
3472 xorps %xmm0,%xmm0 # clear register bank
3473 pxor %xmm1,%xmm1
3474 pxor %xmm2,%xmm2
3475 pxor %xmm3,%xmm3
3476 pxor %xmm4,%xmm4
3477 pxor %xmm5,%xmm5
3478 ___
3479 $code.=<<___ if (!$win64);
3480 pxor %xmm6,%xmm6
3481 pxor %xmm7,%xmm7
3482 pxor %xmm8,%xmm8
3483 pxor %xmm9,%xmm9
3484 pxor %xmm10,%xmm10
3485 pxor %xmm11,%xmm11
3486 pxor %xmm12,%xmm12
3487 pxor %xmm13,%xmm13
3488 pxor %xmm14,%xmm14
3489 pxor %xmm15,%xmm15
3490 lea 0x28(%rsp),%rax
3491 .cfi_def_cfa %rax,8
3492 ___
3493 $code.=<<___ if ($win64);
3494 movaps 0x00(%rsp),%xmm6
3495 movaps %xmm0,0x00(%rsp) # clear stack
3496 movaps 0x10(%rsp),%xmm7
3497 movaps %xmm0,0x10(%rsp)
3498 movaps 0x20(%rsp),%xmm8
3499 movaps %xmm0,0x20(%rsp)
3500 movaps 0x30(%rsp),%xmm9
3501 movaps %xmm0,0x30(%rsp)
3502 movaps 0x40(%rsp),%xmm10
3503 movaps %xmm0,0x40(%rsp)
3504 movaps 0x50(%rsp),%xmm11
3505 movaps %xmm0,0x50(%rsp)
3506 movaps 0x60(%rsp),%xmm12
3507 movaps %xmm0,0x60(%rsp)
3508 movaps 0x70(%rsp),%xmm13
3509 movaps %xmm0,0x70(%rsp)
3510 movaps 0x80(%rsp),%xmm14
3511 movaps %xmm0,0x80(%rsp)
3512 movaps 0x90(%rsp),%xmm15
3513 movaps %xmm0,0x90(%rsp)
3514 lea 0xa0+0x28(%rsp),%rax
3515 .Locb_dec_pop:
3516 ___
3517 $code.=<<___;
3518 mov -40(%rax),%r14
3519 .cfi_restore %r14
3520 mov -32(%rax),%r13
3521 .cfi_restore %r13
3522 mov -24(%rax),%r12
3523 .cfi_restore %r12
3524 mov -16(%rax),%rbp
3525 .cfi_restore %rbp
3526 mov -8(%rax),%rbx
3527 .cfi_restore %rbx
3528 lea (%rax),%rsp
3529 .cfi_def_cfa_register %rsp
3530 .Locb_dec_epilogue:
3531 ret
3532 .cfi_endproc
3533 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3534
3535 .type __ocb_decrypt6,\@abi-omnipotent
3536 .align 32
3537 __ocb_decrypt6:
3538 .cfi_startproc
3539 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3540 movdqu ($L_p,$i1),@offset[1]
3541 movdqa @offset[0],@offset[2]
3542 movdqu ($L_p,$i3),@offset[3]
3543 movdqa @offset[0],@offset[4]
3544 pxor @offset[5],@offset[0]
3545 movdqu ($L_p,$i5),@offset[5]
3546 pxor @offset[0],@offset[1]
3547 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3548 pxor @offset[1],@offset[2]
3549 pxor @offset[1],$inout1
3550 pxor @offset[2],@offset[3]
3551 pxor @offset[2],$inout2
3552 pxor @offset[3],@offset[4]
3553 pxor @offset[3],$inout3
3554 pxor @offset[4],@offset[5]
3555 pxor @offset[4],$inout4
3556 pxor @offset[5],$inout5
3557 $movkey 32($key_),$rndkey0
3558
3559 lea 1($block_num),$i1 # even-numbered blocks
3560 lea 3($block_num),$i3
3561 lea 5($block_num),$i5
3562 add \$6,$block_num
3563 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3564 bsf $i1,$i1 # ntz(block)
3565 bsf $i3,$i3
3566 bsf $i5,$i5
3567
3568 aesdec $rndkey1,$inout0
3569 aesdec $rndkey1,$inout1
3570 aesdec $rndkey1,$inout2
3571 aesdec $rndkey1,$inout3
3572 pxor $rndkey0l,@offset[1]
3573 pxor $rndkey0l,@offset[2]
3574 aesdec $rndkey1,$inout4
3575 pxor $rndkey0l,@offset[3]
3576 pxor $rndkey0l,@offset[4]
3577 aesdec $rndkey1,$inout5
3578 $movkey 48($key_),$rndkey1
3579 pxor $rndkey0l,@offset[5]
3580
3581 aesdec $rndkey0,$inout0
3582 aesdec $rndkey0,$inout1
3583 aesdec $rndkey0,$inout2
3584 aesdec $rndkey0,$inout3
3585 aesdec $rndkey0,$inout4
3586 aesdec $rndkey0,$inout5
3587 $movkey 64($key_),$rndkey0
3588 shl \$4,$i1 # ntz(block) -> table offset
3589 shl \$4,$i3
3590 jmp .Locb_dec_loop6
3591
3592 .align 32
3593 .Locb_dec_loop6:
3594 aesdec $rndkey1,$inout0
3595 aesdec $rndkey1,$inout1
3596 aesdec $rndkey1,$inout2
3597 aesdec $rndkey1,$inout3
3598 aesdec $rndkey1,$inout4
3599 aesdec $rndkey1,$inout5
3600 $movkey ($key,%rax),$rndkey1
3601 add \$32,%rax
3602
3603 aesdec $rndkey0,$inout0
3604 aesdec $rndkey0,$inout1
3605 aesdec $rndkey0,$inout2
3606 aesdec $rndkey0,$inout3
3607 aesdec $rndkey0,$inout4
3608 aesdec $rndkey0,$inout5
3609 $movkey -16($key,%rax),$rndkey0
3610 jnz .Locb_dec_loop6
3611
3612 aesdec $rndkey1,$inout0
3613 aesdec $rndkey1,$inout1
3614 aesdec $rndkey1,$inout2
3615 aesdec $rndkey1,$inout3
3616 aesdec $rndkey1,$inout4
3617 aesdec $rndkey1,$inout5
3618 $movkey 16($key_),$rndkey1
3619 shl \$4,$i5
3620
3621 aesdeclast @offset[0],$inout0
3622 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3623 mov %r10,%rax # restore twisted rounds
3624 aesdeclast @offset[1],$inout1
3625 aesdeclast @offset[2],$inout2
3626 aesdeclast @offset[3],$inout3
3627 aesdeclast @offset[4],$inout4
3628 aesdeclast @offset[5],$inout5
3629 ret
3630 .cfi_endproc
3631 .size __ocb_decrypt6,.-__ocb_decrypt6
3632
3633 .type __ocb_decrypt4,\@abi-omnipotent
3634 .align 32
3635 __ocb_decrypt4:
3636 .cfi_startproc
3637 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3638 movdqu ($L_p,$i1),@offset[1]
3639 movdqa @offset[0],@offset[2]
3640 movdqu ($L_p,$i3),@offset[3]
3641 pxor @offset[5],@offset[0]
3642 pxor @offset[0],@offset[1]
3643 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3644 pxor @offset[1],@offset[2]
3645 pxor @offset[1],$inout1
3646 pxor @offset[2],@offset[3]
3647 pxor @offset[2],$inout2
3648 pxor @offset[3],$inout3
3649 $movkey 32($key_),$rndkey0
3650
3651 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3652 pxor $rndkey0l,@offset[1]
3653 pxor $rndkey0l,@offset[2]
3654 pxor $rndkey0l,@offset[3]
3655
3656 aesdec $rndkey1,$inout0
3657 aesdec $rndkey1,$inout1
3658 aesdec $rndkey1,$inout2
3659 aesdec $rndkey1,$inout3
3660 $movkey 48($key_),$rndkey1
3661
3662 aesdec $rndkey0,$inout0
3663 aesdec $rndkey0,$inout1
3664 aesdec $rndkey0,$inout2
3665 aesdec $rndkey0,$inout3
3666 $movkey 64($key_),$rndkey0
3667 jmp .Locb_dec_loop4
3668
3669 .align 32
3670 .Locb_dec_loop4:
3671 aesdec $rndkey1,$inout0
3672 aesdec $rndkey1,$inout1
3673 aesdec $rndkey1,$inout2
3674 aesdec $rndkey1,$inout3
3675 $movkey ($key,%rax),$rndkey1
3676 add \$32,%rax
3677
3678 aesdec $rndkey0,$inout0
3679 aesdec $rndkey0,$inout1
3680 aesdec $rndkey0,$inout2
3681 aesdec $rndkey0,$inout3
3682 $movkey -16($key,%rax),$rndkey0
3683 jnz .Locb_dec_loop4
3684
3685 aesdec $rndkey1,$inout0
3686 aesdec $rndkey1,$inout1
3687 aesdec $rndkey1,$inout2
3688 aesdec $rndkey1,$inout3
3689 $movkey 16($key_),$rndkey1
3690 mov %r10,%rax # restore twisted rounds
3691
3692 aesdeclast @offset[0],$inout0
3693 aesdeclast @offset[1],$inout1
3694 aesdeclast @offset[2],$inout2
3695 aesdeclast @offset[3],$inout3
3696 ret
3697 .cfi_endproc
3698 .size __ocb_decrypt4,.-__ocb_decrypt4
3699
3700 .type __ocb_decrypt1,\@abi-omnipotent
3701 .align 32
3702 __ocb_decrypt1:
3703 .cfi_startproc
3704 pxor @offset[5],$inout5 # offset_i
3705 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3706 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3707 $movkey 32($key_),$rndkey0
3708
3709 aesdec $rndkey1,$inout0
3710 $movkey 48($key_),$rndkey1
3711 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3712
3713 aesdec $rndkey0,$inout0
3714 $movkey 64($key_),$rndkey0
3715 jmp .Locb_dec_loop1
3716
3717 .align 32
3718 .Locb_dec_loop1:
3719 aesdec $rndkey1,$inout0
3720 $movkey ($key,%rax),$rndkey1
3721 add \$32,%rax
3722
3723 aesdec $rndkey0,$inout0
3724 $movkey -16($key,%rax),$rndkey0
3725 jnz .Locb_dec_loop1
3726
3727 aesdec $rndkey1,$inout0
3728 $movkey 16($key_),$rndkey1 # redundant in tail
3729 mov %r10,%rax # restore twisted rounds
3730
3731 aesdeclast $inout5,$inout0
3732 ret
3733 .cfi_endproc
3734 .size __ocb_decrypt1,.-__ocb_decrypt1
3735 ___
3736 } }}
3737 \f
3738 ########################################################################
3739 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3740 # size_t length, const AES_KEY *key,
3741 # unsigned char *ivp,const int enc);
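# Reference sketch (illustrative only, never called by this generator) of the
# CBC chaining implemented below.  $enc and $dec are assumed to be
# caller-supplied coderefs for one-block AES; all blocks are 16-byte strings.
# The encrypt path below additionally zero-pads a trailing partial block in
# place (see .Lcbc_enc_tail), and the decrypt path hands the last ciphertext
# block back as the next IV, which is what makes the stack-frame-free
# single-block path useful to a caller doing ciphertext stealing.
sub cbc_ref_encrypt {
    my ($enc, $iv, @plain) = @_;
    my @cipher;
    for my $p (@plain) {
        $iv = $enc->($p ^ $iv);              # C_i = E(P_i ^ C_{i-1}), C_0 = IV
        push @cipher, $iv;
    }
    return ($iv, @cipher);                   # $iv is what gets written back to *ivp
}
sub cbc_ref_decrypt {
    my ($dec, $iv, @cipher) = @_;
    my @plain;
    for my $c (@cipher) {
        push @plain, $dec->($c) ^ $iv;       # P_i = D(C_i) ^ C_{i-1}
        $iv = $c;                            # ciphertext becomes the next IV
    }
    return ($iv, @plain);
}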
3742 {
3743 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3744 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3745
3746 $code.=<<___;
3747 .globl ${PREFIX}_cbc_encrypt
3748 .type ${PREFIX}_cbc_encrypt,\@function,6
3749 .align 16
3750 ${PREFIX}_cbc_encrypt:
3751 .cfi_startproc
3752 endbranch
3753 test $len,$len # check length
3754 jz .Lcbc_ret
3755
3756 mov 240($key),$rnds_ # key->rounds
3757 mov $key,$key_ # backup $key
3758 test %r9d,%r9d # 6th argument
3759 jz .Lcbc_decrypt
3760 #--------------------------- CBC ENCRYPT ------------------------------#
3761 movups ($ivp),$inout0 # load iv as initial state
3762 mov $rnds_,$rounds
3763 cmp \$16,$len
3764 jb .Lcbc_enc_tail
3765 sub \$16,$len
3766 jmp .Lcbc_enc_loop
3767 .align 16
3768 .Lcbc_enc_loop:
3769 movups ($inp),$inout1 # load input
3770 lea 16($inp),$inp
3771 #xorps $inout1,$inout0
3772 ___
3773 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3774 $code.=<<___;
3775 mov $rnds_,$rounds # restore $rounds
3776 mov $key_,$key # restore $key
3777 movups $inout0,0($out) # store output
3778 lea 16($out),$out
3779 sub \$16,$len
3780 jnc .Lcbc_enc_loop
3781 add \$16,$len
3782 jnz .Lcbc_enc_tail
3783 pxor $rndkey0,$rndkey0 # clear register bank
3784 pxor $rndkey1,$rndkey1
3785 movups $inout0,($ivp)
3786 pxor $inout0,$inout0
3787 pxor $inout1,$inout1
3788 jmp .Lcbc_ret
3789
3790 .Lcbc_enc_tail:
3791 mov $len,%rcx # zaps $key
3792 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3793 .long 0x9066A4F3 # rep movsb
3794 mov \$16,%ecx # zero tail
3795 sub $len,%rcx
3796 xor %eax,%eax
3797 .long 0x9066AAF3 # rep stosb
3798 lea -16(%rdi),%rdi # rewind $out by 1 block
3799 mov $rnds_,$rounds # restore $rounds
3800 mov %rdi,%rsi # $inp and $out are the same
3801 mov $key_,$key # restore $key
3802 xor $len,$len # $len=0, one padded block left: .Lcbc_enc_loop runs exactly once more
3803 jmp .Lcbc_enc_loop # one more spin
3804 \f#--------------------------- CBC DECRYPT ------------------------------#
3805 .align 16
3806 .Lcbc_decrypt:
3807 cmp \$16,$len
3808 jne .Lcbc_decrypt_bulk
3809
3810 # handle single block without allocating stack frame,
3811 # useful in ciphertext stealing mode
3812 movdqu ($inp),$inout0 # load input
3813 movdqu ($ivp),$inout1 # load iv
3814 movdqa $inout0,$inout2 # future iv
3815 ___
3816 &aesni_generate1("dec",$key,$rnds_);
3817 $code.=<<___;
3818 pxor $rndkey0,$rndkey0 # clear register bank
3819 pxor $rndkey1,$rndkey1
3820 movdqu $inout2,($ivp) # store iv
3821 xorps $inout1,$inout0 # ^=iv
3822 pxor $inout1,$inout1
3823 movups $inout0,($out) # store output
3824 pxor $inout0,$inout0
3825 jmp .Lcbc_ret
3826 .align 16
3827 .Lcbc_decrypt_bulk:
3828 lea (%rsp),%r11 # frame pointer
3829 .cfi_def_cfa_register %r11
3830 push %rbp
3831 .cfi_push %rbp
3832 sub \$$frame_size,%rsp
3833 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3834 ___
3835 $code.=<<___ if ($win64);
3836 movaps %xmm6,0x10(%rsp)
3837 movaps %xmm7,0x20(%rsp)
3838 movaps %xmm8,0x30(%rsp)
3839 movaps %xmm9,0x40(%rsp)
3840 movaps %xmm10,0x50(%rsp)
3841 movaps %xmm11,0x60(%rsp)
3842 movaps %xmm12,0x70(%rsp)
3843 movaps %xmm13,0x80(%rsp)
3844 movaps %xmm14,0x90(%rsp)
3845 movaps %xmm15,0xa0(%rsp)
3846 .Lcbc_decrypt_body:
3847 ___
3848
3849 my $inp_=$key_="%rbp"; # reassign $key_
3850
3851 $code.=<<___;
3852 mov $key,$key_ # [re-]backup $key [after reassignment]
3853 movups ($ivp),$iv
3854 mov $rnds_,$rounds
3855 cmp \$0x50,$len
3856 jbe .Lcbc_dec_tail
3857
3858 $movkey ($key),$rndkey0
3859 movdqu 0x00($inp),$inout0 # load input
3860 movdqu 0x10($inp),$inout1
3861 movdqa $inout0,$in0
3862 movdqu 0x20($inp),$inout2
3863 movdqa $inout1,$in1
3864 movdqu 0x30($inp),$inout3
3865 movdqa $inout2,$in2
3866 movdqu 0x40($inp),$inout4
3867 movdqa $inout3,$in3
3868 movdqu 0x50($inp),$inout5
3869 movdqa $inout4,$in4
3870 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3871 cmp \$0x70,$len
3872 jbe .Lcbc_dec_six_or_seven
3873
3874 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3875 sub \$0x50,$len # $len is biased by -5*16
3876 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3877 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
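# MOVBE without XSAVE identifies Atom Silvermont, which is served by the
# six-block loop; everything else falls through to the eight-block loop.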
3878 sub \$0x20,$len # $len is biased by -7*16
3879 lea 0x70($key),$key # size optimization
3880 jmp .Lcbc_dec_loop8_enter
3881 .align 16
3882 .Lcbc_dec_loop8:
3883 movups $inout7,($out)
3884 lea 0x10($out),$out
3885 .Lcbc_dec_loop8_enter:
3886 movdqu 0x60($inp),$inout6
3887 pxor $rndkey0,$inout0
3888 movdqu 0x70($inp),$inout7
3889 pxor $rndkey0,$inout1
3890 $movkey 0x10-0x70($key),$rndkey1
3891 pxor $rndkey0,$inout2
3892 mov \$-1,$inp_
3893 cmp \$0x70,$len # are there at least 0x60 bytes ahead?
3894 pxor $rndkey0,$inout3
3895 pxor $rndkey0,$inout4
3896 pxor $rndkey0,$inout5
3897 pxor $rndkey0,$inout6
3898
3899 aesdec $rndkey1,$inout0
3900 pxor $rndkey0,$inout7
3901 $movkey 0x20-0x70($key),$rndkey0
3902 aesdec $rndkey1,$inout1
3903 aesdec $rndkey1,$inout2
3904 aesdec $rndkey1,$inout3
3905 aesdec $rndkey1,$inout4
3906 aesdec $rndkey1,$inout5
3907 aesdec $rndkey1,$inout6
3908 adc \$0,$inp_
3909 and \$128,$inp_
3910 aesdec $rndkey1,$inout7
3911 add $inp,$inp_
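# branchless selection of the preload pointer: if at least another full batch
# follows (the cmp above cleared CF), $inp_ = $inp+0x80, i.e. the next eight
# blocks; otherwise $inp_ = $inp, so the final pass reloads data already in
# hand instead of reading past the end of the input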
3912 $movkey 0x30-0x70($key),$rndkey1
3913 ___
3914 for($i=1;$i<12;$i++) {
3915 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3916 $code.=<<___ if ($i==7);
3917 cmp \$11,$rounds
3918 ___
3919 $code.=<<___;
3920 aesdec $rndkeyx,$inout0
3921 aesdec $rndkeyx,$inout1
3922 aesdec $rndkeyx,$inout2
3923 aesdec $rndkeyx,$inout3
3924 aesdec $rndkeyx,$inout4
3925 aesdec $rndkeyx,$inout5
3926 aesdec $rndkeyx,$inout6
3927 aesdec $rndkeyx,$inout7
3928 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3929 ___
3930 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3931 nop
3932 ___
3933 $code.=<<___ if ($i==7);
3934 jb .Lcbc_dec_done
3935 ___
3936 $code.=<<___ if ($i==9);
3937 je .Lcbc_dec_done
3938 ___
3939 $code.=<<___ if ($i==11);
3940 jmp .Lcbc_dec_done
3941 ___
3942 }
3943 $code.=<<___;
3944 .align 16
3945 .Lcbc_dec_done:
3946 aesdec $rndkey1,$inout0
3947 aesdec $rndkey1,$inout1
3948 pxor $rndkey0,$iv
3949 pxor $rndkey0,$in0
3950 aesdec $rndkey1,$inout2
3951 aesdec $rndkey1,$inout3
3952 pxor $rndkey0,$in1
3953 pxor $rndkey0,$in2
3954 aesdec $rndkey1,$inout4
3955 aesdec $rndkey1,$inout5
3956 pxor $rndkey0,$in3
3957 pxor $rndkey0,$in4
3958 aesdec $rndkey1,$inout6
3959 aesdec $rndkey1,$inout7
3960 movdqu 0x50($inp),$rndkey1
3961
3962 aesdeclast $iv,$inout0
3963 movdqu 0x60($inp),$iv # borrow $iv
3964 pxor $rndkey0,$rndkey1
3965 aesdeclast $in0,$inout1
3966 pxor $rndkey0,$iv
3967 movdqu 0x70($inp),$rndkey0 # next IV
3968 aesdeclast $in1,$inout2
3969 lea 0x80($inp),$inp
3970 movdqu 0x00($inp_),$in0
3971 aesdeclast $in2,$inout3
3972 aesdeclast $in3,$inout4
3973 movdqu 0x10($inp_),$in1
3974 movdqu 0x20($inp_),$in2
3975 aesdeclast $in4,$inout5
3976 aesdeclast $rndkey1,$inout6
3977 movdqu 0x30($inp_),$in3
3978 movdqu 0x40($inp_),$in4
3979 aesdeclast $iv,$inout7
3980 movdqa $rndkey0,$iv # return $iv
3981 movdqu 0x50($inp_),$rndkey1
3982 $movkey -0x70($key),$rndkey0
3983
3984 movups $inout0,($out) # store output
3985 movdqa $in0,$inout0
3986 movups $inout1,0x10($out)
3987 movdqa $in1,$inout1
3988 movups $inout2,0x20($out)
3989 movdqa $in2,$inout2
3990 movups $inout3,0x30($out)
3991 movdqa $in3,$inout3
3992 movups $inout4,0x40($out)
3993 movdqa $in4,$inout4
3994 movups $inout5,0x50($out)
3995 movdqa $rndkey1,$inout5
3996 movups $inout6,0x60($out)
3997 lea 0x70($out),$out
3998
3999 sub \$0x80,$len
4000 ja .Lcbc_dec_loop8
4001
4002 movaps $inout7,$inout0
4003 lea -0x70($key),$key
4004 add \$0x70,$len
4005 jle .Lcbc_dec_clear_tail_collected
4006 movups $inout7,($out)
4007 lea 0x10($out),$out
4008 cmp \$0x50,$len
4009 jbe .Lcbc_dec_tail
4010
4011 movaps $in0,$inout0
4012 .Lcbc_dec_six_or_seven:
4013 cmp \$0x60,$len
4014 ja .Lcbc_dec_seven
4015
4016 movaps $inout5,$inout6
4017 call _aesni_decrypt6
4018 pxor $iv,$inout0 # ^= IV
4019 movaps $inout6,$iv
4020 pxor $in0,$inout1
4021 movdqu $inout0,($out)
4022 pxor $in1,$inout2
4023 movdqu $inout1,0x10($out)
4024 pxor $inout1,$inout1 # clear register bank
4025 pxor $in2,$inout3
4026 movdqu $inout2,0x20($out)
4027 pxor $inout2,$inout2
4028 pxor $in3,$inout4
4029 movdqu $inout3,0x30($out)
4030 pxor $inout3,$inout3
4031 pxor $in4,$inout5
4032 movdqu $inout4,0x40($out)
4033 pxor $inout4,$inout4
4034 lea 0x50($out),$out
4035 movdqa $inout5,$inout0
4036 pxor $inout5,$inout5
4037 jmp .Lcbc_dec_tail_collected
4038
4039 .align 16
4040 .Lcbc_dec_seven:
4041 movups 0x60($inp),$inout6
4042 xorps $inout7,$inout7
4043 call _aesni_decrypt8
4044 movups 0x50($inp),$inout7
4045 pxor $iv,$inout0 # ^= IV
4046 movups 0x60($inp),$iv
4047 pxor $in0,$inout1
4048 movdqu $inout0,($out)
4049 pxor $in1,$inout2
4050 movdqu $inout1,0x10($out)
4051 pxor $inout1,$inout1 # clear register bank
4052 pxor $in2,$inout3
4053 movdqu $inout2,0x20($out)
4054 pxor $inout2,$inout2
4055 pxor $in3,$inout4
4056 movdqu $inout3,0x30($out)
4057 pxor $inout3,$inout3
4058 pxor $in4,$inout5
4059 movdqu $inout4,0x40($out)
4060 pxor $inout4,$inout4
4061 pxor $inout7,$inout6
4062 movdqu $inout5,0x50($out)
4063 pxor $inout5,$inout5
4064 lea 0x60($out),$out
4065 movdqa $inout6,$inout0
4066 pxor $inout6,$inout6
4067 pxor $inout7,$inout7
4068 jmp .Lcbc_dec_tail_collected
4069
4070 .align 16
4071 .Lcbc_dec_loop6:
4072 movups $inout5,($out)
4073 lea 0x10($out),$out
4074 movdqu 0x00($inp),$inout0 # load input
4075 movdqu 0x10($inp),$inout1
4076 movdqa $inout0,$in0
4077 movdqu 0x20($inp),$inout2
4078 movdqa $inout1,$in1
4079 movdqu 0x30($inp),$inout3
4080 movdqa $inout2,$in2
4081 movdqu 0x40($inp),$inout4
4082 movdqa $inout3,$in3
4083 movdqu 0x50($inp),$inout5
4084 movdqa $inout4,$in4
4085 .Lcbc_dec_loop6_enter:
4086 lea 0x60($inp),$inp
4087 movdqa $inout5,$inout6
4088
4089 call _aesni_decrypt6
4090
4091 pxor $iv,$inout0 # ^= IV
4092 movdqa $inout6,$iv
4093 pxor $in0,$inout1
4094 movdqu $inout0,($out)
4095 pxor $in1,$inout2
4096 movdqu $inout1,0x10($out)
4097 pxor $in2,$inout3
4098 movdqu $inout2,0x20($out)
4099 pxor $in3,$inout4
4100 mov $key_,$key
4101 movdqu $inout3,0x30($out)
4102 pxor $in4,$inout5
4103 mov $rnds_,$rounds
4104 movdqu $inout4,0x40($out)
4105 lea 0x50($out),$out
4106 sub \$0x60,$len
4107 ja .Lcbc_dec_loop6
4108
4109 movdqa $inout5,$inout0
4110 add \$0x50,$len
4111 jle .Lcbc_dec_clear_tail_collected
4112 movups $inout5,($out)
4113 lea 0x10($out),$out
4114
4115 .Lcbc_dec_tail:
4116 movups ($inp),$inout0
4117 sub \$0x10,$len
4118 jbe .Lcbc_dec_one # $len is 1*16 or less
4119
4120 movups 0x10($inp),$inout1
4121 movaps $inout0,$in0
4122 sub \$0x10,$len
4123 jbe .Lcbc_dec_two # $len is 2*16 or less
4124
4125 movups 0x20($inp),$inout2
4126 movaps $inout1,$in1
4127 sub \$0x10,$len
4128 jbe .Lcbc_dec_three # $len is 3*16 or less
4129
4130 movups 0x30($inp),$inout3
4131 movaps $inout2,$in2
4132 sub \$0x10,$len
4133 jbe .Lcbc_dec_four # $len is 4*16 or less
4134
4135 movups 0x40($inp),$inout4 # $len is 5*16 or less
4136 movaps $inout3,$in3
4137 movaps $inout4,$in4
4138 xorps $inout5,$inout5
4139 call _aesni_decrypt6
4140 pxor $iv,$inout0
4141 movaps $in4,$iv
4142 pxor $in0,$inout1
4143 movdqu $inout0,($out)
4144 pxor $in1,$inout2
4145 movdqu $inout1,0x10($out)
4146 pxor $inout1,$inout1 # clear register bank
4147 pxor $in2,$inout3
4148 movdqu $inout2,0x20($out)
4149 pxor $inout2,$inout2
4150 pxor $in3,$inout4
4151 movdqu $inout3,0x30($out)
4152 pxor $inout3,$inout3
4153 lea 0x40($out),$out
4154 movdqa $inout4,$inout0
4155 pxor $inout4,$inout4
4156 pxor $inout5,$inout5
4157 sub \$0x10,$len
4158 jmp .Lcbc_dec_tail_collected
4159
4160 .align 16
4161 .Lcbc_dec_one:
4162 movaps $inout0,$in0
4163 ___
4164 &aesni_generate1("dec",$key,$rounds);
4165 $code.=<<___;
4166 xorps $iv,$inout0
4167 movaps $in0,$iv
4168 jmp .Lcbc_dec_tail_collected
4169 .align 16
4170 .Lcbc_dec_two:
4171 movaps $inout1,$in1
4172 call _aesni_decrypt2
4173 pxor $iv,$inout0
4174 movaps $in1,$iv
4175 pxor $in0,$inout1
4176 movdqu $inout0,($out)
4177 movdqa $inout1,$inout0
4178 pxor $inout1,$inout1 # clear register bank
4179 lea 0x10($out),$out
4180 jmp .Lcbc_dec_tail_collected
4181 .align 16
4182 .Lcbc_dec_three:
4183 movaps $inout2,$in2
4184 call _aesni_decrypt3
4185 pxor $iv,$inout0
4186 movaps $in2,$iv
4187 pxor $in0,$inout1
4188 movdqu $inout0,($out)
4189 pxor $in1,$inout2
4190 movdqu $inout1,0x10($out)
4191 pxor $inout1,$inout1 # clear register bank
4192 movdqa $inout2,$inout0
4193 pxor $inout2,$inout2
4194 lea 0x20($out),$out
4195 jmp .Lcbc_dec_tail_collected
4196 .align 16
4197 .Lcbc_dec_four:
4198 movaps $inout3,$in3
4199 call _aesni_decrypt4
4200 pxor $iv,$inout0
4201 movaps $in3,$iv
4202 pxor $in0,$inout1
4203 movdqu $inout0,($out)
4204 pxor $in1,$inout2
4205 movdqu $inout1,0x10($out)
4206 pxor $inout1,$inout1 # clear register bank
4207 pxor $in2,$inout3
4208 movdqu $inout2,0x20($out)
4209 pxor $inout2,$inout2
4210 movdqa $inout3,$inout0
4211 pxor $inout3,$inout3
4212 lea 0x30($out),$out
4213 jmp .Lcbc_dec_tail_collected
4214
4215 .align 16
4216 .Lcbc_dec_clear_tail_collected:
4217 pxor $inout1,$inout1 # clear register bank
4218 pxor $inout2,$inout2
4219 pxor $inout3,$inout3
4220 ___
4221 $code.=<<___ if (!$win64);
4222 pxor $inout4,$inout4 # %xmm6..9
4223 pxor $inout5,$inout5
4224 pxor $inout6,$inout6
4225 pxor $inout7,$inout7
4226 ___
4227 $code.=<<___;
4228 .Lcbc_dec_tail_collected:
4229 movups $iv,($ivp)
4230 and \$15,$len
4231 jnz .Lcbc_dec_tail_partial
4232 movups $inout0,($out)
4233 pxor $inout0,$inout0
4234 jmp .Lcbc_dec_ret
4235 .align 16
4236 .Lcbc_dec_tail_partial:
4237 movaps $inout0,(%rsp)
4238 pxor $inout0,$inout0
4239 mov \$16,%rcx
4240 mov $out,%rdi
4241 sub $len,%rcx
4242 lea (%rsp),%rsi
4243 .long 0x9066A4F3 # rep movsb
4244 movdqa $inout0,(%rsp)
4245
4246 .Lcbc_dec_ret:
4247 xorps $rndkey0,$rndkey0 # %xmm0
4248 pxor $rndkey1,$rndkey1
4249 ___
4250 $code.=<<___ if ($win64);
4251 movaps 0x10(%rsp),%xmm6
4252 movaps %xmm0,0x10(%rsp) # clear stack
4253 movaps 0x20(%rsp),%xmm7
4254 movaps %xmm0,0x20(%rsp)
4255 movaps 0x30(%rsp),%xmm8
4256 movaps %xmm0,0x30(%rsp)
4257 movaps 0x40(%rsp),%xmm9
4258 movaps %xmm0,0x40(%rsp)
4259 movaps 0x50(%rsp),%xmm10
4260 movaps %xmm0,0x50(%rsp)
4261 movaps 0x60(%rsp),%xmm11
4262 movaps %xmm0,0x60(%rsp)
4263 movaps 0x70(%rsp),%xmm12
4264 movaps %xmm0,0x70(%rsp)
4265 movaps 0x80(%rsp),%xmm13
4266 movaps %xmm0,0x80(%rsp)
4267 movaps 0x90(%rsp),%xmm14
4268 movaps %xmm0,0x90(%rsp)
4269 movaps 0xa0(%rsp),%xmm15
4270 movaps %xmm0,0xa0(%rsp)
4271 ___
4272 $code.=<<___;
4273 mov -8(%r11),%rbp
4274 .cfi_restore %rbp
4275 lea (%r11),%rsp
4276 .cfi_def_cfa_register %rsp
4277 .Lcbc_ret:
4278 ret
4279 .cfi_endproc
4280 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4281 ___
4282 } \f
4283 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4284 # int bits, AES_KEY *key)
4285 #
4286 # input: $inp user-supplied key
4287 # $bits $inp length in bits
4288 # $key pointer to key schedule
4289 # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
4290 # *$key key schedule
4291 #
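# A minimal C-level sketch of how this routine is typically driven (an
# illustration only; AES_KEY comes from <openssl/aes.h>):
#
#	AES_KEY dkey;
#	if (${PREFIX}_set_decrypt_key(user_key, 256, &dkey) != 0)
#		abort();	/* -1: NULL argument, -2: unsupported key length */
#	/* dkey now holds the inverse-cipher schedule consumed by the
#	   *_decrypt routines above */
#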
4292 { my ($inp,$bits,$key) = @_4args;
4293 $bits =~ s/%r/%e/;
4294
4295 $code.=<<___;
4296 .globl ${PREFIX}_set_decrypt_key
4297 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4298 .align 16
4299 ${PREFIX}_set_decrypt_key:
4300 .cfi_startproc
4301 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4302 .cfi_adjust_cfa_offset 8
4303 call __aesni_set_encrypt_key
4304 shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key
4305 test %eax,%eax
4306 jnz .Ldec_key_ret
4307 lea 16($key,$bits),$inp # points at the end of key schedule
4308
4309 $movkey ($key),%xmm0 # just swap
4310 $movkey ($inp),%xmm1
4311 $movkey %xmm0,($inp)
4312 $movkey %xmm1,($key)
4313 lea 16($key),$key
4314 lea -16($inp),$inp
4315
4316 .Ldec_key_inverse:
4317 $movkey ($key),%xmm0 # swap and inverse
4318 $movkey ($inp),%xmm1
4319 aesimc %xmm0,%xmm0
4320 aesimc %xmm1,%xmm1
4321 lea 16($key),$key
4322 lea -16($inp),$inp
4323 $movkey %xmm0,16($inp)
4324 $movkey %xmm1,-16($key)
4325 cmp $key,$inp
4326 ja .Ldec_key_inverse
4327
4328 $movkey ($key),%xmm0 # inverse middle
4329 aesimc %xmm0,%xmm0
4330 pxor %xmm1,%xmm1
4331 $movkey %xmm0,($inp)
4332 pxor %xmm0,%xmm0
4333 .Ldec_key_ret:
4334 add \$8,%rsp
4335 .cfi_adjust_cfa_offset -8
4336 ret
4337 .cfi_endproc
4338 .LSEH_end_set_decrypt_key:
4339 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4340 ___
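# The routine above builds the AES "equivalent inverse cipher" key schedule:
# the encryption round keys are reversed end for end and every key except the
# first and last is run through AESIMC (InvMixColumns), so that aesdec and
# aesdeclast can consume them in forward order. A rough intrinsics equivalent
# of the same transformation (a sketch, not the in-place swap performed
# above):
#
#	#include <wmmintrin.h>
#
#	static void invert_schedule(__m128i *dk, const __m128i *ek, int rounds)
#	{
#		dk[0] = ek[rounds];
#		for (int i = 1; i < rounds; i++)
#			dk[i] = _mm_aesimc_si128(ek[rounds - i]);
#		dk[rounds] = ek[0];
#	}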
4341 \f
4342 # This is based on a submission from Intel by
4343 # Huang Ying
4344 # Vinodh Gopal
4345 # Kahraman Akdemir
4346 #
4347 # Aggressively optimized with respect to aeskeygenassist's critical path;
4348 # the whole computation is kept in %xmm0-5 to meet the Win64 ABI requirement.
4349 #
4350 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4351 # int bits, AES_KEY * const key);
4352 #
4353 # input: $inp user-supplied key
4354 # $bits $inp length in bits
4355 # $key pointer to key schedule
4356 # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
4357 # $bits rounds-1 (used in aesni_set_decrypt_key)
4358 # *$key key schedule
4359 # $key pointer to key schedule (used in
4360 # aesni_set_decrypt_key)
4361 #
4362 # The subroutine is frame-less, which means that only volatile registers
4363 # are used. Note that it's declared "abi-omnipotent", which means that
4364 # the number of volatile registers is smaller on Windows.
4365 #
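# For reference, one step of the AES-128 expansion performed by the
# aeskeygenassist path below looks like this with compiler intrinsics
# (a sketch; the round constant must be a compile-time constant, hence
# the macro):
#
#	#include <wmmintrin.h>
#
#	static __m128i expand128_step(__m128i key, __m128i gen)
#	{
#		gen = _mm_shuffle_epi32(gen, 0xff);	/* SubWord(RotWord(w3))^rcon */
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		return _mm_xor_si128(key, gen);
#	}
#	#define EXPAND128(k, rcon) \
#		expand128_step(k, _mm_aeskeygenassist_si128(k, rcon))
#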
4366 $code.=<<___;
4367 .globl ${PREFIX}_set_encrypt_key
4368 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4369 .align 16
4370 ${PREFIX}_set_encrypt_key:
4371 __aesni_set_encrypt_key:
4372 .cfi_startproc
4373 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4374 .cfi_adjust_cfa_offset 8
4375 mov \$-1,%rax
4376 test $inp,$inp
4377 jz .Lenc_key_ret
4378 test $key,$key
4379 jz .Lenc_key_ret
4380
4381 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4382 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4383 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4384 and OPENSSL_ia32cap_P+4(%rip),%r10d
4385 lea 16($key),%rax # %rax is used as modifiable copy of $key
4386 cmp \$256,$bits
4387 je .L14rounds
4388 cmp \$192,$bits
4389 je .L12rounds
4390 cmp \$128,$bits
4391 jne .Lbad_keybits
4392
4393 .L10rounds:
4394 mov \$9,$bits # 10 rounds for 128-bit key
4395 cmp \$`1<<28`,%r10d # AVX, but no XOP
4396 je .L10rounds_alt
4397
4398 $movkey %xmm0,($key) # round 0
4399 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4400 call .Lkey_expansion_128_cold
4401 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4402 call .Lkey_expansion_128
4403 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4404 call .Lkey_expansion_128
4405 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4406 call .Lkey_expansion_128
4407 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4408 call .Lkey_expansion_128
4409 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4410 call .Lkey_expansion_128
4411 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4412 call .Lkey_expansion_128
4413 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4414 call .Lkey_expansion_128
4415 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4416 call .Lkey_expansion_128
4417 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4418 call .Lkey_expansion_128
4419 $movkey %xmm0,(%rax)
4420 mov $bits,80(%rax) # 240(%rdx)
4421 xor %eax,%eax
4422 jmp .Lenc_key_ret
4423
4424 .align 16
4425 .L10rounds_alt:
4426 movdqa .Lkey_rotate(%rip),%xmm5
4427 mov \$8,%r10d
4428 movdqa .Lkey_rcon1(%rip),%xmm4
4429 movdqa %xmm0,%xmm2
4430 movdqu %xmm0,($key)
4431 jmp .Loop_key128
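# The _alt path is taken on AVX-capable, non-XOP processors and avoids
# aeskeygenassist entirely: pshufb with .Lkey_rotate broadcasts RotWord of
# the last key word into all four dwords, and aesenclast then applies
# SubBytes and XORs in the round constant (its ShiftRows step is a no-op
# because all four columns are equal). The pslldq/pxor chain accumulates
# the running XOR of the previous round key, as in the classical expansion.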
4432
4433 .align 16
4434 .Loop_key128:
4435 pshufb %xmm5,%xmm0
4436 aesenclast %xmm4,%xmm0
4437 pslld \$1,%xmm4
4438 lea 16(%rax),%rax
4439
4440 movdqa %xmm2,%xmm3
4441 pslldq \$4,%xmm2
4442 pxor %xmm2,%xmm3
4443 pslldq \$4,%xmm2
4444 pxor %xmm2,%xmm3
4445 pslldq \$4,%xmm2
4446 pxor %xmm3,%xmm2
4447
4448 pxor %xmm2,%xmm0
4449 movdqu %xmm0,-16(%rax)
4450 movdqa %xmm0,%xmm2
4451
4452 dec %r10d
4453 jnz .Loop_key128
4454
4455 movdqa .Lkey_rcon1b(%rip),%xmm4
4456
4457 pshufb %xmm5,%xmm0
4458 aesenclast %xmm4,%xmm0
4459 pslld \$1,%xmm4
4460
4461 movdqa %xmm2,%xmm3
4462 pslldq \$4,%xmm2
4463 pxor %xmm2,%xmm3
4464 pslldq \$4,%xmm2
4465 pxor %xmm2,%xmm3
4466 pslldq \$4,%xmm2
4467 pxor %xmm3,%xmm2
4468
4469 pxor %xmm2,%xmm0
4470 movdqu %xmm0,(%rax)
4471
4472 movdqa %xmm0,%xmm2
4473 pshufb %xmm5,%xmm0
4474 aesenclast %xmm4,%xmm0
4475
4476 movdqa %xmm2,%xmm3
4477 pslldq \$4,%xmm2
4478 pxor %xmm2,%xmm3
4479 pslldq \$4,%xmm2
4480 pxor %xmm2,%xmm3
4481 pslldq \$4,%xmm2
4482 pxor %xmm3,%xmm2
4483
4484 pxor %xmm2,%xmm0
4485 movdqu %xmm0,16(%rax)
4486
4487 mov $bits,96(%rax) # 240($key)
4488 xor %eax,%eax
4489 jmp .Lenc_key_ret
4490
4491 .align 16
4492 .L12rounds:
4493 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4494 mov \$11,$bits # 12 rounds for 192
4495 cmp \$`1<<28`,%r10d # AVX, but no XOP
4496 je .L12rounds_alt
4497
4498 $movkey %xmm0,($key) # round 0
4499 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4500 call .Lkey_expansion_192a_cold
4501 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4502 call .Lkey_expansion_192b
4503 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4504 call .Lkey_expansion_192a
4505 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4506 call .Lkey_expansion_192b
4507 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4508 call .Lkey_expansion_192a
4509 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4510 call .Lkey_expansion_192b
4511 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4512 call .Lkey_expansion_192a
4513 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4514 call .Lkey_expansion_192b
4515 $movkey %xmm0,(%rax)
4516 mov $bits,48(%rax) # 240(%rdx)
4517 xor %rax, %rax
4518 jmp .Lenc_key_ret
4519
4520 .align 16
4521 .L12rounds_alt:
4522 movdqa .Lkey_rotate192(%rip),%xmm5
4523 movdqa .Lkey_rcon1(%rip),%xmm4
4524 mov \$8,%r10d
4525 movdqu %xmm0,($key)
4526 jmp .Loop_key192
4527
4528 .align 16
4529 .Loop_key192:
4530 movq %xmm2,0(%rax)
4531 movdqa %xmm2,%xmm1
4532 pshufb %xmm5,%xmm2
4533 aesenclast %xmm4,%xmm2
4534 pslld \$1, %xmm4
4535 lea 24(%rax),%rax
4536
4537 movdqa %xmm0,%xmm3
4538 pslldq \$4,%xmm0
4539 pxor %xmm0,%xmm3
4540 pslldq \$4,%xmm0
4541 pxor %xmm0,%xmm3
4542 pslldq \$4,%xmm0
4543 pxor %xmm3,%xmm0
4544
4545 pshufd \$0xff,%xmm0,%xmm3
4546 pxor %xmm1,%xmm3
4547 pslldq \$4,%xmm1
4548 pxor %xmm1,%xmm3
4549
4550 pxor %xmm2,%xmm0
4551 pxor %xmm3,%xmm2
4552 movdqu %xmm0,-16(%rax)
4553
4554 dec %r10d
4555 jnz .Loop_key192
4556
4557 mov $bits,32(%rax) # 240($key)
4558 xor %eax,%eax
4559 jmp .Lenc_key_ret
4560
4561 .align 16
4562 .L14rounds:
4563 movups 16($inp),%xmm2 # remaining half of *userKey
4564 mov \$13,$bits # 14 rounds for 256
4565 lea 16(%rax),%rax
4566 cmp \$`1<<28`,%r10d # AVX, but no XOP
4567 je .L14rounds_alt
4568
4569 $movkey %xmm0,($key) # round 0
4570 $movkey %xmm2,16($key) # round 1
4571 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4572 call .Lkey_expansion_256a_cold
4573 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4574 call .Lkey_expansion_256b
4575 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4576 call .Lkey_expansion_256a
4577 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4578 call .Lkey_expansion_256b
4579 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4580 call .Lkey_expansion_256a
4581 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4582 call .Lkey_expansion_256b
4583 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4584 call .Lkey_expansion_256a
4585 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4586 call .Lkey_expansion_256b
4587 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4588 call .Lkey_expansion_256a
4589 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4590 call .Lkey_expansion_256b
4591 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4592 call .Lkey_expansion_256a
4593 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4594 call .Lkey_expansion_256b
4595 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4596 call .Lkey_expansion_256a
4597 $movkey %xmm0,(%rax)
4598 mov $bits,16(%rax) # 240(%rdx)
4599 xor %rax,%rax
4600 jmp .Lenc_key_ret
4601
4602 .align 16
4603 .L14rounds_alt:
4604 movdqa .Lkey_rotate(%rip),%xmm5
4605 movdqa .Lkey_rcon1(%rip),%xmm4
4606 mov \$7,%r10d
4607 movdqu %xmm0,0($key)
4608 movdqa %xmm2,%xmm1
4609 movdqu %xmm2,16($key)
4610 jmp .Loop_key256
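# AES-256 interleaves two kinds of steps: the first half of .Loop_key256
# applies RotWord+SubWord+rcon to the last word of the previous odd-half key
# and folds it into the even half, while the second half applies SubWord only
# (aesenclast with an all-zero round key) to derive the next odd half.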
4611
4612 .align 16
4613 .Loop_key256:
4614 pshufb %xmm5,%xmm2
4615 aesenclast %xmm4,%xmm2
4616
4617 movdqa %xmm0,%xmm3
4618 pslldq \$4,%xmm0
4619 pxor %xmm0,%xmm3
4620 pslldq \$4,%xmm0
4621 pxor %xmm0,%xmm3
4622 pslldq \$4,%xmm0
4623 pxor %xmm3,%xmm0
4624 pslld \$1,%xmm4
4625
4626 pxor %xmm2,%xmm0
4627 movdqu %xmm0,(%rax)
4628
4629 dec %r10d
4630 jz .Ldone_key256
4631
4632 pshufd \$0xff,%xmm0,%xmm2
4633 pxor %xmm3,%xmm3
4634 aesenclast %xmm3,%xmm2
4635
4636 movdqa %xmm1,%xmm3
4637 pslldq \$4,%xmm1
4638 pxor %xmm1,%xmm3
4639 pslldq \$4,%xmm1
4640 pxor %xmm1,%xmm3
4641 pslldq \$4,%xmm1
4642 pxor %xmm3,%xmm1
4643
4644 pxor %xmm1,%xmm2
4645 movdqu %xmm2,16(%rax)
4646 lea 32(%rax),%rax
4647 movdqa %xmm2,%xmm1
4648
4649 jmp .Loop_key256
4650
4651 .Ldone_key256:
4652 mov $bits,16(%rax) # 240($key)
4653 xor %eax,%eax
4654 jmp .Lenc_key_ret
4655
4656 .align 16
4657 .Lbad_keybits:
4658 mov \$-2,%rax
4659 .Lenc_key_ret:
4660 pxor %xmm0,%xmm0
4661 pxor %xmm1,%xmm1
4662 pxor %xmm2,%xmm2
4663 pxor %xmm3,%xmm3
4664 pxor %xmm4,%xmm4
4665 pxor %xmm5,%xmm5
4666 add \$8,%rsp
4667 .cfi_adjust_cfa_offset -8
4668 ret
4669 .LSEH_end_set_encrypt_key:
4670 \f
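# In the expansion helpers below, the two shufps/xorps pairs fold the previous
# round key into its running XOR, i.e. (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3),
# and the final shufps/pshufd broadcasts the relevant aeskeygenassist output
# dword before it is XORed in to form the next round key.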
4671 .align 16
4672 .Lkey_expansion_128:
4673 $movkey %xmm0,(%rax)
4674 lea 16(%rax),%rax
4675 .Lkey_expansion_128_cold:
4676 shufps \$0b00010000,%xmm0,%xmm4
4677 xorps %xmm4, %xmm0
4678 shufps \$0b10001100,%xmm0,%xmm4
4679 xorps %xmm4, %xmm0
4680 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4681 xorps %xmm1,%xmm0
4682 ret
4683
4684 .align 16
4685 .Lkey_expansion_192a:
4686 $movkey %xmm0,(%rax)
4687 lea 16(%rax),%rax
4688 .Lkey_expansion_192a_cold:
4689 movaps %xmm2, %xmm5
4690 .Lkey_expansion_192b_warm:
4691 shufps \$0b00010000,%xmm0,%xmm4
4692 movdqa %xmm2,%xmm3
4693 xorps %xmm4,%xmm0
4694 shufps \$0b10001100,%xmm0,%xmm4
4695 pslldq \$4,%xmm3
4696 xorps %xmm4,%xmm0
4697 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4698 pxor %xmm3,%xmm2
4699 pxor %xmm1,%xmm0
4700 pshufd \$0b11111111,%xmm0,%xmm3
4701 pxor %xmm3,%xmm2
4702 ret
4703
4704 .align 16
4705 .Lkey_expansion_192b:
4706 movaps %xmm0,%xmm3
4707 shufps \$0b01000100,%xmm0,%xmm5
4708 $movkey %xmm5,(%rax)
4709 shufps \$0b01001110,%xmm2,%xmm3
4710 $movkey %xmm3,16(%rax)
4711 lea 32(%rax),%rax
4712 jmp .Lkey_expansion_192b_warm
4713
4714 .align 16
4715 .Lkey_expansion_256a:
4716 $movkey %xmm2,(%rax)
4717 lea 16(%rax),%rax
4718 .Lkey_expansion_256a_cold:
4719 shufps \$0b00010000,%xmm0,%xmm4
4720 xorps %xmm4,%xmm0
4721 shufps \$0b10001100,%xmm0,%xmm4
4722 xorps %xmm4,%xmm0
4723 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4724 xorps %xmm1,%xmm0
4725 ret
4726
4727 .align 16
4728 .Lkey_expansion_256b:
4729 $movkey %xmm0,(%rax)
4730 lea 16(%rax),%rax
4731
4732 shufps \$0b00010000,%xmm2,%xmm4
4733 xorps %xmm4,%xmm2
4734 shufps \$0b10001100,%xmm2,%xmm4
4735 xorps %xmm4,%xmm2
4736 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4737 xorps %xmm1,%xmm2
4738 ret
4739 .cfi_endproc
4740 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4741 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4742 ___
4743 }
4744 \f
4745 $code.=<<___;
4746 .align 64
4747 .Lbswap_mask:
4748 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 # pshufb mask: reverse the byte order of an xmm register
4749 .Lincrement32:
4750 .long 6,6,6,0 # dword counter increments used by the CTR code
4751 .Lincrement64:
4752 .long 1,0,0,0 # low-qword increment by one
4753 .Lxts_magic:
4754 .long 0x87,0,1,0 # GF(2^128) reduction constant for the XTS tweak
4755 .Lincrement1:
4756 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 # big-endian increment by one
4757 .Lkey_rotate:
4758 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d # pshufb mask: broadcast RotWord of the last key word
4759 .Lkey_rotate192:
4760 .long 0x04070605,0x04070605,0x04070605,0x04070605 # pshufb mask: broadcast RotWord for the 192-bit schedule
4761 .Lkey_rcon1:
4762 .long 1,1,1,1 # round constant, doubled each iteration
4763 .Lkey_rcon1b:
4764 .long 0x1b,0x1b,0x1b,0x1b # round constant 0x1b (follows 0x80 in the rcon sequence)
4765
4766 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4767 .align 64
4768 ___
4769
4770 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4771 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
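# The handlers below all follow the same pattern: decide from the faulting
# context->Rip whether the frame lies between the prologue and epilogue labels
# recorded in HandlerData, copy the saved %xmm6+ registers back into the
# CONTEXT record, restore the non-volatile GPRs, and let RtlVirtualUnwind
# continue the search.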
4772 if ($win64) {
4773 $rec="%rcx";
4774 $frame="%rdx";
4775 $context="%r8";
4776 $disp="%r9";
4777
4778 $code.=<<___;
4779 .extern __imp_RtlVirtualUnwind
4780 ___
4781 $code.=<<___ if ($PREFIX eq "aesni");
4782 .type ecb_ccm64_se_handler,\@abi-omnipotent
4783 .align 16
4784 ecb_ccm64_se_handler:
4785 push %rsi
4786 push %rdi
4787 push %rbx
4788 push %rbp
4789 push %r12
4790 push %r13
4791 push %r14
4792 push %r15
4793 pushfq
4794 sub \$64,%rsp
4795
4796 mov 120($context),%rax # pull context->Rax
4797 mov 248($context),%rbx # pull context->Rip
4798
4799 mov 8($disp),%rsi # disp->ImageBase
4800 mov 56($disp),%r11 # disp->HandlerData
4801
4802 mov 0(%r11),%r10d # HandlerData[0]
4803 lea (%rsi,%r10),%r10 # prologue label
4804 cmp %r10,%rbx # context->Rip<prologue label
4805 jb .Lcommon_seh_tail
4806
4807 mov 152($context),%rax # pull context->Rsp
4808
4809 mov 4(%r11),%r10d # HandlerData[1]
4810 lea (%rsi,%r10),%r10 # epilogue label
4811 cmp %r10,%rbx # context->Rip>=epilogue label
4812 jae .Lcommon_seh_tail
4813
4814 lea 0(%rax),%rsi # %xmm save area
4815 lea 512($context),%rdi # &context.Xmm6
4816 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4817 .long 0xa548f3fc # cld; rep movsq
4818 lea 0x58(%rax),%rax # adjust stack pointer
4819
4820 jmp .Lcommon_seh_tail
4821 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4822
4823 .type ctr_xts_se_handler,\@abi-omnipotent
4824 .align 16
4825 ctr_xts_se_handler:
4826 push %rsi
4827 push %rdi
4828 push %rbx
4829 push %rbp
4830 push %r12
4831 push %r13
4832 push %r14
4833 push %r15
4834 pushfq
4835 sub \$64,%rsp
4836
4837 mov 120($context),%rax # pull context->Rax
4838 mov 248($context),%rbx # pull context->Rip
4839
4840 mov 8($disp),%rsi # disp->ImageBase
4841 mov 56($disp),%r11 # disp->HandlerData
4842
4843 mov 0(%r11),%r10d # HandlerData[0]
4844 lea (%rsi,%r10),%r10 # prologue label
4845 cmp %r10,%rbx # context->Rip<prologue label
4846 jb .Lcommon_seh_tail
4847
4848 mov 152($context),%rax # pull context->Rsp
4849
4850 mov 4(%r11),%r10d # HandlerData[1]
4851 lea (%rsi,%r10),%r10 # epilogue label
4852 cmp %r10,%rbx # context->Rip>=epilogue label
4853 jae .Lcommon_seh_tail
4854
4855 mov 208($context),%rax # pull context->R11
4856
4857 lea -0xa8(%rax),%rsi # %xmm save area
4858 lea 512($context),%rdi # & context.Xmm6
4859 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4860 .long 0xa548f3fc # cld; rep movsq
4861
4862 mov -8(%rax),%rbp # restore saved %rbp
4863 mov %rbp,160($context) # restore context->Rbp
4864 jmp .Lcommon_seh_tail
4865 .size ctr_xts_se_handler,.-ctr_xts_se_handler
4866
4867 .type ocb_se_handler,\@abi-omnipotent
4868 .align 16
4869 ocb_se_handler:
4870 push %rsi
4871 push %rdi
4872 push %rbx
4873 push %rbp
4874 push %r12
4875 push %r13
4876 push %r14
4877 push %r15
4878 pushfq
4879 sub \$64,%rsp
4880
4881 mov 120($context),%rax # pull context->Rax
4882 mov 248($context),%rbx # pull context->Rip
4883
4884 mov 8($disp),%rsi # disp->ImageBase
4885 mov 56($disp),%r11 # disp->HandlerData
4886
4887 mov 0(%r11),%r10d # HandlerData[0]
4888 lea (%rsi,%r10),%r10 # prologue label
4889 cmp %r10,%rbx # context->Rip<prologue label
4890 jb .Lcommon_seh_tail
4891
4892 mov 4(%r11),%r10d # HandlerData[1]
4893 lea (%rsi,%r10),%r10 # epilogue label
4894 cmp %r10,%rbx # context->Rip>=epilogue label
4895 jae .Lcommon_seh_tail
4896
4897 mov 8(%r11),%r10d # HandlerData[2]
4898 lea (%rsi,%r10),%r10
4899 cmp %r10,%rbx # context->Rip>=pop label
4900 jae .Locb_no_xmm
4901
4902 mov 152($context),%rax # pull context->Rsp
4903
4904 lea (%rax),%rsi # %xmm save area
4905 lea 512($context),%rdi # & context.Xmm6
4906 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4907 .long 0xa548f3fc # cld; rep movsq
4908 lea 0xa0+0x28(%rax),%rax
4909
4910 .Locb_no_xmm:
4911 mov -8(%rax),%rbx
4912 mov -16(%rax),%rbp
4913 mov -24(%rax),%r12
4914 mov -32(%rax),%r13
4915 mov -40(%rax),%r14
4916
4917 mov %rbx,144($context) # restore context->Rbx
4918 mov %rbp,160($context) # restore context->Rbp
4919 mov %r12,216($context) # restore context->R12
4920 mov %r13,224($context) # restore context->R13
4921 mov %r14,232($context) # restore context->R14
4922
4923 jmp .Lcommon_seh_tail
4924 .size ocb_se_handler,.-ocb_se_handler
4925 ___
4926 $code.=<<___;
4927 .type cbc_se_handler,\@abi-omnipotent
4928 .align 16
4929 cbc_se_handler:
4930 push %rsi
4931 push %rdi
4932 push %rbx
4933 push %rbp
4934 push %r12
4935 push %r13
4936 push %r14
4937 push %r15
4938 pushfq
4939 sub \$64,%rsp
4940
4941 mov 152($context),%rax # pull context->Rsp
4942 mov 248($context),%rbx # pull context->Rip
4943
4944 lea .Lcbc_decrypt_bulk(%rip),%r10
4945 cmp %r10,%rbx # context->Rip<"prologue" label
4946 jb .Lcommon_seh_tail
4947
4948 mov 120($context),%rax # pull context->Rax
4949
4950 lea .Lcbc_decrypt_body(%rip),%r10
4951 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4952 jb .Lcommon_seh_tail
4953
4954 mov 152($context),%rax # pull context->Rsp
4955
4956 lea .Lcbc_ret(%rip),%r10
4957 cmp %r10,%rbx # context->Rip>="epilogue" label
4958 jae .Lcommon_seh_tail
4959
4960 lea 16(%rax),%rsi # %xmm save area
4961 lea 512($context),%rdi # &context.Xmm6
4962 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4963 .long 0xa548f3fc # cld; rep movsq
4964
4965 mov 208($context),%rax # pull context->R11
4966
4967 mov -8(%rax),%rbp # restore saved %rbp
4968 mov %rbp,160($context) # restore context->Rbp
4969
4970 .Lcommon_seh_tail:
4971 mov 8(%rax),%rdi
4972 mov 16(%rax),%rsi
4973 mov %rax,152($context) # restore context->Rsp
4974 mov %rsi,168($context) # restore context->Rsi
4975 mov %rdi,176($context) # restore context->Rdi
4976
4977 mov 40($disp),%rdi # disp->ContextRecord
4978 mov $context,%rsi # context
4979 mov \$154,%ecx # sizeof(CONTEXT)
4980 .long 0xa548f3fc # cld; rep movsq
4981
4982 mov $disp,%rsi
4983 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4984 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4985 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4986 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4987 mov 40(%rsi),%r10 # disp->ContextRecord
4988 lea 56(%rsi),%r11 # &disp->HandlerData
4989 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4990 mov %r10,32(%rsp) # arg5
4991 mov %r11,40(%rsp) # arg6
4992 mov %r12,48(%rsp) # arg7
4993 mov %rcx,56(%rsp) # arg8, (NULL)
4994 call *__imp_RtlVirtualUnwind(%rip)
4995
4996 mov \$1,%eax # ExceptionContinueSearch
4997 add \$64,%rsp
4998 popfq
4999 pop %r15
5000 pop %r14
5001 pop %r13
5002 pop %r12
5003 pop %rbp
5004 pop %rbx
5005 pop %rdi
5006 pop %rsi
5007 ret
5008 .size cbc_se_handler,.-cbc_se_handler
5009
5010 .section .pdata
5011 .align 4
5012 ___
5013 $code.=<<___ if ($PREFIX eq "aesni");
5014 .rva .LSEH_begin_aesni_ecb_encrypt
5015 .rva .LSEH_end_aesni_ecb_encrypt
5016 .rva .LSEH_info_ecb
5017
5018 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
5019 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
5020 .rva .LSEH_info_ccm64_enc
5021
5022 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
5023 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
5024 .rva .LSEH_info_ccm64_dec
5025
5026 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
5027 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
5028 .rva .LSEH_info_ctr32
5029
5030 .rva .LSEH_begin_aesni_xts_encrypt
5031 .rva .LSEH_end_aesni_xts_encrypt
5032 .rva .LSEH_info_xts_enc
5033
5034 .rva .LSEH_begin_aesni_xts_decrypt
5035 .rva .LSEH_end_aesni_xts_decrypt
5036 .rva .LSEH_info_xts_dec
5037
5038 .rva .LSEH_begin_aesni_ocb_encrypt
5039 .rva .LSEH_end_aesni_ocb_encrypt
5040 .rva .LSEH_info_ocb_enc
5041
5042 .rva .LSEH_begin_aesni_ocb_decrypt
5043 .rva .LSEH_end_aesni_ocb_decrypt
5044 .rva .LSEH_info_ocb_dec
5045 ___
5046 $code.=<<___;
5047 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5048 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5049 .rva .LSEH_info_cbc
5050
5051 .rva ${PREFIX}_set_decrypt_key
5052 .rva .LSEH_end_set_decrypt_key
5053 .rva .LSEH_info_key
5054
5055 .rva ${PREFIX}_set_encrypt_key
5056 .rva .LSEH_end_set_encrypt_key
5057 .rva .LSEH_info_key
5058 .section .xdata
5059 .align 8
5060 ___
5061 $code.=<<___ if ($PREFIX eq "aesni");
5062 .LSEH_info_ecb:
5063 .byte 9,0,0,0
5064 .rva ecb_ccm64_se_handler
5065 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5066 .LSEH_info_ccm64_enc:
5067 .byte 9,0,0,0
5068 .rva ecb_ccm64_se_handler
5069 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5070 .LSEH_info_ccm64_dec:
5071 .byte 9,0,0,0
5072 .rva ecb_ccm64_se_handler
5073 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5074 .LSEH_info_ctr32:
5075 .byte 9,0,0,0
5076 .rva ctr_xts_se_handler
5077 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5078 .LSEH_info_xts_enc:
5079 .byte 9,0,0,0
5080 .rva ctr_xts_se_handler
5081 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5082 .LSEH_info_xts_dec:
5083 .byte 9,0,0,0
5084 .rva ctr_xts_se_handler
5085 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5086 .LSEH_info_ocb_enc:
5087 .byte 9,0,0,0
5088 .rva ocb_se_handler
5089 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5090 .rva .Locb_enc_pop
5091 .long 0
5092 .LSEH_info_ocb_dec:
5093 .byte 9,0,0,0
5094 .rva ocb_se_handler
5095 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5096 .rva .Locb_dec_pop
5097 .long 0
5098 ___
5099 $code.=<<___;
5100 .LSEH_info_cbc:
5101 .byte 9,0,0,0
5102 .rva cbc_se_handler
5103 .LSEH_info_key:
5104 .byte 0x01,0x04,0x01,0x00
5105 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
5106 ___
5107 }
5108
5109 sub rex {
5110 local *opcode=shift;
5111 my ($dst,$src)=@_;
5112 my $rex=0;
5113
5114 $rex|=0x04 if($dst>=8);
5115 $rex|=0x01 if($src>=8);
5116 push @opcode,$rex|0x40 if($rex);
5117 }
5118
5119 sub aesni {
5120 my $line=shift;
5121 my @opcode=(0x66);
5122
5123 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5124 rex(\@opcode,$4,$3);
5125 push @opcode,0x0f,0x3a,0xdf;
5126 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
5127 my $c=$2;
5128 push @opcode,$c=~/^0/?oct($c):$c;
5129 return ".byte\t".join(',',@opcode);
5130 }
5131 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5132 my %opcodelet = (
5133 "aesimc" => 0xdb,
5134 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5135 "aesdec" => 0xde, "aesdeclast" => 0xdf
5136 );
5137 return undef if (!defined($opcodelet{$1}));
5138 rex(\@opcode,$3,$2);
5139 push @opcode,0x0f,0x38,$opcodelet{$1};
5140 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
5141 return ".byte\t".join(',',@opcode);
5142 }
5143 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5144 my %opcodelet = (
5145 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5146 "aesdec" => 0xde, "aesdeclast" => 0xdf
5147 );
5148 return undef if (!defined($opcodelet{$1}));
5149 my $off = $2;
5150 push @opcode,0x44 if ($3>=8);
5151 push @opcode,0x0f,0x38,$opcodelet{$1};
5152 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
5153 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5154 return ".byte\t".join(',',@opcode);
5155 }
5156 return $line;
5157 }
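# For example, "aesenc %xmm1,%xmm0" is emitted as ".byte 0x66,0x0f,0x38,0xdc,0xc1"
# (ModR/M 0xc1 = mod 11b, reg = xmm0, r/m = xmm1); a REX prefix is inserted
# whenever either operand is %xmm8..%xmm15.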
5158
5159 sub movbe {
5160 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
5161 }
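# Encodes "movbe %eax,N(%rsp)" as 0f 38 f1 /r with a SIB byte for %rsp and an
# 8-bit displacement, again emitted as .byte for assemblers that predate MOVBE.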
5162
5163 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
5164 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5165 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
5166 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5167
5168 print $code;
5169
5170 close STDOUT or die "error closing STDOUT: $!";