]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/aes/asm/aesni-x86_64.pl
x86_64 assembly pack: tolerate spaces in source directory name.
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20 # details].
21 #
22 # Performance.
23 #
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
31 # decryption.
32 #
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
40 #
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
50 #
51 # Looking at the results for 8-KB buffer.
52 #
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
65 # limit.
66 #
67 # Looking at how results vary with buffer size.
68 #
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is "slowest" changing one
72 # with "256-byte" result being 87% of "8-KB." This is because overhead
73 # in CTR mode is most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
76 #
77 # Results for 192- and 256-bit keys.
78 #
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87 # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89 # January 2011
90 #
91 # While Westmere processor features 6 cycles latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
96 #
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
116 #
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
130 #
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
134 #
135 # "as if" interleave factor 4.7x 5.8x 6.0x
136 #
137 # Further data for other parallelizable modes:
138 #
139 # CBC decrypt 1.16 0.93 0.74
140 # CTR 1.14 0.91 0.74
141 #
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim disproportionally small amount
149 # of additional cycles, but in 8x case number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
154 #
155 # Higher interleave factors do have negative impact on Westmere
156 # performance. While for ECB mode it's negligible ~1.5%, other
157 # parallelizables perform ~5% worse, which is outweighed by ~25%
158 # improvement on Sandy Bridge. To balance regression on Westmere
159 # CTR mode was implemented with 6x aesenc interleave factor.
160
161 # April 2011
162 #
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167 # November 2015
168 #
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170 # chosen to be 6x.
171
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
175 #
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
183 #
184 # (*) Atom Silvermont ECB result is suboptimal because of penalties
185 # incurred by operations on %xmm8-15. As ECB is not considered
186 # critical, nothing was done to mitigate the problem.
187
188 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
189 # generates drop-in replacement for
190 # crypto/aes/asm/aes-x86_64.pl:-)
191
# Command line is "[flavour] output".  If the first argument contains a
# dot it is taken to be the output file name and no flavour is assumed.
192 $flavour = shift;
193 $output = shift;
194 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
195
# Win64 calling convention is selected either by flavour or by an
# output file name ending in ".asm".
196 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
197
# Look for the x86_64-xlate.pl translator next to this script first, then
# in ../../perlasm relative to it.  $dir keeps its trailing separator.
198 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
199 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
200 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
201 die "can't locate x86_64-xlate.pl";
202
# Everything printed to STDOUT from here on is piped through the
# translator.  Abort if the pipe cannot be opened rather than carrying on
# with an unusable output handle.
203 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
204 *STDOUT=*OUT;
205
# Instruction mnemonic used for loading round keys.  Both arms of the
# ternary are "movups", so the result does not depend on $PREFIX.
206 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# Registers holding the first four integer arguments for the selected
# calling convention; used by the 3-argument $PREFIX_[en|de]crypt entries.
207 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
208 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
209
# $code accumulates the generated assembly text; every generator below
# appends to it.
210 $code=".text\n";
211 $code.=".extern OPENSSL_ia32cap_P\n";
212
213 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
214 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
215 $inp="%rdi";
216 $out="%rsi";
217 $len="%rdx";
218 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
219 $ivp="%r8"; # cbc, ctr, ...
220
221 $rnds_="%r10d"; # backup copy for $rounds
222 $key_="%r11"; # backup copy for $key
223
224 # %xmm register layout
225 $rndkey0="%xmm0"; $rndkey1="%xmm1";
226 $inout0="%xmm2"; $inout1="%xmm3";
227 $inout2="%xmm4"; $inout3="%xmm5";
228 $inout4="%xmm6"; $inout5="%xmm7";
229 $inout6="%xmm8"; $inout7="%xmm9";
230
# NOTE: the names below alias registers already assigned above:
# $in2/$in1 share %xmm6/%xmm7 with $inout4/$inout5, and $in0/$iv share
# %xmm8/%xmm9 with $inout6/$inout7.
231 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
232 $in0="%xmm8"; $iv="%xmm9";
233 \f
234 # Inline version of internal aesni_[en|de]crypt1.
235 #
236 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
237 # cycles which take care of loop variables...
# aesni_generate1($p,$key,$rounds,$inout,$ivec) appends to $code an inline
# single-block sequence built from aes${p} / aes${p}last.  Callers in this
# file pass "enc" or "dec" for $p.
#
#   $key    - register pointing at the key schedule; the emitted code
#             advances it (lea 32(...), then lea 16(...) per iteration)
#   $rounds - register holding the round counter; the emitted loop
#             decrements it until it reaches zero
#   $inout  - xmm register transformed in place; defaults to $inout0
#   $ivec   - optional xmm register; when given, round key 0 is xor-ed
#             into $ivec itself and $ivec is then xor-ed into $inout,
#             otherwise round key 0 is xor-ed straight into $inout
#
# $sn lives in the enclosing bare block so it persists across calls; it
# is used to give every expansion its own .Loop_ label.
238 { my $sn;
239 sub aesni_generate1 {
240 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
# new label suffix for this expansion
241 ++$sn;
242 $code.=<<___;
243 $movkey ($key),$rndkey0
244 $movkey 16($key),$rndkey1
245 ___
246 $code.=<<___ if (defined($ivec));
247 xorps $rndkey0,$ivec
248 lea 32($key),$key
249 xorps $ivec,$inout
250 ___
251 $code.=<<___ if (!defined($ivec));
252 lea 32($key),$key
253 xorps $rndkey0,$inout
254 ___
255 $code.=<<___;
256 .Loop_${p}1_$sn:
257 aes${p} $rndkey1,$inout
258 dec $rounds
259 $movkey ($key),$rndkey1
260 lea 16($key),$key
261 jnz .Loop_${p}1_$sn # loop body is 16 bytes
262 aes${p}last $rndkey1,$inout
263 ___
264 }}
265 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
266 #
# Emit the public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt.  Inside this bare block $inp, $out and $key are
# lexicals taken from @_4args, i.e. the first three argument registers of
# whichever calling convention was selected, shadowing the globals of the
# same names.  Each routine loads one 16-byte block from $inp, reads the
# round count from offset 240 of the key structure, runs the inline
# single-block sequence, stores the result to $out and zeroes $rndkey0,
# $rndkey1 and $inout0 before returning.
267 { my ($inp,$out,$key) = @_4args;
268
269 $code.=<<___;
270 .globl ${PREFIX}_encrypt
271 .type ${PREFIX}_encrypt,\@abi-omnipotent
272 .align 16
273 ${PREFIX}_encrypt:
274 movups ($inp),$inout0 # load input
275 mov 240($key),$rounds # key->rounds
276 ___
# inline encrypt rounds on $inout0 (default register, no $ivec)
277 &aesni_generate1("enc",$key,$rounds);
278 $code.=<<___;
279 pxor $rndkey0,$rndkey0 # clear register bank
280 pxor $rndkey1,$rndkey1
281 movups $inout0,($out) # output
282 pxor $inout0,$inout0
283 ret
284 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
285
286 .globl ${PREFIX}_decrypt
287 .type ${PREFIX}_decrypt,\@abi-omnipotent
288 .align 16
289 ${PREFIX}_decrypt:
290 movups ($inp),$inout0 # load input
291 mov 240($key),$rounds # key->rounds
292 ___
# inline decrypt rounds on $inout0 (default register, no $ivec)
293 &aesni_generate1("dec",$key,$rounds);
294 $code.=<<___;
295 pxor $rndkey0,$rndkey0 # clear register bank
296 pxor $rndkey1,$rndkey1
297 movups $inout0,($out) # output
298 pxor $inout0,$inout0
299 ret
300 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
301 ___
302 }
303 \f
304 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
305 # factor. Why were 3x subroutines originally used in loops? Even though
306 # aes[enc|dec] latency was originally 6, it could be scheduled only
307 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
308 # utilization, i.e. when subroutine's throughput is virtually same as
309 # of non-interleaved subroutine [for number of input blocks up to 3].
310 # This is why it originally made no sense to implement 2x subroutine.
311 # But times change and it became appropriate to spend extra 192 bytes
312 # on 2x subroutine on Atom Silvermont account. For processors that
313 # can schedule aes[enc|dec] every cycle, the optimal interleave factor
314 # equals the corresponding instruction's latency. 8x is optimal for
315 # * Bridge and "super-optimal" for other Intel CPUs...
316
# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2 to
# $code.  Callers in this file pass "enc" or "dec", so "${dir}rypt"
# spells "encrypt" or "decrypt".  The emitted routine runs $inout0 and
# $inout1 through the cipher side by side.
#
# Indexing used by the emitted round loop: $rounds is multiplied by 16
# (shl 4), $key is moved 32+16*rounds bytes past the start of the key
# schedule, and %rax (the 64-bit form of $rounds, hard-coded in the
# template) is negated and biased by 16.  Round keys are then fetched at
# ($key,%rax) while %rax climbs by 32 per iteration, two rounds being
# applied per iteration.  The closing jnz relies on the flags left by
# "add \$32,%rax"; the aes and $movkey instructions in between are assumed
# not to alter flags.
317 sub aesni_generate2 {
# this lexical $dir ("enc"/"dec") shadows the file-level $dir
318 my $dir=shift;
319 # As already mentioned it takes in $key and $rounds, which are *not*
320 # preserved. $inout[0-1] is cipher/clear text...
321 $code.=<<___;
322 .type _aesni_${dir}rypt2,\@abi-omnipotent
323 .align 16
324 _aesni_${dir}rypt2:
325 $movkey ($key),$rndkey0
326 shl \$4,$rounds
327 $movkey 16($key),$rndkey1
328 xorps $rndkey0,$inout0
329 xorps $rndkey0,$inout1
330 $movkey 32($key),$rndkey0
331 lea 32($key,$rounds),$key
332 neg %rax # $rounds
333 add \$16,%rax
334
335 .L${dir}_loop2:
336 aes${dir} $rndkey1,$inout0
337 aes${dir} $rndkey1,$inout1
338 $movkey ($key,%rax),$rndkey1
339 add \$32,%rax
340 aes${dir} $rndkey0,$inout0
341 aes${dir} $rndkey0,$inout1
342 $movkey -16($key,%rax),$rndkey0
343 jnz .L${dir}_loop2
344
345 aes${dir} $rndkey1,$inout0
346 aes${dir} $rndkey1,$inout1
347 aes${dir}last $rndkey0,$inout0
348 aes${dir}last $rndkey0,$inout1
349 ret
350 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
351 ___
352 }
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3 to
# $code; callers in this file pass "enc" or "dec".  The emitted routine
# processes $inout0..$inout2 together.
#
# Round keys are addressed as ($key,%rax): $rounds is scaled by 16, $key
# is moved to 32+16*rounds bytes past the start of the schedule, and %rax
# (64-bit form of $rounds, hard-coded in the template) starts negative
# and is stepped by 32 per iteration until the add yields zero.  Each
# iteration applies two rounds, alternating $rndkey1 and $rndkey0.
353 sub aesni_generate3 {
# this lexical $dir ("enc"/"dec") shadows the file-level $dir
354 my $dir=shift;
355 # As already mentioned it takes in $key and $rounds, which are *not*
356 # preserved. $inout[0-2] is cipher/clear text...
357 $code.=<<___;
358 .type _aesni_${dir}rypt3,\@abi-omnipotent
359 .align 16
360 _aesni_${dir}rypt3:
361 $movkey ($key),$rndkey0
362 shl \$4,$rounds
363 $movkey 16($key),$rndkey1
364 xorps $rndkey0,$inout0
365 xorps $rndkey0,$inout1
366 xorps $rndkey0,$inout2
367 $movkey 32($key),$rndkey0
368 lea 32($key,$rounds),$key
369 neg %rax # $rounds
370 add \$16,%rax
371
372 .L${dir}_loop3:
373 aes${dir} $rndkey1,$inout0
374 aes${dir} $rndkey1,$inout1
375 aes${dir} $rndkey1,$inout2
376 $movkey ($key,%rax),$rndkey1
377 add \$32,%rax
378 aes${dir} $rndkey0,$inout0
379 aes${dir} $rndkey0,$inout1
380 aes${dir} $rndkey0,$inout2
381 $movkey -16($key,%rax),$rndkey0
382 jnz .L${dir}_loop3
383
384 aes${dir} $rndkey1,$inout0
385 aes${dir} $rndkey1,$inout1
386 aes${dir} $rndkey1,$inout2
387 aes${dir}last $rndkey0,$inout0
388 aes${dir}last $rndkey0,$inout1
389 aes${dir}last $rndkey0,$inout2
390 ret
391 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
392 ___
393 }
394 # 4x interleave is implemented to improve small block performance,
395 # most notably [and naturally] 4 block by ~30%. One can argue that one
396 # should have implemented 5x as well, but improvement would be <20%,
397 # so it's not worth it...
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4 to
# $code; callers in this file pass "enc" or "dec".  The emitted routine
# processes $inout0..$inout3 together, two rounds per loop iteration,
# with round keys addressed as ($key,%rax) and %rax counting up from a
# negative offset to zero in steps of 32.
#
# NOTE(review): the ".byte 0x0f,0x1f,0x00" between neg and add encodes a
# 3-byte NOP; presumably it pads the prologue so the loop head lands on a
# favourable boundary -- confirm before touching it.
398 sub aesni_generate4 {
# this lexical $dir ("enc"/"dec") shadows the file-level $dir
399 my $dir=shift;
400 # As already mentioned it takes in $key and $rounds, which are *not*
401 # preserved. $inout[0-3] is cipher/clear text...
402 $code.=<<___;
403 .type _aesni_${dir}rypt4,\@abi-omnipotent
404 .align 16
405 _aesni_${dir}rypt4:
406 $movkey ($key),$rndkey0
407 shl \$4,$rounds
408 $movkey 16($key),$rndkey1
409 xorps $rndkey0,$inout0
410 xorps $rndkey0,$inout1
411 xorps $rndkey0,$inout2
412 xorps $rndkey0,$inout3
413 $movkey 32($key),$rndkey0
414 lea 32($key,$rounds),$key
415 neg %rax # $rounds
416 .byte 0x0f,0x1f,0x00
417 add \$16,%rax
418
419 .L${dir}_loop4:
420 aes${dir} $rndkey1,$inout0
421 aes${dir} $rndkey1,$inout1
422 aes${dir} $rndkey1,$inout2
423 aes${dir} $rndkey1,$inout3
424 $movkey ($key,%rax),$rndkey1
425 add \$32,%rax
426 aes${dir} $rndkey0,$inout0
427 aes${dir} $rndkey0,$inout1
428 aes${dir} $rndkey0,$inout2
429 aes${dir} $rndkey0,$inout3
430 $movkey -16($key,%rax),$rndkey0
431 jnz .L${dir}_loop4
432
433 aes${dir} $rndkey1,$inout0
434 aes${dir} $rndkey1,$inout1
435 aes${dir} $rndkey1,$inout2
436 aes${dir} $rndkey1,$inout3
437 aes${dir}last $rndkey0,$inout0
438 aes${dir}last $rndkey0,$inout1
439 aes${dir}last $rndkey0,$inout2
440 aes${dir}last $rndkey0,$inout3
441 ret
442 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
443 ___
444 }
# aesni_generate6($dir) appends the private routine _aesni_${dir}rypt6 to
# $code; callers in this file pass "enc" or "dec".  The emitted routine
# processes $inout0..$inout5 together and, like the smaller variants,
# consumes $key and $rounds.
#
# Unlike the 2x/3x/4x templates, the prologue is software-pipelined: the
# round-key-0 xor of the later blocks is interleaved with the first
# aes${dir} round (using $rndkey1) on $inout0..$inout2.  Control then
# jumps into the middle of the loop body at .L${dir}_loop6_enter, where
# $inout3..$inout5 receive that same round, so the top half of the loop
# is skipped on the first pass only.
445 sub aesni_generate6 {
# this lexical $dir ("enc"/"dec") shadows the file-level $dir
446 my $dir=shift;
447 # As already mentioned it takes in $key and $rounds, which are *not*
448 # preserved. $inout[0-5] is cipher/clear text...
449 $code.=<<___;
450 .type _aesni_${dir}rypt6,\@abi-omnipotent
451 .align 16
452 _aesni_${dir}rypt6:
453 $movkey ($key),$rndkey0
454 shl \$4,$rounds
455 $movkey 16($key),$rndkey1
456 xorps $rndkey0,$inout0
457 pxor $rndkey0,$inout1
458 pxor $rndkey0,$inout2
459 aes${dir} $rndkey1,$inout0
460 lea 32($key,$rounds),$key
461 neg %rax # $rounds
462 aes${dir} $rndkey1,$inout1
463 pxor $rndkey0,$inout3
464 pxor $rndkey0,$inout4
465 aes${dir} $rndkey1,$inout2
466 pxor $rndkey0,$inout5
467 $movkey ($key,%rax),$rndkey0
468 add \$16,%rax
469 jmp .L${dir}_loop6_enter
470 .align 16
471 .L${dir}_loop6:
472 aes${dir} $rndkey1,$inout0
473 aes${dir} $rndkey1,$inout1
474 aes${dir} $rndkey1,$inout2
475 .L${dir}_loop6_enter:
476 aes${dir} $rndkey1,$inout3
477 aes${dir} $rndkey1,$inout4
478 aes${dir} $rndkey1,$inout5
479 $movkey ($key,%rax),$rndkey1
480 add \$32,%rax
481 aes${dir} $rndkey0,$inout0
482 aes${dir} $rndkey0,$inout1
483 aes${dir} $rndkey0,$inout2
484 aes${dir} $rndkey0,$inout3
485 aes${dir} $rndkey0,$inout4
486 aes${dir} $rndkey0,$inout5
487 $movkey -16($key,%rax),$rndkey0
488 jnz .L${dir}_loop6
489
490 aes${dir} $rndkey1,$inout0
491 aes${dir} $rndkey1,$inout1
492 aes${dir} $rndkey1,$inout2
493 aes${dir} $rndkey1,$inout3
494 aes${dir} $rndkey1,$inout4
495 aes${dir} $rndkey1,$inout5
496 aes${dir}last $rndkey0,$inout0
497 aes${dir}last $rndkey0,$inout1
498 aes${dir}last $rndkey0,$inout2
499 aes${dir}last $rndkey0,$inout3
500 aes${dir}last $rndkey0,$inout4
501 aes${dir}last $rndkey0,$inout5
502 ret
503 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
504 ___
505 }
# aesni_generate8($dir) appends the private routine _aesni_${dir}rypt8 to
# $code; callers in this file pass "enc" or "dec".  The emitted routine
# processes $inout0..$inout7 together and consumes $key and $rounds.
#
# The prologue is software-pipelined: while round key 0 is still being
# xor-ed into the later blocks, the first aes${dir} round (with $rndkey1)
# is already issued for $inout0 and $inout1.  Control then jumps to
# .L${dir}_loop8_inner, where $inout2..$inout7 receive that round, so the
# first two instructions of the loop body are skipped on the first pass.
#
# NOTE(review): .L${dir}_loop8_enter is defined but not branched to from
# within this template; it may be a target for code elsewhere in the
# file -- verify before removing.
506 sub aesni_generate8 {
# this lexical $dir ("enc"/"dec") shadows the file-level $dir
507 my $dir=shift;
508 # As already mentioned it takes in $key and $rounds, which are *not*
509 # preserved. $inout[0-7] is cipher/clear text...
510 $code.=<<___;
511 .type _aesni_${dir}rypt8,\@abi-omnipotent
512 .align 16
513 _aesni_${dir}rypt8:
514 $movkey ($key),$rndkey0
515 shl \$4,$rounds
516 $movkey 16($key),$rndkey1
517 xorps $rndkey0,$inout0
518 xorps $rndkey0,$inout1
519 pxor $rndkey0,$inout2
520 pxor $rndkey0,$inout3
521 pxor $rndkey0,$inout4
522 lea 32($key,$rounds),$key
523 neg %rax # $rounds
524 aes${dir} $rndkey1,$inout0
525 pxor $rndkey0,$inout5
526 pxor $rndkey0,$inout6
527 aes${dir} $rndkey1,$inout1
528 pxor $rndkey0,$inout7
529 $movkey ($key,%rax),$rndkey0
530 add \$16,%rax
531 jmp .L${dir}_loop8_inner
532 .align 16
533 .L${dir}_loop8:
534 aes${dir} $rndkey1,$inout0
535 aes${dir} $rndkey1,$inout1
536 .L${dir}_loop8_inner:
537 aes${dir} $rndkey1,$inout2
538 aes${dir} $rndkey1,$inout3
539 aes${dir} $rndkey1,$inout4
540 aes${dir} $rndkey1,$inout5
541 aes${dir} $rndkey1,$inout6
542 aes${dir} $rndkey1,$inout7
543 .L${dir}_loop8_enter:
544 $movkey ($key,%rax),$rndkey1
545 add \$32,%rax
546 aes${dir} $rndkey0,$inout0
547 aes${dir} $rndkey0,$inout1
548 aes${dir} $rndkey0,$inout2
549 aes${dir} $rndkey0,$inout3
550 aes${dir} $rndkey0,$inout4
551 aes${dir} $rndkey0,$inout5
552 aes${dir} $rndkey0,$inout6
553 aes${dir} $rndkey0,$inout7
554 $movkey -16($key,%rax),$rndkey0
555 jnz .L${dir}_loop8
556
557 aes${dir} $rndkey1,$inout0
558 aes${dir} $rndkey1,$inout1
559 aes${dir} $rndkey1,$inout2
560 aes${dir} $rndkey1,$inout3
561 aes${dir} $rndkey1,$inout4
562 aes${dir} $rndkey1,$inout5
563 aes${dir} $rndkey1,$inout6
564 aes${dir} $rndkey1,$inout7
565 aes${dir}last $rndkey0,$inout0
566 aes${dir}last $rndkey0,$inout1
567 aes${dir}last $rndkey0,$inout2
568 aes${dir}last $rndkey0,$inout3
569 aes${dir}last $rndkey0,$inout4
570 aes${dir}last $rndkey0,$inout5
571 aes${dir}last $rndkey0,$inout6
572 aes${dir}last $rndkey0,$inout7
573 ret
574 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
575 ___
576 }
# Instantiate the private N-way helpers.  The decrypt variants are always
# emitted; the encrypt variants are emitted only when $PREFIX is "aesni".
577 &aesni_generate2("enc") if ($PREFIX eq "aesni");
578 &aesni_generate2("dec");
579 &aesni_generate3("enc") if ($PREFIX eq "aesni");
580 &aesni_generate3("dec");
581 &aesni_generate4("enc") if ($PREFIX eq "aesni");
582 &aesni_generate4("dec");
583 &aesni_generate6("enc") if ($PREFIX eq "aesni");
584 &aesni_generate6("dec");
585 &aesni_generate8("enc") if ($PREFIX eq "aesni");
586 &aesni_generate8("dec");
587 \f
588 if ($PREFIX eq "aesni") {
589 ########################################################################
590 # void aesni_ecb_encrypt (const void *in, void *out,
591 # size_t length, const AES_KEY *key,
592 # int enc);
593 $code.=<<___;
594 .globl aesni_ecb_encrypt
595 .type aesni_ecb_encrypt,\@function,5
596 .align 16
597 aesni_ecb_encrypt:
598 ___
599 $code.=<<___ if ($win64);
600 lea -0x58(%rsp),%rsp
601 movaps %xmm6,(%rsp) # offload $inout4..7
602 movaps %xmm7,0x10(%rsp)
603 movaps %xmm8,0x20(%rsp)
604 movaps %xmm9,0x30(%rsp)
605 .Lecb_enc_body:
606 ___
607 $code.=<<___;
608 and \$-16,$len # if ($len<16)
609 jz .Lecb_ret # return
610
611 mov 240($key),$rounds # key->rounds
612 $movkey ($key),$rndkey0
613 mov $key,$key_ # backup $key
614 mov $rounds,$rnds_ # backup $rounds
615 test %r8d,%r8d # 5th argument
616 jz .Lecb_decrypt
617 #--------------------------- ECB ENCRYPT ------------------------------#
618 cmp \$0x80,$len # if ($len<8*16)
619 jb .Lecb_enc_tail # short input
620
621 movdqu ($inp),$inout0 # load 8 input blocks
622 movdqu 0x10($inp),$inout1
623 movdqu 0x20($inp),$inout2
624 movdqu 0x30($inp),$inout3
625 movdqu 0x40($inp),$inout4
626 movdqu 0x50($inp),$inout5
627 movdqu 0x60($inp),$inout6
628 movdqu 0x70($inp),$inout7
629 lea 0x80($inp),$inp # $inp+=8*16
630 sub \$0x80,$len # $len-=8*16 (can be zero)
631 jmp .Lecb_enc_loop8_enter
632 .align 16
633 .Lecb_enc_loop8:
634 movups $inout0,($out) # store 8 output blocks
635 mov $key_,$key # restore $key
636 movdqu ($inp),$inout0 # load 8 input blocks
637 mov $rnds_,$rounds # restore $rounds
638 movups $inout1,0x10($out)
639 movdqu 0x10($inp),$inout1
640 movups $inout2,0x20($out)
641 movdqu 0x20($inp),$inout2
642 movups $inout3,0x30($out)
643 movdqu 0x30($inp),$inout3
644 movups $inout4,0x40($out)
645 movdqu 0x40($inp),$inout4
646 movups $inout5,0x50($out)
647 movdqu 0x50($inp),$inout5
648 movups $inout6,0x60($out)
649 movdqu 0x60($inp),$inout6
650 movups $inout7,0x70($out)
651 lea 0x80($out),$out # $out+=8*16
652 movdqu 0x70($inp),$inout7
653 lea 0x80($inp),$inp # $inp+=8*16
654 .Lecb_enc_loop8_enter:
655
656 call _aesni_encrypt8
657
658 sub \$0x80,$len
659 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
660
661 movups $inout0,($out) # store 8 output blocks
662 mov $key_,$key # restore $key
663 movups $inout1,0x10($out)
664 mov $rnds_,$rounds # restore $rounds
665 movups $inout2,0x20($out)
666 movups $inout3,0x30($out)
667 movups $inout4,0x40($out)
668 movups $inout5,0x50($out)
669 movups $inout6,0x60($out)
670 movups $inout7,0x70($out)
671 lea 0x80($out),$out # $out+=8*16
672 add \$0x80,$len # restore real remaining $len
673 jz .Lecb_ret # done if ($len==0)
674
675 .Lecb_enc_tail: # $len is less than 8*16
676 movups ($inp),$inout0
677 cmp \$0x20,$len
678 jb .Lecb_enc_one
679 movups 0x10($inp),$inout1
680 je .Lecb_enc_two
681 movups 0x20($inp),$inout2
682 cmp \$0x40,$len
683 jb .Lecb_enc_three
684 movups 0x30($inp),$inout3
685 je .Lecb_enc_four
686 movups 0x40($inp),$inout4
687 cmp \$0x60,$len
688 jb .Lecb_enc_five
689 movups 0x50($inp),$inout5
690 je .Lecb_enc_six
691 movdqu 0x60($inp),$inout6
692 xorps $inout7,$inout7
693 call _aesni_encrypt8
694 movups $inout0,($out) # store 7 output blocks
695 movups $inout1,0x10($out)
696 movups $inout2,0x20($out)
697 movups $inout3,0x30($out)
698 movups $inout4,0x40($out)
699 movups $inout5,0x50($out)
700 movups $inout6,0x60($out)
701 jmp .Lecb_ret
702 .align 16
703 .Lecb_enc_one:
704 ___
705 &aesni_generate1("enc",$key,$rounds);
706 $code.=<<___;
707 movups $inout0,($out) # store one output block
708 jmp .Lecb_ret
709 .align 16
710 .Lecb_enc_two:
711 call _aesni_encrypt2
712 movups $inout0,($out) # store 2 output blocks
713 movups $inout1,0x10($out)
714 jmp .Lecb_ret
715 .align 16
716 .Lecb_enc_three:
717 call _aesni_encrypt3
718 movups $inout0,($out) # store 3 output blocks
719 movups $inout1,0x10($out)
720 movups $inout2,0x20($out)
721 jmp .Lecb_ret
722 .align 16
723 .Lecb_enc_four:
724 call _aesni_encrypt4
725 movups $inout0,($out) # store 4 output blocks
726 movups $inout1,0x10($out)
727 movups $inout2,0x20($out)
728 movups $inout3,0x30($out)
729 jmp .Lecb_ret
730 .align 16
731 .Lecb_enc_five:
732 xorps $inout5,$inout5
733 call _aesni_encrypt6
734 movups $inout0,($out) # store 5 output blocks
735 movups $inout1,0x10($out)
736 movups $inout2,0x20($out)
737 movups $inout3,0x30($out)
738 movups $inout4,0x40($out)
739 jmp .Lecb_ret
740 .align 16
741 .Lecb_enc_six:
742 call _aesni_encrypt6
743 movups $inout0,($out) # store 6 output blocks
744 movups $inout1,0x10($out)
745 movups $inout2,0x20($out)
746 movups $inout3,0x30($out)
747 movups $inout4,0x40($out)
748 movups $inout5,0x50($out)
749 jmp .Lecb_ret
750 \f#--------------------------- ECB DECRYPT ------------------------------#
751 .align 16
752 .Lecb_decrypt:
753 cmp \$0x80,$len # if ($len<8*16)
754 jb .Lecb_dec_tail # short input
755
756 movdqu ($inp),$inout0 # load 8 input blocks
757 movdqu 0x10($inp),$inout1
758 movdqu 0x20($inp),$inout2
759 movdqu 0x30($inp),$inout3
760 movdqu 0x40($inp),$inout4
761 movdqu 0x50($inp),$inout5
762 movdqu 0x60($inp),$inout6
763 movdqu 0x70($inp),$inout7
764 lea 0x80($inp),$inp # $inp+=8*16
765 sub \$0x80,$len # $len-=8*16 (can be zero)
766 jmp .Lecb_dec_loop8_enter
767 .align 16
768 .Lecb_dec_loop8:
769 movups $inout0,($out) # store 8 output blocks
770 mov $key_,$key # restore $key
771 movdqu ($inp),$inout0 # load 8 input blocks
772 mov $rnds_,$rounds # restore $rounds
773 movups $inout1,0x10($out)
774 movdqu 0x10($inp),$inout1
775 movups $inout2,0x20($out)
776 movdqu 0x20($inp),$inout2
777 movups $inout3,0x30($out)
778 movdqu 0x30($inp),$inout3
779 movups $inout4,0x40($out)
780 movdqu 0x40($inp),$inout4
781 movups $inout5,0x50($out)
782 movdqu 0x50($inp),$inout5
783 movups $inout6,0x60($out)
784 movdqu 0x60($inp),$inout6
785 movups $inout7,0x70($out)
786 lea 0x80($out),$out # $out+=8*16
787 movdqu 0x70($inp),$inout7
788 lea 0x80($inp),$inp # $inp+=8*16
789 .Lecb_dec_loop8_enter:
790
791 call _aesni_decrypt8
792
793 $movkey ($key_),$rndkey0
794 sub \$0x80,$len
795 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
796
797 movups $inout0,($out) # store 8 output blocks
798 pxor $inout0,$inout0 # clear register bank
799 mov $key_,$key # restore $key
800 movups $inout1,0x10($out)
801 pxor $inout1,$inout1
802 mov $rnds_,$rounds # restore $rounds
803 movups $inout2,0x20($out)
804 pxor $inout2,$inout2
805 movups $inout3,0x30($out)
806 pxor $inout3,$inout3
807 movups $inout4,0x40($out)
808 pxor $inout4,$inout4
809 movups $inout5,0x50($out)
810 pxor $inout5,$inout5
811 movups $inout6,0x60($out)
812 pxor $inout6,$inout6
813 movups $inout7,0x70($out)
814 pxor $inout7,$inout7
815 lea 0x80($out),$out # $out+=8*16
816 add \$0x80,$len # restore real remaining $len
817 jz .Lecb_ret # done if ($len==0)
818
819 .Lecb_dec_tail:
820 movups ($inp),$inout0
821 cmp \$0x20,$len
822 jb .Lecb_dec_one
823 movups 0x10($inp),$inout1
824 je .Lecb_dec_two
825 movups 0x20($inp),$inout2
826 cmp \$0x40,$len
827 jb .Lecb_dec_three
828 movups 0x30($inp),$inout3
829 je .Lecb_dec_four
830 movups 0x40($inp),$inout4
831 cmp \$0x60,$len
832 jb .Lecb_dec_five
833 movups 0x50($inp),$inout5
834 je .Lecb_dec_six
835 movups 0x60($inp),$inout6
836 $movkey ($key),$rndkey0
837 xorps $inout7,$inout7
838 call _aesni_decrypt8
839 movups $inout0,($out) # store 7 output blocks
840 pxor $inout0,$inout0 # clear register bank
841 movups $inout1,0x10($out)
842 pxor $inout1,$inout1
843 movups $inout2,0x20($out)
844 pxor $inout2,$inout2
845 movups $inout3,0x30($out)
846 pxor $inout3,$inout3
847 movups $inout4,0x40($out)
848 pxor $inout4,$inout4
849 movups $inout5,0x50($out)
850 pxor $inout5,$inout5
851 movups $inout6,0x60($out)
852 pxor $inout6,$inout6
853 pxor $inout7,$inout7
854 jmp .Lecb_ret
855 .align 16
856 .Lecb_dec_one:
857 ___
858 &aesni_generate1("dec",$key,$rounds);
859 $code.=<<___;
860 movups $inout0,($out) # store one output block
861 pxor $inout0,$inout0 # clear register bank
862 jmp .Lecb_ret
863 .align 16
864 .Lecb_dec_two:
865 call _aesni_decrypt2
866 movups $inout0,($out) # store 2 output blocks
867 pxor $inout0,$inout0 # clear register bank
868 movups $inout1,0x10($out)
869 pxor $inout1,$inout1
870 jmp .Lecb_ret
871 .align 16
872 .Lecb_dec_three:
873 call _aesni_decrypt3
874 movups $inout0,($out) # store 3 output blocks
875 pxor $inout0,$inout0 # clear register bank
876 movups $inout1,0x10($out)
877 pxor $inout1,$inout1
878 movups $inout2,0x20($out)
879 pxor $inout2,$inout2
880 jmp .Lecb_ret
881 .align 16
882 .Lecb_dec_four:
883 call _aesni_decrypt4
884 movups $inout0,($out) # store 4 output blocks
885 pxor $inout0,$inout0 # clear register bank
886 movups $inout1,0x10($out)
887 pxor $inout1,$inout1
888 movups $inout2,0x20($out)
889 pxor $inout2,$inout2
890 movups $inout3,0x30($out)
891 pxor $inout3,$inout3
892 jmp .Lecb_ret
893 .align 16
894 .Lecb_dec_five:
895 xorps $inout5,$inout5
896 call _aesni_decrypt6
897 movups $inout0,($out) # store 5 output blocks
898 pxor $inout0,$inout0 # clear register bank
899 movups $inout1,0x10($out)
900 pxor $inout1,$inout1
901 movups $inout2,0x20($out)
902 pxor $inout2,$inout2
903 movups $inout3,0x30($out)
904 pxor $inout3,$inout3
905 movups $inout4,0x40($out)
906 pxor $inout4,$inout4
907 pxor $inout5,$inout5
908 jmp .Lecb_ret
909 .align 16
910 .Lecb_dec_six:
911 call _aesni_decrypt6
912 movups $inout0,($out) # store 6 output blocks
913 pxor $inout0,$inout0 # clear register bank
914 movups $inout1,0x10($out)
915 pxor $inout1,$inout1
916 movups $inout2,0x20($out)
917 pxor $inout2,$inout2
918 movups $inout3,0x30($out)
919 pxor $inout3,$inout3
920 movups $inout4,0x40($out)
921 pxor $inout4,$inout4
922 movups $inout5,0x50($out)
923 pxor $inout5,$inout5
924
925 .Lecb_ret:
926 xorps $rndkey0,$rndkey0 # %xmm0
927 pxor $rndkey1,$rndkey1
928 ___
929 $code.=<<___ if ($win64);
930 movaps (%rsp),%xmm6
931 movaps %xmm0,(%rsp) # clear stack
932 movaps 0x10(%rsp),%xmm7
933 movaps %xmm0,0x10(%rsp)
934 movaps 0x20(%rsp),%xmm8
935 movaps %xmm0,0x20(%rsp)
936 movaps 0x30(%rsp),%xmm9
937 movaps %xmm0,0x30(%rsp)
938 lea 0x58(%rsp),%rsp
939 .Lecb_enc_ret:
940 ___
941 $code.=<<___;
942 ret
943 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
944 ___
945 \f
946 {
947 ######################################################################
948 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
949 # size_t blocks, const AES_KEY *key,
950 # const char *ivec,char *cmac);
951 #
952 # Handles only complete blocks, operates on 64-bit counter and
953 # does not update *ivec! Nor does it finalize CMAC value
954 # (see engine/eng_aesni.c for details)
955 #
956 {
# Register aliases shared by the two CCM64 routines generated below.
# %xmm6, %xmm7 and %xmm9 are among the registers that the Win64
# prologues of those routines spill to the stack before use.
957 my $cmac="%r9"; # 6th argument: CMAC block, loaded on entry and stored back on exit
958
959 my $increment="%xmm9"; # loaded from .Lincrement64, added to $iv once per block
960 my $iv="%xmm6"; # counter block read from ($ivp), kept shuffled by $bswap_mask
961 my $bswap_mask="%xmm7"; # pshufb mask loaded from .Lbswap_mask
962
# Emit aesni_ccm64_encrypt_blocks. For every input block the generated
# code runs two AES encryptions in lockstep: $inout0 carries the counter
# block and $inout1 the running CMAC with the input block xor-ed in.
# The output block is input ^ E(counter); the resulting CMAC is written
# back through $cmac. ($ivp) is only read here, never written.
963 $code.=<<___;
964 .globl aesni_ccm64_encrypt_blocks
965 .type aesni_ccm64_encrypt_blocks,\@function,6
966 .align 16
967 aesni_ccm64_encrypt_blocks:
968 ___
# Win64 only: open a 0x58-byte frame and save %xmm6-%xmm9 in it.
969 $code.=<<___ if ($win64);
970 lea -0x58(%rsp),%rsp
971 movaps %xmm6,(%rsp) # $iv
972 movaps %xmm7,0x10(%rsp) # $bswap_mask
973 movaps %xmm8,0x20(%rsp) # $in0
974 movaps %xmm9,0x30(%rsp) # $increment
975 .Lccm64_enc_body:
976 ___
# Main body. $key_ keeps the start of the key schedule while $key is
# moved to its end; %r10 holds the "twisted" round counter that is copied
# into %rax per block and stepped by 32 until it reaches zero, which is
# what terminates .Lccm64_enc2_loop. After the loop the scratch
# registers are zeroed and the CMAC is stored.
# NOTE(review): `sub %rax,%r10` depends on which physical registers
# $rounds and $rnds_ name; those aliases are defined outside this chunk.
977 $code.=<<___;
978 mov 240($key),$rounds # key->rounds
979 movdqu ($ivp),$iv
980 movdqa .Lincrement64(%rip),$increment
981 movdqa .Lbswap_mask(%rip),$bswap_mask
982
983 shl \$4,$rounds
984 mov \$16,$rnds_
985 lea 0($key),$key_
986 movdqu ($cmac),$inout1
987 movdqa $iv,$inout0
988 lea 32($key,$rounds),$key # end of key schedule
989 pshufb $bswap_mask,$iv
990 sub %rax,%r10 # twisted $rounds
991 jmp .Lccm64_enc_outer
992 .align 16
993 .Lccm64_enc_outer:
994 $movkey ($key_),$rndkey0
995 mov %r10,%rax
996 movups ($inp),$in0 # load inp
997
998 xorps $rndkey0,$inout0 # counter
999 $movkey 16($key_),$rndkey1
1000 xorps $in0,$rndkey0
1001 xorps $rndkey0,$inout1 # cmac^=inp
1002 $movkey 32($key_),$rndkey0
1003
1004 .Lccm64_enc2_loop:
1005 aesenc $rndkey1,$inout0
1006 aesenc $rndkey1,$inout1
1007 $movkey ($key,%rax),$rndkey1
1008 add \$32,%rax
1009 aesenc $rndkey0,$inout0
1010 aesenc $rndkey0,$inout1
1011 $movkey -16($key,%rax),$rndkey0
1012 jnz .Lccm64_enc2_loop
1013 aesenc $rndkey1,$inout0
1014 aesenc $rndkey1,$inout1
1015 paddq $increment,$iv
1016 dec $len # $len-- ($len is in blocks)
1017 aesenclast $rndkey0,$inout0
1018 aesenclast $rndkey0,$inout1
1019
1020 lea 16($inp),$inp
1021 xorps $inout0,$in0 # inp ^= E(iv)
1022 movdqa $iv,$inout0
1023 movups $in0,($out) # save output
1024 pshufb $bswap_mask,$inout0
1025 lea 16($out),$out # $out+=16
1026 jnz .Lccm64_enc_outer # loop if ($len!=0)
1027
1028 pxor $rndkey0,$rndkey0 # clear register bank
1029 pxor $rndkey1,$rndkey1
1030 pxor $inout0,$inout0
1031 movups $inout1,($cmac) # store resulting mac
1032 pxor $inout1,$inout1
1033 pxor $in0,$in0
1034 pxor $iv,$iv
1035 ___
# Win64 only: restore %xmm6-%xmm9, overwrite each save slot with %xmm0
# and release the frame.
# NOTE(review): the slots are wiped only if %xmm0 is the $rndkey0 that
# was zeroed just above; that alias is defined outside this chunk.
1036 $code.=<<___ if ($win64);
1037 movaps (%rsp),%xmm6
1038 movaps %xmm0,(%rsp) # clear stack
1039 movaps 0x10(%rsp),%xmm7
1040 movaps %xmm0,0x10(%rsp)
1041 movaps 0x20(%rsp),%xmm8
1042 movaps %xmm0,0x20(%rsp)
1043 movaps 0x30(%rsp),%xmm9
1044 movaps %xmm0,0x30(%rsp)
1045 lea 0x58(%rsp),%rsp
1046 .Lccm64_enc_ret:
1047 ___
# Common return and symbol size directive.
1048 $code.=<<___;
1049 ret
1050 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1051 ___
1052 ######################################################################
# Emit aesni_ccm64_decrypt_blocks. Each pass through .Lccm64_dec_outer
# xors the keystream block held in $inout0 into the input block, stores
# the result, and (unless it was the last block) runs the next counter
# encryption and the CMAC update over that output side by side. The
# CMAC for the final block is completed after the loop, at
# .Lccm64_dec_break.
1053 $code.=<<___;
1054 .globl aesni_ccm64_decrypt_blocks
1055 .type aesni_ccm64_decrypt_blocks,\@function,6
1056 .align 16
1057 aesni_ccm64_decrypt_blocks:
1058 ___
# Win64 only: open a 0x58-byte frame and save %xmm6-%xmm9 in it.
1059 $code.=<<___ if ($win64);
1060 lea -0x58(%rsp),%rsp
1061 movaps %xmm6,(%rsp) # $iv
1062 movaps %xmm7,0x10(%rsp) # $bswap_mask
1063 movaps %xmm8,0x20(%rsp) # $in8
1064 movaps %xmm9,0x30(%rsp) # $increment
1065 .Lccm64_dec_body:
1066 ___
# Load the counter block and the CMAC, and back up $rounds/$key in
# $rnds_/$key_ before the single-block helper below is emitted.
1067 $code.=<<___;
1068 mov 240($key),$rounds # key->rounds
1069 movups ($ivp),$iv
1070 movdqu ($cmac),$inout1
1071 movdqa .Lincrement64(%rip),$increment
1072 movdqa .Lbswap_mask(%rip),$bswap_mask
1073
1074 movaps $iv,$inout0
1075 mov $rounds,$rnds_
1076 mov $key,$key_
1077 pshufb $bswap_mask,$iv
1078 ___
# aesni_generate1 is defined outside this chunk. Presumably it emits a
# one-block encryption of $inout0, i.e. E(counter) for the first block,
# which the first xorps of .Lccm64_dec_outer consumes - confirm against
# the helper.
1079 &aesni_generate1("enc",$key,$rounds);
# Outer loop, the two-block inner round loop, and the post-loop setup
# for the final CMAC encryption. As in the encrypt routine, %r10 keeps
# the twisted round counter that is reloaded into %rax per block.
1080 $code.=<<___;
1081 shl \$4,$rnds_
1082 mov \$16,$rounds
1083 movups ($inp),$in0 # load inp
1084 paddq $increment,$iv
1085 lea 16($inp),$inp # $inp+=16
1086 sub %r10,%rax # twisted $rounds
1087 lea 32($key_,$rnds_),$key # end of key schedule
1088 mov %rax,%r10
1089 jmp .Lccm64_dec_outer
1090 .align 16
1091 .Lccm64_dec_outer:
1092 xorps $inout0,$in0 # inp ^= E(iv)
1093 movdqa $iv,$inout0
1094 movups $in0,($out) # save output
1095 lea 16($out),$out # $out+=16
1096 pshufb $bswap_mask,$inout0
1097
1098 sub \$1,$len # $len-- ($len is in blocks)
1099 jz .Lccm64_dec_break # if ($len==0) break
1100
1101 $movkey ($key_),$rndkey0
1102 mov %r10,%rax
1103 $movkey 16($key_),$rndkey1
1104 xorps $rndkey0,$in0
1105 xorps $rndkey0,$inout0
1106 xorps $in0,$inout1 # cmac^=out
1107 $movkey 32($key_),$rndkey0
1108 jmp .Lccm64_dec2_loop
1109 .align 16
1110 .Lccm64_dec2_loop:
1111 aesenc $rndkey1,$inout0
1112 aesenc $rndkey1,$inout1
1113 $movkey ($key,%rax),$rndkey1
1114 add \$32,%rax
1115 aesenc $rndkey0,$inout0
1116 aesenc $rndkey0,$inout1
1117 $movkey -16($key,%rax),$rndkey0
1118 jnz .Lccm64_dec2_loop
1119 movups ($inp),$in0 # load input
1120 paddq $increment,$iv
1121 aesenc $rndkey1,$inout0
1122 aesenc $rndkey1,$inout1
1123 aesenclast $rndkey0,$inout0
1124 aesenclast $rndkey0,$inout1
1125 lea 16($inp),$inp # $inp+=16
1126 jmp .Lccm64_dec_outer
1127
1128 .align 16
1129 .Lccm64_dec_break:
1130 #xorps $in0,$inout1 # cmac^=out
1131 mov 240($key_),$rounds
1132 ___
# Final CMAC encryption for the last block.
# NOTE(review): the extra ($inout1,$in0) arguments presumably make the
# helper encrypt $inout1 with $in0 folded in, standing in for the xorps
# that is commented out just above - confirm against aesni_generate1.
1133 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Zero the scratch registers and store the resulting CMAC.
1134 $code.=<<___;
1135 pxor $rndkey0,$rndkey0 # clear register bank
1136 pxor $rndkey1,$rndkey1
1137 pxor $inout0,$inout0
1138 movups $inout1,($cmac) # store resulting mac
1139 pxor $inout1,$inout1
1140 pxor $in0,$in0
1141 pxor $iv,$iv
1142 ___
# Win64 only: restore %xmm6-%xmm9, overwrite each save slot with %xmm0
# and release the frame.
# NOTE(review): the slots are wiped only if %xmm0 is the $rndkey0 that
# was zeroed just above; that alias is defined outside this chunk.
1143 $code.=<<___ if ($win64);
1144 movaps (%rsp),%xmm6
1145 movaps %xmm0,(%rsp) # clear stack
1146 movaps 0x10(%rsp),%xmm7
1147 movaps %xmm0,0x10(%rsp)
1148 movaps 0x20(%rsp),%xmm8
1149 movaps %xmm0,0x20(%rsp)
1150 movaps 0x30(%rsp),%xmm9
1151 movaps %xmm0,0x30(%rsp)
1152 lea 0x58(%rsp),%rsp
1153 .Lccm64_dec_ret:
1154 ___
# Common return and symbol size directive.
1155 $code.=<<___;
1156 ret
1157 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1158 ___
1159 }\f
1160 ######################################################################
1161 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1162 # size_t blocks, const AES_KEY *key,
1163 # const char *ivec);
1164 #
1165 # Handles only complete blocks, operates on 32-bit counter and
1166 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1167 #
1168 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1169 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1170 # Keywords are full unroll and modulo-schedule counter calculations
1171 # with zero-round key xor.
1172 {
# Register aliases and frame size for aesni_ctr32_encrypt_blocks.
1173 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); # %xmm10..%xmm15
# $key0 and $ctr are the names of $key_ and $ivp with a "d" suffix
# appended. In the code below $ctr receives the word at 12($ivp)
# ("counter LSB") and $key0 the word at 12($key) ("0-round key LSB").
# NOTE(review): the "d" suffix is presumably the 32-bit form of those
# registers; $key_ and $ivp are defined outside this chunk - confirm.
1174 my ($key0,$ctr)=("${key_}d","${ivp}d");
# 0x80 bytes hold the eight 16-byte counter blocks kept at (%rsp); the
# extra 160 bytes on Win64 hold the ten saved registers %xmm6..%xmm15.
1175 my $frame_size = 0x80 + ($win64?160:0);
1176
# Emit the entry point of aesni_ctr32_encrypt_blocks. A request for
# exactly one block is handled inline without a stack frame; any other
# length branches to .Lctr32_bulk.
1177 $code.=<<___;
1178 .globl aesni_ctr32_encrypt_blocks
1179 .type aesni_ctr32_encrypt_blocks,\@function,5
1180 .align 16
1181 aesni_ctr32_encrypt_blocks:
1182 cmp \$1,$len
1183 jne .Lctr32_bulk
1184
1185 # handle single block without allocating stack frame,
1186 # useful when handling edges
1187 movups ($ivp),$inout0
1188 movups ($inp),$inout1
1189 mov 240($key),%edx # key->rounds
1190 ___
# aesni_generate1 is defined outside this chunk. Presumably it emits a
# one-block encryption of the counter held in $inout0, using the round
# count just loaded into %edx - confirm against the helper.
1191 &aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (xor keystream with the input block in
# $inout1, store, zero the registers used) and jump to the shared
# epilogue. Then the bulk prologue: %rax keeps the entry %rsp, %rbp is
# pushed, and a $frame_size-byte frame is carved and aligned to 16.
1192 $code.=<<___;
1193 pxor $rndkey0,$rndkey0 # clear register bank
1194 pxor $rndkey1,$rndkey1
1195 xorps $inout1,$inout0
1196 pxor $inout1,$inout1
1197 movups $inout0,($out)
1198 xorps $inout0,$inout0
1199 jmp .Lctr32_epilogue
1200
1201 .align 16
1202 .Lctr32_bulk:
1203 lea (%rsp),%rax
1204 push %rbp
1205 sub \$$frame_size,%rsp
1206 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1207 ___
# Win64 only: save %xmm6-%xmm15 just below the entry %rsp, addressed
# through the copy kept in %rax.
1208 $code.=<<___ if ($win64);
1209 movaps %xmm6,-0xa8(%rax) # offload everything
1210 movaps %xmm7,-0x98(%rax)
1211 movaps %xmm8,-0x88(%rax)
1212 movaps %xmm9,-0x78(%rax)
1213 movaps %xmm10,-0x68(%rax)
1214 movaps %xmm11,-0x58(%rax)
1215 movaps %xmm12,-0x48(%rax)
1216 movaps %xmm13,-0x38(%rax)
1217 movaps %xmm14,-0x28(%rax)
1218 movaps %xmm15,-0x18(%rax)
1219 .Lctr32_body:
1220 ___
# Bulk-path setup and the front halves of the two main loops:
#  - %rbp is set to the entry %rsp minus 8 for the epilogue;
#  - eight counter blocks, each already xor-ed with the round-0 key,
#    are built in the 0x80-byte area at (%rsp);
#  - inputs shorter than 8 blocks go straight to .Lctr32_tail;
#  - .Lctr32_loop6 processes 6 blocks per iteration and is selected when
#    OPENSSL_ia32cap_P reports MOVBE without XSAVE;
#  - .Lctr32_loop8 processes 8 blocks per iteration; only its first
#    round is emitted by this heredoc.
1221 $code.=<<___;
1222 lea -8(%rax),%rbp
1223
1224 # 8 16-byte words on top of stack are counter values
1225 # xor-ed with zero-round key
1226
1227 movdqu ($ivp),$inout0
1228 movdqu ($key),$rndkey0
1229 mov 12($ivp),$ctr # counter LSB
1230 pxor $rndkey0,$inout0
1231 mov 12($key),$key0 # 0-round key LSB
1232 movdqa $inout0,0x00(%rsp) # populate counter block
1233 bswap $ctr
1234 movdqa $inout0,$inout1
1235 movdqa $inout0,$inout2
1236 movdqa $inout0,$inout3
1237 movdqa $inout0,0x40(%rsp)
1238 movdqa $inout0,0x50(%rsp)
1239 movdqa $inout0,0x60(%rsp)
1240 mov %rdx,%r10 # about to borrow %rdx
1241 movdqa $inout0,0x70(%rsp)
1242
1243 lea 1($ctr),%rax
1244 lea 2($ctr),%rdx
1245 bswap %eax
1246 bswap %edx
1247 xor $key0,%eax
1248 xor $key0,%edx
1249 pinsrd \$3,%eax,$inout1
1250 lea 3($ctr),%rax
1251 movdqa $inout1,0x10(%rsp)
1252 pinsrd \$3,%edx,$inout2
1253 bswap %eax
1254 mov %r10,%rdx # restore %rdx
1255 lea 4($ctr),%r10
1256 movdqa $inout2,0x20(%rsp)
1257 xor $key0,%eax
1258 bswap %r10d
1259 pinsrd \$3,%eax,$inout3
1260 xor $key0,%r10d
1261 movdqa $inout3,0x30(%rsp)
1262 lea 5($ctr),%r9
1263 mov %r10d,0x40+12(%rsp)
1264 bswap %r9d
1265 lea 6($ctr),%r10
1266 mov 240($key),$rounds # key->rounds
1267 xor $key0,%r9d
1268 bswap %r10d
1269 mov %r9d,0x50+12(%rsp)
1270 xor $key0,%r10d
1271 lea 7($ctr),%r9
1272 mov %r10d,0x60+12(%rsp)
1273 bswap %r9d
1274 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1275 xor $key0,%r9d
1276 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1277 mov %r9d,0x70+12(%rsp)
1278
1279 $movkey 0x10($key),$rndkey1
1280
1281 movdqa 0x40(%rsp),$inout4
1282 movdqa 0x50(%rsp),$inout5
1283
1284 cmp \$8,$len # $len is in blocks
1285 jb .Lctr32_tail # short input if ($len<8)
1286
1287 sub \$6,$len # $len is biased by -6
1288 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1289 je .Lctr32_6x # [which denotes Atom Silvermont]
1290
1291 lea 0x80($key),$key # size optimization
1292 sub \$2,$len # $len is biased by -8
1293 jmp .Lctr32_loop8
1294
1295 .align 16
1296 .Lctr32_6x:
1297 shl \$4,$rounds
1298 mov \$48,$rnds_
1299 bswap $key0
1300 lea 32($key,$rounds),$key # end of key schedule
1301 sub %rax,%r10 # twisted $rounds
1302 jmp .Lctr32_loop6
1303
1304 .align 16
1305 .Lctr32_loop6:
1306 add \$6,$ctr # next counter value
1307 $movkey -48($key,$rnds_),$rndkey0
1308 aesenc $rndkey1,$inout0
1309 mov $ctr,%eax
1310 xor $key0,%eax
1311 aesenc $rndkey1,$inout1
1312 movbe %eax,`0x00+12`(%rsp) # store next counter value
1313 lea 1($ctr),%eax
1314 aesenc $rndkey1,$inout2
1315 xor $key0,%eax
1316 movbe %eax,`0x10+12`(%rsp)
1317 aesenc $rndkey1,$inout3
1318 lea 2($ctr),%eax
1319 xor $key0,%eax
1320 aesenc $rndkey1,$inout4
1321 movbe %eax,`0x20+12`(%rsp)
1322 lea 3($ctr),%eax
1323 aesenc $rndkey1,$inout5
1324 $movkey -32($key,$rnds_),$rndkey1
1325 xor $key0,%eax
1326
1327 aesenc $rndkey0,$inout0
1328 movbe %eax,`0x30+12`(%rsp)
1329 lea 4($ctr),%eax
1330 aesenc $rndkey0,$inout1
1331 xor $key0,%eax
1332 movbe %eax,`0x40+12`(%rsp)
1333 aesenc $rndkey0,$inout2
1334 lea 5($ctr),%eax
1335 xor $key0,%eax
1336 aesenc $rndkey0,$inout3
1337 movbe %eax,`0x50+12`(%rsp)
1338 mov %r10,%rax # mov $rnds_,$rounds
1339 aesenc $rndkey0,$inout4
1340 aesenc $rndkey0,$inout5
1341 $movkey -16($key,$rnds_),$rndkey0
1342
1343 call .Lenc_loop6
1344
1345 movdqu ($inp),$inout6 # load 6 input blocks
1346 movdqu 0x10($inp),$inout7
1347 movdqu 0x20($inp),$in0
1348 movdqu 0x30($inp),$in1
1349 movdqu 0x40($inp),$in2
1350 movdqu 0x50($inp),$in3
1351 lea 0x60($inp),$inp # $inp+=6*16
1352 $movkey -64($key,$rnds_),$rndkey1
1353 pxor $inout0,$inout6 # inp^=E(ctr)
1354 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1355 pxor $inout1,$inout7
1356 movaps 0x10(%rsp),$inout1
1357 pxor $inout2,$in0
1358 movaps 0x20(%rsp),$inout2
1359 pxor $inout3,$in1
1360 movaps 0x30(%rsp),$inout3
1361 pxor $inout4,$in2
1362 movaps 0x40(%rsp),$inout4
1363 pxor $inout5,$in3
1364 movaps 0x50(%rsp),$inout5
1365 movdqu $inout6,($out) # store 6 output blocks
1366 movdqu $inout7,0x10($out)
1367 movdqu $in0,0x20($out)
1368 movdqu $in1,0x30($out)
1369 movdqu $in2,0x40($out)
1370 movdqu $in3,0x50($out)
1371 lea 0x60($out),$out # $out+=6*16
1372
1373 sub \$6,$len
1374 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1375
1376 add \$6,$len # restore real remaining $len
1377 jz .Lctr32_done # done if ($len==0)
1378
1379 lea -48($rnds_),$rounds
1380 lea -80($key,$rnds_),$key # restore $key
1381 neg $rounds
1382 shr \$4,$rounds # restore $rounds
1383 jmp .Lctr32_tail
1384
1385 .align 32
1386 .Lctr32_loop8:
1387 add \$8,$ctr # next counter value
1388 movdqa 0x60(%rsp),$inout6
1389 aesenc $rndkey1,$inout0
1390 mov $ctr,%r9d
1391 movdqa 0x70(%rsp),$inout7
1392 aesenc $rndkey1,$inout1
1393 bswap %r9d
1394 $movkey 0x20-0x80($key),$rndkey0
1395 aesenc $rndkey1,$inout2
1396 xor $key0,%r9d
1397 nop
1398 aesenc $rndkey1,$inout3
1399 mov %r9d,0x00+12(%rsp) # store next counter value
1400 lea 1($ctr),%r9
1401 aesenc $rndkey1,$inout4
1402 aesenc $rndkey1,$inout5
1403 aesenc $rndkey1,$inout6
1404 aesenc $rndkey1,$inout7
1405 $movkey 0x30-0x80($key),$rndkey1
1406 ___
# Emit rounds 2..7 of .Lctr32_loop8. Each pass encrypts all eight
# blocks with one round key. Interleaved with that, the counter word
# for stack slot $i-1 is finished (bswap, xor with $key0, store) and the
# counter for slot $i is started in %r9. Each pass ends by reloading the
# round-key register it just used with the key two rounds ahead.
# NOTE(review): $i is not declared with `my` here, so it is whatever $i
# is in scope in the enclosing file.
1407 for($i=2;$i<8;$i++) {
1408 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; # odd $i -> $rndkey1, even $i -> $rndkey0
1409 $code.=<<___;
1410 bswap %r9d
1411 aesenc $rndkeyx,$inout0
1412 aesenc $rndkeyx,$inout1
1413 xor $key0,%r9d
1414 .byte 0x66,0x90
1415 aesenc $rndkeyx,$inout2
1416 aesenc $rndkeyx,$inout3
1417 mov %r9d,`0x10*($i-1)`+12(%rsp)
1418 lea $i($ctr),%r9
1419 aesenc $rndkeyx,$inout4
1420 aesenc $rndkeyx,$inout5
1421 aesenc $rndkeyx,$inout6
1422 aesenc $rndkeyx,$inout7
1423 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1424 ___
1425 }
# Remainder of .Lctr32_loop8 plus all tail handling:
#  - the last pending counter word is stored at 0x70+12(%rsp), then the
#    remaining rounds run; `cmp \$11,$rounds` decides how many extra
#    pairs of rounds are executed before .Lctr32_enc_done (jb: none,
#    je: one pair, otherwise two pairs);
#  - .Lctr32_enc_done xors each input block with the last round key
#    first, so that aesenclast produces the output block directly, and
#    reloads $inout0..5 with the next counter blocks from the stack;
#  - .Lctr32_tail handles fewer than 8 remaining blocks: .Lctr32_loop3
#    for fewer than 4, .Lctr32_loop4 for exactly 4, and a call to
#    .Lenc_loop8_enter (defined outside this chunk) otherwise;
#  - .Lctr32_done begins zeroing registers (%xmm0-%xmm5 and $key0).
1426 $code.=<<___;
1427 bswap %r9d
1428 aesenc $rndkey0,$inout0
1429 aesenc $rndkey0,$inout1
1430 aesenc $rndkey0,$inout2
1431 xor $key0,%r9d
1432 movdqu 0x00($inp),$in0 # start loading input
1433 aesenc $rndkey0,$inout3
1434 mov %r9d,0x70+12(%rsp)
1435 cmp \$11,$rounds
1436 aesenc $rndkey0,$inout4
1437 aesenc $rndkey0,$inout5
1438 aesenc $rndkey0,$inout6
1439 aesenc $rndkey0,$inout7
1440 $movkey 0xa0-0x80($key),$rndkey0
1441
1442 jb .Lctr32_enc_done
1443
1444 aesenc $rndkey1,$inout0
1445 aesenc $rndkey1,$inout1
1446 aesenc $rndkey1,$inout2
1447 aesenc $rndkey1,$inout3
1448 aesenc $rndkey1,$inout4
1449 aesenc $rndkey1,$inout5
1450 aesenc $rndkey1,$inout6
1451 aesenc $rndkey1,$inout7
1452 $movkey 0xb0-0x80($key),$rndkey1
1453
1454 aesenc $rndkey0,$inout0
1455 aesenc $rndkey0,$inout1
1456 aesenc $rndkey0,$inout2
1457 aesenc $rndkey0,$inout3
1458 aesenc $rndkey0,$inout4
1459 aesenc $rndkey0,$inout5
1460 aesenc $rndkey0,$inout6
1461 aesenc $rndkey0,$inout7
1462 $movkey 0xc0-0x80($key),$rndkey0
1463 je .Lctr32_enc_done
1464
1465 aesenc $rndkey1,$inout0
1466 aesenc $rndkey1,$inout1
1467 aesenc $rndkey1,$inout2
1468 aesenc $rndkey1,$inout3
1469 aesenc $rndkey1,$inout4
1470 aesenc $rndkey1,$inout5
1471 aesenc $rndkey1,$inout6
1472 aesenc $rndkey1,$inout7
1473 $movkey 0xd0-0x80($key),$rndkey1
1474
1475 aesenc $rndkey0,$inout0
1476 aesenc $rndkey0,$inout1
1477 aesenc $rndkey0,$inout2
1478 aesenc $rndkey0,$inout3
1479 aesenc $rndkey0,$inout4
1480 aesenc $rndkey0,$inout5
1481 aesenc $rndkey0,$inout6
1482 aesenc $rndkey0,$inout7
1483 $movkey 0xe0-0x80($key),$rndkey0
1484 jmp .Lctr32_enc_done
1485
1486 .align 16
1487 .Lctr32_enc_done:
1488 movdqu 0x10($inp),$in1
1489 pxor $rndkey0,$in0 # input^=round[last]
1490 movdqu 0x20($inp),$in2
1491 pxor $rndkey0,$in1
1492 movdqu 0x30($inp),$in3
1493 pxor $rndkey0,$in2
1494 movdqu 0x40($inp),$in4
1495 pxor $rndkey0,$in3
1496 movdqu 0x50($inp),$in5
1497 pxor $rndkey0,$in4
1498 pxor $rndkey0,$in5
1499 aesenc $rndkey1,$inout0
1500 aesenc $rndkey1,$inout1
1501 aesenc $rndkey1,$inout2
1502 aesenc $rndkey1,$inout3
1503 aesenc $rndkey1,$inout4
1504 aesenc $rndkey1,$inout5
1505 aesenc $rndkey1,$inout6
1506 aesenc $rndkey1,$inout7
1507 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1508 lea 0x80($inp),$inp # $inp+=8*16
1509
1510 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1511 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1512 movdqu 0x70-0x80($inp),$in0
1513 aesenclast $in1,$inout1
1514 pxor $rndkey0,$in0
1515 movdqa 0x00(%rsp),$in1 # load next counter block
1516 aesenclast $in2,$inout2
1517 aesenclast $in3,$inout3
1518 movdqa 0x10(%rsp),$in2
1519 movdqa 0x20(%rsp),$in3
1520 aesenclast $in4,$inout4
1521 aesenclast $in5,$inout5
1522 movdqa 0x30(%rsp),$in4
1523 movdqa 0x40(%rsp),$in5
1524 aesenclast $rndkey1,$inout6
1525 movdqa 0x50(%rsp),$rndkey0
1526 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1527 aesenclast $in0,$inout7
1528
1529 movups $inout0,($out) # store 8 output blocks
1530 movdqa $in1,$inout0
1531 movups $inout1,0x10($out)
1532 movdqa $in2,$inout1
1533 movups $inout2,0x20($out)
1534 movdqa $in3,$inout2
1535 movups $inout3,0x30($out)
1536 movdqa $in4,$inout3
1537 movups $inout4,0x40($out)
1538 movdqa $in5,$inout4
1539 movups $inout5,0x50($out)
1540 movdqa $rndkey0,$inout5
1541 movups $inout6,0x60($out)
1542 movups $inout7,0x70($out)
1543 lea 0x80($out),$out # $out+=8*16
1544
1545 sub \$8,$len
1546 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1547
1548 add \$8,$len # restore real remainig $len
1549 jz .Lctr32_done # done if ($len==0)
1550 lea -0x80($key),$key
1551
1552 .Lctr32_tail:
1553 # note that at this point $inout0..5 are populated with
1554 # counter values xor-ed with 0-round key
1555 lea 16($key),$key
1556 cmp \$4,$len
1557 jb .Lctr32_loop3
1558 je .Lctr32_loop4
1559
1560 # if ($len>4) compute 7 E(counter)
1561 shl \$4,$rounds
1562 movdqa 0x60(%rsp),$inout6
1563 pxor $inout7,$inout7
1564
1565 $movkey 16($key),$rndkey0
1566 aesenc $rndkey1,$inout0
1567 aesenc $rndkey1,$inout1
1568 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1569 neg %rax
1570 aesenc $rndkey1,$inout2
1571 add \$16,%rax # prepare for .Lenc_loop8_enter
1572 movups ($inp),$in0
1573 aesenc $rndkey1,$inout3
1574 aesenc $rndkey1,$inout4
1575 movups 0x10($inp),$in1 # pre-load input
1576 movups 0x20($inp),$in2
1577 aesenc $rndkey1,$inout5
1578 aesenc $rndkey1,$inout6
1579
1580 call .Lenc_loop8_enter
1581
1582 movdqu 0x30($inp),$in3
1583 pxor $in0,$inout0
1584 movdqu 0x40($inp),$in0
1585 pxor $in1,$inout1
1586 movdqu $inout0,($out) # store output
1587 pxor $in2,$inout2
1588 movdqu $inout1,0x10($out)
1589 pxor $in3,$inout3
1590 movdqu $inout2,0x20($out)
1591 pxor $in0,$inout4
1592 movdqu $inout3,0x30($out)
1593 movdqu $inout4,0x40($out)
1594 cmp \$6,$len
1595 jb .Lctr32_done # $len was 5, stop store
1596
1597 movups 0x50($inp),$in1
1598 xorps $in1,$inout5
1599 movups $inout5,0x50($out)
1600 je .Lctr32_done # $len was 6, stop store
1601
1602 movups 0x60($inp),$in2
1603 xorps $in2,$inout6
1604 movups $inout6,0x60($out)
1605 jmp .Lctr32_done # $len was 7, stop store
1606
1607 .align 32
1608 .Lctr32_loop4:
1609 aesenc $rndkey1,$inout0
1610 lea 16($key),$key
1611 dec $rounds
1612 aesenc $rndkey1,$inout1
1613 aesenc $rndkey1,$inout2
1614 aesenc $rndkey1,$inout3
1615 $movkey ($key),$rndkey1
1616 jnz .Lctr32_loop4
1617 aesenclast $rndkey1,$inout0
1618 aesenclast $rndkey1,$inout1
1619 movups ($inp),$in0 # load input
1620 movups 0x10($inp),$in1
1621 aesenclast $rndkey1,$inout2
1622 aesenclast $rndkey1,$inout3
1623 movups 0x20($inp),$in2
1624 movups 0x30($inp),$in3
1625
1626 xorps $in0,$inout0
1627 movups $inout0,($out) # store output
1628 xorps $in1,$inout1
1629 movups $inout1,0x10($out)
1630 pxor $in2,$inout2
1631 movdqu $inout2,0x20($out)
1632 pxor $in3,$inout3
1633 movdqu $inout3,0x30($out)
1634 jmp .Lctr32_done # $len was 4, stop store
1635
1636 .align 32
1637 .Lctr32_loop3:
1638 aesenc $rndkey1,$inout0
1639 lea 16($key),$key
1640 dec $rounds
1641 aesenc $rndkey1,$inout1
1642 aesenc $rndkey1,$inout2
1643 $movkey ($key),$rndkey1
1644 jnz .Lctr32_loop3
1645 aesenclast $rndkey1,$inout0
1646 aesenclast $rndkey1,$inout1
1647 aesenclast $rndkey1,$inout2
1648
1649 movups ($inp),$in0 # load input
1650 xorps $in0,$inout0
1651 movups $inout0,($out) # store output
1652 cmp \$2,$len
1653 jb .Lctr32_done # $len was 1, stop store
1654
1655 movups 0x10($inp),$in1
1656 xorps $in1,$inout1
1657 movups $inout1,0x10($out)
1658 je .Lctr32_done # $len was 2, stop store
1659
1660 movups 0x20($inp),$in2
1661 xorps $in2,$inout2
1662 movups $inout2,0x20($out) # $len was 3, stop store
1663
1664 .Lctr32_done:
1665 xorps %xmm0,%xmm0 # clear regiser bank
1666 xor $key0,$key0
1667 pxor %xmm1,%xmm1
1668 pxor %xmm2,%xmm2
1669 pxor %xmm3,%xmm3
1670 pxor %xmm4,%xmm4
1671 pxor %xmm5,%xmm5
1672 ___
# Non-Win64: zero %xmm6-%xmm15 and overwrite the eight counter slots at
# (%rsp) with the already-zeroed %xmm0.
1673 $code.=<<___ if (!$win64);
1674 pxor %xmm6,%xmm6
1675 pxor %xmm7,%xmm7
1676 movaps %xmm0,0x00(%rsp) # clear stack
1677 pxor %xmm8,%xmm8
1678 movaps %xmm0,0x10(%rsp)
1679 pxor %xmm9,%xmm9
1680 movaps %xmm0,0x20(%rsp)
1681 pxor %xmm10,%xmm10
1682 movaps %xmm0,0x30(%rsp)
1683 pxor %xmm11,%xmm11
1684 movaps %xmm0,0x40(%rsp)
1685 pxor %xmm12,%xmm12
1686 movaps %xmm0,0x50(%rsp)
1687 pxor %xmm13,%xmm13
1688 movaps %xmm0,0x60(%rsp)
1689 pxor %xmm14,%xmm14
1690 movaps %xmm0,0x70(%rsp)
1691 pxor %xmm15,%xmm15
1692 ___
# Win64: restore %xmm6-%xmm15 from their save slots and overwrite both
# those slots and the eight counter slots with the zeroed %xmm0. The
# slots were written relative to %rax in the prologue; since %rbp is
# %rax-8, -0xa0(%rbp) here is the same address as -0xa8(%rax) there.
1693 $code.=<<___ if ($win64);
1694 movaps -0xa0(%rbp),%xmm6
1695 movaps %xmm0,-0xa0(%rbp) # clear stack
1696 movaps -0x90(%rbp),%xmm7
1697 movaps %xmm0,-0x90(%rbp)
1698 movaps -0x80(%rbp),%xmm8
1699 movaps %xmm0,-0x80(%rbp)
1700 movaps -0x70(%rbp),%xmm9
1701 movaps %xmm0,-0x70(%rbp)
1702 movaps -0x60(%rbp),%xmm10
1703 movaps %xmm0,-0x60(%rbp)
1704 movaps -0x50(%rbp),%xmm11
1705 movaps %xmm0,-0x50(%rbp)
1706 movaps -0x40(%rbp),%xmm12
1707 movaps %xmm0,-0x40(%rbp)
1708 movaps -0x30(%rbp),%xmm13
1709 movaps %xmm0,-0x30(%rbp)
1710 movaps -0x20(%rbp),%xmm14
1711 movaps %xmm0,-0x20(%rbp)
1712 movaps -0x10(%rbp),%xmm15
1713 movaps %xmm0,-0x10(%rbp)
1714 movaps %xmm0,0x00(%rsp)
1715 movaps %xmm0,0x10(%rsp)
1716 movaps %xmm0,0x20(%rsp)
1717 movaps %xmm0,0x30(%rsp)
1718 movaps %xmm0,0x40(%rsp)
1719 movaps %xmm0,0x50(%rsp)
1720 movaps %xmm0,0x60(%rsp)
1721 movaps %xmm0,0x70(%rsp)
1722 ___
# Common exit: %rbp points at the saved %rbp (entry %rsp minus 8), so
# reloading %rsp from it and popping unwinds the frame.
# .Lctr32_epilogue is also the target of the single-block path, which
# never built a frame and therefore joins at the bare `ret`.
1723 $code.=<<___;
1724 lea (%rbp),%rsp
1725 pop %rbp
1726 .Lctr32_epilogue:
1727 ret
1728 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1729 ___
1730 }
1731 \f
1732 ######################################################################
1733 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1734 # const AES_KEY *key1, const AES_KEY *key2,
1735 # const unsigned char iv[16]);
1736 #
1737 {
1738 my @tweak=map("%xmm$_",(10..15));
1739 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1740 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1741 my $frame_size = 0x70 + ($win64?160:0);
1742
1743 $code.=<<___;
1744 .globl aesni_xts_encrypt
1745 .type aesni_xts_encrypt,\@function,6
1746 .align 16
1747 aesni_xts_encrypt:
1748 lea (%rsp),%rax
1749 push %rbp
1750 sub \$$frame_size,%rsp
1751 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1752 ___
1753 $code.=<<___ if ($win64);
1754 movaps %xmm6,-0xa8(%rax) # offload everything
1755 movaps %xmm7,-0x98(%rax)
1756 movaps %xmm8,-0x88(%rax)
1757 movaps %xmm9,-0x78(%rax)
1758 movaps %xmm10,-0x68(%rax)
1759 movaps %xmm11,-0x58(%rax)
1760 movaps %xmm12,-0x48(%rax)
1761 movaps %xmm13,-0x38(%rax)
1762 movaps %xmm14,-0x28(%rax)
1763 movaps %xmm15,-0x18(%rax)
1764 .Lxts_enc_body:
1765 ___
1766 $code.=<<___;
1767 lea -8(%rax),%rbp
1768 movups ($ivp),$inout0 # load clear-text tweak
1769 mov 240(%r8),$rounds # key2->rounds
1770 mov 240($key),$rnds_ # key1->rounds
1771 ___
1772 # generate the tweak
1773 &aesni_generate1("enc",$key2,$rounds,$inout0);
1774 $code.=<<___;
1775 $movkey ($key),$rndkey0 # zero round key
1776 mov $key,$key_ # backup $key
1777 mov $rnds_,$rounds # backup $rounds
1778 shl \$4,$rnds_
1779 mov $len,$len_ # backup $len
1780 and \$-16,$len
1781
1782 $movkey 16($key,$rnds_),$rndkey1 # last round key
1783
1784 movdqa .Lxts_magic(%rip),$twmask
1785 movdqa $inout0,@tweak[5]
1786 pshufd \$0x5f,$inout0,$twres
1787 pxor $rndkey0,$rndkey1
1788 ___
1789 # alternative tweak calculation algorithm is based on suggestions
1790 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1791 # and should help in the future...
1792 for ($i=0;$i<4;$i++) {
1793 $code.=<<___;
1794 movdqa $twres,$twtmp
1795 paddd $twres,$twres
1796 movdqa @tweak[5],@tweak[$i]
1797 psrad \$31,$twtmp # broadcast upper bits
1798 paddq @tweak[5],@tweak[5]
1799 pand $twmask,$twtmp
1800 pxor $rndkey0,@tweak[$i]
1801 pxor $twtmp,@tweak[5]
1802 ___
1803 }
1804 $code.=<<___;
1805 movdqa @tweak[5],@tweak[4]
1806 psrad \$31,$twres
1807 paddq @tweak[5],@tweak[5]
1808 pand $twmask,$twres
1809 pxor $rndkey0,@tweak[4]
1810 pxor $twres,@tweak[5]
1811 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1812
1813 sub \$16*6,$len
1814 jc .Lxts_enc_short # if $len-=6*16 borrowed
1815
1816 mov \$16+96,$rounds
1817 lea 32($key_,$rnds_),$key # end of key schedule
1818 sub %r10,%rax # twisted $rounds
1819 $movkey 16($key_),$rndkey1
1820 mov %rax,%r10 # backup twisted $rounds
1821 lea .Lxts_magic(%rip),%r8
1822 jmp .Lxts_enc_grandloop
1823
1824 .align 32
1825 .Lxts_enc_grandloop:
1826 movdqu `16*0`($inp),$inout0 # load input
1827 movdqa $rndkey0,$twmask
1828 movdqu `16*1`($inp),$inout1
1829 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1830 movdqu `16*2`($inp),$inout2
1831 pxor @tweak[1],$inout1
1832 aesenc $rndkey1,$inout0
1833 movdqu `16*3`($inp),$inout3
1834 pxor @tweak[2],$inout2
1835 aesenc $rndkey1,$inout1
1836 movdqu `16*4`($inp),$inout4
1837 pxor @tweak[3],$inout3
1838 aesenc $rndkey1,$inout2
1839 movdqu `16*5`($inp),$inout5
1840 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1841 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1842 pxor @tweak[4],$inout4
1843 aesenc $rndkey1,$inout3
1844 $movkey 32($key_),$rndkey0
1845 lea `16*6`($inp),$inp
1846 pxor $twmask,$inout5
1847
1848 pxor $twres,@tweak[0] # calclulate tweaks^round[last]
1849 aesenc $rndkey1,$inout4
1850 pxor $twres,@tweak[1]
1851 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1852 aesenc $rndkey1,$inout5
1853 $movkey 48($key_),$rndkey1
1854 pxor $twres,@tweak[2]
1855
1856 aesenc $rndkey0,$inout0
1857 pxor $twres,@tweak[3]
1858 movdqa @tweak[1],`16*1`(%rsp)
1859 aesenc $rndkey0,$inout1
1860 pxor $twres,@tweak[4]
1861 movdqa @tweak[2],`16*2`(%rsp)
1862 aesenc $rndkey0,$inout2
1863 aesenc $rndkey0,$inout3
1864 pxor $twres,$twmask
1865 movdqa @tweak[4],`16*4`(%rsp)
1866 aesenc $rndkey0,$inout4
1867 aesenc $rndkey0,$inout5
1868 $movkey 64($key_),$rndkey0
1869 movdqa $twmask,`16*5`(%rsp)
1870 pshufd \$0x5f,@tweak[5],$twres
1871 jmp .Lxts_enc_loop6
1872 .align 32
1873 .Lxts_enc_loop6:
1874 aesenc $rndkey1,$inout0
1875 aesenc $rndkey1,$inout1
1876 aesenc $rndkey1,$inout2
1877 aesenc $rndkey1,$inout3
1878 aesenc $rndkey1,$inout4
1879 aesenc $rndkey1,$inout5
1880 $movkey -64($key,%rax),$rndkey1
1881 add \$32,%rax
1882
1883 aesenc $rndkey0,$inout0
1884 aesenc $rndkey0,$inout1
1885 aesenc $rndkey0,$inout2
1886 aesenc $rndkey0,$inout3
1887 aesenc $rndkey0,$inout4
1888 aesenc $rndkey0,$inout5
1889 $movkey -80($key,%rax),$rndkey0
1890 jnz .Lxts_enc_loop6
1891
1892 movdqa (%r8),$twmask # start calculating next tweak
1893 movdqa $twres,$twtmp
1894 paddd $twres,$twres
1895 aesenc $rndkey1,$inout0
1896 paddq @tweak[5],@tweak[5]
1897 psrad \$31,$twtmp
1898 aesenc $rndkey1,$inout1
1899 pand $twmask,$twtmp
1900 $movkey ($key_),@tweak[0] # load round[0]
1901 aesenc $rndkey1,$inout2
1902 aesenc $rndkey1,$inout3
1903 aesenc $rndkey1,$inout4
1904 pxor $twtmp,@tweak[5]
1905 movaps @tweak[0],@tweak[1] # copy round[0]
1906 aesenc $rndkey1,$inout5
1907 $movkey -64($key),$rndkey1
1908
1909 movdqa $twres,$twtmp
1910 aesenc $rndkey0,$inout0
1911 paddd $twres,$twres
1912 pxor @tweak[5],@tweak[0]
1913 aesenc $rndkey0,$inout1
1914 psrad \$31,$twtmp
1915 paddq @tweak[5],@tweak[5]
1916 aesenc $rndkey0,$inout2
1917 aesenc $rndkey0,$inout3
1918 pand $twmask,$twtmp
1919 movaps @tweak[1],@tweak[2]
1920 aesenc $rndkey0,$inout4
1921 pxor $twtmp,@tweak[5]
1922 movdqa $twres,$twtmp
1923 aesenc $rndkey0,$inout5
1924 $movkey -48($key),$rndkey0
1925
1926 paddd $twres,$twres
1927 aesenc $rndkey1,$inout0
1928 pxor @tweak[5],@tweak[1]
1929 psrad \$31,$twtmp
1930 aesenc $rndkey1,$inout1
1931 paddq @tweak[5],@tweak[5]
1932 pand $twmask,$twtmp
1933 aesenc $rndkey1,$inout2
1934 aesenc $rndkey1,$inout3
1935 movdqa @tweak[3],`16*3`(%rsp)
1936 pxor $twtmp,@tweak[5]
1937 aesenc $rndkey1,$inout4
1938 movaps @tweak[2],@tweak[3]
1939 movdqa $twres,$twtmp
1940 aesenc $rndkey1,$inout5
1941 $movkey -32($key),$rndkey1
1942
1943 paddd $twres,$twres
1944 aesenc $rndkey0,$inout0
1945 pxor @tweak[5],@tweak[2]
1946 psrad \$31,$twtmp
1947 aesenc $rndkey0,$inout1
1948 paddq @tweak[5],@tweak[5]
1949 pand $twmask,$twtmp
1950 aesenc $rndkey0,$inout2
1951 aesenc $rndkey0,$inout3
1952 aesenc $rndkey0,$inout4
1953 pxor $twtmp,@tweak[5]
1954 movaps @tweak[3],@tweak[4]
1955 aesenc $rndkey0,$inout5
1956
1957 movdqa $twres,$rndkey0
1958 paddd $twres,$twres
1959 aesenc $rndkey1,$inout0
1960 pxor @tweak[5],@tweak[3]
1961 psrad \$31,$rndkey0
1962 aesenc $rndkey1,$inout1
1963 paddq @tweak[5],@tweak[5]
1964 pand $twmask,$rndkey0
1965 aesenc $rndkey1,$inout2
1966 aesenc $rndkey1,$inout3
1967 pxor $rndkey0,@tweak[5]
1968 $movkey ($key_),$rndkey0
1969 aesenc $rndkey1,$inout4
1970 aesenc $rndkey1,$inout5
1971 $movkey 16($key_),$rndkey1
1972
1973 pxor @tweak[5],@tweak[4]
1974 aesenclast `16*0`(%rsp),$inout0
1975 psrad \$31,$twres
1976 paddq @tweak[5],@tweak[5]
1977 aesenclast `16*1`(%rsp),$inout1
1978 aesenclast `16*2`(%rsp),$inout2
1979 pand $twmask,$twres
1980 mov %r10,%rax # restore $rounds
1981 aesenclast `16*3`(%rsp),$inout3
1982 aesenclast `16*4`(%rsp),$inout4
1983 aesenclast `16*5`(%rsp),$inout5
1984 pxor $twres,@tweak[5]
1985
1986 lea `16*6`($out),$out # $out+=6*16
1987 movups $inout0,`-16*6`($out) # store 6 output blocks
1988 movups $inout1,`-16*5`($out)
1989 movups $inout2,`-16*4`($out)
1990 movups $inout3,`-16*3`($out)
1991 movups $inout4,`-16*2`($out)
1992 movups $inout5,`-16*1`($out)
1993 sub \$16*6,$len
1994 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
1995
1996 mov \$16+96,$rounds
1997 sub $rnds_,$rounds
1998 mov $key_,$key # restore $key
1999 shr \$4,$rounds # restore original value
2000
2001 .Lxts_enc_short:
2002 # at the point @tweak[0..5] are populated with tweak values
2003 mov $rounds,$rnds_ # backup $rounds
2004 pxor $rndkey0,@tweak[0]
2005 add \$16*6,$len # restore real remaining $len
2006 jz .Lxts_enc_done # done if ($len==0)
2007
2008 pxor $rndkey0,@tweak[1]
2009 cmp \$0x20,$len
2010 jb .Lxts_enc_one # $len is 1*16
2011 pxor $rndkey0,@tweak[2]
2012 je .Lxts_enc_two # $len is 2*16
2013
2014 pxor $rndkey0,@tweak[3]
2015 cmp \$0x40,$len
2016 jb .Lxts_enc_three # $len is 3*16
2017 pxor $rndkey0,@tweak[4]
2018 je .Lxts_enc_four # $len is 4*16
2019
2020 movdqu ($inp),$inout0 # $len is 5*16
2021 movdqu 16*1($inp),$inout1
2022 movdqu 16*2($inp),$inout2
2023 pxor @tweak[0],$inout0
2024 movdqu 16*3($inp),$inout3
2025 pxor @tweak[1],$inout1
2026 movdqu 16*4($inp),$inout4
2027 lea 16*5($inp),$inp # $inp+=5*16
2028 pxor @tweak[2],$inout2
2029 pxor @tweak[3],$inout3
2030 pxor @tweak[4],$inout4
2031 pxor $inout5,$inout5
2032
2033 call _aesni_encrypt6
2034
2035 xorps @tweak[0],$inout0
2036 movdqa @tweak[5],@tweak[0]
2037 xorps @tweak[1],$inout1
2038 xorps @tweak[2],$inout2
2039 movdqu $inout0,($out) # store 5 output blocks
2040 xorps @tweak[3],$inout3
2041 movdqu $inout1,16*1($out)
2042 xorps @tweak[4],$inout4
2043 movdqu $inout2,16*2($out)
2044 movdqu $inout3,16*3($out)
2045 movdqu $inout4,16*4($out)
2046 lea 16*5($out),$out # $out+=5*16
2047 jmp .Lxts_enc_done
2048
2049 .align 16
2050 .Lxts_enc_one:
2051 movups ($inp),$inout0
2052 lea 16*1($inp),$inp # inp+=1*16
2053 xorps @tweak[0],$inout0
2054 ___
2055 &aesni_generate1("enc",$key,$rounds);
2056 $code.=<<___;
2057 xorps @tweak[0],$inout0
2058 movdqa @tweak[1],@tweak[0]
2059 movups $inout0,($out) # store one output block
2060 lea 16*1($out),$out # $out+=1*16
2061 jmp .Lxts_enc_done
2062
2063 .align 16
2064 .Lxts_enc_two:
2065 movups ($inp),$inout0
2066 movups 16($inp),$inout1
2067 lea 32($inp),$inp # $inp+=2*16
2068 xorps @tweak[0],$inout0
2069 xorps @tweak[1],$inout1
2070
2071 call _aesni_encrypt2
2072
2073 xorps @tweak[0],$inout0
2074 movdqa @tweak[2],@tweak[0]
2075 xorps @tweak[1],$inout1
2076 movups $inout0,($out) # store 2 output blocks
2077 movups $inout1,16*1($out)
2078 lea 16*2($out),$out # $out+=2*16
2079 jmp .Lxts_enc_done
2080
2081 .align 16
2082 .Lxts_enc_three:
2083 movups ($inp),$inout0
2084 movups 16*1($inp),$inout1
2085 movups 16*2($inp),$inout2
2086 lea 16*3($inp),$inp # $inp+=3*16
2087 xorps @tweak[0],$inout0
2088 xorps @tweak[1],$inout1
2089 xorps @tweak[2],$inout2
2090
2091 call _aesni_encrypt3
2092
2093 xorps @tweak[0],$inout0
2094 movdqa @tweak[3],@tweak[0]
2095 xorps @tweak[1],$inout1
2096 xorps @tweak[2],$inout2
2097 movups $inout0,($out) # store 3 output blocks
2098 movups $inout1,16*1($out)
2099 movups $inout2,16*2($out)
2100 lea 16*3($out),$out # $out+=3*16
2101 jmp .Lxts_enc_done
2102
2103 .align 16
2104 .Lxts_enc_four:
2105 movups ($inp),$inout0
2106 movups 16*1($inp),$inout1
2107 movups 16*2($inp),$inout2
2108 xorps @tweak[0],$inout0
2109 movups 16*3($inp),$inout3
2110 lea 16*4($inp),$inp # $inp+=4*16
2111 xorps @tweak[1],$inout1
2112 xorps @tweak[2],$inout2
2113 xorps @tweak[3],$inout3
2114
2115 call _aesni_encrypt4
2116
2117 pxor @tweak[0],$inout0
2118 movdqa @tweak[4],@tweak[0]
2119 pxor @tweak[1],$inout1
2120 pxor @tweak[2],$inout2
2121 movdqu $inout0,($out) # store 4 output blocks
2122 pxor @tweak[3],$inout3
2123 movdqu $inout1,16*1($out)
2124 movdqu $inout2,16*2($out)
2125 movdqu $inout3,16*3($out)
2126 lea 16*4($out),$out # $out+=4*16
2127 jmp .Lxts_enc_done
2128
2129 .align 16
2130 .Lxts_enc_done:
2131 and \$15,$len_ # see if $len%16 is 0
2132 jz .Lxts_enc_ret
2133 mov $len_,$len
2134
2135 .Lxts_enc_steal:
2136 movzb ($inp),%eax # borrow $rounds ...
2137 movzb -16($out),%ecx # ... and $key
2138 lea 1($inp),$inp
2139 mov %al,-16($out)
2140 mov %cl,0($out)
2141 lea 1($out),$out
2142 sub \$1,$len
2143 jnz .Lxts_enc_steal
2144
2145 sub $len_,$out # rewind $out
2146 mov $key_,$key # restore $key
2147 mov $rnds_,$rounds # restore $rounds
2148
2149 movups -16($out),$inout0
2150 xorps @tweak[0],$inout0
2151 ___
2152 &aesni_generate1("enc",$key,$rounds);
2153 $code.=<<___;
2154 xorps @tweak[0],$inout0
2155 movups $inout0,-16($out)
2156
2157 .Lxts_enc_ret:
2158 xorps %xmm0,%xmm0 # clear register bank
2159 pxor %xmm1,%xmm1
2160 pxor %xmm2,%xmm2
2161 pxor %xmm3,%xmm3
2162 pxor %xmm4,%xmm4
2163 pxor %xmm5,%xmm5
2164 ___
2165 $code.=<<___ if (!$win64);
2166 pxor %xmm6,%xmm6
2167 pxor %xmm7,%xmm7
2168 movaps %xmm0,0x00(%rsp) # clear stack
2169 pxor %xmm8,%xmm8
2170 movaps %xmm0,0x10(%rsp)
2171 pxor %xmm9,%xmm9
2172 movaps %xmm0,0x20(%rsp)
2173 pxor %xmm10,%xmm10
2174 movaps %xmm0,0x30(%rsp)
2175 pxor %xmm11,%xmm11
2176 movaps %xmm0,0x40(%rsp)
2177 pxor %xmm12,%xmm12
2178 movaps %xmm0,0x50(%rsp)
2179 pxor %xmm13,%xmm13
2180 movaps %xmm0,0x60(%rsp)
2181 pxor %xmm14,%xmm14
2182 pxor %xmm15,%xmm15
2183 ___
2184 $code.=<<___ if ($win64);
2185 movaps -0xa0(%rbp),%xmm6
2186 movaps %xmm0,-0xa0(%rbp) # clear stack
2187 movaps -0x90(%rbp),%xmm7
2188 movaps %xmm0,-0x90(%rbp)
2189 movaps -0x80(%rbp),%xmm8
2190 movaps %xmm0,-0x80(%rbp)
2191 movaps -0x70(%rbp),%xmm9
2192 movaps %xmm0,-0x70(%rbp)
2193 movaps -0x60(%rbp),%xmm10
2194 movaps %xmm0,-0x60(%rbp)
2195 movaps -0x50(%rbp),%xmm11
2196 movaps %xmm0,-0x50(%rbp)
2197 movaps -0x40(%rbp),%xmm12
2198 movaps %xmm0,-0x40(%rbp)
2199 movaps -0x30(%rbp),%xmm13
2200 movaps %xmm0,-0x30(%rbp)
2201 movaps -0x20(%rbp),%xmm14
2202 movaps %xmm0,-0x20(%rbp)
2203 movaps -0x10(%rbp),%xmm15
2204 movaps %xmm0,-0x10(%rbp)
2205 movaps %xmm0,0x00(%rsp)
2206 movaps %xmm0,0x10(%rsp)
2207 movaps %xmm0,0x20(%rsp)
2208 movaps %xmm0,0x30(%rsp)
2209 movaps %xmm0,0x40(%rsp)
2210 movaps %xmm0,0x50(%rsp)
2211 movaps %xmm0,0x60(%rsp)
2212 ___
2213 $code.=<<___;
2214 lea (%rbp),%rsp
2215 pop %rbp
2216 .Lxts_enc_epilogue:
2217 ret
2218 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2219 ___
2220
# aesni_xts_decrypt: XTS-mode decryption routine, emitted as x86_64 assembly.
#
# Outline of the generated code, as far as this section shows:
#   1. build a 16-byte aligned stack frame (Win64 also spills %xmm6-%xmm15);
#   2. load the clear-text tweak from ($ivp) and run it through
#      aesni_generate1("enc",...) with $key2 (presumably a one-block
#      encryption; that generator is defined elsewhere);
#   3. if the length is not a multiple of 16, hold back one full block,
#      it is processed together with the partial block at the end;
#   4. precompute six consecutive tweak values in @tweak[0..5];
#   5. while at least 6*16 bytes remain, decrypt six blocks per pass of
#      .Lxts_dec_grandloop, deriving the next six tweaks between AES rounds;
#   6. decrypt the 1 to 5 remaining whole blocks (.Lxts_dec_short);
#   7. if a partial block remains, do ciphertext stealing
#      (.Lxts_dec_done2 / .Lxts_dec_steal);
#   8. wipe XMM registers and the stack scratch area, restore and return.
#
# NOTE(review): the registers behind $inp, $out, $len, $key, $key2 and $ivp
# are assigned outside this section -- confirm against the XTS preamble.
2221 $code.=<<___;
2222 .globl aesni_xts_decrypt
2223 .type aesni_xts_decrypt,\@function,6
2224 .align 16
2225 aesni_xts_decrypt:
2226 lea (%rsp),%rax
2227 push %rbp
2228 sub \$$frame_size,%rsp
2229 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2230 ___
# Win64 only: save %xmm6-%xmm15 below the entry stack pointer kept in %rax.
2231 $code.=<<___ if ($win64);
2232 movaps %xmm6,-0xa8(%rax) # offload everything
2233 movaps %xmm7,-0x98(%rax)
2234 movaps %xmm8,-0x88(%rax)
2235 movaps %xmm9,-0x78(%rax)
2236 movaps %xmm10,-0x68(%rax)
2237 movaps %xmm11,-0x58(%rax)
2238 movaps %xmm12,-0x48(%rax)
2239 movaps %xmm13,-0x38(%rax)
2240 movaps %xmm14,-0x28(%rax)
2241 movaps %xmm15,-0x18(%rax)
2242 .Lxts_dec_body:
2243 ___
# %rbp is pointed at the slot holding the pushed %rbp; the epilogue uses it
# to unwind the aligned frame.
2244 $code.=<<___;
2245 lea -8(%rax),%rbp
2246 movups ($ivp),$inout0 # load clear-text tweak
2247 mov 240($key2),$rounds # key2->rounds
2248 mov 240($key),$rnds_ # key1->rounds
2249 ___
2250 # generate the tweak
2251 &aesni_generate1("enc",$key2,$rounds,$inout0);
2252 $code.=<<___;
2253 xor %eax,%eax # if ($len%16) len-=16;
2254 test \$15,$len
2255 setnz %al
2256 shl \$4,%rax
2257 sub %rax,$len
2258
2259 $movkey ($key),$rndkey0 # zero round key
2260 mov $key,$key_ # backup $key
2261 mov $rnds_,$rounds # backup $rounds
2262 shl \$4,$rnds_
2263 mov $len,$len_ # backup $len
2264 and \$-16,$len
2265
2266 $movkey 16($key,$rnds_),$rndkey1 # last round key
2267
2268 movdqa .Lxts_magic(%rip),$twmask
2269 movdqa $inout0,@tweak[5]
2270 pshufd \$0x5f,$inout0,$twres
2271 pxor $rndkey0,$rndkey1
2272 ___
# Emit four unrolled copies: @tweak[$i] = running tweak ^ round[0], then
# advance the running tweak kept in @tweak[5] (doubling via paddq plus a
# carry term selected with $twmask).
2273 for ($i=0;$i<4;$i++) {
2274 $code.=<<___;
2275 movdqa $twres,$twtmp
2276 paddd $twres,$twres
2277 movdqa @tweak[5],@tweak[$i]
2278 psrad \$31,$twtmp # broadcast upper bits
2279 paddq @tweak[5],@tweak[5]
2280 pand $twmask,$twtmp
2281 pxor $rndkey0,@tweak[$i]
2282 pxor $twtmp,@tweak[5]
2283 ___
2284 }
# The fifth tweak is produced the same way; @tweak[5] is left holding the
# sixth tweak without round[0] mixed in.
2285 $code.=<<___;
2286 movdqa @tweak[5],@tweak[4]
2287 psrad \$31,$twres
2288 paddq @tweak[5],@tweak[5]
2289 pand $twmask,$twres
2290 pxor $rndkey0,@tweak[4]
2291 pxor $twres,@tweak[5]
2292 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2293
2294 sub \$16*6,$len
2295 jc .Lxts_dec_short # if $len-=6*16 borrowed
2296
2297 mov \$16+96,$rounds
2298 lea 32($key_,$rnds_),$key # end of key schedule
2299 sub %r10,%rax # twisted $rounds
2300 $movkey 16($key_),$rndkey1
2301 mov %rax,%r10 # backup twisted $rounds
2302 lea .Lxts_magic(%rip),%r8
2303 jmp .Lxts_dec_grandloop
2304
2305 .align 32
2306 .Lxts_dec_grandloop:
2307 movdqu `16*0`($inp),$inout0 # load input
2308 movdqa $rndkey0,$twmask
2309 movdqu `16*1`($inp),$inout1
2310 pxor @tweak[0],$inout0 # input^=tweak^round[0]
2311 movdqu `16*2`($inp),$inout2
2312 pxor @tweak[1],$inout1
2313 aesdec $rndkey1,$inout0
2314 movdqu `16*3`($inp),$inout3
2315 pxor @tweak[2],$inout2
2316 aesdec $rndkey1,$inout1
2317 movdqu `16*4`($inp),$inout4
2318 pxor @tweak[3],$inout3
2319 aesdec $rndkey1,$inout2
2320 movdqu `16*5`($inp),$inout5
2321 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2322 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2323 pxor @tweak[4],$inout4
2324 aesdec $rndkey1,$inout3
2325 $movkey 32($key_),$rndkey0
2326 lea `16*6`($inp),$inp
2327 pxor $twmask,$inout5
2328
2329 pxor $twres,@tweak[0] # calculate tweaks^round[last]
2330 aesdec $rndkey1,$inout4
2331 pxor $twres,@tweak[1]
2332 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2333 aesdec $rndkey1,$inout5
2334 $movkey 48($key_),$rndkey1
2335 pxor $twres,@tweak[2]
2336
2337 aesdec $rndkey0,$inout0
2338 pxor $twres,@tweak[3]
2339 movdqa @tweak[1],`16*1`(%rsp)
2340 aesdec $rndkey0,$inout1
2341 pxor $twres,@tweak[4]
2342 movdqa @tweak[2],`16*2`(%rsp)
2343 aesdec $rndkey0,$inout2
2344 aesdec $rndkey0,$inout3
2345 pxor $twres,$twmask
2346 movdqa @tweak[4],`16*4`(%rsp)
2347 aesdec $rndkey0,$inout4
2348 aesdec $rndkey0,$inout5
2349 $movkey 64($key_),$rndkey0
2350 movdqa $twmask,`16*5`(%rsp)
2351 pshufd \$0x5f,@tweak[5],$twres
2352 jmp .Lxts_dec_loop6
# middle AES rounds, two per pass; %rax is stepped by 32 until it reaches zero
2353 .align 32
2354 .Lxts_dec_loop6:
2355 aesdec $rndkey1,$inout0
2356 aesdec $rndkey1,$inout1
2357 aesdec $rndkey1,$inout2
2358 aesdec $rndkey1,$inout3
2359 aesdec $rndkey1,$inout4
2360 aesdec $rndkey1,$inout5
2361 $movkey -64($key,%rax),$rndkey1
2362 add \$32,%rax
2363
2364 aesdec $rndkey0,$inout0
2365 aesdec $rndkey0,$inout1
2366 aesdec $rndkey0,$inout2
2367 aesdec $rndkey0,$inout3
2368 aesdec $rndkey0,$inout4
2369 aesdec $rndkey0,$inout5
2370 $movkey -80($key,%rax),$rndkey0
2371 jnz .Lxts_dec_loop6
2372
# final rounds, interleaved with deriving the six tweaks for the next pass
2373 movdqa (%r8),$twmask # start calculating next tweak
2374 movdqa $twres,$twtmp
2375 paddd $twres,$twres
2376 aesdec $rndkey1,$inout0
2377 paddq @tweak[5],@tweak[5]
2378 psrad \$31,$twtmp
2379 aesdec $rndkey1,$inout1
2380 pand $twmask,$twtmp
2381 $movkey ($key_),@tweak[0] # load round[0]
2382 aesdec $rndkey1,$inout2
2383 aesdec $rndkey1,$inout3
2384 aesdec $rndkey1,$inout4
2385 pxor $twtmp,@tweak[5]
2386 movaps @tweak[0],@tweak[1] # copy round[0]
2387 aesdec $rndkey1,$inout5
2388 $movkey -64($key),$rndkey1
2389
2390 movdqa $twres,$twtmp
2391 aesdec $rndkey0,$inout0
2392 paddd $twres,$twres
2393 pxor @tweak[5],@tweak[0]
2394 aesdec $rndkey0,$inout1
2395 psrad \$31,$twtmp
2396 paddq @tweak[5],@tweak[5]
2397 aesdec $rndkey0,$inout2
2398 aesdec $rndkey0,$inout3
2399 pand $twmask,$twtmp
2400 movaps @tweak[1],@tweak[2]
2401 aesdec $rndkey0,$inout4
2402 pxor $twtmp,@tweak[5]
2403 movdqa $twres,$twtmp
2404 aesdec $rndkey0,$inout5
2405 $movkey -48($key),$rndkey0
2406
2407 paddd $twres,$twres
2408 aesdec $rndkey1,$inout0
2409 pxor @tweak[5],@tweak[1]
2410 psrad \$31,$twtmp
2411 aesdec $rndkey1,$inout1
2412 paddq @tweak[5],@tweak[5]
2413 pand $twmask,$twtmp
2414 aesdec $rndkey1,$inout2
2415 aesdec $rndkey1,$inout3
2416 movdqa @tweak[3],`16*3`(%rsp)
2417 pxor $twtmp,@tweak[5]
2418 aesdec $rndkey1,$inout4
2419 movaps @tweak[2],@tweak[3]
2420 movdqa $twres,$twtmp
2421 aesdec $rndkey1,$inout5
2422 $movkey -32($key),$rndkey1
2423
2424 paddd $twres,$twres
2425 aesdec $rndkey0,$inout0
2426 pxor @tweak[5],@tweak[2]
2427 psrad \$31,$twtmp
2428 aesdec $rndkey0,$inout1
2429 paddq @tweak[5],@tweak[5]
2430 pand $twmask,$twtmp
2431 aesdec $rndkey0,$inout2
2432 aesdec $rndkey0,$inout3
2433 aesdec $rndkey0,$inout4
2434 pxor $twtmp,@tweak[5]
2435 movaps @tweak[3],@tweak[4]
2436 aesdec $rndkey0,$inout5
2437
2438 movdqa $twres,$rndkey0
2439 paddd $twres,$twres
2440 aesdec $rndkey1,$inout0
2441 pxor @tweak[5],@tweak[3]
2442 psrad \$31,$rndkey0
2443 aesdec $rndkey1,$inout1
2444 paddq @tweak[5],@tweak[5]
2445 pand $twmask,$rndkey0
2446 aesdec $rndkey1,$inout2
2447 aesdec $rndkey1,$inout3
2448 pxor $rndkey0,@tweak[5]
2449 $movkey ($key_),$rndkey0
2450 aesdec $rndkey1,$inout4
2451 aesdec $rndkey1,$inout5
2452 $movkey 16($key_),$rndkey1
2453
2454 pxor @tweak[5],@tweak[4]
2455 aesdeclast `16*0`(%rsp),$inout0
2456 psrad \$31,$twres
2457 paddq @tweak[5],@tweak[5]
2458 aesdeclast `16*1`(%rsp),$inout1
2459 aesdeclast `16*2`(%rsp),$inout2
2460 pand $twmask,$twres
2461 mov %r10,%rax # restore $rounds
2462 aesdeclast `16*3`(%rsp),$inout3
2463 aesdeclast `16*4`(%rsp),$inout4
2464 aesdeclast `16*5`(%rsp),$inout5
2465 pxor $twres,@tweak[5]
2466
2467 lea `16*6`($out),$out # $out+=6*16
2468 movups $inout0,`-16*6`($out) # store 6 output blocks
2469 movups $inout1,`-16*5`($out)
2470 movups $inout2,`-16*4`($out)
2471 movups $inout3,`-16*3`($out)
2472 movups $inout4,`-16*2`($out)
2473 movups $inout5,`-16*1`($out)
2474 sub \$16*6,$len
2475 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2476
# undo the twisted round counter: (16+96 - twisted) >> 4
2477 mov \$16+96,$rounds
2478 sub $rnds_,$rounds
2479 mov $key_,$key # restore $key
2480 shr \$4,$rounds # restore original value
2481
2482 .Lxts_dec_short:
2483 # at this point @tweak[0..5] are populated with tweak values
# @tweak[0..4] carry round[0]; each pxor with $rndkey0 below cancels it
2484 mov $rounds,$rnds_ # backup $rounds
2485 pxor $rndkey0,@tweak[0]
2486 pxor $rndkey0,@tweak[1]
2487 add \$16*6,$len # restore real remaining $len
2488 jz .Lxts_dec_done # done if ($len==0)
2489
2490 pxor $rndkey0,@tweak[2]
2491 cmp \$0x20,$len
2492 jb .Lxts_dec_one # $len is 1*16
2493 pxor $rndkey0,@tweak[3]
2494 je .Lxts_dec_two # $len is 2*16
2495
2496 pxor $rndkey0,@tweak[4]
2497 cmp \$0x40,$len
2498 jb .Lxts_dec_three # $len is 3*16
2499 je .Lxts_dec_four # $len is 4*16
2500
2501 movdqu ($inp),$inout0 # $len is 5*16
2502 movdqu 16*1($inp),$inout1
2503 movdqu 16*2($inp),$inout2
2504 pxor @tweak[0],$inout0
2505 movdqu 16*3($inp),$inout3
2506 pxor @tweak[1],$inout1
2507 movdqu 16*4($inp),$inout4
2508 lea 16*5($inp),$inp # $inp+=5*16
2509 pxor @tweak[2],$inout2
2510 pxor @tweak[3],$inout3
2511 pxor @tweak[4],$inout4
2512
2513 call _aesni_decrypt6
2514
2515 xorps @tweak[0],$inout0
2516 xorps @tweak[1],$inout1
2517 xorps @tweak[2],$inout2
2518 movdqu $inout0,($out) # store 5 output blocks
2519 xorps @tweak[3],$inout3
2520 movdqu $inout1,16*1($out)
2521 xorps @tweak[4],$inout4
2522 movdqu $inout2,16*2($out)
2523 pxor $twtmp,$twtmp
2524 movdqu $inout3,16*3($out)
2525 pcmpgtd @tweak[5],$twtmp
2526 movdqu $inout4,16*4($out)
2527 lea 16*5($out),$out # $out+=5*16
2528 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2529 and \$15,$len_
2530 jz .Lxts_dec_ret
2531
# a partial block follows: @tweak[0] = sixth tweak, @tweak[1] = the one after it
2532 movdqa @tweak[5],@tweak[0]
2533 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2534 pand $twmask,@tweak[1] # isolate carry and residue
2535 pxor @tweak[5],@tweak[1]
2536 jmp .Lxts_dec_done2
2537
# Each of the tails below leaves the next two unused tweaks in @tweak[0]
# and @tweak[1] for the stealing code at .Lxts_dec_done.
2538 .align 16
2539 .Lxts_dec_one:
2540 movups ($inp),$inout0
2541 lea 16*1($inp),$inp # $inp+=1*16
2542 xorps @tweak[0],$inout0
2543 ___
2544 &aesni_generate1("dec",$key,$rounds);
2545 $code.=<<___;
2546 xorps @tweak[0],$inout0
2547 movdqa @tweak[1],@tweak[0]
2548 movups $inout0,($out) # store one output block
2549 movdqa @tweak[2],@tweak[1]
2550 lea 16*1($out),$out # $out+=1*16
2551 jmp .Lxts_dec_done
2552
2553 .align 16
2554 .Lxts_dec_two:
2555 movups ($inp),$inout0
2556 movups 16($inp),$inout1
2557 lea 32($inp),$inp # $inp+=2*16
2558 xorps @tweak[0],$inout0
2559 xorps @tweak[1],$inout1
2560
2561 call _aesni_decrypt2
2562
2563 xorps @tweak[0],$inout0
2564 movdqa @tweak[2],@tweak[0]
2565 xorps @tweak[1],$inout1
2566 movdqa @tweak[3],@tweak[1]
2567 movups $inout0,($out) # store 2 output blocks
2568 movups $inout1,16*1($out)
2569 lea 16*2($out),$out # $out+=2*16
2570 jmp .Lxts_dec_done
2571
2572 .align 16
2573 .Lxts_dec_three:
2574 movups ($inp),$inout0
2575 movups 16*1($inp),$inout1
2576 movups 16*2($inp),$inout2
2577 lea 16*3($inp),$inp # $inp+=3*16
2578 xorps @tweak[0],$inout0
2579 xorps @tweak[1],$inout1
2580 xorps @tweak[2],$inout2
2581
2582 call _aesni_decrypt3
2583
2584 xorps @tweak[0],$inout0
2585 movdqa @tweak[3],@tweak[0]
2586 xorps @tweak[1],$inout1
2587 movdqa @tweak[4],@tweak[1]
2588 xorps @tweak[2],$inout2
2589 movups $inout0,($out) # store 3 output blocks
2590 movups $inout1,16*1($out)
2591 movups $inout2,16*2($out)
2592 lea 16*3($out),$out # $out+=3*16
2593 jmp .Lxts_dec_done
2594
2595 .align 16
2596 .Lxts_dec_four:
2597 movups ($inp),$inout0
2598 movups 16*1($inp),$inout1
2599 movups 16*2($inp),$inout2
2600 xorps @tweak[0],$inout0
2601 movups 16*3($inp),$inout3
2602 lea 16*4($inp),$inp # $inp+=4*16
2603 xorps @tweak[1],$inout1
2604 xorps @tweak[2],$inout2
2605 xorps @tweak[3],$inout3
2606
2607 call _aesni_decrypt4
2608
2609 pxor @tweak[0],$inout0
2610 movdqa @tweak[4],@tweak[0]
2611 pxor @tweak[1],$inout1
2612 movdqa @tweak[5],@tweak[1]
2613 pxor @tweak[2],$inout2
2614 movdqu $inout0,($out) # store 4 output blocks
2615 pxor @tweak[3],$inout3
2616 movdqu $inout1,16*1($out)
2617 movdqu $inout2,16*2($out)
2618 movdqu $inout3,16*3($out)
2619 lea 16*4($out),$out # $out+=4*16
2620 jmp .Lxts_dec_done
2621
2622 .align 16
2623 .Lxts_dec_done:
2624 and \$15,$len_ # see if $len%16 is 0
2625 jz .Lxts_dec_ret
2626 .Lxts_dec_done2:
# ciphertext stealing, step 1: decrypt the last whole input block with the
# later tweak (@tweak[1]) and store it at the current output position
2627 mov $len_,$len
2628 mov $key_,$key # restore $key
2629 mov $rnds_,$rounds # restore $rounds
2630
2631 movups ($inp),$inout0
2632 xorps @tweak[1],$inout0
2633 ___
2634 &aesni_generate1("dec",$key,$rounds);
2635 $code.=<<___;
2636 xorps @tweak[1],$inout0
2637 movups $inout0,($out)
2638
# step 2: byte by byte, move the trailing partial input into the block just
# written and push the displaced bytes out as the final partial output
2639 .Lxts_dec_steal:
2640 movzb 16($inp),%eax # borrow $rounds ...
2641 movzb ($out),%ecx # ... and $key
2642 lea 1($inp),$inp
2643 mov %al,($out)
2644 mov %cl,16($out)
2645 lea 1($out),$out
2646 sub \$1,$len
2647 jnz .Lxts_dec_steal
2648
# step 3: decrypt the reassembled block in place with the earlier tweak
2649 sub $len_,$out # rewind $out
2650 mov $key_,$key # restore $key
2651 mov $rnds_,$rounds # restore $rounds
2652
2653 movups ($out),$inout0
2654 xorps @tweak[0],$inout0
2655 ___
2656 &aesni_generate1("dec",$key,$rounds);
2657 $code.=<<___;
2658 xorps @tweak[0],$inout0
2659 movups $inout0,($out)
2660
2661 .Lxts_dec_ret:
2662 xorps %xmm0,%xmm0 # clear register bank
2663 pxor %xmm1,%xmm1
2664 pxor %xmm2,%xmm2
2665 pxor %xmm3,%xmm3
2666 pxor %xmm4,%xmm4
2667 pxor %xmm5,%xmm5
2668 ___
# Non-Win64: zero %xmm6-%xmm15 and the 0x70 bytes of stack scratch.
2669 $code.=<<___ if (!$win64);
2670 pxor %xmm6,%xmm6
2671 pxor %xmm7,%xmm7
2672 movaps %xmm0,0x00(%rsp) # clear stack
2673 pxor %xmm8,%xmm8
2674 movaps %xmm0,0x10(%rsp)
2675 pxor %xmm9,%xmm9
2676 movaps %xmm0,0x20(%rsp)
2677 pxor %xmm10,%xmm10
2678 movaps %xmm0,0x30(%rsp)
2679 pxor %xmm11,%xmm11
2680 movaps %xmm0,0x40(%rsp)
2681 pxor %xmm12,%xmm12
2682 movaps %xmm0,0x50(%rsp)
2683 pxor %xmm13,%xmm13
2684 movaps %xmm0,0x60(%rsp)
2685 pxor %xmm14,%xmm14
2686 pxor %xmm15,%xmm15
2687 ___
# Win64: reload the saved %xmm6-%xmm15, zeroing each save slot and the
# stack scratch area afterwards.
2688 $code.=<<___ if ($win64);
2689 movaps -0xa0(%rbp),%xmm6
2690 movaps %xmm0,-0xa0(%rbp) # clear stack
2691 movaps -0x90(%rbp),%xmm7
2692 movaps %xmm0,-0x90(%rbp)
2693 movaps -0x80(%rbp),%xmm8
2694 movaps %xmm0,-0x80(%rbp)
2695 movaps -0x70(%rbp),%xmm9
2696 movaps %xmm0,-0x70(%rbp)
2697 movaps -0x60(%rbp),%xmm10
2698 movaps %xmm0,-0x60(%rbp)
2699 movaps -0x50(%rbp),%xmm11
2700 movaps %xmm0,-0x50(%rbp)
2701 movaps -0x40(%rbp),%xmm12
2702 movaps %xmm0,-0x40(%rbp)
2703 movaps -0x30(%rbp),%xmm13
2704 movaps %xmm0,-0x30(%rbp)
2705 movaps -0x20(%rbp),%xmm14
2706 movaps %xmm0,-0x20(%rbp)
2707 movaps -0x10(%rbp),%xmm15
2708 movaps %xmm0,-0x10(%rbp)
2709 movaps %xmm0,0x00(%rsp)
2710 movaps %xmm0,0x10(%rsp)
2711 movaps %xmm0,0x20(%rsp)
2712 movaps %xmm0,0x30(%rsp)
2713 movaps %xmm0,0x40(%rsp)
2714 movaps %xmm0,0x50(%rsp)
2715 movaps %xmm0,0x60(%rsp)
2716 ___
# Common epilogue: drop the aligned frame via %rbp and return.
2717 $code.=<<___;
2718 lea (%rbp),%rsp
2719 pop %rbp
2720 .Lxts_dec_epilogue:
2721 ret
2722 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2723 ___
2724 }
2725 \f
2726 ######################################################################
2727 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2728 # const AES_KEY *key, unsigned int start_block_num,
2729 # unsigned char offset_i[16], const unsigned char L_[][16],
2730 # unsigned char checksum[16]);
2731 #
2732 {
# Register and name assignments shared by the OCB routines below.
# @offset: six XMM registers (%xmm10..%xmm15) holding per-block offsets.
2733 my @offset=map("%xmm$_",(10..15));
# $checksum: running checksum; $rndkey0l: kept as round[0]^round[last].
2734 my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2735 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
# $L_p and $checksum_p receive the 7th and 8th arguments, read off the stack.
2736 my ($L_p,$checksum_p) = ("%rbx","%rbp");
# $i1/$i3/$i5: byte offsets into the L_ table (ntz(block number) * 16).
2737 my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# Displacement of the 7th argument from the stack pointer at function entry.
2738 my $seventh_arg = $win64 ? 56 : 8;
# The block count travels in the register otherwise used for $len.
2739 my $blocks = $len;
2740
# aesni_ocb_encrypt: OCB-mode bulk encryption, emitted as x86_64 assembly
# (the C prototype is in the banner comment that opens this section).
#
# Outline of the generated code:
#   1. push %rbx, %rbp, %r12-%r14 (Win64 also spills %xmm6-%xmm15);
#   2. fetch the 7th and 8th arguments from the caller's stack;
#   3. precompute round[0]^round[last], offset_i^round[last] and the
#      "twisted" round counter used by the inner round loops;
#   4. if the starting block number is even, encrypt one block with
#      __ocb_encrypt1 so that the bulk loop starts on an odd block number;
#   5. encrypt six blocks per pass of .Locb_enc_grandloop via __ocb_encrypt6;
#   6. encrypt the 1 to 5 remaining blocks (.Locb_enc_short and friends);
#   7. write back checksum and last offset, wipe XMM registers, restore.
#
# NOTE(review): no guard for blocks == 0 is visible here; with an even
# starting block number one block is processed before $blocks is examined.
# Presumably callers guarantee blocks > 0 -- verify against the C caller.
2741 $code.=<<___;
2742 .globl aesni_ocb_encrypt
2743 .type aesni_ocb_encrypt,\@function,6
2744 .align 32
2745 aesni_ocb_encrypt:
2746 lea (%rsp),%rax
2747 push %rbx
2748 push %rbp
2749 push %r12
2750 push %r13
2751 push %r14
2752 ___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
2753 $code.=<<___ if ($win64);
2754 lea -0xa0(%rsp),%rsp
2755 movaps %xmm6,0x00(%rsp) # offload everything
2756 movaps %xmm7,0x10(%rsp)
2757 movaps %xmm8,0x20(%rsp)
2758 movaps %xmm9,0x30(%rsp)
2759 movaps %xmm10,0x40(%rsp)
2760 movaps %xmm11,0x50(%rsp)
2761 movaps %xmm12,0x60(%rsp)
2762 movaps %xmm13,0x70(%rsp)
2763 movaps %xmm14,0x80(%rsp)
2764 movaps %xmm15,0x90(%rsp)
2765 .Locb_enc_body:
2766 ___
# %rax still holds the entry stack pointer here, so stack arguments are
# addressed relative to it before it is reused as the round counter.
2767 $code.=<<___;
2768 mov $seventh_arg(%rax),$L_p # 7th argument
2769 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2770
2771 mov 240($key),$rnds_
2772 mov $key,$key_
2773 shl \$4,$rnds_
2774 $movkey ($key),$rndkey0l # round[0]
2775 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2776
2777 movdqu ($offset_p),@offset[5] # load last offset_i
2778 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2779 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2780
2781 mov \$16+32,$rounds
2782 lea 32($key_,$rnds_),$key
2783 $movkey 16($key_),$rndkey1 # round[1]
2784 sub %r10,%rax # twisted $rounds
2785 mov %rax,%r10 # backup twisted $rounds
2786
2787 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2788 movdqu ($checksum_p),$checksum # load checksum
2789
2790 test \$1,$block_num # is first block number odd?
2791 jnz .Locb_enc_odd
2792
# even starting block number: handle one block on its own first
2793 bsf $block_num,$i1
2794 add \$1,$block_num
2795 shl \$4,$i1
2796 movdqu ($L_p,$i1),$inout5 # borrow
2797 movdqu ($inp),$inout0
2798 lea 16($inp),$inp
2799
2800 call __ocb_encrypt1
2801
2802 movdqa $inout5,@offset[5]
2803 movups $inout0,($out)
2804 lea 16($out),$out
2805 sub \$1,$blocks
2806 jz .Locb_enc_done
2807
2808 .Locb_enc_odd:
2809 lea 1($block_num),$i1 # even-numbered blocks
2810 lea 3($block_num),$i3
2811 lea 5($block_num),$i5
2812 lea 6($block_num),$block_num
2813 bsf $i1,$i1 # ntz(block)
2814 bsf $i3,$i3
2815 bsf $i5,$i5
2816 shl \$4,$i1 # ntz(block) -> table offset
2817 shl \$4,$i3
2818 shl \$4,$i5
2819
2820 sub \$6,$blocks
2821 jc .Locb_enc_short
2822 jmp .Locb_enc_grandloop
2823
2824 .align 32
2825 .Locb_enc_grandloop:
2826 movdqu `16*0`($inp),$inout0 # load input
2827 movdqu `16*1`($inp),$inout1
2828 movdqu `16*2`($inp),$inout2
2829 movdqu `16*3`($inp),$inout3
2830 movdqu `16*4`($inp),$inout4
2831 movdqu `16*5`($inp),$inout5
2832 lea `16*6`($inp),$inp
2833
2834 call __ocb_encrypt6
2835
2836 movups $inout0,`16*0`($out) # store output
2837 movups $inout1,`16*1`($out)
2838 movups $inout2,`16*2`($out)
2839 movups $inout3,`16*3`($out)
2840 movups $inout4,`16*4`($out)
2841 movups $inout5,`16*5`($out)
2842 lea `16*6`($out),$out
2843 sub \$6,$blocks
2844 jnc .Locb_enc_grandloop
2845
# tail: 0 to 5 blocks remain; unused input registers are zeroed before the
# 4- or 6-block helper is called
2846 .Locb_enc_short:
2847 add \$6,$blocks
2848 jz .Locb_enc_done
2849
2850 movdqu `16*0`($inp),$inout0
2851 cmp \$2,$blocks
2852 jb .Locb_enc_one
2853 movdqu `16*1`($inp),$inout1
2854 je .Locb_enc_two
2855
2856 movdqu `16*2`($inp),$inout2
2857 cmp \$4,$blocks
2858 jb .Locb_enc_three
2859 movdqu `16*3`($inp),$inout3
2860 je .Locb_enc_four
2861
2862 movdqu `16*4`($inp),$inout4
2863 pxor $inout5,$inout5
2864
2865 call __ocb_encrypt6
2866
# five blocks: the offset of the fifth block becomes the last offset
2867 movdqa @offset[4],@offset[5]
2868 movups $inout0,`16*0`($out)
2869 movups $inout1,`16*1`($out)
2870 movups $inout2,`16*2`($out)
2871 movups $inout3,`16*3`($out)
2872 movups $inout4,`16*4`($out)
2873
2874 jmp .Locb_enc_done
2875
2876 .align 16
2877 .Locb_enc_one:
2878 movdqa @offset[0],$inout5 # borrow
2879
2880 call __ocb_encrypt1
2881
2882 movdqa $inout5,@offset[5]
2883 movups $inout0,`16*0`($out)
2884 jmp .Locb_enc_done
2885
2886 .align 16
2887 .Locb_enc_two:
2888 pxor $inout2,$inout2
2889 pxor $inout3,$inout3
2890
2891 call __ocb_encrypt4
2892
2893 movdqa @offset[1],@offset[5]
2894 movups $inout0,`16*0`($out)
2895 movups $inout1,`16*1`($out)
2896
2897 jmp .Locb_enc_done
2898
2899 .align 16
2900 .Locb_enc_three:
2901 pxor $inout3,$inout3
2902
2903 call __ocb_encrypt4
2904
2905 movdqa @offset[2],@offset[5]
2906 movups $inout0,`16*0`($out)
2907 movups $inout1,`16*1`($out)
2908 movups $inout2,`16*2`($out)
2909
2910 jmp .Locb_enc_done
2911
2912 .align 16
2913 .Locb_enc_four:
2914 call __ocb_encrypt4
2915
2916 movdqa @offset[3],@offset[5]
2917 movups $inout0,`16*0`($out)
2918 movups $inout1,`16*1`($out)
2919 movups $inout2,`16*2`($out)
2920 movups $inout3,`16*3`($out)
2921
2922 .Locb_enc_done:
2923 pxor $rndkey0,@offset[5] # "remove" round[last]
2924 movdqu $checksum,($checksum_p) # store checksum
2925 movdqu @offset[5],($offset_p) # store last offset_i
2926
2927 xorps %xmm0,%xmm0 # clear register bank
2928 pxor %xmm1,%xmm1
2929 pxor %xmm2,%xmm2
2930 pxor %xmm3,%xmm3
2931 pxor %xmm4,%xmm4
2932 pxor %xmm5,%xmm5
2933 ___
# Non-Win64: %xmm6-%xmm15 are simply zeroed.
2934 $code.=<<___ if (!$win64);
2935 pxor %xmm6,%xmm6
2936 pxor %xmm7,%xmm7
2937 pxor %xmm8,%xmm8
2938 pxor %xmm9,%xmm9
2939 pxor %xmm10,%xmm10
2940 pxor %xmm11,%xmm11
2941 pxor %xmm12,%xmm12
2942 pxor %xmm13,%xmm13
2943 pxor %xmm14,%xmm14
2944 pxor %xmm15,%xmm15
2945 ___
# Win64: reload the saved %xmm6-%xmm15, zero each save slot, release the area.
2946 $code.=<<___ if ($win64);
2947 movaps 0x00(%rsp),%xmm6
2948 movaps %xmm0,0x00(%rsp) # clear stack
2949 movaps 0x10(%rsp),%xmm7
2950 movaps %xmm0,0x10(%rsp)
2951 movaps 0x20(%rsp),%xmm8
2952 movaps %xmm0,0x20(%rsp)
2953 movaps 0x30(%rsp),%xmm9
2954 movaps %xmm0,0x30(%rsp)
2955 movaps 0x40(%rsp),%xmm10
2956 movaps %xmm0,0x40(%rsp)
2957 movaps 0x50(%rsp),%xmm11
2958 movaps %xmm0,0x50(%rsp)
2959 movaps 0x60(%rsp),%xmm12
2960 movaps %xmm0,0x60(%rsp)
2961 movaps 0x70(%rsp),%xmm13
2962 movaps %xmm0,0x70(%rsp)
2963 movaps 0x80(%rsp),%xmm14
2964 movaps %xmm0,0x80(%rsp)
2965 movaps 0x90(%rsp),%xmm15
2966 movaps %xmm0,0x90(%rsp)
2967 lea 0xa0+0x28(%rsp),%rax
2968 .Locb_enc_pop:
2969 lea 0xa0(%rsp),%rsp
2970 ___
# Common epilogue: pop the callee-saved registers in reverse push order.
2971 $code.=<<___;
2972 pop %r14
2973 pop %r13
2974 pop %r12
2975 pop %rbp
2976 pop %rbx
2977 .Locb_enc_epilogue:
2978 ret
2979 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
2980
# __ocb_encrypt6 -- internal helper: encrypt the six blocks held in
# inout0..inout5 in place.
# On entry: offset[5] = running offset xor round[last], offset[0] = L_0,
# i1/i3/i5 = byte offsets into the L_ table, rndkey1 = round[1],
# rax = twisted round counter.
# Every input block is xor-ed into checksum before it is encrypted.
# On return: offset[1..4] hold the per-block offsets xor round[last],
# offset[5] holds the new running offset xor round[last], offset[0] is
# reloaded with L_0, i1/i3/i5 and block_num are advanced by six blocks,
# rndkey1 is round[1] again and rax is restored from r10.
2981 .type __ocb_encrypt6,\@abi-omnipotent
2982 .align 32
2983 __ocb_encrypt6:
2984 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
2985 movdqu ($L_p,$i1),@offset[1]
2986 movdqa @offset[0],@offset[2]
2987 movdqu ($L_p,$i3),@offset[3]
2988 movdqa @offset[0],@offset[4]
2989 pxor @offset[5],@offset[0]
2990 movdqu ($L_p,$i5),@offset[5]
# chain the offsets: each one is the previous offset xor its L_ entry
2991 pxor @offset[0],@offset[1]
2992 pxor $inout0,$checksum # accumulate checksum
2993 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
2994 pxor @offset[1],@offset[2]
2995 pxor $inout1,$checksum
2996 pxor @offset[1],$inout1
2997 pxor @offset[2],@offset[3]
2998 pxor $inout2,$checksum
2999 pxor @offset[2],$inout2
3000 pxor @offset[3],@offset[4]
3001 pxor $inout3,$checksum
3002 pxor @offset[3],$inout3
3003 pxor @offset[4],@offset[5]
3004 pxor $inout4,$checksum
3005 pxor @offset[4],$inout4
3006 pxor $inout5,$checksum
3007 pxor @offset[5],$inout5
3008 $movkey 32($key_),$rndkey0
3009
# table indices for the following six blocks are prepared between rounds
3010 lea 1($block_num),$i1 # even-numbered blocks
3011 lea 3($block_num),$i3
3012 lea 5($block_num),$i5
3013 add \$6,$block_num
3014 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3015 bsf $i1,$i1 # ntz(block)
3016 bsf $i3,$i3
3017 bsf $i5,$i5
3018
3019 aesenc $rndkey1,$inout0
3020 aesenc $rndkey1,$inout1
3021 aesenc $rndkey1,$inout2
3022 aesenc $rndkey1,$inout3
3023 pxor $rndkey0l,@offset[1]
3024 pxor $rndkey0l,@offset[2]
3025 aesenc $rndkey1,$inout4
3026 pxor $rndkey0l,@offset[3]
3027 pxor $rndkey0l,@offset[4]
3028 aesenc $rndkey1,$inout5
3029 $movkey 48($key_),$rndkey1
3030 pxor $rndkey0l,@offset[5]
3031
3032 aesenc $rndkey0,$inout0
3033 aesenc $rndkey0,$inout1
3034 aesenc $rndkey0,$inout2
3035 aesenc $rndkey0,$inout3
3036 aesenc $rndkey0,$inout4
3037 aesenc $rndkey0,$inout5
3038 $movkey 64($key_),$rndkey0
3039 shl \$4,$i1 # ntz(block) -> table offset
3040 shl \$4,$i3
3041 jmp .Locb_enc_loop6
3042
# middle rounds, two per pass; rax is stepped by 32 until it reaches zero
3043 .align 32
3044 .Locb_enc_loop6:
3045 aesenc $rndkey1,$inout0
3046 aesenc $rndkey1,$inout1
3047 aesenc $rndkey1,$inout2
3048 aesenc $rndkey1,$inout3
3049 aesenc $rndkey1,$inout4
3050 aesenc $rndkey1,$inout5
3051 $movkey ($key,%rax),$rndkey1
3052 add \$32,%rax
3053
3054 aesenc $rndkey0,$inout0
3055 aesenc $rndkey0,$inout1
3056 aesenc $rndkey0,$inout2
3057 aesenc $rndkey0,$inout3
3058 aesenc $rndkey0,$inout4
3059 aesenc $rndkey0,$inout5
3060 $movkey -16($key,%rax),$rndkey0
3061 jnz .Locb_enc_loop6
3062
3063 aesenc $rndkey1,$inout0
3064 aesenc $rndkey1,$inout1
3065 aesenc $rndkey1,$inout2
3066 aesenc $rndkey1,$inout3
3067 aesenc $rndkey1,$inout4
3068 aesenc $rndkey1,$inout5
3069 $movkey 16($key_),$rndkey1
3070 shl \$4,$i5
3071
# last round: the offsets (already xor-ed with round[last]) act as round keys
3072 aesenclast @offset[0],$inout0
3073 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3074 mov %r10,%rax # restore twisted rounds
3075 aesenclast @offset[1],$inout1
3076 aesenclast @offset[2],$inout2
3077 aesenclast @offset[3],$inout3
3078 aesenclast @offset[4],$inout4
3079 aesenclast @offset[5],$inout5
3080 ret
3081 .size __ocb_encrypt6,.-__ocb_encrypt6
3082
# __ocb_encrypt4 -- internal helper: encrypt the four blocks held in
# inout0..inout3 in place, xor-ing each input into checksum first.
# Entry state matches __ocb_encrypt6 (offset[5], offset[0] = L_0, i1, i3,
# rndkey1 = round[1], rax = twisted round counter).
# Unlike __ocb_encrypt6 it neither advances block_num/i1/i3 nor reloads
# L_0 into offset[0], and it does not leave the next running offset in
# offset[5]; on return offset[0..3] hold the four block offsets xor
# round[last] and the callers in this file copy the one they need.
3083 .type __ocb_encrypt4,\@abi-omnipotent
3084 .align 32
3085 __ocb_encrypt4:
3086 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3087 movdqu ($L_p,$i1),@offset[1]
3088 movdqa @offset[0],@offset[2]
3089 movdqu ($L_p,$i3),@offset[3]
3090 pxor @offset[5],@offset[0]
3091 pxor @offset[0],@offset[1]
3092 pxor $inout0,$checksum # accumulate checksum
3093 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3094 pxor @offset[1],@offset[2]
3095 pxor $inout1,$checksum
3096 pxor @offset[1],$inout1
3097 pxor @offset[2],@offset[3]
3098 pxor $inout2,$checksum
3099 pxor @offset[2],$inout2
3100 pxor $inout3,$checksum
3101 pxor @offset[3],$inout3
3102 $movkey 32($key_),$rndkey0
3103
3104 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3105 pxor $rndkey0l,@offset[1]
3106 pxor $rndkey0l,@offset[2]
3107 pxor $rndkey0l,@offset[3]
3108
3109 aesenc $rndkey1,$inout0
3110 aesenc $rndkey1,$inout1
3111 aesenc $rndkey1,$inout2
3112 aesenc $rndkey1,$inout3
3113 $movkey 48($key_),$rndkey1
3114
3115 aesenc $rndkey0,$inout0
3116 aesenc $rndkey0,$inout1
3117 aesenc $rndkey0,$inout2
3118 aesenc $rndkey0,$inout3
3119 $movkey 64($key_),$rndkey0
3120 jmp .Locb_enc_loop4
3121
# middle rounds, two per pass; rax is stepped by 32 until it reaches zero
3122 .align 32
3123 .Locb_enc_loop4:
3124 aesenc $rndkey1,$inout0
3125 aesenc $rndkey1,$inout1
3126 aesenc $rndkey1,$inout2
3127 aesenc $rndkey1,$inout3
3128 $movkey ($key,%rax),$rndkey1
3129 add \$32,%rax
3130
3131 aesenc $rndkey0,$inout0
3132 aesenc $rndkey0,$inout1
3133 aesenc $rndkey0,$inout2
3134 aesenc $rndkey0,$inout3
3135 $movkey -16($key,%rax),$rndkey0
3136 jnz .Locb_enc_loop4
3137
3138 aesenc $rndkey1,$inout0
3139 aesenc $rndkey1,$inout1
3140 aesenc $rndkey1,$inout2
3141 aesenc $rndkey1,$inout3
3142 $movkey 16($key_),$rndkey1
3143 mov %r10,%rax # restore twisted rounds
3144
# last round: the offsets (already xor-ed with round[last]) act as round keys
3145 aesenclast @offset[0],$inout0
3146 aesenclast @offset[1],$inout1
3147 aesenclast @offset[2],$inout2
3148 aesenclast @offset[3],$inout3
3149 ret
3150 .size __ocb_encrypt4,.-__ocb_encrypt4
3151
# __ocb_encrypt1 -- internal helper: encrypt the single block in inout0.
# On entry inout5 holds the L_ table entry for this block and offset[5]
# the running offset xor round[last]; the two are combined to form the
# offset of this block.  The input is xor-ed into checksum first.
# On return inout0 holds the output block and inout5 the new offset xor
# round[last] (callers in this file copy it into offset[5]); rax is
# restored from r10.
3152 .type __ocb_encrypt1,\@abi-omnipotent
3153 .align 32
3154 __ocb_encrypt1:
3155 pxor @offset[5],$inout5 # offset_i
3156 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3157 pxor $inout0,$checksum # accumulate checksum
3158 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3159 $movkey 32($key_),$rndkey0
3160
3161 aesenc $rndkey1,$inout0
3162 $movkey 48($key_),$rndkey1
3163 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3164
3165 aesenc $rndkey0,$inout0
3166 $movkey 64($key_),$rndkey0
3167 jmp .Locb_enc_loop1
3168
# middle rounds, two per pass; rax is stepped by 32 until it reaches zero
3169 .align 32
3170 .Locb_enc_loop1:
3171 aesenc $rndkey1,$inout0
3172 $movkey ($key,%rax),$rndkey1
3173 add \$32,%rax
3174
3175 aesenc $rndkey0,$inout0
3176 $movkey -16($key,%rax),$rndkey0
3177 jnz .Locb_enc_loop1
3178
3179 aesenc $rndkey1,$inout0
3180 $movkey 16($key_),$rndkey1 # redundant in tail
3181 mov %r10,%rax # restore twisted rounds
3182
# last round: the offset (already xor-ed with round[last]) acts as round key
3183 aesenclast $inout5,$inout0
3184 ret
3185 .size __ocb_encrypt1,.-__ocb_encrypt1
3186
# aesni_ocb_decrypt: bulk OCB decryption of whole 16-byte blocks.
# Declared with six register arguments, two more (the L table pointer and
# the checksum pointer) are fetched from the caller stack further down.
# Strategy: if the first block number is even, one block is processed on
# its own so that the rest of the stream starts on an odd block number,
# then six blocks are handled per iteration, then a tail of 1..5 blocks.
# Every decrypted block is xor-ed into the running checksum, which is
# written back together with the last offset before returning.
3187 .globl aesni_ocb_decrypt
3188 .type aesni_ocb_decrypt,\@function,6
3189 .align 32
3190 aesni_ocb_decrypt:
3191 lea (%rsp),%rax
3192 push %rbx
3193 push %rbp
3194 push %r12
3195 push %r13
3196 push %r14
3197 ___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there; they are
# restored (and the save area wiped) in the epilogue below.
3198 $code.=<<___ if ($win64);
3199 lea -0xa0(%rsp),%rsp
3200 movaps %xmm6,0x00(%rsp) # offload everything
3201 movaps %xmm7,0x10(%rsp)
3202 movaps %xmm8,0x20(%rsp)
3203 movaps %xmm9,0x30(%rsp)
3204 movaps %xmm10,0x40(%rsp)
3205 movaps %xmm11,0x50(%rsp)
3206 movaps %xmm12,0x60(%rsp)
3207 movaps %xmm13,0x70(%rsp)
3208 movaps %xmm14,0x80(%rsp)
3209 movaps %xmm15,0x90(%rsp)
3210 .Locb_dec_body:
3211 ___
3212 $code.=<<___;
# %rax still holds the stack pointer value from entry, so the stack
# arguments are addressed relative to it.
3213 mov $seventh_arg(%rax),$L_p # 7th argument
3214 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3215
3216 mov 240($key),$rnds_
3217 mov $key,$key_
3218 shl \$4,$rnds_
3219 $movkey ($key),$rndkey0l # round[0]
3220 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3221
3222 movdqu ($offset_p),@offset[5] # load last offset_i
3223 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3224 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3225
# key is moved to point past the end of the schedule and the round counter
# becomes a negative byte index from there, a copy is kept in %r10 so the
# helpers can restore it after each run through the rounds.
3226 mov \$16+32,$rounds
3227 lea 32($key_,$rnds_),$key
3228 $movkey 16($key_),$rndkey1 # round[1]
3229 sub %r10,%rax # twisted $rounds
3230 mov %rax,%r10 # backup twisted $rounds
3231
3232 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3233 movdqu ($checksum_p),$checksum # load checksum
3234
3235 test \$1,$block_num # is first block number odd?
3236 jnz .Locb_dec_odd
3237
# Even first block: handle it alone, using the L table entry selected by
# the number of trailing zero bits of the block number.
3238 bsf $block_num,$i1
3239 add \$1,$block_num
3240 shl \$4,$i1
3241 movdqu ($L_p,$i1),$inout5 # borrow
3242 movdqu ($inp),$inout0
3243 lea 16($inp),$inp
3244
3245 call __ocb_decrypt1
3246
3247 movdqa $inout5,@offset[5]
3248 movups $inout0,($out)
3249 xorps $inout0,$checksum # accumulate checksum
3250 lea 16($out),$out
3251 sub \$1,$blocks
3252 jz .Locb_dec_done
3253
# From here the next block number is odd. Precompute the L table offsets
# for the three even-numbered blocks among the next six.
3254 .Locb_dec_odd:
3255 lea 1($block_num),$i1 # even-numbered blocks
3256 lea 3($block_num),$i3
3257 lea 5($block_num),$i5
3258 lea 6($block_num),$block_num
3259 bsf $i1,$i1 # ntz(block)
3260 bsf $i3,$i3
3261 bsf $i5,$i5
3262 shl \$4,$i1 # ntz(block) -> table offset
3263 shl \$4,$i3
3264 shl \$4,$i5
3265
3266 sub \$6,$blocks
3267 jc .Locb_dec_short
3268 jmp .Locb_dec_grandloop
3269
3270 .align 32
# Main loop: six blocks per iteration. The block counter is kept biased
# by -6, so the carry from the subtraction ends the loop.
3271 .Locb_dec_grandloop:
3272 movdqu `16*0`($inp),$inout0 # load input
3273 movdqu `16*1`($inp),$inout1
3274 movdqu `16*2`($inp),$inout2
3275 movdqu `16*3`($inp),$inout3
3276 movdqu `16*4`($inp),$inout4
3277 movdqu `16*5`($inp),$inout5
3278 lea `16*6`($inp),$inp
3279
3280 call __ocb_decrypt6
3281
3282 movups $inout0,`16*0`($out) # store output
3283 pxor $inout0,$checksum # accumulate checksum
3284 movups $inout1,`16*1`($out)
3285 pxor $inout1,$checksum
3286 movups $inout2,`16*2`($out)
3287 pxor $inout2,$checksum
3288 movups $inout3,`16*3`($out)
3289 pxor $inout3,$checksum
3290 movups $inout4,`16*4`($out)
3291 pxor $inout4,$checksum
3292 movups $inout5,`16*5`($out)
3293 pxor $inout5,$checksum
3294 lea `16*6`($out),$out
3295 sub \$6,$blocks
3296 jnc .Locb_dec_grandloop
3297
# Tail: undo the bias and dispatch on the 0..5 blocks that remain. Loads
# are interleaved with the compares, each branch only uses what was loaded.
3298 .Locb_dec_short:
3299 add \$6,$blocks
3300 jz .Locb_dec_done
3301
3302 movdqu `16*0`($inp),$inout0
3303 cmp \$2,$blocks
3304 jb .Locb_dec_one
3305 movdqu `16*1`($inp),$inout1
3306 je .Locb_dec_two
3307
3308 movdqu `16*2`($inp),$inout2
3309 cmp \$4,$blocks
3310 jb .Locb_dec_three
3311 movdqu `16*3`($inp),$inout3
3312 je .Locb_dec_four
3313
# Five blocks: run the six-block helper with a zeroed sixth input and keep
# the fifth offset as the final one, the sixth result is discarded.
3314 movdqu `16*4`($inp),$inout4
3315 pxor $inout5,$inout5
3316
3317 call __ocb_decrypt6
3318
3319 movdqa @offset[4],@offset[5]
3320 movups $inout0,`16*0`($out) # store output
3321 pxor $inout0,$checksum # accumulate checksum
3322 movups $inout1,`16*1`($out)
3323 pxor $inout1,$checksum
3324 movups $inout2,`16*2`($out)
3325 pxor $inout2,$checksum
3326 movups $inout3,`16*3`($out)
3327 pxor $inout3,$checksum
3328 movups $inout4,`16*4`($out)
3329 pxor $inout4,$checksum
3330
3331 jmp .Locb_dec_done
3332
3333 .align 16
# One block: the block number is odd here, so L_0 is the value to fold in.
3334 .Locb_dec_one:
3335 movdqa @offset[0],$inout5 # borrow
3336
3337 call __ocb_decrypt1
3338
3339 movdqa $inout5,@offset[5]
3340 movups $inout0,`16*0`($out) # store output
3341 xorps $inout0,$checksum # accumulate checksum
3342 jmp .Locb_dec_done
3343
3344 .align 16
# Two or three blocks: use the four-block helper with zeroed spare inputs
# and keep the offset of the last real block.
3345 .Locb_dec_two:
3346 pxor $inout2,$inout2
3347 pxor $inout3,$inout3
3348
3349 call __ocb_decrypt4
3350
3351 movdqa @offset[1],@offset[5]
3352 movups $inout0,`16*0`($out) # store output
3353 xorps $inout0,$checksum # accumulate checksum
3354 movups $inout1,`16*1`($out)
3355 xorps $inout1,$checksum
3356
3357 jmp .Locb_dec_done
3358
3359 .align 16
3360 .Locb_dec_three:
3361 pxor $inout3,$inout3
3362
3363 call __ocb_decrypt4
3364
3365 movdqa @offset[2],@offset[5]
3366 movups $inout0,`16*0`($out) # store output
3367 xorps $inout0,$checksum # accumulate checksum
3368 movups $inout1,`16*1`($out)
3369 xorps $inout1,$checksum
3370 movups $inout2,`16*2`($out)
3371 xorps $inout2,$checksum
3372
3373 jmp .Locb_dec_done
3374
3375 .align 16
3376 .Locb_dec_four:
3377 call __ocb_decrypt4
3378
3379 movdqa @offset[3],@offset[5]
3380 movups $inout0,`16*0`($out) # store output
3381 pxor $inout0,$checksum # accumulate checksum
3382 movups $inout1,`16*1`($out)
3383 pxor $inout1,$checksum
3384 movups $inout2,`16*2`($out)
3385 pxor $inout2,$checksum
3386 movups $inout3,`16*3`($out)
3387 pxor $inout3,$checksum
3388
# Common exit. The helpers leave the last round key in rndkey0 (their final
# key load reads the slot just below the adjusted key pointer), which is
# what strips round[last] from the stored offset here.
3389 .Locb_dec_done:
3390 pxor $rndkey0,@offset[5] # "remove" round[last]
3391 movdqu $checksum,($checksum_p) # store checksum
3392 movdqu @offset[5],($offset_p) # store last offset_i
3393
3394 xorps %xmm0,%xmm0 # clear register bank
3395 pxor %xmm1,%xmm1
3396 pxor %xmm2,%xmm2
3397 pxor %xmm3,%xmm3
3398 pxor %xmm4,%xmm4
3399 pxor %xmm5,%xmm5
3400 ___
# Non-Win64: the upper registers are simply cleared (no saved copies to
# restore).
3401 $code.=<<___ if (!$win64);
3402 pxor %xmm6,%xmm6
3403 pxor %xmm7,%xmm7
3404 pxor %xmm8,%xmm8
3405 pxor %xmm9,%xmm9
3406 pxor %xmm10,%xmm10
3407 pxor %xmm11,%xmm11
3408 pxor %xmm12,%xmm12
3409 pxor %xmm13,%xmm13
3410 pxor %xmm14,%xmm14
3411 pxor %xmm15,%xmm15
3412 ___
# Win64: restore %xmm6-%xmm15 from the save area and overwrite each slot
# with the already-zeroed %xmm0 so no register contents stay on the stack.
3413 $code.=<<___ if ($win64);
3414 movaps 0x00(%rsp),%xmm6
3415 movaps %xmm0,0x00(%rsp) # clear stack
3416 movaps 0x10(%rsp),%xmm7
3417 movaps %xmm0,0x10(%rsp)
3418 movaps 0x20(%rsp),%xmm8
3419 movaps %xmm0,0x20(%rsp)
3420 movaps 0x30(%rsp),%xmm9
3421 movaps %xmm0,0x30(%rsp)
3422 movaps 0x40(%rsp),%xmm10
3423 movaps %xmm0,0x40(%rsp)
3424 movaps 0x50(%rsp),%xmm11
3425 movaps %xmm0,0x50(%rsp)
3426 movaps 0x60(%rsp),%xmm12
3427 movaps %xmm0,0x60(%rsp)
3428 movaps 0x70(%rsp),%xmm13
3429 movaps %xmm0,0x70(%rsp)
3430 movaps 0x80(%rsp),%xmm14
3431 movaps %xmm0,0x80(%rsp)
3432 movaps 0x90(%rsp),%xmm15
3433 movaps %xmm0,0x90(%rsp)
3434 lea 0xa0+0x28(%rsp),%rax
3435 .Locb_dec_pop:
3436 lea 0xa0(%rsp),%rsp
3437 ___
3438 $code.=<<___;
3439 pop %r14
3440 pop %r13
3441 pop %r12
3442 pop %rbp
3443 pop %rbx
3444 .Locb_dec_epilogue:
3445 ret
3446 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3447
# __ocb_decrypt6: decrypt six consecutive OCB blocks (internal helper).
# in:  inout0..inout5 = ciphertext blocks
#      offset[0] = L_0, offset[5] = previous offset ^ round[last]
#      i1, i3, i5 = byte offsets into the L table for the three
#      even-numbered blocks, %rax = negative ("twisted") round counter
# out: inout0..inout5 = decrypted blocks
#      offset[0..5] = the six block offsets, each ^ round[last], except
#      that offset[0] is reloaded with L_0 before returning
#      i1, i3, i5 and the block number advanced for the next six blocks
# The offsets are chained: each one is the previous offset xor-ed with
# either L_0 (odd-numbered blocks) or the selected L table entry.
3448 .type __ocb_decrypt6,\@abi-omnipotent
3449 .align 32
3450 __ocb_decrypt6:
3451 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3452 movdqu ($L_p,$i1),@offset[1]
3453 movdqa @offset[0],@offset[2]
3454 movdqu ($L_p,$i3),@offset[3]
3455 movdqa @offset[0],@offset[4]
3456 pxor @offset[5],@offset[0]
3457 movdqu ($L_p,$i5),@offset[5]
3458 pxor @offset[0],@offset[1]
3459 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3460 pxor @offset[1],@offset[2]
3461 pxor @offset[1],$inout1
3462 pxor @offset[2],@offset[3]
3463 pxor @offset[2],$inout2
3464 pxor @offset[3],@offset[4]
3465 pxor @offset[3],$inout3
3466 pxor @offset[4],@offset[5]
3467 pxor @offset[4],$inout4
3468 pxor @offset[5],$inout5
3469 $movkey 32($key_),$rndkey0
3470
# The table indices for the NEXT group of six are computed here, in between
# the cipher rounds, the shifts that finish them are spread further down.
3471 lea 1($block_num),$i1 # even-numbered blocks
3472 lea 3($block_num),$i3
3473 lea 5($block_num),$i5
3474 add \$6,$block_num
3475 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3476 bsf $i1,$i1 # ntz(block)
3477 bsf $i3,$i3
3478 bsf $i5,$i5
3479
3480 aesdec $rndkey1,$inout0
3481 aesdec $rndkey1,$inout1
3482 aesdec $rndkey1,$inout2
3483 aesdec $rndkey1,$inout3
3484 pxor $rndkey0l,@offset[1]
3485 pxor $rndkey0l,@offset[2]
3486 aesdec $rndkey1,$inout4
3487 pxor $rndkey0l,@offset[3]
3488 pxor $rndkey0l,@offset[4]
3489 aesdec $rndkey1,$inout5
3490 $movkey 48($key_),$rndkey1
3491 pxor $rndkey0l,@offset[5]
3492
3493 aesdec $rndkey0,$inout0
3494 aesdec $rndkey0,$inout1
3495 aesdec $rndkey0,$inout2
3496 aesdec $rndkey0,$inout3
3497 aesdec $rndkey0,$inout4
3498 aesdec $rndkey0,$inout5
3499 $movkey 64($key_),$rndkey0
3500 shl \$4,$i1 # ntz(block) -> table offset
3501 shl \$4,$i3
3502 jmp .Locb_dec_loop6
3503
3504 .align 32
# Two rounds per iteration, %rax counts up by 32 towards zero. The jnz
# tests the result of the add, the key load in between leaves flags alone.
3505 .Locb_dec_loop6:
3506 aesdec $rndkey1,$inout0
3507 aesdec $rndkey1,$inout1
3508 aesdec $rndkey1,$inout2
3509 aesdec $rndkey1,$inout3
3510 aesdec $rndkey1,$inout4
3511 aesdec $rndkey1,$inout5
3512 $movkey ($key,%rax),$rndkey1
3513 add \$32,%rax
3514
3515 aesdec $rndkey0,$inout0
3516 aesdec $rndkey0,$inout1
3517 aesdec $rndkey0,$inout2
3518 aesdec $rndkey0,$inout3
3519 aesdec $rndkey0,$inout4
3520 aesdec $rndkey0,$inout5
3521 $movkey -16($key,%rax),$rndkey0
3522 jnz .Locb_dec_loop6
3523
3524 aesdec $rndkey1,$inout0
3525 aesdec $rndkey1,$inout1
3526 aesdec $rndkey1,$inout2
3527 aesdec $rndkey1,$inout3
3528 aesdec $rndkey1,$inout4
3529 aesdec $rndkey1,$inout5
3530 $movkey 16($key_),$rndkey1
3531 shl \$4,$i5
3532
# Final round: each offset already carries round[last], so it serves as
# the last round key for its block.
3533 aesdeclast @offset[0],$inout0
3534 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3535 mov %r10,%rax # restore twisted rounds
3536 aesdeclast @offset[1],$inout1
3537 aesdeclast @offset[2],$inout2
3538 aesdeclast @offset[3],$inout3
3539 aesdeclast @offset[4],$inout4
3540 aesdeclast @offset[5],$inout5
3541 ret
3542 .size __ocb_decrypt6,.-__ocb_decrypt6
3543
# __ocb_decrypt4: decrypt up to four OCB blocks (internal helper used for
# the 2, 3 and 4 block tails).
# in:  inout0..inout3 = ciphertext blocks (unused ones zeroed by the caller)
#      offset[0] = L_0, offset[5] = previous offset ^ round[last]
#      i1, i3 = byte offsets into the L table, %rax = twisted round counter
# out: inout0..inout3 = decrypted blocks
#      offset[0..3] = the four block offsets, each ^ round[last]
# Unlike the six-block helper this one does not advance the block number,
# recompute the table indices or reload L_0.
3544 .type __ocb_decrypt4,\@abi-omnipotent
3545 .align 32
3546 __ocb_decrypt4:
3547 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3548 movdqu ($L_p,$i1),@offset[1]
3549 movdqa @offset[0],@offset[2]
3550 movdqu ($L_p,$i3),@offset[3]
3551 pxor @offset[5],@offset[0]
3552 pxor @offset[0],@offset[1]
3553 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3554 pxor @offset[1],@offset[2]
3555 pxor @offset[1],$inout1
3556 pxor @offset[2],@offset[3]
3557 pxor @offset[2],$inout2
3558 pxor @offset[3],$inout3
3559 $movkey 32($key_),$rndkey0
3560
3561 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3562 pxor $rndkey0l,@offset[1]
3563 pxor $rndkey0l,@offset[2]
3564 pxor $rndkey0l,@offset[3]
3565
3566 aesdec $rndkey1,$inout0
3567 aesdec $rndkey1,$inout1
3568 aesdec $rndkey1,$inout2
3569 aesdec $rndkey1,$inout3
3570 $movkey 48($key_),$rndkey1
3571
3572 aesdec $rndkey0,$inout0
3573 aesdec $rndkey0,$inout1
3574 aesdec $rndkey0,$inout2
3575 aesdec $rndkey0,$inout3
3576 $movkey 64($key_),$rndkey0
3577 jmp .Locb_dec_loop4
3578
3579 .align 32
# Two rounds per iteration, %rax counts up by 32 towards zero.
3580 .Locb_dec_loop4:
3581 aesdec $rndkey1,$inout0
3582 aesdec $rndkey1,$inout1
3583 aesdec $rndkey1,$inout2
3584 aesdec $rndkey1,$inout3
3585 $movkey ($key,%rax),$rndkey1
3586 add \$32,%rax
3587
3588 aesdec $rndkey0,$inout0
3589 aesdec $rndkey0,$inout1
3590 aesdec $rndkey0,$inout2
3591 aesdec $rndkey0,$inout3
3592 $movkey -16($key,%rax),$rndkey0
3593 jnz .Locb_dec_loop4
3594
3595 aesdec $rndkey1,$inout0
3596 aesdec $rndkey1,$inout1
3597 aesdec $rndkey1,$inout2
3598 aesdec $rndkey1,$inout3
3599 $movkey 16($key_),$rndkey1
3600 mov %r10,%rax # restore twisted rounds
3601
# Final round keyed by the per-block offsets (which include round[last]).
3602 aesdeclast @offset[0],$inout0
3603 aesdeclast @offset[1],$inout1
3604 aesdeclast @offset[2],$inout2
3605 aesdeclast @offset[3],$inout3
3606 ret
3607 .size __ocb_decrypt4,.-__ocb_decrypt4
3608
# __ocb_decrypt1: decrypt one OCB block (internal helper).
# in:  inout0 = ciphertext block
#      inout5 = L value to fold into the offset (borrowed by the caller)
#      offset[5] = previous offset ^ round[last]
#      %rax = negative ("twisted") round counter, %r10 = its backup
# out: inout0 = decrypted block
#      inout5 = new offset ^ round[last], the caller copies it to offset[5]
# The checksum is not touched here, the caller accumulates the output.
3609 .type __ocb_decrypt1,\@abi-omnipotent
3610 .align 32
3611 __ocb_decrypt1:
3612 pxor @offset[5],$inout5 # offset_i
3613 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3614 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3615 $movkey 32($key_),$rndkey0
3616
3617 aesdec $rndkey1,$inout0
3618 $movkey 48($key_),$rndkey1
3619 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3620
3621 aesdec $rndkey0,$inout0
3622 $movkey 64($key_),$rndkey0
3623 jmp .Locb_dec_loop1
3624
3625 .align 32
# Two rounds per iteration, %rax counts up by 32 towards zero.
3626 .Locb_dec_loop1:
3627 aesdec $rndkey1,$inout0
3628 $movkey ($key,%rax),$rndkey1
3629 add \$32,%rax
3630
3631 aesdec $rndkey0,$inout0
3632 $movkey -16($key,%rax),$rndkey0
3633 jnz .Locb_dec_loop1
3634
3635 aesdec $rndkey1,$inout0
3636 $movkey 16($key_),$rndkey1 # redundant in tail
3637 mov %r10,%rax # restore twisted rounds
3638
# Final round keyed by the offset, which already includes round[last].
3639 aesdeclast $inout5,$inout0
3640 ret
3641 .size __ocb_decrypt1,.-__ocb_decrypt1
3642 ___
3643 } }}
3644 \f
3645 ########################################################################
3646 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3647 # size_t length, const AES_KEY *key,
3648 # unsigned char *ivp,const int enc);
3649 {
# Generator for ${PREFIX}_cbc_encrypt (prototype in the comment above).
# $frame_size: 0x10 bytes of scratch at (%rsp), plus 0xa0 bytes on Win64
# where %xmm6-%xmm15 are saved at 0x10(%rsp)..0xa0(%rsp).
3650 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
# %xmm10-%xmm15: running IV and saved copies of ciphertext blocks, which
# become the chaining values for the blocks that follow them.
3651 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
# $inp_ shares a register with $key_; the 8-block decrypt loop overwrites
# it and uses it as a look-ahead input pointer.
3652 my $inp_=$key_;
3653
3654 $code.=<<___;
3655 .globl ${PREFIX}_cbc_encrypt
3656 .type ${PREFIX}_cbc_encrypt,\@function,6
3657 .align 16
3658 ${PREFIX}_cbc_encrypt:
3659 test $len,$len # check length
3660 jz .Lcbc_ret
3661
3662 mov 240($key),$rnds_ # key->rounds
3663 mov $key,$key_ # backup $key
3664 test %r9d,%r9d # 6th argument
3665 jz .Lcbc_decrypt
3666 #--------------------------- CBC ENCRYPT ------------------------------#
# Encryption is inherently serial: one block per iteration, with the
# previous ciphertext block kept in inout0 as the chaining value.
3667 movups ($ivp),$inout0 # load iv as initial state
3668 mov $rnds_,$rounds
3669 cmp \$16,$len
3670 jb .Lcbc_enc_tail
3671 sub \$16,$len
3672 jmp .Lcbc_enc_loop
3673 .align 16
3674 .Lcbc_enc_loop:
3675 movups ($inp),$inout1 # load input
3676 lea 16($inp),$inp
3677 #xorps $inout1,$inout0
3678 ___
# Emit the single-block encrypt sequence. aesni_generate1 is defined
# outside this excerpt; NOTE(review): passing $inout1 presumably makes it
# fold the input block into the state, replacing the commented-out xorps
# above - confirm against its definition.
3679 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3680 $code.=<<___;
3681 mov $rnds_,$rounds # restore $rounds
3682 mov $key_,$key # restore $key
3683 movups $inout0,0($out) # store output
3684 lea 16($out),$out
3685 sub \$16,$len
3686 jnc .Lcbc_enc_loop
3687 add \$16,$len
3688 jnz .Lcbc_enc_tail
3689 pxor $rndkey0,$rndkey0 # clear register bank
3690 pxor $rndkey1,$rndkey1
3691 movups $inout0,($ivp)
3692 pxor $inout0,$inout0
3693 pxor $inout1,$inout1
3694 jmp .Lcbc_ret
3695
# Partial final block: copy the remaining input bytes to the output buffer,
# zero-fill up to 16 bytes, then point both input and output at that block
# and take one more trip through the loop to encrypt it in place.
3696 .Lcbc_enc_tail:
3697 mov $len,%rcx # zaps $key
3698 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3699 .long 0x9066A4F3 # rep movsb
3700 mov \$16,%ecx # zero tail
3701 sub $len,%rcx
3702 xor %eax,%eax
3703 .long 0x9066AAF3 # rep stosb
3704 lea -16(%rdi),%rdi # rewind $out by 1 block
3705 mov $rnds_,$rounds # restore $rounds
3706 mov %rdi,%rsi # $inp and $out are the same
3707 mov $key_,$key # restore $key
3708 xor $len,$len # len=16
3709 jmp .Lcbc_enc_loop # one more spin
3710 \f#--------------------------- CBC DECRYPT ------------------------------#
3711 .align 16
3712 .Lcbc_decrypt:
3713 cmp \$16,$len
3714 jne .Lcbc_decrypt_bulk
3715
3716 # handle single block without allocating stack frame,
3717 # useful in ciphertext stealing mode
3718 movdqu ($inp),$inout0 # load input
3719 movdqu ($ivp),$inout1 # load iv
3720 movdqa $inout0,$inout2 # future iv
3721 ___
# Single-block decrypt sequence (helper defined outside this excerpt).
3722 &aesni_generate1("dec",$key,$rnds_);
3723 $code.=<<___;
3724 pxor $rndkey0,$rndkey0 # clear register bank
3725 pxor $rndkey1,$rndkey1
3726 movdqu $inout2,($ivp) # store iv
3727 xorps $inout1,$inout0 # ^=iv
3728 pxor $inout1,$inout1
3729 movups $inout0,($out) # store output
3730 pxor $inout0,$inout0
3731 jmp .Lcbc_ret
3732 .align 16
# Bulk decrypt: set up an aligned frame. %rax keeps the stack pointer value
# from before the push so that %rbp can be derived from it further down.
3733 .Lcbc_decrypt_bulk:
3734 lea (%rsp),%rax
3735 push %rbp
3736 sub \$$frame_size,%rsp
3737 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3738 ___
3739 $code.=<<___ if ($win64);
3740 movaps %xmm6,0x10(%rsp)
3741 movaps %xmm7,0x20(%rsp)
3742 movaps %xmm8,0x30(%rsp)
3743 movaps %xmm9,0x40(%rsp)
3744 movaps %xmm10,0x50(%rsp)
3745 movaps %xmm11,0x60(%rsp)
3746 movaps %xmm12,0x70(%rsp)
3747 movaps %xmm13,0x80(%rsp)
3748 movaps %xmm14,0x90(%rsp)
3749 movaps %xmm15,0xa0(%rsp)
3750 .Lcbc_decrypt_body:
3751 ___
3752 $code.=<<___;
# %rbp points at the saved %rbp slot, the epilogue reloads %rsp from it.
3753 lea -8(%rax),%rbp
3754 movups ($ivp),$iv
3755 mov $rnds_,$rounds
3756 cmp \$0x50,$len
3757 jbe .Lcbc_dec_tail
3758
# More than five blocks: preload six and keep copies of the first five
# ciphertext blocks, they are the chaining values for blocks two to six.
3759 $movkey ($key),$rndkey0
3760 movdqu 0x00($inp),$inout0 # load input
3761 movdqu 0x10($inp),$inout1
3762 movdqa $inout0,$in0
3763 movdqu 0x20($inp),$inout2
3764 movdqa $inout1,$in1
3765 movdqu 0x30($inp),$inout3
3766 movdqa $inout2,$in2
3767 movdqu 0x40($inp),$inout4
3768 movdqa $inout3,$in3
3769 movdqu 0x50($inp),$inout5
3770 movdqa $inout4,$in4
3771 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3772 cmp \$0x70,$len
3773 jbe .Lcbc_dec_six_or_seven
3774
3775 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3776 sub \$0x50,$len # $len is biased by -5*16
3777 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3778 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3779 sub \$0x20,$len # $len is biased by -7*16
3780 lea 0x70($key),$key # size optimization
3781 jmp .Lcbc_dec_loop8_enter
3782 .align 16
# Eight-block loop. The eighth result of the previous iteration is stored
# at the top, the first seven are stored before the loop branch.
3783 .Lcbc_dec_loop8:
3784 movups $inout7,($out)
3785 lea 0x10($out),$out
3786 .Lcbc_dec_loop8_enter:
3787 movdqu 0x60($inp),$inout6
3788 pxor $rndkey0,$inout0
3789 movdqu 0x70($inp),$inout7
3790 pxor $rndkey0,$inout1
3791 $movkey 0x10-0x70($key),$rndkey1
3792 pxor $rndkey0,$inout2
3793 xor $inp_,$inp_
3794 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3795 pxor $rndkey0,$inout3
3796 pxor $rndkey0,$inout4
3797 pxor $rndkey0,$inout5
3798 pxor $rndkey0,$inout6
3799
3800 aesdec $rndkey1,$inout0
3801 pxor $rndkey0,$inout7
3802 $movkey 0x20-0x70($key),$rndkey0
3803 aesdec $rndkey1,$inout1
3804 aesdec $rndkey1,$inout2
3805 aesdec $rndkey1,$inout3
3806 aesdec $rndkey1,$inout4
3807 aesdec $rndkey1,$inout5
3808 aesdec $rndkey1,$inout6
# The look-ahead pointer becomes inp+0x80 when the compare above found
# enough data ahead, otherwise it stays at inp. NOTE(review): presumably
# this keeps the preloads after the final round from reading past the end
# of the input - confirm.
3809 setnc ${inp_}b
3810 shl \$7,$inp_
3811 aesdec $rndkey1,$inout7
3812 add $inp,$inp_
3813 $movkey 0x30-0x70($key),$rndkey1
3814 ___
# Unrolled middle rounds, alternating between the two round-key registers.
# A single "cmp \$11,$rounds" is emitted at $i==7 and its flags are reused:
# "jb" leaves early for the shortest schedule, "je" (after $i==9) for the
# middle one, and the unconditional "jmp" (after $i==11) for the longest.
# Per the set_encrypt_key code later in this file the stored rounds value
# is 9, 11 or 13 for 128-, 192- and 256-bit keys respectively.
3815 for($i=1;$i<12;$i++) {
3816 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3817 $code.=<<___ if ($i==7);
3818 cmp \$11,$rounds
3819 ___
3820 $code.=<<___;
3821 aesdec $rndkeyx,$inout0
3822 aesdec $rndkeyx,$inout1
3823 aesdec $rndkeyx,$inout2
3824 aesdec $rndkeyx,$inout3
3825 aesdec $rndkeyx,$inout4
3826 aesdec $rndkeyx,$inout5
3827 aesdec $rndkeyx,$inout6
3828 aesdec $rndkeyx,$inout7
3829 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3830 ___
3831 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3832 nop
3833 ___
3834 $code.=<<___ if ($i==7);
3835 jb .Lcbc_dec_done
3836 ___
3837 $code.=<<___ if ($i==9);
3838 je .Lcbc_dec_done
3839 ___
3840 $code.=<<___ if ($i==11);
3841 jmp .Lcbc_dec_done
3842 ___
3843 }
3844 $code.=<<___;
3845 .align 16
# Last two rounds of the eight-block batch. The chaining values are xor-ed
# with the final round key first, so the closing instruction applies the
# last round and the CBC xor together. Input for the next iteration is
# preloaded through the look-ahead pointer in between.
3846 .Lcbc_dec_done:
3847 aesdec $rndkey1,$inout0
3848 aesdec $rndkey1,$inout1
3849 pxor $rndkey0,$iv
3850 pxor $rndkey0,$in0
3851 aesdec $rndkey1,$inout2
3852 aesdec $rndkey1,$inout3
3853 pxor $rndkey0,$in1
3854 pxor $rndkey0,$in2
3855 aesdec $rndkey1,$inout4
3856 aesdec $rndkey1,$inout5
3857 pxor $rndkey0,$in3
3858 pxor $rndkey0,$in4
3859 aesdec $rndkey1,$inout6
3860 aesdec $rndkey1,$inout7
3861 movdqu 0x50($inp),$rndkey1
3862
3863 aesdeclast $iv,$inout0
3864 movdqu 0x60($inp),$iv # borrow $iv
3865 pxor $rndkey0,$rndkey1
3866 aesdeclast $in0,$inout1
3867 pxor $rndkey0,$iv
3868 movdqu 0x70($inp),$rndkey0 # next IV
3869 aesdeclast $in1,$inout2
3870 lea 0x80($inp),$inp
3871 movdqu 0x00($inp_),$in0
3872 aesdeclast $in2,$inout3
3873 aesdeclast $in3,$inout4
3874 movdqu 0x10($inp_),$in1
3875 movdqu 0x20($inp_),$in2
3876 aesdeclast $in4,$inout5
3877 aesdeclast $rndkey1,$inout6
3878 movdqu 0x30($inp_),$in3
3879 movdqu 0x40($inp_),$in4
3880 aesdeclast $iv,$inout7
3881 movdqa $rndkey0,$iv # return $iv
3882 movdqu 0x50($inp_),$rndkey1
3883 $movkey -0x70($key),$rndkey0
3884
3885 movups $inout0,($out) # store output
3886 movdqa $in0,$inout0
3887 movups $inout1,0x10($out)
3888 movdqa $in1,$inout1
3889 movups $inout2,0x20($out)
3890 movdqa $in2,$inout2
3891 movups $inout3,0x30($out)
3892 movdqa $in3,$inout3
3893 movups $inout4,0x40($out)
3894 movdqa $in4,$inout4
3895 movups $inout5,0x50($out)
3896 movdqa $rndkey1,$inout5
3897 movups $inout6,0x60($out)
3898 lea 0x70($out),$out
3899
3900 sub \$0x80,$len
3901 ja .Lcbc_dec_loop8
3902
# Leaving the eight-block loop: undo the key pointer and length bias, then
# either finish with the pending eighth block or continue with the tails.
3903 movaps $inout7,$inout0
3904 lea -0x70($key),$key
3905 add \$0x70,$len
3906 jle .Lcbc_dec_clear_tail_collected
3907 movups $inout7,($out)
3908 lea 0x10($out),$out
3909 cmp \$0x50,$len
3910 jbe .Lcbc_dec_tail
3911
3912 movaps $in0,$inout0
3913 .Lcbc_dec_six_or_seven:
3914 cmp \$0x60,$len
3915 ja .Lcbc_dec_seven
3916
# Six blocks: the sixth ciphertext block is parked in inout6 because it is
# the IV to hand back.
3917 movaps $inout5,$inout6
3918 call _aesni_decrypt6
3919 pxor $iv,$inout0 # ^= IV
3920 movaps $inout6,$iv
3921 pxor $in0,$inout1
3922 movdqu $inout0,($out)
3923 pxor $in1,$inout2
3924 movdqu $inout1,0x10($out)
3925 pxor $inout1,$inout1 # clear register bank
3926 pxor $in2,$inout3
3927 movdqu $inout2,0x20($out)
3928 pxor $inout2,$inout2
3929 pxor $in3,$inout4
3930 movdqu $inout3,0x30($out)
3931 pxor $inout3,$inout3
3932 pxor $in4,$inout5
3933 movdqu $inout4,0x40($out)
3934 pxor $inout4,$inout4
3935 lea 0x50($out),$out
3936 movdqa $inout5,$inout0
3937 pxor $inout5,$inout5
3938 jmp .Lcbc_dec_tail_collected
3939
3940 .align 16
# Seven blocks: run the eight-block routine with a zeroed eighth input.
# The sixth and seventh ciphertext blocks are re-read from the input
# afterwards, as chaining value and as the IV to hand back.
3941 .Lcbc_dec_seven:
3942 movups 0x60($inp),$inout6
3943 xorps $inout7,$inout7
3944 call _aesni_decrypt8
3945 movups 0x50($inp),$inout7
3946 pxor $iv,$inout0 # ^= IV
3947 movups 0x60($inp),$iv
3948 pxor $in0,$inout1
3949 movdqu $inout0,($out)
3950 pxor $in1,$inout2
3951 movdqu $inout1,0x10($out)
3952 pxor $inout1,$inout1 # clear register bank
3953 pxor $in2,$inout3
3954 movdqu $inout2,0x20($out)
3955 pxor $inout2,$inout2
3956 pxor $in3,$inout4
3957 movdqu $inout3,0x30($out)
3958 pxor $inout3,$inout3
3959 pxor $in4,$inout5
3960 movdqu $inout4,0x40($out)
3961 pxor $inout4,$inout4
3962 pxor $inout7,$inout6
3963 movdqu $inout5,0x50($out)
3964 pxor $inout5,$inout5
3965 lea 0x60($out),$out
3966 movdqa $inout6,$inout0
3967 pxor $inout6,$inout6
3968 pxor $inout7,$inout7
3969 jmp .Lcbc_dec_tail_collected
3970
3971 .align 16
# Six-block loop, taken instead of the eight-block loop on the CPU type
# selected by the capability test above. The sixth result of the previous
# iteration is stored at the top.
3972 .Lcbc_dec_loop6:
3973 movups $inout5,($out)
3974 lea 0x10($out),$out
3975 movdqu 0x00($inp),$inout0 # load input
3976 movdqu 0x10($inp),$inout1
3977 movdqa $inout0,$in0
3978 movdqu 0x20($inp),$inout2
3979 movdqa $inout1,$in1
3980 movdqu 0x30($inp),$inout3
3981 movdqa $inout2,$in2
3982 movdqu 0x40($inp),$inout4
3983 movdqa $inout3,$in3
3984 movdqu 0x50($inp),$inout5
3985 movdqa $inout4,$in4
3986 .Lcbc_dec_loop6_enter:
3987 lea 0x60($inp),$inp
3988 movdqa $inout5,$inout6
3989
3990 call _aesni_decrypt6
3991
3992 pxor $iv,$inout0 # ^= IV
3993 movdqa $inout6,$iv
3994 pxor $in0,$inout1
3995 movdqu $inout0,($out)
3996 pxor $in1,$inout2
3997 movdqu $inout1,0x10($out)
3998 pxor $in2,$inout3
3999 movdqu $inout2,0x20($out)
4000 pxor $in3,$inout4
4001 mov $key_,$key
4002 movdqu $inout3,0x30($out)
4003 pxor $in4,$inout5
4004 mov $rnds_,$rounds
4005 movdqu $inout4,0x40($out)
4006 lea 0x50($out),$out
4007 sub \$0x60,$len
4008 ja .Lcbc_dec_loop6
4009
4010 movdqa $inout5,$inout0
4011 add \$0x50,$len
4012 jle .Lcbc_dec_clear_tail_collected
4013 movups $inout5,($out)
4014 lea 0x10($out),$out
4015
# Tail of one to five blocks: load and count down one block at a time,
# branching to the matching handler. Each handler leaves its last decrypted
# block in inout0 and the IV to hand back in the iv register.
4016 .Lcbc_dec_tail:
4017 movups ($inp),$inout0
4018 sub \$0x10,$len
4019 jbe .Lcbc_dec_one # $len is 1*16 or less
4020
4021 movups 0x10($inp),$inout1
4022 movaps $inout0,$in0
4023 sub \$0x10,$len
4024 jbe .Lcbc_dec_two # $len is 2*16 or less
4025
4026 movups 0x20($inp),$inout2
4027 movaps $inout1,$in1
4028 sub \$0x10,$len
4029 jbe .Lcbc_dec_three # $len is 3*16 or less
4030
4031 movups 0x30($inp),$inout3
4032 movaps $inout2,$in2
4033 sub \$0x10,$len
4034 jbe .Lcbc_dec_four # $len is 4*16 or less
4035
4036 movups 0x40($inp),$inout4 # $len is 5*16 or less
4037 movaps $inout3,$in3
4038 movaps $inout4,$in4
4039 xorps $inout5,$inout5
4040 call _aesni_decrypt6
4041 pxor $iv,$inout0
4042 movaps $in4,$iv
4043 pxor $in0,$inout1
4044 movdqu $inout0,($out)
4045 pxor $in1,$inout2
4046 movdqu $inout1,0x10($out)
4047 pxor $inout1,$inout1 # clear register bank
4048 pxor $in2,$inout3
4049 movdqu $inout2,0x20($out)
4050 pxor $inout2,$inout2
4051 pxor $in3,$inout4
4052 movdqu $inout3,0x30($out)
4053 pxor $inout3,$inout3
4054 lea 0x40($out),$out
4055 movdqa $inout4,$inout0
4056 pxor $inout4,$inout4
4057 pxor $inout5,$inout5
4058 sub \$0x10,$len
4059 jmp .Lcbc_dec_tail_collected
4060
4061 .align 16
4062 .Lcbc_dec_one:
4063 movaps $inout0,$in0
4064 ___
# Single-block decrypt sequence (helper defined outside this excerpt).
4065 &aesni_generate1("dec",$key,$rounds);
4066 $code.=<<___;
4067 xorps $iv,$inout0
4068 movaps $in0,$iv
4069 jmp .Lcbc_dec_tail_collected
4070 .align 16
4071 .Lcbc_dec_two:
4072 movaps $inout1,$in1
4073 call _aesni_decrypt2
4074 pxor $iv,$inout0
4075 movaps $in1,$iv
4076 pxor $in0,$inout1
4077 movdqu $inout0,($out)
4078 movdqa $inout1,$inout0
4079 pxor $inout1,$inout1 # clear register bank
4080 lea 0x10($out),$out
4081 jmp .Lcbc_dec_tail_collected
4082 .align 16
4083 .Lcbc_dec_three:
4084 movaps $inout2,$in2
4085 call _aesni_decrypt3
4086 pxor $iv,$inout0
4087 movaps $in2,$iv
4088 pxor $in0,$inout1
4089 movdqu $inout0,($out)
4090 pxor $in1,$inout2
4091 movdqu $inout1,0x10($out)
4092 pxor $inout1,$inout1 # clear register bank
4093 movdqa $inout2,$inout0
4094 pxor $inout2,$inout2
4095 lea 0x20($out),$out
4096 jmp .Lcbc_dec_tail_collected
4097 .align 16
4098 .Lcbc_dec_four:
4099 movaps $inout3,$in3
4100 call _aesni_decrypt4
4101 pxor $iv,$inout0
4102 movaps $in3,$iv
4103 pxor $in0,$inout1
4104 movdqu $inout0,($out)
4105 pxor $in1,$inout2
4106 movdqu $inout1,0x10($out)
4107 pxor $inout1,$inout1 # clear register bank
4108 pxor $in2,$inout3
4109 movdqu $inout2,0x20($out)
4110 pxor $inout2,$inout2
4111 movdqa $inout3,$inout0
4112 pxor $inout3,$inout3
4113 lea 0x30($out),$out
4114 jmp .Lcbc_dec_tail_collected
4115
4116 .align 16
# Reached straight from the bulk loops, which have not wiped their working
# registers yet. On Win64 the upper four are skipped here because they are
# reloaded from the stack in the epilogue.
4117 .Lcbc_dec_clear_tail_collected:
4118 pxor $inout1,$inout1 # clear register bank
4119 pxor $inout2,$inout2
4120 pxor $inout3,$inout3
4121 ___
4122 $code.=<<___ if (!$win64);
4123 pxor $inout4,$inout4 # %xmm6..9
4124 pxor $inout5,$inout5
4125 pxor $inout6,$inout6
4126 pxor $inout7,$inout7
4127 ___
4128 $code.=<<___;
# Common finish: hand back the IV, then write the pending last block either
# whole or, when the length is not a multiple of 16, through the stack
# scratch slot with a byte copy.
4129 .Lcbc_dec_tail_collected:
4130 movups $iv,($ivp)
4131 and \$15,$len
4132 jnz .Lcbc_dec_tail_partial
4133 movups $inout0,($out)
4134 pxor $inout0,$inout0
4135 jmp .Lcbc_dec_ret
4136 .align 16
4137 .Lcbc_dec_tail_partial:
4138 movaps $inout0,(%rsp)
4139 pxor $inout0,$inout0
4140 mov \$16,%rcx
4141 mov $out,%rdi
4142 sub $len,%rcx
4143 lea (%rsp),%rsi
4144 .long 0x9066A4F3 # rep movsb
4145 movdqa $inout0,(%rsp)
4146
4147 .Lcbc_dec_ret:
4148 xorps $rndkey0,$rndkey0 # %xmm0
4149 pxor $rndkey1,$rndkey1
4150 ___
# Win64: restore %xmm6-%xmm15 and overwrite each save slot with the zeroed
# %xmm0.
4151 $code.=<<___ if ($win64);
4152 movaps 0x10(%rsp),%xmm6
4153 movaps %xmm0,0x10(%rsp) # clear stack
4154 movaps 0x20(%rsp),%xmm7
4155 movaps %xmm0,0x20(%rsp)
4156 movaps 0x30(%rsp),%xmm8
4157 movaps %xmm0,0x30(%rsp)
4158 movaps 0x40(%rsp),%xmm9
4159 movaps %xmm0,0x40(%rsp)
4160 movaps 0x50(%rsp),%xmm10
4161 movaps %xmm0,0x50(%rsp)
4162 movaps 0x60(%rsp),%xmm11
4163 movaps %xmm0,0x60(%rsp)
4164 movaps 0x70(%rsp),%xmm12
4165 movaps %xmm0,0x70(%rsp)
4166 movaps 0x80(%rsp),%xmm13
4167 movaps %xmm0,0x80(%rsp)
4168 movaps 0x90(%rsp),%xmm14
4169 movaps %xmm0,0x90(%rsp)
4170 movaps 0xa0(%rsp),%xmm15
4171 movaps %xmm0,0xa0(%rsp)
4172 ___
4173 $code.=<<___;
4174 lea (%rbp),%rsp
4175 pop %rbp
4176 .Lcbc_ret:
4177 ret
4178 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4179 ___
4180 } \f
4181 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4182 # int bits, AES_KEY *key)
4183 #
4184 # input: $inp user-supplied key
4185 # $bits $inp length in bits
4186 # $key pointer to key schedule
4187 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4188 # *$key key schedule
4189 #
4190 { my ($inp,$bits,$key) = @_4args;
# Rewrite the 64-bit register name held in $bits to its 32-bit form
# (%r.. -> %e..), since the generated code treats bits as a 32-bit value.
4191 $bits =~ s/%r/%e/;
4192
4193 $code.=<<___;
# Builds the decryption key schedule: first expand the encryption schedule
# in place, then reverse the order of the round keys and run every key
# except the first and last through the inverse-mix-columns instruction.
4194 .globl ${PREFIX}_set_decrypt_key
4195 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4196 .align 16
4197 ${PREFIX}_set_decrypt_key:
4198 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4199 call __aesni_set_encrypt_key
4200 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4201 test %eax,%eax
4202 jnz .Ldec_key_ret
4203 lea 16($key,$bits),$inp # points at the end of key schedule
4204
# The outermost pair is exchanged without transformation.
4205 $movkey ($key),%xmm0 # just swap
4206 $movkey ($inp),%xmm1
4207 $movkey %xmm0,($inp)
4208 $movkey %xmm1,($key)
4209 lea 16($key),$key
4210 lea -16($inp),$inp
4211
# Walk both pointers towards the middle, transforming and exchanging one
# pair of round keys per iteration until they meet.
4212 .Ldec_key_inverse:
4213 $movkey ($key),%xmm0 # swap and inverse
4214 $movkey ($inp),%xmm1
4215 aesimc %xmm0,%xmm0
4216 aesimc %xmm1,%xmm1
4217 lea 16($key),$key
4218 lea -16($inp),$inp
4219 $movkey %xmm0,16($inp)
4220 $movkey %xmm1,-16($key)
4221 cmp $key,$inp
4222 ja .Ldec_key_inverse
4223
# The middle key has no partner and is transformed in place. Both scratch
# registers are zeroed before returning so no key material is left in them.
4224 $movkey ($key),%xmm0 # inverse middle
4225 aesimc %xmm0,%xmm0
4226 pxor %xmm1,%xmm1
4227 $movkey %xmm0,($inp)
4228 pxor %xmm0,%xmm0
# The return value in %eax is whatever the key expansion call produced.
4229 .Ldec_key_ret:
4230 add \$8,%rsp
4231 ret
4232 .LSEH_end_set_decrypt_key:
4233 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4234 ___
4235 \f
4236 # This is based on submission by
4237 #
4238 # Huang Ying <ying.huang@intel.com>
4239 # Vinodh Gopal <vinodh.gopal@intel.com>
4240 # Kahraman Akdemir
4241 #
4242 # Aggressively optimized in respect to aeskeygenassist's critical path
4243 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
4244 #
4245 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4246 # int bits, AES_KEY * const key);
4247 #
4248 # input: $inp user-supplied key
4249 # $bits $inp length in bits
4250 # $key pointer to key schedule
4251 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4252 # $bits rounds-1 (used in aesni_set_decrypt_key)
4253 # *$key key schedule
4254 # $key pointer to key schedule (used in
4255 # aesni_set_decrypt_key)
4256 #
4257 # Subroutine is frame-less, which means that only volatile registers
4258 # are used. Note that it's declared "abi-omnipotent", which means that
4259 # amount of volatile registers is smaller on Windows.
4260 #
4261 $code.=<<___;
4262 .globl ${PREFIX}_set_encrypt_key
4263 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4264 .align 16
4265 ${PREFIX}_set_encrypt_key:
4266 __aesni_set_encrypt_key:
4267 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4268 mov \$-1,%rax
4269 test $inp,$inp
4270 jz .Lenc_key_ret
4271 test $key,$key
4272 jz .Lenc_key_ret
4273
4274 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4275 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4276 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4277 and OPENSSL_ia32cap_P+4(%rip),%r10d
4278 lea 16($key),%rax # %rax is used as modifiable copy of $key
4279 cmp \$256,$bits
4280 je .L14rounds
4281 cmp \$192,$bits
4282 je .L12rounds
4283 cmp \$128,$bits
4284 jne .Lbad_keybits
4285
4286 .L10rounds:
4287 mov \$9,$bits # 10 rounds for 128-bit key
4288 cmp \$`1<<28`,%r10d # AVX, bit no XOP
4289 je .L10rounds_alt
4290
4291 $movkey %xmm0,($key) # round 0
4292 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4293 call .Lkey_expansion_128_cold
4294 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4295 call .Lkey_expansion_128
4296 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4297 call .Lkey_expansion_128
4298 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4299 call .Lkey_expansion_128
4300 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4301 call .Lkey_expansion_128
4302 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4303 call .Lkey_expansion_128
4304 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4305 call .Lkey_expansion_128
4306 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4307 call .Lkey_expansion_128
4308 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4309 call .Lkey_expansion_128
4310 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4311 call .Lkey_expansion_128
4312 $movkey %xmm0,(%rax)
4313 mov $bits,80(%rax) # 240(%rdx)
4314 xor %eax,%eax
4315 jmp .Lenc_key_ret
4316
4317 .align 16
4318 .L10rounds_alt:
4319 movdqa .Lkey_rotate(%rip),%xmm5
4320 mov \$8,%r10d
4321 movdqa .Lkey_rcon1(%rip),%xmm4
4322 movdqa %xmm0,%xmm2
4323 movdqu %xmm0,($key)
4324 jmp .Loop_key128
4325
4326 .align 16
4327 .Loop_key128:
4328 pshufb %xmm5,%xmm0
4329 aesenclast %xmm4,%xmm0
4330 pslld \$1,%xmm4
4331 lea 16(%rax),%rax
4332
4333 movdqa %xmm2,%xmm3
4334 pslldq \$4,%xmm2
4335 pxor %xmm2,%xmm3
4336 pslldq \$4,%xmm2
4337 pxor %xmm2,%xmm3
4338 pslldq \$4,%xmm2
4339 pxor %xmm3,%xmm2
4340
4341 pxor %xmm2,%xmm0
4342 movdqu %xmm0,-16(%rax)
4343 movdqa %xmm0,%xmm2
4344
4345 dec %r10d
4346 jnz .Loop_key128
4347
4348 movdqa .Lkey_rcon1b(%rip),%xmm4
4349
4350 pshufb %xmm5,%xmm0
4351 aesenclast %xmm4,%xmm0
4352 pslld \$1,%xmm4
4353
4354 movdqa %xmm2,%xmm3
4355 pslldq \$4,%xmm2
4356 pxor %xmm2,%xmm3
4357 pslldq \$4,%xmm2
4358 pxor %xmm2,%xmm3
4359 pslldq \$4,%xmm2
4360 pxor %xmm3,%xmm2
4361
4362 pxor %xmm2,%xmm0
4363 movdqu %xmm0,(%rax)
4364
4365 movdqa %xmm0,%xmm2
4366 pshufb %xmm5,%xmm0
4367 aesenclast %xmm4,%xmm0
4368
4369 movdqa %xmm2,%xmm3
4370 pslldq \$4,%xmm2
4371 pxor %xmm2,%xmm3
4372 pslldq \$4,%xmm2
4373 pxor %xmm2,%xmm3
4374 pslldq \$4,%xmm2
4375 pxor %xmm3,%xmm2
4376
4377 pxor %xmm2,%xmm0
4378 movdqu %xmm0,16(%rax)
4379
4380 mov $bits,96(%rax) # 240($key)
4381 xor %eax,%eax
4382 jmp .Lenc_key_ret
4383
4384 .align 16
4385 .L12rounds:
4386 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4387 mov \$11,$bits # 12 rounds for 192
4388 cmp \$`1<<28`,%r10d # AVX, but no XOP
4389 je .L12rounds_alt
4390
4391 $movkey %xmm0,($key) # round 0
4392 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4393 call .Lkey_expansion_192a_cold
4394 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4395 call .Lkey_expansion_192b
4396 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4397 call .Lkey_expansion_192a
4398 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4399 call .Lkey_expansion_192b
4400 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4401 call .Lkey_expansion_192a
4402 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4403 call .Lkey_expansion_192b
4404 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4405 call .Lkey_expansion_192a
4406 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4407 call .Lkey_expansion_192b
4408 $movkey %xmm0,(%rax)
4409 mov $bits,48(%rax) # 240(%rdx)
4410 xor %rax, %rax
4411 jmp .Lenc_key_ret
4412
4413 .align 16
4414 .L12rounds_alt:
4415 movdqa .Lkey_rotate192(%rip),%xmm5
4416 movdqa .Lkey_rcon1(%rip),%xmm4
4417 mov \$8,%r10d
4418 movdqu %xmm0,($key)
4419 jmp .Loop_key192
4420
4421 .align 16
4422 .Loop_key192:
4423 movq %xmm2,0(%rax)
4424 movdqa %xmm2,%xmm1
4425 pshufb %xmm5,%xmm2
4426 aesenclast %xmm4,%xmm2
4427 pslld \$1, %xmm4
4428 lea 24(%rax),%rax
4429
4430 movdqa %xmm0,%xmm3
4431 pslldq \$4,%xmm0
4432 pxor %xmm0,%xmm3
4433 pslldq \$4,%xmm0
4434 pxor %xmm0,%xmm3
4435 pslldq \$4,%xmm0
4436 pxor %xmm3,%xmm0
4437
4438 pshufd \$0xff,%xmm0,%xmm3
4439 pxor %xmm1,%xmm3
4440 pslldq \$4,%xmm1
4441 pxor %xmm1,%xmm3
4442
4443 pxor %xmm2,%xmm0
4444 pxor %xmm3,%xmm2
4445 movdqu %xmm0,-16(%rax)
4446
4447 dec %r10d
4448 jnz .Loop_key192
4449
4450 mov $bits,32(%rax) # 240($key)
4451 xor %eax,%eax
4452 jmp .Lenc_key_ret
4453
4454 .align 16
4455 .L14rounds:
4456 movups 16($inp),%xmm2 # remaining half of *userKey
4457 mov \$13,$bits # 14 rounds for 256
4458 lea 16(%rax),%rax
4459 cmp \$`1<<28`,%r10d # AVX, but no XOP
4460 je .L14rounds_alt
4461
4462 $movkey %xmm0,($key) # round 0
4463 $movkey %xmm2,16($key) # round 1
4464 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4465 call .Lkey_expansion_256a_cold
4466 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4467 call .Lkey_expansion_256b
4468 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4469 call .Lkey_expansion_256a
4470 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4471 call .Lkey_expansion_256b
4472 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4473 call .Lkey_expansion_256a
4474 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4475 call .Lkey_expansion_256b
4476 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4477 call .Lkey_expansion_256a
4478 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4479 call .Lkey_expansion_256b
4480 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4481 call .Lkey_expansion_256a
4482 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4483 call .Lkey_expansion_256b
4484 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4485 call .Lkey_expansion_256a
4486 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4487 call .Lkey_expansion_256b
4488 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4489 call .Lkey_expansion_256a
4490 $movkey %xmm0,(%rax)
4491 mov $bits,16(%rax) # 240(%rdx)
4492 xor %rax,%rax
4493 jmp .Lenc_key_ret
4494
4495 .align 16
4496 .L14rounds_alt:
4497 movdqa .Lkey_rotate(%rip),%xmm5
4498 movdqa .Lkey_rcon1(%rip),%xmm4
4499 mov \$7,%r10d
4500 movdqu %xmm0,0($key)
4501 movdqa %xmm2,%xmm1
4502 movdqu %xmm2,16($key)
4503 jmp .Loop_key256
4504
4505 .align 16
4506 .Loop_key256:
4507 pshufb %xmm5,%xmm2
4508 aesenclast %xmm4,%xmm2
4509
4510 movdqa %xmm0,%xmm3
4511 pslldq \$4,%xmm0
4512 pxor %xmm0,%xmm3
4513 pslldq \$4,%xmm0
4514 pxor %xmm0,%xmm3
4515 pslldq \$4,%xmm0
4516 pxor %xmm3,%xmm0
4517 pslld \$1,%xmm4
4518
4519 pxor %xmm2,%xmm0
4520 movdqu %xmm0,(%rax)
4521
4522 dec %r10d
4523 jz .Ldone_key256
4524
4525 pshufd \$0xff,%xmm0,%xmm2
4526 pxor %xmm3,%xmm3
4527 aesenclast %xmm3,%xmm2
4528
4529 movdqa %xmm1,%xmm3
4530 pslldq \$4,%xmm1
4531 pxor %xmm1,%xmm3
4532 pslldq \$4,%xmm1
4533 pxor %xmm1,%xmm3
4534 pslldq \$4,%xmm1
4535 pxor %xmm3,%xmm1
4536
4537 pxor %xmm1,%xmm2
4538 movdqu %xmm2,16(%rax)
4539 lea 32(%rax),%rax
4540 movdqa %xmm2,%xmm1
4541
4542 jmp .Loop_key256
4543
4544 .Ldone_key256:
4545 mov $bits,16(%rax) # 240($key)
4546 xor %eax,%eax
4547 jmp .Lenc_key_ret
4548
4549 .align 16
4550 .Lbad_keybits:
4551 mov \$-2,%rax
4552 .Lenc_key_ret:
4553 pxor %xmm0,%xmm0
4554 pxor %xmm1,%xmm1
4555 pxor %xmm2,%xmm2
4556 pxor %xmm3,%xmm3
4557 pxor %xmm4,%xmm4
4558 pxor %xmm5,%xmm5
4559 add \$8,%rsp
4560 ret
4561 .LSEH_end_set_encrypt_key:
4562 \f
4563 .align 16
4564 .Lkey_expansion_128:
4565 $movkey %xmm0,(%rax)
4566 lea 16(%rax),%rax
4567 .Lkey_expansion_128_cold:
4568 shufps \$0b00010000,%xmm0,%xmm4
4569 xorps %xmm4, %xmm0
4570 shufps \$0b10001100,%xmm0,%xmm4
4571 xorps %xmm4, %xmm0
4572 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4573 xorps %xmm1,%xmm0
4574 ret
4575
4576 .align 16
4577 .Lkey_expansion_192a:
4578 $movkey %xmm0,(%rax)
4579 lea 16(%rax),%rax
4580 .Lkey_expansion_192a_cold:
4581 movaps %xmm2, %xmm5
4582 .Lkey_expansion_192b_warm:
4583 shufps \$0b00010000,%xmm0,%xmm4
4584 movdqa %xmm2,%xmm3
4585 xorps %xmm4,%xmm0
4586 shufps \$0b10001100,%xmm0,%xmm4
4587 pslldq \$4,%xmm3
4588 xorps %xmm4,%xmm0
4589 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4590 pxor %xmm3,%xmm2
4591 pxor %xmm1,%xmm0
4592 pshufd \$0b11111111,%xmm0,%xmm3
4593 pxor %xmm3,%xmm2
4594 ret
4595
4596 .align 16
4597 .Lkey_expansion_192b:
4598 movaps %xmm0,%xmm3
4599 shufps \$0b01000100,%xmm0,%xmm5
4600 $movkey %xmm5,(%rax)
4601 shufps \$0b01001110,%xmm2,%xmm3
4602 $movkey %xmm3,16(%rax)
4603 lea 32(%rax),%rax
4604 jmp .Lkey_expansion_192b_warm
4605
4606 .align 16
4607 .Lkey_expansion_256a:
4608 $movkey %xmm2,(%rax)
4609 lea 16(%rax),%rax
4610 .Lkey_expansion_256a_cold:
4611 shufps \$0b00010000,%xmm0,%xmm4
4612 xorps %xmm4,%xmm0
4613 shufps \$0b10001100,%xmm0,%xmm4
4614 xorps %xmm4,%xmm0
4615 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4616 xorps %xmm1,%xmm0
4617 ret
4618
4619 .align 16
4620 .Lkey_expansion_256b:
4621 $movkey %xmm0,(%rax)
4622 lea 16(%rax),%rax
4623
4624 shufps \$0b00010000,%xmm2,%xmm4
4625 xorps %xmm4,%xmm2
4626 shufps \$0b10001100,%xmm2,%xmm4
4627 xorps %xmm4,%xmm2
4628 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4629 xorps %xmm1,%xmm2
4630 ret
4631 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4632 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4633 ___
4634 }
4635 \f
# Constant pool appended to the generated assembly.
# .Lkey_rotate, .Lkey_rotate192, .Lkey_rcon1 and .Lkey_rcon1b are loaded by
# the .L*rounds_alt key-schedule loops above: the rotate tables feed pshufb
# and the rcon tables feed aesenclast.
# The remaining tables (.Lbswap_mask, .Lincrement32, .Lincrement64,
# .Lxts_magic, .Lincrement1) are presumably consumed by the CTR/XTS code
# earlier in the file -- not visible from here, TODO confirm.
# The .asciz line embeds an identification string in the object file.
4636 $code.=<<___;
4637 .align 64
4638 .Lbswap_mask:
4639 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4640 .Lincrement32:
4641 .long 6,6,6,0
4642 .Lincrement64:
4643 .long 1,0,0,0
4644 .Lxts_magic:
4645 .long 0x87,0,1,0
4646 .Lincrement1:
4647 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4648 .Lkey_rotate:
4649 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4650 .Lkey_rotate192:
4651 .long 0x04070605,0x04070605,0x04070605,0x04070605
4652 .Lkey_rcon1:
4653 .long 1,1,1,1
4654 .Lkey_rcon1b:
4655 .long 0x1b,0x1b,0x1b,0x1b
4656
4657 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4658 .align 64
4659 ___
4660
# Win64 structured-exception-handling (SEH) support.  Everything below is
# emitted only when $win64 is set: the handlers that repair the CONTEXT
# record during unwinding, followed by the .pdata/.xdata tables that bind
# each routine to its handler.  The prototype of every handler is:
4661 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4662 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4663 if ($win64) {
# Registers carrying the four handler arguments; the heredocs below
# interpolate $context and $disp ($rec and $frame are not referenced in
# the code visible here).
4664 $rec="%rcx";
4665 $frame="%rdx";
4666 $context="%r8";
4667 $disp="%r9";
4668
# Import called from .Lcommon_seh_tail to continue the unwind.
4669 $code.=<<___;
4670 .extern __imp_RtlVirtualUnwind
4671 ___
# Handlers needed only when $PREFIX is "aesni".  Each one saves the
# non-volatile registers, pulls context->Rax and context->Rip, and compares
# Rip against the label RVAs stored in HandlerData[] (made absolute with
# disp->ImageBase):
#   ecb_ccm64_se_handler - inside [HandlerData[0],HandlerData[1]) copies
#       8 quadwords (4 XMM registers) from context->Rsp to context.Xmm6
#       and advances the stack pointer by 0x58.
#   ctr_xts_se_handler   - same range test; copies 20 quadwords (10 XMM
#       registers) from context->Rbp-0xa0 and leaves via .Lcommon_rbp_tail.
#   ocb_se_handler       - additionally reads HandlerData[2] (the "pop"
#       label): before that label the 10-register XMM area at context->Rsp
#       is copied and %rax advanced by 0xa0+0x28; in both cases %rbx, %rbp
#       and %r12-%r14 are then reloaded from just below %rax into the
#       context.
# All three finish in tails defined inside cbc_se_handler further down.
4672 $code.=<<___ if ($PREFIX eq "aesni");
4673 .type ecb_ccm64_se_handler,\@abi-omnipotent
4674 .align 16
4675 ecb_ccm64_se_handler:
4676 push %rsi
4677 push %rdi
4678 push %rbx
4679 push %rbp
4680 push %r12
4681 push %r13
4682 push %r14
4683 push %r15
4684 pushfq
4685 sub \$64,%rsp
4686
4687 mov 120($context),%rax # pull context->Rax
4688 mov 248($context),%rbx # pull context->Rip
4689
4690 mov 8($disp),%rsi # disp->ImageBase
4691 mov 56($disp),%r11 # disp->HandlerData
4692
4693 mov 0(%r11),%r10d # HandlerData[0]
4694 lea (%rsi,%r10),%r10 # prologue label
4695 cmp %r10,%rbx # context->Rip<prologue label
4696 jb .Lcommon_seh_tail
4697
4698 mov 152($context),%rax # pull context->Rsp
4699
4700 mov 4(%r11),%r10d # HandlerData[1]
4701 lea (%rsi,%r10),%r10 # epilogue label
4702 cmp %r10,%rbx # context->Rip>=epilogue label
4703 jae .Lcommon_seh_tail
4704
4705 lea 0(%rax),%rsi # %xmm save area
4706 lea 512($context),%rdi # &context.Xmm6
4707 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4708 .long 0xa548f3fc # cld; rep movsq
4709 lea 0x58(%rax),%rax # adjust stack pointer
4710
4711 jmp .Lcommon_seh_tail
4712 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4713
4714 .type ctr_xts_se_handler,\@abi-omnipotent
4715 .align 16
4716 ctr_xts_se_handler:
4717 push %rsi
4718 push %rdi
4719 push %rbx
4720 push %rbp
4721 push %r12
4722 push %r13
4723 push %r14
4724 push %r15
4725 pushfq
4726 sub \$64,%rsp
4727
4728 mov 120($context),%rax # pull context->Rax
4729 mov 248($context),%rbx # pull context->Rip
4730
4731 mov 8($disp),%rsi # disp->ImageBase
4732 mov 56($disp),%r11 # disp->HandlerData
4733
4734 mov 0(%r11),%r10d # HandlerData[0]
4735 lea (%rsi,%r10),%r10 # prologue lable
4736 cmp %r10,%rbx # context->Rip<prologue label
4737 jb .Lcommon_seh_tail
4738
4739 mov 152($context),%rax # pull context->Rsp
4740
4741 mov 4(%r11),%r10d # HandlerData[1]
4742 lea (%rsi,%r10),%r10 # epilogue label
4743 cmp %r10,%rbx # context->Rip>=epilogue label
4744 jae .Lcommon_seh_tail
4745
4746 mov 160($context),%rax # pull context->Rbp
4747 lea -0xa0(%rax),%rsi # %xmm save area
4748 lea 512($context),%rdi # & context.Xmm6
4749 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4750 .long 0xa548f3fc # cld; rep movsq
4751
4752 jmp .Lcommon_rbp_tail
4753 .size ctr_xts_se_handler,.-ctr_xts_se_handler
4754
4755 .type ocb_se_handler,\@abi-omnipotent
4756 .align 16
4757 ocb_se_handler:
4758 push %rsi
4759 push %rdi
4760 push %rbx
4761 push %rbp
4762 push %r12
4763 push %r13
4764 push %r14
4765 push %r15
4766 pushfq
4767 sub \$64,%rsp
4768
4769 mov 120($context),%rax # pull context->Rax
4770 mov 248($context),%rbx # pull context->Rip
4771
4772 mov 8($disp),%rsi # disp->ImageBase
4773 mov 56($disp),%r11 # disp->HandlerData
4774
4775 mov 0(%r11),%r10d # HandlerData[0]
4776 lea (%rsi,%r10),%r10 # prologue lable
4777 cmp %r10,%rbx # context->Rip<prologue label
4778 jb .Lcommon_seh_tail
4779
4780 mov 4(%r11),%r10d # HandlerData[1]
4781 lea (%rsi,%r10),%r10 # epilogue label
4782 cmp %r10,%rbx # context->Rip>=epilogue label
4783 jae .Lcommon_seh_tail
4784
4785 mov 8(%r11),%r10d # HandlerData[2]
4786 lea (%rsi,%r10),%r10
4787 cmp %r10,%rbx # context->Rip>=pop label
4788 jae .Locb_no_xmm
4789
4790 mov 152($context),%rax # pull context->Rsp
4791
4792 lea (%rax),%rsi # %xmm save area
4793 lea 512($context),%rdi # & context.Xmm6
4794 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4795 .long 0xa548f3fc # cld; rep movsq
4796 lea 0xa0+0x28(%rax),%rax
4797
4798 .Locb_no_xmm:
4799 mov -8(%rax),%rbx
4800 mov -16(%rax),%rbp
4801 mov -24(%rax),%r12
4802 mov -32(%rax),%r13
4803 mov -40(%rax),%r14
4804
4805 mov %rbx,144($context) # restore context->Rbx
4806 mov %rbp,160($context) # restore context->Rbp
4807 mov %r12,216($context) # restore context->R12
4808 mov %r13,224($context) # restore context->R13
4809 mov %r14,232($context) # restore context->R14
4810
4811 jmp .Lcommon_seh_tail
4812 .size ocb_se_handler,.-ocb_se_handler
4813 ___
# cbc_se_handler is emitted for every $PREFIX.  It compares context->Rip
# against fixed labels (.Lcbc_decrypt_bulk, .Lcbc_decrypt_body, .Lcbc_ret)
# instead of HandlerData[], and it hosts the tails shared by all handlers:
#   .Lcommon_rbp_tail - reload the saved %rbp from the address held in
#                       context->Rbp; the stack pointer becomes that
#                       address + 8.
#   .Lrestore_cbc_rax - fall back to context->Rax.
#   .Lcommon_seh_tail - store Rsp/Rsi/Rdi into the context, copy the
#                       154-quadword CONTEXT to disp->ContextRecord, call
#                       RtlVirtualUnwind and return 1
#                       (ExceptionContinueSearch).
# The same heredoc ends by switching to the .pdata section.
4814 $code.=<<___;
4815 .type cbc_se_handler,\@abi-omnipotent
4816 .align 16
4817 cbc_se_handler:
4818 push %rsi
4819 push %rdi
4820 push %rbx
4821 push %rbp
4822 push %r12
4823 push %r13
4824 push %r14
4825 push %r15
4826 pushfq
4827 sub \$64,%rsp
4828
4829 mov 152($context),%rax # pull context->Rsp
4830 mov 248($context),%rbx # pull context->Rip
4831
4832 lea .Lcbc_decrypt_bulk(%rip),%r10
4833 cmp %r10,%rbx # context->Rip<"prologue" label
4834 jb .Lcommon_seh_tail
4835
4836 lea .Lcbc_decrypt_body(%rip),%r10
4837 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4838 jb .Lrestore_cbc_rax
4839
4840 lea .Lcbc_ret(%rip),%r10
4841 cmp %r10,%rbx # context->Rip>="epilogue" label
4842 jae .Lcommon_seh_tail
4843
4844 lea 16(%rax),%rsi # %xmm save area
4845 lea 512($context),%rdi # &context.Xmm6
4846 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4847 .long 0xa548f3fc # cld; rep movsq
4848
4849 .Lcommon_rbp_tail:
4850 mov 160($context),%rax # pull context->Rbp
4851 mov (%rax),%rbp # restore saved %rbp
4852 lea 8(%rax),%rax # adjust stack pointer
4853 mov %rbp,160($context) # restore context->Rbp
4854 jmp .Lcommon_seh_tail
4855
4856 .Lrestore_cbc_rax:
4857 mov 120($context),%rax
4858
4859 .Lcommon_seh_tail:
4860 mov 8(%rax),%rdi
4861 mov 16(%rax),%rsi
4862 mov %rax,152($context) # restore context->Rsp
4863 mov %rsi,168($context) # restore context->Rsi
4864 mov %rdi,176($context) # restore context->Rdi
4865
4866 mov 40($disp),%rdi # disp->ContextRecord
4867 mov $context,%rsi # context
4868 mov \$154,%ecx # sizeof(CONTEXT)
4869 .long 0xa548f3fc # cld; rep movsq
4870
4871 mov $disp,%rsi
4872 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4873 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4874 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4875 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4876 mov 40(%rsi),%r10 # disp->ContextRecord
4877 lea 56(%rsi),%r11 # &disp->HandlerData
4878 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4879 mov %r10,32(%rsp) # arg5
4880 mov %r11,40(%rsp) # arg6
4881 mov %r12,48(%rsp) # arg7
4882 mov %rcx,56(%rsp) # arg8, (NULL)
4883 call *__imp_RtlVirtualUnwind(%rip)
4884
4885 mov \$1,%eax # ExceptionContinueSearch
4886 add \$64,%rsp
4887 popfq
4888 pop %r15
4889 pop %r14
4890 pop %r13
4891 pop %r12
4892 pop %rbp
4893 pop %rbx
4894 pop %rdi
4895 pop %rsi
4896 ret
4897 .size cbc_se_handler,.-cbc_se_handler
4898
4899 .section .pdata
4900 .align 4
4901 ___
# .pdata entries for the aesni-only routines: one (begin, end, unwind-info)
# RVA triple per function.
4902 $code.=<<___ if ($PREFIX eq "aesni");
4903 .rva .LSEH_begin_aesni_ecb_encrypt
4904 .rva .LSEH_end_aesni_ecb_encrypt
4905 .rva .LSEH_info_ecb
4906
4907 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4908 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
4909 .rva .LSEH_info_ccm64_enc
4910
4911 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4912 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
4913 .rva .LSEH_info_ccm64_dec
4914
4915 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
4916 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
4917 .rva .LSEH_info_ctr32
4918
4919 .rva .LSEH_begin_aesni_xts_encrypt
4920 .rva .LSEH_end_aesni_xts_encrypt
4921 .rva .LSEH_info_xts_enc
4922
4923 .rva .LSEH_begin_aesni_xts_decrypt
4924 .rva .LSEH_end_aesni_xts_decrypt
4925 .rva .LSEH_info_xts_dec
4926
4927 .rva .LSEH_begin_aesni_ocb_encrypt
4928 .rva .LSEH_end_aesni_ocb_encrypt
4929 .rva .LSEH_info_ocb_enc
4930
4931 .rva .LSEH_begin_aesni_ocb_decrypt
4932 .rva .LSEH_end_aesni_ocb_decrypt
4933 .rva .LSEH_info_ocb_dec
4934 ___
# .pdata entries present in every build: CBC plus the two key-setup
# routines, which share .LSEH_info_key.  The heredoc then switches to the
# .xdata section.
4935 $code.=<<___;
4936 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
4937 .rva .LSEH_end_${PREFIX}_cbc_encrypt
4938 .rva .LSEH_info_cbc
4939
4940 .rva ${PREFIX}_set_decrypt_key
4941 .rva .LSEH_end_set_decrypt_key
4942 .rva .LSEH_info_key
4943
4944 .rva ${PREFIX}_set_encrypt_key
4945 .rva .LSEH_end_set_encrypt_key
4946 .rva .LSEH_info_key
4947 .section .xdata
4948 .align 8
4949 ___
# .xdata records for the aesni-only routines: each names its handler and is
# followed by the HandlerData[] label RVAs that handler reads (body label,
# epilogue/ret label and, for OCB, the pop label).
4950 $code.=<<___ if ($PREFIX eq "aesni");
4951 .LSEH_info_ecb:
4952 .byte 9,0,0,0
4953 .rva ecb_ccm64_se_handler
4954 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
4955 .LSEH_info_ccm64_enc:
4956 .byte 9,0,0,0
4957 .rva ecb_ccm64_se_handler
4958 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
4959 .LSEH_info_ccm64_dec:
4960 .byte 9,0,0,0
4961 .rva ecb_ccm64_se_handler
4962 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
4963 .LSEH_info_ctr32:
4964 .byte 9,0,0,0
4965 .rva ctr_xts_se_handler
4966 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
4967 .LSEH_info_xts_enc:
4968 .byte 9,0,0,0
4969 .rva ctr_xts_se_handler
4970 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
4971 .LSEH_info_xts_dec:
4972 .byte 9,0,0,0
4973 .rva ctr_xts_se_handler
4974 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
4975 .LSEH_info_ocb_enc:
4976 .byte 9,0,0,0
4977 .rva ocb_se_handler
4978 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
4979 .rva .Locb_enc_pop
4980 .long 0
4981 .LSEH_info_ocb_dec:
4982 .byte 9,0,0,0
4983 .rva ocb_se_handler
4984 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
4985 .rva .Locb_dec_pop
4986 .long 0
4987 ___
# .xdata records present in every build: CBC names cbc_se_handler and
# carries no HandlerData; the key-setup record holds plain unwind codes
# (annotated "sub rsp,8" below) and no handler.
4988 $code.=<<___;
4989 .LSEH_info_cbc:
4990 .byte 9,0,0,0
4991 .rva cbc_se_handler
4992 .LSEH_info_key:
4993 .byte 0x01,0x04,0x01,0x00
4994 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
4995 ___
4996 }
4997
# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the instruction being assembled when either
# register number needs a fourth bit (i.e. is 8 or above).
#
#   \@opcode - reference to the array of opcode bytes; modified in place
#   $dst     - register number that the caller places in ModR/M bits 3..5;
#              sets bit 0x04 of the prefix when >= 8
#   $src     - register number that the caller places in ModR/M bits 0..2;
#              sets bit 0x01 of the prefix when >= 8
#
# Nothing is appended when both register numbers are below 8.  The array is
# reached through an ordinary reference instead of a localized typeglob, so
# the routine does not depend on a package variable named @opcode.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);
    push @$opcode, $rex | 0x40 if ($rex);
}
5007
# aesni($line)
#
# Translate one AES-NI instruction, written in AT&T syntax, into a ".byte"
# directive so the output can be built by assemblers that do not know the
# AES-NI mnemonics.  Three operand forms are recognized:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast}     off(%rsp),%xmmD   (one-byte offset)
#
# Returns the ".byte\t..." string for a recognized instruction.  Any other
# input -- including an "aes*" mnemonic that is not in the opcode table --
# is returned unchanged.  The caller substitutes the return value for the
# matched text, so returning undef there would silently delete the
# instruction from the generated assembly.
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # every encoding emitted here starts with 0x66

    # Captures are copied into lexicals straight away: later regex matches
    # (the /^0/ tests below) would otherwise reset $1..$4.
    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # a leading 0 marks hex/octal notation, which oct() decodes
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # unknown mnemonic: leave the line for the assembler to judge
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # unknown mnemonic: leave the line for the assembler to judge
        return $line if (!defined($opcodelet{$mnemonic}));

        push @opcode, 0x44 if ($dst >= 8);    # REX prefix for xmm8 and up
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M, then 0x24 for (%rsp)
        # the offset is emitted as a single byte; an empty offset becomes 0
        push @opcode, ($off =~ /^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
5047
# movbe($offset)
#
# Build the ".byte" directive that stands in for "movbe %eax,OFFSET(%rsp)":
# five fixed encoding bytes followed by the offset text supplied by the
# caller, which is appended verbatim.
sub movbe {
    my $offset = shift;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24," . $offset;
}
5051
# Post-process the accumulated assembly text, then write it to STDOUT.

# Replace every `expr` with the value of evaluating expr as Perl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand every line fragment that starts with "aes" and reaches an %xmm
# operand to aesni(); anything after the last %xmm operand on that line
# (e.g. a trailing comment) is dropped by the substitution.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# Encode "movbe %eax,N(%rsp)" by hand via movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Output is buffered, so a write failure (full disk, closed pipe) may only
# surface here; fail loudly rather than leave a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";