]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-x86_64.pl
Many spelling fixes/typo's corrected.
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86_64.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
d64a7232
AP
9#
10# ====================================================================
d8ba0dc9 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
d64a7232
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
d7d119a3
AP
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
f8501464 28# asymptotic limit it's not something you commonly achieve in reality,
d7d119a3
AP
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33# 16-byte 64-byte 256-byte 1-KB 8-KB
34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
609b0852 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
d7d119a3
AP
38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
46f4e1be 63# "straightforward" CCM implementation with CTR and CBC-MAC performed
d7d119a3
AP
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
02f358da 87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
d64a7232 88
f8501464
AP
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
609b0852 121# instructions' interleave factor. Westmere can execute at most 3
f8501464
AP
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor 3x 6x 8x
132# theoretical asymptotic limit 1.67 0.83 0.625
133# measured performance for 8KB block 1.05 0.86 0.84
134#
135# "as if" interleave factor 4.7x 5.8x 6.0x
136#
137# Further data for other parallelizable modes:
138#
73325b22 139# CBC decrypt 1.16 0.93 0.74
cd54249c 140# CTR 1.14 0.91 0.74
f8501464
AP
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
46f4e1be 146# additional instructions with AES ones, but even AES instructions
f8501464
AP
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
46f4e1be 153# utilizes 6x interleave because of limited register bank capacity.
f8501464
AP
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
36df342f
AP
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
f8501464
AP
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
bd30091c
AP
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
d2e18031 172######################################################################
5599c733
AP
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
bd30091c 176# CBC en-/decrypt CTR XTS ECB OCB
5599c733 177# Westmere 3.77/1.25 1.25 1.25 1.26
bd30091c
AP
178# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
b7f5503f 180# Skylake 2.62/0.63 0.63 0.63 0.63
bd30091c 181# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
64d92d74 182# Knights L 2.54/0.77 0.78 0.85 - 1.50
ace05265 183# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
bd30091c 184# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
54f8f9a1 185# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
5599c733 186#
23f6eec7
AP
187# (*) Atom Silvermont ECB result is suboptimal because of penalties
188# incurred by operations on %xmm8-15. As ECB is not considered
5599c733 189# critical, nothing was done to mitigate the problem.
d8ba0dc9 190
d64a7232
AP
191$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
194
# Command line is "[flavour] output". If the first argument contains a
# dot it is taken to be the output file name and $flavour is left
# undefined.
195$flavour = shift;
196$output = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
# Win64 argument order is selected for nasm/masm/mingw64 flavours or
# when the output file name ends in ".asm".
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
# Look for x86_64-xlate.pl in this script's own directory first, then in
# ../../perlasm/ relative to it.
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
# Everything written to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be opened rather than
# carrying on with a dead output handle.
cfe1d992 206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
 or die "can't call $xlate: $!";
46bf83f0 207*STDOUT=*OUT;
d64a7232 208
# Mnemonic used to load round keys. Both branches of the conditional
# currently yield "movups".
8da721ee 209$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# Registers carrying the first four integer arguments for the selected
# calling convention.
d608b4d6
AP
210@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
211 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
d64a7232
AP
212
# $code accumulates the assembly text produced by this script.
213$code=".text\n";
5599c733 214$code.=".extern OPENSSL_ia32cap_P\n";
d64a7232
AP
215
216$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
d608b4d6 217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
d64a7232
AP
218$inp="%rdi";
219$out="%rsi";
d64a7232
AP
220$len="%rdx";
221$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
d7d119a3 222$ivp="%r8"; # cbc, ctr, ...
d64a7232
AP
223
224$rnds_="%r10d"; # backup copy for $rounds
225$key_="%r11"; # backup copy for $key
226
227# %xmm register layout
f8501464
AP
228$rndkey0="%xmm0"; $rndkey1="%xmm1";
229$inout0="%xmm2"; $inout1="%xmm3";
230$inout2="%xmm4"; $inout3="%xmm5";
231$inout4="%xmm6"; $inout5="%xmm7";
232$inout6="%xmm8"; $inout7="%xmm9";
233
# NOTE: the names below alias registers already assigned above
# ($in2/$in1/$in0/$iv are the same registers as $inout4..$inout7).
234$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
235$in0="%xmm8"; $iv="%xmm9";
d64a7232
AP
236\f
237# Inline version of internal aesni_[en|de]crypt1.
238#
239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a
# single-block code sequence to $code:
#   $p      - "enc" or "dec" at the call sites; selects aes${p} and
#             aes${p}last
#   $key    - register pointing at the key schedule; the emitted code
#             advances it
#   $rounds - register holding the round counter; the emitted code
#             decrements it
#   $inout  - data register, defaults to $inout0 when not supplied
#   $ivec   - optional register; when supplied, round key 0 is xor-ed
#             into it and it is then xor-ed into $inout
# $sn counts expansions so that every generated loop label is unique.
241{ my $sn;
d608b4d6 242sub aesni_generate1 {
f8501464 243my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
d64a7232
AP
244++$sn;
245$code.=<<___;
f8501464 246 $movkey ($key),$rndkey0
d64a7232 247 $movkey 16($key),$rndkey1
f8501464
AP
248___
# With $ivec: fold round key 0 into $ivec, then xor $ivec into the data.
249$code.=<<___ if (defined($ivec));
250 xorps $rndkey0,$ivec
251 lea 32($key),$key
252 xorps $ivec,$inout
253___
# Without $ivec: xor round key 0 straight into the data.
254$code.=<<___ if (!defined($ivec));
d608b4d6 255 lea 32($key),$key
f8501464
AP
256 xorps $rndkey0,$inout
257___
# Round loop, followed by the final aes${p}last round.
258$code.=<<___;
d608b4d6 259.Loop_${p}1_$sn:
d7d119a3 260 aes${p} $rndkey1,$inout
d64a7232 261 dec $rounds
d64a7232 262 $movkey ($key),$rndkey1
d64a7232 263 lea 16($key),$key
d608b4d6 264 jnz .Loop_${p}1_$sn # loop body is 16 bytes
d7d119a3 265 aes${p}last $rndkey1,$inout
d64a7232
AP
266___
267}}
d608b4d6 268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
d64a7232 269#
# Emits the public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. The lexical $inp/$out/$key declared below shadow
# the file-level names with the first three registers of @_4args, so
# the generated code follows whichever argument order was chosen there.
d608b4d6
AP
270{ my ($inp,$out,$key) = @_4args;
271
d64a7232
AP
272$code.=<<___;
273.globl ${PREFIX}_encrypt
d608b4d6 274.type ${PREFIX}_encrypt,\@abi-omnipotent
d64a7232
AP
275.align 16
276${PREFIX}_encrypt:
f8501464
AP
277 movups ($inp),$inout0 # load input
278 mov 240($key),$rounds # key->rounds
d64a7232 279___
# Inline single-block encryption of $inout0 (the default data register).
d608b4d6 280 &aesni_generate1("enc",$key,$rounds);
# Store the result and zero $rndkey0, $rndkey1 and $inout0 before
# returning; then the same sequence again for the decrypt entry point.
d64a7232 281$code.=<<___;
23f6eec7
AP
282 pxor $rndkey0,$rndkey0 # clear register bank
283 pxor $rndkey1,$rndkey1
d608b4d6 284 movups $inout0,($out) # output
23f6eec7 285 pxor $inout0,$inout0
d64a7232
AP
286 ret
287.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
d64a7232 288
d64a7232 289.globl ${PREFIX}_decrypt
d608b4d6 290.type ${PREFIX}_decrypt,\@abi-omnipotent
d64a7232
AP
291.align 16
292${PREFIX}_decrypt:
f8501464
AP
293 movups ($inp),$inout0 # load input
294 mov 240($key),$rounds # key->rounds
d64a7232 295___
# Inline single-block decryption of $inout0.
d608b4d6 296 &aesni_generate1("dec",$key,$rounds);
# Store the result and zero the same three registers as above.
d64a7232 297$code.=<<___;
23f6eec7
AP
298 pxor $rndkey0,$rndkey0 # clear register bank
299 pxor $rndkey1,$rndkey1
d608b4d6 300 movups $inout0,($out) # output
23f6eec7 301 pxor $inout0,$inout0
d64a7232
AP
302 ret
303.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
304___
d608b4d6 305}
d64a7232 306\f
f8501464
AP
307# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
308# factor. Why were 3x subroutines originally used in loops? Even though
309# aes[enc|dec] latency was originally 6, it could be scheduled only
310# every *2nd* cycle. Thus 3x interleave was the one providing optimal
d608b4d6
AP
311# utilization, i.e. when subroutine's throughput is virtually same as
312# of non-interleaved subroutine [for number of input blocks up to 3].
214368ff
AP
313# This is why it originally made no sense to implement 2x subroutine.
314# But times change and it became appropriate to spend extra 192 bytes
315# on 2x subroutine on Atom Silvermont account. For processors that
316# can schedule aes[enc|dec] every cycle, the optimal interleave factor
317# equals the corresponding instruction's latency. 8x is optimal for
609b0852 318# * Bridge and "super-optimal" for other Intel CPUs...
214368ff
AP
319
# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2
# to $code; $dir is "enc" or "dec" at the call sites. The routine runs
# the AES rounds on $inout0 and $inout1 side by side, loading successive
# round keys alternately into $rndkey1 and $rndkey0. %rax is used
# directly as the 64-bit form of $rounds ("%eax"): it is scaled by 16,
# negated, and then stepped by 32 per loop iteration as an index
# relative to the advanced $key pointer.
320sub aesni_generate2 {
321my $dir=shift;
322# As already mentioned it takes in $key and $rounds, which are *not*
323# preserved. $inout[0-1] is cipher/clear text...
324$code.=<<___;
325.type _aesni_${dir}rypt2,\@abi-omnipotent
326.align 16
327_aesni_${dir}rypt2:
328 $movkey ($key),$rndkey0
329 shl \$4,$rounds
330 $movkey 16($key),$rndkey1
331 xorps $rndkey0,$inout0
332 xorps $rndkey0,$inout1
333 $movkey 32($key),$rndkey0
334 lea 32($key,$rounds),$key
335 neg %rax # $rounds
336 add \$16,%rax
337
338.L${dir}_loop2:
339 aes${dir} $rndkey1,$inout0
340 aes${dir} $rndkey1,$inout1
341 $movkey ($key,%rax),$rndkey1
342 add \$32,%rax
343 aes${dir} $rndkey0,$inout0
344 aes${dir} $rndkey0,$inout1
345 $movkey -16($key,%rax),$rndkey0
346 jnz .L${dir}_loop2
347
348 aes${dir} $rndkey1,$inout0
349 aes${dir} $rndkey1,$inout1
350 aes${dir}last $rndkey0,$inout0
351 aes${dir}last $rndkey0,$inout1
352 ret
353.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
354___
355}
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3
# to $code; $dir is "enc" or "dec" at the call sites. Same structure as
# the 2x routine, but each round is applied to $inout0..$inout2 before
# the next round key is loaded. %rax (the 64-bit form of $rounds) is
# again used as a negative index that is stepped by 32 per iteration.
d608b4d6 356sub aesni_generate3 {
d64a7232
AP
357my $dir=shift;
358# As already mentioned it takes in $key and $rounds, which are *not*
d608b4d6 359# preserved. $inout[0-2] is cipher/clear text...
d64a7232 360$code.=<<___;
d608b4d6 361.type _aesni_${dir}rypt3,\@abi-omnipotent
d64a7232 362.align 16
d608b4d6 363_aesni_${dir}rypt3:
d64a7232 364 $movkey ($key),$rndkey0
d8ba0dc9 365 shl \$4,$rounds
d64a7232 366 $movkey 16($key),$rndkey1
f8501464
AP
367 xorps $rndkey0,$inout0
368 xorps $rndkey0,$inout1
369 xorps $rndkey0,$inout2
d8ba0dc9
AP
370 $movkey 32($key),$rndkey0
371 lea 32($key,$rounds),$key
372 neg %rax # $rounds
373 add \$16,%rax
d608b4d6
AP
374
375.L${dir}_loop3:
376 aes${dir} $rndkey1,$inout0
d608b4d6 377 aes${dir} $rndkey1,$inout1
d608b4d6 378 aes${dir} $rndkey1,$inout2
d8ba0dc9
AP
379 $movkey ($key,%rax),$rndkey1
380 add \$32,%rax
d7d119a3 381 aes${dir} $rndkey0,$inout0
d608b4d6 382 aes${dir} $rndkey0,$inout1
d608b4d6 383 aes${dir} $rndkey0,$inout2
d8ba0dc9 384 $movkey -16($key,%rax),$rndkey0
d608b4d6
AP
385 jnz .L${dir}_loop3
386
387 aes${dir} $rndkey1,$inout0
d608b4d6
AP
388 aes${dir} $rndkey1,$inout1
389 aes${dir} $rndkey1,$inout2
390 aes${dir}last $rndkey0,$inout0
391 aes${dir}last $rndkey0,$inout1
392 aes${dir}last $rndkey0,$inout2
393 ret
394.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
395___
396}
397# 4x interleave is implemented to improve small block performance,
398# most notably [and naturally] 4 block by ~30%. One can argue that one
399# should have implemented 5x as well, but improvement would be <20%,
400# so it's not worth it...
#
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4
# to $code; $dir is "enc" or "dec" at the call sites. Each round is
# applied to $inout0..$inout3 with round keys alternating between
# $rndkey1 and $rndkey0.
# NOTE(review): the ".byte 0x0f,0x1f,0x00" in the prologue looks like a
# 3-byte NOP, presumably padding so the loop label lands on a preferred
# boundary - confirm before touching the prologue.
401sub aesni_generate4 {
402my $dir=shift;
403# As already mentioned it takes in $key and $rounds, which are *not*
404# preserved. $inout[0-3] is cipher/clear text...
405$code.=<<___;
406.type _aesni_${dir}rypt4,\@abi-omnipotent
407.align 16
408_aesni_${dir}rypt4:
409 $movkey ($key),$rndkey0
d8ba0dc9 410 shl \$4,$rounds
d608b4d6 411 $movkey 16($key),$rndkey1
f8501464
AP
412 xorps $rndkey0,$inout0
413 xorps $rndkey0,$inout1
414 xorps $rndkey0,$inout2
415 xorps $rndkey0,$inout3
d8ba0dc9
AP
416 $movkey 32($key),$rndkey0
417 lea 32($key,$rounds),$key
418 neg %rax # $rounds
419 .byte 0x0f,0x1f,0x00
420 add \$16,%rax
d608b4d6
AP
421
422.L${dir}_loop4:
d64a7232 423 aes${dir} $rndkey1,$inout0
d64a7232 424 aes${dir} $rndkey1,$inout1
d64a7232
AP
425 aes${dir} $rndkey1,$inout2
426 aes${dir} $rndkey1,$inout3
d8ba0dc9
AP
427 $movkey ($key,%rax),$rndkey1
428 add \$32,%rax
d7d119a3 429 aes${dir} $rndkey0,$inout0
d64a7232 430 aes${dir} $rndkey0,$inout1
d64a7232
AP
431 aes${dir} $rndkey0,$inout2
432 aes${dir} $rndkey0,$inout3
d8ba0dc9 433 $movkey -16($key,%rax),$rndkey0
d608b4d6
AP
434 jnz .L${dir}_loop4
435
d64a7232 436 aes${dir} $rndkey1,$inout0
d64a7232
AP
437 aes${dir} $rndkey1,$inout1
438 aes${dir} $rndkey1,$inout2
439 aes${dir} $rndkey1,$inout3
d64a7232
AP
440 aes${dir}last $rndkey0,$inout0
441 aes${dir}last $rndkey0,$inout1
442 aes${dir}last $rndkey0,$inout2
443 aes${dir}last $rndkey0,$inout3
d64a7232 444 ret
d608b4d6 445.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
d64a7232
AP
446___
447}
f8501464
AP
# aesni_generate6($dir) appends the private routine _aesni_${dir}rypt6
# to $code; $dir is "enc" or "dec" at the call sites. Unlike the 2x-4x
# routines, the prologue interleaves the round-key-0 xors with the first
# aes${dir} round for $inout0..$inout2 and then jumps into the middle of
# the loop at .L${dir}_loop6_enter, where that round is completed for
# $inout3..$inout5.
448sub aesni_generate6 {
449my $dir=shift;
450# As already mentioned it takes in $key and $rounds, which are *not*
451# preserved. $inout[0-5] is cipher/clear text...
452$code.=<<___;
453.type _aesni_${dir}rypt6,\@abi-omnipotent
454.align 16
455_aesni_${dir}rypt6:
456 $movkey ($key),$rndkey0
d8ba0dc9 457 shl \$4,$rounds
f8501464 458 $movkey 16($key),$rndkey1
f8501464
AP
459 xorps $rndkey0,$inout0
460 pxor $rndkey0,$inout1
f8501464 461 pxor $rndkey0,$inout2
d8ba0dc9
AP
462 aes${dir} $rndkey1,$inout0
463 lea 32($key,$rounds),$key
464 neg %rax # $rounds
f8501464
AP
465 aes${dir} $rndkey1,$inout1
466 pxor $rndkey0,$inout3
f8501464 467 pxor $rndkey0,$inout4
d8ba0dc9 468 aes${dir} $rndkey1,$inout2
f8501464 469 pxor $rndkey0,$inout5
23f6eec7 470 $movkey ($key,%rax),$rndkey0
d8ba0dc9 471 add \$16,%rax
f8501464
AP
472 jmp .L${dir}_loop6_enter
473.align 16
474.L${dir}_loop6:
475 aes${dir} $rndkey1,$inout0
476 aes${dir} $rndkey1,$inout1
f8501464 477 aes${dir} $rndkey1,$inout2
23f6eec7 478.L${dir}_loop6_enter:
f8501464
AP
479 aes${dir} $rndkey1,$inout3
480 aes${dir} $rndkey1,$inout4
481 aes${dir} $rndkey1,$inout5
d8ba0dc9
AP
482 $movkey ($key,%rax),$rndkey1
483 add \$32,%rax
f8501464
AP
484 aes${dir} $rndkey0,$inout0
485 aes${dir} $rndkey0,$inout1
f8501464
AP
486 aes${dir} $rndkey0,$inout2
487 aes${dir} $rndkey0,$inout3
488 aes${dir} $rndkey0,$inout4
489 aes${dir} $rndkey0,$inout5
d8ba0dc9 490 $movkey -16($key,%rax),$rndkey0
f8501464
AP
491 jnz .L${dir}_loop6
492
493 aes${dir} $rndkey1,$inout0
494 aes${dir} $rndkey1,$inout1
495 aes${dir} $rndkey1,$inout2
496 aes${dir} $rndkey1,$inout3
497 aes${dir} $rndkey1,$inout4
498 aes${dir} $rndkey1,$inout5
499 aes${dir}last $rndkey0,$inout0
500 aes${dir}last $rndkey0,$inout1
501 aes${dir}last $rndkey0,$inout2
502 aes${dir}last $rndkey0,$inout3
503 aes${dir}last $rndkey0,$inout4
504 aes${dir}last $rndkey0,$inout5
505 ret
506.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
507___
508}
# aesni_generate8($dir) appends the private routine _aesni_${dir}rypt8
# to $code; $dir is "enc" or "dec" at the call sites. The prologue
# interleaves the round-key-0 xors with the first aes${dir} round for
# $inout0 and $inout1, then jumps to .L${dir}_loop8_inner where that
# round is completed for $inout2..$inout7.
# NOTE(review): .L${dir}_loop8_enter is defined here but not referenced
# inside this routine; presumably it is an entry point for code
# elsewhere in the file - confirm before renaming or removing it.
509sub aesni_generate8 {
510my $dir=shift;
511# As already mentioned it takes in $key and $rounds, which are *not*
512# preserved. $inout[0-7] is cipher/clear text...
513$code.=<<___;
514.type _aesni_${dir}rypt8,\@abi-omnipotent
515.align 16
516_aesni_${dir}rypt8:
517 $movkey ($key),$rndkey0
d8ba0dc9 518 shl \$4,$rounds
f8501464 519 $movkey 16($key),$rndkey1
f8501464
AP
520 xorps $rndkey0,$inout0
521 xorps $rndkey0,$inout1
f8501464 522 pxor $rndkey0,$inout2
f8501464 523 pxor $rndkey0,$inout3
f8501464 524 pxor $rndkey0,$inout4
d8ba0dc9
AP
525 lea 32($key,$rounds),$key
526 neg %rax # $rounds
527 aes${dir} $rndkey1,$inout0
f8501464 528 pxor $rndkey0,$inout5
f8501464 529 pxor $rndkey0,$inout6
23f6eec7 530 aes${dir} $rndkey1,$inout1
f8501464 531 pxor $rndkey0,$inout7
23f6eec7
AP
532 $movkey ($key,%rax),$rndkey0
533 add \$16,%rax
534 jmp .L${dir}_loop8_inner
f8501464
AP
535.align 16
536.L${dir}_loop8:
537 aes${dir} $rndkey1,$inout0
538 aes${dir} $rndkey1,$inout1
23f6eec7 539.L${dir}_loop8_inner:
f8501464
AP
540 aes${dir} $rndkey1,$inout2
541 aes${dir} $rndkey1,$inout3
542 aes${dir} $rndkey1,$inout4
543 aes${dir} $rndkey1,$inout5
544 aes${dir} $rndkey1,$inout6
545 aes${dir} $rndkey1,$inout7
d8ba0dc9
AP
546.L${dir}_loop8_enter:
547 $movkey ($key,%rax),$rndkey1
548 add \$32,%rax
f8501464
AP
549 aes${dir} $rndkey0,$inout0
550 aes${dir} $rndkey0,$inout1
f8501464
AP
551 aes${dir} $rndkey0,$inout2
552 aes${dir} $rndkey0,$inout3
553 aes${dir} $rndkey0,$inout4
554 aes${dir} $rndkey0,$inout5
555 aes${dir} $rndkey0,$inout6
556 aes${dir} $rndkey0,$inout7
d8ba0dc9 557 $movkey -16($key,%rax),$rndkey0
f8501464
AP
558 jnz .L${dir}_loop8
559
560 aes${dir} $rndkey1,$inout0
561 aes${dir} $rndkey1,$inout1
562 aes${dir} $rndkey1,$inout2
563 aes${dir} $rndkey1,$inout3
564 aes${dir} $rndkey1,$inout4
565 aes${dir} $rndkey1,$inout5
566 aes${dir} $rndkey1,$inout6
567 aes${dir} $rndkey1,$inout7
568 aes${dir}last $rndkey0,$inout0
569 aes${dir}last $rndkey0,$inout1
570 aes${dir}last $rndkey0,$inout2
571 aes${dir}last $rndkey0,$inout3
572 aes${dir}last $rndkey0,$inout4
573 aes${dir}last $rndkey0,$inout5
574 aes${dir}last $rndkey0,$inout6
575 aes${dir}last $rndkey0,$inout7
576 ret
577.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
578___
579}
214368ff
AP
# Instantiate the interleaved private routines. The decrypt variants are
# always generated; the encrypt variants only when $PREFIX is "aesni".
580&aesni_generate2("enc") if ($PREFIX eq "aesni");
581&aesni_generate2("dec");
d608b4d6
AP
582&aesni_generate3("enc") if ($PREFIX eq "aesni");
583&aesni_generate3("dec");
584&aesni_generate4("enc") if ($PREFIX eq "aesni");
585&aesni_generate4("dec");
f8501464
AP
586&aesni_generate6("enc") if ($PREFIX eq "aesni");
587&aesni_generate6("dec");
588&aesni_generate8("enc") if ($PREFIX eq "aesni");
589&aesni_generate8("dec");
d64a7232
AP
590\f
591if ($PREFIX eq "aesni") {
6c83629b 592########################################################################
d64a7232
AP
593# void aesni_ecb_encrypt (const void *in, void *out,
594# size_t length, const AES_KEY *key,
595# int enc);
596$code.=<<___;
597.globl aesni_ecb_encrypt
598.type aesni_ecb_encrypt,\@function,5
599.align 16
600aesni_ecb_encrypt:
69d5747f
AP
601___
602$code.=<<___ if ($win64);
603 lea -0x58(%rsp),%rsp
23f6eec7 604 movaps %xmm6,(%rsp) # offload $inout4..7
69d5747f
AP
605 movaps %xmm7,0x10(%rsp)
606 movaps %xmm8,0x20(%rsp)
607 movaps %xmm9,0x30(%rsp)
608.Lecb_enc_body:
609___
610$code.=<<___;
23f6eec7
AP
611 and \$-16,$len # if ($len<16)
612 jz .Lecb_ret # return
f8501464
AP
613
614 mov 240($key),$rounds # key->rounds
615 $movkey ($key),$rndkey0
d64a7232 616 mov $key,$key_ # backup $key
d64a7232 617 mov $rounds,$rnds_ # backup $rounds
d7d119a3 618 test %r8d,%r8d # 5th argument
d64a7232
AP
619 jz .Lecb_decrypt
620#--------------------------- ECB ENCRYPT ------------------------------#
23f6eec7
AP
621 cmp \$0x80,$len # if ($len<8*16)
622 jb .Lecb_enc_tail # short input
f8501464 623
23f6eec7 624 movdqu ($inp),$inout0 # load 8 input blocks
f8501464
AP
625 movdqu 0x10($inp),$inout1
626 movdqu 0x20($inp),$inout2
627 movdqu 0x30($inp),$inout3
628 movdqu 0x40($inp),$inout4
629 movdqu 0x50($inp),$inout5
630 movdqu 0x60($inp),$inout6
631 movdqu 0x70($inp),$inout7
23f6eec7
AP
632 lea 0x80($inp),$inp # $inp+=8*16
633 sub \$0x80,$len # $len-=8*16 (can be zero)
f8501464 634 jmp .Lecb_enc_loop8_enter
d64a7232 635.align 16
f8501464 636.Lecb_enc_loop8:
23f6eec7 637 movups $inout0,($out) # store 8 output blocks
f8501464 638 mov $key_,$key # restore $key
23f6eec7 639 movdqu ($inp),$inout0 # load 8 input blocks
d64a7232 640 mov $rnds_,$rounds # restore $rounds
d7d119a3 641 movups $inout1,0x10($out)
f8501464
AP
642 movdqu 0x10($inp),$inout1
643 movups $inout2,0x20($out)
644 movdqu 0x20($inp),$inout2
645 movups $inout3,0x30($out)
646 movdqu 0x30($inp),$inout3
647 movups $inout4,0x40($out)
648 movdqu 0x40($inp),$inout4
649 movups $inout5,0x50($out)
650 movdqu 0x50($inp),$inout5
651 movups $inout6,0x60($out)
652 movdqu 0x60($inp),$inout6
653 movups $inout7,0x70($out)
23f6eec7 654 lea 0x80($out),$out # $out+=8*16
f8501464 655 movdqu 0x70($inp),$inout7
23f6eec7 656 lea 0x80($inp),$inp # $inp+=8*16
f8501464
AP
657.Lecb_enc_loop8_enter:
658
659 call _aesni_encrypt8
660
661 sub \$0x80,$len
23f6eec7 662 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
f8501464 663
23f6eec7 664 movups $inout0,($out) # store 8 output blocks
d64a7232 665 mov $key_,$key # restore $key
f8501464
AP
666 movups $inout1,0x10($out)
667 mov $rnds_,$rounds # restore $rounds
d7d119a3 668 movups $inout2,0x20($out)
f8501464
AP
669 movups $inout3,0x30($out)
670 movups $inout4,0x40($out)
671 movups $inout5,0x50($out)
672 movups $inout6,0x60($out)
673 movups $inout7,0x70($out)
23f6eec7
AP
674 lea 0x80($out),$out # $out+=8*16
675 add \$0x80,$len # restore real remaining $len
676 jz .Lecb_ret # done if ($len==0)
d64a7232 677
23f6eec7 678.Lecb_enc_tail: # $len is less than 8*16
6c83629b 679 movups ($inp),$inout0
d7d119a3 680 cmp \$0x20,$len
6c83629b 681 jb .Lecb_enc_one
d64a7232
AP
682 movups 0x10($inp),$inout1
683 je .Lecb_enc_two
d64a7232 684 movups 0x20($inp),$inout2
f8501464
AP
685 cmp \$0x40,$len
686 jb .Lecb_enc_three
d64a7232 687 movups 0x30($inp),$inout3
f8501464
AP
688 je .Lecb_enc_four
689 movups 0x40($inp),$inout4
690 cmp \$0x60,$len
691 jb .Lecb_enc_five
692 movups 0x50($inp),$inout5
693 je .Lecb_enc_six
694 movdqu 0x60($inp),$inout6
23f6eec7 695 xorps $inout7,$inout7
f8501464 696 call _aesni_encrypt8
23f6eec7 697 movups $inout0,($out) # store 7 output blocks
d64a7232
AP
698 movups $inout1,0x10($out)
699 movups $inout2,0x20($out)
700 movups $inout3,0x30($out)
f8501464
AP
701 movups $inout4,0x40($out)
702 movups $inout5,0x50($out)
703 movups $inout6,0x60($out)
d64a7232
AP
704 jmp .Lecb_ret
705.align 16
706.Lecb_enc_one:
707___
d608b4d6 708 &aesni_generate1("enc",$key,$rounds);
d64a7232 709$code.=<<___;
23f6eec7 710 movups $inout0,($out) # store one output block
d64a7232
AP
711 jmp .Lecb_ret
712.align 16
713.Lecb_enc_two:
214368ff 714 call _aesni_encrypt2
23f6eec7 715 movups $inout0,($out) # store 2 output blocks
d64a7232
AP
716 movups $inout1,0x10($out)
717 jmp .Lecb_ret
718.align 16
719.Lecb_enc_three:
d608b4d6 720 call _aesni_encrypt3
23f6eec7 721 movups $inout0,($out) # store 3 output blocks
d64a7232
AP
722 movups $inout1,0x10($out)
723 movups $inout2,0x20($out)
724 jmp .Lecb_ret
f8501464
AP
725.align 16
726.Lecb_enc_four:
727 call _aesni_encrypt4
23f6eec7 728 movups $inout0,($out) # store 4 output blocks
f8501464
AP
729 movups $inout1,0x10($out)
730 movups $inout2,0x20($out)
731 movups $inout3,0x30($out)
732 jmp .Lecb_ret
733.align 16
734.Lecb_enc_five:
735 xorps $inout5,$inout5
736 call _aesni_encrypt6
23f6eec7 737 movups $inout0,($out) # store 5 output blocks
f8501464
AP
738 movups $inout1,0x10($out)
739 movups $inout2,0x20($out)
740 movups $inout3,0x30($out)
741 movups $inout4,0x40($out)
742 jmp .Lecb_ret
743.align 16
744.Lecb_enc_six:
745 call _aesni_encrypt6
23f6eec7 746 movups $inout0,($out) # store 6 output blocks
f8501464
AP
747 movups $inout1,0x10($out)
748 movups $inout2,0x20($out)
749 movups $inout3,0x30($out)
750 movups $inout4,0x40($out)
751 movups $inout5,0x50($out)
752 jmp .Lecb_ret
d64a7232
AP
753\f#--------------------------- ECB DECRYPT ------------------------------#
754.align 16
755.Lecb_decrypt:
23f6eec7
AP
756 cmp \$0x80,$len # if ($len<8*16)
757 jb .Lecb_dec_tail # short input
f8501464 758
23f6eec7 759 movdqu ($inp),$inout0 # load 8 input blocks
f8501464
AP
760 movdqu 0x10($inp),$inout1
761 movdqu 0x20($inp),$inout2
762 movdqu 0x30($inp),$inout3
763 movdqu 0x40($inp),$inout4
764 movdqu 0x50($inp),$inout5
765 movdqu 0x60($inp),$inout6
766 movdqu 0x70($inp),$inout7
23f6eec7
AP
767 lea 0x80($inp),$inp # $inp+=8*16
768 sub \$0x80,$len # $len-=8*16 (can be zero)
f8501464 769 jmp .Lecb_dec_loop8_enter
d64a7232 770.align 16
f8501464 771.Lecb_dec_loop8:
23f6eec7 772 movups $inout0,($out) # store 8 output blocks
f8501464 773 mov $key_,$key # restore $key
23f6eec7 774 movdqu ($inp),$inout0 # load 8 input blocks
d64a7232 775 mov $rnds_,$rounds # restore $rounds
d7d119a3 776 movups $inout1,0x10($out)
f8501464
AP
777 movdqu 0x10($inp),$inout1
778 movups $inout2,0x20($out)
779 movdqu 0x20($inp),$inout2
780 movups $inout3,0x30($out)
781 movdqu 0x30($inp),$inout3
782 movups $inout4,0x40($out)
783 movdqu 0x40($inp),$inout4
784 movups $inout5,0x50($out)
785 movdqu 0x50($inp),$inout5
786 movups $inout6,0x60($out)
787 movdqu 0x60($inp),$inout6
788 movups $inout7,0x70($out)
23f6eec7 789 lea 0x80($out),$out # $out+=8*16
f8501464 790 movdqu 0x70($inp),$inout7
23f6eec7 791 lea 0x80($inp),$inp # $inp+=8*16
f8501464
AP
792.Lecb_dec_loop8_enter:
793
794 call _aesni_decrypt8
795
796 $movkey ($key_),$rndkey0
797 sub \$0x80,$len
23f6eec7 798 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
f8501464 799
23f6eec7
AP
800 movups $inout0,($out) # store 8 output blocks
801 pxor $inout0,$inout0 # clear register bank
d64a7232 802 mov $key_,$key # restore $key
f8501464 803 movups $inout1,0x10($out)
23f6eec7 804 pxor $inout1,$inout1
f8501464 805 mov $rnds_,$rounds # restore $rounds
d7d119a3 806 movups $inout2,0x20($out)
23f6eec7 807 pxor $inout2,$inout2
f8501464 808 movups $inout3,0x30($out)
23f6eec7 809 pxor $inout3,$inout3
f8501464 810 movups $inout4,0x40($out)
23f6eec7 811 pxor $inout4,$inout4
f8501464 812 movups $inout5,0x50($out)
23f6eec7 813 pxor $inout5,$inout5
f8501464 814 movups $inout6,0x60($out)
23f6eec7 815 pxor $inout6,$inout6
f8501464 816 movups $inout7,0x70($out)
23f6eec7
AP
817 pxor $inout7,$inout7
818 lea 0x80($out),$out # $out+=8*16
819 add \$0x80,$len # restore real remaining $len
820 jz .Lecb_ret # done if ($len==0)
d64a7232 821
6c83629b 822.Lecb_dec_tail:
6c83629b 823 movups ($inp),$inout0
d7d119a3 824 cmp \$0x20,$len
6c83629b 825 jb .Lecb_dec_one
d64a7232
AP
826 movups 0x10($inp),$inout1
827 je .Lecb_dec_two
d64a7232 828 movups 0x20($inp),$inout2
f8501464
AP
829 cmp \$0x40,$len
830 jb .Lecb_dec_three
d64a7232 831 movups 0x30($inp),$inout3
f8501464
AP
832 je .Lecb_dec_four
833 movups 0x40($inp),$inout4
834 cmp \$0x60,$len
835 jb .Lecb_dec_five
836 movups 0x50($inp),$inout5
837 je .Lecb_dec_six
838 movups 0x60($inp),$inout6
839 $movkey ($key),$rndkey0
23f6eec7 840 xorps $inout7,$inout7
f8501464 841 call _aesni_decrypt8
23f6eec7
AP
842 movups $inout0,($out) # store 7 output blocks
843 pxor $inout0,$inout0 # clear register bank
d64a7232 844 movups $inout1,0x10($out)
23f6eec7 845 pxor $inout1,$inout1
d64a7232 846 movups $inout2,0x20($out)
23f6eec7 847 pxor $inout2,$inout2
d64a7232 848 movups $inout3,0x30($out)
23f6eec7 849 pxor $inout3,$inout3
f8501464 850 movups $inout4,0x40($out)
23f6eec7 851 pxor $inout4,$inout4
f8501464 852 movups $inout5,0x50($out)
23f6eec7 853 pxor $inout5,$inout5
f8501464 854 movups $inout6,0x60($out)
23f6eec7
AP
855 pxor $inout6,$inout6
856 pxor $inout7,$inout7
d64a7232
AP
857 jmp .Lecb_ret
858.align 16
859.Lecb_dec_one:
860___
d608b4d6 861 &aesni_generate1("dec",$key,$rounds);
d64a7232 862$code.=<<___;
23f6eec7
AP
863 movups $inout0,($out) # store one output block
864 pxor $inout0,$inout0 # clear register bank
d64a7232
AP
865 jmp .Lecb_ret
866.align 16
867.Lecb_dec_two:
214368ff 868 call _aesni_decrypt2
23f6eec7
AP
869 movups $inout0,($out) # store 2 output blocks
870 pxor $inout0,$inout0 # clear register bank
d64a7232 871 movups $inout1,0x10($out)
23f6eec7 872 pxor $inout1,$inout1
d64a7232
AP
873 jmp .Lecb_ret
874.align 16
875.Lecb_dec_three:
d608b4d6 876 call _aesni_decrypt3
23f6eec7
AP
877 movups $inout0,($out) # store 3 output blocks
878 pxor $inout0,$inout0 # clear register bank
d64a7232 879 movups $inout1,0x10($out)
23f6eec7 880 pxor $inout1,$inout1
d64a7232 881 movups $inout2,0x20($out)
23f6eec7 882 pxor $inout2,$inout2
f8501464
AP
883 jmp .Lecb_ret
884.align 16
885.Lecb_dec_four:
886 call _aesni_decrypt4
23f6eec7
AP
887 movups $inout0,($out) # store 4 output blocks
888 pxor $inout0,$inout0 # clear register bank
f8501464 889 movups $inout1,0x10($out)
23f6eec7 890 pxor $inout1,$inout1
f8501464 891 movups $inout2,0x20($out)
23f6eec7 892 pxor $inout2,$inout2
f8501464 893 movups $inout3,0x30($out)
23f6eec7 894 pxor $inout3,$inout3
f8501464
AP
895 jmp .Lecb_ret
896.align 16
897.Lecb_dec_five:
898 xorps $inout5,$inout5
899 call _aesni_decrypt6
23f6eec7
AP
900 movups $inout0,($out) # store 5 output blocks
901 pxor $inout0,$inout0 # clear register bank
f8501464 902 movups $inout1,0x10($out)
23f6eec7 903 pxor $inout1,$inout1
f8501464 904 movups $inout2,0x20($out)
23f6eec7 905 pxor $inout2,$inout2
f8501464 906 movups $inout3,0x30($out)
23f6eec7 907 pxor $inout3,$inout3
f8501464 908 movups $inout4,0x40($out)
23f6eec7
AP
909 pxor $inout4,$inout4
910 pxor $inout5,$inout5
f8501464
AP
911 jmp .Lecb_ret
912.align 16
913.Lecb_dec_six:
914 call _aesni_decrypt6
23f6eec7
AP
915 movups $inout0,($out) # store 6 output blocks
916 pxor $inout0,$inout0 # clear register bank
f8501464 917 movups $inout1,0x10($out)
23f6eec7 918 pxor $inout1,$inout1
f8501464 919 movups $inout2,0x20($out)
23f6eec7 920 pxor $inout2,$inout2
f8501464 921 movups $inout3,0x30($out)
23f6eec7 922 pxor $inout3,$inout3
f8501464 923 movups $inout4,0x40($out)
23f6eec7 924 pxor $inout4,$inout4
f8501464 925 movups $inout5,0x50($out)
23f6eec7 926 pxor $inout5,$inout5
d64a7232
AP
927
928.Lecb_ret:
23f6eec7
AP
929 xorps $rndkey0,$rndkey0 # %xmm0
930 pxor $rndkey1,$rndkey1
69d5747f
AP
931___
932$code.=<<___ if ($win64);
933 movaps (%rsp),%xmm6
23f6eec7 934 movaps %xmm0,(%rsp) # clear stack
69d5747f 935 movaps 0x10(%rsp),%xmm7
23f6eec7 936 movaps %xmm0,0x10(%rsp)
69d5747f 937 movaps 0x20(%rsp),%xmm8
23f6eec7 938 movaps %xmm0,0x20(%rsp)
69d5747f 939 movaps 0x30(%rsp),%xmm9
23f6eec7 940 movaps %xmm0,0x30(%rsp)
69d5747f
AP
941 lea 0x58(%rsp),%rsp
942.Lecb_enc_ret:
943___
944$code.=<<___;
d64a7232
AP
945 ret
946.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
947___
d7d119a3
AP
948\f
949{
6c83629b 950######################################################################
d7d119a3
AP
951# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
952# size_t blocks, const AES_KEY *key,
953# const char *ivec,char *cmac);
6c83629b 954#
d7d119a3
AP
955# Handles only complete blocks, operates on 64-bit counter and
956# does not update *ivec! Nor does it finalize CMAC value
957# (see engine/eng_aesni.c for details)
958#
# Lexical scope shared by the two CCM64 code generators below
# (aesni_ccm64_encrypt_blocks and aesni_ccm64_decrypt_blocks).
959{
960my $cmac="%r9"; # 6th argument
961
d8ba0dc9
AP
# XMM registers used by both routines: $increment is loaded from
# .Lincrement64, $bswap_mask from .Lbswap_mask, and $iv carries the counter
# in pshufb-swapped form so that paddq can advance it; it is swapped back
# into $inout0 before each encryption.
962my $increment="%xmm9";
963my $iv="%xmm6";
267b481c 964my $bswap_mask="%xmm7";
d7d119a3
AP
965
# --- aesni_ccm64_encrypt_blocks: symbol, type and entry label ---
966$code.=<<___;
967.globl aesni_ccm64_encrypt_blocks
968.type aesni_ccm64_encrypt_blocks,\@function,6
969.align 16
970aesni_ccm64_encrypt_blocks:
971___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 there; the
# matching reload is in the Win64-only epilogue further down.
972$code.=<<___ if ($win64);
973 lea -0x58(%rsp),%rsp
23f6eec7
AP
974 movaps %xmm6,(%rsp) # $iv
975 movaps %xmm7,0x10(%rsp) # $bswap_mask
976 movaps %xmm8,0x20(%rsp) # $in0
977 movaps %xmm9,0x30(%rsp) # $increment
d7d119a3
AP
978.Lccm64_enc_body:
979___
# Main body. Each pass through .Lccm64_enc_outer handles one 16-byte block
# and runs two AES encryptions side by side: $inout0 is the counter block,
# $inout1 is the running CMAC xor-ed with the input block.  After the last
# round the input is xor-ed with E(counter) and stored, and the counter is
# advanced with paddq.  On exit the CMAC is written back to ($cmac) and the
# XMM registers that held key or data material are zeroed.
980$code.=<<___;
267b481c 981 mov 240($key),$rounds # key->rounds
d7d119a3 982 movdqu ($ivp),$iv
d7d119a3
AP
983 movdqa .Lincrement64(%rip),$increment
984 movdqa .Lbswap_mask(%rip),$bswap_mask
d7d119a3 985
d8ba0dc9
AP
986 shl \$4,$rounds
987 mov \$16,$rnds_
267b481c
AP
988 lea 0($key),$key_
989 movdqu ($cmac),$inout1
d7d119a3 990 movdqa $iv,$inout0
d8ba0dc9 991 lea 32($key,$rounds),$key # end of key schedule
9ee5916d 992 pshufb $bswap_mask,$iv
d8ba0dc9 993 sub %rax,%r10 # twisted $rounds
267b481c
AP
994 jmp .Lccm64_enc_outer
995.align 16
d7d119a3 996.Lccm64_enc_outer:
267b481c 997 $movkey ($key_),$rndkey0
d8ba0dc9 998 mov %r10,%rax
267b481c 999 movups ($inp),$in0 # load inp
d7d119a3 1000
267b481c
AP
1001 xorps $rndkey0,$inout0 # counter
1002 $movkey 16($key_),$rndkey1
1003 xorps $in0,$rndkey0
267b481c 1004 xorps $rndkey0,$inout1 # cmac^=inp
d8ba0dc9 1005 $movkey 32($key_),$rndkey0
f8501464
AP
1006
1007.Lccm64_enc2_loop:
1008 aesenc $rndkey1,$inout0
f8501464 1009 aesenc $rndkey1,$inout1
d8ba0dc9
AP
1010 $movkey ($key,%rax),$rndkey1
1011 add \$32,%rax
f8501464 1012 aesenc $rndkey0,$inout0
f8501464 1013 aesenc $rndkey0,$inout1
d8ba0dc9 1014 $movkey -16($key,%rax),$rndkey0
f8501464
AP
1015 jnz .Lccm64_enc2_loop
1016 aesenc $rndkey1,$inout0
1017 aesenc $rndkey1,$inout1
267b481c 1018 paddq $increment,$iv
23f6eec7 1019 dec $len # $len-- ($len is in blocks)
f8501464
AP
1020 aesenclast $rndkey0,$inout0
1021 aesenclast $rndkey0,$inout1
d7d119a3 1022
d7d119a3 1023 lea 16($inp),$inp
f8501464 1024 xorps $inout0,$in0 # inp ^= E(iv)
d7d119a3 1025 movdqa $iv,$inout0
f8501464 1026 movups $in0,($out) # save output
9ee5916d 1027 pshufb $bswap_mask,$inout0
23f6eec7
AP
1028 lea 16($out),$out # $out+=16
1029 jnz .Lccm64_enc_outer # loop if ($len!=0)
d7d119a3 1030
23f6eec7
AP
1031 pxor $rndkey0,$rndkey0 # clear register bank
1032 pxor $rndkey1,$rndkey1
1033 pxor $inout0,$inout0
1034 movups $inout1,($cmac) # store resulting mac
1035 pxor $inout1,$inout1
1036 pxor $in0,$in0
1037 pxor $iv,$iv
d7d119a3
AP
1038___
# Win64 only: reload %xmm6-%xmm9 from the save area and overwrite each slot
# with %xmm0 before releasing the 0x58 bytes of stack.
1039$code.=<<___ if ($win64);
1040 movaps (%rsp),%xmm6
23f6eec7 1041 movaps %xmm0,(%rsp) # clear stack
d7d119a3 1042 movaps 0x10(%rsp),%xmm7
23f6eec7 1043 movaps %xmm0,0x10(%rsp)
d7d119a3 1044 movaps 0x20(%rsp),%xmm8
23f6eec7 1045 movaps %xmm0,0x20(%rsp)
d7d119a3 1046 movaps 0x30(%rsp),%xmm9
23f6eec7 1047 movaps %xmm0,0x30(%rsp)
d7d119a3
AP
1048 lea 0x58(%rsp),%rsp
1049.Lccm64_enc_ret:
1050___
# Common return and symbol size for the encrypt routine.
1051$code.=<<___;
1052 ret
1053.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1054___
1055######################################################################
# --- aesni_ccm64_decrypt_blocks: symbol, type and entry label ---
1056$code.=<<___;
1057.globl aesni_ccm64_decrypt_blocks
1058.type aesni_ccm64_decrypt_blocks,\@function,6
1059.align 16
1060aesni_ccm64_decrypt_blocks:
1061___
# Win64 only: same 0x58-byte spill of %xmm6-%xmm9 as in the encrypt routine.
# NOTE(review): the "$in8" annotation on the %xmm8 slot below looks like a
# typo for "$in0" (the encrypt prologue labels the same slot "$in0") --
# confirm against the register definitions at the top of the file.
1062$code.=<<___ if ($win64);
1063 lea -0x58(%rsp),%rsp
23f6eec7
AP
1064 movaps %xmm6,(%rsp) # $iv
1065 movaps %xmm7,0x10(%rsp) # $bswap_mask
1066 movaps %xmm8,0x20(%rsp) # $in8
1067 movaps %xmm9,0x30(%rsp) # $increment
d7d119a3
AP
1068.Lccm64_dec_body:
1069___
# Load the counter, the current CMAC and the constants; keep copies of
# $rounds/$key in $rnds_/$key_ because the single-block helper invoked next
# is handed $key and $rounds directly.
1070$code.=<<___;
267b481c
AP
1071 mov 240($key),$rounds # key->rounds
1072 movups ($ivp),$iv
d7d119a3
AP
1073 movdqu ($cmac),$inout1
1074 movdqa .Lincrement64(%rip),$increment
1075 movdqa .Lbswap_mask(%rip),$bswap_mask
1076
267b481c 1077 movaps $iv,$inout0
d7d119a3
AP
1078 mov $rounds,$rnds_
1079 mov $key,$key_
267b481c 1080 pshufb $bswap_mask,$iv
d7d119a3
AP
1081___
# Encrypt the first counter block on its own (helper defined elsewhere in
# this file; presumably it operates on $inout0 by default -- confirm).
1082 &aesni_generate1("enc",$key,$rounds);
# Decrypt loop.  .Lccm64_dec_outer xors the input with the already computed
# E(counter) and stores the result; unless that was the last block, the next
# counter block and the CMAC (xor-ed with the block just produced) are then
# encrypted in parallel in .Lccm64_dec2_loop while the next input is loaded.
1083$code.=<<___;
d8ba0dc9
AP
1084 shl \$4,$rnds_
1085 mov \$16,$rounds
f8501464 1086 movups ($inp),$in0 # load inp
267b481c 1087 paddq $increment,$iv
23f6eec7 1088 lea 16($inp),$inp # $inp+=16
d8ba0dc9
AP
1089 sub %r10,%rax # twisted $rounds
1090 lea 32($key_,$rnds_),$key # end of key schedule
1091 mov %rax,%r10
267b481c
AP
1092 jmp .Lccm64_dec_outer
1093.align 16
1094.Lccm64_dec_outer:
1095 xorps $inout0,$in0 # inp ^= E(iv)
1096 movdqa $iv,$inout0
267b481c 1097 movups $in0,($out) # save output
23f6eec7 1098 lea 16($out),$out # $out+=16
9ee5916d 1099 pshufb $bswap_mask,$inout0
d7d119a3 1100
23f6eec7
AP
1101 sub \$1,$len # $len-- ($len is in blocks)
1102 jz .Lccm64_dec_break # if ($len==0) break
d7d119a3 1103
267b481c 1104 $movkey ($key_),$rndkey0
d8ba0dc9 1105 mov %r10,%rax
267b481c 1106 $movkey 16($key_),$rndkey1
f8501464 1107 xorps $rndkey0,$in0
f8501464
AP
1108 xorps $rndkey0,$inout0
1109 xorps $in0,$inout1 # cmac^=out
d8ba0dc9
AP
1110 $movkey 32($key_),$rndkey0
1111 jmp .Lccm64_dec2_loop
1112.align 16
f8501464
AP
1113.Lccm64_dec2_loop:
1114 aesenc $rndkey1,$inout0
f8501464 1115 aesenc $rndkey1,$inout1
d8ba0dc9
AP
1116 $movkey ($key,%rax),$rndkey1
1117 add \$32,%rax
f8501464 1118 aesenc $rndkey0,$inout0
f8501464 1119 aesenc $rndkey0,$inout1
d8ba0dc9 1120 $movkey -16($key,%rax),$rndkey0
f8501464 1121 jnz .Lccm64_dec2_loop
23f6eec7 1122 movups ($inp),$in0 # load input
267b481c 1123 paddq $increment,$iv
f8501464
AP
1124 aesenc $rndkey1,$inout0
1125 aesenc $rndkey1,$inout1
1126 aesenclast $rndkey0,$inout0
267b481c 1127 aesenclast $rndkey0,$inout1
23f6eec7 1128 lea 16($inp),$inp # $inp+=16
d7d119a3
AP
1129 jmp .Lccm64_dec_outer
1130
1131.align 16
1132.Lccm64_dec_break:
267b481c 1133 #xorps $in0,$inout1 # cmac^=out
d8ba0dc9 1134 mov 240($key_),$rounds
d7d119a3 1135___
# Final CMAC update for the last output block.  The helper is passed $inout1
# and $in0; presumably it folds $in0 into $inout1 before encrypting, which is
# why the explicit xorps above is commented out -- confirm in the helper.
267b481c 1136 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Store the resulting CMAC and zero the XMM registers that held key or data.
d7d119a3 1137$code.=<<___;
23f6eec7
AP
1138 pxor $rndkey0,$rndkey0 # clear register bank
1139 pxor $rndkey1,$rndkey1
1140 pxor $inout0,$inout0
1141 movups $inout1,($cmac) # store resulting mac
1142 pxor $inout1,$inout1
1143 pxor $in0,$in0
1144 pxor $iv,$iv
d7d119a3
AP
1145___
# Win64 only: reload %xmm6-%xmm9, wipe the save area with %xmm0 and release
# the 0x58 bytes of stack.
1146$code.=<<___ if ($win64);
1147 movaps (%rsp),%xmm6
23f6eec7 1148 movaps %xmm0,(%rsp) # clear stack
d7d119a3 1149 movaps 0x10(%rsp),%xmm7
23f6eec7 1150 movaps %xmm0,0x10(%rsp)
d7d119a3 1151 movaps 0x20(%rsp),%xmm8
23f6eec7 1152 movaps %xmm0,0x20(%rsp)
d7d119a3 1153 movaps 0x30(%rsp),%xmm9
23f6eec7 1154 movaps %xmm0,0x30(%rsp)
d7d119a3
AP
1155 lea 0x58(%rsp),%rsp
1156.Lccm64_dec_ret:
1157___
# Common return and symbol size for the decrypt routine.
1158$code.=<<___;
1159 ret
f8501464
AP
1160.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1161___
1162}\f
1163######################################################################
1164# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1165# size_t blocks, const AES_KEY *key,
1166# const char *ivec);
1167#
1168# Handles only complete blocks, operates on 32-bit counter and
6c79faaa 1169# does not update *ivec! (see crypto/modes/ctr128.c for details)
f8501464 1170#
6c79faaa 1171# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
b4a9d5bf 1172# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
6c79faaa
AP
1173# Keywords are full unroll and modulo-schedule counter calculations
1174# with zero-round key xor.
# Lexical scope for the aesni_ctr32_encrypt_blocks code generator.
f8501464 1175{
# $in0..$in5 map to %xmm10..%xmm15; the bulk loops load input blocks into them.
6c79faaa 1176my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
# $key0 (%ebp) receives the last 32-bit word of the round-0 key; $ctr is the
# dword form of the $ivp register and holds the byte-swapped 32-bit counter.
384e6de4 1177my ($key0,$ctr)=("%ebp","${ivp}d");
# 0x80 bytes hold the eight counter blocks kept on the stack; Win64 adds
# 160 bytes in which %xmm6-%xmm15 are saved.
6c79faaa 1178my $frame_size = 0x80 + ($win64?160:0);
f8501464
AP
1179

# Entry point.  A one-block request ($len==1) is served right here without a
# stack frame; anything else branches to .Lctr32_bulk.
1180$code.=<<___;
1181.globl aesni_ctr32_encrypt_blocks
1182.type aesni_ctr32_encrypt_blocks,\@function,5
1183.align 16
1184aesni_ctr32_encrypt_blocks:
b84460ad 1185.cfi_startproc
23f6eec7
AP
1186 cmp \$1,$len
1187 jne .Lctr32_bulk
1188
1189 # handle single block without allocating stack frame,
1190 # useful when handling edges
1191 movups ($ivp),$inout0
1192 movups ($inp),$inout1
1193 mov 240($key),%edx # key->rounds
1194___
# Single-block encryption helper (defined elsewhere in this file); presumably
# it leaves E(ivec) in $inout0 -- confirm.
1195 &aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (xor with the input, store, scrub registers),
# then open the bulk path: $key_ becomes the frame pointer, %rbp is pushed,
# the frame is reserved and %rsp is aligned down to 16 bytes.
1196$code.=<<___;
1197 pxor $rndkey0,$rndkey0 # clear register bank
1198 pxor $rndkey1,$rndkey1
1199 xorps $inout1,$inout0
1200 pxor $inout1,$inout1
1201 movups $inout0,($out)
1202 xorps $inout0,$inout0
1203 jmp .Lctr32_epilogue
1204
1205.align 16
1206.Lctr32_bulk:
384e6de4 1207 lea (%rsp),$key_ # use $key_ as frame pointer
b84460ad 1208.cfi_def_cfa_register $key_
6c79faaa 1209 push %rbp
b84460ad 1210.cfi_push %rbp
6c79faaa
AP
1211 sub \$$frame_size,%rsp
1212 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
f8501464
AP
1213___
# Win64 only: save %xmm6-%xmm15 at fixed negative offsets from the frame
# pointer; they are reloaded (and the slots wiped) at .Lctr32_done.
1214$code.=<<___ if ($win64);
384e6de4
AP
1215 movaps %xmm6,-0xa8($key_) # offload everything
1216 movaps %xmm7,-0x98($key_)
1217 movaps %xmm8,-0x88($key_)
1218 movaps %xmm9,-0x78($key_)
1219 movaps %xmm10,-0x68($key_)
1220 movaps %xmm11,-0x58($key_)
1221 movaps %xmm12,-0x48($key_)
1222 movaps %xmm13,-0x38($key_)
1223 movaps %xmm14,-0x28($key_)
1224 movaps %xmm15,-0x18($key_)
f8501464
AP
1225.Lctr32_body:
1226___
# Bulk body.  Builds eight counter blocks, already xor-ed with the round-0
# key, in the 0x80-byte stack area, then dispatches: fewer than 8 blocks go
# to .Lctr32_tail, MOVBE-without-XSAVE CPUs take the 6-block loop
# (.Lctr32_loop6), everything else takes the 8-block loop (.Lctr32_loop8).
# This heredoc ends inside .Lctr32_loop8, right after its first aesenc round.
1227$code.=<<___;
6c79faaa 1228
23f6eec7
AP
1229 # 8 16-byte words on top of stack are counter values
1230 # xor-ed with zero-round key
f8501464 1231
6c79faaa
AP
1232 movdqu ($ivp),$inout0
1233 movdqu ($key),$rndkey0
1234 mov 12($ivp),$ctr # counter LSB
1235 pxor $rndkey0,$inout0
1236 mov 12($key),$key0 # 0-round key LSB
1237 movdqa $inout0,0x00(%rsp) # populate counter block
1238 bswap $ctr
b4a9d5bf
AP
1239 movdqa $inout0,$inout1
1240 movdqa $inout0,$inout2
1241 movdqa $inout0,$inout3
6c79faaa
AP
1242 movdqa $inout0,0x40(%rsp)
1243 movdqa $inout0,0x50(%rsp)
1244 movdqa $inout0,0x60(%rsp)
23f6eec7 1245 mov %rdx,%r10 # about to borrow %rdx
6c79faaa
AP
1246 movdqa $inout0,0x70(%rsp)
1247
d8ba0dc9
AP
1248 lea 1($ctr),%rax
1249 lea 2($ctr),%rdx
1250 bswap %eax
1251 bswap %edx
1252 xor $key0,%eax
1253 xor $key0,%edx
1254 pinsrd \$3,%eax,$inout1
1255 lea 3($ctr),%rax
b4a9d5bf 1256 movdqa $inout1,0x10(%rsp)
d8ba0dc9
AP
1257 pinsrd \$3,%edx,$inout2
1258 bswap %eax
1259 mov %r10,%rdx # restore %rdx
6c79faaa 1260 lea 4($ctr),%r10
b4a9d5bf 1261 movdqa $inout2,0x20(%rsp)
d8ba0dc9 1262 xor $key0,%eax
6c79faaa 1263 bswap %r10d
d8ba0dc9 1264 pinsrd \$3,%eax,$inout3
6c79faaa 1265 xor $key0,%r10d
b4a9d5bf 1266 movdqa $inout3,0x30(%rsp)
6c79faaa
AP
1267 lea 5($ctr),%r9
1268 mov %r10d,0x40+12(%rsp)
1269 bswap %r9d
1270 lea 6($ctr),%r10
d8ba0dc9 1271 mov 240($key),$rounds # key->rounds
6c79faaa
AP
1272 xor $key0,%r9d
1273 bswap %r10d
1274 mov %r9d,0x50+12(%rsp)
1275 xor $key0,%r10d
1276 lea 7($ctr),%r9
1277 mov %r10d,0x60+12(%rsp)
1278 bswap %r9d
609b0852 1279 mov OPENSSL_ia32cap_P+4(%rip),%r10d
6c79faaa 1280 xor $key0,%r9d
5599c733 1281 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
6c79faaa
AP
1282 mov %r9d,0x70+12(%rsp)
1283
1284 $movkey 0x10($key),$rndkey1
1285
6c79faaa
AP
1286 movdqa 0x40(%rsp),$inout4
1287 movdqa 0x50(%rsp),$inout5
9282c335 1288
23f6eec7
AP
1289 cmp \$8,$len # $len is in blocks
1290 jb .Lctr32_tail # short input if ($len<8)
9282c335 1291
23f6eec7 1292 sub \$6,$len # $len is biased by -6
5599c733 1293 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
23f6eec7 1294 je .Lctr32_6x # [which denotes Atom Silvermont]
5599c733 1295
6c79faaa 1296 lea 0x80($key),$key # size optimization
23f6eec7 1297 sub \$2,$len # $len is biased by -8
9282c335 1298 jmp .Lctr32_loop8
f8501464 1299
5599c733
AP
1300.align 16
1301.Lctr32_6x:
1302 shl \$4,$rounds
1303 mov \$48,$rnds_
1304 bswap $key0
1305 lea 32($key,$rounds),$key # end of key schedule
1306 sub %rax,%r10 # twisted $rounds
1307 jmp .Lctr32_loop6
1308
1309.align 16
1310.Lctr32_loop6:
23f6eec7 1311 add \$6,$ctr # next counter value
5599c733
AP
1312 $movkey -48($key,$rnds_),$rndkey0
1313 aesenc $rndkey1,$inout0
1314 mov $ctr,%eax
1315 xor $key0,%eax
1316 aesenc $rndkey1,$inout1
23f6eec7 1317 movbe %eax,`0x00+12`(%rsp) # store next counter value
5599c733
AP
1318 lea 1($ctr),%eax
1319 aesenc $rndkey1,$inout2
1320 xor $key0,%eax
1321 movbe %eax,`0x10+12`(%rsp)
1322 aesenc $rndkey1,$inout3
1323 lea 2($ctr),%eax
1324 xor $key0,%eax
1325 aesenc $rndkey1,$inout4
1326 movbe %eax,`0x20+12`(%rsp)
1327 lea 3($ctr),%eax
1328 aesenc $rndkey1,$inout5
1329 $movkey -32($key,$rnds_),$rndkey1
1330 xor $key0,%eax
1331
1332 aesenc $rndkey0,$inout0
1333 movbe %eax,`0x30+12`(%rsp)
1334 lea 4($ctr),%eax
1335 aesenc $rndkey0,$inout1
1336 xor $key0,%eax
1337 movbe %eax,`0x40+12`(%rsp)
1338 aesenc $rndkey0,$inout2
1339 lea 5($ctr),%eax
1340 xor $key0,%eax
1341 aesenc $rndkey0,$inout3
1342 movbe %eax,`0x50+12`(%rsp)
1343 mov %r10,%rax # mov $rnds_,$rounds
1344 aesenc $rndkey0,$inout4
1345 aesenc $rndkey0,$inout5
1346 $movkey -16($key,$rnds_),$rndkey0
1347
1348 call .Lenc_loop6
1349
23f6eec7 1350 movdqu ($inp),$inout6 # load 6 input blocks
5599c733
AP
1351 movdqu 0x10($inp),$inout7
1352 movdqu 0x20($inp),$in0
1353 movdqu 0x30($inp),$in1
1354 movdqu 0x40($inp),$in2
1355 movdqu 0x50($inp),$in3
23f6eec7 1356 lea 0x60($inp),$inp # $inp+=6*16
5599c733 1357 $movkey -64($key,$rnds_),$rndkey1
23f6eec7
AP
1358 pxor $inout0,$inout6 # inp^=E(ctr)
1359 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
5599c733
AP
1360 pxor $inout1,$inout7
1361 movaps 0x10(%rsp),$inout1
1362 pxor $inout2,$in0
1363 movaps 0x20(%rsp),$inout2
1364 pxor $inout3,$in1
1365 movaps 0x30(%rsp),$inout3
1366 pxor $inout4,$in2
1367 movaps 0x40(%rsp),$inout4
1368 pxor $inout5,$in3
1369 movaps 0x50(%rsp),$inout5
23f6eec7 1370 movdqu $inout6,($out) # store 6 output blocks
5599c733
AP
1371 movdqu $inout7,0x10($out)
1372 movdqu $in0,0x20($out)
1373 movdqu $in1,0x30($out)
1374 movdqu $in2,0x40($out)
1375 movdqu $in3,0x50($out)
23f6eec7
AP
1376 lea 0x60($out),$out # $out+=6*16
1377
5599c733 1378 sub \$6,$len
23f6eec7 1379 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
5599c733 1380
23f6eec7
AP
1381 add \$6,$len # restore real remaining $len
1382 jz .Lctr32_done # done if ($len==0)
5599c733
AP
1383
1384 lea -48($rnds_),$rounds
1385 lea -80($key,$rnds_),$key # restore $key
1386 neg $rounds
1387 shr \$4,$rounds # restore $rounds
1388 jmp .Lctr32_tail
1389
6c79faaa 1390.align 32
9282c335 1391.Lctr32_loop8:
23f6eec7 1392 add \$8,$ctr # next counter value
6c79faaa
AP
1393 movdqa 0x60(%rsp),$inout6
1394 aesenc $rndkey1,$inout0
1395 mov $ctr,%r9d
1396 movdqa 0x70(%rsp),$inout7
1397 aesenc $rndkey1,$inout1
1398 bswap %r9d
1399 $movkey 0x20-0x80($key),$rndkey0
1400 aesenc $rndkey1,$inout2
1401 xor $key0,%r9d
d8ba0dc9 1402 nop
6c79faaa 1403 aesenc $rndkey1,$inout3
23f6eec7 1404 mov %r9d,0x00+12(%rsp) # store next counter value
6c79faaa
AP
1405 lea 1($ctr),%r9
1406 aesenc $rndkey1,$inout4
1407 aesenc $rndkey1,$inout5
1408 aesenc $rndkey1,$inout6
1409 aesenc $rndkey1,$inout7
1410 $movkey 0x30-0x80($key),$rndkey1
1411___
# Unrolled rounds for $i = 2..7 of .Lctr32_loop8.  Even $i uses $rndkey0, odd
# $i uses $rndkey1.  Interleaved with the aesenc's, each round finishes the
# counter value started by the previous round (bswap, xor with $key0), stores
# it in stack slot $i-1 and starts computing counter+$i in %r9.
1412for($i=2;$i<8;$i++) {
1413my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1414$code.=<<___;
d8ba0dc9 1415 bswap %r9d
6c79faaa
AP
1416 aesenc $rndkeyx,$inout0
1417 aesenc $rndkeyx,$inout1
6c79faaa 1418 xor $key0,%r9d
d8ba0dc9
AP
1419 .byte 0x66,0x90
1420 aesenc $rndkeyx,$inout2
6c79faaa
AP
1421 aesenc $rndkeyx,$inout3
1422 mov %r9d,`0x10*($i-1)`+12(%rsp)
1423 lea $i($ctr),%r9
1424 aesenc $rndkeyx,$inout4
1425 aesenc $rndkeyx,$inout5
1426 aesenc $rndkeyx,$inout6
1427 aesenc $rndkeyx,$inout7
1428 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1429___
1430}
# Rest of .Lctr32_loop8: store the last counter slot, run the extra rounds
# selected by comparing $rounds with 11, then in .Lctr32_enc_done fold the
# input blocks (pre-xor-ed with the last round key) into aesenclast, store 8
# output blocks and reload the next counters.  After the loop come the
# .Lctr32_tail paths for the remaining 1-7 blocks (.Lctr32_loop3 for fewer
# than 4, .Lctr32_loop4 for exactly 4, the .Lenc_loop8_enter call for 5-7)
# and the start of the register scrub at .Lctr32_done.
1431$code.=<<___;
d8ba0dc9 1432 bswap %r9d
6c79faaa
AP
1433 aesenc $rndkey0,$inout0
1434 aesenc $rndkey0,$inout1
6c79faaa
AP
1435 aesenc $rndkey0,$inout2
1436 xor $key0,%r9d
23f6eec7 1437 movdqu 0x00($inp),$in0 # start loading input
6c79faaa
AP
1438 aesenc $rndkey0,$inout3
1439 mov %r9d,0x70+12(%rsp)
d8ba0dc9 1440 cmp \$11,$rounds
6c79faaa
AP
1441 aesenc $rndkey0,$inout4
1442 aesenc $rndkey0,$inout5
1443 aesenc $rndkey0,$inout6
6c79faaa
AP
1444 aesenc $rndkey0,$inout7
1445 $movkey 0xa0-0x80($key),$rndkey0
1446
6c79faaa
AP
1447 jb .Lctr32_enc_done
1448
1449 aesenc $rndkey1,$inout0
1450 aesenc $rndkey1,$inout1
1451 aesenc $rndkey1,$inout2
1452 aesenc $rndkey1,$inout3
1453 aesenc $rndkey1,$inout4
1454 aesenc $rndkey1,$inout5
1455 aesenc $rndkey1,$inout6
1456 aesenc $rndkey1,$inout7
1457 $movkey 0xb0-0x80($key),$rndkey1
1bc4d009
AP
1458
1459 aesenc $rndkey0,$inout0
1460 aesenc $rndkey0,$inout1
1bc4d009 1461 aesenc $rndkey0,$inout2
1bc4d009 1462 aesenc $rndkey0,$inout3
1bc4d009 1463 aesenc $rndkey0,$inout4
1bc4d009 1464 aesenc $rndkey0,$inout5
1bc4d009 1465 aesenc $rndkey0,$inout6
1bc4d009 1466 aesenc $rndkey0,$inout7
6c79faaa
AP
1467 $movkey 0xc0-0x80($key),$rndkey0
1468 je .Lctr32_enc_done
9282c335 1469
1bc4d009
AP
1470 aesenc $rndkey1,$inout0
1471 aesenc $rndkey1,$inout1
1bc4d009
AP
1472 aesenc $rndkey1,$inout2
1473 aesenc $rndkey1,$inout3
1474 aesenc $rndkey1,$inout4
1475 aesenc $rndkey1,$inout5
1476 aesenc $rndkey1,$inout6
1477 aesenc $rndkey1,$inout7
6c79faaa 1478 $movkey 0xd0-0x80($key),$rndkey1
9282c335 1479
1bc4d009
AP
1480 aesenc $rndkey0,$inout0
1481 aesenc $rndkey0,$inout1
1bc4d009
AP
1482 aesenc $rndkey0,$inout2
1483 aesenc $rndkey0,$inout3
1484 aesenc $rndkey0,$inout4
1485 aesenc $rndkey0,$inout5
1486 aesenc $rndkey0,$inout6
1487 aesenc $rndkey0,$inout7
6c79faaa 1488 $movkey 0xe0-0x80($key),$rndkey0
d8ba0dc9 1489 jmp .Lctr32_enc_done
1bc4d009 1490
d8ba0dc9 1491.align 16
6c79faaa 1492.Lctr32_enc_done:
6c79faaa 1493 movdqu 0x10($inp),$in1
23f6eec7 1494 pxor $rndkey0,$in0 # input^=round[last]
6c79faaa 1495 movdqu 0x20($inp),$in2
1bc4d009 1496 pxor $rndkey0,$in1
6c79faaa 1497 movdqu 0x30($inp),$in3
1bc4d009 1498 pxor $rndkey0,$in2
6c79faaa 1499 movdqu 0x40($inp),$in4
1bc4d009 1500 pxor $rndkey0,$in3
6c79faaa
AP
1501 movdqu 0x50($inp),$in5
1502 pxor $rndkey0,$in4
6c79faaa 1503 pxor $rndkey0,$in5
d8ba0dc9 1504 aesenc $rndkey1,$inout0
cd54249c
AP
1505 aesenc $rndkey1,$inout1
1506 aesenc $rndkey1,$inout2
1507 aesenc $rndkey1,$inout3
1508 aesenc $rndkey1,$inout4
1509 aesenc $rndkey1,$inout5
1bc4d009
AP
1510 aesenc $rndkey1,$inout6
1511 aesenc $rndkey1,$inout7
23f6eec7
AP
1512 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1513 lea 0x80($inp),$inp # $inp+=8*16
6c79faaa 1514
23f6eec7
AP
1515 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1516 pxor $rndkey0,$rndkey1 # borrowed $rndkey
d8ba0dc9 1517 movdqu 0x70-0x80($inp),$in0
1bc4d009 1518 aesenclast $in1,$inout1
1bc4d009 1519 pxor $rndkey0,$in0
6c79faaa 1520 movdqa 0x00(%rsp),$in1 # load next counter block
1bc4d009 1521 aesenclast $in2,$inout2
1bc4d009 1522 aesenclast $in3,$inout3
d8ba0dc9 1523 movdqa 0x10(%rsp),$in2
6c79faaa
AP
1524 movdqa 0x20(%rsp),$in3
1525 aesenclast $in4,$inout4
6c79faaa 1526 aesenclast $in5,$inout5
d8ba0dc9 1527 movdqa 0x30(%rsp),$in4
6c79faaa
AP
1528 movdqa 0x40(%rsp),$in5
1529 aesenclast $rndkey1,$inout6
1530 movdqa 0x50(%rsp),$rndkey0
23f6eec7 1531 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
d8ba0dc9 1532 aesenclast $in0,$inout7
1bc4d009 1533
23f6eec7 1534 movups $inout0,($out) # store 8 output blocks
6c79faaa 1535 movdqa $in1,$inout0
9282c335 1536 movups $inout1,0x10($out)
6c79faaa 1537 movdqa $in2,$inout1
9282c335 1538 movups $inout2,0x20($out)
6c79faaa 1539 movdqa $in3,$inout2
9282c335 1540 movups $inout3,0x30($out)
6c79faaa 1541 movdqa $in4,$inout3
9282c335 1542 movups $inout4,0x40($out)
6c79faaa 1543 movdqa $in5,$inout4
9282c335 1544 movups $inout5,0x50($out)
1bc4d009 1545 movdqa $rndkey0,$inout5
9282c335
AP
1546 movups $inout6,0x60($out)
1547 movups $inout7,0x70($out)
23f6eec7
AP
1548 lea 0x80($out),$out # $out+=8*16
1549
9282c335 1550 sub \$8,$len
23f6eec7 1551 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
f8501464 1552
46f4e1be 1553 add \$8,$len # restore real remaining $len
23f6eec7 1554 jz .Lctr32_done # done if ($len==0)
6c79faaa 1555 lea -0x80($key),$key
f8501464
AP
1556
1557.Lctr32_tail:
23f6eec7 1558 # note that at this point $inout0..5 are populated with
609b0852 1559 # counter values xor-ed with 0-round key
6c79faaa 1560 lea 16($key),$key
f8501464 1561 cmp \$4,$len
b4a9d5bf
AP
1562 jb .Lctr32_loop3
1563 je .Lctr32_loop4
f8501464 1564
23f6eec7 1565 # if ($len>4) compute 7 E(counter)
d8ba0dc9 1566 shl \$4,$rounds
6c79faaa 1567 movdqa 0x60(%rsp),$inout6
b4a9d5bf 1568 pxor $inout7,$inout7
f8501464 1569
6c79faaa
AP
1570 $movkey 16($key),$rndkey0
1571 aesenc $rndkey1,$inout0
6c79faaa 1572 aesenc $rndkey1,$inout1
23f6eec7 1573 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
d8ba0dc9 1574 neg %rax
6c79faaa 1575 aesenc $rndkey1,$inout2
23f6eec7 1576 add \$16,%rax # prepare for .Lenc_loop8_enter
b4a9d5bf 1577 movups ($inp),$in0
d8ba0dc9 1578 aesenc $rndkey1,$inout3
6c79faaa 1579 aesenc $rndkey1,$inout4
23f6eec7 1580 movups 0x10($inp),$in1 # pre-load input
b4a9d5bf 1581 movups 0x20($inp),$in2
d8ba0dc9 1582 aesenc $rndkey1,$inout5
6c79faaa 1583 aesenc $rndkey1,$inout6
f8501464 1584
6c79faaa 1585 call .Lenc_loop8_enter
f8501464 1586
73325b22
AP
1587 movdqu 0x30($inp),$in3
1588 pxor $in0,$inout0
1589 movdqu 0x40($inp),$in0
1590 pxor $in1,$inout1
23f6eec7 1591 movdqu $inout0,($out) # store output
73325b22
AP
1592 pxor $in2,$inout2
1593 movdqu $inout1,0x10($out)
1594 pxor $in3,$inout3
1595 movdqu $inout2,0x20($out)
1596 pxor $in0,$inout4
1597 movdqu $inout3,0x30($out)
1598 movdqu $inout4,0x40($out)
6c79faaa 1599 cmp \$6,$len
23f6eec7 1600 jb .Lctr32_done # $len was 5, stop store
9282c335 1601
6c79faaa
AP
1602 movups 0x50($inp),$in1
1603 xorps $in1,$inout5
1604 movups $inout5,0x50($out)
23f6eec7 1605 je .Lctr32_done # $len was 6, stop store
9282c335 1606
6c79faaa
AP
1607 movups 0x60($inp),$in2
1608 xorps $in2,$inout6
1609 movups $inout6,0x60($out)
23f6eec7 1610 jmp .Lctr32_done # $len was 7, stop store
f8501464 1611
6c79faaa
AP
1612.align 32
1613.Lctr32_loop4:
1614 aesenc $rndkey1,$inout0
1615 lea 16($key),$key
d8ba0dc9 1616 dec $rounds
6c79faaa
AP
1617 aesenc $rndkey1,$inout1
1618 aesenc $rndkey1,$inout2
1619 aesenc $rndkey1,$inout3
1620 $movkey ($key),$rndkey1
6c79faaa
AP
1621 jnz .Lctr32_loop4
1622 aesenclast $rndkey1,$inout0
1623 aesenclast $rndkey1,$inout1
23f6eec7 1624 movups ($inp),$in0 # load input
b4a9d5bf 1625 movups 0x10($inp),$in1
6c79faaa
AP
1626 aesenclast $rndkey1,$inout2
1627 aesenclast $rndkey1,$inout3
d8ba0dc9 1628 movups 0x20($inp),$in2
b4a9d5bf
AP
1629 movups 0x30($inp),$in3
1630
1631 xorps $in0,$inout0
23f6eec7 1632 movups $inout0,($out) # store output
b4a9d5bf
AP
1633 xorps $in1,$inout1
1634 movups $inout1,0x10($out)
73325b22
AP
1635 pxor $in2,$inout2
1636 movdqu $inout2,0x20($out)
1637 pxor $in3,$inout3
1638 movdqu $inout3,0x30($out)
23f6eec7 1639 jmp .Lctr32_done # $len was 4, stop store
b4a9d5bf
AP
1640
1641.align 32
1642.Lctr32_loop3:
1643 aesenc $rndkey1,$inout0
1644 lea 16($key),$key
d8ba0dc9 1645 dec $rounds
b4a9d5bf
AP
1646 aesenc $rndkey1,$inout1
1647 aesenc $rndkey1,$inout2
1648 $movkey ($key),$rndkey1
b4a9d5bf
AP
1649 jnz .Lctr32_loop3
1650 aesenclast $rndkey1,$inout0
1651 aesenclast $rndkey1,$inout1
1652 aesenclast $rndkey1,$inout2
6c79faaa 1653
23f6eec7 1654 movups ($inp),$in0 # load input
9282c335 1655 xorps $in0,$inout0
23f6eec7 1656 movups $inout0,($out) # store output
6c79faaa 1657 cmp \$2,$len
23f6eec7 1658 jb .Lctr32_done # $len was 1, stop store
f8501464 1659
6c79faaa 1660 movups 0x10($inp),$in1
9282c335 1661 xorps $in1,$inout1
9282c335 1662 movups $inout1,0x10($out)
23f6eec7 1663 je .Lctr32_done # $len was 2, stop store
f8501464 1664
6c79faaa 1665 movups 0x20($inp),$in2
9282c335 1666 xorps $in2,$inout2
23f6eec7 1667 movups $inout2,0x20($out) # $len was 3, stop store
9282c335 1668
f8501464 1669.Lctr32_done:
46f4e1be 1670 xorps %xmm0,%xmm0 # clear register bank
23f6eec7
AP
1671 xor $key0,$key0
1672 pxor %xmm1,%xmm1
1673 pxor %xmm2,%xmm2
1674 pxor %xmm3,%xmm3
1675 pxor %xmm4,%xmm4
1676 pxor %xmm5,%xmm5
1677___
# Non-Win64: %xmm6-%xmm15 are simply zeroed, interleaved with wiping the
# 0x80-byte counter area on the stack using the already-zero %xmm0.
1678$code.=<<___ if (!$win64);
1679 pxor %xmm6,%xmm6
1680 pxor %xmm7,%xmm7
1681 movaps %xmm0,0x00(%rsp) # clear stack
1682 pxor %xmm8,%xmm8
1683 movaps %xmm0,0x10(%rsp)
1684 pxor %xmm9,%xmm9
1685 movaps %xmm0,0x20(%rsp)
1686 pxor %xmm10,%xmm10
1687 movaps %xmm0,0x30(%rsp)
1688 pxor %xmm11,%xmm11
1689 movaps %xmm0,0x40(%rsp)
1690 pxor %xmm12,%xmm12
1691 movaps %xmm0,0x50(%rsp)
1692 pxor %xmm13,%xmm13
1693 movaps %xmm0,0x60(%rsp)
1694 pxor %xmm14,%xmm14
1695 movaps %xmm0,0x70(%rsp)
1696 pxor %xmm15,%xmm15
f8501464
AP
1697___
# Win64: reload %xmm6-%xmm15 from the slots filled in the prologue, overwrite
# each slot with %xmm0, then wipe the 0x80-byte counter area as well.
1698$code.=<<___ if ($win64);
384e6de4
AP
1699 movaps -0xa8($key_),%xmm6
1700 movaps %xmm0,-0xa8($key_) # clear stack
1701 movaps -0x98($key_),%xmm7
1702 movaps %xmm0,-0x98($key_)
1703 movaps -0x88($key_),%xmm8
1704 movaps %xmm0,-0x88($key_)
1705 movaps -0x78($key_),%xmm9
1706 movaps %xmm0,-0x78($key_)
1707 movaps -0x68($key_),%xmm10
1708 movaps %xmm0,-0x68($key_)
1709 movaps -0x58($key_),%xmm11
1710 movaps %xmm0,-0x58($key_)
1711 movaps -0x48($key_),%xmm12
1712 movaps %xmm0,-0x48($key_)
1713 movaps -0x38($key_),%xmm13
1714 movaps %xmm0,-0x38($key_)
1715 movaps -0x28($key_),%xmm14
1716 movaps %xmm0,-0x28($key_)
1717 movaps -0x18($key_),%xmm15
1718 movaps %xmm0,-0x18($key_)
23f6eec7
AP
1719 movaps %xmm0,0x00(%rsp)
1720 movaps %xmm0,0x10(%rsp)
1721 movaps %xmm0,0x20(%rsp)
1722 movaps %xmm0,0x30(%rsp)
1723 movaps %xmm0,0x40(%rsp)
1724 movaps %xmm0,0x50(%rsp)
1725 movaps %xmm0,0x60(%rsp)
1726 movaps %xmm0,0x70(%rsp)
f8501464
AP
1727___
# Epilogue: reload the %rbp pushed just below the frame pointer, restore %rsp
# from $key_ and return.  The single-block fast path joins at
# .Lctr32_epilogue, after the frame teardown it never needed.
1728$code.=<<___;
384e6de4 1729 mov -8($key_),%rbp
b84460ad 1730.cfi_restore %rbp
384e6de4 1731 lea ($key_),%rsp
b84460ad 1732.cfi_def_cfa_register %rsp
6c79faaa 1733.Lctr32_epilogue:
f8501464 1734 ret
b84460ad 1735.cfi_endproc
f8501464
AP
1736.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1737___
1738}
1739\f
1740######################################################################
1741# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1742# const AES_KEY *key1, const AES_KEY *key2,
1743# const unsigned char iv[16]);
1744#
1745{
1746my @tweak=map("%xmm$_",(10..15));
1747my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1748my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
36df342f 1749my $frame_size = 0x70 + ($win64?160:0);
384e6de4 1750my $key_ = "%rbp"; # override so that we can use %r11 as FP
f8501464
AP
1751
1752$code.=<<___;
1753.globl aesni_xts_encrypt
1754.type aesni_xts_encrypt,\@function,6
1755.align 16
1756aesni_xts_encrypt:
b84460ad 1757.cfi_startproc
384e6de4 1758 lea (%rsp),%r11 # frame pointer
b84460ad 1759.cfi_def_cfa_register %r11
6a40ebe8 1760 push %rbp
b84460ad 1761.cfi_push %rbp
6a40ebe8
AP
1762 sub \$$frame_size,%rsp
1763 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
f8501464
AP
1764___
1765$code.=<<___ if ($win64);
384e6de4
AP
1766 movaps %xmm6,-0xa8(%r11) # offload everything
1767 movaps %xmm7,-0x98(%r11)
1768 movaps %xmm8,-0x88(%r11)
1769 movaps %xmm9,-0x78(%r11)
1770 movaps %xmm10,-0x68(%r11)
1771 movaps %xmm11,-0x58(%r11)
1772 movaps %xmm12,-0x48(%r11)
1773 movaps %xmm13,-0x38(%r11)
1774 movaps %xmm14,-0x28(%r11)
1775 movaps %xmm15,-0x18(%r11)
f8501464
AP
1776.Lxts_enc_body:
1777___
1778$code.=<<___;
d8ba0dc9 1779 movups ($ivp),$inout0 # load clear-text tweak
f8501464
AP
1780 mov 240(%r8),$rounds # key2->rounds
1781 mov 240($key),$rnds_ # key1->rounds
1782___
1783 # generate the tweak
d8ba0dc9 1784 &aesni_generate1("enc",$key2,$rounds,$inout0);
f8501464 1785$code.=<<___;
36df342f 1786 $movkey ($key),$rndkey0 # zero round key
f8501464
AP
1787 mov $key,$key_ # backup $key
1788 mov $rnds_,$rounds # backup $rounds
36df342f 1789 shl \$4,$rnds_
f8501464
AP
1790 mov $len,$len_ # backup $len
1791 and \$-16,$len
1792
36df342f 1793 $movkey 16($key,$rnds_),$rndkey1 # last round key
36df342f 1794
f8501464 1795 movdqa .Lxts_magic(%rip),$twmask
d8ba0dc9
AP
1796 movdqa $inout0,@tweak[5]
1797 pshufd \$0x5f,$inout0,$twres
36df342f 1798 pxor $rndkey0,$rndkey1
f8501464 1799___
36df342f
AP
1800 # alternative tweak calculation algorithm is based on suggestions
1801 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1802 # and should help in the future...
f8501464
AP
1803 for ($i=0;$i<4;$i++) {
1804 $code.=<<___;
36df342f
AP
1805 movdqa $twres,$twtmp
1806 paddd $twres,$twres
f8501464 1807 movdqa @tweak[5],@tweak[$i]
36df342f
AP
1808 psrad \$31,$twtmp # broadcast upper bits
1809 paddq @tweak[5],@tweak[5]
1810 pand $twmask,$twtmp
1811 pxor $rndkey0,@tweak[$i]
1812 pxor $twtmp,@tweak[5]
f8501464
AP
1813___
1814 }
1815$code.=<<___;
36df342f
AP
1816 movdqa @tweak[5],@tweak[4]
1817 psrad \$31,$twres
1818 paddq @tweak[5],@tweak[5]
1819 pand $twmask,$twres
1820 pxor $rndkey0,@tweak[4]
1821 pxor $twres,@tweak[5]
1822 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1823
f8501464 1824 sub \$16*6,$len
23f6eec7 1825 jc .Lxts_enc_short # if $len-=6*16 borrowed
f8501464 1826
d8ba0dc9
AP
1827 mov \$16+96,$rounds
1828 lea 32($key_,$rnds_),$key # end of key schedule
1829 sub %r10,%rax # twisted $rounds
36df342f 1830 $movkey 16($key_),$rndkey1
d8ba0dc9 1831 mov %rax,%r10 # backup twisted $rounds
36df342f 1832 lea .Lxts_magic(%rip),%r8
f8501464
AP
1833 jmp .Lxts_enc_grandloop
1834
36df342f 1835.align 32
f8501464 1836.Lxts_enc_grandloop:
f8501464 1837 movdqu `16*0`($inp),$inout0 # load input
36df342f 1838 movdqa $rndkey0,$twmask
f8501464 1839 movdqu `16*1`($inp),$inout1
23f6eec7 1840 pxor @tweak[0],$inout0 # input^=tweak^round[0]
f8501464 1841 movdqu `16*2`($inp),$inout2
f8501464 1842 pxor @tweak[1],$inout1
36df342f
AP
1843 aesenc $rndkey1,$inout0
1844 movdqu `16*3`($inp),$inout3
f8501464 1845 pxor @tweak[2],$inout2
36df342f
AP
1846 aesenc $rndkey1,$inout1
1847 movdqu `16*4`($inp),$inout4
f8501464 1848 pxor @tweak[3],$inout3
36df342f
AP
1849 aesenc $rndkey1,$inout2
1850 movdqu `16*5`($inp),$inout5
1851 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1852 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
f8501464 1853 pxor @tweak[4],$inout4
36df342f
AP
1854 aesenc $rndkey1,$inout3
1855 $movkey 32($key_),$rndkey0
1856 lea `16*6`($inp),$inp
1857 pxor $twmask,$inout5
f8501464 1858
46f4e1be 1859 pxor $twres,@tweak[0] # calculate tweaks^round[last]
f8501464 1860 aesenc $rndkey1,$inout4
36df342f 1861 pxor $twres,@tweak[1]
23f6eec7 1862 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
f8501464 1863 aesenc $rndkey1,$inout5
36df342f 1864 $movkey 48($key_),$rndkey1
d8ba0dc9 1865 pxor $twres,@tweak[2]
f8501464 1866
36df342f 1867 aesenc $rndkey0,$inout0
d8ba0dc9 1868 pxor $twres,@tweak[3]
36df342f
AP
1869 movdqa @tweak[1],`16*1`(%rsp)
1870 aesenc $rndkey0,$inout1
d8ba0dc9 1871 pxor $twres,@tweak[4]
36df342f
AP
1872 movdqa @tweak[2],`16*2`(%rsp)
1873 aesenc $rndkey0,$inout2
36df342f
AP
1874 aesenc $rndkey0,$inout3
1875 pxor $twres,$twmask
1876 movdqa @tweak[4],`16*4`(%rsp)
1877 aesenc $rndkey0,$inout4
36df342f
AP
1878 aesenc $rndkey0,$inout5
1879 $movkey 64($key_),$rndkey0
d8ba0dc9 1880 movdqa $twmask,`16*5`(%rsp)
36df342f
AP
1881 pshufd \$0x5f,@tweak[5],$twres
1882 jmp .Lxts_enc_loop6
1883.align 32
f8501464
AP
1884.Lxts_enc_loop6:
1885 aesenc $rndkey1,$inout0
1886 aesenc $rndkey1,$inout1
f8501464
AP
1887 aesenc $rndkey1,$inout2
1888 aesenc $rndkey1,$inout3
1889 aesenc $rndkey1,$inout4
1890 aesenc $rndkey1,$inout5
d8ba0dc9
AP
1891 $movkey -64($key,%rax),$rndkey1
1892 add \$32,%rax
36df342f 1893
f8501464
AP
1894 aesenc $rndkey0,$inout0
1895 aesenc $rndkey0,$inout1
f8501464
AP
1896 aesenc $rndkey0,$inout2
1897 aesenc $rndkey0,$inout3
1898 aesenc $rndkey0,$inout4
1899 aesenc $rndkey0,$inout5
d8ba0dc9 1900 $movkey -80($key,%rax),$rndkey0
f8501464
AP
1901 jnz .Lxts_enc_loop6
1902
23f6eec7 1903 movdqa (%r8),$twmask # start calculating next tweak
36df342f
AP
1904 movdqa $twres,$twtmp
1905 paddd $twres,$twres
f8501464 1906 aesenc $rndkey1,$inout0
36df342f
AP
1907 paddq @tweak[5],@tweak[5]
1908 psrad \$31,$twtmp
f8501464 1909 aesenc $rndkey1,$inout1
36df342f
AP
1910 pand $twmask,$twtmp
1911 $movkey ($key_),@tweak[0] # load round[0]
f8501464 1912 aesenc $rndkey1,$inout2
f8501464
AP
1913 aesenc $rndkey1,$inout3
1914 aesenc $rndkey1,$inout4
d8ba0dc9 1915 pxor $twtmp,@tweak[5]
36df342f 1916 movaps @tweak[0],@tweak[1] # copy round[0]
f8501464 1917 aesenc $rndkey1,$inout5
d8ba0dc9 1918 $movkey -64($key),$rndkey1
f8501464 1919
36df342f 1920 movdqa $twres,$twtmp
f8501464 1921 aesenc $rndkey0,$inout0
d8ba0dc9 1922 paddd $twres,$twres
36df342f 1923 pxor @tweak[5],@tweak[0]
f8501464 1924 aesenc $rndkey0,$inout1
d8ba0dc9 1925 psrad \$31,$twtmp
36df342f 1926 paddq @tweak[5],@tweak[5]
f8501464 1927 aesenc $rndkey0,$inout2
f8501464 1928 aesenc $rndkey0,$inout3
d8ba0dc9 1929 pand $twmask,$twtmp
36df342f 1930 movaps @tweak[1],@tweak[2]
d8ba0dc9
AP
1931 aesenc $rndkey0,$inout4
1932 pxor $twtmp,@tweak[5]
1933 movdqa $twres,$twtmp
f8501464 1934 aesenc $rndkey0,$inout5
d8ba0dc9 1935 $movkey -48($key),$rndkey0
f8501464 1936
36df342f 1937 paddd $twres,$twres
f8501464 1938 aesenc $rndkey1,$inout0
36df342f
AP
1939 pxor @tweak[5],@tweak[1]
1940 psrad \$31,$twtmp
f8501464 1941 aesenc $rndkey1,$inout1
36df342f
AP
1942 paddq @tweak[5],@tweak[5]
1943 pand $twmask,$twtmp
f8501464 1944 aesenc $rndkey1,$inout2
f8501464 1945 aesenc $rndkey1,$inout3
d8ba0dc9 1946 movdqa @tweak[3],`16*3`(%rsp)
36df342f 1947 pxor $twtmp,@tweak[5]
f8501464 1948 aesenc $rndkey1,$inout4
36df342f 1949 movaps @tweak[2],@tweak[3]
d8ba0dc9 1950 movdqa $twres,$twtmp
f8501464 1951 aesenc $rndkey1,$inout5
d8ba0dc9 1952 $movkey -32($key),$rndkey1
f8501464 1953
36df342f
AP
1954 paddd $twres,$twres
1955 aesenc $rndkey0,$inout0
1956 pxor @tweak[5],@tweak[2]
1957 psrad \$31,$twtmp
1958 aesenc $rndkey0,$inout1
1959 paddq @tweak[5],@tweak[5]
1960 pand $twmask,$twtmp
1961 aesenc $rndkey0,$inout2
1962 aesenc $rndkey0,$inout3
36df342f 1963 aesenc $rndkey0,$inout4
d8ba0dc9 1964 pxor $twtmp,@tweak[5]
36df342f
AP
1965 movaps @tweak[3],@tweak[4]
1966 aesenc $rndkey0,$inout5
1967
1968 movdqa $twres,$rndkey0
1969 paddd $twres,$twres
1970 aesenc $rndkey1,$inout0
1971 pxor @tweak[5],@tweak[3]
1972 psrad \$31,$rndkey0
1973 aesenc $rndkey1,$inout1
1974 paddq @tweak[5],@tweak[5]
1975 pand $twmask,$rndkey0
1976 aesenc $rndkey1,$inout2
1977 aesenc $rndkey1,$inout3
1978 pxor $rndkey0,@tweak[5]
1979 $movkey ($key_),$rndkey0
1980 aesenc $rndkey1,$inout4
1981 aesenc $rndkey1,$inout5
1982 $movkey 16($key_),$rndkey1
1983
1984 pxor @tweak[5],@tweak[4]
36df342f 1985 aesenclast `16*0`(%rsp),$inout0
d8ba0dc9 1986 psrad \$31,$twres
36df342f 1987 paddq @tweak[5],@tweak[5]
36df342f
AP
1988 aesenclast `16*1`(%rsp),$inout1
1989 aesenclast `16*2`(%rsp),$inout2
d8ba0dc9
AP
1990 pand $twmask,$twres
1991 mov %r10,%rax # restore $rounds
36df342f
AP
1992 aesenclast `16*3`(%rsp),$inout3
1993 aesenclast `16*4`(%rsp),$inout4
1994 aesenclast `16*5`(%rsp),$inout5
d8ba0dc9 1995 pxor $twres,@tweak[5]
f8501464 1996
23f6eec7
AP
1997 lea `16*6`($out),$out # $out+=6*16
1998 movups $inout0,`-16*6`($out) # store 6 output blocks
36df342f
AP
1999 movups $inout1,`-16*5`($out)
2000 movups $inout2,`-16*4`($out)
2001 movups $inout3,`-16*3`($out)
2002 movups $inout4,`-16*2`($out)
2003 movups $inout5,`-16*1`($out)
f8501464 2004 sub \$16*6,$len
23f6eec7 2005 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
f8501464 2006
d8ba0dc9
AP
2007 mov \$16+96,$rounds
2008 sub $rnds_,$rounds
f8501464 2009 mov $key_,$key # restore $key
d8ba0dc9 2010 shr \$4,$rounds # restore original value
f8501464
AP
2011
2012.Lxts_enc_short:
23f6eec7 2013 # at this point @tweak[0..5] are populated with tweak values
d8ba0dc9 2014 mov $rounds,$rnds_ # backup $rounds
36df342f 2015 pxor $rndkey0,@tweak[0]
23f6eec7
AP
2016 add \$16*6,$len # restore real remaining $len
2017 jz .Lxts_enc_done # done if ($len==0)
f8501464 2018
36df342f 2019 pxor $rndkey0,@tweak[1]
f8501464 2020 cmp \$0x20,$len
23f6eec7 2021 jb .Lxts_enc_one # $len is 1*16
36df342f 2022 pxor $rndkey0,@tweak[2]
23f6eec7 2023 je .Lxts_enc_two # $len is 2*16
f8501464 2024
36df342f 2025 pxor $rndkey0,@tweak[3]
f8501464 2026 cmp \$0x40,$len
23f6eec7 2027 jb .Lxts_enc_three # $len is 3*16
36df342f 2028 pxor $rndkey0,@tweak[4]
23f6eec7 2029 je .Lxts_enc_four # $len is 4*16
f8501464 2030
23f6eec7 2031 movdqu ($inp),$inout0 # $len is 5*16
36df342f 2032 movdqu 16*1($inp),$inout1
f8501464
AP
2033 movdqu 16*2($inp),$inout2
2034 pxor @tweak[0],$inout0
2035 movdqu 16*3($inp),$inout3
2036 pxor @tweak[1],$inout1
2037 movdqu 16*4($inp),$inout4
23f6eec7 2038 lea 16*5($inp),$inp # $inp+=5*16
f8501464
AP
2039 pxor @tweak[2],$inout2
2040 pxor @tweak[3],$inout3
2041 pxor @tweak[4],$inout4
23f6eec7 2042 pxor $inout5,$inout5
f8501464
AP
2043
2044 call _aesni_encrypt6
2045
2046 xorps @tweak[0],$inout0
2047 movdqa @tweak[5],@tweak[0]
2048 xorps @tweak[1],$inout1
2049 xorps @tweak[2],$inout2
23f6eec7 2050 movdqu $inout0,($out) # store 5 output blocks
f8501464
AP
2051 xorps @tweak[3],$inout3
2052 movdqu $inout1,16*1($out)
2053 xorps @tweak[4],$inout4
2054 movdqu $inout2,16*2($out)
2055 movdqu $inout3,16*3($out)
2056 movdqu $inout4,16*4($out)
23f6eec7 2057 lea 16*5($out),$out # $out+=5*16
f8501464
AP
2058 jmp .Lxts_enc_done
2059
2060.align 16
2061.Lxts_enc_one:
2062 movups ($inp),$inout0
23f6eec7 2063 lea 16*1($inp),$inp # inp+=1*16
f8501464
AP
2064 xorps @tweak[0],$inout0
2065___
2066 &aesni_generate1("enc",$key,$rounds);
2067$code.=<<___;
2068 xorps @tweak[0],$inout0
2069 movdqa @tweak[1],@tweak[0]
23f6eec7
AP
2070 movups $inout0,($out) # store one output block
2071 lea 16*1($out),$out # $out+=1*16
f8501464
AP
2072 jmp .Lxts_enc_done
2073
2074.align 16
2075.Lxts_enc_two:
2076 movups ($inp),$inout0
2077 movups 16($inp),$inout1
23f6eec7 2078 lea 32($inp),$inp # $inp+=2*16
f8501464
AP
2079 xorps @tweak[0],$inout0
2080 xorps @tweak[1],$inout1
2081
214368ff 2082 call _aesni_encrypt2
f8501464
AP
2083
2084 xorps @tweak[0],$inout0
2085 movdqa @tweak[2],@tweak[0]
2086 xorps @tweak[1],$inout1
23f6eec7 2087 movups $inout0,($out) # store 2 output blocks
f8501464 2088 movups $inout1,16*1($out)
23f6eec7 2089 lea 16*2($out),$out # $out+=2*16
f8501464
AP
2090 jmp .Lxts_enc_done
2091
2092.align 16
2093.Lxts_enc_three:
2094 movups ($inp),$inout0
2095 movups 16*1($inp),$inout1
2096 movups 16*2($inp),$inout2
23f6eec7 2097 lea 16*3($inp),$inp # $inp+=3*16
f8501464
AP
2098 xorps @tweak[0],$inout0
2099 xorps @tweak[1],$inout1
2100 xorps @tweak[2],$inout2
2101
2102 call _aesni_encrypt3
2103
2104 xorps @tweak[0],$inout0
2105 movdqa @tweak[3],@tweak[0]
2106 xorps @tweak[1],$inout1
2107 xorps @tweak[2],$inout2
23f6eec7 2108 movups $inout0,($out) # store 3 output blocks
f8501464
AP
2109 movups $inout1,16*1($out)
2110 movups $inout2,16*2($out)
23f6eec7 2111 lea 16*3($out),$out # $out+=3*16
f8501464
AP
2112 jmp .Lxts_enc_done
2113
2114.align 16
2115.Lxts_enc_four:
2116 movups ($inp),$inout0
2117 movups 16*1($inp),$inout1
2118 movups 16*2($inp),$inout2
2119 xorps @tweak[0],$inout0
2120 movups 16*3($inp),$inout3
23f6eec7 2121 lea 16*4($inp),$inp # $inp+=4*16
f8501464
AP
2122 xorps @tweak[1],$inout1
2123 xorps @tweak[2],$inout2
2124 xorps @tweak[3],$inout3
2125
2126 call _aesni_encrypt4
2127
36df342f
AP
2128 pxor @tweak[0],$inout0
2129 movdqa @tweak[4],@tweak[0]
2130 pxor @tweak[1],$inout1
2131 pxor @tweak[2],$inout2
23f6eec7 2132 movdqu $inout0,($out) # store 4 output blocks
36df342f
AP
2133 pxor @tweak[3],$inout3
2134 movdqu $inout1,16*1($out)
2135 movdqu $inout2,16*2($out)
2136 movdqu $inout3,16*3($out)
23f6eec7 2137 lea 16*4($out),$out # $out+=4*16
f8501464
AP
2138 jmp .Lxts_enc_done
2139
2140.align 16
2141.Lxts_enc_done:
23f6eec7 2142 and \$15,$len_ # see if $len%16 is 0
f8501464
AP
2143 jz .Lxts_enc_ret
2144 mov $len_,$len
2145
2146.Lxts_enc_steal:
2147 movzb ($inp),%eax # borrow $rounds ...
2148 movzb -16($out),%ecx # ... and $key
2149 lea 1($inp),$inp
2150 mov %al,-16($out)
2151 mov %cl,0($out)
2152 lea 1($out),$out
2153 sub \$1,$len
2154 jnz .Lxts_enc_steal
2155
2156 sub $len_,$out # rewind $out
2157 mov $key_,$key # restore $key
2158 mov $rnds_,$rounds # restore $rounds
2159
2160 movups -16($out),$inout0
2161 xorps @tweak[0],$inout0
2162___
2163 &aesni_generate1("enc",$key,$rounds);
2164$code.=<<___;
2165 xorps @tweak[0],$inout0
2166 movups $inout0,-16($out)
2167
2168.Lxts_enc_ret:
23f6eec7
AP
2169 xorps %xmm0,%xmm0 # clear register bank
2170 pxor %xmm1,%xmm1
2171 pxor %xmm2,%xmm2
2172 pxor %xmm3,%xmm3
2173 pxor %xmm4,%xmm4
2174 pxor %xmm5,%xmm5
2175___
2176$code.=<<___ if (!$win64);
2177 pxor %xmm6,%xmm6
2178 pxor %xmm7,%xmm7
2179 movaps %xmm0,0x00(%rsp) # clear stack
2180 pxor %xmm8,%xmm8
2181 movaps %xmm0,0x10(%rsp)
2182 pxor %xmm9,%xmm9
2183 movaps %xmm0,0x20(%rsp)
2184 pxor %xmm10,%xmm10
2185 movaps %xmm0,0x30(%rsp)
2186 pxor %xmm11,%xmm11
2187 movaps %xmm0,0x40(%rsp)
2188 pxor %xmm12,%xmm12
2189 movaps %xmm0,0x50(%rsp)
2190 pxor %xmm13,%xmm13
2191 movaps %xmm0,0x60(%rsp)
2192 pxor %xmm14,%xmm14
2193 pxor %xmm15,%xmm15
f8501464
AP
2194___
2195$code.=<<___ if ($win64);
384e6de4
AP
2196 movaps -0xa8(%r11),%xmm6
2197 movaps %xmm0,-0xa8(%r11) # clear stack
2198 movaps -0x98(%r11),%xmm7
2199 movaps %xmm0,-0x98(%r11)
2200 movaps -0x88(%r11),%xmm8
2201 movaps %xmm0,-0x88(%r11)
2202 movaps -0x78(%r11),%xmm9
2203 movaps %xmm0,-0x78(%r11)
2204 movaps -0x68(%r11),%xmm10
2205 movaps %xmm0,-0x68(%r11)
2206 movaps -0x58(%r11),%xmm11
2207 movaps %xmm0,-0x58(%r11)
2208 movaps -0x48(%r11),%xmm12
2209 movaps %xmm0,-0x48(%r11)
2210 movaps -0x38(%r11),%xmm13
2211 movaps %xmm0,-0x38(%r11)
2212 movaps -0x28(%r11),%xmm14
2213 movaps %xmm0,-0x28(%r11)
2214 movaps -0x18(%r11),%xmm15
2215 movaps %xmm0,-0x18(%r11)
23f6eec7
AP
2216 movaps %xmm0,0x00(%rsp)
2217 movaps %xmm0,0x10(%rsp)
2218 movaps %xmm0,0x20(%rsp)
2219 movaps %xmm0,0x30(%rsp)
2220 movaps %xmm0,0x40(%rsp)
2221 movaps %xmm0,0x50(%rsp)
2222 movaps %xmm0,0x60(%rsp)
f8501464
AP
2223___
2224$code.=<<___;
384e6de4 2225 mov -8(%r11),%rbp
b84460ad 2226.cfi_restore %rbp
384e6de4 2227 lea (%r11),%rsp
b84460ad 2228.cfi_def_cfa_register %rsp
f8501464
AP
2229.Lxts_enc_epilogue:
2230 ret
b84460ad 2231.cfi_endproc
f8501464 2232.size aesni_xts_encrypt,.-aesni_xts_encrypt
d7d119a3 2233___
6c83629b
AP
2234
2235$code.=<<___;
f8501464
AP
2236.globl aesni_xts_decrypt
2237.type aesni_xts_decrypt,\@function,6
6c83629b 2238.align 16
f8501464 2239aesni_xts_decrypt:
b84460ad 2240.cfi_startproc
384e6de4 2241 lea (%rsp),%r11 # frame pointer
b84460ad 2242.cfi_def_cfa_register %r11
6a40ebe8 2243 push %rbp
b84460ad 2244.cfi_push %rbp
6a40ebe8
AP
2245 sub \$$frame_size,%rsp
2246 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
6c83629b
AP
2247___
2248$code.=<<___ if ($win64);
384e6de4
AP
2249 movaps %xmm6,-0xa8(%r11) # offload everything
2250 movaps %xmm7,-0x98(%r11)
2251 movaps %xmm8,-0x88(%r11)
2252 movaps %xmm9,-0x78(%r11)
2253 movaps %xmm10,-0x68(%r11)
2254 movaps %xmm11,-0x58(%r11)
2255 movaps %xmm12,-0x48(%r11)
2256 movaps %xmm13,-0x38(%r11)
2257 movaps %xmm14,-0x28(%r11)
2258 movaps %xmm15,-0x18(%r11)
f8501464 2259.Lxts_dec_body:
6c83629b
AP
2260___
2261$code.=<<___;
d8ba0dc9 2262 movups ($ivp),$inout0 # load clear-text tweak
f8501464
AP
2263 mov 240($key2),$rounds # key2->rounds
2264 mov 240($key),$rnds_ # key1->rounds
2265___
2266 # generate the tweak
d8ba0dc9 2267 &aesni_generate1("enc",$key2,$rounds,$inout0);
f8501464
AP
2268$code.=<<___;
2269 xor %eax,%eax # if ($len%16) len-=16;
2270 test \$15,$len
2271 setnz %al
2272 shl \$4,%rax
2273 sub %rax,$len
2274
36df342f 2275 $movkey ($key),$rndkey0 # zero round key
f8501464
AP
2276 mov $key,$key_ # backup $key
2277 mov $rnds_,$rounds # backup $rounds
36df342f 2278 shl \$4,$rnds_
f8501464
AP
2279 mov $len,$len_ # backup $len
2280 and \$-16,$len
6c83629b 2281
36df342f 2282 $movkey 16($key,$rnds_),$rndkey1 # last round key
36df342f 2283
f8501464 2284 movdqa .Lxts_magic(%rip),$twmask
d8ba0dc9
AP
2285 movdqa $inout0,@tweak[5]
2286 pshufd \$0x5f,$inout0,$twres
36df342f 2287 pxor $rndkey0,$rndkey1
f8501464
AP
2288___
2289 for ($i=0;$i<4;$i++) {
2290 $code.=<<___;
36df342f
AP
2291 movdqa $twres,$twtmp
2292 paddd $twres,$twres
f8501464 2293 movdqa @tweak[5],@tweak[$i]
36df342f
AP
2294 psrad \$31,$twtmp # broadcast upper bits
2295 paddq @tweak[5],@tweak[5]
2296 pand $twmask,$twtmp
2297 pxor $rndkey0,@tweak[$i]
2298 pxor $twtmp,@tweak[5]
f8501464
AP
2299___
2300 }
2301$code.=<<___;
36df342f
AP
2302 movdqa @tweak[5],@tweak[4]
2303 psrad \$31,$twres
2304 paddq @tweak[5],@tweak[5]
2305 pand $twmask,$twres
2306 pxor $rndkey0,@tweak[4]
2307 pxor $twres,@tweak[5]
2308 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2309
f8501464 2310 sub \$16*6,$len
23f6eec7 2311 jc .Lxts_dec_short # if $len-=6*16 borrowed
6c83629b 2312
d8ba0dc9
AP
2313 mov \$16+96,$rounds
2314 lea 32($key_,$rnds_),$key # end of key schedule
2315 sub %r10,%rax # twisted $rounds
36df342f 2316 $movkey 16($key_),$rndkey1
d8ba0dc9 2317 mov %rax,%r10 # backup twisted $rounds
36df342f 2318 lea .Lxts_magic(%rip),%r8
f8501464 2319 jmp .Lxts_dec_grandloop
6c83629b 2320
36df342f 2321.align 32
f8501464 2322.Lxts_dec_grandloop:
f8501464 2323 movdqu `16*0`($inp),$inout0 # load input
36df342f 2324 movdqa $rndkey0,$twmask
f8501464 2325 movdqu `16*1`($inp),$inout1
23f6eec7 2326 pxor @tweak[0],$inout0 # input^=tweak^round[0]
f8501464 2327 movdqu `16*2`($inp),$inout2
f8501464 2328 pxor @tweak[1],$inout1
36df342f
AP
2329 aesdec $rndkey1,$inout0
2330 movdqu `16*3`($inp),$inout3
f8501464 2331 pxor @tweak[2],$inout2
36df342f
AP
2332 aesdec $rndkey1,$inout1
2333 movdqu `16*4`($inp),$inout4
f8501464 2334 pxor @tweak[3],$inout3
36df342f
AP
2335 aesdec $rndkey1,$inout2
2336 movdqu `16*5`($inp),$inout5
2337 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2338 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
f8501464 2339 pxor @tweak[4],$inout4
36df342f
AP
2340 aesdec $rndkey1,$inout3
2341 $movkey 32($key_),$rndkey0
2342 lea `16*6`($inp),$inp
2343 pxor $twmask,$inout5
f8501464 2344
46f4e1be 2345 pxor $twres,@tweak[0] # calculate tweaks^round[last]
f8501464 2346 aesdec $rndkey1,$inout4
36df342f
AP
2347 pxor $twres,@tweak[1]
2348 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
f8501464 2349 aesdec $rndkey1,$inout5
36df342f 2350 $movkey 48($key_),$rndkey1
d8ba0dc9 2351 pxor $twres,@tweak[2]
6c83629b 2352
36df342f 2353 aesdec $rndkey0,$inout0
d8ba0dc9 2354 pxor $twres,@tweak[3]
36df342f
AP
2355 movdqa @tweak[1],`16*1`(%rsp)
2356 aesdec $rndkey0,$inout1
d8ba0dc9 2357 pxor $twres,@tweak[4]
36df342f
AP
2358 movdqa @tweak[2],`16*2`(%rsp)
2359 aesdec $rndkey0,$inout2
36df342f
AP
2360 aesdec $rndkey0,$inout3
2361 pxor $twres,$twmask
2362 movdqa @tweak[4],`16*4`(%rsp)
2363 aesdec $rndkey0,$inout4
36df342f
AP
2364 aesdec $rndkey0,$inout5
2365 $movkey 64($key_),$rndkey0
d8ba0dc9 2366 movdqa $twmask,`16*5`(%rsp)
36df342f
AP
2367 pshufd \$0x5f,@tweak[5],$twres
2368 jmp .Lxts_dec_loop6
2369.align 32
f8501464
AP
2370.Lxts_dec_loop6:
2371 aesdec $rndkey1,$inout0
2372 aesdec $rndkey1,$inout1
f8501464
AP
2373 aesdec $rndkey1,$inout2
2374 aesdec $rndkey1,$inout3
2375 aesdec $rndkey1,$inout4
2376 aesdec $rndkey1,$inout5
d8ba0dc9
AP
2377 $movkey -64($key,%rax),$rndkey1
2378 add \$32,%rax
36df342f 2379
f8501464
AP
2380 aesdec $rndkey0,$inout0
2381 aesdec $rndkey0,$inout1
f8501464
AP
2382 aesdec $rndkey0,$inout2
2383 aesdec $rndkey0,$inout3
2384 aesdec $rndkey0,$inout4
2385 aesdec $rndkey0,$inout5
d8ba0dc9 2386 $movkey -80($key,%rax),$rndkey0
f8501464
AP
2387 jnz .Lxts_dec_loop6
2388
23f6eec7 2389 movdqa (%r8),$twmask # start calculating next tweak
36df342f
AP
2390 movdqa $twres,$twtmp
2391 paddd $twres,$twres
f8501464 2392 aesdec $rndkey1,$inout0
36df342f
AP
2393 paddq @tweak[5],@tweak[5]
2394 psrad \$31,$twtmp
f8501464 2395 aesdec $rndkey1,$inout1
36df342f
AP
2396 pand $twmask,$twtmp
2397 $movkey ($key_),@tweak[0] # load round[0]
f8501464 2398 aesdec $rndkey1,$inout2
f8501464
AP
2399 aesdec $rndkey1,$inout3
2400 aesdec $rndkey1,$inout4
d8ba0dc9 2401 pxor $twtmp,@tweak[5]
36df342f 2402 movaps @tweak[0],@tweak[1] # copy round[0]
f8501464 2403 aesdec $rndkey1,$inout5
d8ba0dc9 2404 $movkey -64($key),$rndkey1
f8501464 2405
36df342f 2406 movdqa $twres,$twtmp
f8501464 2407 aesdec $rndkey0,$inout0
d8ba0dc9 2408 paddd $twres,$twres
36df342f 2409 pxor @tweak[5],@tweak[0]
f8501464 2410 aesdec $rndkey0,$inout1
d8ba0dc9 2411 psrad \$31,$twtmp
36df342f 2412 paddq @tweak[5],@tweak[5]
f8501464 2413 aesdec $rndkey0,$inout2
f8501464 2414 aesdec $rndkey0,$inout3
d8ba0dc9 2415 pand $twmask,$twtmp
36df342f 2416 movaps @tweak[1],@tweak[2]
d8ba0dc9
AP
2417 aesdec $rndkey0,$inout4
2418 pxor $twtmp,@tweak[5]
2419 movdqa $twres,$twtmp
f8501464 2420 aesdec $rndkey0,$inout5
d8ba0dc9 2421 $movkey -48($key),$rndkey0
f8501464 2422
36df342f 2423 paddd $twres,$twres
f8501464 2424 aesdec $rndkey1,$inout0
36df342f
AP
2425 pxor @tweak[5],@tweak[1]
2426 psrad \$31,$twtmp
f8501464 2427 aesdec $rndkey1,$inout1
36df342f
AP
2428 paddq @tweak[5],@tweak[5]
2429 pand $twmask,$twtmp
f8501464 2430 aesdec $rndkey1,$inout2
f8501464 2431 aesdec $rndkey1,$inout3
d8ba0dc9 2432 movdqa @tweak[3],`16*3`(%rsp)
36df342f 2433 pxor $twtmp,@tweak[5]
f8501464 2434 aesdec $rndkey1,$inout4
36df342f 2435 movaps @tweak[2],@tweak[3]
d8ba0dc9 2436 movdqa $twres,$twtmp
f8501464 2437 aesdec $rndkey1,$inout5
d8ba0dc9 2438 $movkey -32($key),$rndkey1
f8501464 2439
36df342f
AP
2440 paddd $twres,$twres
2441 aesdec $rndkey0,$inout0
2442 pxor @tweak[5],@tweak[2]
2443 psrad \$31,$twtmp
2444 aesdec $rndkey0,$inout1
2445 paddq @tweak[5],@tweak[5]
2446 pand $twmask,$twtmp
2447 aesdec $rndkey0,$inout2
2448 aesdec $rndkey0,$inout3
36df342f 2449 aesdec $rndkey0,$inout4
d8ba0dc9 2450 pxor $twtmp,@tweak[5]
36df342f
AP
2451 movaps @tweak[3],@tweak[4]
2452 aesdec $rndkey0,$inout5
2453
2454 movdqa $twres,$rndkey0
2455 paddd $twres,$twres
2456 aesdec $rndkey1,$inout0
2457 pxor @tweak[5],@tweak[3]
2458 psrad \$31,$rndkey0
2459 aesdec $rndkey1,$inout1
2460 paddq @tweak[5],@tweak[5]
2461 pand $twmask,$rndkey0
2462 aesdec $rndkey1,$inout2
2463 aesdec $rndkey1,$inout3
2464 pxor $rndkey0,@tweak[5]
2465 $movkey ($key_),$rndkey0
2466 aesdec $rndkey1,$inout4
2467 aesdec $rndkey1,$inout5
2468 $movkey 16($key_),$rndkey1
2469
2470 pxor @tweak[5],@tweak[4]
36df342f 2471 aesdeclast `16*0`(%rsp),$inout0
d8ba0dc9 2472 psrad \$31,$twres
36df342f 2473 paddq @tweak[5],@tweak[5]
36df342f
AP
2474 aesdeclast `16*1`(%rsp),$inout1
2475 aesdeclast `16*2`(%rsp),$inout2
d8ba0dc9
AP
2476 pand $twmask,$twres
2477 mov %r10,%rax # restore $rounds
36df342f
AP
2478 aesdeclast `16*3`(%rsp),$inout3
2479 aesdeclast `16*4`(%rsp),$inout4
2480 aesdeclast `16*5`(%rsp),$inout5
d8ba0dc9 2481 pxor $twres,@tweak[5]
f8501464 2482
23f6eec7
AP
2483 lea `16*6`($out),$out # $out+=6*16
2484 movups $inout0,`-16*6`($out) # store 6 output blocks
36df342f
AP
2485 movups $inout1,`-16*5`($out)
2486 movups $inout2,`-16*4`($out)
2487 movups $inout3,`-16*3`($out)
2488 movups $inout4,`-16*2`($out)
2489 movups $inout5,`-16*1`($out)
f8501464 2490 sub \$16*6,$len
23f6eec7 2491 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
f8501464 2492
d8ba0dc9
AP
2493 mov \$16+96,$rounds
2494 sub $rnds_,$rounds
f8501464 2495 mov $key_,$key # restore $key
d8ba0dc9 2496 shr \$4,$rounds # restore original value
f8501464
AP
2497
2498.Lxts_dec_short:
23f6eec7 2499 # at this point @tweak[0..5] are populated with tweak values
d8ba0dc9 2500 mov $rounds,$rnds_ # backup $rounds
36df342f
AP
2501 pxor $rndkey0,@tweak[0]
2502 pxor $rndkey0,@tweak[1]
23f6eec7
AP
2503 add \$16*6,$len # restore real remaining $len
2504 jz .Lxts_dec_done # done if ($len==0)
d7d119a3 2505
36df342f 2506 pxor $rndkey0,@tweak[2]
f8501464 2507 cmp \$0x20,$len
23f6eec7 2508 jb .Lxts_dec_one # $len is 1*16
36df342f 2509 pxor $rndkey0,@tweak[3]
23f6eec7 2510 je .Lxts_dec_two # $len is 2*16
d7d119a3 2511
36df342f 2512 pxor $rndkey0,@tweak[4]
f8501464 2513 cmp \$0x40,$len
23f6eec7
AP
2514 jb .Lxts_dec_three # $len is 3*16
2515 je .Lxts_dec_four # $len is 4*16
f8501464 2516
23f6eec7 2517 movdqu ($inp),$inout0 # $len is 5*16
36df342f 2518 movdqu 16*1($inp),$inout1
f8501464
AP
2519 movdqu 16*2($inp),$inout2
2520 pxor @tweak[0],$inout0
2521 movdqu 16*3($inp),$inout3
2522 pxor @tweak[1],$inout1
2523 movdqu 16*4($inp),$inout4
23f6eec7 2524 lea 16*5($inp),$inp # $inp+=5*16
f8501464
AP
2525 pxor @tweak[2],$inout2
2526 pxor @tweak[3],$inout3
2527 pxor @tweak[4],$inout4
2528
2529 call _aesni_decrypt6
2530
2531 xorps @tweak[0],$inout0
2532 xorps @tweak[1],$inout1
2533 xorps @tweak[2],$inout2
23f6eec7 2534 movdqu $inout0,($out) # store 5 output blocks
f8501464
AP
2535 xorps @tweak[3],$inout3
2536 movdqu $inout1,16*1($out)
2537 xorps @tweak[4],$inout4
2538 movdqu $inout2,16*2($out)
2539 pxor $twtmp,$twtmp
2540 movdqu $inout3,16*3($out)
2541 pcmpgtd @tweak[5],$twtmp
2542 movdqu $inout4,16*4($out)
23f6eec7 2543 lea 16*5($out),$out # $out+=5*16
f8501464
AP
2544 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2545 and \$15,$len_
2546 jz .Lxts_dec_ret
2547
2548 movdqa @tweak[5],@tweak[0]
2549 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2550 pand $twmask,@tweak[1] # isolate carry and residue
2551 pxor @tweak[5],@tweak[1]
2552 jmp .Lxts_dec_done2
d7d119a3 2553
f8501464
AP
2554.align 16
2555.Lxts_dec_one:
2556 movups ($inp),$inout0
23f6eec7 2557 lea 16*1($inp),$inp # $inp+=1*16
f8501464
AP
2558 xorps @tweak[0],$inout0
2559___
2560 &aesni_generate1("dec",$key,$rounds);
2561$code.=<<___;
2562 xorps @tweak[0],$inout0
2563 movdqa @tweak[1],@tweak[0]
23f6eec7 2564 movups $inout0,($out) # store one output block
f8501464 2565 movdqa @tweak[2],@tweak[1]
23f6eec7 2566 lea 16*1($out),$out # $out+=1*16
f8501464 2567 jmp .Lxts_dec_done
6c83629b 2568
f8501464
AP
2569.align 16
2570.Lxts_dec_two:
2571 movups ($inp),$inout0
2572 movups 16($inp),$inout1
23f6eec7 2573 lea 32($inp),$inp # $inp+=2*16
f8501464
AP
2574 xorps @tweak[0],$inout0
2575 xorps @tweak[1],$inout1
6c83629b 2576
214368ff 2577 call _aesni_decrypt2
6c83629b 2578
f8501464
AP
2579 xorps @tweak[0],$inout0
2580 movdqa @tweak[2],@tweak[0]
2581 xorps @tweak[1],$inout1
2582 movdqa @tweak[3],@tweak[1]
23f6eec7 2583 movups $inout0,($out) # store 2 output blocks
f8501464 2584 movups $inout1,16*1($out)
23f6eec7 2585 lea 16*2($out),$out # $out+=2*16
f8501464 2586 jmp .Lxts_dec_done
6c83629b 2587
f8501464
AP
2588.align 16
2589.Lxts_dec_three:
2590 movups ($inp),$inout0
2591 movups 16*1($inp),$inout1
2592 movups 16*2($inp),$inout2
23f6eec7 2593 lea 16*3($inp),$inp # $inp+=3*16
f8501464
AP
2594 xorps @tweak[0],$inout0
2595 xorps @tweak[1],$inout1
2596 xorps @tweak[2],$inout2
6c83629b 2597
f8501464 2598 call _aesni_decrypt3
6c83629b 2599
f8501464
AP
2600 xorps @tweak[0],$inout0
2601 movdqa @tweak[3],@tweak[0]
2602 xorps @tweak[1],$inout1
36df342f 2603 movdqa @tweak[4],@tweak[1]
f8501464 2604 xorps @tweak[2],$inout2
23f6eec7 2605 movups $inout0,($out) # store 3 output blocks
f8501464
AP
2606 movups $inout1,16*1($out)
2607 movups $inout2,16*2($out)
23f6eec7 2608 lea 16*3($out),$out # $out+=3*16
f8501464 2609 jmp .Lxts_dec_done
6c83629b
AP
2610
2611.align 16
f8501464 2612.Lxts_dec_four:
36df342f
AP
2613 movups ($inp),$inout0
2614 movups 16*1($inp),$inout1
f8501464
AP
2615 movups 16*2($inp),$inout2
2616 xorps @tweak[0],$inout0
2617 movups 16*3($inp),$inout3
23f6eec7 2618 lea 16*4($inp),$inp # $inp+=4*16
f8501464
AP
2619 xorps @tweak[1],$inout1
2620 xorps @tweak[2],$inout2
2621 xorps @tweak[3],$inout3
2622
2623 call _aesni_decrypt4
2624
36df342f 2625 pxor @tweak[0],$inout0
f8501464 2626 movdqa @tweak[4],@tweak[0]
36df342f 2627 pxor @tweak[1],$inout1
f8501464 2628 movdqa @tweak[5],@tweak[1]
36df342f 2629 pxor @tweak[2],$inout2
23f6eec7 2630 movdqu $inout0,($out) # store 4 output blocks
36df342f
AP
2631 pxor @tweak[3],$inout3
2632 movdqu $inout1,16*1($out)
2633 movdqu $inout2,16*2($out)
2634 movdqu $inout3,16*3($out)
23f6eec7 2635 lea 16*4($out),$out # $out+=4*16
f8501464 2636 jmp .Lxts_dec_done
6c83629b
AP
2637
2638.align 16
f8501464 2639.Lxts_dec_done:
23f6eec7 2640 and \$15,$len_ # see if $len%16 is 0
f8501464
AP
2641 jz .Lxts_dec_ret
2642.Lxts_dec_done2:
2643 mov $len_,$len
2644 mov $key_,$key # restore $key
2645 mov $rnds_,$rounds # restore $rounds
6c83629b 2646
f8501464
AP
2647 movups ($inp),$inout0
2648 xorps @tweak[1],$inout0
2649___
2650 &aesni_generate1("dec",$key,$rounds);
2651$code.=<<___;
2652 xorps @tweak[1],$inout0
2653 movups $inout0,($out)
2654
2655.Lxts_dec_steal:
2656 movzb 16($inp),%eax # borrow $rounds ...
2657 movzb ($out),%ecx # ... and $key
2658 lea 1($inp),$inp
2659 mov %al,($out)
2660 mov %cl,16($out)
2661 lea 1($out),$out
2662 sub \$1,$len
2663 jnz .Lxts_dec_steal
2664
2665 sub $len_,$out # rewind $out
2666 mov $key_,$key # restore $key
2667 mov $rnds_,$rounds # restore $rounds
2668
2669 movups ($out),$inout0
2670 xorps @tweak[0],$inout0
6c83629b 2671___
f8501464
AP
2672 &aesni_generate1("dec",$key,$rounds);
2673$code.=<<___;
2674 xorps @tweak[0],$inout0
2675 movups $inout0,($out)
6c83629b 2676
f8501464 2677.Lxts_dec_ret:
23f6eec7
AP
2678 xorps %xmm0,%xmm0 # clear register bank
2679 pxor %xmm1,%xmm1
2680 pxor %xmm2,%xmm2
2681 pxor %xmm3,%xmm3
2682 pxor %xmm4,%xmm4
2683 pxor %xmm5,%xmm5
2684___
2685$code.=<<___ if (!$win64);
2686 pxor %xmm6,%xmm6
2687 pxor %xmm7,%xmm7
2688 movaps %xmm0,0x00(%rsp) # clear stack
2689 pxor %xmm8,%xmm8
2690 movaps %xmm0,0x10(%rsp)
2691 pxor %xmm9,%xmm9
2692 movaps %xmm0,0x20(%rsp)
2693 pxor %xmm10,%xmm10
2694 movaps %xmm0,0x30(%rsp)
2695 pxor %xmm11,%xmm11
2696 movaps %xmm0,0x40(%rsp)
2697 pxor %xmm12,%xmm12
2698 movaps %xmm0,0x50(%rsp)
2699 pxor %xmm13,%xmm13
2700 movaps %xmm0,0x60(%rsp)
2701 pxor %xmm14,%xmm14
2702 pxor %xmm15,%xmm15
f8501464 2703___
6c83629b 2704$code.=<<___ if ($win64);
384e6de4
AP
2705 movaps -0xa8(%r11),%xmm6
2706 movaps %xmm0,-0xa8(%r11) # clear stack
2707 movaps -0x98(%r11),%xmm7
2708 movaps %xmm0,-0x98(%r11)
2709 movaps -0x88(%r11),%xmm8
2710 movaps %xmm0,-0x88(%r11)
2711 movaps -0x78(%r11),%xmm9
2712 movaps %xmm0,-0x78(%r11)
2713 movaps -0x68(%r11),%xmm10
2714 movaps %xmm0,-0x68(%r11)
2715 movaps -0x58(%r11),%xmm11
2716 movaps %xmm0,-0x58(%r11)
2717 movaps -0x48(%r11),%xmm12
2718 movaps %xmm0,-0x48(%r11)
2719 movaps -0x38(%r11),%xmm13
2720 movaps %xmm0,-0x38(%r11)
2721 movaps -0x28(%r11),%xmm14
2722 movaps %xmm0,-0x28(%r11)
2723 movaps -0x18(%r11),%xmm15
2724 movaps %xmm0,-0x18(%r11)
23f6eec7
AP
2725 movaps %xmm0,0x00(%rsp)
2726 movaps %xmm0,0x10(%rsp)
2727 movaps %xmm0,0x20(%rsp)
2728 movaps %xmm0,0x30(%rsp)
2729 movaps %xmm0,0x40(%rsp)
2730 movaps %xmm0,0x50(%rsp)
2731 movaps %xmm0,0x60(%rsp)
6c83629b
AP
2732___
2733$code.=<<___;
384e6de4 2734 mov -8(%r11),%rbp
b84460ad 2735.cfi_restore %rbp
384e6de4 2736 lea (%r11),%rsp
b84460ad 2737.cfi_def_cfa_register %rsp
f8501464 2738.Lxts_dec_epilogue:
6c83629b 2739 ret
b84460ad 2740.cfi_endproc
f8501464 2741.size aesni_xts_decrypt,.-aesni_xts_decrypt
6c83629b 2742___
bd30091c
AP
2743}
2744\f
2745######################################################################
2746# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2747# const AES_KEY *key, unsigned int start_block_num,
2748# unsigned char offset_i[16], const unsigned char L_[][16],
2749# unsigned char checksum[16]);
2750#
2751{
2752my @offset=map("%xmm$_",(10..15));
2753my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2754my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2755my ($L_p,$checksum_p) = ("%rbx","%rbp");
2756my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2757my $seventh_arg = $win64 ? 56 : 8;
2758my $blocks = $len;
2759
2760$code.=<<___;
# aesni_ocb_encrypt: OCB-encrypt $blocks whole 16-byte blocks from $inp to $out.
# Arguments 5 and 6 are $block_num and $offset_p; arguments 7 and 8 (the L_
# table and the checksum pointer) are fetched from the stack into $L_p and
# $checksum_p. On exit the updated checksum is stored through $checksum_p and
# the last offset_i through $offset_p.
# Internally the running offset is kept xored with round[last] and
# $rndkey0l holds round[0] ^ round[last], so the helpers can switch an offset
# between its "input whitening" and "last round key" forms with a single pxor.
2761.globl aesni_ocb_encrypt
2762.type aesni_ocb_encrypt,\@function,6
2763.align 32
2764aesni_ocb_encrypt:
b84460ad 2765.cfi_startproc
bd30091c
AP
# keep the entry stack pointer in %rax so the 7th/8th arguments can still be
# addressed after the pushes (and the Win64 frame allocation) below
2766 lea (%rsp),%rax
2767 push %rbx
b84460ad 2768.cfi_push %rbx
bd30091c 2769 push %rbp
b84460ad 2770.cfi_push %rbp
bd30091c 2771 push %r12
b84460ad 2772.cfi_push %r12
bd30091c 2773 push %r13
b84460ad 2774.cfi_push %r13
bd30091c 2775 push %r14
b84460ad 2776.cfi_push %r14
bd30091c
AP
2777___
2778$code.=<<___ if ($win64);
# Win64 only: allocate 0xa0 bytes and save %xmm6-%xmm15 there
2779 lea -0xa0(%rsp),%rsp
2780 movaps %xmm6,0x00(%rsp) # offload everything
2781 movaps %xmm7,0x10(%rsp)
2782 movaps %xmm8,0x20(%rsp)
2783 movaps %xmm9,0x30(%rsp)
2784 movaps %xmm10,0x40(%rsp)
2785 movaps %xmm11,0x50(%rsp)
2786 movaps %xmm12,0x60(%rsp)
2787 movaps %xmm13,0x70(%rsp)
2788 movaps %xmm14,0x80(%rsp)
2789 movaps %xmm15,0x90(%rsp)
2790.Locb_enc_body:
2791___
2792$code.=<<___;
2793 mov $seventh_arg(%rax),$L_p # 7th argument
2794 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2795
2796 mov 240($key),$rnds_
2797 mov $key,$key_
2798 shl \$4,$rnds_
2799 $movkey ($key),$rndkey0l # round[0]
2800 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2801
2802 movdqu ($offset_p),@offset[5] # load last offset_i
2803 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2804 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2805
# Build the negative loop index used by the helpers' round loops and point
# $key just past round[last] so that index can address the key schedule.
# NOTE(review): the sub/mov below name %rax and %r10 directly, which assumes
# $rounds and $rnds_ alias those registers -- confirm against their
# definitions earlier in the file.
2806 mov \$16+32,$rounds
2807 lea 32($key_,$rnds_),$key
2808 $movkey 16($key_),$rndkey1 # round[1]
2809 sub %r10,%rax # twisted $rounds
2810 mov %rax,%r10 # backup twisted $rounds
2811
2812 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2813 movdqu ($checksum_p),$checksum # load checksum
2814
2815 test \$1,$block_num # is first block number odd?
2816 jnz .Locb_enc_odd
2817
# even starting block number: handle that one block on its own so the
# six-block batches below always begin on an odd block number
2818 bsf $block_num,$i1
2819 add \$1,$block_num
2820 shl \$4,$i1
2821 movdqu ($L_p,$i1),$inout5 # borrow
2822 movdqu ($inp),$inout0
2823 lea 16($inp),$inp
2824
2825 call __ocb_encrypt1
2826
# the single-block helper leaves the new offset in the borrowed register
2827 movdqa $inout5,@offset[5]
2828 movups $inout0,($out)
2829 lea 16($out),$out
2830 sub \$1,$blocks
2831 jz .Locb_enc_done
2832
2833.Locb_enc_odd:
# precompute the L_ table offsets for the 2nd, 4th and 6th block of the batch
2834 lea 1($block_num),$i1 # even-numbered blocks
2835 lea 3($block_num),$i3
2836 lea 5($block_num),$i5
2837 lea 6($block_num),$block_num
2838 bsf $i1,$i1 # ntz(block)
2839 bsf $i3,$i3
2840 bsf $i5,$i5
2841 shl \$4,$i1 # ntz(block) -> table offset
2842 shl \$4,$i3
2843 shl \$4,$i5
2844
# bias the block count by -6; carry means fewer than six blocks remain
2845 sub \$6,$blocks
2846 jc .Locb_enc_short
2847 jmp .Locb_enc_grandloop
2848
2849.align 32
2850.Locb_enc_grandloop:
2851 movdqu `16*0`($inp),$inout0 # load input
2852 movdqu `16*1`($inp),$inout1
2853 movdqu `16*2`($inp),$inout2
2854 movdqu `16*3`($inp),$inout3
2855 movdqu `16*4`($inp),$inout4
2856 movdqu `16*5`($inp),$inout5
2857 lea `16*6`($inp),$inp
2858
2859 call __ocb_encrypt6
2860
2861 movups $inout0,`16*0`($out) # store output
2862 movups $inout1,`16*1`($out)
2863 movups $inout2,`16*2`($out)
2864 movups $inout3,`16*3`($out)
2865 movups $inout4,`16*4`($out)
2866 movups $inout5,`16*5`($out)
2867 lea `16*6`($out),$out
2868 sub \$6,$blocks
2869 jnc .Locb_enc_grandloop
2870
2871.Locb_enc_short:
# undo the -6 bias; zero means nothing is left, otherwise 1..5 blocks remain
2872 add \$6,$blocks
2873 jz .Locb_enc_done
2874
# dispatch on the remaining count; the input loads are interleaved with the
# compares, which is safe because movdqu does not modify the flags
2875 movdqu `16*0`($inp),$inout0
2876 cmp \$2,$blocks
2877 jb .Locb_enc_one
2878 movdqu `16*1`($inp),$inout1
2879 je .Locb_enc_two
2880
2881 movdqu `16*2`($inp),$inout2
2882 cmp \$4,$blocks
2883 jb .Locb_enc_three
2884 movdqu `16*3`($inp),$inout3
2885 je .Locb_enc_four
2886
# five blocks: reuse the six-block helper with an all-zero sixth block (which
# leaves the checksum unchanged) and discard its output
2887 movdqu `16*4`($inp),$inout4
2888 pxor $inout5,$inout5
2889
2890 call __ocb_encrypt6
2891
# the offset of the last real block becomes the running offset
2892 movdqa @offset[4],@offset[5]
2893 movups $inout0,`16*0`($out)
2894 movups $inout1,`16*1`($out)
2895 movups $inout2,`16*2`($out)
2896 movups $inout3,`16*3`($out)
2897 movups $inout4,`16*4`($out)
2898
2899 jmp .Locb_enc_done
2900
2901.align 16
2902.Locb_enc_one:
# one block on an odd block number, so its L_ value is L_0 held in @offset[0]
2903 movdqa @offset[0],$inout5 # borrow
2904
2905 call __ocb_encrypt1
2906
2907 movdqa $inout5,@offset[5]
2908 movups $inout0,`16*0`($out)
2909 jmp .Locb_enc_done
2910
2911.align 16
2912.Locb_enc_two:
# two blocks: pad the four-block helper with zero blocks
2913 pxor $inout2,$inout2
2914 pxor $inout3,$inout3
2915
2916 call __ocb_encrypt4
2917
2918 movdqa @offset[1],@offset[5]
2919 movups $inout0,`16*0`($out)
2920 movups $inout1,`16*1`($out)
2921
2922 jmp .Locb_enc_done
2923
2924.align 16
2925.Locb_enc_three:
# three blocks: pad the four-block helper with one zero block
2926 pxor $inout3,$inout3
2927
2928 call __ocb_encrypt4
2929
2930 movdqa @offset[2],@offset[5]
2931 movups $inout0,`16*0`($out)
2932 movups $inout1,`16*1`($out)
2933 movups $inout2,`16*2`($out)
2934
2935 jmp .Locb_enc_done
2936
2937.align 16
2938.Locb_enc_four:
2939 call __ocb_encrypt4
2940
2941 movdqa @offset[3],@offset[5]
2942 movups $inout0,`16*0`($out)
2943 movups $inout1,`16*1`($out)
2944 movups $inout2,`16*2`($out)
2945 movups $inout3,`16*3`($out)
2946
2947.Locb_enc_done:
# NOTE(review): $rndkey0 is expected to hold round[last] here, as left behind
# by the final key load of a helper's round loop. If no helper ran (zero
# blocks with an odd starting block number) it was never loaded in this
# function -- presumably callers never pass zero blocks; confirm.
2948 pxor $rndkey0,@offset[5] # "remove" round[last]
2949 movdqu $checksum,($checksum_p) # store checksum
2950 movdqu @offset[5],($offset_p) # store last offset_i
2951
2952 xorps %xmm0,%xmm0 # clear register bank
2953 pxor %xmm1,%xmm1
2954 pxor %xmm2,%xmm2
2955 pxor %xmm3,%xmm3
2956 pxor %xmm4,%xmm4
2957 pxor %xmm5,%xmm5
2958___
2959$code.=<<___ if (!$win64);
2960 pxor %xmm6,%xmm6
2961 pxor %xmm7,%xmm7
2962 pxor %xmm8,%xmm8
2963 pxor %xmm9,%xmm9
2964 pxor %xmm10,%xmm10
2965 pxor %xmm11,%xmm11
2966 pxor %xmm12,%xmm12
2967 pxor %xmm13,%xmm13
2968 pxor %xmm14,%xmm14
2969 pxor %xmm15,%xmm15
# no extra frame here: %rsp points at the five pushed registers (5*8 = 0x28)
384e6de4 2970 lea 0x28(%rsp),%rax
b84460ad 2971.cfi_def_cfa %rax,8
bd30091c
AP
2972___
2973$code.=<<___ if ($win64);
# Win64 only: restore %xmm6-%xmm15 and overwrite each save slot with the
# zeroed %xmm0 so no register contents are left on the stack
2974 movaps 0x00(%rsp),%xmm6
2975 movaps %xmm0,0x00(%rsp) # clear stack
2976 movaps 0x10(%rsp),%xmm7
2977 movaps %xmm0,0x10(%rsp)
2978 movaps 0x20(%rsp),%xmm8
2979 movaps %xmm0,0x20(%rsp)
2980 movaps 0x30(%rsp),%xmm9
2981 movaps %xmm0,0x30(%rsp)
2982 movaps 0x40(%rsp),%xmm10
2983 movaps %xmm0,0x40(%rsp)
2984 movaps 0x50(%rsp),%xmm11
2985 movaps %xmm0,0x50(%rsp)
2986 movaps 0x60(%rsp),%xmm12
2987 movaps %xmm0,0x60(%rsp)
2988 movaps 0x70(%rsp),%xmm13
2989 movaps %xmm0,0x70(%rsp)
2990 movaps 0x80(%rsp),%xmm14
2991 movaps %xmm0,0x80(%rsp)
2992 movaps 0x90(%rsp),%xmm15
2993 movaps %xmm0,0x90(%rsp)
2994 lea 0xa0+0x28(%rsp),%rax
2995.Locb_enc_pop:
bd30091c
AP
2996___
2997$code.=<<___;
# %rax again equals the entry stack pointer; reload the saved registers
# relative to it and then drop the whole frame in one step
384e6de4 2998 mov -40(%rax),%r14
b84460ad 2999.cfi_restore %r14
384e6de4 3000 mov -32(%rax),%r13
b84460ad 3001.cfi_restore %r13
384e6de4 3002 mov -24(%rax),%r12
b84460ad 3003.cfi_restore %r12
384e6de4 3004 mov -16(%rax),%rbp
b84460ad 3005.cfi_restore %rbp
384e6de4 3006 mov -8(%rax),%rbx
b84460ad 3007.cfi_restore %rbx
384e6de4 3008 lea (%rax),%rsp
b84460ad 3009.cfi_def_cfa_register %rsp
bd30091c
AP
3010.Locb_enc_epilogue:
3011 ret
b84460ad 3012.cfi_endproc
bd30091c
AP
3013.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
3014
# __ocb_encrypt6: OCB-encrypt the six blocks held in $inout0..$inout5.
# On entry: @offset[5] = previous offset_i ^ round[last], @offset[0] = L_0,
# $i1/$i3/$i5 = L_ table offsets for the 2nd, 4th and 6th block,
# $rndkey1 = round[1], %rax = twisted rounds (negative loop index).
# On exit: $inout0..$inout5 hold the ciphertext, @offset[1]..@offset[5] hold
# the new offsets ^ round[last], @offset[0] is reloaded with L_0, %rax and
# $rndkey1 are restored, and $i1/$i3/$i5/$block_num are advanced for the
# next six blocks. The plaintext is xored into $checksum.
3015.type __ocb_encrypt6,\@abi-omnipotent
3016.align 32
3017__ocb_encrypt6:
3018 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
# chain the six offsets: each one is the previous offset xored with L_0 (odd
# positions, copied from @offset[0]) or with the looked-up L_ entry
3019 movdqu ($L_p,$i1),@offset[1]
3020 movdqa @offset[0],@offset[2]
3021 movdqu ($L_p,$i3),@offset[3]
3022 movdqa @offset[0],@offset[4]
3023 pxor @offset[5],@offset[0]
3024 movdqu ($L_p,$i5),@offset[5]
3025 pxor @offset[0],@offset[1]
3026 pxor $inout0,$checksum # accumulate checksum
3027 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3028 pxor @offset[1],@offset[2]
3029 pxor $inout1,$checksum
3030 pxor @offset[1],$inout1
3031 pxor @offset[2],@offset[3]
3032 pxor $inout2,$checksum
3033 pxor @offset[2],$inout2
3034 pxor @offset[3],@offset[4]
3035 pxor $inout3,$checksum
3036 pxor @offset[3],$inout3
3037 pxor @offset[4],@offset[5]
3038 pxor $inout4,$checksum
3039 pxor @offset[4],$inout4
3040 pxor $inout5,$checksum
3041 pxor @offset[5],$inout5
3042 $movkey 32($key_),$rndkey0
3043
# block-number bookkeeping for the NEXT call, interleaved with the rounds
3044 lea 1($block_num),$i1 # even-numbered blocks
3045 lea 3($block_num),$i3
3046 lea 5($block_num),$i5
3047 add \$6,$block_num
3048 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3049 bsf $i1,$i1 # ntz(block)
3050 bsf $i3,$i3
3051 bsf $i5,$i5
3052
# round 1; the remaining offsets are converted to their ^ round[last] form
# in between so they can serve as last-round keys
3053 aesenc $rndkey1,$inout0
3054 aesenc $rndkey1,$inout1
3055 aesenc $rndkey1,$inout2
3056 aesenc $rndkey1,$inout3
3057 pxor $rndkey0l,@offset[1]
3058 pxor $rndkey0l,@offset[2]
3059 aesenc $rndkey1,$inout4
3060 pxor $rndkey0l,@offset[3]
3061 pxor $rndkey0l,@offset[4]
3062 aesenc $rndkey1,$inout5
3063 $movkey 48($key_),$rndkey1
3064 pxor $rndkey0l,@offset[5]
3065
3066 aesenc $rndkey0,$inout0
3067 aesenc $rndkey0,$inout1
3068 aesenc $rndkey0,$inout2
3069 aesenc $rndkey0,$inout3
3070 aesenc $rndkey0,$inout4
3071 aesenc $rndkey0,$inout5
3072 $movkey 64($key_),$rndkey0
3073 shl \$4,$i1 # ntz(block) -> table offset
3074 shl \$4,$i3
3075 jmp .Locb_enc_loop6
3076
3077.align 32
3078.Locb_enc_loop6:
# two rounds per iteration; %rax steps by 32 towards zero and the jnz below
# tests the result of that add (the instructions in between leave the flags
# untouched)
3079 aesenc $rndkey1,$inout0
3080 aesenc $rndkey1,$inout1
3081 aesenc $rndkey1,$inout2
3082 aesenc $rndkey1,$inout3
3083 aesenc $rndkey1,$inout4
3084 aesenc $rndkey1,$inout5
3085 $movkey ($key,%rax),$rndkey1
3086 add \$32,%rax
3087
3088 aesenc $rndkey0,$inout0
3089 aesenc $rndkey0,$inout1
3090 aesenc $rndkey0,$inout2
3091 aesenc $rndkey0,$inout3
3092 aesenc $rndkey0,$inout4
3093 aesenc $rndkey0,$inout5
3094 $movkey -16($key,%rax),$rndkey0
3095 jnz .Locb_enc_loop6
3096
3097 aesenc $rndkey1,$inout0
3098 aesenc $rndkey1,$inout1
3099 aesenc $rndkey1,$inout2
3100 aesenc $rndkey1,$inout3
3101 aesenc $rndkey1,$inout4
3102 aesenc $rndkey1,$inout5
3103 $movkey 16($key_),$rndkey1
3104 shl \$4,$i5
3105
# final round: each block uses its own offset ^ round[last] as the round key,
# which applies the last round key and the output offset xor in one step
3106 aesenclast @offset[0],$inout0
3107 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3108 mov %r10,%rax # restore twisted rounds
3109 aesenclast @offset[1],$inout1
3110 aesenclast @offset[2],$inout2
3111 aesenclast @offset[3],$inout3
3112 aesenclast @offset[4],$inout4
3113 aesenclast @offset[5],$inout5
3114 ret
3115.size __ocb_encrypt6,.-__ocb_encrypt6
3116
# __ocb_encrypt4: OCB-encrypt the four blocks held in $inout0..$inout3.
# Entry conditions match __ocb_encrypt6 (@offset[5] = previous offset_i ^
# round[last], @offset[0] = L_0, $i1/$i3 = L_ table offsets, $rndkey1 =
# round[1], %rax = twisted rounds).
# Unlike the six-block routine it does not advance $block_num or the table
# offsets and does not reload L_0 into @offset[0]; in this file it is only
# called for the 2..4 block tail, after which the caller copies the offset of
# the last real block from @offset[1..3] into @offset[5].
# The plaintext is xored into $checksum.
3117.type __ocb_encrypt4,\@abi-omnipotent
3118.align 32
3119__ocb_encrypt4:
3120 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
# chain four offsets: L_0, L_[ntz], L_0, L_[ntz] xored in turn
3121 movdqu ($L_p,$i1),@offset[1]
3122 movdqa @offset[0],@offset[2]
3123 movdqu ($L_p,$i3),@offset[3]
3124 pxor @offset[5],@offset[0]
3125 pxor @offset[0],@offset[1]
3126 pxor $inout0,$checksum # accumulate checksum
3127 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3128 pxor @offset[1],@offset[2]
3129 pxor $inout1,$checksum
3130 pxor @offset[1],$inout1
3131 pxor @offset[2],@offset[3]
3132 pxor $inout2,$checksum
3133 pxor @offset[2],$inout2
3134 pxor $inout3,$checksum
3135 pxor @offset[3],$inout3
3136 $movkey 32($key_),$rndkey0
3137
3138 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3139 pxor $rndkey0l,@offset[1]
3140 pxor $rndkey0l,@offset[2]
3141 pxor $rndkey0l,@offset[3]
3142
3143 aesenc $rndkey1,$inout0
3144 aesenc $rndkey1,$inout1
3145 aesenc $rndkey1,$inout2
3146 aesenc $rndkey1,$inout3
3147 $movkey 48($key_),$rndkey1
3148
3149 aesenc $rndkey0,$inout0
3150 aesenc $rndkey0,$inout1
3151 aesenc $rndkey0,$inout2
3152 aesenc $rndkey0,$inout3
3153 $movkey 64($key_),$rndkey0
3154 jmp .Locb_enc_loop4
3155
3156.align 32
3157.Locb_enc_loop4:
# two rounds per iteration; the jnz tests the result of the add on %rax
3158 aesenc $rndkey1,$inout0
3159 aesenc $rndkey1,$inout1
3160 aesenc $rndkey1,$inout2
3161 aesenc $rndkey1,$inout3
3162 $movkey ($key,%rax),$rndkey1
3163 add \$32,%rax
3164
3165 aesenc $rndkey0,$inout0
3166 aesenc $rndkey0,$inout1
3167 aesenc $rndkey0,$inout2
3168 aesenc $rndkey0,$inout3
3169 $movkey -16($key,%rax),$rndkey0
3170 jnz .Locb_enc_loop4
3171
3172 aesenc $rndkey1,$inout0
3173 aesenc $rndkey1,$inout1
3174 aesenc $rndkey1,$inout2
3175 aesenc $rndkey1,$inout3
3176 $movkey 16($key_),$rndkey1
3177 mov %r10,%rax # restore twisted rounds
3178
# final round keyed with offset_i ^ round[last] for each block
3179 aesenclast @offset[0],$inout0
3180 aesenclast @offset[1],$inout1
3181 aesenclast @offset[2],$inout2
3182 aesenclast @offset[3],$inout3
3183 ret
3184.size __ocb_encrypt4,.-__ocb_encrypt4
3185
# __ocb_encrypt1: OCB-encrypt the single block held in $inout0.
# On entry: $inout5 is a borrowed register holding the L_ value for this
# block, @offset[5] = previous offset_i ^ round[last], $rndkey1 = round[1],
# %rax = twisted rounds.
# On exit: $inout0 = ciphertext and $inout5 = new offset_i ^ round[last]; the
# caller copies $inout5 back into @offset[5]. The plaintext is xored into
# $checksum. %rax is restored from %r10.
3186.type __ocb_encrypt1,\@abi-omnipotent
3187.align 32
3188__ocb_encrypt1:
3189 pxor @offset[5],$inout5 # offset_i
3190 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3191 pxor $inout0,$checksum # accumulate checksum
3192 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3193 $movkey 32($key_),$rndkey0
3194
3195 aesenc $rndkey1,$inout0
3196 $movkey 48($key_),$rndkey1
3197 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3198
3199 aesenc $rndkey0,$inout0
3200 $movkey 64($key_),$rndkey0
3201 jmp .Locb_enc_loop1
3202
3203.align 32
3204.Locb_enc_loop1:
# two rounds per iteration; the jnz tests the result of the add on %rax
3205 aesenc $rndkey1,$inout0
3206 $movkey ($key,%rax),$rndkey1
3207 add \$32,%rax
3208
3209 aesenc $rndkey0,$inout0
3210 $movkey -16($key,%rax),$rndkey0
3211 jnz .Locb_enc_loop1
3212
3213 aesenc $rndkey1,$inout0
3214 $movkey 16($key_),$rndkey1 # redundant in tail
3215 mov %r10,%rax # restore twisted rounds
3216
# final round keyed with offset_i ^ round[last]
3217 aesenclast $inout5,$inout0
3218 ret
3219.size __ocb_encrypt1,.-__ocb_encrypt1
3220
# aesni_ocb_decrypt: OCB-decrypt $blocks whole 16-byte blocks from $inp to $out.
# Argument handling, offset bookkeeping and the frame layout mirror
# aesni_ocb_encrypt. The difference is the checksum: the decrypt helpers do
# not touch $checksum, so this function xors every decrypted block into it
# after storing the output.
3221.globl aesni_ocb_decrypt
3222.type aesni_ocb_decrypt,\@function,6
3223.align 32
3224aesni_ocb_decrypt:
b84460ad 3225.cfi_startproc
bd30091c
AP
# keep the entry stack pointer in %rax so the 7th/8th arguments can still be
# addressed after the pushes (and the Win64 frame allocation) below
3226 lea (%rsp),%rax
3227 push %rbx
b84460ad 3228.cfi_push %rbx
bd30091c 3229 push %rbp
b84460ad 3230.cfi_push %rbp
bd30091c 3231 push %r12
b84460ad 3232.cfi_push %r12
bd30091c 3233 push %r13
b84460ad 3234.cfi_push %r13
bd30091c 3235 push %r14
b84460ad 3236.cfi_push %r14
bd30091c
AP
3237___
3238$code.=<<___ if ($win64);
# Win64 only: allocate 0xa0 bytes and save %xmm6-%xmm15 there
3239 lea -0xa0(%rsp),%rsp
3240 movaps %xmm6,0x00(%rsp) # offload everything
3241 movaps %xmm7,0x10(%rsp)
3242 movaps %xmm8,0x20(%rsp)
3243 movaps %xmm9,0x30(%rsp)
3244 movaps %xmm10,0x40(%rsp)
3245 movaps %xmm11,0x50(%rsp)
3246 movaps %xmm12,0x60(%rsp)
3247 movaps %xmm13,0x70(%rsp)
3248 movaps %xmm14,0x80(%rsp)
3249 movaps %xmm15,0x90(%rsp)
3250.Locb_dec_body:
3251___
3252$code.=<<___;
3253 mov $seventh_arg(%rax),$L_p # 7th argument
3254 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3255
3256 mov 240($key),$rnds_
3257 mov $key,$key_
3258 shl \$4,$rnds_
3259 $movkey ($key),$rndkey0l # round[0]
3260 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3261
3262 movdqu ($offset_p),@offset[5] # load last offset_i
3263 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3264 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3265
# Build the negative loop index used by the helpers' round loops and point
# $key just past round[last] so that index can address the key schedule.
# NOTE(review): the sub/mov below name %rax and %r10 directly, which assumes
# $rounds and $rnds_ alias those registers -- confirm against their
# definitions earlier in the file.
3266 mov \$16+32,$rounds
3267 lea 32($key_,$rnds_),$key
3268 $movkey 16($key_),$rndkey1 # round[1]
3269 sub %r10,%rax # twisted $rounds
3270 mov %rax,%r10 # backup twisted $rounds
3271
3272 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3273 movdqu ($checksum_p),$checksum # load checksum
3274
3275 test \$1,$block_num # is first block number odd?
3276 jnz .Locb_dec_odd
3277
# even starting block number: handle that one block on its own so the
# six-block batches below always begin on an odd block number
3278 bsf $block_num,$i1
3279 add \$1,$block_num
3280 shl \$4,$i1
3281 movdqu ($L_p,$i1),$inout5 # borrow
3282 movdqu ($inp),$inout0
3283 lea 16($inp),$inp
3284
3285 call __ocb_decrypt1
3286
# the single-block helper leaves the new offset in the borrowed register
3287 movdqa $inout5,@offset[5]
3288 movups $inout0,($out)
3289 xorps $inout0,$checksum # accumulate checksum
3290 lea 16($out),$out
3291 sub \$1,$blocks
3292 jz .Locb_dec_done
3293
3294.Locb_dec_odd:
# precompute the L_ table offsets for the 2nd, 4th and 6th block of the batch
3295 lea 1($block_num),$i1 # even-numbered blocks
3296 lea 3($block_num),$i3
3297 lea 5($block_num),$i5
3298 lea 6($block_num),$block_num
3299 bsf $i1,$i1 # ntz(block)
3300 bsf $i3,$i3
3301 bsf $i5,$i5
3302 shl \$4,$i1 # ntz(block) -> table offset
3303 shl \$4,$i3
3304 shl \$4,$i5
3305
# bias the block count by -6; carry means fewer than six blocks remain
3306 sub \$6,$blocks
3307 jc .Locb_dec_short
3308 jmp .Locb_dec_grandloop
3309
3310.align 32
3311.Locb_dec_grandloop:
3312 movdqu `16*0`($inp),$inout0 # load input
3313 movdqu `16*1`($inp),$inout1
3314 movdqu `16*2`($inp),$inout2
3315 movdqu `16*3`($inp),$inout3
3316 movdqu `16*4`($inp),$inout4
3317 movdqu `16*5`($inp),$inout5
3318 lea `16*6`($inp),$inp
3319
3320 call __ocb_decrypt6
3321
3322 movups $inout0,`16*0`($out) # store output
3323 pxor $inout0,$checksum # accumulate checksum
3324 movups $inout1,`16*1`($out)
3325 pxor $inout1,$checksum
3326 movups $inout2,`16*2`($out)
3327 pxor $inout2,$checksum
3328 movups $inout3,`16*3`($out)
3329 pxor $inout3,$checksum
3330 movups $inout4,`16*4`($out)
3331 pxor $inout4,$checksum
3332 movups $inout5,`16*5`($out)
3333 pxor $inout5,$checksum
3334 lea `16*6`($out),$out
3335 sub \$6,$blocks
3336 jnc .Locb_dec_grandloop
3337
3338.Locb_dec_short:
# undo the -6 bias; zero means nothing is left, otherwise 1..5 blocks remain
3339 add \$6,$blocks
3340 jz .Locb_dec_done
3341
# dispatch on the remaining count; the input loads are interleaved with the
# compares, which is safe because movdqu does not modify the flags
3342 movdqu `16*0`($inp),$inout0
3343 cmp \$2,$blocks
3344 jb .Locb_dec_one
3345 movdqu `16*1`($inp),$inout1
3346 je .Locb_dec_two
3347
3348 movdqu `16*2`($inp),$inout2
3349 cmp \$4,$blocks
3350 jb .Locb_dec_three
3351 movdqu `16*3`($inp),$inout3
3352 je .Locb_dec_four
3353
# five blocks: reuse the six-block helper with a zero dummy sixth block whose
# result is neither stored nor added to the checksum
3354 movdqu `16*4`($inp),$inout4
3355 pxor $inout5,$inout5
3356
3357 call __ocb_decrypt6
3358
# the offset of the last real block becomes the running offset
3359 movdqa @offset[4],@offset[5]
3360 movups $inout0,`16*0`($out) # store output
3361 pxor $inout0,$checksum # accumulate checksum
3362 movups $inout1,`16*1`($out)
3363 pxor $inout1,$checksum
3364 movups $inout2,`16*2`($out)
3365 pxor $inout2,$checksum
3366 movups $inout3,`16*3`($out)
3367 pxor $inout3,$checksum
3368 movups $inout4,`16*4`($out)
3369 pxor $inout4,$checksum
3370
3371 jmp .Locb_dec_done
3372
3373.align 16
3374.Locb_dec_one:
# one block on an odd block number, so its L_ value is L_0 held in @offset[0]
3375 movdqa @offset[0],$inout5 # borrow
3376
3377 call __ocb_decrypt1
3378
3379 movdqa $inout5,@offset[5]
3380 movups $inout0,`16*0`($out) # store output
3381 xorps $inout0,$checksum # accumulate checksum
3382 jmp .Locb_dec_done
3383
3384.align 16
3385.Locb_dec_two:
# two blocks: pad the four-block helper with dummy zero blocks; only the two
# real results are stored and added to the checksum
3386 pxor $inout2,$inout2
3387 pxor $inout3,$inout3
3388
3389 call __ocb_decrypt4
3390
3391 movdqa @offset[1],@offset[5]
3392 movups $inout0,`16*0`($out) # store output
3393 xorps $inout0,$checksum # accumulate checksum
3394 movups $inout1,`16*1`($out)
3395 xorps $inout1,$checksum
3396
3397 jmp .Locb_dec_done
3398
3399.align 16
3400.Locb_dec_three:
# three blocks: pad the four-block helper with one dummy zero block
3401 pxor $inout3,$inout3
3402
3403 call __ocb_decrypt4
3404
3405 movdqa @offset[2],@offset[5]
3406 movups $inout0,`16*0`($out) # store output
3407 xorps $inout0,$checksum # accumulate checksum
3408 movups $inout1,`16*1`($out)
3409 xorps $inout1,$checksum
3410 movups $inout2,`16*2`($out)
3411 xorps $inout2,$checksum
3412
3413 jmp .Locb_dec_done
3414
3415.align 16
3416.Locb_dec_four:
3417 call __ocb_decrypt4
3418
3419 movdqa @offset[3],@offset[5]
3420 movups $inout0,`16*0`($out) # store output
3421 pxor $inout0,$checksum # accumulate checksum
3422 movups $inout1,`16*1`($out)
3423 pxor $inout1,$checksum
3424 movups $inout2,`16*2`($out)
3425 pxor $inout2,$checksum
3426 movups $inout3,`16*3`($out)
3427 pxor $inout3,$checksum
3428
3429.Locb_dec_done:
# NOTE(review): $rndkey0 is expected to hold round[last] here, as left behind
# by the final key load of a helper's round loop. If no helper ran (zero
# blocks with an odd starting block number) it was never loaded in this
# function -- presumably callers never pass zero blocks; confirm.
3430 pxor $rndkey0,@offset[5] # "remove" round[last]
3431 movdqu $checksum,($checksum_p) # store checksum
3432 movdqu @offset[5],($offset_p) # store last offset_i
3433
3434 xorps %xmm0,%xmm0 # clear register bank
3435 pxor %xmm1,%xmm1
3436 pxor %xmm2,%xmm2
3437 pxor %xmm3,%xmm3
3438 pxor %xmm4,%xmm4
3439 pxor %xmm5,%xmm5
3440___
3441$code.=<<___ if (!$win64);
3442 pxor %xmm6,%xmm6
3443 pxor %xmm7,%xmm7
3444 pxor %xmm8,%xmm8
3445 pxor %xmm9,%xmm9
3446 pxor %xmm10,%xmm10
3447 pxor %xmm11,%xmm11
3448 pxor %xmm12,%xmm12
3449 pxor %xmm13,%xmm13
3450 pxor %xmm14,%xmm14
3451 pxor %xmm15,%xmm15
# no extra frame here: %rsp points at the five pushed registers (5*8 = 0x28)
384e6de4 3452 lea 0x28(%rsp),%rax
b84460ad 3453.cfi_def_cfa %rax,8
bd30091c
AP
3454___
3455$code.=<<___ if ($win64);
# Win64 only: restore %xmm6-%xmm15 and overwrite each save slot with the
# zeroed %xmm0 so no register contents are left on the stack
3456 movaps 0x00(%rsp),%xmm6
3457 movaps %xmm0,0x00(%rsp) # clear stack
3458 movaps 0x10(%rsp),%xmm7
3459 movaps %xmm0,0x10(%rsp)
3460 movaps 0x20(%rsp),%xmm8
3461 movaps %xmm0,0x20(%rsp)
3462 movaps 0x30(%rsp),%xmm9
3463 movaps %xmm0,0x30(%rsp)
3464 movaps 0x40(%rsp),%xmm10
3465 movaps %xmm0,0x40(%rsp)
3466 movaps 0x50(%rsp),%xmm11
3467 movaps %xmm0,0x50(%rsp)
3468 movaps 0x60(%rsp),%xmm12
3469 movaps %xmm0,0x60(%rsp)
3470 movaps 0x70(%rsp),%xmm13
3471 movaps %xmm0,0x70(%rsp)
3472 movaps 0x80(%rsp),%xmm14
3473 movaps %xmm0,0x80(%rsp)
3474 movaps 0x90(%rsp),%xmm15
3475 movaps %xmm0,0x90(%rsp)
3476 lea 0xa0+0x28(%rsp),%rax
3477.Locb_dec_pop:
bd30091c
AP
3478___
3479$code.=<<___;
# %rax again equals the entry stack pointer; reload the saved registers
# relative to it and then drop the whole frame in one step
384e6de4 3480 mov -40(%rax),%r14
b84460ad 3481.cfi_restore %r14
384e6de4 3482 mov -32(%rax),%r13
b84460ad 3483.cfi_restore %r13
384e6de4 3484 mov -24(%rax),%r12
b84460ad 3485.cfi_restore %r12
384e6de4 3486 mov -16(%rax),%rbp
b84460ad 3487.cfi_restore %rbp
384e6de4 3488 mov -8(%rax),%rbx
b84460ad 3489.cfi_restore %rbx
384e6de4 3490 lea (%rax),%rsp
b84460ad 3491.cfi_def_cfa_register %rsp
bd30091c
AP
3492.Locb_dec_epilogue:
3493 ret
b84460ad 3494.cfi_endproc
bd30091c
AP
3495.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3496
# __ocb_decrypt6: OCB-decrypt the six blocks held in $inout0..$inout5.
# On entry: @offset[5] = previous offset_i ^ round[last], @offset[0] = L_0,
# $i1/$i3/$i5 = L_ table offsets for the 2nd, 4th and 6th block,
# $rndkey1 = round[1], %rax = twisted rounds (negative loop index).
# On exit: $inout0..$inout5 hold the plaintext, @offset[1]..@offset[5] hold
# the new offsets ^ round[last], @offset[0] is reloaded with L_0, %rax and
# $rndkey1 are restored, and $i1/$i3/$i5/$block_num are advanced for the
# next six blocks. $checksum is not touched here; the caller accumulates it
# from the decrypted output.
3497.type __ocb_decrypt6,\@abi-omnipotent
3498.align 32
3499__ocb_decrypt6:
3500 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
# chain the six offsets: each one is the previous offset xored with L_0 (odd
# positions, copied from @offset[0]) or with the looked-up L_ entry
3501 movdqu ($L_p,$i1),@offset[1]
3502 movdqa @offset[0],@offset[2]
3503 movdqu ($L_p,$i3),@offset[3]
3504 movdqa @offset[0],@offset[4]
3505 pxor @offset[5],@offset[0]
3506 movdqu ($L_p,$i5),@offset[5]
3507 pxor @offset[0],@offset[1]
3508 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3509 pxor @offset[1],@offset[2]
3510 pxor @offset[1],$inout1
3511 pxor @offset[2],@offset[3]
3512 pxor @offset[2],$inout2
3513 pxor @offset[3],@offset[4]
3514 pxor @offset[3],$inout3
3515 pxor @offset[4],@offset[5]
3516 pxor @offset[4],$inout4
3517 pxor @offset[5],$inout5
3518 $movkey 32($key_),$rndkey0
3519
# block-number bookkeeping for the NEXT call, interleaved with the rounds
3520 lea 1($block_num),$i1 # even-numbered blocks
3521 lea 3($block_num),$i3
3522 lea 5($block_num),$i5
3523 add \$6,$block_num
3524 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3525 bsf $i1,$i1 # ntz(block)
3526 bsf $i3,$i3
3527 bsf $i5,$i5
3528
# round 1; the remaining offsets are converted to their ^ round[last] form
# in between so they can serve as last-round keys
3529 aesdec $rndkey1,$inout0
3530 aesdec $rndkey1,$inout1
3531 aesdec $rndkey1,$inout2
3532 aesdec $rndkey1,$inout3
3533 pxor $rndkey0l,@offset[1]
3534 pxor $rndkey0l,@offset[2]
3535 aesdec $rndkey1,$inout4
3536 pxor $rndkey0l,@offset[3]
3537 pxor $rndkey0l,@offset[4]
3538 aesdec $rndkey1,$inout5
3539 $movkey 48($key_),$rndkey1
3540 pxor $rndkey0l,@offset[5]
3541
3542 aesdec $rndkey0,$inout0
3543 aesdec $rndkey0,$inout1
3544 aesdec $rndkey0,$inout2
3545 aesdec $rndkey0,$inout3
3546 aesdec $rndkey0,$inout4
3547 aesdec $rndkey0,$inout5
3548 $movkey 64($key_),$rndkey0
3549 shl \$4,$i1 # ntz(block) -> table offset
3550 shl \$4,$i3
3551 jmp .Locb_dec_loop6
3552
3553.align 32
3554.Locb_dec_loop6:
# two rounds per iteration; %rax steps by 32 towards zero and the jnz below
# tests the result of that add (the instructions in between leave the flags
# untouched)
3555 aesdec $rndkey1,$inout0
3556 aesdec $rndkey1,$inout1
3557 aesdec $rndkey1,$inout2
3558 aesdec $rndkey1,$inout3
3559 aesdec $rndkey1,$inout4
3560 aesdec $rndkey1,$inout5
3561 $movkey ($key,%rax),$rndkey1
3562 add \$32,%rax
3563
3564 aesdec $rndkey0,$inout0
3565 aesdec $rndkey0,$inout1
3566 aesdec $rndkey0,$inout2
3567 aesdec $rndkey0,$inout3
3568 aesdec $rndkey0,$inout4
3569 aesdec $rndkey0,$inout5
3570 $movkey -16($key,%rax),$rndkey0
3571 jnz .Locb_dec_loop6
3572
3573 aesdec $rndkey1,$inout0
3574 aesdec $rndkey1,$inout1
3575 aesdec $rndkey1,$inout2
3576 aesdec $rndkey1,$inout3
3577 aesdec $rndkey1,$inout4
3578 aesdec $rndkey1,$inout5
3579 $movkey 16($key_),$rndkey1
3580 shl \$4,$i5
3581
# final round: each block uses its own offset ^ round[last] as the round key,
# which applies the last round key and the output offset xor in one step
3582 aesdeclast @offset[0],$inout0
3583 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3584 mov %r10,%rax # restore twisted rounds
3585 aesdeclast @offset[1],$inout1
3586 aesdeclast @offset[2],$inout2
3587 aesdeclast @offset[3],$inout3
3588 aesdeclast @offset[4],$inout4
3589 aesdeclast @offset[5],$inout5
3590 ret
3591.size __ocb_decrypt6,.-__ocb_decrypt6
3592
# __ocb_decrypt4: OCB-decrypt the four blocks held in $inout0..$inout3.
# Entry conditions match __ocb_decrypt6 (@offset[5] = previous offset_i ^
# round[last], @offset[0] = L_0, $i1/$i3 = L_ table offsets, $rndkey1 =
# round[1], %rax = twisted rounds).
# It does not advance $block_num or the table offsets and does not reload
# L_0 into @offset[0]; in this file it is only called for the 2..4 block
# tail, after which the caller copies the offset of the last real block from
# @offset[1..3] into @offset[5]. $checksum is not touched here; the caller
# accumulates it from the decrypted output.
3593.type __ocb_decrypt4,\@abi-omnipotent
3594.align 32
3595__ocb_decrypt4:
3596 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
# chain four offsets: L_0, L_[ntz], L_0, L_[ntz] xored in turn
3597 movdqu ($L_p,$i1),@offset[1]
3598 movdqa @offset[0],@offset[2]
3599 movdqu ($L_p,$i3),@offset[3]
3600 pxor @offset[5],@offset[0]
3601 pxor @offset[0],@offset[1]
3602 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3603 pxor @offset[1],@offset[2]
3604 pxor @offset[1],$inout1
3605 pxor @offset[2],@offset[3]
3606 pxor @offset[2],$inout2
3607 pxor @offset[3],$inout3
3608 $movkey 32($key_),$rndkey0
3609
3610 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3611 pxor $rndkey0l,@offset[1]
3612 pxor $rndkey0l,@offset[2]
3613 pxor $rndkey0l,@offset[3]
3614
3615 aesdec $rndkey1,$inout0
3616 aesdec $rndkey1,$inout1
3617 aesdec $rndkey1,$inout2
3618 aesdec $rndkey1,$inout3
3619 $movkey 48($key_),$rndkey1
3620
3621 aesdec $rndkey0,$inout0
3622 aesdec $rndkey0,$inout1
3623 aesdec $rndkey0,$inout2
3624 aesdec $rndkey0,$inout3
3625 $movkey 64($key_),$rndkey0
3626 jmp .Locb_dec_loop4
3627
3628.align 32
3629.Locb_dec_loop4:
# two rounds per iteration; the jnz tests the result of the add on %rax
3630 aesdec $rndkey1,$inout0
3631 aesdec $rndkey1,$inout1
3632 aesdec $rndkey1,$inout2
3633 aesdec $rndkey1,$inout3
3634 $movkey ($key,%rax),$rndkey1
3635 add \$32,%rax
3636
3637 aesdec $rndkey0,$inout0
3638 aesdec $rndkey0,$inout1
3639 aesdec $rndkey0,$inout2
3640 aesdec $rndkey0,$inout3
3641 $movkey -16($key,%rax),$rndkey0
3642 jnz .Locb_dec_loop4
3643
3644 aesdec $rndkey1,$inout0
3645 aesdec $rndkey1,$inout1
3646 aesdec $rndkey1,$inout2
3647 aesdec $rndkey1,$inout3
3648 $movkey 16($key_),$rndkey1
3649 mov %r10,%rax # restore twisted rounds
3650
# final round keyed with offset_i ^ round[last] for each block
3651 aesdeclast @offset[0],$inout0
3652 aesdeclast @offset[1],$inout1
3653 aesdeclast @offset[2],$inout2
3654 aesdeclast @offset[3],$inout3
3655 ret
3656.size __ocb_decrypt4,.-__ocb_decrypt4
3657
# __ocb_decrypt1: OCB-decrypt the single block held in $inout0.
# On entry: $inout5 is a borrowed register holding the L_ value for this
# block, @offset[5] = previous offset_i ^ round[last], $rndkey1 = round[1],
# %rax = twisted rounds.
# On exit: $inout0 = plaintext and $inout5 = new offset_i ^ round[last]; the
# caller copies $inout5 back into @offset[5] and xors the plaintext into
# $checksum itself. %rax is restored from %r10.
3658.type __ocb_decrypt1,\@abi-omnipotent
3659.align 32
3660__ocb_decrypt1:
3661 pxor @offset[5],$inout5 # offset_i
3662 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3663 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3664 $movkey 32($key_),$rndkey0
3665
3666 aesdec $rndkey1,$inout0
3667 $movkey 48($key_),$rndkey1
3668 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3669
3670 aesdec $rndkey0,$inout0
3671 $movkey 64($key_),$rndkey0
3672 jmp .Locb_dec_loop1
3673
3674.align 32
3675.Locb_dec_loop1:
# two rounds per iteration; the jnz tests the result of the add on %rax
3676 aesdec $rndkey1,$inout0
3677 $movkey ($key,%rax),$rndkey1
3678 add \$32,%rax
3679
3680 aesdec $rndkey0,$inout0
3681 $movkey -16($key,%rax),$rndkey0
3682 jnz .Locb_dec_loop1
3683
3684 aesdec $rndkey1,$inout0
3685 $movkey 16($key_),$rndkey1 # redundant in tail
3686 mov %r10,%rax # restore twisted rounds
3687
# final round keyed with offset_i ^ round[last]
3688 aesdeclast $inout5,$inout0
3689 ret
3690.size __ocb_decrypt1,.-__ocb_decrypt1
3691___
f8501464 3692} }}
d64a7232 3693\f
6c83629b 3694########################################################################
d64a7232
AP
3695# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3696# size_t length, const AES_KEY *key,
3697# unsigned char *ivp,const int enc);
f8501464 3698{
73325b22
AP
3699my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3700my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
73325b22 3701
d64a7232
AP
3702$code.=<<___;
3703.globl ${PREFIX}_cbc_encrypt
3704.type ${PREFIX}_cbc_encrypt,\@function,6
3705.align 16
3706${PREFIX}_cbc_encrypt:
b84460ad 3707.cfi_startproc
d64a7232
AP
3708 test $len,$len # check length
3709 jz .Lcbc_ret
d608b4d6 3710
f8501464 3711 mov 240($key),$rnds_ # key->rounds
d64a7232 3712 mov $key,$key_ # backup $key
d608b4d6 3713 test %r9d,%r9d # 6th argument
d64a7232
AP
3714 jz .Lcbc_decrypt
3715#--------------------------- CBC ENCRYPT ------------------------------#
f8501464 3716 movups ($ivp),$inout0 # load iv as initial state
d608b4d6 3717 mov $rnds_,$rounds
d7d119a3 3718 cmp \$16,$len
d64a7232
AP
3719 jb .Lcbc_enc_tail
3720 sub \$16,$len
3721 jmp .Lcbc_enc_loop
d7d119a3 3722.align 16
d64a7232 3723.Lcbc_enc_loop:
f8501464 3724 movups ($inp),$inout1 # load input
d64a7232 3725 lea 16($inp),$inp
f8501464 3726 #xorps $inout1,$inout0
d64a7232 3727___
f8501464 3728 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
d64a7232 3729$code.=<<___;
d608b4d6
AP
3730 mov $rnds_,$rounds # restore $rounds
3731 mov $key_,$key # restore $key
d7d119a3
AP
3732 movups $inout0,0($out) # store output
3733 lea 16($out),$out
3734 sub \$16,$len
d64a7232
AP
3735 jnc .Lcbc_enc_loop
3736 add \$16,$len
3737 jnz .Lcbc_enc_tail
23f6eec7
AP
3738 pxor $rndkey0,$rndkey0 # clear register bank
3739 pxor $rndkey1,$rndkey1
d608b4d6 3740 movups $inout0,($ivp)
23f6eec7
AP
3741 pxor $inout0,$inout0
3742 pxor $inout1,$inout1
d64a7232
AP
3743 jmp .Lcbc_ret
3744
3745.Lcbc_enc_tail:
3746 mov $len,%rcx # zaps $key
3747 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3748 .long 0x9066A4F3 # rep movsb
3749 mov \$16,%ecx # zero tail
3750 sub $len,%rcx
3751 xor %eax,%eax
3752 .long 0x9066AAF3 # rep stosb
3753 lea -16(%rdi),%rdi # rewind $out by 1 block
3754 mov $rnds_,$rounds # restore $rounds
3755 mov %rdi,%rsi # $inp and $out are the same
3756 mov $key_,$key # restore $key
3757 xor $len,$len # len=16
3758 jmp .Lcbc_enc_loop # one more spin
3759\f#--------------------------- CBC DECRYPT ------------------------------#
3760.align 16
3761.Lcbc_decrypt:
23f6eec7
AP
3762 cmp \$16,$len
3763 jne .Lcbc_decrypt_bulk
3764
3765 # handle single block without allocating stack frame,
3766 # useful in ciphertext stealing mode
3767 movdqu ($inp),$inout0 # load input
3768 movdqu ($ivp),$inout1 # load iv
3769 movdqa $inout0,$inout2 # future iv
3770___
3771 &aesni_generate1("dec",$key,$rnds_);
3772$code.=<<___;
3773 pxor $rndkey0,$rndkey0 # clear register bank
3774 pxor $rndkey1,$rndkey1
3775 movdqu $inout2,($ivp) # store iv
3776 xorps $inout1,$inout0 # ^=iv
3777 pxor $inout1,$inout1
3778 movups $inout0,($out) # store output
3779 pxor $inout0,$inout0
3780 jmp .Lcbc_ret
3781.align 16
3782.Lcbc_decrypt_bulk:
384e6de4 3783 lea (%rsp),%r11 # frame pointer
b84460ad 3784.cfi_def_cfa_register %r11
6a40ebe8 3785 push %rbp
b84460ad 3786.cfi_push %rbp
6a40ebe8
AP
3787 sub \$$frame_size,%rsp
3788 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
d64a7232
AP
3789___
3790$code.=<<___ if ($win64);
6a40ebe8
AP
3791 movaps %xmm6,0x10(%rsp)
3792 movaps %xmm7,0x20(%rsp)
3793 movaps %xmm8,0x30(%rsp)
3794 movaps %xmm9,0x40(%rsp)
73325b22
AP
3795 movaps %xmm10,0x50(%rsp)
3796 movaps %xmm11,0x60(%rsp)
3797 movaps %xmm12,0x70(%rsp)
3798 movaps %xmm13,0x80(%rsp)
3799 movaps %xmm14,0x90(%rsp)
3800 movaps %xmm15,0xa0(%rsp)
d608b4d6 3801.Lcbc_decrypt_body:
d64a7232 3802___
384e6de4
AP
3803
3804my $inp_=$key_="%rbp"; # reassign $key_
3805
d64a7232 3806$code.=<<___;
384e6de4 3807 mov $key,$key_ # [re-]backup $key [after reassignment]
d64a7232 3808 movups ($ivp),$iv
d608b4d6 3809 mov $rnds_,$rounds
73325b22 3810 cmp \$0x50,$len
d608b4d6 3811 jbe .Lcbc_dec_tail
73325b22
AP
3812
3813 $movkey ($key),$rndkey0
3814 movdqu 0x00($inp),$inout0 # load input
3815 movdqu 0x10($inp),$inout1
3816 movdqa $inout0,$in0
3817 movdqu 0x20($inp),$inout2
3818 movdqa $inout1,$in1
3819 movdqu 0x30($inp),$inout3
3820 movdqa $inout2,$in2
3821 movdqu 0x40($inp),$inout4
3822 movdqa $inout3,$in3
3823 movdqu 0x50($inp),$inout5
3824 movdqa $inout4,$in4
5599c733 3825 mov OPENSSL_ia32cap_P+4(%rip),%r9d
73325b22
AP
3826 cmp \$0x70,$len
3827 jbe .Lcbc_dec_six_or_seven
3828
23f6eec7
AP
3829 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3830 sub \$0x50,$len # $len is biased by -5*16
5599c733 3831 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
23f6eec7
AP
3832 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3833 sub \$0x20,$len # $len is biased by -7*16
73325b22 3834 lea 0x70($key),$key # size optimization
f8501464 3835 jmp .Lcbc_dec_loop8_enter
d7d119a3 3836.align 16
f8501464 3837.Lcbc_dec_loop8:
f8501464
AP
3838 movups $inout7,($out)
3839 lea 0x10($out),$out
3840.Lcbc_dec_loop8_enter:
73325b22
AP
3841 movdqu 0x60($inp),$inout6
3842 pxor $rndkey0,$inout0
3843 movdqu 0x70($inp),$inout7
3844 pxor $rndkey0,$inout1
3845 $movkey 0x10-0x70($key),$rndkey1
3846 pxor $rndkey0,$inout2
384e6de4 3847 mov \$-1,$inp_
73325b22
AP
3848 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3849 pxor $rndkey0,$inout3
3850 pxor $rndkey0,$inout4
3851 pxor $rndkey0,$inout5
3852 pxor $rndkey0,$inout6
d7d119a3 3853
f8501464 3854 aesdec $rndkey1,$inout0
73325b22
AP
3855 pxor $rndkey0,$inout7
3856 $movkey 0x20-0x70($key),$rndkey0
f8501464 3857 aesdec $rndkey1,$inout1
f8501464 3858 aesdec $rndkey1,$inout2
f8501464 3859 aesdec $rndkey1,$inout3
f8501464 3860 aesdec $rndkey1,$inout4
f8501464 3861 aesdec $rndkey1,$inout5
f8501464 3862 aesdec $rndkey1,$inout6
384e6de4
AP
3863 adc \$0,$inp_
3864 and \$128,$inp_
f8501464 3865 aesdec $rndkey1,$inout7
73325b22
AP
3866 add $inp,$inp_
3867 $movkey 0x30-0x70($key),$rndkey1
3868___
3869for($i=1;$i<12;$i++) {
3870my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
d8ba0dc9
AP
3871$code.=<<___ if ($i==7);
3872 cmp \$11,$rounds
3873___
73325b22
AP
3874$code.=<<___;
3875 aesdec $rndkeyx,$inout0
3876 aesdec $rndkeyx,$inout1
3877 aesdec $rndkeyx,$inout2
3878 aesdec $rndkeyx,$inout3
3879 aesdec $rndkeyx,$inout4
3880 aesdec $rndkeyx,$inout5
3881 aesdec $rndkeyx,$inout6
3882 aesdec $rndkeyx,$inout7
3883 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3884___
d8ba0dc9
AP
3885$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3886 nop
3887___
73325b22 3888$code.=<<___ if ($i==7);
73325b22
AP
3889 jb .Lcbc_dec_done
3890___
3891$code.=<<___ if ($i==9);
3892 je .Lcbc_dec_done
3893___
d8ba0dc9
AP
3894$code.=<<___ if ($i==11);
3895 jmp .Lcbc_dec_done
3896___
73325b22
AP
3897}
3898$code.=<<___;
d8ba0dc9 3899.align 16
73325b22
AP
3900.Lcbc_dec_done:
3901 aesdec $rndkey1,$inout0
73325b22 3902 aesdec $rndkey1,$inout1
d8ba0dc9 3903 pxor $rndkey0,$iv
73325b22
AP
3904 pxor $rndkey0,$in0
3905 aesdec $rndkey1,$inout2
73325b22 3906 aesdec $rndkey1,$inout3
d8ba0dc9 3907 pxor $rndkey0,$in1
73325b22
AP
3908 pxor $rndkey0,$in2
3909 aesdec $rndkey1,$inout4
73325b22 3910 aesdec $rndkey1,$inout5
d8ba0dc9 3911 pxor $rndkey0,$in3
73325b22
AP
3912 pxor $rndkey0,$in4
3913 aesdec $rndkey1,$inout6
3914 aesdec $rndkey1,$inout7
3915 movdqu 0x50($inp),$rndkey1
d64a7232 3916
73325b22
AP
3917 aesdeclast $iv,$inout0
3918 movdqu 0x60($inp),$iv # borrow $iv
3919 pxor $rndkey0,$rndkey1
3920 aesdeclast $in0,$inout1
3921 pxor $rndkey0,$iv
3922 movdqu 0x70($inp),$rndkey0 # next IV
73325b22 3923 aesdeclast $in1,$inout2
d8ba0dc9 3924 lea 0x80($inp),$inp
73325b22
AP
3925 movdqu 0x00($inp_),$in0
3926 aesdeclast $in2,$inout3
73325b22 3927 aesdeclast $in3,$inout4
d8ba0dc9 3928 movdqu 0x10($inp_),$in1
73325b22
AP
3929 movdqu 0x20($inp_),$in2
3930 aesdeclast $in4,$inout5
73325b22 3931 aesdeclast $rndkey1,$inout6
d8ba0dc9 3932 movdqu 0x30($inp_),$in3
73325b22
AP
3933 movdqu 0x40($inp_),$in4
3934 aesdeclast $iv,$inout7
3935 movdqa $rndkey0,$iv # return $iv
3936 movdqu 0x50($inp_),$rndkey1
3937 $movkey -0x70($key),$rndkey0
3938
3939 movups $inout0,($out) # store output
3940 movdqa $in0,$inout0
3941 movups $inout1,0x10($out)
3942 movdqa $in1,$inout1
3943 movups $inout2,0x20($out)
3944 movdqa $in2,$inout2
3945 movups $inout3,0x30($out)
3946 movdqa $in3,$inout3
3947 movups $inout4,0x40($out)
3948 movdqa $in4,$inout4
3949 movups $inout5,0x50($out)
3950 movdqa $rndkey1,$inout5
3951 movups $inout6,0x60($out)
3952 lea 0x70($out),$out
f8501464 3953
f8501464
AP
3954 sub \$0x80,$len
3955 ja .Lcbc_dec_loop8
3956
3957 movaps $inout7,$inout0
73325b22 3958 lea -0x70($key),$key
f8501464 3959 add \$0x70,$len
23f6eec7 3960 jle .Lcbc_dec_clear_tail_collected
73325b22 3961 movups $inout7,($out)
f8501464 3962 lea 0x10($out),$out
73325b22
AP
3963 cmp \$0x50,$len
3964 jbe .Lcbc_dec_tail
3965
3966 movaps $in0,$inout0
3967.Lcbc_dec_six_or_seven:
3968 cmp \$0x60,$len
3969 ja .Lcbc_dec_seven
3970
3971 movaps $inout5,$inout6
3972 call _aesni_decrypt6
3973 pxor $iv,$inout0 # ^= IV
3974 movaps $inout6,$iv
3975 pxor $in0,$inout1
3976 movdqu $inout0,($out)
3977 pxor $in1,$inout2
3978 movdqu $inout1,0x10($out)
23f6eec7 3979 pxor $inout1,$inout1 # clear register bank
73325b22
AP
3980 pxor $in2,$inout3
3981 movdqu $inout2,0x20($out)
23f6eec7 3982 pxor $inout2,$inout2
73325b22
AP
3983 pxor $in3,$inout4
3984 movdqu $inout3,0x30($out)
23f6eec7 3985 pxor $inout3,$inout3
73325b22
AP
3986 pxor $in4,$inout5
3987 movdqu $inout4,0x40($out)
23f6eec7 3988 pxor $inout4,$inout4
73325b22
AP
3989 lea 0x50($out),$out
3990 movdqa $inout5,$inout0
23f6eec7 3991 pxor $inout5,$inout5
73325b22
AP
3992 jmp .Lcbc_dec_tail_collected
3993
3994.align 16
3995.Lcbc_dec_seven:
3996 movups 0x60($inp),$inout6
3997 xorps $inout7,$inout7
3998 call _aesni_decrypt8
3999 movups 0x50($inp),$inout7
4000 pxor $iv,$inout0 # ^= IV
4001 movups 0x60($inp),$iv
4002 pxor $in0,$inout1
4003 movdqu $inout0,($out)
4004 pxor $in1,$inout2
4005 movdqu $inout1,0x10($out)
23f6eec7 4006 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4007 pxor $in2,$inout3
4008 movdqu $inout2,0x20($out)
23f6eec7 4009 pxor $inout2,$inout2
73325b22
AP
4010 pxor $in3,$inout4
4011 movdqu $inout3,0x30($out)
23f6eec7 4012 pxor $inout3,$inout3
73325b22
AP
4013 pxor $in4,$inout5
4014 movdqu $inout4,0x40($out)
23f6eec7 4015 pxor $inout4,$inout4
73325b22
AP
4016 pxor $inout7,$inout6
4017 movdqu $inout5,0x50($out)
23f6eec7 4018 pxor $inout5,$inout5
73325b22
AP
4019 lea 0x60($out),$out
4020 movdqa $inout6,$inout0
23f6eec7
AP
4021 pxor $inout6,$inout6
4022 pxor $inout7,$inout7
73325b22
AP
4023 jmp .Lcbc_dec_tail_collected
4024
5599c733
AP
4025.align 16
4026.Lcbc_dec_loop6:
4027 movups $inout5,($out)
4028 lea 0x10($out),$out
4029 movdqu 0x00($inp),$inout0 # load input
4030 movdqu 0x10($inp),$inout1
4031 movdqa $inout0,$in0
4032 movdqu 0x20($inp),$inout2
4033 movdqa $inout1,$in1
4034 movdqu 0x30($inp),$inout3
4035 movdqa $inout2,$in2
4036 movdqu 0x40($inp),$inout4
4037 movdqa $inout3,$in3
4038 movdqu 0x50($inp),$inout5
4039 movdqa $inout4,$in4
4040.Lcbc_dec_loop6_enter:
4041 lea 0x60($inp),$inp
4042 movdqa $inout5,$inout6
4043
4044 call _aesni_decrypt6
4045
4046 pxor $iv,$inout0 # ^= IV
4047 movdqa $inout6,$iv
4048 pxor $in0,$inout1
4049 movdqu $inout0,($out)
4050 pxor $in1,$inout2
4051 movdqu $inout1,0x10($out)
4052 pxor $in2,$inout3
4053 movdqu $inout2,0x20($out)
4054 pxor $in3,$inout4
4055 mov $key_,$key
4056 movdqu $inout3,0x30($out)
4057 pxor $in4,$inout5
4058 mov $rnds_,$rounds
4059 movdqu $inout4,0x40($out)
4060 lea 0x50($out),$out
4061 sub \$0x60,$len
4062 ja .Lcbc_dec_loop6
4063
4064 movdqa $inout5,$inout0
4065 add \$0x50,$len
23f6eec7 4066 jle .Lcbc_dec_clear_tail_collected
5599c733
AP
4067 movups $inout5,($out)
4068 lea 0x10($out),$out
4069
6c83629b 4070.Lcbc_dec_tail:
d64a7232 4071 movups ($inp),$inout0
73325b22 4072 sub \$0x10,$len
23f6eec7 4073 jbe .Lcbc_dec_one # $len is 1*16 or less
f8501464 4074
d64a7232 4075 movups 0x10($inp),$inout1
73325b22
AP
4076 movaps $inout0,$in0
4077 sub \$0x10,$len
23f6eec7 4078 jbe .Lcbc_dec_two # $len is 2*16 or less
f8501464 4079
d64a7232 4080 movups 0x20($inp),$inout2
73325b22
AP
4081 movaps $inout1,$in1
4082 sub \$0x10,$len
23f6eec7 4083 jbe .Lcbc_dec_three # $len is 3*16 or less
f8501464 4084
d64a7232 4085 movups 0x30($inp),$inout3
73325b22
AP
4086 movaps $inout2,$in2
4087 sub \$0x10,$len
23f6eec7 4088 jbe .Lcbc_dec_four # $len is 4*16 or less
f8501464 4089
23f6eec7 4090 movups 0x40($inp),$inout4 # $len is 5*16 or less
73325b22
AP
4091 movaps $inout3,$in3
4092 movaps $inout4,$in4
4093 xorps $inout5,$inout5
4094 call _aesni_decrypt6
4095 pxor $iv,$inout0
4096 movaps $in4,$iv
4097 pxor $in0,$inout1
4098 movdqu $inout0,($out)
4099 pxor $in1,$inout2
4100 movdqu $inout1,0x10($out)
23f6eec7 4101 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4102 pxor $in2,$inout3
4103 movdqu $inout2,0x20($out)
23f6eec7 4104 pxor $inout2,$inout2
73325b22
AP
4105 pxor $in3,$inout4
4106 movdqu $inout3,0x30($out)
23f6eec7 4107 pxor $inout3,$inout3
73325b22
AP
4108 lea 0x40($out),$out
4109 movdqa $inout4,$inout0
23f6eec7
AP
4110 pxor $inout4,$inout4
4111 pxor $inout5,$inout5
73325b22 4112 sub \$0x10,$len
d64a7232 4113 jmp .Lcbc_dec_tail_collected
73325b22 4114
d64a7232
AP
4115.align 16
4116.Lcbc_dec_one:
73325b22 4117 movaps $inout0,$in0
d64a7232 4118___
d608b4d6 4119 &aesni_generate1("dec",$key,$rounds);
d64a7232 4120$code.=<<___;
f8501464 4121 xorps $iv,$inout0
d64a7232
AP
4122 movaps $in0,$iv
4123 jmp .Lcbc_dec_tail_collected
4124.align 16
4125.Lcbc_dec_two:
73325b22 4126 movaps $inout1,$in1
214368ff 4127 call _aesni_decrypt2
73325b22 4128 pxor $iv,$inout0
d64a7232 4129 movaps $in1,$iv
73325b22
AP
4130 pxor $in0,$inout1
4131 movdqu $inout0,($out)
4132 movdqa $inout1,$inout0
23f6eec7 4133 pxor $inout1,$inout1 # clear register bank
d64a7232
AP
4134 lea 0x10($out),$out
4135 jmp .Lcbc_dec_tail_collected
4136.align 16
4137.Lcbc_dec_three:
73325b22 4138 movaps $inout2,$in2
d608b4d6 4139 call _aesni_decrypt3
73325b22 4140 pxor $iv,$inout0
d64a7232 4141 movaps $in2,$iv
73325b22
AP
4142 pxor $in0,$inout1
4143 movdqu $inout0,($out)
4144 pxor $in1,$inout2
4145 movdqu $inout1,0x10($out)
23f6eec7 4146 pxor $inout1,$inout1 # clear register bank
73325b22 4147 movdqa $inout2,$inout0
23f6eec7 4148 pxor $inout2,$inout2
d64a7232 4149 lea 0x20($out),$out
f8501464
AP
4150 jmp .Lcbc_dec_tail_collected
4151.align 16
4152.Lcbc_dec_four:
73325b22 4153 movaps $inout3,$in3
f8501464 4154 call _aesni_decrypt4
73325b22
AP
4155 pxor $iv,$inout0
4156 movaps $in3,$iv
4157 pxor $in0,$inout1
4158 movdqu $inout0,($out)
4159 pxor $in1,$inout2
4160 movdqu $inout1,0x10($out)
23f6eec7 4161 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4162 pxor $in2,$inout3
4163 movdqu $inout2,0x20($out)
23f6eec7 4164 pxor $inout2,$inout2
73325b22 4165 movdqa $inout3,$inout0
23f6eec7 4166 pxor $inout3,$inout3
f8501464 4167 lea 0x30($out),$out
d64a7232 4168 jmp .Lcbc_dec_tail_collected
73325b22 4169
d64a7232 4170.align 16
23f6eec7
AP
4171.Lcbc_dec_clear_tail_collected:
4172 pxor $inout1,$inout1 # clear register bank
4173 pxor $inout2,$inout2
4174 pxor $inout3,$inout3
4175___
4176$code.=<<___ if (!$win64);
4177 pxor $inout4,$inout4 # %xmm6..9
4178 pxor $inout5,$inout5
4179 pxor $inout6,$inout6
4180 pxor $inout7,$inout7
4181___
4182$code.=<<___;
d64a7232 4183.Lcbc_dec_tail_collected:
d64a7232 4184 movups $iv,($ivp)
73325b22 4185 and \$15,$len
d64a7232 4186 jnz .Lcbc_dec_tail_partial
f8501464 4187 movups $inout0,($out)
23f6eec7 4188 pxor $inout0,$inout0
d64a7232 4189 jmp .Lcbc_dec_ret
d7d119a3 4190.align 16
d64a7232 4191.Lcbc_dec_tail_partial:
6a40ebe8 4192 movaps $inout0,(%rsp)
23f6eec7 4193 pxor $inout0,$inout0
f8501464 4194 mov \$16,%rcx
d64a7232 4195 mov $out,%rdi
f8501464 4196 sub $len,%rcx
6a40ebe8 4197 lea (%rsp),%rsi
23f6eec7
AP
4198 .long 0x9066A4F3 # rep movsb
4199 movdqa $inout0,(%rsp)
d64a7232
AP
4200
4201.Lcbc_dec_ret:
23f6eec7
AP
4202 xorps $rndkey0,$rndkey0 # %xmm0
4203 pxor $rndkey1,$rndkey1
d64a7232
AP
4204___
4205$code.=<<___ if ($win64);
6a40ebe8 4206 movaps 0x10(%rsp),%xmm6
23f6eec7 4207 movaps %xmm0,0x10(%rsp) # clear stack
6a40ebe8 4208 movaps 0x20(%rsp),%xmm7
23f6eec7 4209 movaps %xmm0,0x20(%rsp)
6a40ebe8 4210 movaps 0x30(%rsp),%xmm8
23f6eec7 4211 movaps %xmm0,0x30(%rsp)
6a40ebe8 4212 movaps 0x40(%rsp),%xmm9
23f6eec7 4213 movaps %xmm0,0x40(%rsp)
73325b22 4214 movaps 0x50(%rsp),%xmm10
23f6eec7 4215 movaps %xmm0,0x50(%rsp)
73325b22 4216 movaps 0x60(%rsp),%xmm11
23f6eec7 4217 movaps %xmm0,0x60(%rsp)
73325b22 4218 movaps 0x70(%rsp),%xmm12
23f6eec7 4219 movaps %xmm0,0x70(%rsp)
73325b22 4220 movaps 0x80(%rsp),%xmm13
23f6eec7 4221 movaps %xmm0,0x80(%rsp)
73325b22 4222 movaps 0x90(%rsp),%xmm14
23f6eec7 4223 movaps %xmm0,0x90(%rsp)
73325b22 4224 movaps 0xa0(%rsp),%xmm15
23f6eec7 4225 movaps %xmm0,0xa0(%rsp)
d64a7232
AP
4226___
4227$code.=<<___;
384e6de4 4228 mov -8(%r11),%rbp
b84460ad 4229.cfi_restore %rbp
384e6de4 4230 lea (%r11),%rsp
b84460ad 4231.cfi_def_cfa_register %rsp
d64a7232
AP
4232.Lcbc_ret:
4233 ret
b84460ad 4234.cfi_endproc
d64a7232
AP
4235.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4236___
f8501464 4237} \f
23f6eec7 4238# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
d608b4d6 4239# int bits, AES_KEY *key)
23f6eec7
AP
4240#
4241# input: $inp user-supplied key
4242# $bits $inp length in bits
4243# $key pointer to key schedule
4244# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4245# *$key key schedule
4246#
d608b4d6
AP
4247{ my ($inp,$bits,$key) = @_4args;
4248 $bits =~ s/%r/%e/;
4249
d64a7232
AP
4250$code.=<<___;
4251.globl ${PREFIX}_set_decrypt_key
d608b4d6 4252.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
d64a7232
AP
4253.align 16
4254${PREFIX}_set_decrypt_key:
b84460ad 4255.cfi_startproc
d608b4d6 4256 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
b84460ad 4257.cfi_adjust_cfa_offset 8
fb2f3411 4258 call __aesni_set_encrypt_key
d608b4d6 4259 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
d64a7232
AP
4260 test %eax,%eax
4261 jnz .Ldec_key_ret
d608b4d6
AP
4262 lea 16($key,$bits),$inp # points at the end of key schedule
4263
4264 $movkey ($key),%xmm0 # just swap
4265 $movkey ($inp),%xmm1
4266 $movkey %xmm0,($inp)
4267 $movkey %xmm1,($key)
4268 lea 16($key),$key
4269 lea -16($inp),$inp
4270
d64a7232 4271.Ldec_key_inverse:
d608b4d6
AP
4272 $movkey ($key),%xmm0 # swap and inverse
4273 $movkey ($inp),%xmm1
d64a7232
AP
4274 aesimc %xmm0,%xmm0
4275 aesimc %xmm1,%xmm1
d608b4d6
AP
4276 lea 16($key),$key
4277 lea -16($inp),$inp
d608b4d6
AP
4278 $movkey %xmm0,16($inp)
4279 $movkey %xmm1,-16($key)
d7d119a3 4280 cmp $key,$inp
d64a7232
AP
4281 ja .Ldec_key_inverse
4282
d608b4d6 4283 $movkey ($key),%xmm0 # inverse middle
d64a7232 4284 aesimc %xmm0,%xmm0
23f6eec7 4285 pxor %xmm1,%xmm1
d608b4d6 4286 $movkey %xmm0,($inp)
23f6eec7 4287 pxor %xmm0,%xmm0
d64a7232 4288.Ldec_key_ret:
d608b4d6 4289 add \$8,%rsp
b84460ad 4290.cfi_adjust_cfa_offset -8
d64a7232 4291 ret
b84460ad 4292.cfi_endproc
d608b4d6 4293.LSEH_end_set_decrypt_key:
d64a7232
AP
4294.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4295___
4296\f
e3713c36
RS
4297# This is based on submission from Intel by
4298# Huang Ying
4299# Vinodh Gopal
d64a7232
AP
4300# Kahraman Akdemir
4301#
60250017 4302# Aggressively optimized in respect to aeskeygenassist's critical path
d64a7232
AP
4303# and is contained in %xmm0-5 to meet Win64 ABI requirement.
4304#
23f6eec7
AP
4305# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4306# int bits, AES_KEY * const key);
4307#
4308# input: $inp user-supplied key
4309# $bits $inp length in bits
4310# $key pointer to key schedule
4311# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4312# $bits rounds-1 (used in aesni_set_decrypt_key)
4313# *$key key schedule
4314# $key pointer to key schedule (used in
4315# aesni_set_decrypt_key)
4316#
4317# Subroutine is frame-less, which means that only volatile registers
4318# are used. Note that it's declared "abi-omnipotent", which means that
4319# the number of volatile registers is smaller on Windows.
4320#
d64a7232 4321$code.=<<___;
d608b4d6
AP
4322.globl ${PREFIX}_set_encrypt_key
4323.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
d64a7232 4324.align 16
d608b4d6 4325${PREFIX}_set_encrypt_key:
fb2f3411 4326__aesni_set_encrypt_key:
b84460ad 4327.cfi_startproc
d608b4d6 4328 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
b84460ad 4329.cfi_adjust_cfa_offset 8
d608b4d6 4330 mov \$-1,%rax
d7d119a3 4331 test $inp,$inp
d608b4d6
AP
4332 jz .Lenc_key_ret
4333 test $key,$key
4334 jz .Lenc_key_ret
4335
23f6eec7 4336 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
d608b4d6 4337 movups ($inp),%xmm0 # pull first 128 bits of *userKey
f8501464 4338 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
23f6eec7
AP
4339 and OPENSSL_ia32cap_P+4(%rip),%r10d
4340 lea 16($key),%rax # %rax is used as modifiable copy of $key
d608b4d6 4341 cmp \$256,$bits
d64a7232 4342 je .L14rounds
d608b4d6 4343 cmp \$192,$bits
d64a7232 4344 je .L12rounds
d608b4d6 4345 cmp \$128,$bits
d64a7232 4346 jne .Lbad_keybits
d608b4d6 4347
# 128-bit key path. %r10d holds OPENSSL_ia32cap_P+4 masked down to the
# AVX and XOP bits; when only the AVX bit is set the pshufb/aesenclast
# based .L10rounds_alt code is used instead of aeskeygenassist.
d64a7232 4348.L10rounds:
d608b4d6 4349 mov \$9,$bits # 10 rounds for 128-bit key
23f6eec7
AP
4350 cmp \$`1<<28`,%r10d # AVX, but no XOP
4351 je .L10rounds_alt
4352
d608b4d6 4353 $movkey %xmm0,($key) # round 0
d64a7232
AP
4354 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4355 call .Lkey_expansion_128_cold
4356 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4357 call .Lkey_expansion_128
4358 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4359 call .Lkey_expansion_128
4360 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4361 call .Lkey_expansion_128
4362 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4363 call .Lkey_expansion_128
4364 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4365 call .Lkey_expansion_128
4366 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4367 call .Lkey_expansion_128
4368 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4369 call .Lkey_expansion_128
4370 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4371 call .Lkey_expansion_128
4372 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4373 call .Lkey_expansion_128
d608b4d6
AP
# %rax started at 16($key) and was advanced by 16 in each of the nine
# .Lkey_expansion_128 calls (the _cold entry does not store or advance),
# so it is at 160($key) here and 80(%rax) is offset 240 of the key structure.
4374 $movkey %xmm0,(%rax)
4375 mov $bits,80(%rax) # 240(%rdx)
d64a7232 4376 xor %eax,%eax
d608b4d6 4377 jmp .Lenc_key_ret
d64a7232 4378
23f6eec7
AP
4379.align 16
4380.L10rounds_alt:
4381 movdqa .Lkey_rotate(%rip),%xmm5
4382 mov \$8,%r10d
4383 movdqa .Lkey_rcon1(%rip),%xmm4
4384 movdqa %xmm0,%xmm2
4385 movdqu %xmm0,($key)
4386 jmp .Loop_key128
4387
4388.align 16
4389.Loop_key128:
4390 pshufb %xmm5,%xmm0
4391 aesenclast %xmm4,%xmm0
4392 pslld \$1,%xmm4
4393 lea 16(%rax),%rax
4394
4395 movdqa %xmm2,%xmm3
4396 pslldq \$4,%xmm2
4397 pxor %xmm2,%xmm3
4398 pslldq \$4,%xmm2
4399 pxor %xmm2,%xmm3
4400 pslldq \$4,%xmm2
4401 pxor %xmm3,%xmm2
4402
4403 pxor %xmm2,%xmm0
4404 movdqu %xmm0,-16(%rax)
4405 movdqa %xmm0,%xmm2
4406
4407 dec %r10d
4408 jnz .Loop_key128
4409
4410 movdqa .Lkey_rcon1b(%rip),%xmm4
4411
4412 pshufb %xmm5,%xmm0
4413 aesenclast %xmm4,%xmm0
4414 pslld \$1,%xmm4
4415
4416 movdqa %xmm2,%xmm3
4417 pslldq \$4,%xmm2
4418 pxor %xmm2,%xmm3
4419 pslldq \$4,%xmm2
4420 pxor %xmm2,%xmm3
4421 pslldq \$4,%xmm2
4422 pxor %xmm3,%xmm2
4423
4424 pxor %xmm2,%xmm0
4425 movdqu %xmm0,(%rax)
4426
4427 movdqa %xmm0,%xmm2
4428 pshufb %xmm5,%xmm0
4429 aesenclast %xmm4,%xmm0
4430
4431 movdqa %xmm2,%xmm3
4432 pslldq \$4,%xmm2
4433 pxor %xmm2,%xmm3
4434 pslldq \$4,%xmm2
4435 pxor %xmm2,%xmm3
4436 pslldq \$4,%xmm2
4437 pxor %xmm3,%xmm2
4438
4439 pxor %xmm2,%xmm0
4440 movdqu %xmm0,16(%rax)
4441
4442 mov $bits,96(%rax) # 240($key)
4443 xor %eax,%eax
4444 jmp .Lenc_key_ret
4445
d64a7232
AP
4446.align 16
4447.L12rounds:
d608b4d6
AP
4448 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4449 mov \$11,$bits # 12 rounds for 192
23f6eec7
AP
4450 cmp \$`1<<28`,%r10d # AVX, but no XOP
4451 je .L12rounds_alt
4452
d608b4d6 4453 $movkey %xmm0,($key) # round 0
d64a7232
AP
4454 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4455 call .Lkey_expansion_192a_cold
4456 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4457 call .Lkey_expansion_192b
4458 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4459 call .Lkey_expansion_192a
4460 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4461 call .Lkey_expansion_192b
4462 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4463 call .Lkey_expansion_192a
4464 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4465 call .Lkey_expansion_192b
4466 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4467 call .Lkey_expansion_192a
4468 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4469 call .Lkey_expansion_192b
d608b4d6
AP
4470 $movkey %xmm0,(%rax)
4471 mov $bits,48(%rax) # 240(%rdx)
d64a7232 4472 xor %rax, %rax
d608b4d6 4473 jmp .Lenc_key_ret
d64a7232 4474
23f6eec7
AP
4475.align 16
4476.L12rounds_alt:
4477 movdqa .Lkey_rotate192(%rip),%xmm5
4478 movdqa .Lkey_rcon1(%rip),%xmm4
4479 mov \$8,%r10d
4480 movdqu %xmm0,($key)
4481 jmp .Loop_key192
4482
4483.align 16
4484.Loop_key192:
4485 movq %xmm2,0(%rax)
4486 movdqa %xmm2,%xmm1
4487 pshufb %xmm5,%xmm2
4488 aesenclast %xmm4,%xmm2
4489 pslld \$1, %xmm4
4490 lea 24(%rax),%rax
4491
4492 movdqa %xmm0,%xmm3
4493 pslldq \$4,%xmm0
4494 pxor %xmm0,%xmm3
4495 pslldq \$4,%xmm0
4496 pxor %xmm0,%xmm3
4497 pslldq \$4,%xmm0
4498 pxor %xmm3,%xmm0
4499
4500 pshufd \$0xff,%xmm0,%xmm3
4501 pxor %xmm1,%xmm3
4502 pslldq \$4,%xmm1
4503 pxor %xmm1,%xmm3
4504
4505 pxor %xmm2,%xmm0
4506 pxor %xmm3,%xmm2
4507 movdqu %xmm0,-16(%rax)
4508
4509 dec %r10d
4510 jnz .Loop_key192
4511
4512 mov $bits,32(%rax) # 240($key)
4513 xor %eax,%eax
4514 jmp .Lenc_key_ret
4515
d64a7232
AP
4516.align 16
4517.L14rounds:
46f4e1be 4518 movups 16($inp),%xmm2 # remaining half of *userKey
d608b4d6
AP
4519 mov \$13,$bits # 14 rounds for 256
4520 lea 16(%rax),%rax
23f6eec7
AP
4521 cmp \$`1<<28`,%r10d # AVX, but no XOP
4522 je .L14rounds_alt
4523
d608b4d6
AP
4524 $movkey %xmm0,($key) # round 0
4525 $movkey %xmm2,16($key) # round 1
d64a7232
AP
4526 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4527 call .Lkey_expansion_256a_cold
4528 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4529 call .Lkey_expansion_256b
4530 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4531 call .Lkey_expansion_256a
4532 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4533 call .Lkey_expansion_256b
4534 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4535 call .Lkey_expansion_256a
4536 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4537 call .Lkey_expansion_256b
4538 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4539 call .Lkey_expansion_256a
4540 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4541 call .Lkey_expansion_256b
4542 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4543 call .Lkey_expansion_256a
4544 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4545 call .Lkey_expansion_256b
4546 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4547 call .Lkey_expansion_256a
4548 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4549 call .Lkey_expansion_256b
4550 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4551 call .Lkey_expansion_256a
d608b4d6
AP
4552 $movkey %xmm0,(%rax)
4553 mov $bits,16(%rax) # 240(%rdx)
d64a7232 4554 xor %rax,%rax
d608b4d6
AP
4555 jmp .Lenc_key_ret
4556
23f6eec7
AP
4557.align 16
4558.L14rounds_alt:
4559 movdqa .Lkey_rotate(%rip),%xmm5
4560 movdqa .Lkey_rcon1(%rip),%xmm4
4561 mov \$7,%r10d
4562 movdqu %xmm0,0($key)
4563 movdqa %xmm2,%xmm1
4564 movdqu %xmm2,16($key)
4565 jmp .Loop_key256
4566
4567.align 16
4568.Loop_key256:
4569 pshufb %xmm5,%xmm2
4570 aesenclast %xmm4,%xmm2
4571
4572 movdqa %xmm0,%xmm3
4573 pslldq \$4,%xmm0
4574 pxor %xmm0,%xmm3
4575 pslldq \$4,%xmm0
4576 pxor %xmm0,%xmm3
4577 pslldq \$4,%xmm0
4578 pxor %xmm3,%xmm0
4579 pslld \$1,%xmm4
4580
4581 pxor %xmm2,%xmm0
4582 movdqu %xmm0,(%rax)
4583
4584 dec %r10d
4585 jz .Ldone_key256
4586
4587 pshufd \$0xff,%xmm0,%xmm2
4588 pxor %xmm3,%xmm3
4589 aesenclast %xmm3,%xmm2
4590
4591 movdqa %xmm1,%xmm3
4592 pslldq \$4,%xmm1
4593 pxor %xmm1,%xmm3
4594 pslldq \$4,%xmm1
4595 pxor %xmm1,%xmm3
4596 pslldq \$4,%xmm1
4597 pxor %xmm3,%xmm1
4598
4599 pxor %xmm1,%xmm2
4600 movdqu %xmm2,16(%rax)
4601 lea 32(%rax),%rax
4602 movdqa %xmm2,%xmm1
4603
4604 jmp .Loop_key256
4605
4606.Ldone_key256:
4607 mov $bits,16(%rax) # 240($key)
4608 xor %eax,%eax
4609 jmp .Lenc_key_ret
4610
d608b4d6
AP
4611.align 16
4612.Lbad_keybits:
4613 mov \$-2,%rax
4614.Lenc_key_ret:
23f6eec7
AP
4615 pxor %xmm0,%xmm0
4616 pxor %xmm1,%xmm1
4617 pxor %xmm2,%xmm2
4618 pxor %xmm3,%xmm3
4619 pxor %xmm4,%xmm4
4620 pxor %xmm5,%xmm5
d608b4d6 4621 add \$8,%rsp
b84460ad 4622.cfi_adjust_cfa_offset -8
d608b4d6 4623 ret
b84460ad 4624.cfi_endproc
d608b4d6
AP
4625.LSEH_end_set_encrypt_key:
4626\f
4627.align 16
4628.Lkey_expansion_128:
4629 $movkey %xmm0,(%rax)
4630 lea 16(%rax),%rax
4631.Lkey_expansion_128_cold:
4632 shufps \$0b00010000,%xmm0,%xmm4
f8501464 4633 xorps %xmm4, %xmm0
d608b4d6 4634 shufps \$0b10001100,%xmm0,%xmm4
f8501464
AP
4635 xorps %xmm4, %xmm0
4636 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4637 xorps %xmm1,%xmm0
d608b4d6
AP
4638 ret
4639
4640.align 16
4641.Lkey_expansion_192a:
4642 $movkey %xmm0,(%rax)
4643 lea 16(%rax),%rax
4644.Lkey_expansion_192a_cold:
4645 movaps %xmm2, %xmm5
4646.Lkey_expansion_192b_warm:
4647 shufps \$0b00010000,%xmm0,%xmm4
f8501464
AP
4648 movdqa %xmm2,%xmm3
4649 xorps %xmm4,%xmm0
d608b4d6
AP
4650 shufps \$0b10001100,%xmm0,%xmm4
4651 pslldq \$4,%xmm3
f8501464 4652 xorps %xmm4,%xmm0
d608b4d6
AP
4653 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4654 pxor %xmm3,%xmm2
4655 pxor %xmm1,%xmm0
4656 pshufd \$0b11111111,%xmm0,%xmm3
4657 pxor %xmm3,%xmm2
d64a7232
AP
4658 ret
4659
d608b4d6
AP
4660.align 16
4661.Lkey_expansion_192b:
4662 movaps %xmm0,%xmm3
4663 shufps \$0b01000100,%xmm0,%xmm5
4664 $movkey %xmm5,(%rax)
4665 shufps \$0b01001110,%xmm2,%xmm3
4666 $movkey %xmm3,16(%rax)
4667 lea 32(%rax),%rax
4668 jmp .Lkey_expansion_192b_warm
4669
d64a7232
AP
4670.align 16
4671.Lkey_expansion_256a:
d608b4d6
AP
4672 $movkey %xmm2,(%rax)
4673 lea 16(%rax),%rax
d64a7232
AP
4674.Lkey_expansion_256a_cold:
4675 shufps \$0b00010000,%xmm0,%xmm4
f8501464 4676 xorps %xmm4,%xmm0
d64a7232 4677 shufps \$0b10001100,%xmm0,%xmm4
f8501464
AP
4678 xorps %xmm4,%xmm0
4679 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4680 xorps %xmm1,%xmm0
d64a7232
AP
4681 ret
4682
4683.align 16
4684.Lkey_expansion_256b:
d608b4d6
AP
4685 $movkey %xmm0,(%rax)
4686 lea 16(%rax),%rax
d64a7232
AP
4687
4688 shufps \$0b00010000,%xmm2,%xmm4
f8501464 4689 xorps %xmm4,%xmm2
d64a7232 4690 shufps \$0b10001100,%xmm2,%xmm4
f8501464
AP
4691 xorps %xmm4,%xmm2
4692 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4693 xorps %xmm1,%xmm2
d64a7232 4694 ret
d608b4d6 4695.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
f8501464 4696.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
d64a7232
AP
4697___
4698}
4699\f
4700$code.=<<___;
6c83629b
AP
4701.align 64
4702.Lbswap_mask:
4703 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
d7d119a3 4704.Lincrement32:
f8501464 4705 .long 6,6,6,0
d7d119a3
AP
4706.Lincrement64:
4707 .long 1,0,0,0
f8501464
AP
4708.Lxts_magic:
4709 .long 0x87,0,1,0
9282c335
AP
4710.Lincrement1:
4711 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
23f6eec7
AP
4712.Lkey_rotate:
4713 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4714.Lkey_rotate192:
4715 .long 0x04070605,0x04070605,0x04070605,0x04070605
4716.Lkey_rcon1:
4717 .long 1,1,1,1
4718.Lkey_rcon1b:
4719 .long 0x1b,0x1b,0x1b,0x1b
f8501464 4720
d64a7232
AP
4721.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4722.align 64
4723___
4724
d608b4d6
AP
4725# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4726# CONTEXT *context,DISPATCHER_CONTEXT *disp)
4727if ($win64) {
4728$rec="%rcx";
4729$frame="%rdx";
4730$context="%r8";
4731$disp="%r9";
4732
4733$code.=<<___;
4734.extern __imp_RtlVirtualUnwind
6c83629b
AP
4735___
4736$code.=<<___ if ($PREFIX eq "aesni");
# Win64 exception handler referenced from .LSEH_info_ecb,
# .LSEH_info_ccm64_enc and .LSEH_info_ccm64_dec. HandlerData[] carries
# two image-relative labels delimiting the function body; when
# context->Rip lies between them, four %xmm registers are copied from the
# stack into the context and 0x58 is added to the recorded stack pointer.
69d5747f 4737.type ecb_ccm64_se_handler,\@abi-omnipotent
d7d119a3 4738.align 16
69d5747f 4739ecb_ccm64_se_handler:
d7d119a3
AP
4740 push %rsi
4741 push %rdi
4742 push %rbx
4743 push %rbp
4744 push %r12
4745 push %r13
4746 push %r14
4747 push %r15
4748 pushfq
4749 sub \$64,%rsp
4750
4751 mov 120($context),%rax # pull context->Rax
4752 mov 248($context),%rbx # pull context->Rip
4753
4754 mov 8($disp),%rsi # disp->ImageBase
02f358da 4755 mov 56($disp),%r11 # disp->HandlerData
d7d119a3
AP
4756
4757 mov 0(%r11),%r10d # HandlerData[0]
4758 lea (%rsi,%r10),%r10 # prologue label
4759 cmp %r10,%rbx # context->Rip<prologue label
f8501464 4760 jb .Lcommon_seh_tail
d7d119a3
AP
4761
4762 mov 152($context),%rax # pull context->Rsp
4763
4764 mov 4(%r11),%r10d # HandlerData[1]
4765 lea (%rsi,%r10),%r10 # epilogue label
4766 cmp %r10,%rbx # context->Rip>=epilogue label
f8501464 4767 jae .Lcommon_seh_tail
d7d119a3 4768
f8501464 4769 lea 0(%rax),%rsi # %xmm save area
d7d119a3
AP
4770 lea 512($context),%rdi # &context.Xmm6
4771 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4772 .long 0xa548f3fc # cld; rep movsq
4773 lea 0x58(%rax),%rax # adjust stack pointer
4774
f8501464 4775 jmp .Lcommon_seh_tail
69d5747f 4776.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
d7d119a3 4777
# Win64 exception handler referenced from .LSEH_info_ctr32,
# .LSEH_info_xts_enc and .LSEH_info_xts_dec. HandlerData[] carries the
# image-relative body and epilogue labels; when context->Rip lies between
# them, ten %xmm registers and %rbp are recovered relative to context->R11.
6c79faaa 4778.type ctr_xts_se_handler,\@abi-omnipotent
6c83629b 4779.align 16
6c79faaa 4780ctr_xts_se_handler:
f8501464
AP
4781 push %rsi
4782 push %rdi
4783 push %rbx
4784 push %rbp
4785 push %r12
4786 push %r13
4787 push %r14
4788 push %r15
4789 pushfq
4790 sub \$64,%rsp
4791
4792 mov 120($context),%rax # pull context->Rax
4793 mov 248($context),%rbx # pull context->Rip
4794
4795 mov 8($disp),%rsi # disp->ImageBase
4796 mov 56($disp),%r11 # disp->HandlerData
4797
4798 mov 0(%r11),%r10d # HandlerData[0]
4799 lea (%rsi,%r10),%r10 # prologue label
4800 cmp %r10,%rbx # context->Rip<prologue label
4801 jb .Lcommon_seh_tail
4802
4803 mov 152($context),%rax # pull context->Rsp
4804
4805 mov 4(%r11),%r10d # HandlerData[1]
4806 lea (%rsi,%r10),%r10 # epilogue label
4807 cmp %r10,%rbx # context->Rip>=epilogue label
4808 jae .Lcommon_seh_tail
4809
384e6de4
AP
4810 mov 208($context),%rax # pull context->R11
4811
4812 lea -0xa8(%rax),%rsi # %xmm save area
f8501464
AP
4813 lea 512($context),%rdi # & context.Xmm6
4814 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4815 .long 0xa548f3fc # cld; rep movsq
f8501464 4816
384e6de4
AP
4817 mov -8(%rax),%rbp # restore saved %rbp
4818 mov %rbp,160($context) # restore context->Rbp
4819 jmp .Lcommon_seh_tail
6c79faaa 4820.size ctr_xts_se_handler,.-ctr_xts_se_handler
bd30091c
AP
4821
# Win64 exception handler referenced from .LSEH_info_ocb_enc.
# HandlerData[] carries three image-relative labels: body start, epilogue
# and a "pop" label. Between the body start and the pop label the ten
# %xmm registers are copied from the stack into the context; from the pop
# label up to the epilogue only the general-purpose registers are recovered.
4822.type ocb_se_handler,\@abi-omnipotent
4823.align 16
4824ocb_se_handler:
4825 push %rsi
4826 push %rdi
4827 push %rbx
4828 push %rbp
4829 push %r12
4830 push %r13
4831 push %r14
4832 push %r15
4833 pushfq
4834 sub \$64,%rsp
4835
4836 mov 120($context),%rax # pull context->Rax
4837 mov 248($context),%rbx # pull context->Rip
4838
4839 mov 8($disp),%rsi # disp->ImageBase
4840 mov 56($disp),%r11 # disp->HandlerData
4841
4842 mov 0(%r11),%r10d # HandlerData[0]
4843 lea (%rsi,%r10),%r10 # prologue label
4844 cmp %r10,%rbx # context->Rip<prologue label
4845 jb .Lcommon_seh_tail
4846
4847 mov 4(%r11),%r10d # HandlerData[1]
4848 lea (%rsi,%r10),%r10 # epilogue label
4849 cmp %r10,%rbx # context->Rip>=epilogue label
4850 jae .Lcommon_seh_tail
4851
4852 mov 8(%r11),%r10d # HandlerData[2]
4853 lea (%rsi,%r10),%r10
4854 cmp %r10,%rbx # context->Rip>=pop label
4855 jae .Locb_no_xmm # taken with %rax still equal to context->Rax
4856
4857 mov 152($context),%rax # pull context->Rsp
4858
4859 lea (%rax),%rsi # %xmm save area
4860 lea 512($context),%rdi # & context.Xmm6
4861 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4862 .long 0xa548f3fc # cld; rep movsq
4863 lea 0xa0+0x28(%rax),%rax
4864
4865.Locb_no_xmm:
4866 mov -8(%rax),%rbx
4867 mov -16(%rax),%rbp
4868 mov -24(%rax),%r12
4869 mov -32(%rax),%r13
4870 mov -40(%rax),%r14
4871
4872 mov %rbx,144($context) # restore context->Rbx
4873 mov %rbp,160($context) # restore context->Rbp
4874 mov %r12,216($context) # restore context->R12
4875 mov %r13,224($context) # restore context->R13
4876 mov %r14,232($context) # restore context->R14
4877
4878 jmp .Lcommon_seh_tail
4879.size ocb_se_handler,.-ocb_se_handler
6c83629b
AP
4880___
4881$code.=<<___;
# Win64 exception handler for the cbc_encrypt entry point, followed by
# .Lcommon_seh_tail, the tail that the other *_se_handler routines jump to.
# context->Rip is classified against three labels:
#   below .Lcbc_decrypt_bulk                    - %rax = context->Rsp
#   .Lcbc_decrypt_bulk up to .Lcbc_decrypt_body - %rax = context->Rax
#   .Lcbc_decrypt_body up to .Lcbc_ret          - %xmm registers copied from
#       16 bytes above context->Rsp, %rax = context->R11, %rbp from -8(%rax)
#   .Lcbc_ret and above                         - %rax = context->Rsp
# In every case %rax is what .Lcommon_seh_tail records as context->Rsp.
4882.type cbc_se_handler,\@abi-omnipotent
d608b4d6 4883.align 16
6c83629b 4884cbc_se_handler:
d608b4d6
AP
4885 push %rsi
4886 push %rdi
4887 push %rbx
4888 push %rbp
4889 push %r12
4890 push %r13
4891 push %r14
4892 push %r15
4893 pushfq
4894 sub \$64,%rsp
4895
4896 mov 152($context),%rax # pull context->Rsp
6c83629b
AP
4897 mov 248($context),%rbx # pull context->Rip
4898
23f6eec7 4899 lea .Lcbc_decrypt_bulk(%rip),%r10
6c83629b 4900 cmp %r10,%rbx # context->Rip<"prologue" label
f8501464 4901 jb .Lcommon_seh_tail
6c83629b 4902
384e6de4
AP
4903 mov 120($context),%rax # pull context->Rax
4904
6c83629b
AP
4905 lea .Lcbc_decrypt_body(%rip),%r10
4906 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
384e6de4
AP
4907 jb .Lcommon_seh_tail
4908
4909 mov 152($context),%rax # pull context->Rsp
6c83629b
AP
4910
4911 lea .Lcbc_ret(%rip),%r10
4912 cmp %r10,%rbx # context->Rip>="epilogue" label
f8501464 4913 jae .Lcommon_seh_tail
6c83629b 4914
6a40ebe8 4915 lea 16(%rax),%rsi # %xmm save area
6c83629b 4916 lea 512($context),%rdi # &context.Xmm6
73325b22 4917 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
6c83629b 4918 .long 0xa548f3fc # cld; rep movsq
6a40ebe8 4919
384e6de4 4920 mov 208($context),%rax # pull context->R11
6c83629b 4921
384e6de4
AP
4922 mov -8(%rax),%rbp # restore saved %rbp
4923 mov %rbp,160($context) # restore context->Rbp
f8501464
AP
4924
# Shared tail: on entry %rax is the value to record as context->Rsp, and
# the quadwords at 8(%rax) and 16(%rax) are recorded as Rdi and Rsi.
4925.Lcommon_seh_tail:
d608b4d6
AP
4926 mov 8(%rax),%rdi
4927 mov 16(%rax),%rsi
6c83629b 4928 mov %rax,152($context) # restore context->Rsp
d608b4d6
AP
4929 mov %rsi,168($context) # restore context->Rsi
4930 mov %rdi,176($context) # restore context->Rdi
4931
d608b4d6
AP
4932 mov 40($disp),%rdi # disp->ContextRecord
4933 mov $context,%rsi # context
4934 mov \$154,%ecx # sizeof(CONTEXT) in quadwords, as rep movsq copies 8 bytes per count
4935 .long 0xa548f3fc # cld; rep movsq
4936
4937 mov $disp,%rsi
4938 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4939 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4940 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4941 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4942 mov 40(%rsi),%r10 # disp->ContextRecord
4943 lea 56(%rsi),%r11 # &disp->HandlerData
4944 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4945 mov %r10,32(%rsp) # arg5
4946 mov %r11,40(%rsp) # arg6
4947 mov %r12,48(%rsp) # arg7
4948 mov %rcx,56(%rsp) # arg8, (NULL)
4949 call *__imp_RtlVirtualUnwind(%rip)
4950
4951 mov \$1,%eax # ExceptionContinueSearch
4952 add \$64,%rsp # undo the prologue: release scratch space, then pop in reverse push order
4953 popfq
4954 pop %r15
4955 pop %r14
4956 pop %r13
4957 pop %r12
4958 pop %rbp
4959 pop %rbx
4960 pop %rdi
4961 pop %rsi
4962 ret
4963.size cbc_se_handler,.-cbc_se_handler
4964
4965.section .pdata
4966.align 4
6c83629b
AP
4967___
# .pdata records emitted only when $PREFIX is "aesni".  Each group of
# three .rva lines names a routine's begin label, its end label and the
# .LSEH_info_* label holding its unwind information.
$code.=<<___ if ($PREFIX eq "aesni");
 .rva .LSEH_begin_aesni_ecb_encrypt
 .rva .LSEH_end_aesni_ecb_encrypt
 .rva .LSEH_info_ecb

 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
 .rva .LSEH_info_ccm64_enc

 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
 .rva .LSEH_info_ccm64_dec

 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
 .rva .LSEH_info_ctr32

 .rva .LSEH_begin_aesni_xts_encrypt
 .rva .LSEH_end_aesni_xts_encrypt
 .rva .LSEH_info_xts_enc

 .rva .LSEH_begin_aesni_xts_decrypt
 .rva .LSEH_end_aesni_xts_decrypt
 .rva .LSEH_info_xts_dec

 .rva .LSEH_begin_aesni_ocb_encrypt
 .rva .LSEH_end_aesni_ocb_encrypt
 .rva .LSEH_info_ocb_enc

 .rva .LSEH_begin_aesni_ocb_decrypt
 .rva .LSEH_end_aesni_ocb_decrypt
 .rva .LSEH_info_ocb_dec
___
# .pdata records emitted for every $PREFIX: the CBC routine and the two
# key-setup routines (which use the function symbol itself as the begin
# address and share .LSEH_info_key).  The trailing directives switch to
# the .xdata section that the following heredocs fill in.
$code.=<<___;
 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
 .rva .LSEH_end_${PREFIX}_cbc_encrypt
 .rva .LSEH_info_cbc

 .rva ${PREFIX}_set_decrypt_key
 .rva .LSEH_end_set_decrypt_key
 .rva .LSEH_info_key

 .rva ${PREFIX}_set_encrypt_key
 .rva .LSEH_end_set_encrypt_key
 .rva .LSEH_info_key
.section .xdata
.align 8
___
# .xdata records emitted only when $PREFIX is "aesni".  Every record is a
# 4-byte header, the .rva of the handler routine to call, and the
# HandlerData[] labels that handler is given (the OCB records carry one
# extra label plus a zero .long).
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
 .byte 9,0,0,0
 .rva ecb_ccm64_se_handler
 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
.LSEH_info_ccm64_enc:
 .byte 9,0,0,0
 .rva ecb_ccm64_se_handler
 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
.LSEH_info_ccm64_dec:
 .byte 9,0,0,0
 .rva ecb_ccm64_se_handler
 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
 .byte 9,0,0,0
 .rva ctr_xts_se_handler
 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
 .byte 9,0,0,0
 .rva ctr_xts_se_handler
 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
 .byte 9,0,0,0
 .rva ctr_xts_se_handler
 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
.LSEH_info_ocb_enc:
 .byte 9,0,0,0
 .rva ocb_se_handler
 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
 .rva .Locb_enc_pop
 .long 0
.LSEH_info_ocb_dec:
 .byte 9,0,0,0
 .rva ocb_se_handler
 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
 .rva .Locb_dec_pop
 .long 0
___
# .xdata records emitted for every $PREFIX: the CBC record, which names
# cbc_se_handler and carries no HandlerData[], and the record shared by
# the key-setup routines, which describes a single "sub rsp,8".
$code.=<<___;
.LSEH_info_cbc:
 .byte 9,0,0,0
 .rva cbc_se_handler
.LSEH_info_key:
 .byte 0x01,0x04,0x01,0x00
 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
___
5062}
5063
# Append a REX prefix byte to an opcode list when either register number
# needs one.
#
# Arguments:
#   $opcode - reference to the array of opcode bytes built so far
#   $dst    - register number placed in the ModR/M reg field
#   $src    - register number placed in the ModR/M r/m field
#
# Nothing is appended when both register numbers are below 8.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);    # REX.R
    $rex |= 0x01 if ($src >= 8);    # REX.B
    push @$opcode, $rex | 0x40 if ($rex);
}
5073
# Translate one AES-NI instruction written in AT&T syntax into a ".byte"
# directive carrying its machine encoding.
#
# Argument:
#   $line - instruction text, beginning at the mnemonic
#
# Returns:
#   ".byte\t<comma-separated byte values>" for a recognised form;
#   undef when an aes* mnemonic matched a pattern but is not in that
#   pattern's opcode table;
#   $line unchanged when no pattern matches.
#
# Recognised forms:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
sub aesni {
    my $line   = shift;
    my @opcode = (0x66);    # every encoding below starts with 0x66

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures at once so later pattern matches cannot
        # disturb them.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # A leading 0 marks a hex/octal literal that oct() must convert.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Kept as "return undef" (not a bare return) so the result is a
        # single value in every calling context.
        return undef if (!defined($opcodelet{$mnemonic}));
        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        return undef if (!defined($opcodelet{$mnemonic}));
        push @opcode, 0x44 if ($dst >= 8);    # REX.R for %xmm8 and above
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M
        # Only the low 8 bits of the displacement are emitted.
        push @opcode, ($off =~ /^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
5113
# Build the ".byte" directive that stands in for a movbe instruction.
#
# Argument:
#   the displacement text, appended verbatim as the final byte
#
# Returns the fixed five-byte prefix followed by that displacement.
sub movbe {
    my ($disp) = @_;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24," . $disp;
}
5117
# Post-process the accumulated assembly text, then write it out.

# Evaluate every `...` span as Perl and splice in the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Pass each aes* instruction that has an %xmm operand through aesni();
# whatever follows the last %xmm register on that line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# Replace "movbe %eax,<disp>(%rsp)" with the output of movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors may only surface at close; abort rather than
# leave truncated output behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";