]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-x86_64.pl
Fix some typos
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86_64.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
c918d8e2 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
d64a7232
AP
9#
10# ====================================================================
d8ba0dc9 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
d64a7232
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
d7d119a3
AP
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
f8501464 28# asymptotic limit it's not something you commonly achieve in reality,
d7d119a3
AP
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33# 16-byte 64-byte 256-byte 1-KB 8-KB
34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
609b0852 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
d7d119a3
AP
38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
79c44b4e 62# interleaved with CBC-MAC. This provides ~30% improvement over
46f4e1be 63# "straightforward" CCM implementation with CTR and CBC-MAC performed
d7d119a3
AP
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
02f358da 87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
d64a7232 88
f8501464
AP
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
609b0852 121# instructions' interleave factor. Westmere can execute at most 3
f8501464
AP
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor 3x 6x 8x
132# theoretical asymptotic limit 1.67 0.83 0.625
133# measured performance for 8KB block 1.05 0.86 0.84
134#
135# "as if" interleave factor 4.7x 5.8x 6.0x
136#
137# Further data for other parallelizable modes:
138#
73325b22 139# CBC decrypt 1.16 0.93 0.74
cd54249c 140# CTR 1.14 0.91 0.74
f8501464
AP
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
46f4e1be 146# additional instructions with AES ones, but even AES instructions
f8501464
AP
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
46f4e1be 153# utilizes 6x interleave because of limited register bank capacity.
f8501464
AP
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
36df342f
AP
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
f8501464
AP
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
bd30091c
AP
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
d2e18031 172######################################################################
5599c733
AP
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
bd30091c 176# CBC en-/decrypt CTR XTS ECB OCB
5599c733 177# Westmere 3.77/1.25 1.25 1.25 1.26
bd30091c
AP
178# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
b7f5503f 180# Skylake 2.62/0.63 0.63 0.63 0.63
bd30091c 181# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
64d92d74 182# Knights L 2.54/0.77 0.78 0.85 - 1.50
ace05265 183# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
bd30091c 184# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
54f8f9a1 185# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
5599c733 186#
23f6eec7
AP
187# (*) Atom Silvermont ECB result is suboptimal because of penalties
188# incurred by operations on %xmm8-15. As ECB is not considered
5599c733 189# critical, nothing was done to mitigate the problem.
d8ba0dc9 190
d64a7232
AP
# Script-level configuration: symbol prefix, command-line handling, the
# pipe to the perlasm translator, and the symbolic register names that
# the code generators below interpolate into their assembly templates.
191$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
194
1aa89a7a
RL
195# $output is the last argument if it looks like a file (it has an extension)
196# $flavour is the first argument if it doesn't look like a file
197$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
198$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
d64a7232
AP
199
# Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output
# file name ending in ".asm".
200$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
201
# Take the directory part of $0 and look for x86_64-xlate.pl first next to
# this script, then under ../../perlasm/ relative to it; die if neither
# candidate is an existing file.
202$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
203( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
204( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
205die "can't locate x86_64-xlate.pl";
206
1aa89a7a
RL
# Start the translator under the running perl interpreter ($^X) with
# $flavour and $output as its arguments, and alias STDOUT to that pipe so
# that whatever is later printed to STDOUT is fed to the translator.
207open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
208 or die "can't call $xlate: $!";
46bf83f0 209*STDOUT=*OUT;
d64a7232 210
# NOTE(review): both arms of this conditional are "movups", so $movkey
# does not currently depend on $PREFIX.
8da721ee 211$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
d608b4d6
AP
# Registers carrying the first four integer arguments under the selected
# calling convention.
212@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
213 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
d64a7232
AP
214
# $code accumulates the generated assembly text.
215$code=".text\n";
5599c733 216$code.=".extern OPENSSL_ia32cap_P\n";
d64a7232
AP
217
218$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
d608b4d6 219# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
d64a7232
AP
220$inp="%rdi";
221$out="%rsi";
d64a7232
AP
222$len="%rdx";
223$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
d7d119a3 224$ivp="%r8"; # cbc, ctr, ...
d64a7232
AP
225
226$rnds_="%r10d"; # backup copy for $rounds
227$key_="%r11"; # backup copy for $key
228
229# %xmm register layout
f8501464
AP
230$rndkey0="%xmm0"; $rndkey1="%xmm1";
231$inout0="%xmm2"; $inout1="%xmm3";
232$inout2="%xmm4"; $inout3="%xmm5";
233$inout4="%xmm6"; $inout5="%xmm7";
234$inout6="%xmm8"; $inout7="%xmm9";
235
# The four names below map to the same registers as $inout4..$inout7
# (%xmm6..%xmm9), so they cannot be live together with those.
236$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
237$in0="%xmm8"; $iv="%xmm9";
d64a7232
AP
238\f
239# Inline version of internal aesni_[en|de]crypt1.
240#
241# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
242# cycles which take care of loop variables...
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]])
#
# Appends to $code an inline single-block AES sequence.
#   $p      - instruction suffix; callers in this file pass "enc" or "dec",
#             producing aes${p} / aes${p}last
#   $key    - register pointing at the key schedule; the emitted code
#             advances it, so its value is not preserved
#   $rounds - register with the round count; the emitted code decrements
#             it until it reaches zero
#   $inout  - register holding the block, defaults to $inout0
#   $ivec   - optional register; when given, round key 0 is xored into
#             $ivec and $ivec is then xored into $inout, otherwise round
#             key 0 is xored into $inout directly
# $sn is private to this block and is bumped on every call so that each
# expansion gets its own .Loop_${p}1_$sn label.
243{ my $sn;
d608b4d6 244sub aesni_generate1 {
f8501464 245my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
d64a7232
AP
# new label suffix for this expansion
246++$sn;
247$code.=<<___;
f8501464 248 $movkey ($key),$rndkey0
d64a7232 249 $movkey 16($key),$rndkey1
f8501464
AP
250___
# Exactly one of the next two templates is emitted, depending on whether
# an $ivec register was supplied.
251$code.=<<___ if (defined($ivec));
252 xorps $rndkey0,$ivec
253 lea 32($key),$key
254 xorps $ivec,$inout
255___
256$code.=<<___ if (!defined($ivec));
d608b4d6 257 lea 32($key),$key
f8501464
AP
258 xorps $rndkey0,$inout
259___
# Round loop: one aes${p} per iteration while the next round key is
# fetched; the final round uses aes${p}last after the loop exits.
260$code.=<<___;
d608b4d6 261.Loop_${p}1_$sn:
d7d119a3 262 aes${p} $rndkey1,$inout
d64a7232 263 dec $rounds
d64a7232 264 $movkey ($key),$rndkey1
d64a7232 265 lea 16($key),$key
d608b4d6 266 jnz .Loop_${p}1_$sn # loop body is 16 bytes
d7d119a3 267 aes${p}last $rndkey1,$inout
d64a7232
AP
268___
269}}
d608b4d6 270# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
d64a7232 271#
d608b4d6
AP
# Emits the public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. Inside this scope $inp, $out and $key are rebound to
# the first three argument registers of the selected ABI (@_4args).
# Each routine loads one 16-byte block from $inp, reads the round count
# from offset 240 of the key structure, runs the inline one-block sequence
# from aesni_generate1, stores the result to $out, and zeroes $rndkey0,
# $rndkey1 and $inout0 before returning.
272{ my ($inp,$out,$key) = @_4args;
273
d64a7232
AP
274$code.=<<___;
275.globl ${PREFIX}_encrypt
d608b4d6 276.type ${PREFIX}_encrypt,\@abi-omnipotent
d64a7232
AP
277.align 16
278${PREFIX}_encrypt:
c0e8e500 279.cfi_startproc
f8501464
AP
280 movups ($inp),$inout0 # load input
281 mov 240($key),$rounds # key->rounds
d64a7232 282___
# inline the encrypt round loop on $inout0 (default $inout, no $ivec)
d608b4d6 283 &aesni_generate1("enc",$key,$rounds);
d64a7232 284$code.=<<___;
23f6eec7
AP
285 pxor $rndkey0,$rndkey0 # clear register bank
286 pxor $rndkey1,$rndkey1
d608b4d6 287 movups $inout0,($out) # output
23f6eec7 288 pxor $inout0,$inout0
d64a7232 289 ret
c0e8e500 290.cfi_endproc
d64a7232 291.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
d64a7232 292
d64a7232 293.globl ${PREFIX}_decrypt
d608b4d6 294.type ${PREFIX}_decrypt,\@abi-omnipotent
d64a7232
AP
295.align 16
296${PREFIX}_decrypt:
c0e8e500 297.cfi_startproc
f8501464
AP
298 movups ($inp),$inout0 # load input
299 mov 240($key),$rounds # key->rounds
d64a7232 300___
# inline the decrypt round loop on $inout0 (default $inout, no $ivec)
d608b4d6 301 &aesni_generate1("dec",$key,$rounds);
d64a7232 302$code.=<<___;
23f6eec7
AP
303 pxor $rndkey0,$rndkey0 # clear register bank
304 pxor $rndkey1,$rndkey1
d608b4d6 305 movups $inout0,($out) # output
23f6eec7 306 pxor $inout0,$inout0
d64a7232 307 ret
c0e8e500 308.cfi_endproc
d64a7232
AP
309.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
310___
d608b4d6 311}
d64a7232 312\f
f8501464
AP
313# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
314# factor. Why were 3x subroutines originally used in loops? Even though
315# aes[enc|dec] latency was originally 6, it could be scheduled only
316# every *2nd* cycle. Thus 3x interleave was the one providing optimal
d608b4d6
AP
317# utilization, i.e. when subroutine's throughput is virtually same as
318# of non-interleaved subroutine [for number of input blocks up to 3].
214368ff
AP
319# This is why it originally made no sense to implement 2x subroutine.
320# But times change and it became appropriate to spend extra 192 bytes
321# on 2x subroutine on Atom Silvermont account. For processors that
322# can schedule aes[enc|dec] every cycle optimal interleave factor
323# equals the corresponding instructions' latency. 8x is optimal for
609b0852 324# * Bridge and "super-optimal" for other Intel CPUs...
214368ff
AP
325
# aesni_generate2($dir)
#
# Appends the private routine _aesni_${dir}rypt2 to $code; callers in
# this file pass "enc" or "dec" as $dir. The routine works on the
# file-scope registers $key, $rounds, $inout0 and $inout1 rather than on
# Perl arguments.
#
# Emitted code: round key 0 is xored into both blocks; $rounds is shifted
# left by 4 (a multiple of 16), $key is advanced by 32 plus that amount,
# and %rax (the 64-bit form of $rounds, which is %eax) is negated and
# biased by 16, so ($key,%rax) steps forward through the schedule and the
# loop ends when %rax reaches zero. Each loop iteration applies two
# rounds, alternating $rndkey1 and $rndkey0; the tail applies one more
# round followed by aes${dir}last.
326sub aesni_generate2 {
327my $dir=shift;
328# As already mentioned it takes in $key and $rounds, which are *not*
329# preserved. $inout[0-1] is cipher/clear text...
330$code.=<<___;
331.type _aesni_${dir}rypt2,\@abi-omnipotent
332.align 16
333_aesni_${dir}rypt2:
c0e8e500 334.cfi_startproc
214368ff
AP
335 $movkey ($key),$rndkey0
336 shl \$4,$rounds
337 $movkey 16($key),$rndkey1
338 xorps $rndkey0,$inout0
339 xorps $rndkey0,$inout1
340 $movkey 32($key),$rndkey0
341 lea 32($key,$rounds),$key
342 neg %rax # $rounds
343 add \$16,%rax
344
345.L${dir}_loop2:
346 aes${dir} $rndkey1,$inout0
347 aes${dir} $rndkey1,$inout1
348 $movkey ($key,%rax),$rndkey1
349 add \$32,%rax
350 aes${dir} $rndkey0,$inout0
351 aes${dir} $rndkey0,$inout1
352 $movkey -16($key,%rax),$rndkey0
353 jnz .L${dir}_loop2
354
355 aes${dir} $rndkey1,$inout0
356 aes${dir} $rndkey1,$inout1
357 aes${dir}last $rndkey0,$inout0
358 aes${dir}last $rndkey0,$inout1
359 ret
c0e8e500 360.cfi_endproc
214368ff
AP
361.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
362___
363}
# aesni_generate3($dir)
#
# Appends the private routine _aesni_${dir}rypt3 to $code; callers in
# this file pass "enc" or "dec" as $dir. Same structure as the 2x
# routine, applied to $inout0..$inout2: whitening with round key 0, a
# negative %rax index counting up to zero over the key schedule, two
# rounds per loop iteration, then one more round plus aes${dir}last.
# $key and $rounds are the file-scope registers, not Perl arguments.
d608b4d6 364sub aesni_generate3 {
d64a7232
AP
365my $dir=shift;
366# As already mentioned it takes in $key and $rounds, which are *not*
d608b4d6 367# preserved. $inout[0-2] is cipher/clear text...
d64a7232 368$code.=<<___;
d608b4d6 369.type _aesni_${dir}rypt3,\@abi-omnipotent
d64a7232 370.align 16
d608b4d6 371_aesni_${dir}rypt3:
c0e8e500 372.cfi_startproc
d64a7232 373 $movkey ($key),$rndkey0
d8ba0dc9 374 shl \$4,$rounds
d64a7232 375 $movkey 16($key),$rndkey1
f8501464
AP
376 xorps $rndkey0,$inout0
377 xorps $rndkey0,$inout1
378 xorps $rndkey0,$inout2
d8ba0dc9
AP
379 $movkey 32($key),$rndkey0
380 lea 32($key,$rounds),$key
381 neg %rax # $rounds
382 add \$16,%rax
d608b4d6
AP
383
384.L${dir}_loop3:
385 aes${dir} $rndkey1,$inout0
d608b4d6 386 aes${dir} $rndkey1,$inout1
d608b4d6 387 aes${dir} $rndkey1,$inout2
d8ba0dc9
AP
388 $movkey ($key,%rax),$rndkey1
389 add \$32,%rax
d7d119a3 390 aes${dir} $rndkey0,$inout0
d608b4d6 391 aes${dir} $rndkey0,$inout1
d608b4d6 392 aes${dir} $rndkey0,$inout2
d8ba0dc9 393 $movkey -16($key,%rax),$rndkey0
d608b4d6
AP
394 jnz .L${dir}_loop3
395
396 aes${dir} $rndkey1,$inout0
d608b4d6
AP
397 aes${dir} $rndkey1,$inout1
398 aes${dir} $rndkey1,$inout2
399 aes${dir}last $rndkey0,$inout0
400 aes${dir}last $rndkey0,$inout1
401 aes${dir}last $rndkey0,$inout2
402 ret
c0e8e500 403.cfi_endproc
d608b4d6
AP
404.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
405___
406}
407# 4x interleave is implemented to improve small block performance,
408# most notably [and naturally] 4 block by ~30%. One can argue that one
409# should have implemented 5x as well, but improvement would be <20%,
410# so it's not worth it...
# aesni_generate4($dir)
#
# Appends the private routine _aesni_${dir}rypt4 to $code; callers in
# this file pass "enc" or "dec" as $dir. Same structure as the 2x/3x
# routines, applied to $inout0..$inout3, operating on the file-scope
# $key and $rounds registers.
# The ".byte 0x0f,0x1f,0x00" in the prologue encodes a 3-byte NOP;
# presumably it pads the code so the loop label lands on a preferred
# boundary - TODO confirm.
411sub aesni_generate4 {
412my $dir=shift;
413# As already mentioned it takes in $key and $rounds, which are *not*
414# preserved. $inout[0-3] is cipher/clear text...
415$code.=<<___;
416.type _aesni_${dir}rypt4,\@abi-omnipotent
417.align 16
418_aesni_${dir}rypt4:
c0e8e500 419.cfi_startproc
d608b4d6 420 $movkey ($key),$rndkey0
d8ba0dc9 421 shl \$4,$rounds
d608b4d6 422 $movkey 16($key),$rndkey1
f8501464
AP
423 xorps $rndkey0,$inout0
424 xorps $rndkey0,$inout1
425 xorps $rndkey0,$inout2
426 xorps $rndkey0,$inout3
d8ba0dc9
AP
427 $movkey 32($key),$rndkey0
428 lea 32($key,$rounds),$key
429 neg %rax # $rounds
430 .byte 0x0f,0x1f,0x00
431 add \$16,%rax
d608b4d6
AP
432
433.L${dir}_loop4:
d64a7232 434 aes${dir} $rndkey1,$inout0
d64a7232 435 aes${dir} $rndkey1,$inout1
d64a7232
AP
436 aes${dir} $rndkey1,$inout2
437 aes${dir} $rndkey1,$inout3
d8ba0dc9
AP
438 $movkey ($key,%rax),$rndkey1
439 add \$32,%rax
d7d119a3 440 aes${dir} $rndkey0,$inout0
d64a7232 441 aes${dir} $rndkey0,$inout1
d64a7232
AP
442 aes${dir} $rndkey0,$inout2
443 aes${dir} $rndkey0,$inout3
d8ba0dc9 444 $movkey -16($key,%rax),$rndkey0
d608b4d6
AP
445 jnz .L${dir}_loop4
446
d64a7232 447 aes${dir} $rndkey1,$inout0
d64a7232
AP
448 aes${dir} $rndkey1,$inout1
449 aes${dir} $rndkey1,$inout2
450 aes${dir} $rndkey1,$inout3
d64a7232
AP
451 aes${dir}last $rndkey0,$inout0
452 aes${dir}last $rndkey0,$inout1
453 aes${dir}last $rndkey0,$inout2
454 aes${dir}last $rndkey0,$inout3
d64a7232 455 ret
c0e8e500 456.cfi_endproc
d608b4d6 457.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
d64a7232
AP
458___
459}
f8501464
AP
# aesni_generate6($dir)
#
# Appends the private routine _aesni_${dir}rypt6 to $code; callers in
# this file pass "enc" or "dec" as $dir. It processes $inout0..$inout5
# using the file-scope $key and $rounds registers.
#
# Unlike the 2x-4x routines, the prologue interleaves the round-key-0
# xors with the first aes${dir} round: blocks 0-2 receive that round
# before the loop, and control jumps to .L${dir}_loop6_enter so that the
# loop's first pass applies it only to blocks 3-5. The schedule entry at
# 32 bytes past the original $key is fetched through ($key,%rax) after
# $key has been advanced and %rax negated, before %rax is biased by 16.
460sub aesni_generate6 {
461my $dir=shift;
462# As already mentioned it takes in $key and $rounds, which are *not*
463# preserved. $inout[0-5] is cipher/clear text...
464$code.=<<___;
465.type _aesni_${dir}rypt6,\@abi-omnipotent
466.align 16
467_aesni_${dir}rypt6:
c0e8e500 468.cfi_startproc
f8501464 469 $movkey ($key),$rndkey0
d8ba0dc9 470 shl \$4,$rounds
f8501464 471 $movkey 16($key),$rndkey1
f8501464
AP
472 xorps $rndkey0,$inout0
473 pxor $rndkey0,$inout1
f8501464 474 pxor $rndkey0,$inout2
d8ba0dc9
AP
475 aes${dir} $rndkey1,$inout0
476 lea 32($key,$rounds),$key
477 neg %rax # $rounds
f8501464
AP
478 aes${dir} $rndkey1,$inout1
479 pxor $rndkey0,$inout3
f8501464 480 pxor $rndkey0,$inout4
d8ba0dc9 481 aes${dir} $rndkey1,$inout2
f8501464 482 pxor $rndkey0,$inout5
23f6eec7 483 $movkey ($key,%rax),$rndkey0
d8ba0dc9 484 add \$16,%rax
f8501464
AP
485 jmp .L${dir}_loop6_enter
486.align 16
487.L${dir}_loop6:
488 aes${dir} $rndkey1,$inout0
489 aes${dir} $rndkey1,$inout1
f8501464 490 aes${dir} $rndkey1,$inout2
23f6eec7 491.L${dir}_loop6_enter:
f8501464
AP
492 aes${dir} $rndkey1,$inout3
493 aes${dir} $rndkey1,$inout4
494 aes${dir} $rndkey1,$inout5
d8ba0dc9
AP
495 $movkey ($key,%rax),$rndkey1
496 add \$32,%rax
f8501464
AP
497 aes${dir} $rndkey0,$inout0
498 aes${dir} $rndkey0,$inout1
f8501464
AP
499 aes${dir} $rndkey0,$inout2
500 aes${dir} $rndkey0,$inout3
501 aes${dir} $rndkey0,$inout4
502 aes${dir} $rndkey0,$inout5
d8ba0dc9 503 $movkey -16($key,%rax),$rndkey0
f8501464
AP
504 jnz .L${dir}_loop6
505
506 aes${dir} $rndkey1,$inout0
507 aes${dir} $rndkey1,$inout1
508 aes${dir} $rndkey1,$inout2
509 aes${dir} $rndkey1,$inout3
510 aes${dir} $rndkey1,$inout4
511 aes${dir} $rndkey1,$inout5
512 aes${dir}last $rndkey0,$inout0
513 aes${dir}last $rndkey0,$inout1
514 aes${dir}last $rndkey0,$inout2
515 aes${dir}last $rndkey0,$inout3
516 aes${dir}last $rndkey0,$inout4
517 aes${dir}last $rndkey0,$inout5
518 ret
c0e8e500 519.cfi_endproc
f8501464
AP
520.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
521___
522}
# aesni_generate8($dir)
#
# Appends the private routine _aesni_${dir}rypt8 to $code; callers in
# this file pass "enc" or "dec" as $dir. It processes $inout0..$inout7
# using the file-scope $key and $rounds registers.
#
# The prologue interleaves the round-key-0 xors with the first
# aes${dir} round for blocks 0 and 1 only, then jumps to
# .L${dir}_loop8_inner so the loop's first pass applies that round to
# blocks 2-7. The .L${dir}_loop8_enter label is defined in the loop body
# but is not branched to from within this routine.
523sub aesni_generate8 {
524my $dir=shift;
525# As already mentioned it takes in $key and $rounds, which are *not*
526# preserved. $inout[0-7] is cipher/clear text...
527$code.=<<___;
528.type _aesni_${dir}rypt8,\@abi-omnipotent
529.align 16
530_aesni_${dir}rypt8:
c0e8e500 531.cfi_startproc
f8501464 532 $movkey ($key),$rndkey0
d8ba0dc9 533 shl \$4,$rounds
f8501464 534 $movkey 16($key),$rndkey1
f8501464
AP
535 xorps $rndkey0,$inout0
536 xorps $rndkey0,$inout1
f8501464 537 pxor $rndkey0,$inout2
f8501464 538 pxor $rndkey0,$inout3
f8501464 539 pxor $rndkey0,$inout4
d8ba0dc9
AP
540 lea 32($key,$rounds),$key
541 neg %rax # $rounds
542 aes${dir} $rndkey1,$inout0
f8501464 543 pxor $rndkey0,$inout5
f8501464 544 pxor $rndkey0,$inout6
23f6eec7 545 aes${dir} $rndkey1,$inout1
f8501464 546 pxor $rndkey0,$inout7
23f6eec7
AP
547 $movkey ($key,%rax),$rndkey0
548 add \$16,%rax
549 jmp .L${dir}_loop8_inner
f8501464
AP
550.align 16
551.L${dir}_loop8:
552 aes${dir} $rndkey1,$inout0
553 aes${dir} $rndkey1,$inout1
23f6eec7 554.L${dir}_loop8_inner:
f8501464
AP
555 aes${dir} $rndkey1,$inout2
556 aes${dir} $rndkey1,$inout3
557 aes${dir} $rndkey1,$inout4
558 aes${dir} $rndkey1,$inout5
559 aes${dir} $rndkey1,$inout6
560 aes${dir} $rndkey1,$inout7
d8ba0dc9
AP
561.L${dir}_loop8_enter:
562 $movkey ($key,%rax),$rndkey1
563 add \$32,%rax
f8501464
AP
564 aes${dir} $rndkey0,$inout0
565 aes${dir} $rndkey0,$inout1
f8501464
AP
566 aes${dir} $rndkey0,$inout2
567 aes${dir} $rndkey0,$inout3
568 aes${dir} $rndkey0,$inout4
569 aes${dir} $rndkey0,$inout5
570 aes${dir} $rndkey0,$inout6
571 aes${dir} $rndkey0,$inout7
d8ba0dc9 572 $movkey -16($key,%rax),$rndkey0
f8501464
AP
573 jnz .L${dir}_loop8
574
575 aes${dir} $rndkey1,$inout0
576 aes${dir} $rndkey1,$inout1
577 aes${dir} $rndkey1,$inout2
578 aes${dir} $rndkey1,$inout3
579 aes${dir} $rndkey1,$inout4
580 aes${dir} $rndkey1,$inout5
581 aes${dir} $rndkey1,$inout6
582 aes${dir} $rndkey1,$inout7
583 aes${dir}last $rndkey0,$inout0
584 aes${dir}last $rndkey0,$inout1
585 aes${dir}last $rndkey0,$inout2
586 aes${dir}last $rndkey0,$inout3
587 aes${dir}last $rndkey0,$inout4
588 aes${dir}last $rndkey0,$inout5
589 aes${dir}last $rndkey0,$inout6
590 aes${dir}last $rndkey0,$inout7
591 ret
c0e8e500 592.cfi_endproc
f8501464
AP
593.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
594___
595}
214368ff
AP
# Instantiate the interleaved private routines. The "dec" variants are
# always emitted; the "enc" variants are emitted only when $PREFIX is
# "aesni".
596&aesni_generate2("enc") if ($PREFIX eq "aesni");
597&aesni_generate2("dec");
d608b4d6
AP
598&aesni_generate3("enc") if ($PREFIX eq "aesni");
599&aesni_generate3("dec");
600&aesni_generate4("enc") if ($PREFIX eq "aesni");
601&aesni_generate4("dec");
f8501464
AP
602&aesni_generate6("enc") if ($PREFIX eq "aesni");
603&aesni_generate6("dec");
604&aesni_generate8("enc") if ($PREFIX eq "aesni");
605&aesni_generate8("dec");
d64a7232
AP
606\f
607if ($PREFIX eq "aesni") {
6c83629b 608########################################################################
d64a7232
AP
609# void aesni_ecb_encrypt (const void *in, void *out,
610# size_t length, const AES_KEY *key,
611# int enc);
612$code.=<<___;
613.globl aesni_ecb_encrypt
614.type aesni_ecb_encrypt,\@function,5
615.align 16
616aesni_ecb_encrypt:
c0e8e500 617.cfi_startproc
69d5747f
AP
618___
619$code.=<<___ if ($win64);
620 lea -0x58(%rsp),%rsp
23f6eec7 621 movaps %xmm6,(%rsp) # offload $inout4..7
69d5747f
AP
622 movaps %xmm7,0x10(%rsp)
623 movaps %xmm8,0x20(%rsp)
624 movaps %xmm9,0x30(%rsp)
625.Lecb_enc_body:
626___
627$code.=<<___;
23f6eec7
AP
628 and \$-16,$len # if ($len<16)
629 jz .Lecb_ret # return
f8501464
AP
630
631 mov 240($key),$rounds # key->rounds
632 $movkey ($key),$rndkey0
d64a7232 633 mov $key,$key_ # backup $key
d64a7232 634 mov $rounds,$rnds_ # backup $rounds
d7d119a3 635 test %r8d,%r8d # 5th argument
d64a7232
AP
636 jz .Lecb_decrypt
637#--------------------------- ECB ENCRYPT ------------------------------#
23f6eec7
AP
638 cmp \$0x80,$len # if ($len<8*16)
639 jb .Lecb_enc_tail # short input
f8501464 640
23f6eec7 641 movdqu ($inp),$inout0 # load 8 input blocks
f8501464
AP
642 movdqu 0x10($inp),$inout1
643 movdqu 0x20($inp),$inout2
644 movdqu 0x30($inp),$inout3
645 movdqu 0x40($inp),$inout4
646 movdqu 0x50($inp),$inout5
647 movdqu 0x60($inp),$inout6
648 movdqu 0x70($inp),$inout7
23f6eec7
AP
649 lea 0x80($inp),$inp # $inp+=8*16
650 sub \$0x80,$len # $len-=8*16 (can be zero)
f8501464 651 jmp .Lecb_enc_loop8_enter
d64a7232 652.align 16
f8501464 653.Lecb_enc_loop8:
23f6eec7 654 movups $inout0,($out) # store 8 output blocks
f8501464 655 mov $key_,$key # restore $key
23f6eec7 656 movdqu ($inp),$inout0 # load 8 input blocks
d64a7232 657 mov $rnds_,$rounds # restore $rounds
d7d119a3 658 movups $inout1,0x10($out)
f8501464
AP
659 movdqu 0x10($inp),$inout1
660 movups $inout2,0x20($out)
661 movdqu 0x20($inp),$inout2
662 movups $inout3,0x30($out)
663 movdqu 0x30($inp),$inout3
664 movups $inout4,0x40($out)
665 movdqu 0x40($inp),$inout4
666 movups $inout5,0x50($out)
667 movdqu 0x50($inp),$inout5
668 movups $inout6,0x60($out)
669 movdqu 0x60($inp),$inout6
670 movups $inout7,0x70($out)
23f6eec7 671 lea 0x80($out),$out # $out+=8*16
f8501464 672 movdqu 0x70($inp),$inout7
23f6eec7 673 lea 0x80($inp),$inp # $inp+=8*16
f8501464
AP
674.Lecb_enc_loop8_enter:
675
676 call _aesni_encrypt8
677
678 sub \$0x80,$len
23f6eec7 679 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
f8501464 680
23f6eec7 681 movups $inout0,($out) # store 8 output blocks
d64a7232 682 mov $key_,$key # restore $key
f8501464
AP
683 movups $inout1,0x10($out)
684 mov $rnds_,$rounds # restore $rounds
d7d119a3 685 movups $inout2,0x20($out)
f8501464
AP
686 movups $inout3,0x30($out)
687 movups $inout4,0x40($out)
688 movups $inout5,0x50($out)
689 movups $inout6,0x60($out)
690 movups $inout7,0x70($out)
23f6eec7
AP
691 lea 0x80($out),$out # $out+=8*16
692 add \$0x80,$len # restore real remaining $len
693 jz .Lecb_ret # done if ($len==0)
d64a7232 694
23f6eec7 695.Lecb_enc_tail: # $len is less than 8*16
6c83629b 696 movups ($inp),$inout0
d7d119a3 697 cmp \$0x20,$len
6c83629b 698 jb .Lecb_enc_one
d64a7232
AP
699 movups 0x10($inp),$inout1
700 je .Lecb_enc_two
d64a7232 701 movups 0x20($inp),$inout2
f8501464
AP
702 cmp \$0x40,$len
703 jb .Lecb_enc_three
d64a7232 704 movups 0x30($inp),$inout3
f8501464
AP
705 je .Lecb_enc_four
706 movups 0x40($inp),$inout4
707 cmp \$0x60,$len
708 jb .Lecb_enc_five
709 movups 0x50($inp),$inout5
710 je .Lecb_enc_six
711 movdqu 0x60($inp),$inout6
23f6eec7 712 xorps $inout7,$inout7
f8501464 713 call _aesni_encrypt8
23f6eec7 714 movups $inout0,($out) # store 7 output blocks
d64a7232
AP
715 movups $inout1,0x10($out)
716 movups $inout2,0x20($out)
717 movups $inout3,0x30($out)
f8501464
AP
718 movups $inout4,0x40($out)
719 movups $inout5,0x50($out)
720 movups $inout6,0x60($out)
d64a7232
AP
721 jmp .Lecb_ret
722.align 16
723.Lecb_enc_one:
724___
d608b4d6 725 &aesni_generate1("enc",$key,$rounds);
d64a7232 726$code.=<<___;
23f6eec7 727 movups $inout0,($out) # store one output block
d64a7232
AP
728 jmp .Lecb_ret
729.align 16
730.Lecb_enc_two:
214368ff 731 call _aesni_encrypt2
23f6eec7 732 movups $inout0,($out) # store 2 output blocks
d64a7232
AP
733 movups $inout1,0x10($out)
734 jmp .Lecb_ret
735.align 16
736.Lecb_enc_three:
d608b4d6 737 call _aesni_encrypt3
23f6eec7 738 movups $inout0,($out) # store 3 output blocks
d64a7232
AP
739 movups $inout1,0x10($out)
740 movups $inout2,0x20($out)
741 jmp .Lecb_ret
f8501464
AP
742.align 16
743.Lecb_enc_four:
744 call _aesni_encrypt4
23f6eec7 745 movups $inout0,($out) # store 4 output blocks
f8501464
AP
746 movups $inout1,0x10($out)
747 movups $inout2,0x20($out)
748 movups $inout3,0x30($out)
749 jmp .Lecb_ret
750.align 16
751.Lecb_enc_five:
752 xorps $inout5,$inout5
753 call _aesni_encrypt6
23f6eec7 754 movups $inout0,($out) # store 5 output blocks
f8501464
AP
755 movups $inout1,0x10($out)
756 movups $inout2,0x20($out)
757 movups $inout3,0x30($out)
758 movups $inout4,0x40($out)
759 jmp .Lecb_ret
760.align 16
761.Lecb_enc_six:
762 call _aesni_encrypt6
23f6eec7 763 movups $inout0,($out) # store 6 output blocks
f8501464
AP
764 movups $inout1,0x10($out)
765 movups $inout2,0x20($out)
766 movups $inout3,0x30($out)
767 movups $inout4,0x40($out)
768 movups $inout5,0x50($out)
769 jmp .Lecb_ret
d64a7232
AP
770\f#--------------------------- ECB DECRYPT ------------------------------#
771.align 16
772.Lecb_decrypt:
23f6eec7
AP
773 cmp \$0x80,$len # if ($len<8*16)
774 jb .Lecb_dec_tail # short input
f8501464 775
23f6eec7 776 movdqu ($inp),$inout0 # load 8 input blocks
f8501464
AP
777 movdqu 0x10($inp),$inout1
778 movdqu 0x20($inp),$inout2
779 movdqu 0x30($inp),$inout3
780 movdqu 0x40($inp),$inout4
781 movdqu 0x50($inp),$inout5
782 movdqu 0x60($inp),$inout6
783 movdqu 0x70($inp),$inout7
23f6eec7
AP
784 lea 0x80($inp),$inp # $inp+=8*16
785 sub \$0x80,$len # $len-=8*16 (can be zero)
f8501464 786 jmp .Lecb_dec_loop8_enter
d64a7232 787.align 16
f8501464 788.Lecb_dec_loop8:
23f6eec7 789 movups $inout0,($out) # store 8 output blocks
f8501464 790 mov $key_,$key # restore $key
23f6eec7 791 movdqu ($inp),$inout0 # load 8 input blocks
d64a7232 792 mov $rnds_,$rounds # restore $rounds
d7d119a3 793 movups $inout1,0x10($out)
f8501464
AP
794 movdqu 0x10($inp),$inout1
795 movups $inout2,0x20($out)
796 movdqu 0x20($inp),$inout2
797 movups $inout3,0x30($out)
798 movdqu 0x30($inp),$inout3
799 movups $inout4,0x40($out)
800 movdqu 0x40($inp),$inout4
801 movups $inout5,0x50($out)
802 movdqu 0x50($inp),$inout5
803 movups $inout6,0x60($out)
804 movdqu 0x60($inp),$inout6
805 movups $inout7,0x70($out)
23f6eec7 806 lea 0x80($out),$out # $out+=8*16
f8501464 807 movdqu 0x70($inp),$inout7
23f6eec7 808 lea 0x80($inp),$inp # $inp+=8*16
f8501464
AP
809.Lecb_dec_loop8_enter:
810
811 call _aesni_decrypt8
812
813 $movkey ($key_),$rndkey0
814 sub \$0x80,$len
23f6eec7 815 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
f8501464 816
23f6eec7
AP
817 movups $inout0,($out) # store 8 output blocks
818 pxor $inout0,$inout0 # clear register bank
d64a7232 819 mov $key_,$key # restore $key
f8501464 820 movups $inout1,0x10($out)
23f6eec7 821 pxor $inout1,$inout1
f8501464 822 mov $rnds_,$rounds # restore $rounds
d7d119a3 823 movups $inout2,0x20($out)
23f6eec7 824 pxor $inout2,$inout2
f8501464 825 movups $inout3,0x30($out)
23f6eec7 826 pxor $inout3,$inout3
f8501464 827 movups $inout4,0x40($out)
23f6eec7 828 pxor $inout4,$inout4
f8501464 829 movups $inout5,0x50($out)
23f6eec7 830 pxor $inout5,$inout5
f8501464 831 movups $inout6,0x60($out)
23f6eec7 832 pxor $inout6,$inout6
f8501464 833 movups $inout7,0x70($out)
23f6eec7
AP
834 pxor $inout7,$inout7
835 lea 0x80($out),$out # $out+=8*16
836 add \$0x80,$len # restore real remaining $len
837 jz .Lecb_ret # done if ($len==0)
d64a7232 838
6c83629b 839.Lecb_dec_tail:
6c83629b 840 movups ($inp),$inout0
d7d119a3 841 cmp \$0x20,$len
6c83629b 842 jb .Lecb_dec_one
d64a7232
AP
843 movups 0x10($inp),$inout1
844 je .Lecb_dec_two
d64a7232 845 movups 0x20($inp),$inout2
f8501464
AP
846 cmp \$0x40,$len
847 jb .Lecb_dec_three
d64a7232 848 movups 0x30($inp),$inout3
f8501464
AP
849 je .Lecb_dec_four
850 movups 0x40($inp),$inout4
851 cmp \$0x60,$len
852 jb .Lecb_dec_five
853 movups 0x50($inp),$inout5
854 je .Lecb_dec_six
855 movups 0x60($inp),$inout6
856 $movkey ($key),$rndkey0
23f6eec7 857 xorps $inout7,$inout7
f8501464 858 call _aesni_decrypt8
23f6eec7
AP
859 movups $inout0,($out) # store 7 output blocks
860 pxor $inout0,$inout0 # clear register bank
d64a7232 861 movups $inout1,0x10($out)
23f6eec7 862 pxor $inout1,$inout1
d64a7232 863 movups $inout2,0x20($out)
23f6eec7 864 pxor $inout2,$inout2
d64a7232 865 movups $inout3,0x30($out)
23f6eec7 866 pxor $inout3,$inout3
f8501464 867 movups $inout4,0x40($out)
23f6eec7 868 pxor $inout4,$inout4
f8501464 869 movups $inout5,0x50($out)
23f6eec7 870 pxor $inout5,$inout5
f8501464 871 movups $inout6,0x60($out)
23f6eec7
AP
872 pxor $inout6,$inout6
873 pxor $inout7,$inout7
d64a7232
AP
874 jmp .Lecb_ret
875.align 16
876.Lecb_dec_one:
877___
d608b4d6 878 &aesni_generate1("dec",$key,$rounds);
d64a7232 879$code.=<<___;
23f6eec7
AP
880 movups $inout0,($out) # store one output block
881 pxor $inout0,$inout0 # clear register bank
d64a7232
AP
882 jmp .Lecb_ret
883.align 16
884.Lecb_dec_two:
214368ff 885 call _aesni_decrypt2
23f6eec7
AP
886 movups $inout0,($out) # store 2 output blocks
887 pxor $inout0,$inout0 # clear register bank
d64a7232 888 movups $inout1,0x10($out)
23f6eec7 889 pxor $inout1,$inout1
d64a7232
AP
890 jmp .Lecb_ret
891.align 16
892.Lecb_dec_three:
d608b4d6 893 call _aesni_decrypt3
23f6eec7
AP
894 movups $inout0,($out) # store 3 output blocks
895 pxor $inout0,$inout0 # clear register bank
d64a7232 896 movups $inout1,0x10($out)
23f6eec7 897 pxor $inout1,$inout1
d64a7232 898 movups $inout2,0x20($out)
23f6eec7 899 pxor $inout2,$inout2
f8501464
AP
900 jmp .Lecb_ret
901.align 16
902.Lecb_dec_four:
903 call _aesni_decrypt4
23f6eec7
AP
904 movups $inout0,($out) # store 4 output blocks
905 pxor $inout0,$inout0 # clear register bank
f8501464 906 movups $inout1,0x10($out)
23f6eec7 907 pxor $inout1,$inout1
f8501464 908 movups $inout2,0x20($out)
23f6eec7 909 pxor $inout2,$inout2
f8501464 910 movups $inout3,0x30($out)
23f6eec7 911 pxor $inout3,$inout3
f8501464
AP
912 jmp .Lecb_ret
913.align 16
914.Lecb_dec_five:
915 xorps $inout5,$inout5
916 call _aesni_decrypt6
23f6eec7
AP
917 movups $inout0,($out) # store 5 output blocks
918 pxor $inout0,$inout0 # clear register bank
f8501464 919 movups $inout1,0x10($out)
23f6eec7 920 pxor $inout1,$inout1
f8501464 921 movups $inout2,0x20($out)
23f6eec7 922 pxor $inout2,$inout2
f8501464 923 movups $inout3,0x30($out)
23f6eec7 924 pxor $inout3,$inout3
f8501464 925 movups $inout4,0x40($out)
23f6eec7
AP
926 pxor $inout4,$inout4
927 pxor $inout5,$inout5
f8501464
AP
928 jmp .Lecb_ret
929.align 16
930.Lecb_dec_six:
931 call _aesni_decrypt6
23f6eec7
AP
932 movups $inout0,($out) # store 6 output blocks
933 pxor $inout0,$inout0 # clear register bank
f8501464 934 movups $inout1,0x10($out)
23f6eec7 935 pxor $inout1,$inout1
f8501464 936 movups $inout2,0x20($out)
23f6eec7 937 pxor $inout2,$inout2
f8501464 938 movups $inout3,0x30($out)
23f6eec7 939 pxor $inout3,$inout3
f8501464 940 movups $inout4,0x40($out)
23f6eec7 941 pxor $inout4,$inout4
f8501464 942 movups $inout5,0x50($out)
23f6eec7 943 pxor $inout5,$inout5
d64a7232
AP
944
945.Lecb_ret:
23f6eec7
AP
946 xorps $rndkey0,$rndkey0 # %xmm0
947 pxor $rndkey1,$rndkey1
69d5747f
AP
948___
949$code.=<<___ if ($win64);
950 movaps (%rsp),%xmm6
23f6eec7 951 movaps %xmm0,(%rsp) # clear stack
69d5747f 952 movaps 0x10(%rsp),%xmm7
23f6eec7 953 movaps %xmm0,0x10(%rsp)
69d5747f 954 movaps 0x20(%rsp),%xmm8
23f6eec7 955 movaps %xmm0,0x20(%rsp)
69d5747f 956 movaps 0x30(%rsp),%xmm9
23f6eec7 957 movaps %xmm0,0x30(%rsp)
69d5747f
AP
958 lea 0x58(%rsp),%rsp
959.Lecb_enc_ret:
960___
961$code.=<<___;
d64a7232 962 ret
c0e8e500 963.cfi_endproc
d64a7232
AP
964.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
965___
d7d119a3
AP
966\f
967{
6c83629b 968######################################################################
d7d119a3
AP
969# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
970# size_t blocks, const AES_KEY *key,
971# const char *ivec,char *cmac);
6c83629b 972#
d7d119a3
AP
973# Handles only complete blocks, operates on 64-bit counter and
974# does not update *ivec! Nor does it finalize CMAC value
975# (see engine/eng_aesni.c for details)
976#
977{
# Register aliases shared by the two CCM64 routines emitted below.
978my $cmac="%r9"; # 6th argument
979
d8ba0dc9
AP
# $increment is loaded from .Lincrement64 and added to $iv with paddq
# once per processed block.
980my $increment="%xmm9";
# $iv holds the counter block; it is shuffled once with $bswap_mask on
# entry and a re-shuffled copy is what gets encrypted for each block.
981my $iv="%xmm6";
# $bswap_mask is the pshufb control mask loaded from .Lbswap_mask.
982my $bswap_mask="%xmm7";
d7d119a3
AP
983
# Emit the symbol, type and entry label of aesni_ccm64_encrypt_blocks.
984$code.=<<___;
985.globl aesni_ccm64_encrypt_blocks
986.type aesni_ccm64_encrypt_blocks,\@function,6
987.align 16
988aesni_ccm64_encrypt_blocks:
989___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which the
# body uses as $iv, $bswap_mask, $in0 and $increment.
990$code.=<<___ if ($win64);
991 lea -0x58(%rsp),%rsp
23f6eec7
AP
992 movaps %xmm6,(%rsp) # $iv
993 movaps %xmm7,0x10(%rsp) # $bswap_mask
994 movaps %xmm8,0x20(%rsp) # $in0
995 movaps %xmm9,0x30(%rsp) # $increment
d7d119a3
AP
996.Lccm64_enc_body:
997___
# Main body.  For every input block the counter ($inout0) and the running
# CMAC ($inout1) are encrypted side by side in .Lccm64_enc2_loop: the
# input is xor-ed into the CMAC before encryption and xor-ed with
# E(counter) afterwards to form the output.  When $len reaches zero the
# CMAC is written back to ($cmac) and the working registers are cleared.
998$code.=<<___;
267b481c 999 mov 240($key),$rounds # key->rounds
d7d119a3 1000 movdqu ($ivp),$iv
d7d119a3
AP
1001 movdqa .Lincrement64(%rip),$increment
1002 movdqa .Lbswap_mask(%rip),$bswap_mask
d7d119a3 1003
d8ba0dc9
AP
1004 shl \$4,$rounds
1005 mov \$16,$rnds_
267b481c
AP
1006 lea 0($key),$key_
1007 movdqu ($cmac),$inout1
d7d119a3 1008 movdqa $iv,$inout0
d8ba0dc9 1009 lea 32($key,$rounds),$key # end of key schedule
9ee5916d 1010 pshufb $bswap_mask,$iv
d8ba0dc9 1011 sub %rax,%r10 # twisted $rounds
267b481c
AP
1012 jmp .Lccm64_enc_outer
1013.align 16
d7d119a3 1014.Lccm64_enc_outer:
267b481c 1015 $movkey ($key_),$rndkey0
d8ba0dc9 1016 mov %r10,%rax
267b481c 1017 movups ($inp),$in0 # load inp
d7d119a3 1018
267b481c
AP
1019 xorps $rndkey0,$inout0 # counter
1020 $movkey 16($key_),$rndkey1
1021 xorps $in0,$rndkey0
267b481c 1022 xorps $rndkey0,$inout1 # cmac^=inp
d8ba0dc9 1023 $movkey 32($key_),$rndkey0
f8501464
AP
1024
1025.Lccm64_enc2_loop:
1026 aesenc $rndkey1,$inout0
f8501464 1027 aesenc $rndkey1,$inout1
d8ba0dc9
AP
1028 $movkey ($key,%rax),$rndkey1
1029 add \$32,%rax
f8501464 1030 aesenc $rndkey0,$inout0
f8501464 1031 aesenc $rndkey0,$inout1
d8ba0dc9 1032 $movkey -16($key,%rax),$rndkey0
f8501464
AP
1033 jnz .Lccm64_enc2_loop
1034 aesenc $rndkey1,$inout0
1035 aesenc $rndkey1,$inout1
267b481c 1036 paddq $increment,$iv
23f6eec7 1037 dec $len # $len-- ($len is in blocks)
f8501464
AP
1038 aesenclast $rndkey0,$inout0
1039 aesenclast $rndkey0,$inout1
d7d119a3 1040
d7d119a3 1041 lea 16($inp),$inp
f8501464 1042 xorps $inout0,$in0 # inp ^= E(iv)
d7d119a3 1043 movdqa $iv,$inout0
f8501464 1044 movups $in0,($out) # save output
9ee5916d 1045 pshufb $bswap_mask,$inout0
23f6eec7
AP
1046 lea 16($out),$out # $out+=16
1047 jnz .Lccm64_enc_outer # loop if ($len!=0)
d7d119a3 1048
23f6eec7
AP
1049 pxor $rndkey0,$rndkey0 # clear register bank
1050 pxor $rndkey1,$rndkey1
1051 pxor $inout0,$inout0
1052 movups $inout1,($cmac) # store resulting mac
1053 pxor $inout1,$inout1
1054 pxor $in0,$in0
1055 pxor $iv,$iv
d7d119a3
AP
1056___
# Win64 only: restore %xmm6-%xmm9, overwrite each save slot with %xmm0
# (zeroed just above as $rndkey0) and release the 0x58-byte frame.
1057$code.=<<___ if ($win64);
1058 movaps (%rsp),%xmm6
23f6eec7 1059 movaps %xmm0,(%rsp) # clear stack
d7d119a3 1060 movaps 0x10(%rsp),%xmm7
23f6eec7 1061 movaps %xmm0,0x10(%rsp)
d7d119a3 1062 movaps 0x20(%rsp),%xmm8
23f6eec7 1063 movaps %xmm0,0x20(%rsp)
d7d119a3 1064 movaps 0x30(%rsp),%xmm9
23f6eec7 1065 movaps %xmm0,0x30(%rsp)
d7d119a3
AP
1066 lea 0x58(%rsp),%rsp
1067.Lccm64_enc_ret:
1068___
# Return and record the symbol size.
1069$code.=<<___;
1070 ret
1071.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1072___
1073######################################################################
# Emit the symbol, type and entry label of aesni_ccm64_decrypt_blocks.
1074$code.=<<___;
1075.globl aesni_ccm64_decrypt_blocks
1076.type aesni_ccm64_decrypt_blocks,\@function,6
1077.align 16
1078aesni_ccm64_decrypt_blocks:
1079___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9.  The
# trailing notes name the alias each register serves as; the %xmm8 note
# reads $in0, the same alias the encrypt prologue gives that slot (the
# previous "$in8" named no declared variable and interpolated to nothing).
1080$code.=<<___ if ($win64);
1081 lea -0x58(%rsp),%rsp
23f6eec7
AP
1082 movaps %xmm6,(%rsp) # $iv
1083 movaps %xmm7,0x10(%rsp) # $bswap_mask
1084 movaps %xmm8,0x20(%rsp) # $in0
1085 movaps %xmm9,0x30(%rsp) # $increment
d7d119a3
AP
1086.Lccm64_dec_body:
1087___
# Load key->rounds, the counter block, the incoming CMAC and the two
# constants; keep copies of $rounds/$key in $rnds_/$key_ because the
# single-block helper below is handed $key and $rounds directly.
1088$code.=<<___;
267b481c
AP
1089 mov 240($key),$rounds # key->rounds
1090 movups ($ivp),$iv
d7d119a3
AP
1091 movdqu ($cmac),$inout1
1092 movdqa .Lincrement64(%rip),$increment
1093 movdqa .Lbswap_mask(%rip),$bswap_mask
1094
267b481c 1095 movaps $iv,$inout0
d7d119a3
AP
1096 mov $rounds,$rnds_
1097 mov $key,$key_
267b481c 1098 pshufb $bswap_mask,$iv
d7d119a3
AP
1099___
# Single-block encryption of the first counter block.
# NOTE(review): aesni_generate1 is defined outside this view; with three
# arguments it presumably operates on $inout0 -- confirm.
1100 &aesni_generate1("enc",$key,$rounds);
# Outer loop: xor the input with E(counter), store the output block and
# prepare the next counter.  Unless that was the last block, the next
# counter ($inout0) and the CMAC ($inout1, xor-ed with the block just
# produced) are encrypted side by side in .Lccm64_dec2_loop.
# .Lccm64_dec_break is reached after the last output block is stored.
1101$code.=<<___;
d8ba0dc9
AP
1102 shl \$4,$rnds_
1103 mov \$16,$rounds
f8501464 1104 movups ($inp),$in0 # load inp
267b481c 1105 paddq $increment,$iv
23f6eec7 1106 lea 16($inp),$inp # $inp+=16
d8ba0dc9
AP
1107 sub %r10,%rax # twisted $rounds
1108 lea 32($key_,$rnds_),$key # end of key schedule
1109 mov %rax,%r10
267b481c
AP
1110 jmp .Lccm64_dec_outer
1111.align 16
1112.Lccm64_dec_outer:
1113 xorps $inout0,$in0 # inp ^= E(iv)
1114 movdqa $iv,$inout0
267b481c 1115 movups $in0,($out) # save output
23f6eec7 1116 lea 16($out),$out # $out+=16
9ee5916d 1117 pshufb $bswap_mask,$inout0
d7d119a3 1118
23f6eec7
AP
1119 sub \$1,$len # $len-- ($len is in blocks)
1120 jz .Lccm64_dec_break # if ($len==0) break
d7d119a3 1121
267b481c 1122 $movkey ($key_),$rndkey0
d8ba0dc9 1123 mov %r10,%rax
267b481c 1124 $movkey 16($key_),$rndkey1
f8501464 1125 xorps $rndkey0,$in0
f8501464
AP
1126 xorps $rndkey0,$inout0
1127 xorps $in0,$inout1 # cmac^=out
d8ba0dc9
AP
1128 $movkey 32($key_),$rndkey0
1129 jmp .Lccm64_dec2_loop
1130.align 16
f8501464
AP
1131.Lccm64_dec2_loop:
1132 aesenc $rndkey1,$inout0
f8501464 1133 aesenc $rndkey1,$inout1
d8ba0dc9
AP
1134 $movkey ($key,%rax),$rndkey1
1135 add \$32,%rax
f8501464 1136 aesenc $rndkey0,$inout0
f8501464 1137 aesenc $rndkey0,$inout1
d8ba0dc9 1138 $movkey -16($key,%rax),$rndkey0
f8501464 1139 jnz .Lccm64_dec2_loop
23f6eec7 1140 movups ($inp),$in0 # load input
267b481c 1141 paddq $increment,$iv
f8501464
AP
1142 aesenc $rndkey1,$inout0
1143 aesenc $rndkey1,$inout1
1144 aesenclast $rndkey0,$inout0
267b481c 1145 aesenclast $rndkey0,$inout1
23f6eec7 1146 lea 16($inp),$inp # $inp+=16
d7d119a3
AP
1147 jmp .Lccm64_dec_outer
1148
1149.align 16
1150.Lccm64_dec_break:
267b481c 1151 #xorps $in0,$inout1 # cmac^=out
d8ba0dc9 1152 mov 240($key_),$rounds
d7d119a3 1153___
# Final CMAC update for the last output block.
# NOTE(review): the five-argument form presumably encrypts $inout1 with
# the xor of $in0 folded in (the explicit xorps above is commented out);
# confirm against aesni_generate1, which is outside this view.
267b481c 1154 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Store the resulting CMAC to ($cmac) and clear the working registers.
d7d119a3 1155$code.=<<___;
23f6eec7
AP
1156 pxor $rndkey0,$rndkey0 # clear register bank
1157 pxor $rndkey1,$rndkey1
1158 pxor $inout0,$inout0
1159 movups $inout1,($cmac) # store resulting mac
1160 pxor $inout1,$inout1
1161 pxor $in0,$in0
1162 pxor $iv,$iv
d7d119a3
AP
1163___
# Win64 only: restore %xmm6-%xmm9, overwrite each save slot with %xmm0
# (zeroed just above as $rndkey0) and release the 0x58-byte frame.
1164$code.=<<___ if ($win64);
1165 movaps (%rsp),%xmm6
23f6eec7 1166 movaps %xmm0,(%rsp) # clear stack
d7d119a3 1167 movaps 0x10(%rsp),%xmm7
23f6eec7 1168 movaps %xmm0,0x10(%rsp)
d7d119a3 1169 movaps 0x20(%rsp),%xmm8
23f6eec7 1170 movaps %xmm0,0x20(%rsp)
d7d119a3 1171 movaps 0x30(%rsp),%xmm9
23f6eec7 1172 movaps %xmm0,0x30(%rsp)
d7d119a3
AP
1173 lea 0x58(%rsp),%rsp
1174.Lccm64_dec_ret:
1175___
# Return and record the symbol size.
1176$code.=<<___;
1177 ret
f8501464
AP
1178.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1179___
1180}\f
1181######################################################################
1182# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1183# size_t blocks, const AES_KEY *key,
1184# const char *ivec);
1185#
1186# Handles only complete blocks, operates on 32-bit counter and
6c79faaa 1187# does not update *ivec! (see crypto/modes/ctr128.c for details)
f8501464 1188#
6c79faaa 1189# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
b4a9d5bf 1190# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
6c79faaa
AP
1191# Keywords are full unroll and modulo-schedule counter calculations
1192# with zero-round key xor.
f8501464 1193{
# Input-block registers for the CTR32 routine: %xmm10-%xmm15, lexical to
# this scope.
6c79faaa 1194my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
# $key0 (%ebp) receives the dword at 12($key), the round-0 key LSB; $ctr
# receives the counter dword at 12($ivp).  $ctr is $ivp's register name
# with a "d" suffix -- presumably its 32-bit view; $ivp is defined
# outside this view.
384e6de4 1195my ($key0,$ctr)=("%ebp","${ivp}d");
# 0x80 bytes hold the eight 16-byte counter slots at 0x00..0x70(%rsp);
# Win64 adds 160 bytes, matching the ten 16-byte %xmm6-%xmm15 save slots.
6c79faaa 1196my $frame_size = 0x80 + ($win64?160:0);
f8501464
AP
1197
# Emit the entry of aesni_ctr32_encrypt_blocks.  A request for exactly one
# block takes a short path that needs no stack frame: load the ivec block
# and the input block, fetch key->rounds into %edx, then fall into the
# single-block helper below.  Anything else branches to .Lctr32_bulk.
1198$code.=<<___;
1199.globl aesni_ctr32_encrypt_blocks
1200.type aesni_ctr32_encrypt_blocks,\@function,5
1201.align 16
1202aesni_ctr32_encrypt_blocks:
b84460ad 1203.cfi_startproc
23f6eec7
AP
1204 cmp \$1,$len
1205 jne .Lctr32_bulk
1206
1207 # handle single block without allocating stack frame,
1208 # useful when handling edges
1209 movups ($ivp),$inout0
1210 movups ($inp),$inout1
1211 mov 240($key),%edx # key->rounds
1212___
# Single-block encryption; the rounds register is passed literally as
# "%edx" because that is where key->rounds was just loaded.
# NOTE(review): aesni_generate1 is defined outside this view; it
# presumably encrypts $inout0 -- confirm.
1213 &aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (xor with input, store, wipe registers,
# jump to .Lctr32_epilogue), then open the bulk path: $key_ becomes the
# frame pointer, %rbp is pushed, $frame_size bytes are reserved and %rsp
# is aligned down to 16 bytes.
1214$code.=<<___;
1215 pxor $rndkey0,$rndkey0 # clear register bank
1216 pxor $rndkey1,$rndkey1
1217 xorps $inout1,$inout0
1218 pxor $inout1,$inout1
1219 movups $inout0,($out)
1220 xorps $inout0,$inout0
1221 jmp .Lctr32_epilogue
1222
1223.align 16
1224.Lctr32_bulk:
384e6de4 1225 lea (%rsp),$key_ # use $key_ as frame pointer
b84460ad 1226.cfi_def_cfa_register $key_
6c79faaa 1227 push %rbp
b84460ad 1228.cfi_push %rbp
6c79faaa
AP
1229 sub \$$frame_size,%rsp
1230 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
f8501464
AP
1231___
# Win64 only: save %xmm6-%xmm15 in ten 16-byte slots addressed at
# negative offsets from the frame pointer.
1232$code.=<<___ if ($win64);
384e6de4
AP
1233 movaps %xmm6,-0xa8($key_) # offload everything
1234 movaps %xmm7,-0x98($key_)
1235 movaps %xmm8,-0x88($key_)
1236 movaps %xmm9,-0x78($key_)
1237 movaps %xmm10,-0x68($key_)
1238 movaps %xmm11,-0x58($key_)
1239 movaps %xmm12,-0x48($key_)
1240 movaps %xmm13,-0x38($key_)
1241 movaps %xmm14,-0x28($key_)
1242 movaps %xmm15,-0x18($key_)
f8501464
AP
1243.Lctr32_body:
1244___
# Bulk-path setup and the first part of the main loops.  This chunk:
#  - fills the eight stack slots with counter blocks already xor-ed with
#    the round-0 key (counter+0 .. counter+7 in the last dword);
#  - sends inputs shorter than 8 blocks to .Lctr32_tail;
#  - selects .Lctr32_6x when the capability word shows MOVBE without
#    XSAVE, otherwise the 8-block loop;
#  - emits the complete 6-block loop and the opening aesenc round of
#    .Lctr32_loop8 (the following rounds are generated by the Perl loop
#    that comes next in the file).
1245$code.=<<___;
6c79faaa 1246
23f6eec7
AP
1247 # 8 16-byte words on top of stack are counter values
1248 # xor-ed with zero-round key
f8501464 1249
6c79faaa
AP
1250 movdqu ($ivp),$inout0
1251 movdqu ($key),$rndkey0
1252 mov 12($ivp),$ctr # counter LSB
1253 pxor $rndkey0,$inout0
1254 mov 12($key),$key0 # 0-round key LSB
1255 movdqa $inout0,0x00(%rsp) # populate counter block
1256 bswap $ctr
b4a9d5bf
AP
1257 movdqa $inout0,$inout1
1258 movdqa $inout0,$inout2
1259 movdqa $inout0,$inout3
6c79faaa
AP
1260 movdqa $inout0,0x40(%rsp)
1261 movdqa $inout0,0x50(%rsp)
1262 movdqa $inout0,0x60(%rsp)
23f6eec7 1263 mov %rdx,%r10 # about to borrow %rdx
6c79faaa
AP
1264 movdqa $inout0,0x70(%rsp)
1265
d8ba0dc9
AP
1266 lea 1($ctr),%rax
1267 lea 2($ctr),%rdx
1268 bswap %eax
1269 bswap %edx
1270 xor $key0,%eax
1271 xor $key0,%edx
1272 pinsrd \$3,%eax,$inout1
1273 lea 3($ctr),%rax
b4a9d5bf 1274 movdqa $inout1,0x10(%rsp)
d8ba0dc9
AP
1275 pinsrd \$3,%edx,$inout2
1276 bswap %eax
1277 mov %r10,%rdx # restore %rdx
6c79faaa 1278 lea 4($ctr),%r10
b4a9d5bf 1279 movdqa $inout2,0x20(%rsp)
d8ba0dc9 1280 xor $key0,%eax
6c79faaa 1281 bswap %r10d
d8ba0dc9 1282 pinsrd \$3,%eax,$inout3
6c79faaa 1283 xor $key0,%r10d
b4a9d5bf 1284 movdqa $inout3,0x30(%rsp)
6c79faaa
AP
1285 lea 5($ctr),%r9
1286 mov %r10d,0x40+12(%rsp)
1287 bswap %r9d
1288 lea 6($ctr),%r10
d8ba0dc9 1289 mov 240($key),$rounds # key->rounds
6c79faaa
AP
1290 xor $key0,%r9d
1291 bswap %r10d
1292 mov %r9d,0x50+12(%rsp)
1293 xor $key0,%r10d
1294 lea 7($ctr),%r9
1295 mov %r10d,0x60+12(%rsp)
1296 bswap %r9d
609b0852 1297 mov OPENSSL_ia32cap_P+4(%rip),%r10d
6c79faaa 1298 xor $key0,%r9d
5599c733 1299 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
6c79faaa
AP
1300 mov %r9d,0x70+12(%rsp)
1301
1302 $movkey 0x10($key),$rndkey1
1303
6c79faaa
AP
1304 movdqa 0x40(%rsp),$inout4
1305 movdqa 0x50(%rsp),$inout5
9282c335 1306
23f6eec7
AP
1307 cmp \$8,$len # $len is in blocks
1308 jb .Lctr32_tail # short input if ($len<8)
9282c335 1309
23f6eec7 1310 sub \$6,$len # $len is biased by -6
5599c733 1311 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
23f6eec7 1312 je .Lctr32_6x # [which denotes Atom Silvermont]
5599c733 1313
6c79faaa 1314 lea 0x80($key),$key # size optimization
23f6eec7 1315 sub \$2,$len # $len is biased by -8
9282c335 1316 jmp .Lctr32_loop8
f8501464 1317
5599c733
AP
1318.align 16
1319.Lctr32_6x:
1320 shl \$4,$rounds
1321 mov \$48,$rnds_
1322 bswap $key0
1323 lea 32($key,$rounds),$key # end of key schedule
1324 sub %rax,%r10 # twisted $rounds
1325 jmp .Lctr32_loop6
1326
1327.align 16
1328.Lctr32_loop6:
23f6eec7 1329 add \$6,$ctr # next counter value
5599c733
AP
1330 $movkey -48($key,$rnds_),$rndkey0
1331 aesenc $rndkey1,$inout0
1332 mov $ctr,%eax
1333 xor $key0,%eax
1334 aesenc $rndkey1,$inout1
23f6eec7 1335 movbe %eax,`0x00+12`(%rsp) # store next counter value
5599c733
AP
1336 lea 1($ctr),%eax
1337 aesenc $rndkey1,$inout2
1338 xor $key0,%eax
1339 movbe %eax,`0x10+12`(%rsp)
1340 aesenc $rndkey1,$inout3
1341 lea 2($ctr),%eax
1342 xor $key0,%eax
1343 aesenc $rndkey1,$inout4
1344 movbe %eax,`0x20+12`(%rsp)
1345 lea 3($ctr),%eax
1346 aesenc $rndkey1,$inout5
1347 $movkey -32($key,$rnds_),$rndkey1
1348 xor $key0,%eax
1349
1350 aesenc $rndkey0,$inout0
1351 movbe %eax,`0x30+12`(%rsp)
1352 lea 4($ctr),%eax
1353 aesenc $rndkey0,$inout1
1354 xor $key0,%eax
1355 movbe %eax,`0x40+12`(%rsp)
1356 aesenc $rndkey0,$inout2
1357 lea 5($ctr),%eax
1358 xor $key0,%eax
1359 aesenc $rndkey0,$inout3
1360 movbe %eax,`0x50+12`(%rsp)
1361 mov %r10,%rax # mov $rnds_,$rounds
1362 aesenc $rndkey0,$inout4
1363 aesenc $rndkey0,$inout5
1364 $movkey -16($key,$rnds_),$rndkey0
1365
1366 call .Lenc_loop6
1367
23f6eec7 1368 movdqu ($inp),$inout6 # load 6 input blocks
5599c733
AP
1369 movdqu 0x10($inp),$inout7
1370 movdqu 0x20($inp),$in0
1371 movdqu 0x30($inp),$in1
1372 movdqu 0x40($inp),$in2
1373 movdqu 0x50($inp),$in3
23f6eec7 1374 lea 0x60($inp),$inp # $inp+=6*16
5599c733 1375 $movkey -64($key,$rnds_),$rndkey1
23f6eec7
AP
1376 pxor $inout0,$inout6 # inp^=E(ctr)
1377 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
5599c733
AP
1378 pxor $inout1,$inout7
1379 movaps 0x10(%rsp),$inout1
1380 pxor $inout2,$in0
1381 movaps 0x20(%rsp),$inout2
1382 pxor $inout3,$in1
1383 movaps 0x30(%rsp),$inout3
1384 pxor $inout4,$in2
1385 movaps 0x40(%rsp),$inout4
1386 pxor $inout5,$in3
1387 movaps 0x50(%rsp),$inout5
23f6eec7 1388 movdqu $inout6,($out) # store 6 output blocks
5599c733
AP
1389 movdqu $inout7,0x10($out)
1390 movdqu $in0,0x20($out)
1391 movdqu $in1,0x30($out)
1392 movdqu $in2,0x40($out)
1393 movdqu $in3,0x50($out)
23f6eec7
AP
1394 lea 0x60($out),$out # $out+=6*16
1395
5599c733 1396 sub \$6,$len
23f6eec7 1397 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
5599c733 1398
23f6eec7
AP
1399 add \$6,$len # restore real remaining $len
1400 jz .Lctr32_done # done if ($len==0)
5599c733
AP
1401
1402 lea -48($rnds_),$rounds
1403 lea -80($key,$rnds_),$key # restore $key
1404 neg $rounds
1405 shr \$4,$rounds # restore $rounds
1406 jmp .Lctr32_tail
1407
6c79faaa 1408.align 32
9282c335 1409.Lctr32_loop8:
23f6eec7 1410 add \$8,$ctr # next counter value
6c79faaa
AP
1411 movdqa 0x60(%rsp),$inout6
1412 aesenc $rndkey1,$inout0
1413 mov $ctr,%r9d
1414 movdqa 0x70(%rsp),$inout7
1415 aesenc $rndkey1,$inout1
1416 bswap %r9d
1417 $movkey 0x20-0x80($key),$rndkey0
1418 aesenc $rndkey1,$inout2
1419 xor $key0,%r9d
d8ba0dc9 1420 nop
6c79faaa 1421 aesenc $rndkey1,$inout3
23f6eec7 1422 mov %r9d,0x00+12(%rsp) # store next counter value
6c79faaa
AP
1423 lea 1($ctr),%r9
1424 aesenc $rndkey1,$inout4
1425 aesenc $rndkey1,$inout5
1426 aesenc $rndkey1,$inout6
1427 aesenc $rndkey1,$inout7
1428 $movkey 0x30-0x80($key),$rndkey1
1429___
# Generate six more aesenc rounds of .Lctr32_loop8, one per $i in 2..7.
# Each round also finishes one next-iteration counter: the value left in
# %r9 by the previous round is byte-swapped, xor-ed with $key0 and stored
# into stack slot $i-1, then %r9 is re-seeded with counter+$i.
# NOTE(review): the backtick expressions are presumably evaluated by a
# substitution pass outside this view -- confirm.
1430for($i=2;$i<8;$i++) {
# The round-key register alternates: $rndkey1 for odd $i, $rndkey0 for
# even $i.  After its eight aesenc uses it is reloaded with a later key.
1431my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1432$code.=<<___;
d8ba0dc9 1433 bswap %r9d
6c79faaa
AP
1434 aesenc $rndkeyx,$inout0
1435 aesenc $rndkeyx,$inout1
6c79faaa 1436 xor $key0,%r9d
d8ba0dc9
AP
1437 .byte 0x66,0x90
1438 aesenc $rndkeyx,$inout2
6c79faaa
AP
1439 aesenc $rndkeyx,$inout3
1440 mov %r9d,`0x10*($i-1)`+12(%rsp)
1441 lea $i($ctr),%r9
1442 aesenc $rndkeyx,$inout4
1443 aesenc $rndkeyx,$inout5
1444 aesenc $rndkeyx,$inout6
1445 aesenc $rndkeyx,$inout7
1446 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1447___
1448}
# Remainder of the 8-block loop, the short-input tails and the register
# wipe.  This chunk:
#  - stores the last next-iteration counter (slot 0x70) and emits the
#    round after the generated ones;
#  - compares $rounds with 11: below 11 jumps straight to
#    .Lctr32_enc_done, equal to 11 jumps after two more rounds, anything
#    larger runs four more rounds first;
#  - at .Lctr32_enc_done xors each input block with the last round key so
#    that aesenclast produces the output directly, reloads the next
#    counters from the stack, stores 8 blocks and loops while $len allows;
#  - at .Lctr32_tail handles 1..7 remaining blocks via .Lctr32_loop3,
#    .Lctr32_loop4 or a call to .Lenc_loop8_enter;
#  - at .Lctr32_done starts clearing %xmm0-%xmm5 and $key0.
1449$code.=<<___;
d8ba0dc9 1450 bswap %r9d
6c79faaa
AP
1451 aesenc $rndkey0,$inout0
1452 aesenc $rndkey0,$inout1
6c79faaa
AP
1453 aesenc $rndkey0,$inout2
1454 xor $key0,%r9d
23f6eec7 1455 movdqu 0x00($inp),$in0 # start loading input
6c79faaa
AP
1456 aesenc $rndkey0,$inout3
1457 mov %r9d,0x70+12(%rsp)
d8ba0dc9 1458 cmp \$11,$rounds
6c79faaa
AP
1459 aesenc $rndkey0,$inout4
1460 aesenc $rndkey0,$inout5
1461 aesenc $rndkey0,$inout6
6c79faaa
AP
1462 aesenc $rndkey0,$inout7
1463 $movkey 0xa0-0x80($key),$rndkey0
1464
6c79faaa
AP
1465 jb .Lctr32_enc_done
1466
1467 aesenc $rndkey1,$inout0
1468 aesenc $rndkey1,$inout1
1469 aesenc $rndkey1,$inout2
1470 aesenc $rndkey1,$inout3
1471 aesenc $rndkey1,$inout4
1472 aesenc $rndkey1,$inout5
1473 aesenc $rndkey1,$inout6
1474 aesenc $rndkey1,$inout7
1475 $movkey 0xb0-0x80($key),$rndkey1
1bc4d009
AP
1476
1477 aesenc $rndkey0,$inout0
1478 aesenc $rndkey0,$inout1
1bc4d009 1479 aesenc $rndkey0,$inout2
1bc4d009 1480 aesenc $rndkey0,$inout3
1bc4d009 1481 aesenc $rndkey0,$inout4
1bc4d009 1482 aesenc $rndkey0,$inout5
1bc4d009 1483 aesenc $rndkey0,$inout6
1bc4d009 1484 aesenc $rndkey0,$inout7
6c79faaa
AP
1485 $movkey 0xc0-0x80($key),$rndkey0
1486 je .Lctr32_enc_done
9282c335 1487
1bc4d009
AP
1488 aesenc $rndkey1,$inout0
1489 aesenc $rndkey1,$inout1
1bc4d009
AP
1490 aesenc $rndkey1,$inout2
1491 aesenc $rndkey1,$inout3
1492 aesenc $rndkey1,$inout4
1493 aesenc $rndkey1,$inout5
1494 aesenc $rndkey1,$inout6
1495 aesenc $rndkey1,$inout7
6c79faaa 1496 $movkey 0xd0-0x80($key),$rndkey1
9282c335 1497
1bc4d009
AP
1498 aesenc $rndkey0,$inout0
1499 aesenc $rndkey0,$inout1
1bc4d009
AP
1500 aesenc $rndkey0,$inout2
1501 aesenc $rndkey0,$inout3
1502 aesenc $rndkey0,$inout4
1503 aesenc $rndkey0,$inout5
1504 aesenc $rndkey0,$inout6
1505 aesenc $rndkey0,$inout7
6c79faaa 1506 $movkey 0xe0-0x80($key),$rndkey0
d8ba0dc9 1507 jmp .Lctr32_enc_done
1bc4d009 1508
d8ba0dc9 1509.align 16
6c79faaa 1510.Lctr32_enc_done:
6c79faaa 1511 movdqu 0x10($inp),$in1
23f6eec7 1512 pxor $rndkey0,$in0 # input^=round[last]
6c79faaa 1513 movdqu 0x20($inp),$in2
1bc4d009 1514 pxor $rndkey0,$in1
6c79faaa 1515 movdqu 0x30($inp),$in3
1bc4d009 1516 pxor $rndkey0,$in2
6c79faaa 1517 movdqu 0x40($inp),$in4
1bc4d009 1518 pxor $rndkey0,$in3
6c79faaa
AP
1519 movdqu 0x50($inp),$in5
1520 pxor $rndkey0,$in4
6c79faaa 1521 pxor $rndkey0,$in5
d8ba0dc9 1522 aesenc $rndkey1,$inout0
cd54249c
AP
1523 aesenc $rndkey1,$inout1
1524 aesenc $rndkey1,$inout2
1525 aesenc $rndkey1,$inout3
1526 aesenc $rndkey1,$inout4
1527 aesenc $rndkey1,$inout5
1bc4d009
AP
1528 aesenc $rndkey1,$inout6
1529 aesenc $rndkey1,$inout7
23f6eec7
AP
1530 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1531 lea 0x80($inp),$inp # $inp+=8*16
6c79faaa 1532
23f6eec7
AP
1533 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1534 pxor $rndkey0,$rndkey1 # borrowed $rndkey
d8ba0dc9 1535 movdqu 0x70-0x80($inp),$in0
1bc4d009 1536 aesenclast $in1,$inout1
1bc4d009 1537 pxor $rndkey0,$in0
6c79faaa 1538 movdqa 0x00(%rsp),$in1 # load next counter block
1bc4d009 1539 aesenclast $in2,$inout2
1bc4d009 1540 aesenclast $in3,$inout3
d8ba0dc9 1541 movdqa 0x10(%rsp),$in2
6c79faaa
AP
1542 movdqa 0x20(%rsp),$in3
1543 aesenclast $in4,$inout4
6c79faaa 1544 aesenclast $in5,$inout5
d8ba0dc9 1545 movdqa 0x30(%rsp),$in4
6c79faaa
AP
1546 movdqa 0x40(%rsp),$in5
1547 aesenclast $rndkey1,$inout6
1548 movdqa 0x50(%rsp),$rndkey0
23f6eec7 1549 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
d8ba0dc9 1550 aesenclast $in0,$inout7
1bc4d009 1551
23f6eec7 1552 movups $inout0,($out) # store 8 output blocks
6c79faaa 1553 movdqa $in1,$inout0
9282c335 1554 movups $inout1,0x10($out)
6c79faaa 1555 movdqa $in2,$inout1
9282c335 1556 movups $inout2,0x20($out)
6c79faaa 1557 movdqa $in3,$inout2
9282c335 1558 movups $inout3,0x30($out)
6c79faaa 1559 movdqa $in4,$inout3
9282c335 1560 movups $inout4,0x40($out)
6c79faaa 1561 movdqa $in5,$inout4
9282c335 1562 movups $inout5,0x50($out)
1bc4d009 1563 movdqa $rndkey0,$inout5
9282c335
AP
1564 movups $inout6,0x60($out)
1565 movups $inout7,0x70($out)
23f6eec7
AP
1566 lea 0x80($out),$out # $out+=8*16
1567
9282c335 1568 sub \$8,$len
23f6eec7 1569 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
f8501464 1570
46f4e1be 1571 add \$8,$len # restore real remaining $len
23f6eec7 1572 jz .Lctr32_done # done if ($len==0)
6c79faaa 1573 lea -0x80($key),$key
f8501464
AP
1574
1575.Lctr32_tail:
23f6eec7 1576 # note that at this point $inout0..5 are populated with
609b0852 1577 # counter values xor-ed with 0-round key
6c79faaa 1578 lea 16($key),$key
f8501464 1579 cmp \$4,$len
b4a9d5bf
AP
1580 jb .Lctr32_loop3
1581 je .Lctr32_loop4
f8501464 1582
23f6eec7 1583 # if ($len>4) compute 7 E(counter)
d8ba0dc9 1584 shl \$4,$rounds
6c79faaa 1585 movdqa 0x60(%rsp),$inout6
b4a9d5bf 1586 pxor $inout7,$inout7
f8501464 1587
6c79faaa
AP
1588 $movkey 16($key),$rndkey0
1589 aesenc $rndkey1,$inout0
6c79faaa 1590 aesenc $rndkey1,$inout1
23f6eec7 1591 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
d8ba0dc9 1592 neg %rax
6c79faaa 1593 aesenc $rndkey1,$inout2
23f6eec7 1594 add \$16,%rax # prepare for .Lenc_loop8_enter
b4a9d5bf 1595 movups ($inp),$in0
d8ba0dc9 1596 aesenc $rndkey1,$inout3
6c79faaa 1597 aesenc $rndkey1,$inout4
23f6eec7 1598 movups 0x10($inp),$in1 # pre-load input
b4a9d5bf 1599 movups 0x20($inp),$in2
d8ba0dc9 1600 aesenc $rndkey1,$inout5
6c79faaa 1601 aesenc $rndkey1,$inout6
f8501464 1602
6c79faaa 1603 call .Lenc_loop8_enter
f8501464 1604
73325b22
AP
1605 movdqu 0x30($inp),$in3
1606 pxor $in0,$inout0
1607 movdqu 0x40($inp),$in0
1608 pxor $in1,$inout1
23f6eec7 1609 movdqu $inout0,($out) # store output
73325b22
AP
1610 pxor $in2,$inout2
1611 movdqu $inout1,0x10($out)
1612 pxor $in3,$inout3
1613 movdqu $inout2,0x20($out)
1614 pxor $in0,$inout4
1615 movdqu $inout3,0x30($out)
1616 movdqu $inout4,0x40($out)
6c79faaa 1617 cmp \$6,$len
23f6eec7 1618 jb .Lctr32_done # $len was 5, stop store
9282c335 1619
6c79faaa
AP
1620 movups 0x50($inp),$in1
1621 xorps $in1,$inout5
1622 movups $inout5,0x50($out)
23f6eec7 1623 je .Lctr32_done # $len was 6, stop store
9282c335 1624
6c79faaa
AP
1625 movups 0x60($inp),$in2
1626 xorps $in2,$inout6
1627 movups $inout6,0x60($out)
23f6eec7 1628 jmp .Lctr32_done # $len was 7, stop store
f8501464 1629
6c79faaa
AP
1630.align 32
1631.Lctr32_loop4:
1632 aesenc $rndkey1,$inout0
1633 lea 16($key),$key
d8ba0dc9 1634 dec $rounds
6c79faaa
AP
1635 aesenc $rndkey1,$inout1
1636 aesenc $rndkey1,$inout2
1637 aesenc $rndkey1,$inout3
1638 $movkey ($key),$rndkey1
6c79faaa
AP
1639 jnz .Lctr32_loop4
1640 aesenclast $rndkey1,$inout0
1641 aesenclast $rndkey1,$inout1
23f6eec7 1642 movups ($inp),$in0 # load input
b4a9d5bf 1643 movups 0x10($inp),$in1
6c79faaa
AP
1644 aesenclast $rndkey1,$inout2
1645 aesenclast $rndkey1,$inout3
d8ba0dc9 1646 movups 0x20($inp),$in2
b4a9d5bf
AP
1647 movups 0x30($inp),$in3
1648
1649 xorps $in0,$inout0
23f6eec7 1650 movups $inout0,($out) # store output
b4a9d5bf
AP
1651 xorps $in1,$inout1
1652 movups $inout1,0x10($out)
73325b22
AP
1653 pxor $in2,$inout2
1654 movdqu $inout2,0x20($out)
1655 pxor $in3,$inout3
1656 movdqu $inout3,0x30($out)
23f6eec7 1657 jmp .Lctr32_done # $len was 4, stop store
b4a9d5bf
AP
1658
1659.align 32
1660.Lctr32_loop3:
1661 aesenc $rndkey1,$inout0
1662 lea 16($key),$key
d8ba0dc9 1663 dec $rounds
b4a9d5bf
AP
1664 aesenc $rndkey1,$inout1
1665 aesenc $rndkey1,$inout2
1666 $movkey ($key),$rndkey1
b4a9d5bf
AP
1667 jnz .Lctr32_loop3
1668 aesenclast $rndkey1,$inout0
1669 aesenclast $rndkey1,$inout1
1670 aesenclast $rndkey1,$inout2
6c79faaa 1671
23f6eec7 1672 movups ($inp),$in0 # load input
9282c335 1673 xorps $in0,$inout0
23f6eec7 1674 movups $inout0,($out) # store output
6c79faaa 1675 cmp \$2,$len
23f6eec7 1676 jb .Lctr32_done # $len was 1, stop store
f8501464 1677
6c79faaa 1678 movups 0x10($inp),$in1
9282c335 1679 xorps $in1,$inout1
9282c335 1680 movups $inout1,0x10($out)
23f6eec7 1681 je .Lctr32_done # $len was 2, stop store
f8501464 1682
6c79faaa 1683 movups 0x20($inp),$in2
9282c335 1684 xorps $in2,$inout2
23f6eec7 1685 movups $inout2,0x20($out) # $len was 3, stop store
9282c335 1686
f8501464 1687.Lctr32_done:
46f4e1be 1688 xorps %xmm0,%xmm0 # clear register bank
23f6eec7
AP
1689 xor $key0,$key0
1690 pxor %xmm1,%xmm1
1691 pxor %xmm2,%xmm2
1692 pxor %xmm3,%xmm3
1693 pxor %xmm4,%xmm4
1694 pxor %xmm5,%xmm5
1695___
# Non-Win64: clear %xmm6-%xmm15 and overwrite the eight counter slots on
# the stack with %xmm0, which was zeroed at .Lctr32_done.
1696$code.=<<___ if (!$win64);
1697 pxor %xmm6,%xmm6
1698 pxor %xmm7,%xmm7
1699 movaps %xmm0,0x00(%rsp) # clear stack
1700 pxor %xmm8,%xmm8
1701 movaps %xmm0,0x10(%rsp)
1702 pxor %xmm9,%xmm9
1703 movaps %xmm0,0x20(%rsp)
1704 pxor %xmm10,%xmm10
1705 movaps %xmm0,0x30(%rsp)
1706 pxor %xmm11,%xmm11
1707 movaps %xmm0,0x40(%rsp)
1708 pxor %xmm12,%xmm12
1709 movaps %xmm0,0x50(%rsp)
1710 pxor %xmm13,%xmm13
1711 movaps %xmm0,0x60(%rsp)
1712 pxor %xmm14,%xmm14
1713 movaps %xmm0,0x70(%rsp)
1714 pxor %xmm15,%xmm15
f8501464
AP
1715___
# Win64: restore %xmm6-%xmm15 from their save slots below the frame
# pointer, overwriting each slot with %xmm0 as it is read, then overwrite
# the eight counter slots as well.
1716$code.=<<___ if ($win64);
384e6de4
AP
1717 movaps -0xa8($key_),%xmm6
1718 movaps %xmm0,-0xa8($key_) # clear stack
1719 movaps -0x98($key_),%xmm7
1720 movaps %xmm0,-0x98($key_)
1721 movaps -0x88($key_),%xmm8
1722 movaps %xmm0,-0x88($key_)
1723 movaps -0x78($key_),%xmm9
1724 movaps %xmm0,-0x78($key_)
1725 movaps -0x68($key_),%xmm10
1726 movaps %xmm0,-0x68($key_)
1727 movaps -0x58($key_),%xmm11
1728 movaps %xmm0,-0x58($key_)
1729 movaps -0x48($key_),%xmm12
1730 movaps %xmm0,-0x48($key_)
1731 movaps -0x38($key_),%xmm13
1732 movaps %xmm0,-0x38($key_)
1733 movaps -0x28($key_),%xmm14
1734 movaps %xmm0,-0x28($key_)
1735 movaps -0x18($key_),%xmm15
1736 movaps %xmm0,-0x18($key_)
23f6eec7
AP
1737 movaps %xmm0,0x00(%rsp)
1738 movaps %xmm0,0x10(%rsp)
1739 movaps %xmm0,0x20(%rsp)
1740 movaps %xmm0,0x30(%rsp)
1741 movaps %xmm0,0x40(%rsp)
1742 movaps %xmm0,0x50(%rsp)
1743 movaps %xmm0,0x60(%rsp)
1744 movaps %xmm0,0x70(%rsp)
f8501464
AP
1745___
# Common exit: reload %rbp from just below the frame pointer, restore
# %rsp from the frame pointer and return.  .Lctr32_epilogue is also the
# target of the frameless single-block path.
1746$code.=<<___;
384e6de4 1747 mov -8($key_),%rbp
b84460ad 1748.cfi_restore %rbp
384e6de4 1749 lea ($key_),%rsp
b84460ad 1750.cfi_def_cfa_register %rsp
6c79faaa 1751.Lctr32_epilogue:
f8501464 1752 ret
b84460ad 1753.cfi_endproc
f8501464
AP
1754.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1755___
1756}
1757\f
1758######################################################################
1759# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1760# const AES_KEY *key1, const AES_KEY *key2,
1761# const unsigned char iv[16]);
1762#
1763{
1764my @tweak=map("%xmm$_",(10..15));
1765my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1766my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
36df342f 1767my $frame_size = 0x70 + ($win64?160:0);
384e6de4 1768my $key_ = "%rbp"; # override so that we can use %r11 as FP
f8501464
AP
1769
1770$code.=<<___;
1771.globl aesni_xts_encrypt
1772.type aesni_xts_encrypt,\@function,6
1773.align 16
1774aesni_xts_encrypt:
b84460ad 1775.cfi_startproc
384e6de4 1776 lea (%rsp),%r11 # frame pointer
b84460ad 1777.cfi_def_cfa_register %r11
6a40ebe8 1778 push %rbp
b84460ad 1779.cfi_push %rbp
6a40ebe8
AP
1780 sub \$$frame_size,%rsp
1781 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
f8501464
AP
1782___
1783$code.=<<___ if ($win64);
384e6de4
AP
1784 movaps %xmm6,-0xa8(%r11) # offload everything
1785 movaps %xmm7,-0x98(%r11)
1786 movaps %xmm8,-0x88(%r11)
1787 movaps %xmm9,-0x78(%r11)
1788 movaps %xmm10,-0x68(%r11)
1789 movaps %xmm11,-0x58(%r11)
1790 movaps %xmm12,-0x48(%r11)
1791 movaps %xmm13,-0x38(%r11)
1792 movaps %xmm14,-0x28(%r11)
1793 movaps %xmm15,-0x18(%r11)
f8501464
AP
1794.Lxts_enc_body:
1795___
1796$code.=<<___;
d8ba0dc9 1797 movups ($ivp),$inout0 # load clear-text tweak
f8501464
AP
1798 mov 240(%r8),$rounds # key2->rounds
1799 mov 240($key),$rnds_ # key1->rounds
1800___
1801 # generate the tweak
d8ba0dc9 1802 &aesni_generate1("enc",$key2,$rounds,$inout0);
f8501464 1803$code.=<<___;
36df342f 1804 $movkey ($key),$rndkey0 # zero round key
f8501464
AP
1805 mov $key,$key_ # backup $key
1806 mov $rnds_,$rounds # backup $rounds
36df342f 1807 shl \$4,$rnds_
f8501464
AP
1808 mov $len,$len_ # backup $len
1809 and \$-16,$len
1810
36df342f 1811 $movkey 16($key,$rnds_),$rndkey1 # last round key
36df342f 1812
f8501464 1813 movdqa .Lxts_magic(%rip),$twmask
d8ba0dc9
AP
1814 movdqa $inout0,@tweak[5]
1815 pshufd \$0x5f,$inout0,$twres
36df342f 1816 pxor $rndkey0,$rndkey1
f8501464 1817___
36df342f
AP
1818 # alternative tweak calculation algorithm is based on suggestions
1819 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1820 # and should help in the future...
f8501464
AP
1821 for ($i=0;$i<4;$i++) {
1822 $code.=<<___;
36df342f
AP
1823 movdqa $twres,$twtmp
1824 paddd $twres,$twres
f8501464 1825 movdqa @tweak[5],@tweak[$i]
36df342f
AP
1826 psrad \$31,$twtmp # broadcast upper bits
1827 paddq @tweak[5],@tweak[5]
1828 pand $twmask,$twtmp
1829 pxor $rndkey0,@tweak[$i]
1830 pxor $twtmp,@tweak[5]
f8501464
AP
1831___
1832 }
1833$code.=<<___;
36df342f
AP
1834 movdqa @tweak[5],@tweak[4]
1835 psrad \$31,$twres
1836 paddq @tweak[5],@tweak[5]
1837 pand $twmask,$twres
1838 pxor $rndkey0,@tweak[4]
1839 pxor $twres,@tweak[5]
1840 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1841
f8501464 1842 sub \$16*6,$len
23f6eec7 1843 jc .Lxts_enc_short # if $len-=6*16 borrowed
f8501464 1844
d8ba0dc9
AP
1845 mov \$16+96,$rounds
1846 lea 32($key_,$rnds_),$key # end of key schedule
1847 sub %r10,%rax # twisted $rounds
36df342f 1848 $movkey 16($key_),$rndkey1
d8ba0dc9 1849 mov %rax,%r10 # backup twisted $rounds
36df342f 1850 lea .Lxts_magic(%rip),%r8
f8501464
AP
1851 jmp .Lxts_enc_grandloop
1852
36df342f 1853.align 32
f8501464 1854.Lxts_enc_grandloop:
f8501464 1855 movdqu `16*0`($inp),$inout0 # load input
36df342f 1856 movdqa $rndkey0,$twmask
f8501464 1857 movdqu `16*1`($inp),$inout1
23f6eec7 1858 pxor @tweak[0],$inout0 # input^=tweak^round[0]
f8501464 1859 movdqu `16*2`($inp),$inout2
f8501464 1860 pxor @tweak[1],$inout1
36df342f
AP
1861 aesenc $rndkey1,$inout0
1862 movdqu `16*3`($inp),$inout3
f8501464 1863 pxor @tweak[2],$inout2
36df342f
AP
1864 aesenc $rndkey1,$inout1
1865 movdqu `16*4`($inp),$inout4
f8501464 1866 pxor @tweak[3],$inout3
36df342f
AP
1867 aesenc $rndkey1,$inout2
1868 movdqu `16*5`($inp),$inout5
1869 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1870 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
f8501464 1871 pxor @tweak[4],$inout4
36df342f
AP
1872 aesenc $rndkey1,$inout3
1873 $movkey 32($key_),$rndkey0
1874 lea `16*6`($inp),$inp
1875 pxor $twmask,$inout5
f8501464 1876
46f4e1be 1877 pxor $twres,@tweak[0] # calculate tweaks^round[last]
f8501464 1878 aesenc $rndkey1,$inout4
36df342f 1879 pxor $twres,@tweak[1]
23f6eec7 1880 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
f8501464 1881 aesenc $rndkey1,$inout5
36df342f 1882 $movkey 48($key_),$rndkey1
d8ba0dc9 1883 pxor $twres,@tweak[2]
f8501464 1884
36df342f 1885 aesenc $rndkey0,$inout0
d8ba0dc9 1886 pxor $twres,@tweak[3]
36df342f
AP
1887 movdqa @tweak[1],`16*1`(%rsp)
1888 aesenc $rndkey0,$inout1
d8ba0dc9 1889 pxor $twres,@tweak[4]
36df342f
AP
1890 movdqa @tweak[2],`16*2`(%rsp)
1891 aesenc $rndkey0,$inout2
36df342f
AP
1892 aesenc $rndkey0,$inout3
1893 pxor $twres,$twmask
1894 movdqa @tweak[4],`16*4`(%rsp)
1895 aesenc $rndkey0,$inout4
36df342f
AP
1896 aesenc $rndkey0,$inout5
1897 $movkey 64($key_),$rndkey0
d8ba0dc9 1898 movdqa $twmask,`16*5`(%rsp)
36df342f
AP
1899 pshufd \$0x5f,@tweak[5],$twres
1900 jmp .Lxts_enc_loop6
1901.align 32
f8501464
AP
1902.Lxts_enc_loop6:
1903 aesenc $rndkey1,$inout0
1904 aesenc $rndkey1,$inout1
f8501464
AP
1905 aesenc $rndkey1,$inout2
1906 aesenc $rndkey1,$inout3
1907 aesenc $rndkey1,$inout4
1908 aesenc $rndkey1,$inout5
d8ba0dc9
AP
1909 $movkey -64($key,%rax),$rndkey1
1910 add \$32,%rax
36df342f 1911
f8501464
AP
1912 aesenc $rndkey0,$inout0
1913 aesenc $rndkey0,$inout1
f8501464
AP
1914 aesenc $rndkey0,$inout2
1915 aesenc $rndkey0,$inout3
1916 aesenc $rndkey0,$inout4
1917 aesenc $rndkey0,$inout5
d8ba0dc9 1918 $movkey -80($key,%rax),$rndkey0
f8501464
AP
1919 jnz .Lxts_enc_loop6
1920
23f6eec7 1921 movdqa (%r8),$twmask # start calculating next tweak
36df342f
AP
1922 movdqa $twres,$twtmp
1923 paddd $twres,$twres
f8501464 1924 aesenc $rndkey1,$inout0
36df342f
AP
1925 paddq @tweak[5],@tweak[5]
1926 psrad \$31,$twtmp
f8501464 1927 aesenc $rndkey1,$inout1
36df342f
AP
1928 pand $twmask,$twtmp
1929 $movkey ($key_),@tweak[0] # load round[0]
f8501464 1930 aesenc $rndkey1,$inout2
f8501464
AP
1931 aesenc $rndkey1,$inout3
1932 aesenc $rndkey1,$inout4
d8ba0dc9 1933 pxor $twtmp,@tweak[5]
36df342f 1934 movaps @tweak[0],@tweak[1] # copy round[0]
f8501464 1935 aesenc $rndkey1,$inout5
d8ba0dc9 1936 $movkey -64($key),$rndkey1
f8501464 1937
36df342f 1938 movdqa $twres,$twtmp
f8501464 1939 aesenc $rndkey0,$inout0
d8ba0dc9 1940 paddd $twres,$twres
36df342f 1941 pxor @tweak[5],@tweak[0]
f8501464 1942 aesenc $rndkey0,$inout1
d8ba0dc9 1943 psrad \$31,$twtmp
36df342f 1944 paddq @tweak[5],@tweak[5]
f8501464 1945 aesenc $rndkey0,$inout2
f8501464 1946 aesenc $rndkey0,$inout3
d8ba0dc9 1947 pand $twmask,$twtmp
36df342f 1948 movaps @tweak[1],@tweak[2]
d8ba0dc9
AP
1949 aesenc $rndkey0,$inout4
1950 pxor $twtmp,@tweak[5]
1951 movdqa $twres,$twtmp
f8501464 1952 aesenc $rndkey0,$inout5
d8ba0dc9 1953 $movkey -48($key),$rndkey0
f8501464 1954
36df342f 1955 paddd $twres,$twres
f8501464 1956 aesenc $rndkey1,$inout0
36df342f
AP
1957 pxor @tweak[5],@tweak[1]
1958 psrad \$31,$twtmp
f8501464 1959 aesenc $rndkey1,$inout1
36df342f
AP
1960 paddq @tweak[5],@tweak[5]
1961 pand $twmask,$twtmp
f8501464 1962 aesenc $rndkey1,$inout2
f8501464 1963 aesenc $rndkey1,$inout3
d8ba0dc9 1964 movdqa @tweak[3],`16*3`(%rsp)
36df342f 1965 pxor $twtmp,@tweak[5]
f8501464 1966 aesenc $rndkey1,$inout4
36df342f 1967 movaps @tweak[2],@tweak[3]
d8ba0dc9 1968 movdqa $twres,$twtmp
f8501464 1969 aesenc $rndkey1,$inout5
d8ba0dc9 1970 $movkey -32($key),$rndkey1
f8501464 1971
36df342f
AP
1972 paddd $twres,$twres
1973 aesenc $rndkey0,$inout0
1974 pxor @tweak[5],@tweak[2]
1975 psrad \$31,$twtmp
1976 aesenc $rndkey0,$inout1
1977 paddq @tweak[5],@tweak[5]
1978 pand $twmask,$twtmp
1979 aesenc $rndkey0,$inout2
1980 aesenc $rndkey0,$inout3
36df342f 1981 aesenc $rndkey0,$inout4
d8ba0dc9 1982 pxor $twtmp,@tweak[5]
36df342f
AP
1983 movaps @tweak[3],@tweak[4]
1984 aesenc $rndkey0,$inout5
1985
1986 movdqa $twres,$rndkey0
1987 paddd $twres,$twres
1988 aesenc $rndkey1,$inout0
1989 pxor @tweak[5],@tweak[3]
1990 psrad \$31,$rndkey0
1991 aesenc $rndkey1,$inout1
1992 paddq @tweak[5],@tweak[5]
1993 pand $twmask,$rndkey0
1994 aesenc $rndkey1,$inout2
1995 aesenc $rndkey1,$inout3
1996 pxor $rndkey0,@tweak[5]
1997 $movkey ($key_),$rndkey0
1998 aesenc $rndkey1,$inout4
1999 aesenc $rndkey1,$inout5
2000 $movkey 16($key_),$rndkey1
2001
2002 pxor @tweak[5],@tweak[4]
36df342f 2003 aesenclast `16*0`(%rsp),$inout0
d8ba0dc9 2004 psrad \$31,$twres
36df342f 2005 paddq @tweak[5],@tweak[5]
36df342f
AP
2006 aesenclast `16*1`(%rsp),$inout1
2007 aesenclast `16*2`(%rsp),$inout2
d8ba0dc9
AP
2008 pand $twmask,$twres
2009 mov %r10,%rax # restore $rounds
36df342f
AP
2010 aesenclast `16*3`(%rsp),$inout3
2011 aesenclast `16*4`(%rsp),$inout4
2012 aesenclast `16*5`(%rsp),$inout5
d8ba0dc9 2013 pxor $twres,@tweak[5]
f8501464 2014
23f6eec7
AP
2015 lea `16*6`($out),$out # $out+=6*16
2016 movups $inout0,`-16*6`($out) # store 6 output blocks
36df342f
AP
2017 movups $inout1,`-16*5`($out)
2018 movups $inout2,`-16*4`($out)
2019 movups $inout3,`-16*3`($out)
2020 movups $inout4,`-16*2`($out)
2021 movups $inout5,`-16*1`($out)
f8501464 2022 sub \$16*6,$len
23f6eec7 2023 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
f8501464 2024
d8ba0dc9
AP
2025 mov \$16+96,$rounds
2026 sub $rnds_,$rounds
f8501464 2027 mov $key_,$key # restore $key
d8ba0dc9 2028 shr \$4,$rounds # restore original value
f8501464
AP
2029
2030.Lxts_enc_short:
23f6eec7 2031 # at this point @tweak[0..5] are populated with tweak values
d8ba0dc9 2032 mov $rounds,$rnds_ # backup $rounds
36df342f 2033 pxor $rndkey0,@tweak[0]
23f6eec7
AP
2034 add \$16*6,$len # restore real remaining $len
2035 jz .Lxts_enc_done # done if ($len==0)
f8501464 2036
36df342f 2037 pxor $rndkey0,@tweak[1]
f8501464 2038 cmp \$0x20,$len
23f6eec7 2039 jb .Lxts_enc_one # $len is 1*16
36df342f 2040 pxor $rndkey0,@tweak[2]
23f6eec7 2041 je .Lxts_enc_two # $len is 2*16
f8501464 2042
36df342f 2043 pxor $rndkey0,@tweak[3]
f8501464 2044 cmp \$0x40,$len
23f6eec7 2045 jb .Lxts_enc_three # $len is 3*16
36df342f 2046 pxor $rndkey0,@tweak[4]
23f6eec7 2047 je .Lxts_enc_four # $len is 4*16
f8501464 2048
23f6eec7 2049 movdqu ($inp),$inout0 # $len is 5*16
36df342f 2050 movdqu 16*1($inp),$inout1
f8501464
AP
2051 movdqu 16*2($inp),$inout2
2052 pxor @tweak[0],$inout0
2053 movdqu 16*3($inp),$inout3
2054 pxor @tweak[1],$inout1
2055 movdqu 16*4($inp),$inout4
23f6eec7 2056 lea 16*5($inp),$inp # $inp+=5*16
f8501464
AP
2057 pxor @tweak[2],$inout2
2058 pxor @tweak[3],$inout3
2059 pxor @tweak[4],$inout4
23f6eec7 2060 pxor $inout5,$inout5
f8501464
AP
2061
2062 call _aesni_encrypt6
2063
2064 xorps @tweak[0],$inout0
2065 movdqa @tweak[5],@tweak[0]
2066 xorps @tweak[1],$inout1
2067 xorps @tweak[2],$inout2
23f6eec7 2068 movdqu $inout0,($out) # store 5 output blocks
f8501464
AP
2069 xorps @tweak[3],$inout3
2070 movdqu $inout1,16*1($out)
2071 xorps @tweak[4],$inout4
2072 movdqu $inout2,16*2($out)
2073 movdqu $inout3,16*3($out)
2074 movdqu $inout4,16*4($out)
23f6eec7 2075 lea 16*5($out),$out # $out+=5*16
f8501464
AP
2076 jmp .Lxts_enc_done
2077
2078.align 16
2079.Lxts_enc_one:
2080 movups ($inp),$inout0
23f6eec7 2081 lea 16*1($inp),$inp # inp+=1*16
f8501464
AP
2082 xorps @tweak[0],$inout0
2083___
2084 &aesni_generate1("enc",$key,$rounds);
2085$code.=<<___;
2086 xorps @tweak[0],$inout0
2087 movdqa @tweak[1],@tweak[0]
23f6eec7
AP
2088 movups $inout0,($out) # store one output block
2089 lea 16*1($out),$out # $out+=1*16
f8501464
AP
2090 jmp .Lxts_enc_done
2091
2092.align 16
2093.Lxts_enc_two:
2094 movups ($inp),$inout0
2095 movups 16($inp),$inout1
23f6eec7 2096 lea 32($inp),$inp # $inp+=2*16
f8501464
AP
2097 xorps @tweak[0],$inout0
2098 xorps @tweak[1],$inout1
2099
214368ff 2100 call _aesni_encrypt2
f8501464
AP
2101
2102 xorps @tweak[0],$inout0
2103 movdqa @tweak[2],@tweak[0]
2104 xorps @tweak[1],$inout1
23f6eec7 2105 movups $inout0,($out) # store 2 output blocks
f8501464 2106 movups $inout1,16*1($out)
23f6eec7 2107 lea 16*2($out),$out # $out+=2*16
f8501464
AP
2108 jmp .Lxts_enc_done
2109
2110.align 16
2111.Lxts_enc_three:
2112 movups ($inp),$inout0
2113 movups 16*1($inp),$inout1
2114 movups 16*2($inp),$inout2
23f6eec7 2115 lea 16*3($inp),$inp # $inp+=3*16
f8501464
AP
2116 xorps @tweak[0],$inout0
2117 xorps @tweak[1],$inout1
2118 xorps @tweak[2],$inout2
2119
2120 call _aesni_encrypt3
2121
2122 xorps @tweak[0],$inout0
2123 movdqa @tweak[3],@tweak[0]
2124 xorps @tweak[1],$inout1
2125 xorps @tweak[2],$inout2
23f6eec7 2126 movups $inout0,($out) # store 3 output blocks
f8501464
AP
2127 movups $inout1,16*1($out)
2128 movups $inout2,16*2($out)
23f6eec7 2129 lea 16*3($out),$out # $out+=3*16
f8501464
AP
2130 jmp .Lxts_enc_done
2131
2132.align 16
2133.Lxts_enc_four:
2134 movups ($inp),$inout0
2135 movups 16*1($inp),$inout1
2136 movups 16*2($inp),$inout2
2137 xorps @tweak[0],$inout0
2138 movups 16*3($inp),$inout3
23f6eec7 2139 lea 16*4($inp),$inp # $inp+=4*16
f8501464
AP
2140 xorps @tweak[1],$inout1
2141 xorps @tweak[2],$inout2
2142 xorps @tweak[3],$inout3
2143
2144 call _aesni_encrypt4
2145
36df342f
AP
2146 pxor @tweak[0],$inout0
2147 movdqa @tweak[4],@tweak[0]
2148 pxor @tweak[1],$inout1
2149 pxor @tweak[2],$inout2
23f6eec7 2150 movdqu $inout0,($out) # store 4 output blocks
36df342f
AP
2151 pxor @tweak[3],$inout3
2152 movdqu $inout1,16*1($out)
2153 movdqu $inout2,16*2($out)
2154 movdqu $inout3,16*3($out)
23f6eec7 2155 lea 16*4($out),$out # $out+=4*16
f8501464
AP
2156 jmp .Lxts_enc_done
2157
2158.align 16
2159.Lxts_enc_done:
23f6eec7 2160 and \$15,$len_ # see if $len%16 is 0
f8501464
AP
2161 jz .Lxts_enc_ret
2162 mov $len_,$len
2163
2164.Lxts_enc_steal:
2165 movzb ($inp),%eax # borrow $rounds ...
2166 movzb -16($out),%ecx # ... and $key
2167 lea 1($inp),$inp
2168 mov %al,-16($out)
2169 mov %cl,0($out)
2170 lea 1($out),$out
2171 sub \$1,$len
2172 jnz .Lxts_enc_steal
2173
2174 sub $len_,$out # rewind $out
2175 mov $key_,$key # restore $key
2176 mov $rnds_,$rounds # restore $rounds
2177
2178 movups -16($out),$inout0
2179 xorps @tweak[0],$inout0
2180___
2181 &aesni_generate1("enc",$key,$rounds);
2182$code.=<<___;
2183 xorps @tweak[0],$inout0
2184 movups $inout0,-16($out)
2185
2186.Lxts_enc_ret:
23f6eec7
AP
2187 xorps %xmm0,%xmm0 # clear register bank
2188 pxor %xmm1,%xmm1
2189 pxor %xmm2,%xmm2
2190 pxor %xmm3,%xmm3
2191 pxor %xmm4,%xmm4
2192 pxor %xmm5,%xmm5
2193___
2194$code.=<<___ if (!$win64);
2195 pxor %xmm6,%xmm6
2196 pxor %xmm7,%xmm7
2197 movaps %xmm0,0x00(%rsp) # clear stack
2198 pxor %xmm8,%xmm8
2199 movaps %xmm0,0x10(%rsp)
2200 pxor %xmm9,%xmm9
2201 movaps %xmm0,0x20(%rsp)
2202 pxor %xmm10,%xmm10
2203 movaps %xmm0,0x30(%rsp)
2204 pxor %xmm11,%xmm11
2205 movaps %xmm0,0x40(%rsp)
2206 pxor %xmm12,%xmm12
2207 movaps %xmm0,0x50(%rsp)
2208 pxor %xmm13,%xmm13
2209 movaps %xmm0,0x60(%rsp)
2210 pxor %xmm14,%xmm14
2211 pxor %xmm15,%xmm15
f8501464
AP
2212___
2213$code.=<<___ if ($win64);
384e6de4
AP
2214 movaps -0xa8(%r11),%xmm6
2215 movaps %xmm0,-0xa8(%r11) # clear stack
2216 movaps -0x98(%r11),%xmm7
2217 movaps %xmm0,-0x98(%r11)
2218 movaps -0x88(%r11),%xmm8
2219 movaps %xmm0,-0x88(%r11)
2220 movaps -0x78(%r11),%xmm9
2221 movaps %xmm0,-0x78(%r11)
2222 movaps -0x68(%r11),%xmm10
2223 movaps %xmm0,-0x68(%r11)
2224 movaps -0x58(%r11),%xmm11
2225 movaps %xmm0,-0x58(%r11)
2226 movaps -0x48(%r11),%xmm12
2227 movaps %xmm0,-0x48(%r11)
2228 movaps -0x38(%r11),%xmm13
2229 movaps %xmm0,-0x38(%r11)
2230 movaps -0x28(%r11),%xmm14
2231 movaps %xmm0,-0x28(%r11)
2232 movaps -0x18(%r11),%xmm15
2233 movaps %xmm0,-0x18(%r11)
23f6eec7
AP
2234 movaps %xmm0,0x00(%rsp)
2235 movaps %xmm0,0x10(%rsp)
2236 movaps %xmm0,0x20(%rsp)
2237 movaps %xmm0,0x30(%rsp)
2238 movaps %xmm0,0x40(%rsp)
2239 movaps %xmm0,0x50(%rsp)
2240 movaps %xmm0,0x60(%rsp)
f8501464
AP
2241___
2242$code.=<<___;
384e6de4 2243 mov -8(%r11),%rbp
b84460ad 2244.cfi_restore %rbp
384e6de4 2245 lea (%r11),%rsp
b84460ad 2246.cfi_def_cfa_register %rsp
f8501464
AP
2247.Lxts_enc_epilogue:
2248 ret
b84460ad 2249.cfi_endproc
f8501464 2250.size aesni_xts_encrypt,.-aesni_xts_encrypt
d7d119a3 2251___
6c83629b
AP
2252
2253$code.=<<___;
f8501464
AP
2254.globl aesni_xts_decrypt
2255.type aesni_xts_decrypt,\@function,6
6c83629b 2256.align 16
f8501464 2257aesni_xts_decrypt:
b84460ad 2258.cfi_startproc
384e6de4 2259 lea (%rsp),%r11 # frame pointer
b84460ad 2260.cfi_def_cfa_register %r11
6a40ebe8 2261 push %rbp
b84460ad 2262.cfi_push %rbp
6a40ebe8
AP
2263 sub \$$frame_size,%rsp
2264 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
6c83629b
AP
2265___
2266$code.=<<___ if ($win64);
384e6de4
AP
2267 movaps %xmm6,-0xa8(%r11) # offload everything
2268 movaps %xmm7,-0x98(%r11)
2269 movaps %xmm8,-0x88(%r11)
2270 movaps %xmm9,-0x78(%r11)
2271 movaps %xmm10,-0x68(%r11)
2272 movaps %xmm11,-0x58(%r11)
2273 movaps %xmm12,-0x48(%r11)
2274 movaps %xmm13,-0x38(%r11)
2275 movaps %xmm14,-0x28(%r11)
2276 movaps %xmm15,-0x18(%r11)
f8501464 2277.Lxts_dec_body:
6c83629b
AP
2278___
2279$code.=<<___;
d8ba0dc9 2280 movups ($ivp),$inout0 # load clear-text tweak
f8501464
AP
2281 mov 240($key2),$rounds # key2->rounds
2282 mov 240($key),$rnds_ # key1->rounds
2283___
2284 # generate the tweak
d8ba0dc9 2285 &aesni_generate1("enc",$key2,$rounds,$inout0);
f8501464
AP
2286$code.=<<___;
2287 xor %eax,%eax # if ($len%16) len-=16;
2288 test \$15,$len
2289 setnz %al
2290 shl \$4,%rax
2291 sub %rax,$len
2292
36df342f 2293 $movkey ($key),$rndkey0 # zero round key
f8501464
AP
2294 mov $key,$key_ # backup $key
2295 mov $rnds_,$rounds # backup $rounds
36df342f 2296 shl \$4,$rnds_
f8501464
AP
2297 mov $len,$len_ # backup $len
2298 and \$-16,$len
6c83629b 2299
36df342f 2300 $movkey 16($key,$rnds_),$rndkey1 # last round key
36df342f 2301
f8501464 2302 movdqa .Lxts_magic(%rip),$twmask
d8ba0dc9
AP
2303 movdqa $inout0,@tweak[5]
2304 pshufd \$0x5f,$inout0,$twres
36df342f 2305 pxor $rndkey0,$rndkey1
f8501464
AP
2306___
2307 for ($i=0;$i<4;$i++) {
2308 $code.=<<___;
36df342f
AP
2309 movdqa $twres,$twtmp
2310 paddd $twres,$twres
f8501464 2311 movdqa @tweak[5],@tweak[$i]
36df342f
AP
2312 psrad \$31,$twtmp # broadcast upper bits
2313 paddq @tweak[5],@tweak[5]
2314 pand $twmask,$twtmp
2315 pxor $rndkey0,@tweak[$i]
2316 pxor $twtmp,@tweak[5]
f8501464
AP
2317___
2318 }
2319$code.=<<___;
36df342f
AP
2320 movdqa @tweak[5],@tweak[4]
2321 psrad \$31,$twres
2322 paddq @tweak[5],@tweak[5]
2323 pand $twmask,$twres
2324 pxor $rndkey0,@tweak[4]
2325 pxor $twres,@tweak[5]
2326 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2327
f8501464 2328 sub \$16*6,$len
23f6eec7 2329 jc .Lxts_dec_short # if $len-=6*16 borrowed
6c83629b 2330
d8ba0dc9
AP
2331 mov \$16+96,$rounds
2332 lea 32($key_,$rnds_),$key # end of key schedule
2333 sub %r10,%rax # twisted $rounds
36df342f 2334 $movkey 16($key_),$rndkey1
d8ba0dc9 2335 mov %rax,%r10 # backup twisted $rounds
36df342f 2336 lea .Lxts_magic(%rip),%r8
f8501464 2337 jmp .Lxts_dec_grandloop
6c83629b 2338
36df342f 2339.align 32
f8501464 2340.Lxts_dec_grandloop:
f8501464 2341 movdqu `16*0`($inp),$inout0 # load input
36df342f 2342 movdqa $rndkey0,$twmask
f8501464 2343 movdqu `16*1`($inp),$inout1
23f6eec7 2344 pxor @tweak[0],$inout0 # input^=tweak^round[0]
f8501464 2345 movdqu `16*2`($inp),$inout2
f8501464 2346 pxor @tweak[1],$inout1
36df342f
AP
2347 aesdec $rndkey1,$inout0
2348 movdqu `16*3`($inp),$inout3
f8501464 2349 pxor @tweak[2],$inout2
36df342f
AP
2350 aesdec $rndkey1,$inout1
2351 movdqu `16*4`($inp),$inout4
f8501464 2352 pxor @tweak[3],$inout3
36df342f
AP
2353 aesdec $rndkey1,$inout2
2354 movdqu `16*5`($inp),$inout5
2355 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2356 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
f8501464 2357 pxor @tweak[4],$inout4
36df342f
AP
2358 aesdec $rndkey1,$inout3
2359 $movkey 32($key_),$rndkey0
2360 lea `16*6`($inp),$inp
2361 pxor $twmask,$inout5
f8501464 2362
46f4e1be 2363 pxor $twres,@tweak[0] # calculate tweaks^round[last]
f8501464 2364 aesdec $rndkey1,$inout4
36df342f
AP
2365 pxor $twres,@tweak[1]
2366 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
f8501464 2367 aesdec $rndkey1,$inout5
36df342f 2368 $movkey 48($key_),$rndkey1
d8ba0dc9 2369 pxor $twres,@tweak[2]
6c83629b 2370
36df342f 2371 aesdec $rndkey0,$inout0
d8ba0dc9 2372 pxor $twres,@tweak[3]
36df342f
AP
2373 movdqa @tweak[1],`16*1`(%rsp)
2374 aesdec $rndkey0,$inout1
d8ba0dc9 2375 pxor $twres,@tweak[4]
36df342f
AP
2376 movdqa @tweak[2],`16*2`(%rsp)
2377 aesdec $rndkey0,$inout2
36df342f
AP
2378 aesdec $rndkey0,$inout3
2379 pxor $twres,$twmask
2380 movdqa @tweak[4],`16*4`(%rsp)
2381 aesdec $rndkey0,$inout4
36df342f
AP
2382 aesdec $rndkey0,$inout5
2383 $movkey 64($key_),$rndkey0
d8ba0dc9 2384 movdqa $twmask,`16*5`(%rsp)
36df342f
AP
2385 pshufd \$0x5f,@tweak[5],$twres
2386 jmp .Lxts_dec_loop6
2387.align 32
f8501464
AP
2388.Lxts_dec_loop6:
2389 aesdec $rndkey1,$inout0
2390 aesdec $rndkey1,$inout1
f8501464
AP
2391 aesdec $rndkey1,$inout2
2392 aesdec $rndkey1,$inout3
2393 aesdec $rndkey1,$inout4
2394 aesdec $rndkey1,$inout5
d8ba0dc9
AP
2395 $movkey -64($key,%rax),$rndkey1
2396 add \$32,%rax
36df342f 2397
f8501464
AP
2398 aesdec $rndkey0,$inout0
2399 aesdec $rndkey0,$inout1
f8501464
AP
2400 aesdec $rndkey0,$inout2
2401 aesdec $rndkey0,$inout3
2402 aesdec $rndkey0,$inout4
2403 aesdec $rndkey0,$inout5
d8ba0dc9 2404 $movkey -80($key,%rax),$rndkey0
f8501464
AP
2405 jnz .Lxts_dec_loop6
2406
23f6eec7 2407 movdqa (%r8),$twmask # start calculating next tweak
36df342f
AP
2408 movdqa $twres,$twtmp
2409 paddd $twres,$twres
f8501464 2410 aesdec $rndkey1,$inout0
36df342f
AP
2411 paddq @tweak[5],@tweak[5]
2412 psrad \$31,$twtmp
f8501464 2413 aesdec $rndkey1,$inout1
36df342f
AP
2414 pand $twmask,$twtmp
2415 $movkey ($key_),@tweak[0] # load round[0]
f8501464 2416 aesdec $rndkey1,$inout2
f8501464
AP
2417 aesdec $rndkey1,$inout3
2418 aesdec $rndkey1,$inout4
d8ba0dc9 2419 pxor $twtmp,@tweak[5]
36df342f 2420 movaps @tweak[0],@tweak[1] # copy round[0]
f8501464 2421 aesdec $rndkey1,$inout5
d8ba0dc9 2422 $movkey -64($key),$rndkey1
f8501464 2423
36df342f 2424 movdqa $twres,$twtmp
f8501464 2425 aesdec $rndkey0,$inout0
d8ba0dc9 2426 paddd $twres,$twres
36df342f 2427 pxor @tweak[5],@tweak[0]
f8501464 2428 aesdec $rndkey0,$inout1
d8ba0dc9 2429 psrad \$31,$twtmp
36df342f 2430 paddq @tweak[5],@tweak[5]
f8501464 2431 aesdec $rndkey0,$inout2
f8501464 2432 aesdec $rndkey0,$inout3
d8ba0dc9 2433 pand $twmask,$twtmp
36df342f 2434 movaps @tweak[1],@tweak[2]
d8ba0dc9
AP
2435 aesdec $rndkey0,$inout4
2436 pxor $twtmp,@tweak[5]
2437 movdqa $twres,$twtmp
f8501464 2438 aesdec $rndkey0,$inout5
d8ba0dc9 2439 $movkey -48($key),$rndkey0
f8501464 2440
36df342f 2441 paddd $twres,$twres
f8501464 2442 aesdec $rndkey1,$inout0
36df342f
AP
2443 pxor @tweak[5],@tweak[1]
2444 psrad \$31,$twtmp
f8501464 2445 aesdec $rndkey1,$inout1
36df342f
AP
2446 paddq @tweak[5],@tweak[5]
2447 pand $twmask,$twtmp
f8501464 2448 aesdec $rndkey1,$inout2
f8501464 2449 aesdec $rndkey1,$inout3
d8ba0dc9 2450 movdqa @tweak[3],`16*3`(%rsp)
36df342f 2451 pxor $twtmp,@tweak[5]
f8501464 2452 aesdec $rndkey1,$inout4
36df342f 2453 movaps @tweak[2],@tweak[3]
d8ba0dc9 2454 movdqa $twres,$twtmp
f8501464 2455 aesdec $rndkey1,$inout5
d8ba0dc9 2456 $movkey -32($key),$rndkey1
f8501464 2457
36df342f
AP
2458 paddd $twres,$twres
2459 aesdec $rndkey0,$inout0
2460 pxor @tweak[5],@tweak[2]
2461 psrad \$31,$twtmp
2462 aesdec $rndkey0,$inout1
2463 paddq @tweak[5],@tweak[5]
2464 pand $twmask,$twtmp
2465 aesdec $rndkey0,$inout2
2466 aesdec $rndkey0,$inout3
36df342f 2467 aesdec $rndkey0,$inout4
d8ba0dc9 2468 pxor $twtmp,@tweak[5]
36df342f
AP
2469 movaps @tweak[3],@tweak[4]
2470 aesdec $rndkey0,$inout5
2471
2472 movdqa $twres,$rndkey0
2473 paddd $twres,$twres
2474 aesdec $rndkey1,$inout0
2475 pxor @tweak[5],@tweak[3]
2476 psrad \$31,$rndkey0
2477 aesdec $rndkey1,$inout1
2478 paddq @tweak[5],@tweak[5]
2479 pand $twmask,$rndkey0
2480 aesdec $rndkey1,$inout2
2481 aesdec $rndkey1,$inout3
2482 pxor $rndkey0,@tweak[5]
2483 $movkey ($key_),$rndkey0
2484 aesdec $rndkey1,$inout4
2485 aesdec $rndkey1,$inout5
2486 $movkey 16($key_),$rndkey1
2487
2488 pxor @tweak[5],@tweak[4]
36df342f 2489 aesdeclast `16*0`(%rsp),$inout0
d8ba0dc9 2490 psrad \$31,$twres
36df342f 2491 paddq @tweak[5],@tweak[5]
36df342f
AP
2492 aesdeclast `16*1`(%rsp),$inout1
2493 aesdeclast `16*2`(%rsp),$inout2
d8ba0dc9
AP
2494 pand $twmask,$twres
2495 mov %r10,%rax # restore $rounds
36df342f
AP
2496 aesdeclast `16*3`(%rsp),$inout3
2497 aesdeclast `16*4`(%rsp),$inout4
2498 aesdeclast `16*5`(%rsp),$inout5
d8ba0dc9 2499 pxor $twres,@tweak[5]
f8501464 2500
23f6eec7
AP
2501 lea `16*6`($out),$out # $out+=6*16
2502 movups $inout0,`-16*6`($out) # store 6 output blocks
36df342f
AP
2503 movups $inout1,`-16*5`($out)
2504 movups $inout2,`-16*4`($out)
2505 movups $inout3,`-16*3`($out)
2506 movups $inout4,`-16*2`($out)
2507 movups $inout5,`-16*1`($out)
f8501464 2508 sub \$16*6,$len
23f6eec7 2509 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
f8501464 2510
d8ba0dc9
AP
2511 mov \$16+96,$rounds
2512 sub $rnds_,$rounds
f8501464 2513 mov $key_,$key # restore $key
d8ba0dc9 2514 shr \$4,$rounds # restore original value
f8501464
AP
2515
2516.Lxts_dec_short:
23f6eec7 2517 # at this point @tweak[0..5] are populated with tweak values
d8ba0dc9 2518 mov $rounds,$rnds_ # backup $rounds
36df342f
AP
2519 pxor $rndkey0,@tweak[0]
2520 pxor $rndkey0,@tweak[1]
23f6eec7
AP
2521 add \$16*6,$len # restore real remaining $len
2522 jz .Lxts_dec_done # done if ($len==0)
d7d119a3 2523
36df342f 2524 pxor $rndkey0,@tweak[2]
f8501464 2525 cmp \$0x20,$len
23f6eec7 2526 jb .Lxts_dec_one # $len is 1*16
36df342f 2527 pxor $rndkey0,@tweak[3]
23f6eec7 2528 je .Lxts_dec_two # $len is 2*16
d7d119a3 2529
36df342f 2530 pxor $rndkey0,@tweak[4]
f8501464 2531 cmp \$0x40,$len
23f6eec7
AP
2532 jb .Lxts_dec_three # $len is 3*16
2533 je .Lxts_dec_four # $len is 4*16
f8501464 2534
23f6eec7 2535 movdqu ($inp),$inout0 # $len is 5*16
36df342f 2536 movdqu 16*1($inp),$inout1
f8501464
AP
2537 movdqu 16*2($inp),$inout2
2538 pxor @tweak[0],$inout0
2539 movdqu 16*3($inp),$inout3
2540 pxor @tweak[1],$inout1
2541 movdqu 16*4($inp),$inout4
23f6eec7 2542 lea 16*5($inp),$inp # $inp+=5*16
f8501464
AP
2543 pxor @tweak[2],$inout2
2544 pxor @tweak[3],$inout3
2545 pxor @tweak[4],$inout4
2546
2547 call _aesni_decrypt6
2548
2549 xorps @tweak[0],$inout0
2550 xorps @tweak[1],$inout1
2551 xorps @tweak[2],$inout2
23f6eec7 2552 movdqu $inout0,($out) # store 5 output blocks
f8501464
AP
2553 xorps @tweak[3],$inout3
2554 movdqu $inout1,16*1($out)
2555 xorps @tweak[4],$inout4
2556 movdqu $inout2,16*2($out)
2557 pxor $twtmp,$twtmp
2558 movdqu $inout3,16*3($out)
2559 pcmpgtd @tweak[5],$twtmp
2560 movdqu $inout4,16*4($out)
23f6eec7 2561 lea 16*5($out),$out # $out+=5*16
f8501464
AP
2562 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2563 and \$15,$len_
2564 jz .Lxts_dec_ret
2565
2566 movdqa @tweak[5],@tweak[0]
2567 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2568 pand $twmask,@tweak[1] # isolate carry and residue
2569 pxor @tweak[5],@tweak[1]
2570 jmp .Lxts_dec_done2
d7d119a3 2571
f8501464
AP
2572.align 16
2573.Lxts_dec_one:
2574 movups ($inp),$inout0
23f6eec7 2575 lea 16*1($inp),$inp # $inp+=1*16
f8501464
AP
2576 xorps @tweak[0],$inout0
2577___
2578 &aesni_generate1("dec",$key,$rounds);
2579$code.=<<___;
2580 xorps @tweak[0],$inout0
2581 movdqa @tweak[1],@tweak[0]
23f6eec7 2582 movups $inout0,($out) # store one output block
f8501464 2583 movdqa @tweak[2],@tweak[1]
23f6eec7 2584 lea 16*1($out),$out # $out+=1*16
f8501464 2585 jmp .Lxts_dec_done
6c83629b 2586
f8501464
AP
2587.align 16
2588.Lxts_dec_two:
2589 movups ($inp),$inout0
2590 movups 16($inp),$inout1
23f6eec7 2591 lea 32($inp),$inp # $inp+=2*16
f8501464
AP
2592 xorps @tweak[0],$inout0
2593 xorps @tweak[1],$inout1
6c83629b 2594
214368ff 2595 call _aesni_decrypt2
6c83629b 2596
f8501464
AP
2597 xorps @tweak[0],$inout0
2598 movdqa @tweak[2],@tweak[0]
2599 xorps @tweak[1],$inout1
2600 movdqa @tweak[3],@tweak[1]
23f6eec7 2601 movups $inout0,($out) # store 2 output blocks
f8501464 2602 movups $inout1,16*1($out)
23f6eec7 2603 lea 16*2($out),$out # $out+=2*16
f8501464 2604 jmp .Lxts_dec_done
6c83629b 2605
f8501464
AP
2606.align 16
2607.Lxts_dec_three:
2608 movups ($inp),$inout0
2609 movups 16*1($inp),$inout1
2610 movups 16*2($inp),$inout2
23f6eec7 2611 lea 16*3($inp),$inp # $inp+=3*16
f8501464
AP
2612 xorps @tweak[0],$inout0
2613 xorps @tweak[1],$inout1
2614 xorps @tweak[2],$inout2
6c83629b 2615
f8501464 2616 call _aesni_decrypt3
6c83629b 2617
f8501464
AP
2618 xorps @tweak[0],$inout0
2619 movdqa @tweak[3],@tweak[0]
2620 xorps @tweak[1],$inout1
36df342f 2621 movdqa @tweak[4],@tweak[1]
f8501464 2622 xorps @tweak[2],$inout2
23f6eec7 2623 movups $inout0,($out) # store 3 output blocks
f8501464
AP
2624 movups $inout1,16*1($out)
2625 movups $inout2,16*2($out)
23f6eec7 2626 lea 16*3($out),$out # $out+=3*16
f8501464 2627 jmp .Lxts_dec_done
6c83629b
AP
2628
2629.align 16
f8501464 2630.Lxts_dec_four:
36df342f
AP
2631 movups ($inp),$inout0
2632 movups 16*1($inp),$inout1
f8501464
AP
2633 movups 16*2($inp),$inout2
2634 xorps @tweak[0],$inout0
2635 movups 16*3($inp),$inout3
23f6eec7 2636 lea 16*4($inp),$inp # $inp+=4*16
f8501464
AP
2637 xorps @tweak[1],$inout1
2638 xorps @tweak[2],$inout2
2639 xorps @tweak[3],$inout3
2640
2641 call _aesni_decrypt4
2642
36df342f 2643 pxor @tweak[0],$inout0
f8501464 2644 movdqa @tweak[4],@tweak[0]
36df342f 2645 pxor @tweak[1],$inout1
f8501464 2646 movdqa @tweak[5],@tweak[1]
36df342f 2647 pxor @tweak[2],$inout2
23f6eec7 2648 movdqu $inout0,($out) # store 4 output blocks
36df342f
AP
2649 pxor @tweak[3],$inout3
2650 movdqu $inout1,16*1($out)
2651 movdqu $inout2,16*2($out)
2652 movdqu $inout3,16*3($out)
23f6eec7 2653 lea 16*4($out),$out # $out+=4*16
f8501464 2654 jmp .Lxts_dec_done
6c83629b
AP
2655
2656.align 16
f8501464 2657.Lxts_dec_done:
23f6eec7 2658 and \$15,$len_ # see if $len%16 is 0
f8501464
AP
2659 jz .Lxts_dec_ret
2660.Lxts_dec_done2:
2661 mov $len_,$len
2662 mov $key_,$key # restore $key
2663 mov $rnds_,$rounds # restore $rounds
6c83629b 2664
f8501464
AP
2665 movups ($inp),$inout0
2666 xorps @tweak[1],$inout0
2667___
2668 &aesni_generate1("dec",$key,$rounds);
2669$code.=<<___;
2670 xorps @tweak[1],$inout0
2671 movups $inout0,($out)
2672
2673.Lxts_dec_steal:
2674 movzb 16($inp),%eax # borrow $rounds ...
2675 movzb ($out),%ecx # ... and $key
2676 lea 1($inp),$inp
2677 mov %al,($out)
2678 mov %cl,16($out)
2679 lea 1($out),$out
2680 sub \$1,$len
2681 jnz .Lxts_dec_steal
2682
2683 sub $len_,$out # rewind $out
2684 mov $key_,$key # restore $key
2685 mov $rnds_,$rounds # restore $rounds
2686
2687 movups ($out),$inout0
2688 xorps @tweak[0],$inout0
6c83629b 2689___
f8501464
AP
2690 &aesni_generate1("dec",$key,$rounds);
2691$code.=<<___;
2692 xorps @tweak[0],$inout0
2693 movups $inout0,($out)
6c83629b 2694
f8501464 2695.Lxts_dec_ret:
23f6eec7
AP
2696 xorps %xmm0,%xmm0 # clear register bank
2697 pxor %xmm1,%xmm1
2698 pxor %xmm2,%xmm2
2699 pxor %xmm3,%xmm3
2700 pxor %xmm4,%xmm4
2701 pxor %xmm5,%xmm5
2702___
2703$code.=<<___ if (!$win64);
2704 pxor %xmm6,%xmm6
2705 pxor %xmm7,%xmm7
2706 movaps %xmm0,0x00(%rsp) # clear stack
2707 pxor %xmm8,%xmm8
2708 movaps %xmm0,0x10(%rsp)
2709 pxor %xmm9,%xmm9
2710 movaps %xmm0,0x20(%rsp)
2711 pxor %xmm10,%xmm10
2712 movaps %xmm0,0x30(%rsp)
2713 pxor %xmm11,%xmm11
2714 movaps %xmm0,0x40(%rsp)
2715 pxor %xmm12,%xmm12
2716 movaps %xmm0,0x50(%rsp)
2717 pxor %xmm13,%xmm13
2718 movaps %xmm0,0x60(%rsp)
2719 pxor %xmm14,%xmm14
2720 pxor %xmm15,%xmm15
f8501464 2721___
6c83629b 2722$code.=<<___ if ($win64);
384e6de4
AP
2723 movaps -0xa8(%r11),%xmm6
2724 movaps %xmm0,-0xa8(%r11) # clear stack
2725 movaps -0x98(%r11),%xmm7
2726 movaps %xmm0,-0x98(%r11)
2727 movaps -0x88(%r11),%xmm8
2728 movaps %xmm0,-0x88(%r11)
2729 movaps -0x78(%r11),%xmm9
2730 movaps %xmm0,-0x78(%r11)
2731 movaps -0x68(%r11),%xmm10
2732 movaps %xmm0,-0x68(%r11)
2733 movaps -0x58(%r11),%xmm11
2734 movaps %xmm0,-0x58(%r11)
2735 movaps -0x48(%r11),%xmm12
2736 movaps %xmm0,-0x48(%r11)
2737 movaps -0x38(%r11),%xmm13
2738 movaps %xmm0,-0x38(%r11)
2739 movaps -0x28(%r11),%xmm14
2740 movaps %xmm0,-0x28(%r11)
2741 movaps -0x18(%r11),%xmm15
2742 movaps %xmm0,-0x18(%r11)
23f6eec7
AP
2743 movaps %xmm0,0x00(%rsp)
2744 movaps %xmm0,0x10(%rsp)
2745 movaps %xmm0,0x20(%rsp)
2746 movaps %xmm0,0x30(%rsp)
2747 movaps %xmm0,0x40(%rsp)
2748 movaps %xmm0,0x50(%rsp)
2749 movaps %xmm0,0x60(%rsp)
6c83629b
AP
2750___
2751$code.=<<___;
384e6de4 2752 mov -8(%r11),%rbp
b84460ad 2753.cfi_restore %rbp
384e6de4 2754 lea (%r11),%rsp
b84460ad 2755.cfi_def_cfa_register %rsp
f8501464 2756.Lxts_dec_epilogue:
6c83629b 2757 ret
b84460ad 2758.cfi_endproc
f8501464 2759.size aesni_xts_decrypt,.-aesni_xts_decrypt
6c83629b 2760___
bd30091c
AP
2761}
2762\f
2763######################################################################
2764# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2765# const AES_KEY *key, unsigned int start_block_num,
2766# unsigned char offset_i[16], const unsigned char L_[][16],
2767# unsigned char checksum[16]);
2768#
2769{
# Register map for the OCB routines below.
# @offset: six XMM registers (%xmm10..%xmm15) holding per-block offsets.
2770my @offset=map("%xmm$_",(10..15));
# Running checksum, and a scratch register that the code below loads with
# round[0] and then XORs with round[last].
2771my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2772my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
# Pointers loaded from the 7th and 8th (stack-passed) arguments.
2773my ($L_p,$checksum_p) = ("%rbx","%rbp");
# Table offsets into L_ derived from ntz(block number) for even-numbered blocks.
2774my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# Displacement from the entry %rsp at which the 7th argument is read.
2775my $seventh_arg = $win64 ? 56 : 8;
# The block count is kept in the register named by $len.
2776my $blocks = $len;
2777
# aesni_ocb_encrypt: bulk OCB encryption of whole 16-byte blocks.
# Flow, as implemented below:
#   - push %rbx, %rbp, %r12-%r14 (plus save %xmm6-%xmm15 on Win64) and fetch
#     the 7th/8th arguments (L_ table pointer, checksum pointer) from the stack;
#   - if the starting block number is even, encrypt one block on its own so
#     that the bulk code always starts on an odd block number;
#   - encrypt six blocks per iteration via __ocb_encrypt6, then the 1..5
#     leftover blocks via __ocb_encrypt1/4/6;
#   - store the checksum and last offset, wipe XMM registers and return.
2778$code.=<<___;
2779.globl aesni_ocb_encrypt
2780.type aesni_ocb_encrypt,\@function,6
2781.align 32
2782aesni_ocb_encrypt:
b84460ad 2783.cfi_startproc
bd30091c
AP
# keep the entry %rsp in %rax: the stack-passed arguments are read through it
2784 lea (%rsp),%rax
2785 push %rbx
b84460ad 2786.cfi_push %rbx
bd30091c 2787 push %rbp
b84460ad 2788.cfi_push %rbp
bd30091c 2789 push %r12
b84460ad 2790.cfi_push %r12
bd30091c 2791 push %r13
b84460ad 2792.cfi_push %r13
bd30091c 2793 push %r14
b84460ad 2794.cfi_push %r14
bd30091c
AP
2795___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
2796$code.=<<___ if ($win64);
2797 lea -0xa0(%rsp),%rsp
2798 movaps %xmm6,0x00(%rsp) # offload everything
2799 movaps %xmm7,0x10(%rsp)
2800 movaps %xmm8,0x20(%rsp)
2801 movaps %xmm9,0x30(%rsp)
2802 movaps %xmm10,0x40(%rsp)
2803 movaps %xmm11,0x50(%rsp)
2804 movaps %xmm12,0x60(%rsp)
2805 movaps %xmm13,0x70(%rsp)
2806 movaps %xmm14,0x80(%rsp)
2807 movaps %xmm15,0x90(%rsp)
2808.Locb_enc_body:
2809___
2810$code.=<<___;
2811 mov $seventh_arg(%rax),$L_p # 7th argument
2812 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2813
2814 mov 240($key),$rnds_
2815 mov $key,$key_
2816 shl \$4,$rnds_
2817 $movkey ($key),$rndkey0l # round[0]
2818 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2819
2820 movdqu ($offset_p),@offset[5] # load last offset_i
2821 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2822 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2823
# NOTE(review): the sub below only forms a round counter if the perl variables
# "rounds" and "rnds_" (defined outside this section) alias %eax and %r10d - confirm.
# The helpers step %rax by 32 until it reaches zero and reload it from %r10.
2824 mov \$16+32,$rounds
2825 lea 32($key_,$rnds_),$key
2826 $movkey 16($key_),$rndkey1 # round[1]
2827 sub %r10,%rax # twisted $rounds
2828 mov %rax,%r10 # backup twisted $rounds
2829
2830 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2831 movdqu ($checksum_p),$checksum # load checksum
2832
2833 test \$1,$block_num # is first block number odd?
2834 jnz .Locb_enc_odd
2835
# even starting block number: encrypt one block alone, using L_[ntz(block_num)]
2836 bsf $block_num,$i1
2837 add \$1,$block_num
2838 shl \$4,$i1
2839 movdqu ($L_p,$i1),$inout5 # borrow
2840 movdqu ($inp),$inout0
2841 lea 16($inp),$inp
2842
2843 call __ocb_encrypt1
2844
# the helper leaves the new offset ^ round[last] in the borrowed register
2845 movdqa $inout5,@offset[5]
2846 movups $inout0,($out)
2847 lea 16($out),$out
2848 sub \$1,$blocks
2849 jz .Locb_enc_done
2850
2851.Locb_enc_odd:
2852 lea 1($block_num),$i1 # even-numbered blocks
2853 lea 3($block_num),$i3
2854 lea 5($block_num),$i5
2855 lea 6($block_num),$block_num
2856 bsf $i1,$i1 # ntz(block)
2857 bsf $i3,$i3
2858 bsf $i5,$i5
2859 shl \$4,$i1 # ntz(block) -> table offset
2860 shl \$4,$i3
2861 shl \$4,$i5
2862
# fewer than six blocks left -> tail handling
2863 sub \$6,$blocks
2864 jc .Locb_enc_short
2865 jmp .Locb_enc_grandloop
2866
2867.align 32
2868.Locb_enc_grandloop:
2869 movdqu `16*0`($inp),$inout0 # load input
2870 movdqu `16*1`($inp),$inout1
2871 movdqu `16*2`($inp),$inout2
2872 movdqu `16*3`($inp),$inout3
2873 movdqu `16*4`($inp),$inout4
2874 movdqu `16*5`($inp),$inout5
2875 lea `16*6`($inp),$inp
2876
2877 call __ocb_encrypt6
2878
2879 movups $inout0,`16*0`($out) # store output
2880 movups $inout1,`16*1`($out)
2881 movups $inout2,`16*2`($out)
2882 movups $inout3,`16*3`($out)
2883 movups $inout4,`16*4`($out)
2884 movups $inout5,`16*5`($out)
2885 lea `16*6`($out),$out
2886 sub \$6,$blocks
2887 jnc .Locb_enc_grandloop
2888
2889.Locb_enc_short:
# undo the bias of 6; zero means done, otherwise dispatch on the 1..5 blocks left
2890 add \$6,$blocks
2891 jz .Locb_enc_done
2892
2893 movdqu `16*0`($inp),$inout0
2894 cmp \$2,$blocks
2895 jb .Locb_enc_one
2896 movdqu `16*1`($inp),$inout1
2897 je .Locb_enc_two
2898
2899 movdqu `16*2`($inp),$inout2
2900 cmp \$4,$blocks
2901 jb .Locb_enc_three
2902 movdqu `16*3`($inp),$inout3
2903 je .Locb_enc_four
2904
# five blocks: run the 6-block helper with an all-zero sixth input
2905 movdqu `16*4`($inp),$inout4
2906 pxor $inout5,$inout5
2907
2908 call __ocb_encrypt6
2909
# only five blocks are stored, so the fifth offset becomes the saved offset
2910 movdqa @offset[4],@offset[5]
2911 movups $inout0,`16*0`($out)
2912 movups $inout1,`16*1`($out)
2913 movups $inout2,`16*2`($out)
2914 movups $inout3,`16*3`($out)
2915 movups $inout4,`16*4`($out)
2916
2917 jmp .Locb_enc_done
2918
2919.align 16
2920.Locb_enc_one:
2921 movdqa @offset[0],$inout5 # borrow
2922
2923 call __ocb_encrypt1
2924
2925 movdqa $inout5,@offset[5]
2926 movups $inout0,`16*0`($out)
2927 jmp .Locb_enc_done
2928
2929.align 16
2930.Locb_enc_two:
# the 4-block helper is reused; unused inputs are zeroed first
2931 pxor $inout2,$inout2
2932 pxor $inout3,$inout3
2933
2934 call __ocb_encrypt4
2935
2936 movdqa @offset[1],@offset[5]
2937 movups $inout0,`16*0`($out)
2938 movups $inout1,`16*1`($out)
2939
2940 jmp .Locb_enc_done
2941
2942.align 16
2943.Locb_enc_three:
2944 pxor $inout3,$inout3
2945
2946 call __ocb_encrypt4
2947
2948 movdqa @offset[2],@offset[5]
2949 movups $inout0,`16*0`($out)
2950 movups $inout1,`16*1`($out)
2951 movups $inout2,`16*2`($out)
2952
2953 jmp .Locb_enc_done
2954
2955.align 16
2956.Locb_enc_four:
2957 call __ocb_encrypt4
2958
2959 movdqa @offset[3],@offset[5]
2960 movups $inout0,`16*0`($out)
2961 movups $inout1,`16*1`($out)
2962 movups $inout2,`16*2`($out)
2963 movups $inout3,`16*3`($out)
2964
2965.Locb_enc_done:
2966 pxor $rndkey0,@offset[5] # "remove" round[last]
2967 movdqu $checksum,($checksum_p) # store checksum
2968 movdqu @offset[5],($offset_p) # store last offset_i
2969
2970 xorps %xmm0,%xmm0 # clear register bank
2971 pxor %xmm1,%xmm1
2972 pxor %xmm2,%xmm2
2973 pxor %xmm3,%xmm3
2974 pxor %xmm4,%xmm4
2975 pxor %xmm5,%xmm5
2976___
# Non-Win64: wipe %xmm6-%xmm15 as well; 0x28 skips the five 8-byte pushes so
# %rax again holds the entry %rsp.
2977$code.=<<___ if (!$win64);
2978 pxor %xmm6,%xmm6
2979 pxor %xmm7,%xmm7
2980 pxor %xmm8,%xmm8
2981 pxor %xmm9,%xmm9
2982 pxor %xmm10,%xmm10
2983 pxor %xmm11,%xmm11
2984 pxor %xmm12,%xmm12
2985 pxor %xmm13,%xmm13
2986 pxor %xmm14,%xmm14
2987 pxor %xmm15,%xmm15
384e6de4 2988 lea 0x28(%rsp),%rax
b84460ad 2989.cfi_def_cfa %rax,8
bd30091c
AP
2990___
# Win64: restore %xmm6-%xmm15 from the save area and overwrite each slot with
# the zeroed %xmm0; 0xa0+0x28 skips the save area plus the five pushes.
2991$code.=<<___ if ($win64);
2992 movaps 0x00(%rsp),%xmm6
2993 movaps %xmm0,0x00(%rsp) # clear stack
2994 movaps 0x10(%rsp),%xmm7
2995 movaps %xmm0,0x10(%rsp)
2996 movaps 0x20(%rsp),%xmm8
2997 movaps %xmm0,0x20(%rsp)
2998 movaps 0x30(%rsp),%xmm9
2999 movaps %xmm0,0x30(%rsp)
3000 movaps 0x40(%rsp),%xmm10
3001 movaps %xmm0,0x40(%rsp)
3002 movaps 0x50(%rsp),%xmm11
3003 movaps %xmm0,0x50(%rsp)
3004 movaps 0x60(%rsp),%xmm12
3005 movaps %xmm0,0x60(%rsp)
3006 movaps 0x70(%rsp),%xmm13
3007 movaps %xmm0,0x70(%rsp)
3008 movaps 0x80(%rsp),%xmm14
3009 movaps %xmm0,0x80(%rsp)
3010 movaps 0x90(%rsp),%xmm15
3011 movaps %xmm0,0x90(%rsp)
3012 lea 0xa0+0x28(%rsp),%rax
3013.Locb_enc_pop:
bd30091c
AP
3014___
3015$code.=<<___;
# reload the pushed registers relative to %rax, then drop the whole frame
384e6de4 3016 mov -40(%rax),%r14
b84460ad 3017.cfi_restore %r14
384e6de4 3018 mov -32(%rax),%r13
b84460ad 3019.cfi_restore %r13
384e6de4 3020 mov -24(%rax),%r12
b84460ad 3021.cfi_restore %r12
384e6de4 3022 mov -16(%rax),%rbp
b84460ad 3023.cfi_restore %rbp
384e6de4 3024 mov -8(%rax),%rbx
b84460ad 3025.cfi_restore %rbx
384e6de4 3026 lea (%rax),%rsp
b84460ad 3027.cfi_def_cfa_register %rsp
bd30091c
AP
3028.Locb_enc_epilogue:
3029 ret
b84460ad 3030.cfi_endproc
bd30091c
AP
3031.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
3032
# __ocb_encrypt6: encrypt the six blocks held in $inout0..$inout5.
# In:  @offset[5] = previous offset ^ round[last], @offset[0] = L_0,
#      $i1/$i3/$i5 = L_ table offsets for the three even-numbered blocks.
# Out: results in $inout0..$inout5; @offset[1]..@offset[5] hold the new
#      offsets ^ round[last]; @offset[0] is reloaded with L_0; $i1/$i3/$i5 and
#      $block_num are advanced for the next six blocks; %rax is reloaded from %r10.
# The inputs are XORed into $checksum before they are masked with the offsets.
3033.type __ocb_encrypt6,\@abi-omnipotent
3034.align 32
3035__ocb_encrypt6:
3036 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3037 movdqu ($L_p,$i1),@offset[1]
3038 movdqa @offset[0],@offset[2]
3039 movdqu ($L_p,$i3),@offset[3]
3040 movdqa @offset[0],@offset[4]
3041 pxor @offset[5],@offset[0]
3042 movdqu ($L_p,$i5),@offset[5]
# each offset is the previous one XORed with its L_ entry (L_0 for odd slots)
3043 pxor @offset[0],@offset[1]
3044 pxor $inout0,$checksum # accumulate checksum
3045 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3046 pxor @offset[1],@offset[2]
3047 pxor $inout1,$checksum
3048 pxor @offset[1],$inout1
3049 pxor @offset[2],@offset[3]
3050 pxor $inout2,$checksum
3051 pxor @offset[2],$inout2
3052 pxor @offset[3],@offset[4]
3053 pxor $inout3,$checksum
3054 pxor @offset[3],$inout3
3055 pxor @offset[4],@offset[5]
3056 pxor $inout4,$checksum
3057 pxor @offset[4],$inout4
3058 pxor $inout5,$checksum
3059 pxor @offset[5],$inout5
3060 $movkey 32($key_),$rndkey0
3061
# precompute the table indices for the next group of six while the rounds run
3062 lea 1($block_num),$i1 # even-numbered blocks
3063 lea 3($block_num),$i3
3064 lea 5($block_num),$i5
3065 add \$6,$block_num
3066 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3067 bsf $i1,$i1 # ntz(block)
3068 bsf $i3,$i3
3069 bsf $i5,$i5
3070
3071 aesenc $rndkey1,$inout0
3072 aesenc $rndkey1,$inout1
3073 aesenc $rndkey1,$inout2
3074 aesenc $rndkey1,$inout3
3075 pxor $rndkey0l,@offset[1]
3076 pxor $rndkey0l,@offset[2]
3077 aesenc $rndkey1,$inout4
3078 pxor $rndkey0l,@offset[3]
3079 pxor $rndkey0l,@offset[4]
3080 aesenc $rndkey1,$inout5
3081 $movkey 48($key_),$rndkey1
3082 pxor $rndkey0l,@offset[5]
3083
3084 aesenc $rndkey0,$inout0
3085 aesenc $rndkey0,$inout1
3086 aesenc $rndkey0,$inout2
3087 aesenc $rndkey0,$inout3
3088 aesenc $rndkey0,$inout4
3089 aesenc $rndkey0,$inout5
3090 $movkey 64($key_),$rndkey0
3091 shl \$4,$i1 # ntz(block) -> table offset
3092 shl \$4,$i3
3093 jmp .Locb_enc_loop6
3094
3095.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3096.Locb_enc_loop6:
3097 aesenc $rndkey1,$inout0
3098 aesenc $rndkey1,$inout1
3099 aesenc $rndkey1,$inout2
3100 aesenc $rndkey1,$inout3
3101 aesenc $rndkey1,$inout4
3102 aesenc $rndkey1,$inout5
3103 $movkey ($key,%rax),$rndkey1
3104 add \$32,%rax
3105
3106 aesenc $rndkey0,$inout0
3107 aesenc $rndkey0,$inout1
3108 aesenc $rndkey0,$inout2
3109 aesenc $rndkey0,$inout3
3110 aesenc $rndkey0,$inout4
3111 aesenc $rndkey0,$inout5
3112 $movkey -16($key,%rax),$rndkey0
3113 jnz .Locb_enc_loop6
3114
3115 aesenc $rndkey1,$inout0
3116 aesenc $rndkey1,$inout1
3117 aesenc $rndkey1,$inout2
3118 aesenc $rndkey1,$inout3
3119 aesenc $rndkey1,$inout4
3120 aesenc $rndkey1,$inout5
3121 $movkey 16($key_),$rndkey1
3122 shl \$4,$i5
3123
# final round: each offset register already carries round[last] XORed in
3124 aesenclast @offset[0],$inout0
3125 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3126 mov %r10,%rax # restore twisted rounds
3127 aesenclast @offset[1],$inout1
3128 aesenclast @offset[2],$inout2
3129 aesenclast @offset[3],$inout3
3130 aesenclast @offset[4],$inout4
3131 aesenclast @offset[5],$inout5
3132 ret
3133.size __ocb_encrypt6,.-__ocb_encrypt6
3134
# __ocb_encrypt4: encrypt the four blocks held in $inout0..$inout3.
# In:  @offset[5] = previous offset ^ round[last], @offset[0] = L_0,
#      $i1/$i3 = L_ table offsets for the two even-numbered blocks.
# Out: results in $inout0..$inout3; @offset[0]..@offset[3] hold the new
#      offsets ^ round[last]; %rax is reloaded from %r10.
# Unlike the 6-block routine it does not advance $block_num or reload L_0.
# The inputs are XORed into $checksum before they are masked with the offsets.
3135.type __ocb_encrypt4,\@abi-omnipotent
3136.align 32
3137__ocb_encrypt4:
3138 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3139 movdqu ($L_p,$i1),@offset[1]
3140 movdqa @offset[0],@offset[2]
3141 movdqu ($L_p,$i3),@offset[3]
3142 pxor @offset[5],@offset[0]
3143 pxor @offset[0],@offset[1]
3144 pxor $inout0,$checksum # accumulate checksum
3145 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3146 pxor @offset[1],@offset[2]
3147 pxor $inout1,$checksum
3148 pxor @offset[1],$inout1
3149 pxor @offset[2],@offset[3]
3150 pxor $inout2,$checksum
3151 pxor @offset[2],$inout2
3152 pxor $inout3,$checksum
3153 pxor @offset[3],$inout3
3154 $movkey 32($key_),$rndkey0
3155
3156 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3157 pxor $rndkey0l,@offset[1]
3158 pxor $rndkey0l,@offset[2]
3159 pxor $rndkey0l,@offset[3]
3160
3161 aesenc $rndkey1,$inout0
3162 aesenc $rndkey1,$inout1
3163 aesenc $rndkey1,$inout2
3164 aesenc $rndkey1,$inout3
3165 $movkey 48($key_),$rndkey1
3166
3167 aesenc $rndkey0,$inout0
3168 aesenc $rndkey0,$inout1
3169 aesenc $rndkey0,$inout2
3170 aesenc $rndkey0,$inout3
3171 $movkey 64($key_),$rndkey0
3172 jmp .Locb_enc_loop4
3173
3174.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3175.Locb_enc_loop4:
3176 aesenc $rndkey1,$inout0
3177 aesenc $rndkey1,$inout1
3178 aesenc $rndkey1,$inout2
3179 aesenc $rndkey1,$inout3
3180 $movkey ($key,%rax),$rndkey1
3181 add \$32,%rax
3182
3183 aesenc $rndkey0,$inout0
3184 aesenc $rndkey0,$inout1
3185 aesenc $rndkey0,$inout2
3186 aesenc $rndkey0,$inout3
3187 $movkey -16($key,%rax),$rndkey0
3188 jnz .Locb_enc_loop4
3189
3190 aesenc $rndkey1,$inout0
3191 aesenc $rndkey1,$inout1
3192 aesenc $rndkey1,$inout2
3193 aesenc $rndkey1,$inout3
3194 $movkey 16($key_),$rndkey1
3195 mov %r10,%rax # restore twisted rounds
3196
# final round: each offset register already carries round[last] XORed in
3197 aesenclast @offset[0],$inout0
3198 aesenclast @offset[1],$inout1
3199 aesenclast @offset[2],$inout2
3200 aesenclast @offset[3],$inout3
3201 ret
3202.size __ocb_encrypt4,.-__ocb_encrypt4
3203
# __ocb_encrypt1: encrypt the single block held in $inout0.
# In:  $inout5 = the L_ entry for this block (borrowed by the caller),
#      @offset[5] = previous offset ^ round[last].
# Out: result in $inout0; $inout5 = new offset ^ round[last], which the
#      callers copy into @offset[5]; %rax is reloaded from %r10.
# The input is XORed into $checksum before it is masked with the offset.
3204.type __ocb_encrypt1,\@abi-omnipotent
3205.align 32
3206__ocb_encrypt1:
3207 pxor @offset[5],$inout5 # offset_i
3208 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3209 pxor $inout0,$checksum # accumulate checksum
3210 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3211 $movkey 32($key_),$rndkey0
3212
3213 aesenc $rndkey1,$inout0
3214 $movkey 48($key_),$rndkey1
3215 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3216
3217 aesenc $rndkey0,$inout0
3218 $movkey 64($key_),$rndkey0
3219 jmp .Locb_enc_loop1
3220
3221.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3222.Locb_enc_loop1:
3223 aesenc $rndkey1,$inout0
3224 $movkey ($key,%rax),$rndkey1
3225 add \$32,%rax
3226
3227 aesenc $rndkey0,$inout0
3228 $movkey -16($key,%rax),$rndkey0
3229 jnz .Locb_enc_loop1
3230
3231 aesenc $rndkey1,$inout0
3232 $movkey 16($key_),$rndkey1 # redundant in tail
3233 mov %r10,%rax # restore twisted rounds
3234
3235 aesenclast $inout5,$inout0
3236 ret
3237.size __ocb_encrypt1,.-__ocb_encrypt1
3238
# aesni_ocb_decrypt: bulk OCB decryption of whole 16-byte blocks.
# Same structure as aesni_ocb_encrypt: optional single leading block when the
# starting block number is even, six blocks per iteration, then a 1..5 block
# tail. The checksum is accumulated here, from the helpers' outputs, rather
# than inside the __ocb_decrypt helpers.
3239.globl aesni_ocb_decrypt
3240.type aesni_ocb_decrypt,\@function,6
3241.align 32
3242aesni_ocb_decrypt:
b84460ad 3243.cfi_startproc
bd30091c
AP
# keep the entry %rsp in %rax: the stack-passed arguments are read through it
3244 lea (%rsp),%rax
3245 push %rbx
b84460ad 3246.cfi_push %rbx
bd30091c 3247 push %rbp
b84460ad 3248.cfi_push %rbp
bd30091c 3249 push %r12
b84460ad 3250.cfi_push %r12
bd30091c 3251 push %r13
b84460ad 3252.cfi_push %r13
bd30091c 3253 push %r14
b84460ad 3254.cfi_push %r14
bd30091c
AP
3255___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
3256$code.=<<___ if ($win64);
3257 lea -0xa0(%rsp),%rsp
3258 movaps %xmm6,0x00(%rsp) # offload everything
3259 movaps %xmm7,0x10(%rsp)
3260 movaps %xmm8,0x20(%rsp)
3261 movaps %xmm9,0x30(%rsp)
3262 movaps %xmm10,0x40(%rsp)
3263 movaps %xmm11,0x50(%rsp)
3264 movaps %xmm12,0x60(%rsp)
3265 movaps %xmm13,0x70(%rsp)
3266 movaps %xmm14,0x80(%rsp)
3267 movaps %xmm15,0x90(%rsp)
3268.Locb_dec_body:
3269___
3270$code.=<<___;
3271 mov $seventh_arg(%rax),$L_p # 7th argument
3272 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3273
3274 mov 240($key),$rnds_
3275 mov $key,$key_
3276 shl \$4,$rnds_
3277 $movkey ($key),$rndkey0l # round[0]
3278 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3279
3280 movdqu ($offset_p),@offset[5] # load last offset_i
3281 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3282 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3283
# NOTE(review): the sub below only forms a round counter if the perl variables
# "rounds" and "rnds_" (defined outside this section) alias %eax and %r10d - confirm.
# The helpers step %rax by 32 until it reaches zero and reload it from %r10.
3284 mov \$16+32,$rounds
3285 lea 32($key_,$rnds_),$key
3286 $movkey 16($key_),$rndkey1 # round[1]
3287 sub %r10,%rax # twisted $rounds
3288 mov %rax,%r10 # backup twisted $rounds
3289
3290 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3291 movdqu ($checksum_p),$checksum # load checksum
3292
3293 test \$1,$block_num # is first block number odd?
3294 jnz .Locb_dec_odd
3295
# even starting block number: decrypt one block alone, using L_[ntz(block_num)]
3296 bsf $block_num,$i1
3297 add \$1,$block_num
3298 shl \$4,$i1
3299 movdqu ($L_p,$i1),$inout5 # borrow
3300 movdqu ($inp),$inout0
3301 lea 16($inp),$inp
3302
3303 call __ocb_decrypt1
3304
# the helper leaves the new offset ^ round[last] in the borrowed register
3305 movdqa $inout5,@offset[5]
3306 movups $inout0,($out)
3307 xorps $inout0,$checksum # accumulate checksum
3308 lea 16($out),$out
3309 sub \$1,$blocks
3310 jz .Locb_dec_done
3311
3312.Locb_dec_odd:
3313 lea 1($block_num),$i1 # even-numbered blocks
3314 lea 3($block_num),$i3
3315 lea 5($block_num),$i5
3316 lea 6($block_num),$block_num
3317 bsf $i1,$i1 # ntz(block)
3318 bsf $i3,$i3
3319 bsf $i5,$i5
3320 shl \$4,$i1 # ntz(block) -> table offset
3321 shl \$4,$i3
3322 shl \$4,$i5
3323
# fewer than six blocks left -> tail handling
3324 sub \$6,$blocks
3325 jc .Locb_dec_short
3326 jmp .Locb_dec_grandloop
3327
3328.align 32
3329.Locb_dec_grandloop:
3330 movdqu `16*0`($inp),$inout0 # load input
3331 movdqu `16*1`($inp),$inout1
3332 movdqu `16*2`($inp),$inout2
3333 movdqu `16*3`($inp),$inout3
3334 movdqu `16*4`($inp),$inout4
3335 movdqu `16*5`($inp),$inout5
3336 lea `16*6`($inp),$inp
3337
3338 call __ocb_decrypt6
3339
3340 movups $inout0,`16*0`($out) # store output
3341 pxor $inout0,$checksum # accumulate checksum
3342 movups $inout1,`16*1`($out)
3343 pxor $inout1,$checksum
3344 movups $inout2,`16*2`($out)
3345 pxor $inout2,$checksum
3346 movups $inout3,`16*3`($out)
3347 pxor $inout3,$checksum
3348 movups $inout4,`16*4`($out)
3349 pxor $inout4,$checksum
3350 movups $inout5,`16*5`($out)
3351 pxor $inout5,$checksum
3352 lea `16*6`($out),$out
3353 sub \$6,$blocks
3354 jnc .Locb_dec_grandloop
3355
3356.Locb_dec_short:
# undo the bias of 6; zero means done, otherwise dispatch on the 1..5 blocks left
3357 add \$6,$blocks
3358 jz .Locb_dec_done
3359
3360 movdqu `16*0`($inp),$inout0
3361 cmp \$2,$blocks
3362 jb .Locb_dec_one
3363 movdqu `16*1`($inp),$inout1
3364 je .Locb_dec_two
3365
3366 movdqu `16*2`($inp),$inout2
3367 cmp \$4,$blocks
3368 jb .Locb_dec_three
3369 movdqu `16*3`($inp),$inout3
3370 je .Locb_dec_four
3371
# five blocks: run the 6-block helper with an all-zero sixth input; its
# sixth output is neither stored nor added to the checksum
3372 movdqu `16*4`($inp),$inout4
3373 pxor $inout5,$inout5
3374
3375 call __ocb_decrypt6
3376
# only five blocks are stored, so the fifth offset becomes the saved offset
3377 movdqa @offset[4],@offset[5]
3378 movups $inout0,`16*0`($out) # store output
3379 pxor $inout0,$checksum # accumulate checksum
3380 movups $inout1,`16*1`($out)
3381 pxor $inout1,$checksum
3382 movups $inout2,`16*2`($out)
3383 pxor $inout2,$checksum
3384 movups $inout3,`16*3`($out)
3385 pxor $inout3,$checksum
3386 movups $inout4,`16*4`($out)
3387 pxor $inout4,$checksum
3388
3389 jmp .Locb_dec_done
3390
3391.align 16
3392.Locb_dec_one:
3393 movdqa @offset[0],$inout5 # borrow
3394
3395 call __ocb_decrypt1
3396
3397 movdqa $inout5,@offset[5]
3398 movups $inout0,`16*0`($out) # store output
3399 xorps $inout0,$checksum # accumulate checksum
3400 jmp .Locb_dec_done
3401
3402.align 16
3403.Locb_dec_two:
# the 4-block helper is reused; unused inputs are zeroed first
3404 pxor $inout2,$inout2
3405 pxor $inout3,$inout3
3406
3407 call __ocb_decrypt4
3408
3409 movdqa @offset[1],@offset[5]
3410 movups $inout0,`16*0`($out) # store output
3411 xorps $inout0,$checksum # accumulate checksum
3412 movups $inout1,`16*1`($out)
3413 xorps $inout1,$checksum
3414
3415 jmp .Locb_dec_done
3416
3417.align 16
3418.Locb_dec_three:
3419 pxor $inout3,$inout3
3420
3421 call __ocb_decrypt4
3422
3423 movdqa @offset[2],@offset[5]
3424 movups $inout0,`16*0`($out) # store output
3425 xorps $inout0,$checksum # accumulate checksum
3426 movups $inout1,`16*1`($out)
3427 xorps $inout1,$checksum
3428 movups $inout2,`16*2`($out)
3429 xorps $inout2,$checksum
3430
3431 jmp .Locb_dec_done
3432
3433.align 16
3434.Locb_dec_four:
3435 call __ocb_decrypt4
3436
3437 movdqa @offset[3],@offset[5]
3438 movups $inout0,`16*0`($out) # store output
3439 pxor $inout0,$checksum # accumulate checksum
3440 movups $inout1,`16*1`($out)
3441 pxor $inout1,$checksum
3442 movups $inout2,`16*2`($out)
3443 pxor $inout2,$checksum
3444 movups $inout3,`16*3`($out)
3445 pxor $inout3,$checksum
3446
3447.Locb_dec_done:
3448 pxor $rndkey0,@offset[5] # "remove" round[last]
3449 movdqu $checksum,($checksum_p) # store checksum
3450 movdqu @offset[5],($offset_p) # store last offset_i
3451
3452 xorps %xmm0,%xmm0 # clear register bank
3453 pxor %xmm1,%xmm1
3454 pxor %xmm2,%xmm2
3455 pxor %xmm3,%xmm3
3456 pxor %xmm4,%xmm4
3457 pxor %xmm5,%xmm5
3458___
# Non-Win64: wipe %xmm6-%xmm15 as well; 0x28 skips the five 8-byte pushes so
# %rax again holds the entry %rsp.
3459$code.=<<___ if (!$win64);
3460 pxor %xmm6,%xmm6
3461 pxor %xmm7,%xmm7
3462 pxor %xmm8,%xmm8
3463 pxor %xmm9,%xmm9
3464 pxor %xmm10,%xmm10
3465 pxor %xmm11,%xmm11
3466 pxor %xmm12,%xmm12
3467 pxor %xmm13,%xmm13
3468 pxor %xmm14,%xmm14
3469 pxor %xmm15,%xmm15
384e6de4 3470 lea 0x28(%rsp),%rax
b84460ad 3471.cfi_def_cfa %rax,8
bd30091c
AP
3472___
# Win64: restore %xmm6-%xmm15 from the save area and overwrite each slot with
# the zeroed %xmm0; 0xa0+0x28 skips the save area plus the five pushes.
3473$code.=<<___ if ($win64);
3474 movaps 0x00(%rsp),%xmm6
3475 movaps %xmm0,0x00(%rsp) # clear stack
3476 movaps 0x10(%rsp),%xmm7
3477 movaps %xmm0,0x10(%rsp)
3478 movaps 0x20(%rsp),%xmm8
3479 movaps %xmm0,0x20(%rsp)
3480 movaps 0x30(%rsp),%xmm9
3481 movaps %xmm0,0x30(%rsp)
3482 movaps 0x40(%rsp),%xmm10
3483 movaps %xmm0,0x40(%rsp)
3484 movaps 0x50(%rsp),%xmm11
3485 movaps %xmm0,0x50(%rsp)
3486 movaps 0x60(%rsp),%xmm12
3487 movaps %xmm0,0x60(%rsp)
3488 movaps 0x70(%rsp),%xmm13
3489 movaps %xmm0,0x70(%rsp)
3490 movaps 0x80(%rsp),%xmm14
3491 movaps %xmm0,0x80(%rsp)
3492 movaps 0x90(%rsp),%xmm15
3493 movaps %xmm0,0x90(%rsp)
3494 lea 0xa0+0x28(%rsp),%rax
3495.Locb_dec_pop:
bd30091c
AP
3496___
3497$code.=<<___;
# reload the pushed registers relative to %rax, then drop the whole frame
384e6de4 3498 mov -40(%rax),%r14
b84460ad 3499.cfi_restore %r14
384e6de4 3500 mov -32(%rax),%r13
b84460ad 3501.cfi_restore %r13
384e6de4 3502 mov -24(%rax),%r12
b84460ad 3503.cfi_restore %r12
384e6de4 3504 mov -16(%rax),%rbp
b84460ad 3505.cfi_restore %rbp
384e6de4 3506 mov -8(%rax),%rbx
b84460ad 3507.cfi_restore %rbx
384e6de4 3508 lea (%rax),%rsp
b84460ad 3509.cfi_def_cfa_register %rsp
bd30091c
AP
3510.Locb_dec_epilogue:
3511 ret
b84460ad 3512.cfi_endproc
bd30091c
AP
3513.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3514
# __ocb_decrypt6: decrypt the six blocks held in $inout0..$inout5.
# In:  @offset[5] = previous offset ^ round[last], @offset[0] = L_0,
#      $i1/$i3/$i5 = L_ table offsets for the three even-numbered blocks.
# Out: results in $inout0..$inout5; @offset[1]..@offset[5] hold the new
#      offsets ^ round[last]; @offset[0] is reloaded with L_0; $i1/$i3/$i5 and
#      $block_num are advanced for the next six blocks; %rax is reloaded from %r10.
# The checksum register is not touched here; the caller accumulates it from
# the outputs.
3515.type __ocb_decrypt6,\@abi-omnipotent
3516.align 32
3517__ocb_decrypt6:
3518 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3519 movdqu ($L_p,$i1),@offset[1]
3520 movdqa @offset[0],@offset[2]
3521 movdqu ($L_p,$i3),@offset[3]
3522 movdqa @offset[0],@offset[4]
3523 pxor @offset[5],@offset[0]
3524 movdqu ($L_p,$i5),@offset[5]
# each offset is the previous one XORed with its L_ entry (L_0 for odd slots)
3525 pxor @offset[0],@offset[1]
3526 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3527 pxor @offset[1],@offset[2]
3528 pxor @offset[1],$inout1
3529 pxor @offset[2],@offset[3]
3530 pxor @offset[2],$inout2
3531 pxor @offset[3],@offset[4]
3532 pxor @offset[3],$inout3
3533 pxor @offset[4],@offset[5]
3534 pxor @offset[4],$inout4
3535 pxor @offset[5],$inout5
3536 $movkey 32($key_),$rndkey0
3537
# precompute the table indices for the next group of six while the rounds run
3538 lea 1($block_num),$i1 # even-numbered blocks
3539 lea 3($block_num),$i3
3540 lea 5($block_num),$i5
3541 add \$6,$block_num
3542 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3543 bsf $i1,$i1 # ntz(block)
3544 bsf $i3,$i3
3545 bsf $i5,$i5
3546
3547 aesdec $rndkey1,$inout0
3548 aesdec $rndkey1,$inout1
3549 aesdec $rndkey1,$inout2
3550 aesdec $rndkey1,$inout3
3551 pxor $rndkey0l,@offset[1]
3552 pxor $rndkey0l,@offset[2]
3553 aesdec $rndkey1,$inout4
3554 pxor $rndkey0l,@offset[3]
3555 pxor $rndkey0l,@offset[4]
3556 aesdec $rndkey1,$inout5
3557 $movkey 48($key_),$rndkey1
3558 pxor $rndkey0l,@offset[5]
3559
3560 aesdec $rndkey0,$inout0
3561 aesdec $rndkey0,$inout1
3562 aesdec $rndkey0,$inout2
3563 aesdec $rndkey0,$inout3
3564 aesdec $rndkey0,$inout4
3565 aesdec $rndkey0,$inout5
3566 $movkey 64($key_),$rndkey0
3567 shl \$4,$i1 # ntz(block) -> table offset
3568 shl \$4,$i3
3569 jmp .Locb_dec_loop6
3570
3571.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3572.Locb_dec_loop6:
3573 aesdec $rndkey1,$inout0
3574 aesdec $rndkey1,$inout1
3575 aesdec $rndkey1,$inout2
3576 aesdec $rndkey1,$inout3
3577 aesdec $rndkey1,$inout4
3578 aesdec $rndkey1,$inout5
3579 $movkey ($key,%rax),$rndkey1
3580 add \$32,%rax
3581
3582 aesdec $rndkey0,$inout0
3583 aesdec $rndkey0,$inout1
3584 aesdec $rndkey0,$inout2
3585 aesdec $rndkey0,$inout3
3586 aesdec $rndkey0,$inout4
3587 aesdec $rndkey0,$inout5
3588 $movkey -16($key,%rax),$rndkey0
3589 jnz .Locb_dec_loop6
3590
3591 aesdec $rndkey1,$inout0
3592 aesdec $rndkey1,$inout1
3593 aesdec $rndkey1,$inout2
3594 aesdec $rndkey1,$inout3
3595 aesdec $rndkey1,$inout4
3596 aesdec $rndkey1,$inout5
3597 $movkey 16($key_),$rndkey1
3598 shl \$4,$i5
3599
# final round: each offset register already carries round[last] XORed in
3600 aesdeclast @offset[0],$inout0
3601 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3602 mov %r10,%rax # restore twisted rounds
3603 aesdeclast @offset[1],$inout1
3604 aesdeclast @offset[2],$inout2
3605 aesdeclast @offset[3],$inout3
3606 aesdeclast @offset[4],$inout4
3607 aesdeclast @offset[5],$inout5
3608 ret
3609.size __ocb_decrypt6,.-__ocb_decrypt6
3610
# __ocb_decrypt4: decrypt the four blocks held in $inout0..$inout3.
# In:  @offset[5] = previous offset ^ round[last], @offset[0] = L_0,
#      $i1/$i3 = L_ table offsets for the two even-numbered blocks.
# Out: results in $inout0..$inout3; @offset[0]..@offset[3] hold the new
#      offsets ^ round[last]; %rax is reloaded from %r10.
# Unlike the 6-block routine it does not advance $block_num or reload L_0.
# The checksum register is not touched here; the caller accumulates it from
# the outputs.
3611.type __ocb_decrypt4,\@abi-omnipotent
3612.align 32
3613__ocb_decrypt4:
3614 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3615 movdqu ($L_p,$i1),@offset[1]
3616 movdqa @offset[0],@offset[2]
3617 movdqu ($L_p,$i3),@offset[3]
3618 pxor @offset[5],@offset[0]
3619 pxor @offset[0],@offset[1]
3620 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3621 pxor @offset[1],@offset[2]
3622 pxor @offset[1],$inout1
3623 pxor @offset[2],@offset[3]
3624 pxor @offset[2],$inout2
3625 pxor @offset[3],$inout3
3626 $movkey 32($key_),$rndkey0
3627
3628 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3629 pxor $rndkey0l,@offset[1]
3630 pxor $rndkey0l,@offset[2]
3631 pxor $rndkey0l,@offset[3]
3632
3633 aesdec $rndkey1,$inout0
3634 aesdec $rndkey1,$inout1
3635 aesdec $rndkey1,$inout2
3636 aesdec $rndkey1,$inout3
3637 $movkey 48($key_),$rndkey1
3638
3639 aesdec $rndkey0,$inout0
3640 aesdec $rndkey0,$inout1
3641 aesdec $rndkey0,$inout2
3642 aesdec $rndkey0,$inout3
3643 $movkey 64($key_),$rndkey0
3644 jmp .Locb_dec_loop4
3645
3646.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3647.Locb_dec_loop4:
3648 aesdec $rndkey1,$inout0
3649 aesdec $rndkey1,$inout1
3650 aesdec $rndkey1,$inout2
3651 aesdec $rndkey1,$inout3
3652 $movkey ($key,%rax),$rndkey1
3653 add \$32,%rax
3654
3655 aesdec $rndkey0,$inout0
3656 aesdec $rndkey0,$inout1
3657 aesdec $rndkey0,$inout2
3658 aesdec $rndkey0,$inout3
3659 $movkey -16($key,%rax),$rndkey0
3660 jnz .Locb_dec_loop4
3661
3662 aesdec $rndkey1,$inout0
3663 aesdec $rndkey1,$inout1
3664 aesdec $rndkey1,$inout2
3665 aesdec $rndkey1,$inout3
3666 $movkey 16($key_),$rndkey1
3667 mov %r10,%rax # restore twisted rounds
3668
# final round: each offset register already carries round[last] XORed in
3669 aesdeclast @offset[0],$inout0
3670 aesdeclast @offset[1],$inout1
3671 aesdeclast @offset[2],$inout2
3672 aesdeclast @offset[3],$inout3
3673 ret
3674.size __ocb_decrypt4,.-__ocb_decrypt4
3675
# __ocb_decrypt1: decrypt the single block held in $inout0.
# In:  $inout5 = the L_ entry for this block (borrowed by the caller),
#      @offset[5] = previous offset ^ round[last].
# Out: result in $inout0; $inout5 = new offset ^ round[last], which the
#      callers copy into @offset[5]; %rax is reloaded from %r10.
# The checksum register is not touched here; the caller accumulates it from
# the output.
3676.type __ocb_decrypt1,\@abi-omnipotent
3677.align 32
3678__ocb_decrypt1:
3679 pxor @offset[5],$inout5 # offset_i
3680 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3681 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3682 $movkey 32($key_),$rndkey0
3683
3684 aesdec $rndkey1,$inout0
3685 $movkey 48($key_),$rndkey1
3686 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3687
3688 aesdec $rndkey0,$inout0
3689 $movkey 64($key_),$rndkey0
3690 jmp .Locb_dec_loop1
3691
3692.align 32
# two rounds per iteration; %rax steps by 32 and the loop ends when it hits zero
3693.Locb_dec_loop1:
3694 aesdec $rndkey1,$inout0
3695 $movkey ($key,%rax),$rndkey1
3696 add \$32,%rax
3697
3698 aesdec $rndkey0,$inout0
3699 $movkey -16($key,%rax),$rndkey0
3700 jnz .Locb_dec_loop1
3701
3702 aesdec $rndkey1,$inout0
3703 $movkey 16($key_),$rndkey1 # redundant in tail
3704 mov %r10,%rax # restore twisted rounds
3705
3706 aesdeclast $inout5,$inout0
3707 ret
3708.size __ocb_decrypt1,.-__ocb_decrypt1
3709___
f8501464 3710} }}
d64a7232 3711\f
6c83629b 3712########################################################################
d64a7232
AP
3713# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3714# size_t length, const AES_KEY *key,
3715# unsigned char *ivp,const int enc);
f8501464 3716{
73325b22
AP
3717my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3718my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
73325b22 3719
d64a7232
AP
3720$code.=<<___;
3721.globl ${PREFIX}_cbc_encrypt
3722.type ${PREFIX}_cbc_encrypt,\@function,6
3723.align 16
3724${PREFIX}_cbc_encrypt:
b84460ad 3725.cfi_startproc
d64a7232
AP
3726 test $len,$len # check length
3727 jz .Lcbc_ret
d608b4d6 3728
f8501464 3729 mov 240($key),$rnds_ # key->rounds
d64a7232 3730 mov $key,$key_ # backup $key
d608b4d6 3731 test %r9d,%r9d # 6th argument
d64a7232
AP
3732 jz .Lcbc_decrypt
3733#--------------------------- CBC ENCRYPT ------------------------------#
f8501464 3734 movups ($ivp),$inout0 # load iv as initial state
d608b4d6 3735 mov $rnds_,$rounds
d7d119a3 3736 cmp \$16,$len
d64a7232
AP
3737 jb .Lcbc_enc_tail
3738 sub \$16,$len
3739 jmp .Lcbc_enc_loop
d7d119a3 3740.align 16
d64a7232 3741.Lcbc_enc_loop:
f8501464 3742 movups ($inp),$inout1 # load input
d64a7232 3743 lea 16($inp),$inp
f8501464 3744 #xorps $inout1,$inout0
d64a7232 3745___
f8501464 3746 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
d64a7232 3747$code.=<<___;
d608b4d6
AP
3748 mov $rnds_,$rounds # restore $rounds
3749 mov $key_,$key # restore $key
d7d119a3
AP
3750 movups $inout0,0($out) # store output
3751 lea 16($out),$out
3752 sub \$16,$len
d64a7232
AP
3753 jnc .Lcbc_enc_loop
3754 add \$16,$len
3755 jnz .Lcbc_enc_tail
23f6eec7
AP
3756 pxor $rndkey0,$rndkey0 # clear register bank
3757 pxor $rndkey1,$rndkey1
d608b4d6 3758 movups $inout0,($ivp)
23f6eec7
AP
3759 pxor $inout0,$inout0
3760 pxor $inout1,$inout1
d64a7232
AP
3761 jmp .Lcbc_ret
3762
3763.Lcbc_enc_tail:
3764 mov $len,%rcx # zaps $key
3765 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3766 .long 0x9066A4F3 # rep movsb
3767 mov \$16,%ecx # zero tail
3768 sub $len,%rcx
3769 xor %eax,%eax
3770 .long 0x9066AAF3 # rep stosb
3771 lea -16(%rdi),%rdi # rewind $out by 1 block
3772 mov $rnds_,$rounds # restore $rounds
3773 mov %rdi,%rsi # $inp and $out are the same
3774 mov $key_,$key # restore $key
3775 xor $len,$len # len=16
3776 jmp .Lcbc_enc_loop # one more spin
3777\f#--------------------------- CBC DECRYPT ------------------------------#
3778.align 16
3779.Lcbc_decrypt:
23f6eec7
AP
3780 cmp \$16,$len
3781 jne .Lcbc_decrypt_bulk
3782
3783 # handle single block without allocating stack frame,
3784 # useful in ciphertext stealing mode
3785 movdqu ($inp),$inout0 # load input
3786 movdqu ($ivp),$inout1 # load iv
3787 movdqa $inout0,$inout2 # future iv
3788___
3789 &aesni_generate1("dec",$key,$rnds_);
3790$code.=<<___;
3791 pxor $rndkey0,$rndkey0 # clear register bank
3792 pxor $rndkey1,$rndkey1
3793 movdqu $inout2,($ivp) # store iv
3794 xorps $inout1,$inout0 # ^=iv
3795 pxor $inout1,$inout1
3796 movups $inout0,($out) # store output
3797 pxor $inout0,$inout0
3798 jmp .Lcbc_ret
3799.align 16
3800.Lcbc_decrypt_bulk:
384e6de4 3801 lea (%rsp),%r11 # frame pointer
b84460ad 3802.cfi_def_cfa_register %r11
6a40ebe8 3803 push %rbp
b84460ad 3804.cfi_push %rbp
6a40ebe8
AP
3805 sub \$$frame_size,%rsp
3806 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
d64a7232
AP
3807___
3808$code.=<<___ if ($win64);
6a40ebe8
AP
3809 movaps %xmm6,0x10(%rsp)
3810 movaps %xmm7,0x20(%rsp)
3811 movaps %xmm8,0x30(%rsp)
3812 movaps %xmm9,0x40(%rsp)
73325b22
AP
3813 movaps %xmm10,0x50(%rsp)
3814 movaps %xmm11,0x60(%rsp)
3815 movaps %xmm12,0x70(%rsp)
3816 movaps %xmm13,0x80(%rsp)
3817 movaps %xmm14,0x90(%rsp)
3818 movaps %xmm15,0xa0(%rsp)
d608b4d6 3819.Lcbc_decrypt_body:
d64a7232 3820___
384e6de4
AP
3821
3822my $inp_=$key_="%rbp"; # reassign $key_
3823
d64a7232 3824$code.=<<___;
384e6de4 3825 mov $key,$key_ # [re-]backup $key [after reassignment]
d64a7232 3826 movups ($ivp),$iv
d608b4d6 3827 mov $rnds_,$rounds
73325b22 3828 cmp \$0x50,$len
d608b4d6 3829 jbe .Lcbc_dec_tail
73325b22
AP
3830
3831 $movkey ($key),$rndkey0
3832 movdqu 0x00($inp),$inout0 # load input
3833 movdqu 0x10($inp),$inout1
3834 movdqa $inout0,$in0
3835 movdqu 0x20($inp),$inout2
3836 movdqa $inout1,$in1
3837 movdqu 0x30($inp),$inout3
3838 movdqa $inout2,$in2
3839 movdqu 0x40($inp),$inout4
3840 movdqa $inout3,$in3
3841 movdqu 0x50($inp),$inout5
3842 movdqa $inout4,$in4
5599c733 3843 mov OPENSSL_ia32cap_P+4(%rip),%r9d
73325b22
AP
3844 cmp \$0x70,$len
3845 jbe .Lcbc_dec_six_or_seven
3846
23f6eec7
AP
3847 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3848 sub \$0x50,$len # $len is biased by -5*16
5599c733 3849 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
23f6eec7
AP
3850 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3851 sub \$0x20,$len # $len is biased by -7*16
73325b22 3852 lea 0x70($key),$key # size optimization
f8501464 3853 jmp .Lcbc_dec_loop8_enter
d7d119a3 3854.align 16
f8501464 3855.Lcbc_dec_loop8:
f8501464
AP
3856 movups $inout7,($out)
3857 lea 0x10($out),$out
3858.Lcbc_dec_loop8_enter:
73325b22
AP
3859 movdqu 0x60($inp),$inout6
3860 pxor $rndkey0,$inout0
3861 movdqu 0x70($inp),$inout7
3862 pxor $rndkey0,$inout1
3863 $movkey 0x10-0x70($key),$rndkey1
3864 pxor $rndkey0,$inout2
384e6de4 3865 mov \$-1,$inp_
73325b22
AP
3866 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3867 pxor $rndkey0,$inout3
3868 pxor $rndkey0,$inout4
3869 pxor $rndkey0,$inout5
3870 pxor $rndkey0,$inout6
d7d119a3 3871
f8501464 3872 aesdec $rndkey1,$inout0
73325b22
AP
3873 pxor $rndkey0,$inout7
3874 $movkey 0x20-0x70($key),$rndkey0
f8501464 3875 aesdec $rndkey1,$inout1
f8501464 3876 aesdec $rndkey1,$inout2
f8501464 3877 aesdec $rndkey1,$inout3
f8501464 3878 aesdec $rndkey1,$inout4
f8501464 3879 aesdec $rndkey1,$inout5
f8501464 3880 aesdec $rndkey1,$inout6
384e6de4
AP
3881 adc \$0,$inp_
3882 and \$128,$inp_
f8501464 3883 aesdec $rndkey1,$inout7
73325b22
AP
3884 add $inp,$inp_
3885 $movkey 0x30-0x70($key),$rndkey1
3886___
3887for($i=1;$i<12;$i++) {
3888my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
d8ba0dc9
AP
3889$code.=<<___ if ($i==7);
3890 cmp \$11,$rounds
3891___
73325b22
AP
3892$code.=<<___;
3893 aesdec $rndkeyx,$inout0
3894 aesdec $rndkeyx,$inout1
3895 aesdec $rndkeyx,$inout2
3896 aesdec $rndkeyx,$inout3
3897 aesdec $rndkeyx,$inout4
3898 aesdec $rndkeyx,$inout5
3899 aesdec $rndkeyx,$inout6
3900 aesdec $rndkeyx,$inout7
3901 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3902___
d8ba0dc9
AP
3903$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3904 nop
3905___
73325b22 3906$code.=<<___ if ($i==7);
73325b22
AP
3907 jb .Lcbc_dec_done
3908___
3909$code.=<<___ if ($i==9);
3910 je .Lcbc_dec_done
3911___
d8ba0dc9
AP
3912$code.=<<___ if ($i==11);
3913 jmp .Lcbc_dec_done
3914___
73325b22
AP
3915}
3916$code.=<<___;
d8ba0dc9 3917.align 16
73325b22
AP
3918.Lcbc_dec_done:
3919 aesdec $rndkey1,$inout0
73325b22 3920 aesdec $rndkey1,$inout1
d8ba0dc9 3921 pxor $rndkey0,$iv
73325b22
AP
3922 pxor $rndkey0,$in0
3923 aesdec $rndkey1,$inout2
73325b22 3924 aesdec $rndkey1,$inout3
d8ba0dc9 3925 pxor $rndkey0,$in1
73325b22
AP
3926 pxor $rndkey0,$in2
3927 aesdec $rndkey1,$inout4
73325b22 3928 aesdec $rndkey1,$inout5
d8ba0dc9 3929 pxor $rndkey0,$in3
73325b22
AP
3930 pxor $rndkey0,$in4
3931 aesdec $rndkey1,$inout6
3932 aesdec $rndkey1,$inout7
3933 movdqu 0x50($inp),$rndkey1
d64a7232 3934
73325b22
AP
3935 aesdeclast $iv,$inout0
3936 movdqu 0x60($inp),$iv # borrow $iv
3937 pxor $rndkey0,$rndkey1
3938 aesdeclast $in0,$inout1
3939 pxor $rndkey0,$iv
3940 movdqu 0x70($inp),$rndkey0 # next IV
73325b22 3941 aesdeclast $in1,$inout2
d8ba0dc9 3942 lea 0x80($inp),$inp
73325b22
AP
3943 movdqu 0x00($inp_),$in0
3944 aesdeclast $in2,$inout3
73325b22 3945 aesdeclast $in3,$inout4
d8ba0dc9 3946 movdqu 0x10($inp_),$in1
73325b22
AP
3947 movdqu 0x20($inp_),$in2
3948 aesdeclast $in4,$inout5
73325b22 3949 aesdeclast $rndkey1,$inout6
d8ba0dc9 3950 movdqu 0x30($inp_),$in3
73325b22
AP
3951 movdqu 0x40($inp_),$in4
3952 aesdeclast $iv,$inout7
3953 movdqa $rndkey0,$iv # return $iv
3954 movdqu 0x50($inp_),$rndkey1
3955 $movkey -0x70($key),$rndkey0
3956
3957 movups $inout0,($out) # store output
3958 movdqa $in0,$inout0
3959 movups $inout1,0x10($out)
3960 movdqa $in1,$inout1
3961 movups $inout2,0x20($out)
3962 movdqa $in2,$inout2
3963 movups $inout3,0x30($out)
3964 movdqa $in3,$inout3
3965 movups $inout4,0x40($out)
3966 movdqa $in4,$inout4
3967 movups $inout5,0x50($out)
3968 movdqa $rndkey1,$inout5
3969 movups $inout6,0x60($out)
3970 lea 0x70($out),$out
f8501464 3971
f8501464
AP
3972 sub \$0x80,$len
3973 ja .Lcbc_dec_loop8
3974
3975 movaps $inout7,$inout0
73325b22 3976 lea -0x70($key),$key
f8501464 3977 add \$0x70,$len
23f6eec7 3978 jle .Lcbc_dec_clear_tail_collected
73325b22 3979 movups $inout7,($out)
f8501464 3980 lea 0x10($out),$out
73325b22
AP
3981 cmp \$0x50,$len
3982 jbe .Lcbc_dec_tail
3983
3984 movaps $in0,$inout0
3985.Lcbc_dec_six_or_seven:
3986 cmp \$0x60,$len
3987 ja .Lcbc_dec_seven
3988
3989 movaps $inout5,$inout6
3990 call _aesni_decrypt6
3991 pxor $iv,$inout0 # ^= IV
3992 movaps $inout6,$iv
3993 pxor $in0,$inout1
3994 movdqu $inout0,($out)
3995 pxor $in1,$inout2
3996 movdqu $inout1,0x10($out)
23f6eec7 3997 pxor $inout1,$inout1 # clear register bank
73325b22
AP
3998 pxor $in2,$inout3
3999 movdqu $inout2,0x20($out)
23f6eec7 4000 pxor $inout2,$inout2
73325b22
AP
4001 pxor $in3,$inout4
4002 movdqu $inout3,0x30($out)
23f6eec7 4003 pxor $inout3,$inout3
73325b22
AP
4004 pxor $in4,$inout5
4005 movdqu $inout4,0x40($out)
23f6eec7 4006 pxor $inout4,$inout4
73325b22
AP
4007 lea 0x50($out),$out
4008 movdqa $inout5,$inout0
23f6eec7 4009 pxor $inout5,$inout5
73325b22
AP
4010 jmp .Lcbc_dec_tail_collected
4011
4012.align 16
4013.Lcbc_dec_seven:
4014 movups 0x60($inp),$inout6
4015 xorps $inout7,$inout7
4016 call _aesni_decrypt8
4017 movups 0x50($inp),$inout7
4018 pxor $iv,$inout0 # ^= IV
4019 movups 0x60($inp),$iv
4020 pxor $in0,$inout1
4021 movdqu $inout0,($out)
4022 pxor $in1,$inout2
4023 movdqu $inout1,0x10($out)
23f6eec7 4024 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4025 pxor $in2,$inout3
4026 movdqu $inout2,0x20($out)
23f6eec7 4027 pxor $inout2,$inout2
73325b22
AP
4028 pxor $in3,$inout4
4029 movdqu $inout3,0x30($out)
23f6eec7 4030 pxor $inout3,$inout3
73325b22
AP
4031 pxor $in4,$inout5
4032 movdqu $inout4,0x40($out)
23f6eec7 4033 pxor $inout4,$inout4
73325b22
AP
4034 pxor $inout7,$inout6
4035 movdqu $inout5,0x50($out)
23f6eec7 4036 pxor $inout5,$inout5
73325b22
AP
4037 lea 0x60($out),$out
4038 movdqa $inout6,$inout0
23f6eec7
AP
4039 pxor $inout6,$inout6
4040 pxor $inout7,$inout7
73325b22
AP
4041 jmp .Lcbc_dec_tail_collected
4042
5599c733
AP
4043.align 16
4044.Lcbc_dec_loop6:
4045 movups $inout5,($out)
4046 lea 0x10($out),$out
4047 movdqu 0x00($inp),$inout0 # load input
4048 movdqu 0x10($inp),$inout1
4049 movdqa $inout0,$in0
4050 movdqu 0x20($inp),$inout2
4051 movdqa $inout1,$in1
4052 movdqu 0x30($inp),$inout3
4053 movdqa $inout2,$in2
4054 movdqu 0x40($inp),$inout4
4055 movdqa $inout3,$in3
4056 movdqu 0x50($inp),$inout5
4057 movdqa $inout4,$in4
4058.Lcbc_dec_loop6_enter:
4059 lea 0x60($inp),$inp
4060 movdqa $inout5,$inout6
4061
4062 call _aesni_decrypt6
4063
4064 pxor $iv,$inout0 # ^= IV
4065 movdqa $inout6,$iv
4066 pxor $in0,$inout1
4067 movdqu $inout0,($out)
4068 pxor $in1,$inout2
4069 movdqu $inout1,0x10($out)
4070 pxor $in2,$inout3
4071 movdqu $inout2,0x20($out)
4072 pxor $in3,$inout4
4073 mov $key_,$key
4074 movdqu $inout3,0x30($out)
4075 pxor $in4,$inout5
4076 mov $rnds_,$rounds
4077 movdqu $inout4,0x40($out)
4078 lea 0x50($out),$out
4079 sub \$0x60,$len
4080 ja .Lcbc_dec_loop6
4081
4082 movdqa $inout5,$inout0
4083 add \$0x50,$len
23f6eec7 4084 jle .Lcbc_dec_clear_tail_collected
5599c733
AP
4085 movups $inout5,($out)
4086 lea 0x10($out),$out
4087
6c83629b 4088.Lcbc_dec_tail:
d64a7232 4089 movups ($inp),$inout0
73325b22 4090 sub \$0x10,$len
23f6eec7 4091 jbe .Lcbc_dec_one # $len is 1*16 or less
f8501464 4092
d64a7232 4093 movups 0x10($inp),$inout1
73325b22
AP
4094 movaps $inout0,$in0
4095 sub \$0x10,$len
23f6eec7 4096 jbe .Lcbc_dec_two # $len is 2*16 or less
f8501464 4097
d64a7232 4098 movups 0x20($inp),$inout2
73325b22
AP
4099 movaps $inout1,$in1
4100 sub \$0x10,$len
23f6eec7 4101 jbe .Lcbc_dec_three # $len is 3*16 or less
f8501464 4102
d64a7232 4103 movups 0x30($inp),$inout3
73325b22
AP
4104 movaps $inout2,$in2
4105 sub \$0x10,$len
23f6eec7 4106 jbe .Lcbc_dec_four # $len is 4*16 or less
f8501464 4107
23f6eec7 4108 movups 0x40($inp),$inout4 # $len is 5*16 or less
73325b22
AP
4109 movaps $inout3,$in3
4110 movaps $inout4,$in4
4111 xorps $inout5,$inout5
4112 call _aesni_decrypt6
4113 pxor $iv,$inout0
4114 movaps $in4,$iv
4115 pxor $in0,$inout1
4116 movdqu $inout0,($out)
4117 pxor $in1,$inout2
4118 movdqu $inout1,0x10($out)
23f6eec7 4119 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4120 pxor $in2,$inout3
4121 movdqu $inout2,0x20($out)
23f6eec7 4122 pxor $inout2,$inout2
73325b22
AP
4123 pxor $in3,$inout4
4124 movdqu $inout3,0x30($out)
23f6eec7 4125 pxor $inout3,$inout3
73325b22
AP
4126 lea 0x40($out),$out
4127 movdqa $inout4,$inout0
23f6eec7
AP
4128 pxor $inout4,$inout4
4129 pxor $inout5,$inout5
73325b22 4130 sub \$0x10,$len
d64a7232 4131 jmp .Lcbc_dec_tail_collected
73325b22 4132
d64a7232
AP
4133.align 16
4134.Lcbc_dec_one:
73325b22 4135 movaps $inout0,$in0
d64a7232 4136___
d608b4d6 4137 &aesni_generate1("dec",$key,$rounds);
d64a7232 4138$code.=<<___;
f8501464 4139 xorps $iv,$inout0
d64a7232
AP
4140 movaps $in0,$iv
4141 jmp .Lcbc_dec_tail_collected
4142.align 16
4143.Lcbc_dec_two:
73325b22 4144 movaps $inout1,$in1
214368ff 4145 call _aesni_decrypt2
73325b22 4146 pxor $iv,$inout0
d64a7232 4147 movaps $in1,$iv
73325b22
AP
4148 pxor $in0,$inout1
4149 movdqu $inout0,($out)
4150 movdqa $inout1,$inout0
23f6eec7 4151 pxor $inout1,$inout1 # clear register bank
d64a7232
AP
4152 lea 0x10($out),$out
4153 jmp .Lcbc_dec_tail_collected
4154.align 16
4155.Lcbc_dec_three:
73325b22 4156 movaps $inout2,$in2
d608b4d6 4157 call _aesni_decrypt3
73325b22 4158 pxor $iv,$inout0
d64a7232 4159 movaps $in2,$iv
73325b22
AP
4160 pxor $in0,$inout1
4161 movdqu $inout0,($out)
4162 pxor $in1,$inout2
4163 movdqu $inout1,0x10($out)
23f6eec7 4164 pxor $inout1,$inout1 # clear register bank
73325b22 4165 movdqa $inout2,$inout0
23f6eec7 4166 pxor $inout2,$inout2
d64a7232 4167 lea 0x20($out),$out
f8501464
AP
4168 jmp .Lcbc_dec_tail_collected
4169.align 16
4170.Lcbc_dec_four:
73325b22 4171 movaps $inout3,$in3
f8501464 4172 call _aesni_decrypt4
73325b22
AP
4173 pxor $iv,$inout0
4174 movaps $in3,$iv
4175 pxor $in0,$inout1
4176 movdqu $inout0,($out)
4177 pxor $in1,$inout2
4178 movdqu $inout1,0x10($out)
23f6eec7 4179 pxor $inout1,$inout1 # clear register bank
73325b22
AP
4180 pxor $in2,$inout3
4181 movdqu $inout2,0x20($out)
23f6eec7 4182 pxor $inout2,$inout2
73325b22 4183 movdqa $inout3,$inout0
23f6eec7 4184 pxor $inout3,$inout3
f8501464 4185 lea 0x30($out),$out
d64a7232 4186 jmp .Lcbc_dec_tail_collected
73325b22 4187
d64a7232 4188.align 16
23f6eec7
AP
4189.Lcbc_dec_clear_tail_collected:
4190 pxor $inout1,$inout1 # clear register bank
4191 pxor $inout2,$inout2
4192 pxor $inout3,$inout3
4193___
4194$code.=<<___ if (!$win64);
4195 pxor $inout4,$inout4 # %xmm6..9
4196 pxor $inout5,$inout5
4197 pxor $inout6,$inout6
4198 pxor $inout7,$inout7
4199___
4200$code.=<<___;
d64a7232 4201.Lcbc_dec_tail_collected:
d64a7232 4202 movups $iv,($ivp)
73325b22 4203 and \$15,$len
d64a7232 4204 jnz .Lcbc_dec_tail_partial
f8501464 4205 movups $inout0,($out)
23f6eec7 4206 pxor $inout0,$inout0
d64a7232 4207 jmp .Lcbc_dec_ret
d7d119a3 4208.align 16
d64a7232 4209.Lcbc_dec_tail_partial:
6a40ebe8 4210 movaps $inout0,(%rsp)
23f6eec7 4211 pxor $inout0,$inout0
f8501464 4212 mov \$16,%rcx
d64a7232 4213 mov $out,%rdi
f8501464 4214 sub $len,%rcx
6a40ebe8 4215 lea (%rsp),%rsi
23f6eec7
AP
4216 .long 0x9066A4F3 # rep movsb
4217 movdqa $inout0,(%rsp)
d64a7232
AP
4218
4219.Lcbc_dec_ret:
23f6eec7
AP
4220 xorps $rndkey0,$rndkey0 # %xmm0
4221 pxor $rndkey1,$rndkey1
d64a7232
AP
4222___
4223$code.=<<___ if ($win64);
6a40ebe8 4224 movaps 0x10(%rsp),%xmm6
23f6eec7 4225 movaps %xmm0,0x10(%rsp) # clear stack
6a40ebe8 4226 movaps 0x20(%rsp),%xmm7
23f6eec7 4227 movaps %xmm0,0x20(%rsp)
6a40ebe8 4228 movaps 0x30(%rsp),%xmm8
23f6eec7 4229 movaps %xmm0,0x30(%rsp)
6a40ebe8 4230 movaps 0x40(%rsp),%xmm9
23f6eec7 4231 movaps %xmm0,0x40(%rsp)
73325b22 4232 movaps 0x50(%rsp),%xmm10
23f6eec7 4233 movaps %xmm0,0x50(%rsp)
73325b22 4234 movaps 0x60(%rsp),%xmm11
23f6eec7 4235 movaps %xmm0,0x60(%rsp)
73325b22 4236 movaps 0x70(%rsp),%xmm12
23f6eec7 4237 movaps %xmm0,0x70(%rsp)
73325b22 4238 movaps 0x80(%rsp),%xmm13
23f6eec7 4239 movaps %xmm0,0x80(%rsp)
73325b22 4240 movaps 0x90(%rsp),%xmm14
23f6eec7 4241 movaps %xmm0,0x90(%rsp)
73325b22 4242 movaps 0xa0(%rsp),%xmm15
23f6eec7 4243 movaps %xmm0,0xa0(%rsp)
d64a7232
AP
4244___
4245$code.=<<___;
384e6de4 4246 mov -8(%r11),%rbp
b84460ad 4247.cfi_restore %rbp
384e6de4 4248 lea (%r11),%rsp
b84460ad 4249.cfi_def_cfa_register %rsp
d64a7232
AP
4250.Lcbc_ret:
4251 ret
b84460ad 4252.cfi_endproc
d64a7232
AP
4253.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4254___
f8501464 4255} \f
23f6eec7 4256# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
d608b4d6 4257# int bits, AES_KEY *key)
23f6eec7
AP
4258#
4259# input: $inp user-supplied key
4260# $bits $inp length in bits
4261# $key pointer to key schedule
4262# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4263# *$key key schedule
4264#
d608b4d6
AP
4265{ my ($inp,$bits,$key) = @_4args;
4266 $bits =~ s/%r/%e/;
4267
d64a7232
AP
4268$code.=<<___;
4269.globl ${PREFIX}_set_decrypt_key
d608b4d6 4270.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
d64a7232
AP
4271.align 16
4272${PREFIX}_set_decrypt_key:
b84460ad 4273.cfi_startproc
d608b4d6 4274 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
b84460ad 4275.cfi_adjust_cfa_offset 8
fb2f3411 4276 call __aesni_set_encrypt_key
d608b4d6 4277 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
d64a7232
AP
4278 test %eax,%eax
4279 jnz .Ldec_key_ret
d608b4d6
AP
4280 lea 16($key,$bits),$inp # points at the end of key schedule
4281
4282 $movkey ($key),%xmm0 # just swap
4283 $movkey ($inp),%xmm1
4284 $movkey %xmm0,($inp)
4285 $movkey %xmm1,($key)
4286 lea 16($key),$key
4287 lea -16($inp),$inp
4288
d64a7232 4289.Ldec_key_inverse:
d608b4d6
AP
4290 $movkey ($key),%xmm0 # swap and inverse
4291 $movkey ($inp),%xmm1
d64a7232
AP
4292 aesimc %xmm0,%xmm0
4293 aesimc %xmm1,%xmm1
d608b4d6
AP
4294 lea 16($key),$key
4295 lea -16($inp),$inp
d608b4d6
AP
4296 $movkey %xmm0,16($inp)
4297 $movkey %xmm1,-16($key)
d7d119a3 4298 cmp $key,$inp
d64a7232
AP
4299 ja .Ldec_key_inverse
4300
d608b4d6 4301 $movkey ($key),%xmm0 # inverse middle
d64a7232 4302 aesimc %xmm0,%xmm0
23f6eec7 4303 pxor %xmm1,%xmm1
d608b4d6 4304 $movkey %xmm0,($inp)
23f6eec7 4305 pxor %xmm0,%xmm0
d64a7232 4306.Ldec_key_ret:
d608b4d6 4307 add \$8,%rsp
b84460ad 4308.cfi_adjust_cfa_offset -8
d64a7232 4309 ret
b84460ad 4310.cfi_endproc
d608b4d6 4311.LSEH_end_set_decrypt_key:
d64a7232
AP
4312.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4313___
4314\f
e3713c36
RS
4315# This is based on submission from Intel by
4316# Huang Ying
4317# Vinodh Gopal
d64a7232
AP
4318# Kahraman Akdemir
4319#
60250017 4320# Aggressively optimized with respect to aeskeygenassist's critical path
d64a7232
AP
4321# and is contained in %xmm0-5 to meet Win64 ABI requirement.
4322#
23f6eec7
AP
4323# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4324# int bits, AES_KEY * const key);
4325#
4326# input: $inp user-supplied key
4327# $bits $inp length in bits
4328# $key pointer to key schedule
4329# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4330# $bits rounds-1 (used in aesni_set_decrypt_key)
4331# *$key key schedule
4332# $key pointer to key schedule (used in
4333# aesni_set_decrypt_key)
4334#
4335# Subroutine is frame-less, which means that only volatile registers
4336# are used. Note that it's declared "abi-omnipotent", which means that
4337# the number of volatile registers is smaller on Windows.
4338#
d64a7232 4339$code.=<<___;
d608b4d6
AP
4340.globl ${PREFIX}_set_encrypt_key
4341.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
d64a7232 4342.align 16
d608b4d6 4343${PREFIX}_set_encrypt_key:
fb2f3411 4344__aesni_set_encrypt_key:
b84460ad 4345.cfi_startproc
d608b4d6 4346 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
b84460ad 4347.cfi_adjust_cfa_offset 8
d608b4d6 4348 mov \$-1,%rax
d7d119a3 4349 test $inp,$inp
d608b4d6
AP
4350 jz .Lenc_key_ret
4351 test $key,$key
4352 jz .Lenc_key_ret
4353
23f6eec7 4354 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
d608b4d6 4355 movups ($inp),%xmm0 # pull first 128 bits of *userKey
f8501464 4356 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
23f6eec7
AP
4357 and OPENSSL_ia32cap_P+4(%rip),%r10d
4358 lea 16($key),%rax # %rax is used as modifiable copy of $key
d608b4d6 4359 cmp \$256,$bits
d64a7232 4360 je .L14rounds
d608b4d6 4361 cmp \$192,$bits
d64a7232 4362 je .L12rounds
d608b4d6 4363 cmp \$128,$bits
d64a7232 4364 jne .Lbad_keybits
d608b4d6 4365
d64a7232 4366.L10rounds:
d608b4d6 4367 mov \$9,$bits # 10 rounds for 128-bit key
23f6eec7
AP
4368 cmp \$`1<<28`,%r10d # AVX, but no XOP
4369 je .L10rounds_alt
4370
d608b4d6 4371 $movkey %xmm0,($key) # round 0
d64a7232
AP
4372 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4373 call .Lkey_expansion_128_cold
4374 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4375 call .Lkey_expansion_128
4376 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4377 call .Lkey_expansion_128
4378 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4379 call .Lkey_expansion_128
4380 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4381 call .Lkey_expansion_128
4382 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4383 call .Lkey_expansion_128
4384 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4385 call .Lkey_expansion_128
4386 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4387 call .Lkey_expansion_128
4388 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4389 call .Lkey_expansion_128
4390 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4391 call .Lkey_expansion_128
d608b4d6
AP
4392 $movkey %xmm0,(%rax)
4393 mov $bits,80(%rax) # 240($key)
d64a7232 4394 xor %eax,%eax
d608b4d6 4395 jmp .Lenc_key_ret
d64a7232 4396
23f6eec7
AP
4397.align 16
4398.L10rounds_alt:
4399 movdqa .Lkey_rotate(%rip),%xmm5
4400 mov \$8,%r10d
4401 movdqa .Lkey_rcon1(%rip),%xmm4
4402 movdqa %xmm0,%xmm2
4403 movdqu %xmm0,($key)
4404 jmp .Loop_key128
4405
4406.align 16
4407.Loop_key128:
4408 pshufb %xmm5,%xmm0
4409 aesenclast %xmm4,%xmm0
4410 pslld \$1,%xmm4
4411 lea 16(%rax),%rax
4412
4413 movdqa %xmm2,%xmm3
4414 pslldq \$4,%xmm2
4415 pxor %xmm2,%xmm3
4416 pslldq \$4,%xmm2
4417 pxor %xmm2,%xmm3
4418 pslldq \$4,%xmm2
4419 pxor %xmm3,%xmm2
4420
4421 pxor %xmm2,%xmm0
4422 movdqu %xmm0,-16(%rax)
4423 movdqa %xmm0,%xmm2
4424
4425 dec %r10d
4426 jnz .Loop_key128
4427
4428 movdqa .Lkey_rcon1b(%rip),%xmm4
4429
4430 pshufb %xmm5,%xmm0
4431 aesenclast %xmm4,%xmm0
4432 pslld \$1,%xmm4
4433
4434 movdqa %xmm2,%xmm3
4435 pslldq \$4,%xmm2
4436 pxor %xmm2,%xmm3
4437 pslldq \$4,%xmm2
4438 pxor %xmm2,%xmm3
4439 pslldq \$4,%xmm2
4440 pxor %xmm3,%xmm2
4441
4442 pxor %xmm2,%xmm0
4443 movdqu %xmm0,(%rax)
4444
4445 movdqa %xmm0,%xmm2
4446 pshufb %xmm5,%xmm0
4447 aesenclast %xmm4,%xmm0
4448
4449 movdqa %xmm2,%xmm3
4450 pslldq \$4,%xmm2
4451 pxor %xmm2,%xmm3
4452 pslldq \$4,%xmm2
4453 pxor %xmm2,%xmm3
4454 pslldq \$4,%xmm2
4455 pxor %xmm3,%xmm2
4456
4457 pxor %xmm2,%xmm0
4458 movdqu %xmm0,16(%rax)
4459
4460 mov $bits,96(%rax) # 240($key)
4461 xor %eax,%eax
4462 jmp .Lenc_key_ret
4463
d64a7232
AP
4464.align 16
4465.L12rounds:
d608b4d6
AP
4466 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4467 mov \$11,$bits # 12 rounds for 192
23f6eec7
AP
4468 cmp \$`1<<28`,%r10d # AVX, but no XOP
4469 je .L12rounds_alt
4470
d608b4d6 4471 $movkey %xmm0,($key) # round 0
d64a7232
AP
4472 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4473 call .Lkey_expansion_192a_cold
4474 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4475 call .Lkey_expansion_192b
4476 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4477 call .Lkey_expansion_192a
4478 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4479 call .Lkey_expansion_192b
4480 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4481 call .Lkey_expansion_192a
4482 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4483 call .Lkey_expansion_192b
4484 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4485 call .Lkey_expansion_192a
4486 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4487 call .Lkey_expansion_192b
d608b4d6
AP
4488 $movkey %xmm0,(%rax)
4489 mov $bits,48(%rax) # 240($key)
d64a7232 4490 xor %rax, %rax
d608b4d6 4491 jmp .Lenc_key_ret
d64a7232 4492
23f6eec7
AP
4493.align 16
4494.L12rounds_alt:
4495 movdqa .Lkey_rotate192(%rip),%xmm5
4496 movdqa .Lkey_rcon1(%rip),%xmm4
4497 mov \$8,%r10d
4498 movdqu %xmm0,($key)
4499 jmp .Loop_key192
4500
4501.align 16
4502.Loop_key192:
4503 movq %xmm2,0(%rax)
4504 movdqa %xmm2,%xmm1
4505 pshufb %xmm5,%xmm2
4506 aesenclast %xmm4,%xmm2
4507 pslld \$1, %xmm4
4508 lea 24(%rax),%rax
4509
4510 movdqa %xmm0,%xmm3
4511 pslldq \$4,%xmm0
4512 pxor %xmm0,%xmm3
4513 pslldq \$4,%xmm0
4514 pxor %xmm0,%xmm3
4515 pslldq \$4,%xmm0
4516 pxor %xmm3,%xmm0
4517
4518 pshufd \$0xff,%xmm0,%xmm3
4519 pxor %xmm1,%xmm3
4520 pslldq \$4,%xmm1
4521 pxor %xmm1,%xmm3
4522
4523 pxor %xmm2,%xmm0
4524 pxor %xmm3,%xmm2
4525 movdqu %xmm0,-16(%rax)
4526
4527 dec %r10d
4528 jnz .Loop_key192
4529
4530 mov $bits,32(%rax) # 240($key)
4531 xor %eax,%eax
4532 jmp .Lenc_key_ret
4533
d64a7232
AP
4534.align 16
4535.L14rounds:
46f4e1be 4536 movups 16($inp),%xmm2 # remaining half of *userKey
d608b4d6
AP
4537 mov \$13,$bits # 14 rounds for 256
4538 lea 16(%rax),%rax
23f6eec7
AP
4539 cmp \$`1<<28`,%r10d # AVX, but no XOP
4540 je .L14rounds_alt
4541
d608b4d6
AP
4542 $movkey %xmm0,($key) # round 0
4543 $movkey %xmm2,16($key) # round 1
d64a7232
AP
4544 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4545 call .Lkey_expansion_256a_cold
4546 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4547 call .Lkey_expansion_256b
4548 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4549 call .Lkey_expansion_256a
4550 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4551 call .Lkey_expansion_256b
4552 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4553 call .Lkey_expansion_256a
4554 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4555 call .Lkey_expansion_256b
4556 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4557 call .Lkey_expansion_256a
4558 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4559 call .Lkey_expansion_256b
4560 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4561 call .Lkey_expansion_256a
4562 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4563 call .Lkey_expansion_256b
4564 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4565 call .Lkey_expansion_256a
4566 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4567 call .Lkey_expansion_256b
4568 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4569 call .Lkey_expansion_256a
d608b4d6
AP
4570 $movkey %xmm0,(%rax)
4571 mov $bits,16(%rax) # 240($key)
d64a7232 4572 xor %rax,%rax
d608b4d6
AP
4573 jmp .Lenc_key_ret
4574
23f6eec7
AP
4575.align 16
4576.L14rounds_alt:
4577 movdqa .Lkey_rotate(%rip),%xmm5
4578 movdqa .Lkey_rcon1(%rip),%xmm4
4579 mov \$7,%r10d
4580 movdqu %xmm0,0($key)
4581 movdqa %xmm2,%xmm1
4582 movdqu %xmm2,16($key)
4583 jmp .Loop_key256
4584
4585.align 16
4586.Loop_key256:
4587 pshufb %xmm5,%xmm2
4588 aesenclast %xmm4,%xmm2
4589
4590 movdqa %xmm0,%xmm3
4591 pslldq \$4,%xmm0
4592 pxor %xmm0,%xmm3
4593 pslldq \$4,%xmm0
4594 pxor %xmm0,%xmm3
4595 pslldq \$4,%xmm0
4596 pxor %xmm3,%xmm0
4597 pslld \$1,%xmm4
4598
4599 pxor %xmm2,%xmm0
4600 movdqu %xmm0,(%rax)
4601
4602 dec %r10d
4603 jz .Ldone_key256
4604
4605 pshufd \$0xff,%xmm0,%xmm2
4606 pxor %xmm3,%xmm3
4607 aesenclast %xmm3,%xmm2
4608
4609 movdqa %xmm1,%xmm3
4610 pslldq \$4,%xmm1
4611 pxor %xmm1,%xmm3
4612 pslldq \$4,%xmm1
4613 pxor %xmm1,%xmm3
4614 pslldq \$4,%xmm1
4615 pxor %xmm3,%xmm1
4616
4617 pxor %xmm1,%xmm2
4618 movdqu %xmm2,16(%rax)
4619 lea 32(%rax),%rax
4620 movdqa %xmm2,%xmm1
4621
4622 jmp .Loop_key256
4623
4624.Ldone_key256:
4625 mov $bits,16(%rax) # 240($key)
4626 xor %eax,%eax
4627 jmp .Lenc_key_ret
4628
d608b4d6
AP
4629.align 16
4630.Lbad_keybits:
4631 mov \$-2,%rax
4632.Lenc_key_ret:
23f6eec7
AP
4633 pxor %xmm0,%xmm0
4634 pxor %xmm1,%xmm1
4635 pxor %xmm2,%xmm2
4636 pxor %xmm3,%xmm3
4637 pxor %xmm4,%xmm4
4638 pxor %xmm5,%xmm5
d608b4d6 4639 add \$8,%rsp
b84460ad 4640.cfi_adjust_cfa_offset -8
d608b4d6 4641 ret
b84460ad 4642.cfi_endproc
d608b4d6
AP
4643.LSEH_end_set_encrypt_key:
4644\f
4645.align 16
4646.Lkey_expansion_128:
4647 $movkey %xmm0,(%rax)
4648 lea 16(%rax),%rax
4649.Lkey_expansion_128_cold:
4650 shufps \$0b00010000,%xmm0,%xmm4
f8501464 4651 xorps %xmm4, %xmm0
d608b4d6 4652 shufps \$0b10001100,%xmm0,%xmm4
f8501464
AP
4653 xorps %xmm4, %xmm0
4654 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4655 xorps %xmm1,%xmm0
d608b4d6
AP
4656 ret
4657
4658.align 16
4659.Lkey_expansion_192a:
4660 $movkey %xmm0,(%rax)
4661 lea 16(%rax),%rax
4662.Lkey_expansion_192a_cold:
4663 movaps %xmm2, %xmm5
4664.Lkey_expansion_192b_warm:
4665 shufps \$0b00010000,%xmm0,%xmm4
f8501464
AP
4666 movdqa %xmm2,%xmm3
4667 xorps %xmm4,%xmm0
d608b4d6
AP
4668 shufps \$0b10001100,%xmm0,%xmm4
4669 pslldq \$4,%xmm3
f8501464 4670 xorps %xmm4,%xmm0
d608b4d6
AP
4671 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4672 pxor %xmm3,%xmm2
4673 pxor %xmm1,%xmm0
4674 pshufd \$0b11111111,%xmm0,%xmm3
4675 pxor %xmm3,%xmm2
d64a7232
AP
4676 ret
4677
d608b4d6
AP
4678.align 16
4679.Lkey_expansion_192b:
4680 movaps %xmm0,%xmm3
4681 shufps \$0b01000100,%xmm0,%xmm5
4682 $movkey %xmm5,(%rax)
4683 shufps \$0b01001110,%xmm2,%xmm3
4684 $movkey %xmm3,16(%rax)
4685 lea 32(%rax),%rax
4686 jmp .Lkey_expansion_192b_warm
4687
d64a7232
AP
4688.align 16
4689.Lkey_expansion_256a:
d608b4d6
AP
4690 $movkey %xmm2,(%rax)
4691 lea 16(%rax),%rax
d64a7232
AP
4692.Lkey_expansion_256a_cold:
4693 shufps \$0b00010000,%xmm0,%xmm4
f8501464 4694 xorps %xmm4,%xmm0
d64a7232 4695 shufps \$0b10001100,%xmm0,%xmm4
f8501464
AP
4696 xorps %xmm4,%xmm0
4697 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4698 xorps %xmm1,%xmm0
d64a7232
AP
4699 ret
4700
4701.align 16
4702.Lkey_expansion_256b:
d608b4d6
AP
4703 $movkey %xmm0,(%rax)
4704 lea 16(%rax),%rax
d64a7232
AP
4705
4706 shufps \$0b00010000,%xmm2,%xmm4
f8501464 4707 xorps %xmm4,%xmm2
d64a7232 4708 shufps \$0b10001100,%xmm2,%xmm4
f8501464
AP
4709 xorps %xmm4,%xmm2
4710 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4711 xorps %xmm1,%xmm2
d64a7232 4712 ret
d608b4d6 4713.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
f8501464 4714.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
d64a7232
AP
4715___
4716}
4717\f
4718$code.=<<___;
6c83629b
AP
4719.align 64
4720.Lbswap_mask:
4721 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
d7d119a3 4722.Lincrement32:
f8501464 4723 .long 6,6,6,0
d7d119a3
AP
4724.Lincrement64:
4725 .long 1,0,0,0
f8501464
AP
4726.Lxts_magic:
4727 .long 0x87,0,1,0
9282c335
AP
4728.Lincrement1:
4729 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
23f6eec7
AP
4730.Lkey_rotate:
4731 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4732.Lkey_rotate192:
4733 .long 0x04070605,0x04070605,0x04070605,0x04070605
4734.Lkey_rcon1:
4735 .long 1,1,1,1
4736.Lkey_rcon1b:
4737 .long 0x1b,0x1b,0x1b,0x1b
f8501464 4738
d64a7232
AP
4739.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4740.align 64
4741___
4742
d608b4d6
AP
4743# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4744# CONTEXT *context,DISPATCHER_CONTEXT *disp)
4745if ($win64) {
4746$rec="%rcx";
4747$frame="%rdx";
4748$context="%r8";
4749$disp="%r9";
4750
4751$code.=<<___;
4752.extern __imp_RtlVirtualUnwind
6c83629b
AP
4753___
4754$code.=<<___ if ($PREFIX eq "aesni");
69d5747f 4755.type ecb_ccm64_se_handler,\@abi-omnipotent
d7d119a3 4756.align 16
69d5747f 4757ecb_ccm64_se_handler:
d7d119a3
AP
4758 push %rsi
4759 push %rdi
4760 push %rbx
4761 push %rbp
4762 push %r12
4763 push %r13
4764 push %r14
4765 push %r15
4766 pushfq
4767 sub \$64,%rsp
4768
4769 mov 120($context),%rax # pull context->Rax
4770 mov 248($context),%rbx # pull context->Rip
4771
4772 mov 8($disp),%rsi # disp->ImageBase
02f358da 4773 mov 56($disp),%r11 # disp->HandlerData
d7d119a3
AP
4774
4775 mov 0(%r11),%r10d # HandlerData[0]
4776 lea (%rsi,%r10),%r10 # prologue label
4777 cmp %r10,%rbx # context->Rip<prologue label
f8501464 4778 jb .Lcommon_seh_tail
d7d119a3
AP
4779
4780 mov 152($context),%rax # pull context->Rsp
4781
4782 mov 4(%r11),%r10d # HandlerData[1]
4783 lea (%rsi,%r10),%r10 # epilogue label
4784 cmp %r10,%rbx # context->Rip>=epilogue label
f8501464 4785 jae .Lcommon_seh_tail
d7d119a3 4786
f8501464 4787 lea 0(%rax),%rsi # %xmm save area
d7d119a3
AP
4788 lea 512($context),%rdi # &context.Xmm6
4789 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4790 .long 0xa548f3fc # cld; rep movsq
4791 lea 0x58(%rax),%rax # adjust stack pointer
4792
f8501464 4793 jmp .Lcommon_seh_tail
69d5747f 4794.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
d7d119a3 4795
6c79faaa 4796.type ctr_xts_se_handler,\@abi-omnipotent
6c83629b 4797.align 16
6c79faaa 4798ctr_xts_se_handler:
f8501464
AP
4799 push %rsi
4800 push %rdi
4801 push %rbx
4802 push %rbp
4803 push %r12
4804 push %r13
4805 push %r14
4806 push %r15
4807 pushfq
4808 sub \$64,%rsp
4809
4810 mov 120($context),%rax # pull context->Rax
4811 mov 248($context),%rbx # pull context->Rip
4812
4813 mov 8($disp),%rsi # disp->ImageBase
4814 mov 56($disp),%r11 # disp->HandlerData
4815
4816 mov 0(%r11),%r10d # HandlerData[0]
4817 lea (%rsi,%r10),%r10 # prologue label
4818 cmp %r10,%rbx # context->Rip<prologue label
4819 jb .Lcommon_seh_tail
4820
4821 mov 152($context),%rax # pull context->Rsp
4822
4823 mov 4(%r11),%r10d # HandlerData[1]
4824 lea (%rsi,%r10),%r10 # epilogue label
4825 cmp %r10,%rbx # context->Rip>=epilogue label
4826 jae .Lcommon_seh_tail
4827
384e6de4
AP
4828 mov 208($context),%rax # pull context->R11
4829
4830 lea -0xa8(%rax),%rsi # %xmm save area
f8501464
AP
4831 lea 512($context),%rdi # & context.Xmm6
4832 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4833 .long 0xa548f3fc # cld; rep movsq
f8501464 4834
384e6de4
AP
4835 mov -8(%rax),%rbp # restore saved %rbp
4836 mov %rbp,160($context) # restore context->Rbp
4837 jmp .Lcommon_seh_tail
6c79faaa 4838.size ctr_xts_se_handler,.-ctr_xts_se_handler
bd30091c
AP
4839
4840.type ocb_se_handler,\@abi-omnipotent
4841.align 16
4842ocb_se_handler:
4843 push %rsi
4844 push %rdi
4845 push %rbx
4846 push %rbp
4847 push %r12
4848 push %r13
4849 push %r14
4850 push %r15
4851 pushfq
4852 sub \$64,%rsp
4853
4854 mov 120($context),%rax # pull context->Rax
4855 mov 248($context),%rbx # pull context->Rip
4856
4857 mov 8($disp),%rsi # disp->ImageBase
4858 mov 56($disp),%r11 # disp->HandlerData
4859
4860 mov 0(%r11),%r10d # HandlerData[0]
4861 lea (%rsi,%r10),%r10 # prologue label
4862 cmp %r10,%rbx # context->Rip<prologue label
4863 jb .Lcommon_seh_tail
4864
4865 mov 4(%r11),%r10d # HandlerData[1]
4866 lea (%rsi,%r10),%r10 # epilogue label
4867 cmp %r10,%rbx # context->Rip>=epilogue label
4868 jae .Lcommon_seh_tail
4869
4870 mov 8(%r11),%r10d # HandlerData[2]
4871 lea (%rsi,%r10),%r10
4872 cmp %r10,%rbx # context->Rip>=pop label
4873 jae .Locb_no_xmm
4874
4875 mov 152($context),%rax # pull context->Rsp
4876
4877 lea (%rax),%rsi # %xmm save area
4878 lea 512($context),%rdi # & context.Xmm6
4879 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4880 .long 0xa548f3fc # cld; rep movsq
4881 lea 0xa0+0x28(%rax),%rax
4882
4883.Locb_no_xmm:
4884 mov -8(%rax),%rbx
4885 mov -16(%rax),%rbp
4886 mov -24(%rax),%r12
4887 mov -32(%rax),%r13
4888 mov -40(%rax),%r14
4889
4890 mov %rbx,144($context) # restore context->Rbx
4891 mov %rbp,160($context) # restore context->Rbp
4892 mov %r12,216($context) # restore context->R12
4893 mov %r13,224($context) # restore context->R13
4894 mov %r14,232($context) # restore context->R14
4895
4896 jmp .Lcommon_seh_tail
4897.size ocb_se_handler,.-ocb_se_handler
6c83629b
AP
4898___
4899$code.=<<___;
4900.type cbc_se_handler,\@abi-omnipotent
d608b4d6 4901.align 16
6c83629b 4902cbc_se_handler:
d608b4d6
AP
4903 push %rsi
4904 push %rdi
4905 push %rbx
4906 push %rbp
4907 push %r12
4908 push %r13
4909 push %r14
4910 push %r15
4911 pushfq
4912 sub \$64,%rsp
4913
4914 mov 152($context),%rax # pull context->Rsp
6c83629b
AP
4915 mov 248($context),%rbx # pull context->Rip
4916
23f6eec7 4917 lea .Lcbc_decrypt_bulk(%rip),%r10
6c83629b 4918 cmp %r10,%rbx # context->Rip<"prologue" label
f8501464 4919 jb .Lcommon_seh_tail
6c83629b 4920
384e6de4
AP
4921 mov 120($context),%rax # pull context->Rax
4922
6c83629b
AP
4923 lea .Lcbc_decrypt_body(%rip),%r10
4924 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
384e6de4
AP
4925 jb .Lcommon_seh_tail
4926
4927 mov 152($context),%rax # pull context->Rsp
6c83629b
AP
4928
4929 lea .Lcbc_ret(%rip),%r10
4930 cmp %r10,%rbx # context->Rip>="epilogue" label
f8501464 4931 jae .Lcommon_seh_tail
6c83629b 4932
6a40ebe8 4933 lea 16(%rax),%rsi # %xmm save area
6c83629b 4934 lea 512($context),%rdi # &context.Xmm6
73325b22 4935 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
6c83629b 4936 .long 0xa548f3fc # cld; rep movsq
6a40ebe8 4937
384e6de4 4938 mov 208($context),%rax # pull context->R11
6c83629b 4939
384e6de4
AP
4940 mov -8(%rax),%rbp # restore saved %rbp
4941 mov %rbp,160($context) # restore context->Rbp
f8501464
AP
4942
4943.Lcommon_seh_tail:
d608b4d6
AP
4944 mov 8(%rax),%rdi
4945 mov 16(%rax),%rsi
6c83629b 4946 mov %rax,152($context) # restore context->Rsp
d608b4d6
AP
4947 mov %rsi,168($context) # restore context->Rsi
4948 mov %rdi,176($context) # restore context->Rdi
4949
d608b4d6
AP
4950 mov 40($disp),%rdi # disp->ContextRecord
4951 mov $context,%rsi # context
4952 mov \$154,%ecx # sizeof(CONTEXT)
4953 .long 0xa548f3fc # cld; rep movsq
4954
4955 mov $disp,%rsi
4956 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4957 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4958 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4959 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4960 mov 40(%rsi),%r10 # disp->ContextRecord
4961 lea 56(%rsi),%r11 # &disp->HandlerData
4962 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4963 mov %r10,32(%rsp) # arg5
4964 mov %r11,40(%rsp) # arg6
4965 mov %r12,48(%rsp) # arg7
4966 mov %rcx,56(%rsp) # arg8, (NULL)
4967 call *__imp_RtlVirtualUnwind(%rip)
4968
4969 mov \$1,%eax # ExceptionContinueSearch
4970 add \$64,%rsp
4971 popfq
4972 pop %r15
4973 pop %r14
4974 pop %r13
4975 pop %r12
4976 pop %rbp
4977 pop %rbx
4978 pop %rdi
4979 pop %rsi
4980 ret
4981.size cbc_se_handler,.-cbc_se_handler
4982
4983.section .pdata
4984.align 4
6c83629b
AP
4985___
4986$code.=<<___ if ($PREFIX eq "aesni");
4987 .rva .LSEH_begin_aesni_ecb_encrypt
4988 .rva .LSEH_end_aesni_ecb_encrypt
d608b4d6
AP
4989 .rva .LSEH_info_ecb
4990
d7d119a3
AP
4991 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4992 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
02f358da 4993 .rva .LSEH_info_ccm64_enc
d7d119a3
AP
4994
4995 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4996 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
02f358da 4997 .rva .LSEH_info_ccm64_dec
d7d119a3 4998
6c83629b
AP
4999 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
5000 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
5001 .rva .LSEH_info_ctr32
f8501464
AP
5002
5003 .rva .LSEH_begin_aesni_xts_encrypt
5004 .rva .LSEH_end_aesni_xts_encrypt
5005 .rva .LSEH_info_xts_enc
5006
5007 .rva .LSEH_begin_aesni_xts_decrypt
5008 .rva .LSEH_end_aesni_xts_decrypt
5009 .rva .LSEH_info_xts_dec
bd30091c
AP
5010
5011 .rva .LSEH_begin_aesni_ocb_encrypt
5012 .rva .LSEH_end_aesni_ocb_encrypt
5013 .rva .LSEH_info_ocb_enc
5014
5015 .rva .LSEH_begin_aesni_ocb_decrypt
5016 .rva .LSEH_end_aesni_ocb_decrypt
5017 .rva .LSEH_info_ocb_dec
6c83629b
AP
5018___
5019$code.=<<___;
d608b4d6
AP
5020 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5021 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5022 .rva .LSEH_info_cbc
5023
d608b4d6
AP
5024 .rva ${PREFIX}_set_decrypt_key
5025 .rva .LSEH_end_set_decrypt_key
5026 .rva .LSEH_info_key
c5036d78
AP
5027
5028 .rva ${PREFIX}_set_encrypt_key
5029 .rva .LSEH_end_set_encrypt_key
5030 .rva .LSEH_info_key
d608b4d6
AP
5031.section .xdata
5032.align 8
6c83629b
AP
5033___
5034$code.=<<___ if ($PREFIX eq "aesni");
d608b4d6
AP
5035.LSEH_info_ecb:
5036 .byte 9,0,0,0
69d5747f
AP
5037 .rva ecb_ccm64_se_handler
5038 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
02f358da 5039.LSEH_info_ccm64_enc:
d7d119a3 5040 .byte 9,0,0,0
69d5747f 5041 .rva ecb_ccm64_se_handler
02f358da
AP
5042 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5043.LSEH_info_ccm64_dec:
5044 .byte 9,0,0,0
69d5747f 5045 .rva ecb_ccm64_se_handler
02f358da 5046 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
6c83629b
AP
5047.LSEH_info_ctr32:
5048 .byte 9,0,0,0
6c79faaa
AP
5049 .rva ctr_xts_se_handler
5050 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
f8501464
AP
5051.LSEH_info_xts_enc:
5052 .byte 9,0,0,0
6c79faaa 5053 .rva ctr_xts_se_handler
f8501464
AP
5054 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5055.LSEH_info_xts_dec:
5056 .byte 9,0,0,0
6c79faaa 5057 .rva ctr_xts_se_handler
f8501464 5058 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
bd30091c
AP
5059.LSEH_info_ocb_enc:
5060 .byte 9,0,0,0
5061 .rva ocb_se_handler
5062 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5063 .rva .Locb_enc_pop
5064 .long 0
5065.LSEH_info_ocb_dec:
5066 .byte 9,0,0,0
5067 .rva ocb_se_handler
5068 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5069 .rva .Locb_dec_pop
5070 .long 0
6c83629b
AP
5071___
5072$code.=<<___;
d608b4d6
AP
5073.LSEH_info_cbc:
5074 .byte 9,0,0,0
5075 .rva cbc_se_handler
5076.LSEH_info_key:
5077 .byte 0x01,0x04,0x01,0x00
d7d119a3 5078 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
d608b4d6
AP
5079___
5080}
5081
# Append a REX prefix byte to an opcode list when a high register is used.
#
#   $bytes - reference to the array of opcode bytes assembled so far
#   $dst   - number of the register that callers encode in ModR/M bits 3-5
#   $src   - number of the register that callers encode in ModR/M bits 0-2
#
# Bit 0x04 is set for $dst>=8 and bit 0x01 for $src>=8; the byte pushed is
# those bits or-ed with 0x40.  Nothing is pushed when both numbers are
# below 8.
sub rex {
  my ($bytes,$dst,$src)=@_;
  my $bits=0;

  if ($dst>=8) { $bits|=0x04; }
  if ($src>=8) { $bits|=0x01; }
  push @{$bytes},0x40|$bits if($bits);
}
5091
# Translate one AES-NI instruction into a raw ".byte" directive.
#
#   $line - text of a single instruction in AT&T syntax
#
# Three operand forms are recognised:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD   (8-bit displacement)
#
# Returns ".byte\t" followed by the comma-separated encoding, or $line
# unchanged when the text is not one of the forms above.  An unknown
# aes* mnemonic is passed through as well, rather than returning undef:
# the caller substitutes the return value into the output, so undef
# would silently erase the instruction.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);		# every encoding emitted here starts with 0x66

  if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# copy captures before any further pattern match can reset them
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# immediates with a leading 0 (0x.., 0..) go through oct()
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
  }
  elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: leave it for the assembler to accept or reject
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
  }
  elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: leave it for the assembler to accept or reject
	return $line if (!defined($opcodelet{$mnemonic}));
	push @opcode,0x44 if ($dst>=8);			# prefix for %xmm8-%xmm15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;		# ModR/M
	# displacement is truncated to a single byte
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
  }
  return $line;
}
5131
5599c733
AP
# Build the ".byte" directive that stands in for "movbe %eax,disp(%rsp)".
#
#   $disp - displacement text, appended verbatim as the final byte
#
# Returns the fixed byte sequence followed by $disp.
sub movbe {
  my ($disp)=@_;
  return ".byte 0x0f,0x38,0xf1,0x44,0x24,".$disp;
}
5135
d64a7232
AP
# Post-process the accumulated assembly text, then write it out.

# Replace every `...` span with the value of the enclosed Perl expression.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand every aes* instruction that names an %xmm register to aesni();
# anything after the last %xmm operand on that line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# Replace "movbe %eax,N(%rsp)" with the output of movbe(N).
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors only surface at close, so fail loudly instead of
# exiting successfully with truncated output.
close STDOUT or die "error closing STDOUT: $!";