]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
c918d8e2 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
d64a7232 AP |
9 | # |
10 | # ==================================================================== | |
d8ba0dc9 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
d64a7232 AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # This module implements support for Intel AES-NI extension. In | |
18 | # OpenSSL context it's used with Intel engine, but can also be used as | |
19 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for | |
20 | # details]. | |
d7d119a3 AP |
21 | # |
22 | # Performance. | |
23 | # | |
24 | # Given aes(enc|dec) instructions' latency asymptotic performance for | |
25 | # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte | |
26 | # processed with 128-bit key. And given their throughput asymptotic | |
27 | # performance for parallelizable modes is 1.25 cycles per byte. Being | |
f8501464 | 28 | # asymptotic limit it's not something you commonly achieve in reality, |
d7d119a3 AP |
29 | # but how close does one get? Below are results collected for |
30 | # different modes and block sizes. Pairs of numbers are for en-/ | |
31 | # decryption. | |
32 | # | |
33 | # 16-byte 64-byte 256-byte 1-KB 8-KB | |
34 | # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 | |
35 | # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 | |
36 | # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 | |
609b0852 | 37 | # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 |
d7d119a3 AP |
38 | # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 |
39 | # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 | |
40 | # | |
41 | # ECB, CTR, CBC and CCM results are free from EVP overhead. This means | |
42 | # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni | |
43 | # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. | |
44 | # The results were collected with specially crafted speed.c benchmark | |
45 | # in order to compare them with results reported in "Intel Advanced | |
46 | # Encryption Standard (AES) New Instruction Set" White Paper Revision | |
47 | # 3.0 dated May 2010. All above results are consistently better. This | |
48 | # module also provides better performance for block sizes smaller than | |
49 | # 128 bytes in points *not* represented in the above table. | |
50 | # | |
51 | # Looking at the results for 8-KB buffer. | |
52 | # | |
53 | # CFB and OFB results are far from the limit, because implementation | |
54 | # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on | |
55 | # single-block aesni_encrypt, which is not the most optimal way to go. | |
56 | # CBC encrypt result is unexpectedly high and there is no documented | |
57 | # explanation for it. Seemingly there is a small penalty for feeding | |
58 | # the result back to AES unit the way it's done in CBC mode. There is | |
59 | # nothing one can do and the result appears optimal. CCM result is | |
60 | # identical to CBC, because CBC-MAC is essentially CBC encrypt without | |
61 | # saving output. CCM CTR "stays invisible," because it's neatly | |
79c44b4e | 62 | # interleaved with CBC-MAC. This provides ~30% improvement over |
46f4e1be | 63 | # "straightforward" CCM implementation with CTR and CBC-MAC performed |
d7d119a3 AP |
64 | # disjointly. Parallelizable modes practically achieve the theoretical |
65 | # limit. | |
66 | # | |
67 | # Looking at how results vary with buffer size. | |
68 | # | |
69 | # Curves are practically saturated at 1-KB buffer size. In most cases | |
70 | # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. | |
71 | # CTR curve doesn't follow this pattern and is "slowest" changing one | |
72 | # with "256-byte" result being 87% of "8-KB." This is because overhead | |
73 | # in CTR mode is most computationally intensive. Small-block CCM | |
74 | # decrypt is slower than encrypt, because first CTR and last CBC-MAC | |
75 | # iterations can't be interleaved. | |
76 | # | |
77 | # Results for 192- and 256-bit keys. | |
78 | # | |
79 | # EVP-free results were observed to scale perfectly with number of | |
80 | # rounds for larger block sizes, i.e. 192-bit result being 10/12 times | |
81 | # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences | |
82 | # are a tad smaller, because the above mentioned penalty biases all | |
83 | # results by same constant value. In similar way function call | |
84 | # overhead affects small-block performance, as well as OFB and CFB | |
85 | # results. Differences are not large, most common coefficients are | |
86 | # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one | |
02f358da | 87 | # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... |
d64a7232 | 88 | |
f8501464 AP |
89 | # January 2011 |
90 | # | |
91 | # While Westmere processor features 6 cycles latency for aes[enc|dec] | |
92 | # instructions, which can be scheduled every second cycle, Sandy | |
93 | # Bridge spends 8 cycles per instruction, but it can schedule them | |
94 | # every cycle. This means that code targeting Westmere would perform | |
95 | # suboptimally on Sandy Bridge. Therefore this update. | |
96 | # | |
97 | # In addition, non-parallelizable CBC encrypt (as well as CCM) is | |
98 | # optimized. Relative improvement might appear modest, 8% on Westmere, | |
99 | # but in absolute terms it's 3.77 cycles per byte encrypted with | |
100 | # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers | |
101 | # should be compared to asymptotic limits of 3.75 for Westmere and | |
102 | # 5.00 for Sandy Bridge. Actually, the fact that they get this close | |
103 | # to asymptotic limits is quite amazing. Indeed, the limit is | |
104 | # calculated as latency times number of rounds, 10 for 128-bit key, | |
105 | # and divided by 16, the number of bytes in block, or in other words | |
106 | # it accounts *solely* for aesenc instructions. But there are extra | |
107 | # instructions, and numbers so close to the asymptotic limits mean | |
108 | # that it's as if it takes as little as *one* additional cycle to | |
109 | # execute all of them. How is it possible? It is possible thanks to | |
110 | # out-of-order execution logic, which manages to overlap post- | |
111 | # processing of previous block, things like saving the output, with | |
112 | # actual encryption of current block, as well as pre-processing of | |
113 | # current block, things like fetching input and xor-ing it with | |
114 | # 0-round element of the key schedule, with actual encryption of | |
115 | # previous block. Keep this in mind... | |
116 | # | |
117 | # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher | |
118 | # performance is achieved by interleaving instructions working on | |
119 | # independent blocks. In which case asymptotic limit for such modes | |
120 | # can be obtained by dividing above mentioned numbers by AES | |
609b0852 | 121 | # instructions' interleave factor. Westmere can execute at most 3 |
f8501464 AP |
122 | # instructions at a time, meaning that optimal interleave factor is 3, |
123 | # and that's where the "magic" number of 1.25 comes from. "Optimal | |
124 | # interleave factor" means that increase of interleave factor does | |
125 | # not improve performance. The formula has proven to reflect reality | |
126 | # pretty well on Westmere... Sandy Bridge on the other hand can | |
127 | # execute up to 8 AES instructions at a time, so how does varying | |
128 | # interleave factor affect the performance? Here is a table for ECB | |
129 | # (numbers are cycles per byte processed with 128-bit key): | |
130 | # | |
131 | # instruction interleave factor 3x 6x 8x | |
132 | # theoretical asymptotic limit 1.67 0.83 0.625 | |
133 | # measured performance for 8KB block 1.05 0.86 0.84 | |
134 | # | |
135 | # "as if" interleave factor 4.7x 5.8x 6.0x | |
136 | # | |
137 | # Further data for other parallelizable modes: | |
138 | # | |
73325b22 | 139 | # CBC decrypt 1.16 0.93 0.74 |
cd54249c | 140 | # CTR 1.14 0.91 0.74 |
f8501464 AP |
141 | # |
142 | # Well, given 3x column it's probably inappropriate to call the limit | |
143 | # asymptotic, if it can be surpassed, isn't it? What happens there? | |
144 | # Rewind to CBC paragraph for the answer. Yes, out-of-order execution | |
145 | # magic is responsible for this. Processor overlaps not only the | |
46f4e1be | 146 | # additional instructions with AES ones, but even AES instructions |
f8501464 AP |
147 | # processing adjacent triplets of independent blocks. In the 6x case |
148 | # additional instructions still claim disproportionally small amount | |
149 | # of additional cycles, but in 8x case number of instructions must be | |
150 | # a tad too high for out-of-order logic to cope with, and AES unit | |
151 | # remains underutilized... As you can see 8x interleave is hardly | |
152 | # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl | |
46f4e1be | 153 | # utilizes 6x interleave because of limited register bank capacity. |
f8501464 AP |
154 | # |
155 | # Higher interleave factors do have negative impact on Westmere | |
156 | # performance. While for ECB mode it's negligible ~1.5%, other | |
157 | # parallelizables perform ~5% worse, which is outweighed by ~25% | |
158 | # improvement on Sandy Bridge. To balance regression on Westmere | |
159 | # CTR mode was implemented with 6x aesenc interleave factor. | |
160 | ||
161 | # April 2011 | |
162 | # | |
36df342f AP |
163 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing |
164 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like | |
f8501464 AP |
165 | # in CTR mode AES instruction interleave factor was chosen to be 6x. |
166 | ||
bd30091c AP |
167 | # November 2015 |
168 | # | |
169 | # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was | |
170 | # chosen to be 6x. | |
171 | ||
d2e18031 | 172 | ###################################################################### |
5599c733 AP |
173 | # Current large-block performance in cycles per byte processed with |
174 | # 128-bit key (less is better). | |
175 | # | |
bd30091c | 176 | # CBC en-/decrypt CTR XTS ECB OCB |
5599c733 | 177 | # Westmere 3.77/1.25 1.25 1.25 1.26 |
bd30091c AP |
178 | # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 |
179 | # Haswell 4.44/0.63 0.63 0.73 0.63 0.70 | |
b7f5503f | 180 | # Skylake 2.62/0.63 0.63 0.63 0.63 |
bd30091c | 181 | # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 |
64d92d74 | 182 | # Knights L 2.54/0.77 0.78 0.85 - 1.50 |
ace05265 | 183 | # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 |
bd30091c | 184 | # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 |
54f8f9a1 | 185 | # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 |
5599c733 | 186 | # |
23f6eec7 AP |
187 | # (*) Atom Silvermont ECB result is suboptimal because of penalties |
188 | # incurred by operations on %xmm8-15. As ECB is not considered | |
5599c733 | 189 | # critical, nothing was done to mitigate the problem. |
d8ba0dc9 | 190 | |
d64a7232 AP |
191 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script |
192 | # generates drop-in replacement for | |
193 | # crypto/aes/asm/aes-x86_64.pl:-) | |
194 | ||
1aa89a7a RL |
195 | # $output is the last argument if it looks like a file (it has an extension) |
196 | # $flavour is the first argument if it doesn't look like a file | |
197 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
198 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
d64a7232 AP |
199 | |
200 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
201 | ||
202 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
203 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
204 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
205 | die "can't locate x86_64-xlate.pl"; | |
206 | ||
1aa89a7a RL |
207 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" |
208 | or die "can't call $xlate: $!"; | |
46bf83f0 | 209 | *STDOUT=*OUT; |
d64a7232 | 210 | |
8da721ee | 211 | $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; |
d608b4d6 AP |
212 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
213 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | |
d64a7232 AP |
214 | |
215 | $code=".text\n"; | |
5599c733 | 216 | $code.=".extern OPENSSL_ia32cap_P\n"; |
d64a7232 AP |
217 | |
218 | $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! | |
d608b4d6 | 219 | # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... |
d64a7232 AP |
220 | $inp="%rdi"; |
221 | $out="%rsi"; | |
d64a7232 AP |
222 | $len="%rdx"; |
223 | $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! | |
d7d119a3 | 224 | $ivp="%r8"; # cbc, ctr, ... |
d64a7232 AP |
225 | |
226 | $rnds_="%r10d"; # backup copy for $rounds | |
227 | $key_="%r11"; # backup copy for $key | |
228 | ||
229 | # %xmm register layout | |
f8501464 AP |
230 | $rndkey0="%xmm0"; $rndkey1="%xmm1"; |
231 | $inout0="%xmm2"; $inout1="%xmm3"; | |
232 | $inout2="%xmm4"; $inout3="%xmm5"; | |
233 | $inout4="%xmm6"; $inout5="%xmm7"; | |
234 | $inout6="%xmm8"; $inout7="%xmm9"; | |
235 | ||
236 | $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... | |
237 | $in0="%xmm8"; $iv="%xmm9"; | |
d64a7232 AP |
238 | \f |
239 | # Inline version of internal aesni_[en|de]crypt1. | |
240 | # | |
241 | # Why folded loop? Because aes[enc|dec] is slow enough to accommodate | |
242 | # cycles which take care of loop variables... | |
243 | { my $sn; | |
d608b4d6 | 244 | sub aesni_generate1 { |
f8501464 | 245 | my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); |
d64a7232 AP |
246 | ++$sn; |
247 | $code.=<<___; | |
f8501464 | 248 | $movkey ($key),$rndkey0 |
d64a7232 | 249 | $movkey 16($key),$rndkey1 |
f8501464 AP |
250 | ___ |
251 | $code.=<<___ if (defined($ivec)); | |
252 | xorps $rndkey0,$ivec | |
253 | lea 32($key),$key | |
254 | xorps $ivec,$inout | |
255 | ___ | |
256 | $code.=<<___ if (!defined($ivec)); | |
d608b4d6 | 257 | lea 32($key),$key |
f8501464 AP |
258 | xorps $rndkey0,$inout |
259 | ___ | |
260 | $code.=<<___; | |
d608b4d6 | 261 | .Loop_${p}1_$sn: |
d7d119a3 | 262 | aes${p} $rndkey1,$inout |
d64a7232 | 263 | dec $rounds |
d64a7232 | 264 | $movkey ($key),$rndkey1 |
d64a7232 | 265 | lea 16($key),$key |
d608b4d6 | 266 | jnz .Loop_${p}1_$sn # loop body is 16 bytes |
d7d119a3 | 267 | aes${p}last $rndkey1,$inout |
d64a7232 AP |
268 | ___ |
269 | }} | |
d608b4d6 | 270 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); |
d64a7232 | 271 | # |
d608b4d6 AP |
272 | { my ($inp,$out,$key) = @_4args; |
273 | ||
d64a7232 AP |
274 | $code.=<<___; |
275 | .globl ${PREFIX}_encrypt | |
d608b4d6 | 276 | .type ${PREFIX}_encrypt,\@abi-omnipotent |
d64a7232 AP |
277 | .align 16 |
278 | ${PREFIX}_encrypt: | |
c0e8e500 | 279 | .cfi_startproc |
f8501464 AP |
280 | movups ($inp),$inout0 # load input |
281 | mov 240($key),$rounds # key->rounds | |
d64a7232 | 282 | ___ |
d608b4d6 | 283 | &aesni_generate1("enc",$key,$rounds); |
d64a7232 | 284 | $code.=<<___; |
23f6eec7 AP |
285 | pxor $rndkey0,$rndkey0 # clear register bank |
286 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 287 | movups $inout0,($out) # output |
23f6eec7 | 288 | pxor $inout0,$inout0 |
d64a7232 | 289 | ret |
c0e8e500 | 290 | .cfi_endproc |
d64a7232 | 291 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt |
d64a7232 | 292 | |
d64a7232 | 293 | .globl ${PREFIX}_decrypt |
d608b4d6 | 294 | .type ${PREFIX}_decrypt,\@abi-omnipotent |
d64a7232 AP |
295 | .align 16 |
296 | ${PREFIX}_decrypt: | |
c0e8e500 | 297 | .cfi_startproc |
f8501464 AP |
298 | movups ($inp),$inout0 # load input |
299 | mov 240($key),$rounds # key->rounds | |
d64a7232 | 300 | ___ |
d608b4d6 | 301 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 302 | $code.=<<___; |
23f6eec7 AP |
303 | pxor $rndkey0,$rndkey0 # clear register bank |
304 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 305 | movups $inout0,($out) # output |
23f6eec7 | 306 | pxor $inout0,$inout0 |
d64a7232 | 307 | ret |
c0e8e500 | 308 | .cfi_endproc |
d64a7232 AP |
309 | .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt |
310 | ___ | |
d608b4d6 | 311 | } |
d64a7232 | 312 | \f |
f8501464 AP |
313 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave |
314 | # factor. Why were 3x subroutines originally used in loops? Even though | |
315 | # aes[enc|dec] latency was originally 6, it could be scheduled only | |
316 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | |
d608b4d6 AP |
317 | # utilization, i.e. when subroutine's throughput is virtually same as |
318 | # of non-interleaved subroutine [for number of input blocks up to 3]. | |
214368ff AP |
319 | # This is why it originally made no sense to implement 2x subroutine. |
320 | # But times change and it became appropriate to spend extra 192 bytes | |
321 | # on 2x subroutine on Atom Silvermont account. For processors that | |
322 | # can schedule aes[enc|dec] every cycle, the optimal interleave factor | |
323 | # equals the corresponding instruction's latency. 8x is optimal for | |
609b0852 | 324 | # * Bridge and "super-optimal" for other Intel CPUs... |
214368ff AP |
325 | |
326 | sub aesni_generate2 { | |
327 | my $dir=shift; | |
328 | # As already mentioned it takes in $key and $rounds, which are *not* | |
329 | # preserved. $inout[0-1] is cipher/clear text... | |
330 | $code.=<<___; | |
331 | .type _aesni_${dir}rypt2,\@abi-omnipotent | |
332 | .align 16 | |
333 | _aesni_${dir}rypt2: | |
c0e8e500 | 334 | .cfi_startproc |
214368ff AP |
335 | $movkey ($key),$rndkey0 |
336 | shl \$4,$rounds | |
337 | $movkey 16($key),$rndkey1 | |
338 | xorps $rndkey0,$inout0 | |
339 | xorps $rndkey0,$inout1 | |
340 | $movkey 32($key),$rndkey0 | |
341 | lea 32($key,$rounds),$key | |
342 | neg %rax # $rounds | |
343 | add \$16,%rax | |
344 | ||
345 | .L${dir}_loop2: | |
346 | aes${dir} $rndkey1,$inout0 | |
347 | aes${dir} $rndkey1,$inout1 | |
348 | $movkey ($key,%rax),$rndkey1 | |
349 | add \$32,%rax | |
350 | aes${dir} $rndkey0,$inout0 | |
351 | aes${dir} $rndkey0,$inout1 | |
352 | $movkey -16($key,%rax),$rndkey0 | |
353 | jnz .L${dir}_loop2 | |
354 | ||
355 | aes${dir} $rndkey1,$inout0 | |
356 | aes${dir} $rndkey1,$inout1 | |
357 | aes${dir}last $rndkey0,$inout0 | |
358 | aes${dir}last $rndkey0,$inout1 | |
359 | ret | |
c0e8e500 | 360 | .cfi_endproc |
214368ff AP |
361 | .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 |
362 | ___ | |
363 | } | |
d608b4d6 | 364 | sub aesni_generate3 { |
d64a7232 AP |
365 | my $dir=shift; |
366 | # As already mentioned it takes in $key and $rounds, which are *not* | |
d608b4d6 | 367 | # preserved. $inout[0-2] is cipher/clear text... |
d64a7232 | 368 | $code.=<<___; |
d608b4d6 | 369 | .type _aesni_${dir}rypt3,\@abi-omnipotent |
d64a7232 | 370 | .align 16 |
d608b4d6 | 371 | _aesni_${dir}rypt3: |
c0e8e500 | 372 | .cfi_startproc |
d64a7232 | 373 | $movkey ($key),$rndkey0 |
d8ba0dc9 | 374 | shl \$4,$rounds |
d64a7232 | 375 | $movkey 16($key),$rndkey1 |
f8501464 AP |
376 | xorps $rndkey0,$inout0 |
377 | xorps $rndkey0,$inout1 | |
378 | xorps $rndkey0,$inout2 | |
d8ba0dc9 AP |
379 | $movkey 32($key),$rndkey0 |
380 | lea 32($key,$rounds),$key | |
381 | neg %rax # $rounds | |
382 | add \$16,%rax | |
d608b4d6 AP |
383 | |
384 | .L${dir}_loop3: | |
385 | aes${dir} $rndkey1,$inout0 | |
d608b4d6 | 386 | aes${dir} $rndkey1,$inout1 |
d608b4d6 | 387 | aes${dir} $rndkey1,$inout2 |
d8ba0dc9 AP |
388 | $movkey ($key,%rax),$rndkey1 |
389 | add \$32,%rax | |
d7d119a3 | 390 | aes${dir} $rndkey0,$inout0 |
d608b4d6 | 391 | aes${dir} $rndkey0,$inout1 |
d608b4d6 | 392 | aes${dir} $rndkey0,$inout2 |
d8ba0dc9 | 393 | $movkey -16($key,%rax),$rndkey0 |
d608b4d6 AP |
394 | jnz .L${dir}_loop3 |
395 | ||
396 | aes${dir} $rndkey1,$inout0 | |
d608b4d6 AP |
397 | aes${dir} $rndkey1,$inout1 |
398 | aes${dir} $rndkey1,$inout2 | |
399 | aes${dir}last $rndkey0,$inout0 | |
400 | aes${dir}last $rndkey0,$inout1 | |
401 | aes${dir}last $rndkey0,$inout2 | |
402 | ret | |
c0e8e500 | 403 | .cfi_endproc |
d608b4d6 AP |
404 | .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 |
405 | ___ | |
406 | } | |
407 | # 4x interleave is implemented to improve small block performance, | |
408 | # most notably [and naturally] 4 block by ~30%. One can argue that one | |
409 | # should have implemented 5x as well, but improvement would be <20%, | |
410 | # so it's not worth it... | |
411 | sub aesni_generate4 { | |
412 | my $dir=shift; | |
413 | # As already mentioned it takes in $key and $rounds, which are *not* | |
414 | # preserved. $inout[0-3] is cipher/clear text... | |
415 | $code.=<<___; | |
416 | .type _aesni_${dir}rypt4,\@abi-omnipotent | |
417 | .align 16 | |
418 | _aesni_${dir}rypt4: | |
c0e8e500 | 419 | .cfi_startproc |
d608b4d6 | 420 | $movkey ($key),$rndkey0 |
d8ba0dc9 | 421 | shl \$4,$rounds |
d608b4d6 | 422 | $movkey 16($key),$rndkey1 |
f8501464 AP |
423 | xorps $rndkey0,$inout0 |
424 | xorps $rndkey0,$inout1 | |
425 | xorps $rndkey0,$inout2 | |
426 | xorps $rndkey0,$inout3 | |
d8ba0dc9 AP |
427 | $movkey 32($key),$rndkey0 |
428 | lea 32($key,$rounds),$key | |
429 | neg %rax # $rounds | |
430 | .byte 0x0f,0x1f,0x00 | |
431 | add \$16,%rax | |
d608b4d6 AP |
432 | |
433 | .L${dir}_loop4: | |
d64a7232 | 434 | aes${dir} $rndkey1,$inout0 |
d64a7232 | 435 | aes${dir} $rndkey1,$inout1 |
d64a7232 AP |
436 | aes${dir} $rndkey1,$inout2 |
437 | aes${dir} $rndkey1,$inout3 | |
d8ba0dc9 AP |
438 | $movkey ($key,%rax),$rndkey1 |
439 | add \$32,%rax | |
d7d119a3 | 440 | aes${dir} $rndkey0,$inout0 |
d64a7232 | 441 | aes${dir} $rndkey0,$inout1 |
d64a7232 AP |
442 | aes${dir} $rndkey0,$inout2 |
443 | aes${dir} $rndkey0,$inout3 | |
d8ba0dc9 | 444 | $movkey -16($key,%rax),$rndkey0 |
d608b4d6 AP |
445 | jnz .L${dir}_loop4 |
446 | ||
d64a7232 | 447 | aes${dir} $rndkey1,$inout0 |
d64a7232 AP |
448 | aes${dir} $rndkey1,$inout1 |
449 | aes${dir} $rndkey1,$inout2 | |
450 | aes${dir} $rndkey1,$inout3 | |
d64a7232 AP |
451 | aes${dir}last $rndkey0,$inout0 |
452 | aes${dir}last $rndkey0,$inout1 | |
453 | aes${dir}last $rndkey0,$inout2 | |
454 | aes${dir}last $rndkey0,$inout3 | |
d64a7232 | 455 | ret |
c0e8e500 | 456 | .cfi_endproc |
d608b4d6 | 457 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 |
d64a7232 AP |
458 | ___ |
459 | } | |
f8501464 AP |
460 | sub aesni_generate6 { |
461 | my $dir=shift; | |
462 | # As already mentioned it takes in $key and $rounds, which are *not* | |
463 | # preserved. $inout[0-5] is cipher/clear text... | |
464 | $code.=<<___; | |
465 | .type _aesni_${dir}rypt6,\@abi-omnipotent | |
466 | .align 16 | |
467 | _aesni_${dir}rypt6: | |
c0e8e500 | 468 | .cfi_startproc |
f8501464 | 469 | $movkey ($key),$rndkey0 |
d8ba0dc9 | 470 | shl \$4,$rounds |
f8501464 | 471 | $movkey 16($key),$rndkey1 |
f8501464 AP |
472 | xorps $rndkey0,$inout0 |
473 | pxor $rndkey0,$inout1 | |
f8501464 | 474 | pxor $rndkey0,$inout2 |
d8ba0dc9 AP |
475 | aes${dir} $rndkey1,$inout0 |
476 | lea 32($key,$rounds),$key | |
477 | neg %rax # $rounds | |
f8501464 AP |
478 | aes${dir} $rndkey1,$inout1 |
479 | pxor $rndkey0,$inout3 | |
f8501464 | 480 | pxor $rndkey0,$inout4 |
d8ba0dc9 | 481 | aes${dir} $rndkey1,$inout2 |
f8501464 | 482 | pxor $rndkey0,$inout5 |
23f6eec7 | 483 | $movkey ($key,%rax),$rndkey0 |
d8ba0dc9 | 484 | add \$16,%rax |
f8501464 AP |
485 | jmp .L${dir}_loop6_enter |
486 | .align 16 | |
487 | .L${dir}_loop6: | |
488 | aes${dir} $rndkey1,$inout0 | |
489 | aes${dir} $rndkey1,$inout1 | |
f8501464 | 490 | aes${dir} $rndkey1,$inout2 |
23f6eec7 | 491 | .L${dir}_loop6_enter: |
f8501464 AP |
492 | aes${dir} $rndkey1,$inout3 |
493 | aes${dir} $rndkey1,$inout4 | |
494 | aes${dir} $rndkey1,$inout5 | |
d8ba0dc9 AP |
495 | $movkey ($key,%rax),$rndkey1 |
496 | add \$32,%rax | |
f8501464 AP |
497 | aes${dir} $rndkey0,$inout0 |
498 | aes${dir} $rndkey0,$inout1 | |
f8501464 AP |
499 | aes${dir} $rndkey0,$inout2 |
500 | aes${dir} $rndkey0,$inout3 | |
501 | aes${dir} $rndkey0,$inout4 | |
502 | aes${dir} $rndkey0,$inout5 | |
d8ba0dc9 | 503 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
504 | jnz .L${dir}_loop6 |
505 | ||
506 | aes${dir} $rndkey1,$inout0 | |
507 | aes${dir} $rndkey1,$inout1 | |
508 | aes${dir} $rndkey1,$inout2 | |
509 | aes${dir} $rndkey1,$inout3 | |
510 | aes${dir} $rndkey1,$inout4 | |
511 | aes${dir} $rndkey1,$inout5 | |
512 | aes${dir}last $rndkey0,$inout0 | |
513 | aes${dir}last $rndkey0,$inout1 | |
514 | aes${dir}last $rndkey0,$inout2 | |
515 | aes${dir}last $rndkey0,$inout3 | |
516 | aes${dir}last $rndkey0,$inout4 | |
517 | aes${dir}last $rndkey0,$inout5 | |
518 | ret | |
c0e8e500 | 519 | .cfi_endproc |
f8501464 AP |
520 | .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 |
521 | ___ | |
522 | } | |
523 | sub aesni_generate8 { | |
524 | my $dir=shift; | |
525 | # As already mentioned it takes in $key and $rounds, which are *not* | |
526 | # preserved. $inout[0-7] is cipher/clear text... | |
527 | $code.=<<___; | |
528 | .type _aesni_${dir}rypt8,\@abi-omnipotent | |
529 | .align 16 | |
530 | _aesni_${dir}rypt8: | |
c0e8e500 | 531 | .cfi_startproc |
f8501464 | 532 | $movkey ($key),$rndkey0 |
d8ba0dc9 | 533 | shl \$4,$rounds |
f8501464 | 534 | $movkey 16($key),$rndkey1 |
f8501464 AP |
535 | xorps $rndkey0,$inout0 |
536 | xorps $rndkey0,$inout1 | |
f8501464 | 537 | pxor $rndkey0,$inout2 |
f8501464 | 538 | pxor $rndkey0,$inout3 |
f8501464 | 539 | pxor $rndkey0,$inout4 |
d8ba0dc9 AP |
540 | lea 32($key,$rounds),$key |
541 | neg %rax # $rounds | |
542 | aes${dir} $rndkey1,$inout0 | |
f8501464 | 543 | pxor $rndkey0,$inout5 |
f8501464 | 544 | pxor $rndkey0,$inout6 |
23f6eec7 | 545 | aes${dir} $rndkey1,$inout1 |
f8501464 | 546 | pxor $rndkey0,$inout7 |
23f6eec7 AP |
547 | $movkey ($key,%rax),$rndkey0 |
548 | add \$16,%rax | |
549 | jmp .L${dir}_loop8_inner | |
f8501464 AP |
550 | .align 16 |
551 | .L${dir}_loop8: | |
552 | aes${dir} $rndkey1,$inout0 | |
553 | aes${dir} $rndkey1,$inout1 | |
23f6eec7 | 554 | .L${dir}_loop8_inner: |
f8501464 AP |
555 | aes${dir} $rndkey1,$inout2 |
556 | aes${dir} $rndkey1,$inout3 | |
557 | aes${dir} $rndkey1,$inout4 | |
558 | aes${dir} $rndkey1,$inout5 | |
559 | aes${dir} $rndkey1,$inout6 | |
560 | aes${dir} $rndkey1,$inout7 | |
d8ba0dc9 AP |
561 | .L${dir}_loop8_enter: |
562 | $movkey ($key,%rax),$rndkey1 | |
563 | add \$32,%rax | |
f8501464 AP |
564 | aes${dir} $rndkey0,$inout0 |
565 | aes${dir} $rndkey0,$inout1 | |
f8501464 AP |
566 | aes${dir} $rndkey0,$inout2 |
567 | aes${dir} $rndkey0,$inout3 | |
568 | aes${dir} $rndkey0,$inout4 | |
569 | aes${dir} $rndkey0,$inout5 | |
570 | aes${dir} $rndkey0,$inout6 | |
571 | aes${dir} $rndkey0,$inout7 | |
d8ba0dc9 | 572 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
573 | jnz .L${dir}_loop8 |
574 | ||
575 | aes${dir} $rndkey1,$inout0 | |
576 | aes${dir} $rndkey1,$inout1 | |
577 | aes${dir} $rndkey1,$inout2 | |
578 | aes${dir} $rndkey1,$inout3 | |
579 | aes${dir} $rndkey1,$inout4 | |
580 | aes${dir} $rndkey1,$inout5 | |
581 | aes${dir} $rndkey1,$inout6 | |
582 | aes${dir} $rndkey1,$inout7 | |
583 | aes${dir}last $rndkey0,$inout0 | |
584 | aes${dir}last $rndkey0,$inout1 | |
585 | aes${dir}last $rndkey0,$inout2 | |
586 | aes${dir}last $rndkey0,$inout3 | |
587 | aes${dir}last $rndkey0,$inout4 | |
588 | aes${dir}last $rndkey0,$inout5 | |
589 | aes${dir}last $rndkey0,$inout6 | |
590 | aes${dir}last $rndkey0,$inout7 | |
591 | ret | |
c0e8e500 | 592 | .cfi_endproc |
f8501464 AP |
593 | .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 |
594 | ___ | |
595 | } | |
214368ff AP |
596 | &aesni_generate2("enc") if ($PREFIX eq "aesni"); |
597 | &aesni_generate2("dec"); | |
d608b4d6 AP |
598 | &aesni_generate3("enc") if ($PREFIX eq "aesni"); |
599 | &aesni_generate3("dec"); | |
600 | &aesni_generate4("enc") if ($PREFIX eq "aesni"); | |
601 | &aesni_generate4("dec"); | |
f8501464 AP |
602 | &aesni_generate6("enc") if ($PREFIX eq "aesni"); |
603 | &aesni_generate6("dec"); | |
604 | &aesni_generate8("enc") if ($PREFIX eq "aesni"); | |
605 | &aesni_generate8("dec"); | |
d64a7232 AP |
606 | \f |
607 | if ($PREFIX eq "aesni") { | |
6c83629b | 608 | ######################################################################## |
d64a7232 AP |
609 | # void aesni_ecb_encrypt (const void *in, void *out, |
610 | # size_t length, const AES_KEY *key, | |
611 | # int enc); | |
612 | $code.=<<___; | |
613 | .globl aesni_ecb_encrypt | |
614 | .type aesni_ecb_encrypt,\@function,5 | |
615 | .align 16 | |
616 | aesni_ecb_encrypt: | |
c0e8e500 | 617 | .cfi_startproc |
69d5747f AP |
618 | ___ |
619 | $code.=<<___ if ($win64); | |
620 | lea -0x58(%rsp),%rsp | |
23f6eec7 | 621 | movaps %xmm6,(%rsp) # offload $inout4..7 |
69d5747f AP |
622 | movaps %xmm7,0x10(%rsp) |
623 | movaps %xmm8,0x20(%rsp) | |
624 | movaps %xmm9,0x30(%rsp) | |
625 | .Lecb_enc_body: | |
626 | ___ | |
627 | $code.=<<___; | |
23f6eec7 AP |
628 | and \$-16,$len # if ($len<16) |
629 | jz .Lecb_ret # return | |
f8501464 AP |
630 | |
631 | mov 240($key),$rounds # key->rounds | |
632 | $movkey ($key),$rndkey0 | |
d64a7232 | 633 | mov $key,$key_ # backup $key |
d64a7232 | 634 | mov $rounds,$rnds_ # backup $rounds |
d7d119a3 | 635 | test %r8d,%r8d # 5th argument |
d64a7232 AP |
636 | jz .Lecb_decrypt |
637 | #--------------------------- ECB ENCRYPT ------------------------------# | |
23f6eec7 AP |
638 | cmp \$0x80,$len # if ($len<8*16) |
639 | jb .Lecb_enc_tail # short input | |
f8501464 | 640 | |
23f6eec7 | 641 | movdqu ($inp),$inout0 # load 8 input blocks |
f8501464 AP |
642 | movdqu 0x10($inp),$inout1 |
643 | movdqu 0x20($inp),$inout2 | |
644 | movdqu 0x30($inp),$inout3 | |
645 | movdqu 0x40($inp),$inout4 | |
646 | movdqu 0x50($inp),$inout5 | |
647 | movdqu 0x60($inp),$inout6 | |
648 | movdqu 0x70($inp),$inout7 | |
23f6eec7 AP |
649 | lea 0x80($inp),$inp # $inp+=8*16 |
650 | sub \$0x80,$len # $len-=8*16 (can be zero) | |
f8501464 | 651 | jmp .Lecb_enc_loop8_enter |
d64a7232 | 652 | .align 16 |
f8501464 | 653 | .Lecb_enc_loop8: |
23f6eec7 | 654 | movups $inout0,($out) # store 8 output blocks |
f8501464 | 655 | mov $key_,$key # restore $key |
23f6eec7 | 656 | movdqu ($inp),$inout0 # load 8 input blocks |
d64a7232 | 657 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 658 | movups $inout1,0x10($out) |
f8501464 AP |
659 | movdqu 0x10($inp),$inout1 |
660 | movups $inout2,0x20($out) | |
661 | movdqu 0x20($inp),$inout2 | |
662 | movups $inout3,0x30($out) | |
663 | movdqu 0x30($inp),$inout3 | |
664 | movups $inout4,0x40($out) | |
665 | movdqu 0x40($inp),$inout4 | |
666 | movups $inout5,0x50($out) | |
667 | movdqu 0x50($inp),$inout5 | |
668 | movups $inout6,0x60($out) | |
669 | movdqu 0x60($inp),$inout6 | |
670 | movups $inout7,0x70($out) | |
23f6eec7 | 671 | lea 0x80($out),$out # $out+=8*16 |
f8501464 | 672 | movdqu 0x70($inp),$inout7 |
23f6eec7 | 673 | lea 0x80($inp),$inp # $inp+=8*16 |
f8501464 AP |
674 | .Lecb_enc_loop8_enter: |
675 | ||
676 | call _aesni_encrypt8 | |
677 | ||
678 | sub \$0x80,$len | |
23f6eec7 | 679 | jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow |
f8501464 | 680 | |
23f6eec7 | 681 | movups $inout0,($out) # store 8 output blocks |
d64a7232 | 682 | mov $key_,$key # restore $key |
f8501464 AP |
683 | movups $inout1,0x10($out) |
684 | mov $rnds_,$rounds # restore $rounds | |
d7d119a3 | 685 | movups $inout2,0x20($out) |
f8501464 AP |
686 | movups $inout3,0x30($out) |
687 | movups $inout4,0x40($out) | |
688 | movups $inout5,0x50($out) | |
689 | movups $inout6,0x60($out) | |
690 | movups $inout7,0x70($out) | |
23f6eec7 AP |
691 | lea 0x80($out),$out # $out+=8*16 |
692 | add \$0x80,$len # restore real remaining $len | |
693 | jz .Lecb_ret # done if ($len==0) | |
d64a7232 | 694 | |
23f6eec7 | 695 | .Lecb_enc_tail: # $len is less than 8*16 |
6c83629b | 696 | movups ($inp),$inout0 |
d7d119a3 | 697 | cmp \$0x20,$len |
6c83629b | 698 | jb .Lecb_enc_one |
d64a7232 AP |
699 | movups 0x10($inp),$inout1 |
700 | je .Lecb_enc_two | |
d64a7232 | 701 | movups 0x20($inp),$inout2 |
f8501464 AP |
702 | cmp \$0x40,$len |
703 | jb .Lecb_enc_three | |
d64a7232 | 704 | movups 0x30($inp),$inout3 |
f8501464 AP |
705 | je .Lecb_enc_four |
706 | movups 0x40($inp),$inout4 | |
707 | cmp \$0x60,$len | |
708 | jb .Lecb_enc_five | |
709 | movups 0x50($inp),$inout5 | |
710 | je .Lecb_enc_six | |
711 | movdqu 0x60($inp),$inout6 | |
23f6eec7 | 712 | xorps $inout7,$inout7 |
f8501464 | 713 | call _aesni_encrypt8 |
23f6eec7 | 714 | movups $inout0,($out) # store 7 output blocks |
d64a7232 AP |
715 | movups $inout1,0x10($out) |
716 | movups $inout2,0x20($out) | |
717 | movups $inout3,0x30($out) | |
f8501464 AP |
718 | movups $inout4,0x40($out) |
719 | movups $inout5,0x50($out) | |
720 | movups $inout6,0x60($out) | |
d64a7232 AP |
721 | jmp .Lecb_ret |
722 | .align 16 | |
723 | .Lecb_enc_one: | |
724 | ___ | |
d608b4d6 | 725 | &aesni_generate1("enc",$key,$rounds); |
d64a7232 | 726 | $code.=<<___; |
23f6eec7 | 727 | movups $inout0,($out) # store one output block |
d64a7232 AP |
728 | jmp .Lecb_ret |
729 | .align 16 | |
730 | .Lecb_enc_two: | |
214368ff | 731 | call _aesni_encrypt2 |
23f6eec7 | 732 | movups $inout0,($out) # store 2 output blocks |
d64a7232 AP |
733 | movups $inout1,0x10($out) |
734 | jmp .Lecb_ret | |
735 | .align 16 | |
736 | .Lecb_enc_three: | |
d608b4d6 | 737 | call _aesni_encrypt3 |
23f6eec7 | 738 | movups $inout0,($out) # store 3 output blocks |
d64a7232 AP |
739 | movups $inout1,0x10($out) |
740 | movups $inout2,0x20($out) | |
741 | jmp .Lecb_ret | |
f8501464 AP |
742 | .align 16 |
743 | .Lecb_enc_four: | |
744 | call _aesni_encrypt4 | |
23f6eec7 | 745 | movups $inout0,($out) # store 4 output blocks |
f8501464 AP |
746 | movups $inout1,0x10($out) |
747 | movups $inout2,0x20($out) | |
748 | movups $inout3,0x30($out) | |
749 | jmp .Lecb_ret | |
750 | .align 16 | |
751 | .Lecb_enc_five: | |
752 | xorps $inout5,$inout5 | |
753 | call _aesni_encrypt6 | |
23f6eec7 | 754 | movups $inout0,($out) # store 5 output blocks |
f8501464 AP |
755 | movups $inout1,0x10($out) |
756 | movups $inout2,0x20($out) | |
757 | movups $inout3,0x30($out) | |
758 | movups $inout4,0x40($out) | |
759 | jmp .Lecb_ret | |
760 | .align 16 | |
761 | .Lecb_enc_six: | |
762 | call _aesni_encrypt6 | |
23f6eec7 | 763 | movups $inout0,($out) # store 6 output blocks |
f8501464 AP |
764 | movups $inout1,0x10($out) |
765 | movups $inout2,0x20($out) | |
766 | movups $inout3,0x30($out) | |
767 | movups $inout4,0x40($out) | |
768 | movups $inout5,0x50($out) | |
769 | jmp .Lecb_ret | |
d64a7232 AP |
770 | \f#--------------------------- ECB DECRYPT ------------------------------# |
771 | .align 16 | |
772 | .Lecb_decrypt: | |
23f6eec7 AP |
773 | cmp \$0x80,$len # if ($len<8*16) |
774 | jb .Lecb_dec_tail # short input | |
f8501464 | 775 | |
23f6eec7 | 776 | movdqu ($inp),$inout0 # load 8 input blocks |
f8501464 AP |
777 | movdqu 0x10($inp),$inout1 |
778 | movdqu 0x20($inp),$inout2 | |
779 | movdqu 0x30($inp),$inout3 | |
780 | movdqu 0x40($inp),$inout4 | |
781 | movdqu 0x50($inp),$inout5 | |
782 | movdqu 0x60($inp),$inout6 | |
783 | movdqu 0x70($inp),$inout7 | |
23f6eec7 AP |
784 | lea 0x80($inp),$inp # $inp+=8*16 |
785 | sub \$0x80,$len # $len-=8*16 (can be zero) | |
f8501464 | 786 | jmp .Lecb_dec_loop8_enter |
d64a7232 | 787 | .align 16 |
f8501464 | 788 | .Lecb_dec_loop8: |
23f6eec7 | 789 | movups $inout0,($out) # store 8 output blocks |
f8501464 | 790 | mov $key_,$key # restore $key |
23f6eec7 | 791 | movdqu ($inp),$inout0 # load 8 input blocks |
d64a7232 | 792 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 793 | movups $inout1,0x10($out) |
f8501464 AP |
794 | movdqu 0x10($inp),$inout1 |
795 | movups $inout2,0x20($out) | |
796 | movdqu 0x20($inp),$inout2 | |
797 | movups $inout3,0x30($out) | |
798 | movdqu 0x30($inp),$inout3 | |
799 | movups $inout4,0x40($out) | |
800 | movdqu 0x40($inp),$inout4 | |
801 | movups $inout5,0x50($out) | |
802 | movdqu 0x50($inp),$inout5 | |
803 | movups $inout6,0x60($out) | |
804 | movdqu 0x60($inp),$inout6 | |
805 | movups $inout7,0x70($out) | |
23f6eec7 | 806 | lea 0x80($out),$out # $out+=8*16 |
f8501464 | 807 | movdqu 0x70($inp),$inout7 |
23f6eec7 | 808 | lea 0x80($inp),$inp # $inp+=8*16 |
f8501464 AP |
809 | .Lecb_dec_loop8_enter: |
810 | ||
811 | call _aesni_decrypt8 | |
812 | ||
813 | $movkey ($key_),$rndkey0 | |
814 | sub \$0x80,$len | |
23f6eec7 | 815 | jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow |
f8501464 | 816 | |
23f6eec7 AP |
817 | movups $inout0,($out) # store 8 output blocks |
818 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 819 | mov $key_,$key # restore $key |
f8501464 | 820 | movups $inout1,0x10($out) |
23f6eec7 | 821 | pxor $inout1,$inout1 |
f8501464 | 822 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 823 | movups $inout2,0x20($out) |
23f6eec7 | 824 | pxor $inout2,$inout2 |
f8501464 | 825 | movups $inout3,0x30($out) |
23f6eec7 | 826 | pxor $inout3,$inout3 |
f8501464 | 827 | movups $inout4,0x40($out) |
23f6eec7 | 828 | pxor $inout4,$inout4 |
f8501464 | 829 | movups $inout5,0x50($out) |
23f6eec7 | 830 | pxor $inout5,$inout5 |
f8501464 | 831 | movups $inout6,0x60($out) |
23f6eec7 | 832 | pxor $inout6,$inout6 |
f8501464 | 833 | movups $inout7,0x70($out) |
23f6eec7 AP |
834 | pxor $inout7,$inout7 |
835 | lea 0x80($out),$out # $out+=8*16 | |
836 | add \$0x80,$len # restore real remaining $len | |
837 | jz .Lecb_ret # done if ($len==0) | |
d64a7232 | 838 | |
6c83629b | 839 | .Lecb_dec_tail: |
6c83629b | 840 | movups ($inp),$inout0 |
d7d119a3 | 841 | cmp \$0x20,$len |
6c83629b | 842 | jb .Lecb_dec_one |
d64a7232 AP |
843 | movups 0x10($inp),$inout1 |
844 | je .Lecb_dec_two | |
d64a7232 | 845 | movups 0x20($inp),$inout2 |
f8501464 AP |
846 | cmp \$0x40,$len |
847 | jb .Lecb_dec_three | |
d64a7232 | 848 | movups 0x30($inp),$inout3 |
f8501464 AP |
849 | je .Lecb_dec_four |
850 | movups 0x40($inp),$inout4 | |
851 | cmp \$0x60,$len | |
852 | jb .Lecb_dec_five | |
853 | movups 0x50($inp),$inout5 | |
854 | je .Lecb_dec_six | |
855 | movups 0x60($inp),$inout6 | |
856 | $movkey ($key),$rndkey0 | |
23f6eec7 | 857 | xorps $inout7,$inout7 |
f8501464 | 858 | call _aesni_decrypt8 |
23f6eec7 AP |
859 | movups $inout0,($out) # store 7 output blocks |
860 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 861 | movups $inout1,0x10($out) |
23f6eec7 | 862 | pxor $inout1,$inout1 |
d64a7232 | 863 | movups $inout2,0x20($out) |
23f6eec7 | 864 | pxor $inout2,$inout2 |
d64a7232 | 865 | movups $inout3,0x30($out) |
23f6eec7 | 866 | pxor $inout3,$inout3 |
f8501464 | 867 | movups $inout4,0x40($out) |
23f6eec7 | 868 | pxor $inout4,$inout4 |
f8501464 | 869 | movups $inout5,0x50($out) |
23f6eec7 | 870 | pxor $inout5,$inout5 |
f8501464 | 871 | movups $inout6,0x60($out) |
23f6eec7 AP |
872 | pxor $inout6,$inout6 |
873 | pxor $inout7,$inout7 | |
d64a7232 AP |
874 | jmp .Lecb_ret |
875 | .align 16 | |
876 | .Lecb_dec_one: | |
877 | ___ | |
d608b4d6 | 878 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 879 | $code.=<<___; |
23f6eec7 AP |
880 | movups $inout0,($out) # store one output block |
881 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 AP |
882 | jmp .Lecb_ret |
883 | .align 16 | |
884 | .Lecb_dec_two: | |
214368ff | 885 | call _aesni_decrypt2 |
23f6eec7 AP |
886 | movups $inout0,($out) # store 2 output blocks |
887 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 888 | movups $inout1,0x10($out) |
23f6eec7 | 889 | pxor $inout1,$inout1 |
d64a7232 AP |
890 | jmp .Lecb_ret |
891 | .align 16 | |
892 | .Lecb_dec_three: | |
d608b4d6 | 893 | call _aesni_decrypt3 |
23f6eec7 AP |
894 | movups $inout0,($out) # store 3 output blocks |
895 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 896 | movups $inout1,0x10($out) |
23f6eec7 | 897 | pxor $inout1,$inout1 |
d64a7232 | 898 | movups $inout2,0x20($out) |
23f6eec7 | 899 | pxor $inout2,$inout2 |
f8501464 AP |
900 | jmp .Lecb_ret |
901 | .align 16 | |
902 | .Lecb_dec_four: | |
903 | call _aesni_decrypt4 | |
23f6eec7 AP |
904 | movups $inout0,($out) # store 4 output blocks |
905 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 906 | movups $inout1,0x10($out) |
23f6eec7 | 907 | pxor $inout1,$inout1 |
f8501464 | 908 | movups $inout2,0x20($out) |
23f6eec7 | 909 | pxor $inout2,$inout2 |
f8501464 | 910 | movups $inout3,0x30($out) |
23f6eec7 | 911 | pxor $inout3,$inout3 |
f8501464 AP |
912 | jmp .Lecb_ret |
913 | .align 16 | |
914 | .Lecb_dec_five: | |
915 | xorps $inout5,$inout5 | |
916 | call _aesni_decrypt6 | |
23f6eec7 AP |
917 | movups $inout0,($out) # store 5 output blocks |
918 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 919 | movups $inout1,0x10($out) |
23f6eec7 | 920 | pxor $inout1,$inout1 |
f8501464 | 921 | movups $inout2,0x20($out) |
23f6eec7 | 922 | pxor $inout2,$inout2 |
f8501464 | 923 | movups $inout3,0x30($out) |
23f6eec7 | 924 | pxor $inout3,$inout3 |
f8501464 | 925 | movups $inout4,0x40($out) |
23f6eec7 AP |
926 | pxor $inout4,$inout4 |
927 | pxor $inout5,$inout5 | |
f8501464 AP |
928 | jmp .Lecb_ret |
929 | .align 16 | |
930 | .Lecb_dec_six: | |
931 | call _aesni_decrypt6 | |
23f6eec7 AP |
932 | movups $inout0,($out) # store 6 output blocks |
933 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 934 | movups $inout1,0x10($out) |
23f6eec7 | 935 | pxor $inout1,$inout1 |
f8501464 | 936 | movups $inout2,0x20($out) |
23f6eec7 | 937 | pxor $inout2,$inout2 |
f8501464 | 938 | movups $inout3,0x30($out) |
23f6eec7 | 939 | pxor $inout3,$inout3 |
f8501464 | 940 | movups $inout4,0x40($out) |
23f6eec7 | 941 | pxor $inout4,$inout4 |
f8501464 | 942 | movups $inout5,0x50($out) |
23f6eec7 | 943 | pxor $inout5,$inout5 |
d64a7232 AP |
944 | |
945 | .Lecb_ret: | |
23f6eec7 AP |
946 | xorps $rndkey0,$rndkey0 # %xmm0 |
947 | pxor $rndkey1,$rndkey1 | |
69d5747f AP |
948 | ___ |
949 | $code.=<<___ if ($win64); | |
950 | movaps (%rsp),%xmm6 | |
23f6eec7 | 951 | movaps %xmm0,(%rsp) # clear stack |
69d5747f | 952 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 953 | movaps %xmm0,0x10(%rsp) |
69d5747f | 954 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 955 | movaps %xmm0,0x20(%rsp) |
69d5747f | 956 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 957 | movaps %xmm0,0x30(%rsp) |
69d5747f AP |
958 | lea 0x58(%rsp),%rsp |
959 | .Lecb_enc_ret: | |
960 | ___ | |
961 | $code.=<<___; | |
d64a7232 | 962 | ret |
c0e8e500 | 963 | .cfi_endproc |
d64a7232 AP |
964 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt |
965 | ___ | |
d7d119a3 AP |
966 | \f |
967 | { | |
6c83629b | 968 | ###################################################################### |
d7d119a3 AP |
969 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, |
970 | # size_t blocks, const AES_KEY *key, | |
971 | # const char *ivec,char *cmac); | |
6c83629b | 972 | # |
d7d119a3 AP |
973 | # Handles only complete blocks, operates on 64-bit counter and |
974 | # does not update *ivec! Nor does it finalize CMAC value | |
975 | # (see engine/eng_aesni.c for details) | |
976 | # | |
977 | { | |
# Register aliases private to the two ccm64 routines that follow.  $cmac is
# the sixth-argument register; $increment, $iv and $bswap_mask are kept in
# %xmm9, %xmm6 and %xmm7, which the Win64 prologues below save before use.
978 | my $cmac="%r9"; # 6th argument | |
979 | ||
d8ba0dc9 AP |
980 | my $increment="%xmm9"; |
981 | my $iv="%xmm6"; | |
267b481c | 982 | my $bswap_mask="%xmm7"; |
d7d119a3 AP |
983 | |
# aesni_ccm64_encrypt_blocks -- CCM encryption over whole 16-byte blocks.
# Declared to the assembler with six arguments (\@function,6).  Per block it
# runs two AES encryptions in lock-step: one on the counter block ($inout0,
# the keystream) and one on the running CMAC ($inout1).
# NOTE(review): unlike aesni_ecb_encrypt and aesni_ctr32_encrypt_blocks in
# this file, no .cfi_startproc/.cfi_endproc pair is emitted for this routine.
984 | $code.=<<___; | |
985 | .globl aesni_ccm64_encrypt_blocks | |
986 | .type aesni_ccm64_encrypt_blocks,\@function,6 | |
987 | .align 16 | |
988 | aesni_ccm64_encrypt_blocks: | |
989 | ___ | |
# Win64 only: carve out 0x58 bytes of stack and save %xmm6-%xmm9 there
# (these registers are non-volatile under the Microsoft x64 calling
# convention); the inline tags name the aliases this routine keeps in them.
990 | $code.=<<___ if ($win64); | |
991 | lea -0x58(%rsp),%rsp | |
23f6eec7 AP |
992 | movaps %xmm6,(%rsp) # $iv |
993 | movaps %xmm7,0x10(%rsp) # $bswap_mask | |
994 | movaps %xmm8,0x20(%rsp) # $in0 | |
995 | movaps %xmm9,0x30(%rsp) # $increment | |
d7d119a3 AP |
996 | .Lccm64_enc_body: |
997 | ___ | |
# Main body, in order:
#  * setup  - read key->rounds, the counter block (into $iv) and the current
#             CMAC (into $inout1); copy the counter to $inout0, then shuffle
#             $iv through $bswap_mask so that paddq can step it.  $key_
#             keeps the start of the key schedule while $key is advanced to
#             32+16*rounds past it.  %r10 receives the "twisted" round count
#             that is copied into %rax at the top of every block.
#             NOTE(review): the raw %rax/%r10 operands presumably alias
#             $rounds/$rnds_ -- confirm against their definitions.
#  * .Lccm64_enc_outer - load one input block; xor the round-0 key into the
#             counter, and xor (input ^ round-0 key) into the CMAC.
#  * .Lccm64_enc2_loop - two aesenc steps per pass on both lanes; %rax
#             climbs by 32 and the loop ends when the add yields zero.
#  * tail   - final aesenc/aesenclast; $iv += $increment; $len--;
#             output = input ^ E(counter); $inout0 is reloaded from $iv and
#             shuffled again for the next block.  The closing jnz acts on
#             the flags left by `dec $len`.
#  * exit   - store the CMAC back through $cmac and zero the xmm scratch
#             registers used above.
998 | $code.=<<___; | |
267b481c | 999 | mov 240($key),$rounds # key->rounds |
d7d119a3 | 1000 | movdqu ($ivp),$iv |
d7d119a3 AP |
1001 | movdqa .Lincrement64(%rip),$increment |
1002 | movdqa .Lbswap_mask(%rip),$bswap_mask | |
d7d119a3 | 1003 | |
d8ba0dc9 AP |
1004 | shl \$4,$rounds |
1005 | mov \$16,$rnds_ | |
267b481c AP |
1006 | lea 0($key),$key_ |
1007 | movdqu ($cmac),$inout1 | |
d7d119a3 | 1008 | movdqa $iv,$inout0 |
d8ba0dc9 | 1009 | lea 32($key,$rounds),$key # end of key schedule |
9ee5916d | 1010 | pshufb $bswap_mask,$iv |
d8ba0dc9 | 1011 | sub %rax,%r10 # twisted $rounds |
267b481c AP |
1012 | jmp .Lccm64_enc_outer |
1013 | .align 16 | |
d7d119a3 | 1014 | .Lccm64_enc_outer: |
267b481c | 1015 | $movkey ($key_),$rndkey0 |
d8ba0dc9 | 1016 | mov %r10,%rax |
267b481c | 1017 | movups ($inp),$in0 # load inp |
d7d119a3 | 1018 | |
267b481c AP |
1019 | xorps $rndkey0,$inout0 # counter |
1020 | $movkey 16($key_),$rndkey1 | |
1021 | xorps $in0,$rndkey0 | |
267b481c | 1022 | xorps $rndkey0,$inout1 # cmac^=inp |
d8ba0dc9 | 1023 | $movkey 32($key_),$rndkey0 |
f8501464 AP |
1024 | |
1025 | .Lccm64_enc2_loop: | |
1026 | aesenc $rndkey1,$inout0 | |
f8501464 | 1027 | aesenc $rndkey1,$inout1 |
d8ba0dc9 AP |
1028 | $movkey ($key,%rax),$rndkey1 |
1029 | add \$32,%rax | |
f8501464 | 1030 | aesenc $rndkey0,$inout0 |
f8501464 | 1031 | aesenc $rndkey0,$inout1 |
d8ba0dc9 | 1032 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
1033 | jnz .Lccm64_enc2_loop |
1034 | aesenc $rndkey1,$inout0 | |
1035 | aesenc $rndkey1,$inout1 | |
267b481c | 1036 | paddq $increment,$iv |
23f6eec7 | 1037 | dec $len # $len-- ($len is in blocks) |
f8501464 AP |
1038 | aesenclast $rndkey0,$inout0 |
1039 | aesenclast $rndkey0,$inout1 | |
d7d119a3 | 1040 | |
d7d119a3 | 1041 | lea 16($inp),$inp |
f8501464 | 1042 | xorps $inout0,$in0 # inp ^= E(iv) |
d7d119a3 | 1043 | movdqa $iv,$inout0 |
f8501464 | 1044 | movups $in0,($out) # save output |
9ee5916d | 1045 | pshufb $bswap_mask,$inout0 |
23f6eec7 AP |
1046 | lea 16($out),$out # $out+=16 |
1047 | jnz .Lccm64_enc_outer # loop if ($len!=0) | |
d7d119a3 | 1048 | |
23f6eec7 AP |
1049 | pxor $rndkey0,$rndkey0 # clear register bank |
1050 | pxor $rndkey1,$rndkey1 | |
1051 | pxor $inout0,$inout0 | |
1052 | movups $inout1,($cmac) # store resulting mac | |
1053 | pxor $inout1,$inout1 | |
1054 | pxor $in0,$in0 | |
1055 | pxor $iv,$iv | |
d7d119a3 AP |
1056 | ___ |
# Win64 only: restore %xmm6-%xmm9, overwriting each save slot with %xmm0
# as it goes ("clear stack"), then release the 0x58-byte frame.
# NOTE(review): this wipes the slots only if %xmm0 is zero at this point,
# i.e. if $rndkey0 (cleared just above) is %xmm0 -- confirm.
1057 | $code.=<<___ if ($win64); | |
1058 | movaps (%rsp),%xmm6 | |
23f6eec7 | 1059 | movaps %xmm0,(%rsp) # clear stack |
d7d119a3 | 1060 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 1061 | movaps %xmm0,0x10(%rsp) |
d7d119a3 | 1062 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 1063 | movaps %xmm0,0x20(%rsp) |
d7d119a3 | 1064 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 1065 | movaps %xmm0,0x30(%rsp) |
d7d119a3 AP |
1066 | lea 0x58(%rsp),%rsp |
1067 | .Lccm64_enc_ret: | |
1068 | ___ | |
# Common exit: return and emit the symbol's .size.
1069 | $code.=<<___; | |
1070 | ret | |
1071 | .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks | |
1072 | ___ | |
1073 | ###################################################################### | |
# aesni_ccm64_decrypt_blocks -- CCM decryption over whole 16-byte blocks,
# the counterpart of aesni_ccm64_encrypt_blocks with the same six-argument
# declaration (\@function,6).  The CMAC is accumulated over the decrypted
# output, so a block is decrypted first and folded into the MAC afterwards.
# NOTE(review): unlike aesni_ecb_encrypt and aesni_ctr32_encrypt_blocks in
# this file, no .cfi_startproc/.cfi_endproc pair is emitted for this routine.
1074 | $code.=<<___; | |
1075 | .globl aesni_ccm64_decrypt_blocks | |
1076 | .type aesni_ccm64_decrypt_blocks,\@function,6 | |
1077 | .align 16 | |
1078 | aesni_ccm64_decrypt_blocks: | |
1079 | ___ | |
# Win64 only: carve out 0x58 bytes of stack and save %xmm6-%xmm9 there
# (these registers are non-volatile under the Microsoft x64 calling
# convention).
# NOTE(review): the %xmm8 slot is tagged `$in8` below, whereas the encrypt
# routine tags the same slot `$in0`; this looks like a typo in the tag.  It
# sits inside an interpolated heredoc, so confirm before changing it.
1080 | $code.=<<___ if ($win64); | |
1081 | lea -0x58(%rsp),%rsp | |
23f6eec7 AP |
1082 | movaps %xmm6,(%rsp) # $iv |
1083 | movaps %xmm7,0x10(%rsp) # $bswap_mask | |
1084 | movaps %xmm8,0x20(%rsp) # $in8 | |
1085 | movaps %xmm9,0x30(%rsp) # $increment | |
d7d119a3 AP |
1086 | .Lccm64_dec_body: |
1087 | ___ | |
# Setup: read key->rounds, the counter block (into $iv) and the current CMAC
# (into $inout1); copy the counter to $inout0 and shuffle $iv through
# $bswap_mask.  $rounds and $key are copied to $rnds_ and $key_ ahead of the
# single-block sequence generated next.
1088 | $code.=<<___; | |
267b481c AP |
1089 | mov 240($key),$rounds # key->rounds |
1090 | movups ($ivp),$iv | |
d7d119a3 AP |
1091 | movdqu ($cmac),$inout1 |
1092 | movdqa .Lincrement64(%rip),$increment | |
1093 | movdqa .Lbswap_mask(%rip),$bswap_mask | |
1094 | ||
267b481c | 1095 | movaps $iv,$inout0 |
d7d119a3 AP |
1096 | mov $rounds,$rnds_ |
1097 | mov $key,$key_ | |
267b481c | 1098 | pshufb $bswap_mask,$iv |
d7d119a3 AP |
1099 | ___ |
# Emit a one-block "enc" sequence over $key/$rounds (the helper is defined
# elsewhere in this file) -- presumably encrypting the first counter block
# held in $inout0 and consuming $key/$rounds, hence the copies made above;
# confirm against aesni_generate1.
1100 | &aesni_generate1("enc",$key,$rounds); | |
# Main loop, in order:
#  * preamble - load the first input block, step the counter, and set up
#               $key ("end of key schedule") plus the "twisted" round count,
#               which ends up in both %rax and %r10.
#  * .Lccm64_dec_outer - output = input ^ E(counter); store it; reload
#               $inout0 from $iv and shuffle it; $len--, leaving through
#               .Lccm64_dec_break once no blocks remain.
#  * otherwise - xor the round-0 key into the next counter and
#               (output ^ round-0 key) into the CMAC, then run both lanes
#               through .Lccm64_dec2_loop; the next input block is loaded
#               and the counter stepped before the closing
#               aesenc/aesenclast pair.
#  * .Lccm64_dec_break - reload key->rounds from $key_ for the final
#               single-block sequence generated next.  The commented-out
#               xorps marks the cmac^=out step that is not done inline here.
1101 | $code.=<<___; | |
d8ba0dc9 AP |
1102 | shl \$4,$rnds_ |
1103 | mov \$16,$rounds | |
f8501464 | 1104 | movups ($inp),$in0 # load inp |
267b481c | 1105 | paddq $increment,$iv |
23f6eec7 | 1106 | lea 16($inp),$inp # $inp+=16 |
d8ba0dc9 AP |
1107 | sub %r10,%rax # twisted $rounds |
1108 | lea 32($key_,$rnds_),$key # end of key schedule | |
1109 | mov %rax,%r10 | |
267b481c AP |
1110 | jmp .Lccm64_dec_outer |
1111 | .align 16 | |
1112 | .Lccm64_dec_outer: | |
1113 | xorps $inout0,$in0 # inp ^= E(iv) | |
1114 | movdqa $iv,$inout0 | |
267b481c | 1115 | movups $in0,($out) # save output |
23f6eec7 | 1116 | lea 16($out),$out # $out+=16 |
9ee5916d | 1117 | pshufb $bswap_mask,$inout0 |
d7d119a3 | 1118 | |
23f6eec7 AP |
1119 | sub \$1,$len # $len-- ($len is in blocks) |
1120 | jz .Lccm64_dec_break # if ($len==0) break | |
d7d119a3 | 1121 | |
267b481c | 1122 | $movkey ($key_),$rndkey0 |
d8ba0dc9 | 1123 | mov %r10,%rax |
267b481c | 1124 | $movkey 16($key_),$rndkey1 |
f8501464 | 1125 | xorps $rndkey0,$in0 |
f8501464 AP |
1126 | xorps $rndkey0,$inout0 |
1127 | xorps $in0,$inout1 # cmac^=out | |
d8ba0dc9 AP |
1128 | $movkey 32($key_),$rndkey0 |
1129 | jmp .Lccm64_dec2_loop | |
1130 | .align 16 | |
f8501464 AP |
1131 | .Lccm64_dec2_loop: |
1132 | aesenc $rndkey1,$inout0 | |
f8501464 | 1133 | aesenc $rndkey1,$inout1 |
d8ba0dc9 AP |
1134 | $movkey ($key,%rax),$rndkey1 |
1135 | add \$32,%rax | |
f8501464 | 1136 | aesenc $rndkey0,$inout0 |
f8501464 | 1137 | aesenc $rndkey0,$inout1 |
d8ba0dc9 | 1138 | $movkey -16($key,%rax),$rndkey0 |
f8501464 | 1139 | jnz .Lccm64_dec2_loop |
23f6eec7 | 1140 | movups ($inp),$in0 # load input |
267b481c | 1141 | paddq $increment,$iv |
f8501464 AP |
1142 | aesenc $rndkey1,$inout0 |
1143 | aesenc $rndkey1,$inout1 | |
1144 | aesenclast $rndkey0,$inout0 | |
267b481c | 1145 | aesenclast $rndkey0,$inout1 |
23f6eec7 | 1146 | lea 16($inp),$inp # $inp+=16 |
d7d119a3 AP |
1147 | jmp .Lccm64_dec_outer |
1148 | ||
1149 | .align 16 | |
1150 | .Lccm64_dec_break: | |
267b481c | 1151 | #xorps $in0,$inout1 # cmac^=out |
d8ba0dc9 | 1152 | mov 240($key_),$rounds |
d7d119a3 | 1153 | ___ |
# Emit a one-block "enc" sequence for the CMAC lane ($inout1) using $key_ and
# the reloaded $rounds.  The extra $in0 argument presumably makes the helper
# fold the last output block into $inout1 (the cmac^=out step left commented
# out above) -- confirm against aesni_generate1.
267b481c | 1154 | &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); |
# Exit: store the CMAC back through $cmac and zero the xmm scratch registers.
d7d119a3 | 1155 | $code.=<<___; |
23f6eec7 AP |
1156 | pxor $rndkey0,$rndkey0 # clear register bank |
1157 | pxor $rndkey1,$rndkey1 | |
1158 | pxor $inout0,$inout0 | |
1159 | movups $inout1,($cmac) # store resulting mac | |
1160 | pxor $inout1,$inout1 | |
1161 | pxor $in0,$in0 | |
1162 | pxor $iv,$iv | |
d7d119a3 AP |
1163 | ___ |
# Win64 only: restore %xmm6-%xmm9, overwriting each save slot with %xmm0
# as it goes ("clear stack"), then release the 0x58-byte frame.
# NOTE(review): this wipes the slots only if %xmm0 is zero at this point,
# i.e. if $rndkey0 (cleared just above) is %xmm0 -- confirm.
1164 | $code.=<<___ if ($win64); | |
1165 | movaps (%rsp),%xmm6 | |
23f6eec7 | 1166 | movaps %xmm0,(%rsp) # clear stack |
d7d119a3 | 1167 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 1168 | movaps %xmm0,0x10(%rsp) |
d7d119a3 | 1169 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 1170 | movaps %xmm0,0x20(%rsp) |
d7d119a3 | 1171 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 1172 | movaps %xmm0,0x30(%rsp) |
d7d119a3 AP |
1173 | lea 0x58(%rsp),%rsp |
1174 | .Lccm64_dec_ret: | |
1175 | ___ | |
# Common exit: return and emit the symbol's .size.
1176 | $code.=<<___; | |
1177 | ret | |
f8501464 AP |
1178 | .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks |
1179 | ___ | |
1180 | }\f | |
1181 | ###################################################################### | |
1182 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | |
1183 | # size_t blocks, const AES_KEY *key, | |
1184 | # const char *ivec); | |
1185 | # | |
1186 | # Handles only complete blocks, operates on 32-bit counter and | |
6c79faaa | 1187 | # does not update *ivec! (see crypto/modes/ctr128.c for details) |
f8501464 | 1188 | # |
6c79faaa | 1189 | # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, |
b4a9d5bf | 1190 | # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. |
6c79faaa AP |
1191 | # Keywords are full unroll and modulo-schedule counter calculations |
1192 | # with zero-round key xor. | |
f8501464 | 1193 | { |
6c79faaa | 1194 | my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); |
384e6de4 | 1195 | my ($key0,$ctr)=("%ebp","${ivp}d"); |
6c79faaa | 1196 | my $frame_size = 0x80 + ($win64?160:0); |
f8501464 AP |
1197 | |
1198 | $code.=<<___; | |
1199 | .globl aesni_ctr32_encrypt_blocks | |
1200 | .type aesni_ctr32_encrypt_blocks,\@function,5 | |
1201 | .align 16 | |
1202 | aesni_ctr32_encrypt_blocks: | |
b84460ad | 1203 | .cfi_startproc |
23f6eec7 AP |
1204 | cmp \$1,$len |
1205 | jne .Lctr32_bulk | |
1206 | ||
1207 | # handle single block without allocating stack frame, | |
1208 | # useful when handling edges | |
1209 | movups ($ivp),$inout0 | |
1210 | movups ($inp),$inout1 | |
1211 | mov 240($key),%edx # key->rounds | |
1212 | ___ | |
1213 | &aesni_generate1("enc",$key,"%edx"); | |
1214 | $code.=<<___; | |
1215 | pxor $rndkey0,$rndkey0 # clear register bank | |
1216 | pxor $rndkey1,$rndkey1 | |
1217 | xorps $inout1,$inout0 | |
1218 | pxor $inout1,$inout1 | |
1219 | movups $inout0,($out) | |
1220 | xorps $inout0,$inout0 | |
1221 | jmp .Lctr32_epilogue | |
1222 | ||
1223 | .align 16 | |
1224 | .Lctr32_bulk: | |
384e6de4 | 1225 | lea (%rsp),$key_ # use $key_ as frame pointer |
b84460ad | 1226 | .cfi_def_cfa_register $key_ |
6c79faaa | 1227 | push %rbp |
b84460ad | 1228 | .cfi_push %rbp |
6c79faaa AP |
1229 | sub \$$frame_size,%rsp |
1230 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
f8501464 AP |
1231 | ___ |
1232 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1233 | movaps %xmm6,-0xa8($key_) # offload everything |
1234 | movaps %xmm7,-0x98($key_) | |
1235 | movaps %xmm8,-0x88($key_) | |
1236 | movaps %xmm9,-0x78($key_) | |
1237 | movaps %xmm10,-0x68($key_) | |
1238 | movaps %xmm11,-0x58($key_) | |
1239 | movaps %xmm12,-0x48($key_) | |
1240 | movaps %xmm13,-0x38($key_) | |
1241 | movaps %xmm14,-0x28($key_) | |
1242 | movaps %xmm15,-0x18($key_) | |
f8501464 AP |
1243 | .Lctr32_body: |
1244 | ___ | |
1245 | $code.=<<___; | |
6c79faaa | 1246 | |
23f6eec7 AP |
1247 | # 8 16-byte words on top of stack are counter values |
1248 | # xor-ed with zero-round key | |
f8501464 | 1249 | |
6c79faaa AP |
1250 | movdqu ($ivp),$inout0 |
1251 | movdqu ($key),$rndkey0 | |
1252 | mov 12($ivp),$ctr # counter LSB | |
1253 | pxor $rndkey0,$inout0 | |
1254 | mov 12($key),$key0 # 0-round key LSB | |
1255 | movdqa $inout0,0x00(%rsp) # populate counter block | |
1256 | bswap $ctr | |
b4a9d5bf AP |
1257 | movdqa $inout0,$inout1 |
1258 | movdqa $inout0,$inout2 | |
1259 | movdqa $inout0,$inout3 | |
6c79faaa AP |
1260 | movdqa $inout0,0x40(%rsp) |
1261 | movdqa $inout0,0x50(%rsp) | |
1262 | movdqa $inout0,0x60(%rsp) | |
23f6eec7 | 1263 | mov %rdx,%r10 # about to borrow %rdx |
6c79faaa AP |
1264 | movdqa $inout0,0x70(%rsp) |
1265 | ||
d8ba0dc9 AP |
1266 | lea 1($ctr),%rax |
1267 | lea 2($ctr),%rdx | |
1268 | bswap %eax | |
1269 | bswap %edx | |
1270 | xor $key0,%eax | |
1271 | xor $key0,%edx | |
1272 | pinsrd \$3,%eax,$inout1 | |
1273 | lea 3($ctr),%rax | |
b4a9d5bf | 1274 | movdqa $inout1,0x10(%rsp) |
d8ba0dc9 AP |
1275 | pinsrd \$3,%edx,$inout2 |
1276 | bswap %eax | |
1277 | mov %r10,%rdx # restore %rdx | |
6c79faaa | 1278 | lea 4($ctr),%r10 |
b4a9d5bf | 1279 | movdqa $inout2,0x20(%rsp) |
d8ba0dc9 | 1280 | xor $key0,%eax |
6c79faaa | 1281 | bswap %r10d |
d8ba0dc9 | 1282 | pinsrd \$3,%eax,$inout3 |
6c79faaa | 1283 | xor $key0,%r10d |
b4a9d5bf | 1284 | movdqa $inout3,0x30(%rsp) |
6c79faaa AP |
1285 | lea 5($ctr),%r9 |
1286 | mov %r10d,0x40+12(%rsp) | |
1287 | bswap %r9d | |
1288 | lea 6($ctr),%r10 | |
d8ba0dc9 | 1289 | mov 240($key),$rounds # key->rounds |
6c79faaa AP |
1290 | xor $key0,%r9d |
1291 | bswap %r10d | |
1292 | mov %r9d,0x50+12(%rsp) | |
1293 | xor $key0,%r10d | |
1294 | lea 7($ctr),%r9 | |
1295 | mov %r10d,0x60+12(%rsp) | |
1296 | bswap %r9d | |
609b0852 | 1297 | mov OPENSSL_ia32cap_P+4(%rip),%r10d |
6c79faaa | 1298 | xor $key0,%r9d |
5599c733 | 1299 | and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE |
6c79faaa AP |
1300 | mov %r9d,0x70+12(%rsp) |
1301 | ||
1302 | $movkey 0x10($key),$rndkey1 | |
1303 | ||
6c79faaa AP |
1304 | movdqa 0x40(%rsp),$inout4 |
1305 | movdqa 0x50(%rsp),$inout5 | |
9282c335 | 1306 | |
23f6eec7 AP |
1307 | cmp \$8,$len # $len is in blocks |
1308 | jb .Lctr32_tail # short input if ($len<8) | |
9282c335 | 1309 | |
23f6eec7 | 1310 | sub \$6,$len # $len is biased by -6 |
5599c733 | 1311 | cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE |
23f6eec7 | 1312 | je .Lctr32_6x # [which denotes Atom Silvermont] |
5599c733 | 1313 | |
6c79faaa | 1314 | lea 0x80($key),$key # size optimization |
23f6eec7 | 1315 | sub \$2,$len # $len is biased by -8 |
9282c335 | 1316 | jmp .Lctr32_loop8 |
f8501464 | 1317 | |
5599c733 AP |
1318 | .align 16 |
1319 | .Lctr32_6x: | |
1320 | shl \$4,$rounds | |
1321 | mov \$48,$rnds_ | |
1322 | bswap $key0 | |
1323 | lea 32($key,$rounds),$key # end of key schedule | |
1324 | sub %rax,%r10 # twisted $rounds | |
1325 | jmp .Lctr32_loop6 | |
1326 | ||
1327 | .align 16 | |
1328 | .Lctr32_loop6: | |
23f6eec7 | 1329 | add \$6,$ctr # next counter value |
5599c733 AP |
1330 | $movkey -48($key,$rnds_),$rndkey0 |
1331 | aesenc $rndkey1,$inout0 | |
1332 | mov $ctr,%eax | |
1333 | xor $key0,%eax | |
1334 | aesenc $rndkey1,$inout1 | |
23f6eec7 | 1335 | movbe %eax,`0x00+12`(%rsp) # store next counter value |
5599c733 AP |
1336 | lea 1($ctr),%eax |
1337 | aesenc $rndkey1,$inout2 | |
1338 | xor $key0,%eax | |
1339 | movbe %eax,`0x10+12`(%rsp) | |
1340 | aesenc $rndkey1,$inout3 | |
1341 | lea 2($ctr),%eax | |
1342 | xor $key0,%eax | |
1343 | aesenc $rndkey1,$inout4 | |
1344 | movbe %eax,`0x20+12`(%rsp) | |
1345 | lea 3($ctr),%eax | |
1346 | aesenc $rndkey1,$inout5 | |
1347 | $movkey -32($key,$rnds_),$rndkey1 | |
1348 | xor $key0,%eax | |
1349 | ||
1350 | aesenc $rndkey0,$inout0 | |
1351 | movbe %eax,`0x30+12`(%rsp) | |
1352 | lea 4($ctr),%eax | |
1353 | aesenc $rndkey0,$inout1 | |
1354 | xor $key0,%eax | |
1355 | movbe %eax,`0x40+12`(%rsp) | |
1356 | aesenc $rndkey0,$inout2 | |
1357 | lea 5($ctr),%eax | |
1358 | xor $key0,%eax | |
1359 | aesenc $rndkey0,$inout3 | |
1360 | movbe %eax,`0x50+12`(%rsp) | |
1361 | mov %r10,%rax # mov $rnds_,$rounds | |
1362 | aesenc $rndkey0,$inout4 | |
1363 | aesenc $rndkey0,$inout5 | |
1364 | $movkey -16($key,$rnds_),$rndkey0 | |
1365 | ||
1366 | call .Lenc_loop6 | |
1367 | ||
23f6eec7 | 1368 | movdqu ($inp),$inout6 # load 6 input blocks |
5599c733 AP |
1369 | movdqu 0x10($inp),$inout7 |
1370 | movdqu 0x20($inp),$in0 | |
1371 | movdqu 0x30($inp),$in1 | |
1372 | movdqu 0x40($inp),$in2 | |
1373 | movdqu 0x50($inp),$in3 | |
23f6eec7 | 1374 | lea 0x60($inp),$inp # $inp+=6*16 |
5599c733 | 1375 | $movkey -64($key,$rnds_),$rndkey1 |
23f6eec7 AP |
1376 | pxor $inout0,$inout6 # inp^=E(ctr) |
1377 | movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] | |
5599c733 AP |
1378 | pxor $inout1,$inout7 |
1379 | movaps 0x10(%rsp),$inout1 | |
1380 | pxor $inout2,$in0 | |
1381 | movaps 0x20(%rsp),$inout2 | |
1382 | pxor $inout3,$in1 | |
1383 | movaps 0x30(%rsp),$inout3 | |
1384 | pxor $inout4,$in2 | |
1385 | movaps 0x40(%rsp),$inout4 | |
1386 | pxor $inout5,$in3 | |
1387 | movaps 0x50(%rsp),$inout5 | |
23f6eec7 | 1388 | movdqu $inout6,($out) # store 6 output blocks |
5599c733 AP |
1389 | movdqu $inout7,0x10($out) |
1390 | movdqu $in0,0x20($out) | |
1391 | movdqu $in1,0x30($out) | |
1392 | movdqu $in2,0x40($out) | |
1393 | movdqu $in3,0x50($out) | |
23f6eec7 AP |
1394 | lea 0x60($out),$out # $out+=6*16 |
1395 | ||
5599c733 | 1396 | sub \$6,$len |
23f6eec7 | 1397 | jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow |
5599c733 | 1398 | |
23f6eec7 AP |
1399 | add \$6,$len # restore real remaining $len |
1400 | jz .Lctr32_done # done if ($len==0) | |
5599c733 AP |
1401 | |
1402 | lea -48($rnds_),$rounds | |
1403 | lea -80($key,$rnds_),$key # restore $key | |
1404 | neg $rounds | |
1405 | shr \$4,$rounds # restore $rounds | |
1406 | jmp .Lctr32_tail | |
1407 | ||
6c79faaa | 1408 | .align 32 |
9282c335 | 1409 | .Lctr32_loop8: |
23f6eec7 | 1410 | add \$8,$ctr # next counter value |
6c79faaa AP |
1411 | movdqa 0x60(%rsp),$inout6 |
1412 | aesenc $rndkey1,$inout0 | |
1413 | mov $ctr,%r9d | |
1414 | movdqa 0x70(%rsp),$inout7 | |
1415 | aesenc $rndkey1,$inout1 | |
1416 | bswap %r9d | |
1417 | $movkey 0x20-0x80($key),$rndkey0 | |
1418 | aesenc $rndkey1,$inout2 | |
1419 | xor $key0,%r9d | |
d8ba0dc9 | 1420 | nop |
6c79faaa | 1421 | aesenc $rndkey1,$inout3 |
23f6eec7 | 1422 | mov %r9d,0x00+12(%rsp) # store next counter value |
6c79faaa AP |
1423 | lea 1($ctr),%r9 |
1424 | aesenc $rndkey1,$inout4 | |
1425 | aesenc $rndkey1,$inout5 | |
1426 | aesenc $rndkey1,$inout6 | |
1427 | aesenc $rndkey1,$inout7 | |
1428 | $movkey 0x30-0x80($key),$rndkey1 | |
1429 | ___ | |
1430 | for($i=2;$i<8;$i++) { | |
1431 | my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; | |
1432 | $code.=<<___; | |
d8ba0dc9 | 1433 | bswap %r9d |
6c79faaa AP |
1434 | aesenc $rndkeyx,$inout0 |
1435 | aesenc $rndkeyx,$inout1 | |
6c79faaa | 1436 | xor $key0,%r9d |
d8ba0dc9 AP |
1437 | .byte 0x66,0x90 |
1438 | aesenc $rndkeyx,$inout2 | |
6c79faaa AP |
1439 | aesenc $rndkeyx,$inout3 |
1440 | mov %r9d,`0x10*($i-1)`+12(%rsp) | |
1441 | lea $i($ctr),%r9 | |
1442 | aesenc $rndkeyx,$inout4 | |
1443 | aesenc $rndkeyx,$inout5 | |
1444 | aesenc $rndkeyx,$inout6 | |
1445 | aesenc $rndkeyx,$inout7 | |
1446 | $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx | |
1447 | ___ | |
1448 | } | |
1449 | $code.=<<___; | |
d8ba0dc9 | 1450 | bswap %r9d |
6c79faaa AP |
1451 | aesenc $rndkey0,$inout0 |
1452 | aesenc $rndkey0,$inout1 | |
6c79faaa AP |
1453 | aesenc $rndkey0,$inout2 |
1454 | xor $key0,%r9d | |
23f6eec7 | 1455 | movdqu 0x00($inp),$in0 # start loading input |
6c79faaa AP |
1456 | aesenc $rndkey0,$inout3 |
1457 | mov %r9d,0x70+12(%rsp) | |
d8ba0dc9 | 1458 | cmp \$11,$rounds |
6c79faaa AP |
1459 | aesenc $rndkey0,$inout4 |
1460 | aesenc $rndkey0,$inout5 | |
1461 | aesenc $rndkey0,$inout6 | |
6c79faaa AP |
1462 | aesenc $rndkey0,$inout7 |
1463 | $movkey 0xa0-0x80($key),$rndkey0 | |
1464 | ||
6c79faaa AP |
1465 | jb .Lctr32_enc_done |
1466 | ||
1467 | aesenc $rndkey1,$inout0 | |
1468 | aesenc $rndkey1,$inout1 | |
1469 | aesenc $rndkey1,$inout2 | |
1470 | aesenc $rndkey1,$inout3 | |
1471 | aesenc $rndkey1,$inout4 | |
1472 | aesenc $rndkey1,$inout5 | |
1473 | aesenc $rndkey1,$inout6 | |
1474 | aesenc $rndkey1,$inout7 | |
1475 | $movkey 0xb0-0x80($key),$rndkey1 | |
1bc4d009 AP |
1476 | |
1477 | aesenc $rndkey0,$inout0 | |
1478 | aesenc $rndkey0,$inout1 | |
1bc4d009 | 1479 | aesenc $rndkey0,$inout2 |
1bc4d009 | 1480 | aesenc $rndkey0,$inout3 |
1bc4d009 | 1481 | aesenc $rndkey0,$inout4 |
1bc4d009 | 1482 | aesenc $rndkey0,$inout5 |
1bc4d009 | 1483 | aesenc $rndkey0,$inout6 |
1bc4d009 | 1484 | aesenc $rndkey0,$inout7 |
6c79faaa AP |
1485 | $movkey 0xc0-0x80($key),$rndkey0 |
1486 | je .Lctr32_enc_done | |
9282c335 | 1487 | |
1bc4d009 AP |
1488 | aesenc $rndkey1,$inout0 |
1489 | aesenc $rndkey1,$inout1 | |
1bc4d009 AP |
1490 | aesenc $rndkey1,$inout2 |
1491 | aesenc $rndkey1,$inout3 | |
1492 | aesenc $rndkey1,$inout4 | |
1493 | aesenc $rndkey1,$inout5 | |
1494 | aesenc $rndkey1,$inout6 | |
1495 | aesenc $rndkey1,$inout7 | |
6c79faaa | 1496 | $movkey 0xd0-0x80($key),$rndkey1 |
9282c335 | 1497 | |
1bc4d009 AP |
1498 | aesenc $rndkey0,$inout0 |
1499 | aesenc $rndkey0,$inout1 | |
1bc4d009 AP |
1500 | aesenc $rndkey0,$inout2 |
1501 | aesenc $rndkey0,$inout3 | |
1502 | aesenc $rndkey0,$inout4 | |
1503 | aesenc $rndkey0,$inout5 | |
1504 | aesenc $rndkey0,$inout6 | |
1505 | aesenc $rndkey0,$inout7 | |
6c79faaa | 1506 | $movkey 0xe0-0x80($key),$rndkey0 |
d8ba0dc9 | 1507 | jmp .Lctr32_enc_done |
1bc4d009 | 1508 | |
d8ba0dc9 | 1509 | .align 16 |
6c79faaa | 1510 | .Lctr32_enc_done: |
6c79faaa | 1511 | movdqu 0x10($inp),$in1 |
23f6eec7 | 1512 | pxor $rndkey0,$in0 # input^=round[last] |
6c79faaa | 1513 | movdqu 0x20($inp),$in2 |
1bc4d009 | 1514 | pxor $rndkey0,$in1 |
6c79faaa | 1515 | movdqu 0x30($inp),$in3 |
1bc4d009 | 1516 | pxor $rndkey0,$in2 |
6c79faaa | 1517 | movdqu 0x40($inp),$in4 |
1bc4d009 | 1518 | pxor $rndkey0,$in3 |
6c79faaa AP |
1519 | movdqu 0x50($inp),$in5 |
1520 | pxor $rndkey0,$in4 | |
6c79faaa | 1521 | pxor $rndkey0,$in5 |
d8ba0dc9 | 1522 | aesenc $rndkey1,$inout0 |
cd54249c AP |
1523 | aesenc $rndkey1,$inout1 |
1524 | aesenc $rndkey1,$inout2 | |
1525 | aesenc $rndkey1,$inout3 | |
1526 | aesenc $rndkey1,$inout4 | |
1527 | aesenc $rndkey1,$inout5 | |
1bc4d009 AP |
1528 | aesenc $rndkey1,$inout6 |
1529 | aesenc $rndkey1,$inout7 | |
23f6eec7 AP |
1530 | movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] |
1531 | lea 0x80($inp),$inp # $inp+=8*16 | |
6c79faaa | 1532 | |
23f6eec7 AP |
1533 | aesenclast $in0,$inout0 # $inN is inp[N]^round[last] |
1534 | pxor $rndkey0,$rndkey1 # borrowed $rndkey | |
d8ba0dc9 | 1535 | movdqu 0x70-0x80($inp),$in0 |
1bc4d009 | 1536 | aesenclast $in1,$inout1 |
1bc4d009 | 1537 | pxor $rndkey0,$in0 |
6c79faaa | 1538 | movdqa 0x00(%rsp),$in1 # load next counter block |
1bc4d009 | 1539 | aesenclast $in2,$inout2 |
1bc4d009 | 1540 | aesenclast $in3,$inout3 |
d8ba0dc9 | 1541 | movdqa 0x10(%rsp),$in2 |
6c79faaa AP |
1542 | movdqa 0x20(%rsp),$in3 |
1543 | aesenclast $in4,$inout4 | |
6c79faaa | 1544 | aesenclast $in5,$inout5 |
d8ba0dc9 | 1545 | movdqa 0x30(%rsp),$in4 |
6c79faaa AP |
1546 | movdqa 0x40(%rsp),$in5 |
1547 | aesenclast $rndkey1,$inout6 | |
1548 | movdqa 0x50(%rsp),$rndkey0 | |
23f6eec7 | 1549 | $movkey 0x10-0x80($key),$rndkey1#real 1st-round key |
d8ba0dc9 | 1550 | aesenclast $in0,$inout7 |
1bc4d009 | 1551 | |
23f6eec7 | 1552 | movups $inout0,($out) # store 8 output blocks |
6c79faaa | 1553 | movdqa $in1,$inout0 |
9282c335 | 1554 | movups $inout1,0x10($out) |
6c79faaa | 1555 | movdqa $in2,$inout1 |
9282c335 | 1556 | movups $inout2,0x20($out) |
6c79faaa | 1557 | movdqa $in3,$inout2 |
9282c335 | 1558 | movups $inout3,0x30($out) |
6c79faaa | 1559 | movdqa $in4,$inout3 |
9282c335 | 1560 | movups $inout4,0x40($out) |
6c79faaa | 1561 | movdqa $in5,$inout4 |
9282c335 | 1562 | movups $inout5,0x50($out) |
1bc4d009 | 1563 | movdqa $rndkey0,$inout5 |
9282c335 AP |
1564 | movups $inout6,0x60($out) |
1565 | movups $inout7,0x70($out) | |
23f6eec7 AP |
1566 | lea 0x80($out),$out # $out+=8*16 |
1567 | ||
9282c335 | 1568 | sub \$8,$len |
23f6eec7 | 1569 | jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow |
f8501464 | 1570 | |
46f4e1be | 1571 | add \$8,$len # restore real remaining $len |
23f6eec7 | 1572 | jz .Lctr32_done # done if ($len==0) |
6c79faaa | 1573 | lea -0x80($key),$key |
f8501464 AP |
1574 | |
1575 | .Lctr32_tail: | |
23f6eec7 | 1576 | # note that at this point $inout0..5 are populated with |
609b0852 | 1577 | # counter values xor-ed with 0-round key |
6c79faaa | 1578 | lea 16($key),$key |
f8501464 | 1579 | cmp \$4,$len |
b4a9d5bf AP |
1580 | jb .Lctr32_loop3 |
1581 | je .Lctr32_loop4 | |
f8501464 | 1582 | |
23f6eec7 | 1583 | # if ($len>4) compute 7 E(counter) |
d8ba0dc9 | 1584 | shl \$4,$rounds |
6c79faaa | 1585 | movdqa 0x60(%rsp),$inout6 |
b4a9d5bf | 1586 | pxor $inout7,$inout7 |
f8501464 | 1587 | |
6c79faaa AP |
1588 | $movkey 16($key),$rndkey0 |
1589 | aesenc $rndkey1,$inout0 | |
6c79faaa | 1590 | aesenc $rndkey1,$inout1 |
23f6eec7 | 1591 | lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter |
d8ba0dc9 | 1592 | neg %rax |
6c79faaa | 1593 | aesenc $rndkey1,$inout2 |
23f6eec7 | 1594 | add \$16,%rax # prepare for .Lenc_loop8_enter |
b4a9d5bf | 1595 | movups ($inp),$in0 |
d8ba0dc9 | 1596 | aesenc $rndkey1,$inout3 |
6c79faaa | 1597 | aesenc $rndkey1,$inout4 |
23f6eec7 | 1598 | movups 0x10($inp),$in1 # pre-load input |
b4a9d5bf | 1599 | movups 0x20($inp),$in2 |
d8ba0dc9 | 1600 | aesenc $rndkey1,$inout5 |
6c79faaa | 1601 | aesenc $rndkey1,$inout6 |
f8501464 | 1602 | |
6c79faaa | 1603 | call .Lenc_loop8_enter |
f8501464 | 1604 | |
73325b22 AP |
1605 | movdqu 0x30($inp),$in3 |
1606 | pxor $in0,$inout0 | |
1607 | movdqu 0x40($inp),$in0 | |
1608 | pxor $in1,$inout1 | |
23f6eec7 | 1609 | movdqu $inout0,($out) # store output |
73325b22 AP |
1610 | pxor $in2,$inout2 |
1611 | movdqu $inout1,0x10($out) | |
1612 | pxor $in3,$inout3 | |
1613 | movdqu $inout2,0x20($out) | |
1614 | pxor $in0,$inout4 | |
1615 | movdqu $inout3,0x30($out) | |
1616 | movdqu $inout4,0x40($out) | |
6c79faaa | 1617 | cmp \$6,$len |
23f6eec7 | 1618 | jb .Lctr32_done # $len was 5, stop store |
9282c335 | 1619 | |
6c79faaa AP |
1620 | movups 0x50($inp),$in1 |
1621 | xorps $in1,$inout5 | |
1622 | movups $inout5,0x50($out) | |
23f6eec7 | 1623 | je .Lctr32_done # $len was 6, stop store |
9282c335 | 1624 | |
6c79faaa AP |
1625 | movups 0x60($inp),$in2 |
1626 | xorps $in2,$inout6 | |
1627 | movups $inout6,0x60($out) | |
23f6eec7 | 1628 | jmp .Lctr32_done # $len was 7, stop store |
f8501464 | 1629 | |
6c79faaa AP |
1630 | .align 32 |
1631 | .Lctr32_loop4: | |
1632 | aesenc $rndkey1,$inout0 | |
1633 | lea 16($key),$key | |
d8ba0dc9 | 1634 | dec $rounds |
6c79faaa AP |
1635 | aesenc $rndkey1,$inout1 |
1636 | aesenc $rndkey1,$inout2 | |
1637 | aesenc $rndkey1,$inout3 | |
1638 | $movkey ($key),$rndkey1 | |
6c79faaa AP |
1639 | jnz .Lctr32_loop4 |
1640 | aesenclast $rndkey1,$inout0 | |
1641 | aesenclast $rndkey1,$inout1 | |
23f6eec7 | 1642 | movups ($inp),$in0 # load input |
b4a9d5bf | 1643 | movups 0x10($inp),$in1 |
6c79faaa AP |
1644 | aesenclast $rndkey1,$inout2 |
1645 | aesenclast $rndkey1,$inout3 | |
d8ba0dc9 | 1646 | movups 0x20($inp),$in2 |
b4a9d5bf AP |
1647 | movups 0x30($inp),$in3 |
1648 | ||
1649 | xorps $in0,$inout0 | |
23f6eec7 | 1650 | movups $inout0,($out) # store output |
b4a9d5bf AP |
1651 | xorps $in1,$inout1 |
1652 | movups $inout1,0x10($out) | |
73325b22 AP |
1653 | pxor $in2,$inout2 |
1654 | movdqu $inout2,0x20($out) | |
1655 | pxor $in3,$inout3 | |
1656 | movdqu $inout3,0x30($out) | |
23f6eec7 | 1657 | jmp .Lctr32_done # $len was 4, stop store |
b4a9d5bf AP |
1658 | |
1659 | .align 32 | |
1660 | .Lctr32_loop3: | |
1661 | aesenc $rndkey1,$inout0 | |
1662 | lea 16($key),$key | |
d8ba0dc9 | 1663 | dec $rounds |
b4a9d5bf AP |
1664 | aesenc $rndkey1,$inout1 |
1665 | aesenc $rndkey1,$inout2 | |
1666 | $movkey ($key),$rndkey1 | |
b4a9d5bf AP |
1667 | jnz .Lctr32_loop3 |
1668 | aesenclast $rndkey1,$inout0 | |
1669 | aesenclast $rndkey1,$inout1 | |
1670 | aesenclast $rndkey1,$inout2 | |
6c79faaa | 1671 | |
23f6eec7 | 1672 | movups ($inp),$in0 # load input |
9282c335 | 1673 | xorps $in0,$inout0 |
23f6eec7 | 1674 | movups $inout0,($out) # store output |
6c79faaa | 1675 | cmp \$2,$len |
23f6eec7 | 1676 | jb .Lctr32_done # $len was 1, stop store |
f8501464 | 1677 | |
6c79faaa | 1678 | movups 0x10($inp),$in1 |
9282c335 | 1679 | xorps $in1,$inout1 |
9282c335 | 1680 | movups $inout1,0x10($out) |
23f6eec7 | 1681 | je .Lctr32_done # $len was 2, stop store |
f8501464 | 1682 | |
6c79faaa | 1683 | movups 0x20($inp),$in2 |
9282c335 | 1684 | xorps $in2,$inout2 |
23f6eec7 | 1685 | movups $inout2,0x20($out) # $len was 3, stop store |
9282c335 | 1686 | |
f8501464 | 1687 | .Lctr32_done: |
46f4e1be | 1688 | xorps %xmm0,%xmm0 # clear register bank |
23f6eec7 AP |
1689 | xor $key0,$key0 |
1690 | pxor %xmm1,%xmm1 | |
1691 | pxor %xmm2,%xmm2 | |
1692 | pxor %xmm3,%xmm3 | |
1693 | pxor %xmm4,%xmm4 | |
1694 | pxor %xmm5,%xmm5 | |
1695 | ___ | |
1696 | $code.=<<___ if (!$win64); | |
1697 | pxor %xmm6,%xmm6 | |
1698 | pxor %xmm7,%xmm7 | |
1699 | movaps %xmm0,0x00(%rsp) # clear stack | |
1700 | pxor %xmm8,%xmm8 | |
1701 | movaps %xmm0,0x10(%rsp) | |
1702 | pxor %xmm9,%xmm9 | |
1703 | movaps %xmm0,0x20(%rsp) | |
1704 | pxor %xmm10,%xmm10 | |
1705 | movaps %xmm0,0x30(%rsp) | |
1706 | pxor %xmm11,%xmm11 | |
1707 | movaps %xmm0,0x40(%rsp) | |
1708 | pxor %xmm12,%xmm12 | |
1709 | movaps %xmm0,0x50(%rsp) | |
1710 | pxor %xmm13,%xmm13 | |
1711 | movaps %xmm0,0x60(%rsp) | |
1712 | pxor %xmm14,%xmm14 | |
1713 | movaps %xmm0,0x70(%rsp) | |
1714 | pxor %xmm15,%xmm15 | |
f8501464 AP |
1715 | ___ |
1716 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1717 | movaps -0xa8($key_),%xmm6 |
1718 | movaps %xmm0,-0xa8($key_) # clear stack | |
1719 | movaps -0x98($key_),%xmm7 | |
1720 | movaps %xmm0,-0x98($key_) | |
1721 | movaps -0x88($key_),%xmm8 | |
1722 | movaps %xmm0,-0x88($key_) | |
1723 | movaps -0x78($key_),%xmm9 | |
1724 | movaps %xmm0,-0x78($key_) | |
1725 | movaps -0x68($key_),%xmm10 | |
1726 | movaps %xmm0,-0x68($key_) | |
1727 | movaps -0x58($key_),%xmm11 | |
1728 | movaps %xmm0,-0x58($key_) | |
1729 | movaps -0x48($key_),%xmm12 | |
1730 | movaps %xmm0,-0x48($key_) | |
1731 | movaps -0x38($key_),%xmm13 | |
1732 | movaps %xmm0,-0x38($key_) | |
1733 | movaps -0x28($key_),%xmm14 | |
1734 | movaps %xmm0,-0x28($key_) | |
1735 | movaps -0x18($key_),%xmm15 | |
1736 | movaps %xmm0,-0x18($key_) | |
23f6eec7 AP |
1737 | movaps %xmm0,0x00(%rsp) |
1738 | movaps %xmm0,0x10(%rsp) | |
1739 | movaps %xmm0,0x20(%rsp) | |
1740 | movaps %xmm0,0x30(%rsp) | |
1741 | movaps %xmm0,0x40(%rsp) | |
1742 | movaps %xmm0,0x50(%rsp) | |
1743 | movaps %xmm0,0x60(%rsp) | |
1744 | movaps %xmm0,0x70(%rsp) | |
f8501464 AP |
1745 | ___ |
1746 | $code.=<<___; | |
384e6de4 | 1747 | mov -8($key_),%rbp |
b84460ad | 1748 | .cfi_restore %rbp |
384e6de4 | 1749 | lea ($key_),%rsp |
b84460ad | 1750 | .cfi_def_cfa_register %rsp |
6c79faaa | 1751 | .Lctr32_epilogue: |
f8501464 | 1752 | ret |
b84460ad | 1753 | .cfi_endproc |
f8501464 AP |
1754 | .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks |
1755 | ___ | |
1756 | } | |
1757 | \f | |
1758 | ###################################################################### | |
1759 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | |
1760 | # const AES_KEY *key1, const AES_KEY *key2, | |
1761 | # const unsigned char iv[16]); | |
1762 | # | |
1763 | { | |
# Register and stack-frame assignments used by the XTS code generators below.
my @tweak=map("%xmm$_",(10..15));	# six tweak registers, %xmm10..%xmm15
# $twtmp names the same register as tweak slot 4 (%xmm14); written as a
# scalar element rather than a one-element slice.
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",$tweak[4]);
# $ivp and $len_ are both %r9: the IV pointer is read once at entry, after
# which the register holds the backup copy of the length.
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
# 0x70 bytes of scratch (0x00..0x60 off %rsp), plus 160 bytes on Win64 for
# the ten 16-byte %xmm6..%xmm15 save slots.
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";	# override so that we can use %r11 as FP
f8501464 AP |
1769 | |
# Emit aesni_xts_encrypt (see the prototype comment above this scope).
#
# Outline of the generated routine:
#   1. Prologue: %r11 keeps the entry %rsp as frame pointer, %rbp is pushed,
#      $frame_size bytes are reserved and %rsp is aligned down to 16.  On
#      Win64 %xmm6..%xmm15 are saved below %r11.
#   2. The IV at ($ivp) is run through aesni_generate1("enc") with $key2 to
#      produce the initial tweak.
#   3. tweak slots 0..4 are filled with successive tweaks, each xor-ed with
#      round[0]; slot 5 carries the running "next" tweak.  round[0]^round[last]
#      is parked at 0x60(%rsp).
#   4. .Lxts_enc_grandloop handles six blocks per iteration, interleaving the
#      AES rounds with computation of the next six tweaks.
#   5. .Lxts_enc_short handles the remaining 1..5 whole blocks via
#      aesni_generate1 or the _aesni_encrypt2/3/4/6 helpers (all defined
#      elsewhere in this file).
#   6. .Lxts_enc_steal handles a trailing partial block (len%16 != 0).
#   7. .Lxts_enc_ret zeroes the xmm registers and the stack scratch area,
#      restores %xmm6..%xmm15 on Win64, then %rbp and %rsp.
$code.=<<___;
.globl	aesni_xts_encrypt
.type	aesni_xts_encrypt,\@function,6
.align	16
aesni_xts_encrypt:
.cfi_startproc
	lea	(%rsp),%r11			# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6..%xmm15 in the slots below the frame pointer.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_enc_body:
___
# $key2 is used here (rather than a literal register) to match the
# aesni_generate1 call below and the decrypt routine.
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # alternative tweak calculation algorithm is based on suggestions
    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
    # and should help in the future...
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
# Fifth tweak, then either the six-block main loop or the short path.
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_enc_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_enc_grandloop

.align	32
.Lxts_enc_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesenc	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesenc	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesenc	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesenc	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesenc	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesenc	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesenc	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_enc_loop6
.align	32
.Lxts_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_enc_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesenc	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesenc	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesenc	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesenc	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesenc	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesenc	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesenc	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesenc	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesenc	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesenc	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesenc	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesenc	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesenc	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesenc	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesenc	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesenc	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	aesenclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesenclast	`16*1`(%rsp),$inout1
	aesenclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesenclast	`16*3`(%rsp),$inout3
	aesenclast	`16*4`(%rsp),$inout4
	aesenclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_enc_short:
	# at this point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_enc_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[1]
	cmp	\$0x20,$len
	jb	.Lxts_enc_one			# $len is 1*16
	pxor	$rndkey0,@tweak[2]
	je	.Lxts_enc_two			# $len is 2*16

	pxor	$rndkey0,@tweak[3]
	cmp	\$0x40,$len
	jb	.Lxts_enc_three			# $len is 3*16
	pxor	$rndkey0,@tweak[4]
	je	.Lxts_enc_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4
	pxor	$inout5,$inout5

	call	_aesni_encrypt6

	xorps	@tweak[0],$inout0
	movdqa	@tweak[5],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	# single remaining block: one pass through aesni_generate1 with key1
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_encrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_encrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_encrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_enc_ret
	mov	$len_,$len

	# byte loop over the trailing partial block - each input byte replaces
	# a byte of the previous output block, and the byte it displaced is
	# written out as the final partial block
.Lxts_enc_steal:
	movzb	($inp),%eax			# borrow $rounds ...
	movzb	-16($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,-16($out)
	mov	%cl,0($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_enc_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	-16($out),$inout0
	xorps	@tweak[0],$inout0
___
	# re-encrypt the block that the byte loop above rebuilt at -16($out)
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,-16($out)

.Lxts_enc_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero the remaining xmm registers and the 0x00..0x60 stack slots.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload the saved %xmm6..%xmm15, zeroing each save slot and then
# the 0x00..0x60 stack slots.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
# Epilogue: restore %rbp from just below the frame pointer, then %rsp.
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___
6c83629b AP |
2252 | |
2253 | $code.=<<___; | |
f8501464 AP |
2254 | .globl aesni_xts_decrypt |
2255 | .type aesni_xts_decrypt,\@function,6 | |
6c83629b | 2256 | .align 16 |
f8501464 | 2257 | aesni_xts_decrypt: |
b84460ad | 2258 | .cfi_startproc |
384e6de4 | 2259 | lea (%rsp),%r11 # frame pointer |
b84460ad | 2260 | .cfi_def_cfa_register %r11 |
6a40ebe8 | 2261 | push %rbp |
b84460ad | 2262 | .cfi_push %rbp |
6a40ebe8 AP |
2263 | sub \$$frame_size,%rsp |
2264 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
6c83629b AP |
2265 | ___ |
2266 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2267 | movaps %xmm6,-0xa8(%r11) # offload everything |
2268 | movaps %xmm7,-0x98(%r11) | |
2269 | movaps %xmm8,-0x88(%r11) | |
2270 | movaps %xmm9,-0x78(%r11) | |
2271 | movaps %xmm10,-0x68(%r11) | |
2272 | movaps %xmm11,-0x58(%r11) | |
2273 | movaps %xmm12,-0x48(%r11) | |
2274 | movaps %xmm13,-0x38(%r11) | |
2275 | movaps %xmm14,-0x28(%r11) | |
2276 | movaps %xmm15,-0x18(%r11) | |
f8501464 | 2277 | .Lxts_dec_body: |
6c83629b AP |
2278 | ___ |
2279 | $code.=<<___; | |
d8ba0dc9 | 2280 | movups ($ivp),$inout0 # load clear-text tweak |
f8501464 AP |
2281 | mov 240($key2),$rounds # key2->rounds |
2282 | mov 240($key),$rnds_ # key1->rounds | |
2283 | ___ | |
2284 | # generate the tweak | |
d8ba0dc9 | 2285 | &aesni_generate1("enc",$key2,$rounds,$inout0); |
f8501464 AP |
2286 | $code.=<<___; |
2287 | xor %eax,%eax # if ($len%16) len-=16; | |
2288 | test \$15,$len | |
2289 | setnz %al | |
2290 | shl \$4,%rax | |
2291 | sub %rax,$len | |
2292 | ||
36df342f | 2293 | $movkey ($key),$rndkey0 # zero round key |
f8501464 AP |
2294 | mov $key,$key_ # backup $key |
2295 | mov $rnds_,$rounds # backup $rounds | |
36df342f | 2296 | shl \$4,$rnds_ |
f8501464 AP |
2297 | mov $len,$len_ # backup $len |
2298 | and \$-16,$len | |
6c83629b | 2299 | |
36df342f | 2300 | $movkey 16($key,$rnds_),$rndkey1 # last round key |
36df342f | 2301 | |
f8501464 | 2302 | movdqa .Lxts_magic(%rip),$twmask |
d8ba0dc9 AP |
2303 | movdqa $inout0,@tweak[5] |
2304 | pshufd \$0x5f,$inout0,$twres | |
36df342f | 2305 | pxor $rndkey0,$rndkey1 |
f8501464 AP |
2306 | ___ |
2307 | for ($i=0;$i<4;$i++) { | |
2308 | $code.=<<___; | |
36df342f AP |
2309 | movdqa $twres,$twtmp |
2310 | paddd $twres,$twres | |
f8501464 | 2311 | movdqa @tweak[5],@tweak[$i] |
36df342f AP |
2312 | psrad \$31,$twtmp # broadcast upper bits |
2313 | paddq @tweak[5],@tweak[5] | |
2314 | pand $twmask,$twtmp | |
2315 | pxor $rndkey0,@tweak[$i] | |
2316 | pxor $twtmp,@tweak[5] | |
f8501464 AP |
2317 | ___ |
2318 | } | |
2319 | $code.=<<___; | |
36df342f AP |
2320 | movdqa @tweak[5],@tweak[4] |
2321 | psrad \$31,$twres | |
2322 | paddq @tweak[5],@tweak[5] | |
2323 | pand $twmask,$twres | |
2324 | pxor $rndkey0,@tweak[4] | |
2325 | pxor $twres,@tweak[5] | |
2326 | movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] | |
2327 | ||
f8501464 | 2328 | sub \$16*6,$len |
23f6eec7 | 2329 | jc .Lxts_dec_short # if $len-=6*16 borrowed |
6c83629b | 2330 | |
d8ba0dc9 AP |
2331 | mov \$16+96,$rounds |
2332 | lea 32($key_,$rnds_),$key # end of key schedule | |
2333 | sub %r10,%rax # twisted $rounds | |
36df342f | 2334 | $movkey 16($key_),$rndkey1 |
d8ba0dc9 | 2335 | mov %rax,%r10 # backup twisted $rounds |
36df342f | 2336 | lea .Lxts_magic(%rip),%r8 |
f8501464 | 2337 | jmp .Lxts_dec_grandloop |
6c83629b | 2338 | |
36df342f | 2339 | .align 32 |
f8501464 | 2340 | .Lxts_dec_grandloop: |
f8501464 | 2341 | movdqu `16*0`($inp),$inout0 # load input |
36df342f | 2342 | movdqa $rndkey0,$twmask |
f8501464 | 2343 | movdqu `16*1`($inp),$inout1 |
23f6eec7 | 2344 | pxor @tweak[0],$inout0 # input^=tweak^round[0] |
f8501464 | 2345 | movdqu `16*2`($inp),$inout2 |
f8501464 | 2346 | pxor @tweak[1],$inout1 |
36df342f AP |
2347 | aesdec $rndkey1,$inout0 |
2348 | movdqu `16*3`($inp),$inout3 | |
f8501464 | 2349 | pxor @tweak[2],$inout2 |
36df342f AP |
2350 | aesdec $rndkey1,$inout1 |
2351 | movdqu `16*4`($inp),$inout4 | |
f8501464 | 2352 | pxor @tweak[3],$inout3 |
36df342f AP |
2353 | aesdec $rndkey1,$inout2 |
2354 | movdqu `16*5`($inp),$inout5 | |
2355 | pxor @tweak[5],$twmask # round[0]^=tweak[5] | |
2356 | movdqa 0x60(%rsp),$twres # load round[0]^round[last] | |
f8501464 | 2357 | pxor @tweak[4],$inout4 |
36df342f AP |
2358 | aesdec $rndkey1,$inout3 |
2359 | $movkey 32($key_),$rndkey0 | |
2360 | lea `16*6`($inp),$inp | |
2361 | pxor $twmask,$inout5 | |
f8501464 | 2362 | |
46f4e1be | 2363 | pxor $twres,@tweak[0] # calculate tweaks^round[last] |
f8501464 | 2364 | aesdec $rndkey1,$inout4 |
36df342f AP |
2365 | pxor $twres,@tweak[1] |
2366 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key | |
f8501464 | 2367 | aesdec $rndkey1,$inout5 |
36df342f | 2368 | $movkey 48($key_),$rndkey1 |
d8ba0dc9 | 2369 | pxor $twres,@tweak[2] |
6c83629b | 2370 | |
36df342f | 2371 | aesdec $rndkey0,$inout0 |
d8ba0dc9 | 2372 | pxor $twres,@tweak[3] |
36df342f AP |
2373 | movdqa @tweak[1],`16*1`(%rsp) |
2374 | aesdec $rndkey0,$inout1 | |
d8ba0dc9 | 2375 | pxor $twres,@tweak[4] |
36df342f AP |
2376 | movdqa @tweak[2],`16*2`(%rsp) |
2377 | aesdec $rndkey0,$inout2 | |
36df342f AP |
2378 | aesdec $rndkey0,$inout3 |
2379 | pxor $twres,$twmask | |
2380 | movdqa @tweak[4],`16*4`(%rsp) | |
2381 | aesdec $rndkey0,$inout4 | |
36df342f AP |
2382 | aesdec $rndkey0,$inout5 |
2383 | $movkey 64($key_),$rndkey0 | |
d8ba0dc9 | 2384 | movdqa $twmask,`16*5`(%rsp) |
36df342f AP |
2385 | pshufd \$0x5f,@tweak[5],$twres |
2386 | jmp .Lxts_dec_loop6 | |
2387 | .align 32 | |
f8501464 AP |
2388 | .Lxts_dec_loop6: |
2389 | aesdec $rndkey1,$inout0 | |
2390 | aesdec $rndkey1,$inout1 | |
f8501464 AP |
2391 | aesdec $rndkey1,$inout2 |
2392 | aesdec $rndkey1,$inout3 | |
2393 | aesdec $rndkey1,$inout4 | |
2394 | aesdec $rndkey1,$inout5 | |
d8ba0dc9 AP |
2395 | $movkey -64($key,%rax),$rndkey1 |
2396 | add \$32,%rax | |
36df342f | 2397 | |
f8501464 AP |
2398 | aesdec $rndkey0,$inout0 |
2399 | aesdec $rndkey0,$inout1 | |
f8501464 AP |
2400 | aesdec $rndkey0,$inout2 |
2401 | aesdec $rndkey0,$inout3 | |
2402 | aesdec $rndkey0,$inout4 | |
2403 | aesdec $rndkey0,$inout5 | |
d8ba0dc9 | 2404 | $movkey -80($key,%rax),$rndkey0 |
f8501464 AP |
2405 | jnz .Lxts_dec_loop6 |
2406 | ||
23f6eec7 | 2407 | movdqa (%r8),$twmask # start calculating next tweak |
36df342f AP |
2408 | movdqa $twres,$twtmp |
2409 | paddd $twres,$twres | |
f8501464 | 2410 | aesdec $rndkey1,$inout0 |
36df342f AP |
2411 | paddq @tweak[5],@tweak[5] |
2412 | psrad \$31,$twtmp | |
f8501464 | 2413 | aesdec $rndkey1,$inout1 |
36df342f AP |
2414 | pand $twmask,$twtmp |
2415 | $movkey ($key_),@tweak[0] # load round[0] | |
f8501464 | 2416 | aesdec $rndkey1,$inout2 |
f8501464 AP |
2417 | aesdec $rndkey1,$inout3 |
2418 | aesdec $rndkey1,$inout4 | |
d8ba0dc9 | 2419 | pxor $twtmp,@tweak[5] |
36df342f | 2420 | movaps @tweak[0],@tweak[1] # copy round[0] |
f8501464 | 2421 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 2422 | $movkey -64($key),$rndkey1 |
f8501464 | 2423 | |
36df342f | 2424 | movdqa $twres,$twtmp |
f8501464 | 2425 | aesdec $rndkey0,$inout0 |
d8ba0dc9 | 2426 | paddd $twres,$twres |
36df342f | 2427 | pxor @tweak[5],@tweak[0] |
f8501464 | 2428 | aesdec $rndkey0,$inout1 |
d8ba0dc9 | 2429 | psrad \$31,$twtmp |
36df342f | 2430 | paddq @tweak[5],@tweak[5] |
f8501464 | 2431 | aesdec $rndkey0,$inout2 |
f8501464 | 2432 | aesdec $rndkey0,$inout3 |
d8ba0dc9 | 2433 | pand $twmask,$twtmp |
36df342f | 2434 | movaps @tweak[1],@tweak[2] |
d8ba0dc9 AP |
2435 | aesdec $rndkey0,$inout4 |
2436 | pxor $twtmp,@tweak[5] | |
2437 | movdqa $twres,$twtmp | |
f8501464 | 2438 | aesdec $rndkey0,$inout5 |
d8ba0dc9 | 2439 | $movkey -48($key),$rndkey0 |
f8501464 | 2440 | |
36df342f | 2441 | paddd $twres,$twres |
f8501464 | 2442 | aesdec $rndkey1,$inout0 |
36df342f AP |
2443 | pxor @tweak[5],@tweak[1] |
2444 | psrad \$31,$twtmp | |
f8501464 | 2445 | aesdec $rndkey1,$inout1 |
36df342f AP |
2446 | paddq @tweak[5],@tweak[5] |
2447 | pand $twmask,$twtmp | |
f8501464 | 2448 | aesdec $rndkey1,$inout2 |
f8501464 | 2449 | aesdec $rndkey1,$inout3 |
d8ba0dc9 | 2450 | movdqa @tweak[3],`16*3`(%rsp) |
36df342f | 2451 | pxor $twtmp,@tweak[5] |
f8501464 | 2452 | aesdec $rndkey1,$inout4 |
36df342f | 2453 | movaps @tweak[2],@tweak[3] |
d8ba0dc9 | 2454 | movdqa $twres,$twtmp |
f8501464 | 2455 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 2456 | $movkey -32($key),$rndkey1 |
f8501464 | 2457 | |
36df342f AP |
2458 | paddd $twres,$twres |
2459 | aesdec $rndkey0,$inout0 | |
2460 | pxor @tweak[5],@tweak[2] | |
2461 | psrad \$31,$twtmp | |
2462 | aesdec $rndkey0,$inout1 | |
2463 | paddq @tweak[5],@tweak[5] | |
2464 | pand $twmask,$twtmp | |
2465 | aesdec $rndkey0,$inout2 | |
2466 | aesdec $rndkey0,$inout3 | |
36df342f | 2467 | aesdec $rndkey0,$inout4 |
d8ba0dc9 | 2468 | pxor $twtmp,@tweak[5] |
36df342f AP |
2469 | movaps @tweak[3],@tweak[4] |
2470 | aesdec $rndkey0,$inout5 | |
2471 | ||
2472 | movdqa $twres,$rndkey0 | |
2473 | paddd $twres,$twres | |
2474 | aesdec $rndkey1,$inout0 | |
2475 | pxor @tweak[5],@tweak[3] | |
2476 | psrad \$31,$rndkey0 | |
2477 | aesdec $rndkey1,$inout1 | |
2478 | paddq @tweak[5],@tweak[5] | |
2479 | pand $twmask,$rndkey0 | |
2480 | aesdec $rndkey1,$inout2 | |
2481 | aesdec $rndkey1,$inout3 | |
2482 | pxor $rndkey0,@tweak[5] | |
2483 | $movkey ($key_),$rndkey0 | |
2484 | aesdec $rndkey1,$inout4 | |
2485 | aesdec $rndkey1,$inout5 | |
2486 | $movkey 16($key_),$rndkey1 | |
2487 | ||
2488 | pxor @tweak[5],@tweak[4] | |
36df342f | 2489 | aesdeclast `16*0`(%rsp),$inout0 |
d8ba0dc9 | 2490 | psrad \$31,$twres |
36df342f | 2491 | paddq @tweak[5],@tweak[5] |
36df342f AP |
2492 | aesdeclast `16*1`(%rsp),$inout1 |
2493 | aesdeclast `16*2`(%rsp),$inout2 | |
d8ba0dc9 AP |
2494 | pand $twmask,$twres |
2495 | mov %r10,%rax # restore $rounds | |
36df342f AP |
2496 | aesdeclast `16*3`(%rsp),$inout3 |
2497 | aesdeclast `16*4`(%rsp),$inout4 | |
2498 | aesdeclast `16*5`(%rsp),$inout5 | |
d8ba0dc9 | 2499 | pxor $twres,@tweak[5] |
f8501464 | 2500 | |
23f6eec7 AP |
2501 | lea `16*6`($out),$out # $out+=6*16 |
2502 | movups $inout0,`-16*6`($out) # store 6 output blocks | |
36df342f AP |
2503 | movups $inout1,`-16*5`($out) |
2504 | movups $inout2,`-16*4`($out) | |
2505 | movups $inout3,`-16*3`($out) | |
2506 | movups $inout4,`-16*2`($out) | |
2507 | movups $inout5,`-16*1`($out) | |
f8501464 | 2508 | sub \$16*6,$len |
23f6eec7 | 2509 | jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow |
f8501464 | 2510 | |
d8ba0dc9 AP |
2511 | mov \$16+96,$rounds |
2512 | sub $rnds_,$rounds | |
f8501464 | 2513 | mov $key_,$key # restore $key |
d8ba0dc9 | 2514 | shr \$4,$rounds # restore original value |
f8501464 AP |
2515 | |
2516 | .Lxts_dec_short: | |
23f6eec7 | 2517 | # at this point @tweak[0..5] are populated with tweak values |
d8ba0dc9 | 2518 | mov $rounds,$rnds_ # backup $rounds |
36df342f AP |
2519 | pxor $rndkey0,@tweak[0] |
2520 | pxor $rndkey0,@tweak[1] | |
23f6eec7 AP |
2521 | add \$16*6,$len # restore real remaining $len |
2522 | jz .Lxts_dec_done # done if ($len==0) | |
d7d119a3 | 2523 | |
36df342f | 2524 | pxor $rndkey0,@tweak[2] |
f8501464 | 2525 | cmp \$0x20,$len |
23f6eec7 | 2526 | jb .Lxts_dec_one # $len is 1*16 |
36df342f | 2527 | pxor $rndkey0,@tweak[3] |
23f6eec7 | 2528 | je .Lxts_dec_two # $len is 2*16 |
d7d119a3 | 2529 | |
36df342f | 2530 | pxor $rndkey0,@tweak[4] |
f8501464 | 2531 | cmp \$0x40,$len |
23f6eec7 AP |
2532 | jb .Lxts_dec_three # $len is 3*16 |
2533 | je .Lxts_dec_four # $len is 4*16 | |
f8501464 | 2534 | |
23f6eec7 | 2535 | movdqu ($inp),$inout0 # $len is 5*16 |
36df342f | 2536 | movdqu 16*1($inp),$inout1 |
f8501464 AP |
2537 | movdqu 16*2($inp),$inout2 |
2538 | pxor @tweak[0],$inout0 | |
2539 | movdqu 16*3($inp),$inout3 | |
2540 | pxor @tweak[1],$inout1 | |
2541 | movdqu 16*4($inp),$inout4 | |
23f6eec7 | 2542 | lea 16*5($inp),$inp # $inp+=5*16 |
f8501464 AP |
2543 | pxor @tweak[2],$inout2 |
2544 | pxor @tweak[3],$inout3 | |
2545 | pxor @tweak[4],$inout4 | |
2546 | ||
2547 | call _aesni_decrypt6 | |
2548 | ||
2549 | xorps @tweak[0],$inout0 | |
2550 | xorps @tweak[1],$inout1 | |
2551 | xorps @tweak[2],$inout2 | |
23f6eec7 | 2552 | movdqu $inout0,($out) # store 5 output blocks |
f8501464 AP |
2553 | xorps @tweak[3],$inout3 |
2554 | movdqu $inout1,16*1($out) | |
2555 | xorps @tweak[4],$inout4 | |
2556 | movdqu $inout2,16*2($out) | |
2557 | pxor $twtmp,$twtmp | |
2558 | movdqu $inout3,16*3($out) | |
2559 | pcmpgtd @tweak[5],$twtmp | |
2560 | movdqu $inout4,16*4($out) | |
23f6eec7 | 2561 | lea 16*5($out),$out # $out+=5*16 |
f8501464 AP |
2562 | pshufd \$0x13,$twtmp,@tweak[1] # $twres |
2563 | and \$15,$len_ | |
2564 | jz .Lxts_dec_ret | |
2565 | ||
2566 | movdqa @tweak[5],@tweak[0] | |
2567 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | |
2568 | pand $twmask,@tweak[1] # isolate carry and residue | |
2569 | pxor @tweak[5],@tweak[1] | |
2570 | jmp .Lxts_dec_done2 | |
d7d119a3 | 2571 | |
f8501464 AP |
2572 | .align 16 |
2573 | .Lxts_dec_one: | |
2574 | movups ($inp),$inout0 | |
23f6eec7 | 2575 | lea 16*1($inp),$inp # $inp+=1*16 |
f8501464 AP |
2576 | xorps @tweak[0],$inout0 |
2577 | ___ | |
2578 | &aesni_generate1("dec",$key,$rounds); | |
2579 | $code.=<<___; | |
2580 | xorps @tweak[0],$inout0 | |
2581 | movdqa @tweak[1],@tweak[0] | |
23f6eec7 | 2582 | movups $inout0,($out) # store one output block |
f8501464 | 2583 | movdqa @tweak[2],@tweak[1] |
23f6eec7 | 2584 | lea 16*1($out),$out # $out+=1*16 |
f8501464 | 2585 | jmp .Lxts_dec_done |
6c83629b | 2586 | |
f8501464 AP |
2587 | .align 16 |
2588 | .Lxts_dec_two: | |
2589 | movups ($inp),$inout0 | |
2590 | movups 16($inp),$inout1 | |
23f6eec7 | 2591 | lea 32($inp),$inp # $inp+=2*16 |
f8501464 AP |
2592 | xorps @tweak[0],$inout0 |
2593 | xorps @tweak[1],$inout1 | |
6c83629b | 2594 | |
214368ff | 2595 | call _aesni_decrypt2 |
6c83629b | 2596 | |
f8501464 AP |
2597 | xorps @tweak[0],$inout0 |
2598 | movdqa @tweak[2],@tweak[0] | |
2599 | xorps @tweak[1],$inout1 | |
2600 | movdqa @tweak[3],@tweak[1] | |
23f6eec7 | 2601 | movups $inout0,($out) # store 2 output blocks |
f8501464 | 2602 | movups $inout1,16*1($out) |
23f6eec7 | 2603 | lea 16*2($out),$out # $out+=2*16 |
f8501464 | 2604 | jmp .Lxts_dec_done |
6c83629b | 2605 | |
f8501464 AP |
2606 | .align 16 |
2607 | .Lxts_dec_three: | |
2608 | movups ($inp),$inout0 | |
2609 | movups 16*1($inp),$inout1 | |
2610 | movups 16*2($inp),$inout2 | |
23f6eec7 | 2611 | lea 16*3($inp),$inp # $inp+=3*16 |
f8501464 AP |
2612 | xorps @tweak[0],$inout0 |
2613 | xorps @tweak[1],$inout1 | |
2614 | xorps @tweak[2],$inout2 | |
6c83629b | 2615 | |
f8501464 | 2616 | call _aesni_decrypt3 |
6c83629b | 2617 | |
f8501464 AP |
2618 | xorps @tweak[0],$inout0 |
2619 | movdqa @tweak[3],@tweak[0] | |
2620 | xorps @tweak[1],$inout1 | |
36df342f | 2621 | movdqa @tweak[4],@tweak[1] |
f8501464 | 2622 | xorps @tweak[2],$inout2 |
23f6eec7 | 2623 | movups $inout0,($out) # store 3 output blocks |
f8501464 AP |
2624 | movups $inout1,16*1($out) |
2625 | movups $inout2,16*2($out) | |
23f6eec7 | 2626 | lea 16*3($out),$out # $out+=3*16 |
f8501464 | 2627 | jmp .Lxts_dec_done |
6c83629b AP |
2628 | |
2629 | .align 16 | |
f8501464 | 2630 | .Lxts_dec_four: |
36df342f AP |
2631 | movups ($inp),$inout0 |
2632 | movups 16*1($inp),$inout1 | |
f8501464 AP |
2633 | movups 16*2($inp),$inout2 |
2634 | xorps @tweak[0],$inout0 | |
2635 | movups 16*3($inp),$inout3 | |
23f6eec7 | 2636 | lea 16*4($inp),$inp # $inp+=4*16 |
f8501464 AP |
2637 | xorps @tweak[1],$inout1 |
2638 | xorps @tweak[2],$inout2 | |
2639 | xorps @tweak[3],$inout3 | |
2640 | ||
2641 | call _aesni_decrypt4 | |
2642 | ||
36df342f | 2643 | pxor @tweak[0],$inout0 |
f8501464 | 2644 | movdqa @tweak[4],@tweak[0] |
36df342f | 2645 | pxor @tweak[1],$inout1 |
f8501464 | 2646 | movdqa @tweak[5],@tweak[1] |
36df342f | 2647 | pxor @tweak[2],$inout2 |
23f6eec7 | 2648 | movdqu $inout0,($out) # store 4 output blocks |
36df342f AP |
2649 | pxor @tweak[3],$inout3 |
2650 | movdqu $inout1,16*1($out) | |
2651 | movdqu $inout2,16*2($out) | |
2652 | movdqu $inout3,16*3($out) | |
23f6eec7 | 2653 | lea 16*4($out),$out # $out+=4*16 |
f8501464 | 2654 | jmp .Lxts_dec_done |
6c83629b AP |
2655 | |
2656 | .align 16 | |
f8501464 | 2657 | .Lxts_dec_done: |
23f6eec7 | 2658 | and \$15,$len_ # see if $len%16 is 0 |
f8501464 AP |
2659 | jz .Lxts_dec_ret |
2660 | .Lxts_dec_done2: | |
2661 | mov $len_,$len | |
2662 | mov $key_,$key # restore $key | |
2663 | mov $rnds_,$rounds # restore $rounds | |
6c83629b | 2664 | |
f8501464 AP |
2665 | movups ($inp),$inout0 |
2666 | xorps @tweak[1],$inout0 | |
2667 | ___ | |
2668 | &aesni_generate1("dec",$key,$rounds); | |
2669 | $code.=<<___; | |
2670 | xorps @tweak[1],$inout0 | |
2671 | movups $inout0,($out) | |
2672 | ||
2673 | .Lxts_dec_steal: | |
2674 | movzb 16($inp),%eax # borrow $rounds ... | |
2675 | movzb ($out),%ecx # ... and $key | |
2676 | lea 1($inp),$inp | |
2677 | mov %al,($out) | |
2678 | mov %cl,16($out) | |
2679 | lea 1($out),$out | |
2680 | sub \$1,$len | |
2681 | jnz .Lxts_dec_steal | |
2682 | ||
2683 | sub $len_,$out # rewind $out | |
2684 | mov $key_,$key # restore $key | |
2685 | mov $rnds_,$rounds # restore $rounds | |
2686 | ||
2687 | movups ($out),$inout0 | |
2688 | xorps @tweak[0],$inout0 | |
6c83629b | 2689 | ___ |
f8501464 AP |
2690 | &aesni_generate1("dec",$key,$rounds); |
2691 | $code.=<<___; | |
2692 | xorps @tweak[0],$inout0 | |
2693 | movups $inout0,($out) | |
6c83629b | 2694 | |
f8501464 | 2695 | .Lxts_dec_ret: |
23f6eec7 AP |
2696 | xorps %xmm0,%xmm0 # clear register bank |
2697 | pxor %xmm1,%xmm1 | |
2698 | pxor %xmm2,%xmm2 | |
2699 | pxor %xmm3,%xmm3 | |
2700 | pxor %xmm4,%xmm4 | |
2701 | pxor %xmm5,%xmm5 | |
2702 | ___ | |
2703 | $code.=<<___ if (!$win64); | |
2704 | pxor %xmm6,%xmm6 | |
2705 | pxor %xmm7,%xmm7 | |
2706 | movaps %xmm0,0x00(%rsp) # clear stack | |
2707 | pxor %xmm8,%xmm8 | |
2708 | movaps %xmm0,0x10(%rsp) | |
2709 | pxor %xmm9,%xmm9 | |
2710 | movaps %xmm0,0x20(%rsp) | |
2711 | pxor %xmm10,%xmm10 | |
2712 | movaps %xmm0,0x30(%rsp) | |
2713 | pxor %xmm11,%xmm11 | |
2714 | movaps %xmm0,0x40(%rsp) | |
2715 | pxor %xmm12,%xmm12 | |
2716 | movaps %xmm0,0x50(%rsp) | |
2717 | pxor %xmm13,%xmm13 | |
2718 | movaps %xmm0,0x60(%rsp) | |
2719 | pxor %xmm14,%xmm14 | |
2720 | pxor %xmm15,%xmm15 | |
f8501464 | 2721 | ___ |
6c83629b | 2722 | $code.=<<___ if ($win64); |
384e6de4 AP |
2723 | movaps -0xa8(%r11),%xmm6 |
2724 | movaps %xmm0,-0xa8(%r11) # clear stack | |
2725 | movaps -0x98(%r11),%xmm7 | |
2726 | movaps %xmm0,-0x98(%r11) | |
2727 | movaps -0x88(%r11),%xmm8 | |
2728 | movaps %xmm0,-0x88(%r11) | |
2729 | movaps -0x78(%r11),%xmm9 | |
2730 | movaps %xmm0,-0x78(%r11) | |
2731 | movaps -0x68(%r11),%xmm10 | |
2732 | movaps %xmm0,-0x68(%r11) | |
2733 | movaps -0x58(%r11),%xmm11 | |
2734 | movaps %xmm0,-0x58(%r11) | |
2735 | movaps -0x48(%r11),%xmm12 | |
2736 | movaps %xmm0,-0x48(%r11) | |
2737 | movaps -0x38(%r11),%xmm13 | |
2738 | movaps %xmm0,-0x38(%r11) | |
2739 | movaps -0x28(%r11),%xmm14 | |
2740 | movaps %xmm0,-0x28(%r11) | |
2741 | movaps -0x18(%r11),%xmm15 | |
2742 | movaps %xmm0,-0x18(%r11) | |
23f6eec7 AP |
2743 | movaps %xmm0,0x00(%rsp) |
2744 | movaps %xmm0,0x10(%rsp) | |
2745 | movaps %xmm0,0x20(%rsp) | |
2746 | movaps %xmm0,0x30(%rsp) | |
2747 | movaps %xmm0,0x40(%rsp) | |
2748 | movaps %xmm0,0x50(%rsp) | |
2749 | movaps %xmm0,0x60(%rsp) | |
6c83629b AP |
2750 | ___ |
2751 | $code.=<<___; | |
384e6de4 | 2752 | mov -8(%r11),%rbp |
b84460ad | 2753 | .cfi_restore %rbp |
384e6de4 | 2754 | lea (%r11),%rsp |
b84460ad | 2755 | .cfi_def_cfa_register %rsp |
f8501464 | 2756 | .Lxts_dec_epilogue: |
6c83629b | 2757 | ret |
b84460ad | 2758 | .cfi_endproc |
f8501464 | 2759 | .size aesni_xts_decrypt,.-aesni_xts_decrypt |
6c83629b | 2760 | ___ |
bd30091c AP |
2761 | } |
2762 | \f | |
2763 | ###################################################################### | |
2764 | # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, | |
2765 | # const AES_KEY *key, unsigned int start_block_num, | |
2766 | # unsigned char offset_i[16], const unsigned char L_[][16], | |
2767 | # unsigned char checksum[16]); | |
2768 | # | |
2769 | { | |
2770 | my @offset=map("%xmm$_",(10..15)); | |
2771 | my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); | |
2772 | my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments | |
2773 | my ($L_p,$checksum_p) = ("%rbx","%rbp"); | |
2774 | my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); | |
2775 | my $seventh_arg = $win64 ? 56 : 8; | |
2776 | my $blocks = $len; | |
2777 | ||
2778 | $code.=<<___; | |
# aesni_ocb_encrypt: public OCB encryption entry point (the C prototype
# is given in the Perl comment that precedes this section).  The entry
# stack pointer is kept in %rax so that the stack-passed arguments can
# still be addressed after the register pushes below.
2779 | .globl aesni_ocb_encrypt |
2780 | .type aesni_ocb_encrypt,\@function,6 |
2781 | .align 32 |
2782 | aesni_ocb_encrypt: |
b84460ad | 2783 | .cfi_startproc |
bd30091c AP |
2784 | lea (%rsp),%rax |
2785 | push %rbx |
b84460ad | 2786 | .cfi_push %rbx |
bd30091c | 2787 | push %rbp |
b84460ad | 2788 | .cfi_push %rbp |
bd30091c | 2789 | push %r12 |
b84460ad | 2790 | .cfi_push %r12 |
bd30091c | 2791 | push %r13 |
b84460ad | 2792 | .cfi_push %r13 |
bd30091c | 2793 | push %r14 |
b84460ad | 2794 | .cfi_push %r14 |
bd30091c AP |
2795 | ___ |
# Win64 only: reserve 0xa0 bytes below the pushes and save %xmm6-%xmm15 there.
2796 | $code.=<<___ if ($win64); |
2797 | lea -0xa0(%rsp),%rsp |
2798 | movaps %xmm6,0x00(%rsp) # offload everything |
2799 | movaps %xmm7,0x10(%rsp) |
2800 | movaps %xmm8,0x20(%rsp) |
2801 | movaps %xmm9,0x30(%rsp) |
2802 | movaps %xmm10,0x40(%rsp) |
2803 | movaps %xmm11,0x50(%rsp) |
2804 | movaps %xmm12,0x60(%rsp) |
2805 | movaps %xmm13,0x70(%rsp) |
2806 | movaps %xmm14,0x80(%rsp) |
2807 | movaps %xmm15,0x90(%rsp) |
2808 | .Locb_enc_body: |
2809 | ___ |
2810 | $code.=<<___; |
# Arguments 7 and 8 are read relative to the entry stack pointer in %rax;
# this must happen before %rax is reused as the round counter below.
2811 | mov $seventh_arg(%rax),$L_p # 7th argument |
2812 | mov $seventh_arg+8(%rax),$checksum_p# 8th argument |
2813 | |
# Key schedule setup: the 32-bit value at offset 240 of the key is scaled
# by 16 and used to address round[last]; key_ keeps the schedule base.
2814 | mov 240($key),$rnds_ |
2815 | mov $key,$key_ |
2816 | shl \$4,$rnds_ |
2817 | $movkey ($key),$rndkey0l # round[0] |
2818 | $movkey 16($key,$rnds_),$rndkey1 # round[last] |
2819 | |
2820 | movdqu ($offset_p),@offset[5] # load last offset_i |
2821 | pxor $rndkey1,$rndkey0l # round[0] ^ round[last] |
2822 | pxor $rndkey1,@offset[5] # offset_i ^ round[last] |
2823 | |
# Build the counter the helper routines index the key schedule with
# (kept in %rax, backed up in %r10) and point key 32 bytes past the
# address round[last] was loaded from.
# NOTE(review): the register aliases behind rounds and rnds_ are defined
# outside this excerpt - confirm that they are %eax and %r10.
2824 | mov \$16+32,$rounds |
2825 | lea 32($key_,$rnds_),$key |
2826 | $movkey 16($key_),$rndkey1 # round[1] |
2827 | sub %r10,%rax # twisted $rounds |
2828 | mov %rax,%r10 # backup twisted $rounds |
2829 | |
2830 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks |
2831 | movdqu ($checksum_p),$checksum # load checksum |
2832 | |
# An even starting block number is handled as a single block first, so
# that the bulk code below always begins on an odd block number.
2833 | test \$1,$block_num # is first block number odd? |
2834 | jnz .Locb_enc_odd |
2835 | |
2836 | bsf $block_num,$i1 |
2837 | add \$1,$block_num |
2838 | shl \$4,$i1 |
2839 | movdqu ($L_p,$i1),$inout5 # borrow |
2840 | movdqu ($inp),$inout0 |
2841 | lea 16($inp),$inp |
2842 | |
2843 | call __ocb_encrypt1 |
2844 | |
2845 | movdqa $inout5,@offset[5] |
2846 | movups $inout0,($out) |
2847 | lea 16($out),$out |
2848 | sub \$1,$blocks |
2849 | jz .Locb_enc_done |
2850 | |
# Precompute the table offsets (16 * ntz) for the three even-numbered
# blocks of the next group of six and advance block_num past that group.
2851 | .Locb_enc_odd: |
2852 | lea 1($block_num),$i1 # even-numbered blocks |
2853 | lea 3($block_num),$i3 |
2854 | lea 5($block_num),$i5 |
2855 | lea 6($block_num),$block_num |
2856 | bsf $i1,$i1 # ntz(block) |
2857 | bsf $i3,$i3 |
2858 | bsf $i5,$i5 |
2859 | shl \$4,$i1 # ntz(block) -> table offset |
2860 | shl \$4,$i3 |
2861 | shl \$4,$i5 |
2862 | |
2863 | sub \$6,$blocks |
2864 | jc .Locb_enc_short |
2865 | jmp .Locb_enc_grandloop |
2866 | |
# Main loop: six blocks per iteration for as long as the subtraction of 6
# from the remaining block count does not borrow.
2867 | .align 32 |
2868 | .Locb_enc_grandloop: |
2869 | movdqu `16*0`($inp),$inout0 # load input |
2870 | movdqu `16*1`($inp),$inout1 |
2871 | movdqu `16*2`($inp),$inout2 |
2872 | movdqu `16*3`($inp),$inout3 |
2873 | movdqu `16*4`($inp),$inout4 |
2874 | movdqu `16*5`($inp),$inout5 |
2875 | lea `16*6`($inp),$inp |
2876 | |
2877 | call __ocb_encrypt6 |
2878 | |
2879 | movups $inout0,`16*0`($out) # store output |
2880 | movups $inout1,`16*1`($out) |
2881 | movups $inout2,`16*2`($out) |
2882 | movups $inout3,`16*3`($out) |
2883 | movups $inout4,`16*4`($out) |
2884 | movups $inout5,`16*5`($out) |
2885 | lea `16*6`($out),$out |
2886 | sub \$6,$blocks |
2887 | jnc .Locb_enc_grandloop |
2888 | |
# Tail: 0 to 5 blocks remain.  Lanes that carry no input are zeroed
# before a wider helper is called, and afterwards the offset that belongs
# to the last real block is copied into offset[5].
2889 | .Locb_enc_short: |
2890 | add \$6,$blocks |
2891 | jz .Locb_enc_done |
2892 | |
2893 | movdqu `16*0`($inp),$inout0 |
2894 | cmp \$2,$blocks |
2895 | jb .Locb_enc_one |
2896 | movdqu `16*1`($inp),$inout1 |
2897 | je .Locb_enc_two |
2898 | |
2899 | movdqu `16*2`($inp),$inout2 |
2900 | cmp \$4,$blocks |
2901 | jb .Locb_enc_three |
2902 | movdqu `16*3`($inp),$inout3 |
2903 | je .Locb_enc_four |
2904 | |
2905 | movdqu `16*4`($inp),$inout4 |
2906 | pxor $inout5,$inout5 |
2907 | |
2908 | call __ocb_encrypt6 |
2909 | |
2910 | movdqa @offset[4],@offset[5] |
2911 | movups $inout0,`16*0`($out) |
2912 | movups $inout1,`16*1`($out) |
2913 | movups $inout2,`16*2`($out) |
2914 | movups $inout3,`16*3`($out) |
2915 | movups $inout4,`16*4`($out) |
2916 | |
2917 | jmp .Locb_enc_done |
2918 | |
2919 | .align 16 |
2920 | .Locb_enc_one: |
2921 | movdqa @offset[0],$inout5 # borrow |
2922 | |
2923 | call __ocb_encrypt1 |
2924 | |
2925 | movdqa $inout5,@offset[5] |
2926 | movups $inout0,`16*0`($out) |
2927 | jmp .Locb_enc_done |
2928 | |
2929 | .align 16 |
2930 | .Locb_enc_two: |
2931 | pxor $inout2,$inout2 |
2932 | pxor $inout3,$inout3 |
2933 | |
2934 | call __ocb_encrypt4 |
2935 | |
2936 | movdqa @offset[1],@offset[5] |
2937 | movups $inout0,`16*0`($out) |
2938 | movups $inout1,`16*1`($out) |
2939 | |
2940 | jmp .Locb_enc_done |
2941 | |
2942 | .align 16 |
2943 | .Locb_enc_three: |
2944 | pxor $inout3,$inout3 |
2945 | |
2946 | call __ocb_encrypt4 |
2947 | |
2948 | movdqa @offset[2],@offset[5] |
2949 | movups $inout0,`16*0`($out) |
2950 | movups $inout1,`16*1`($out) |
2951 | movups $inout2,`16*2`($out) |
2952 | |
2953 | jmp .Locb_enc_done |
2954 | |
2955 | .align 16 |
2956 | .Locb_enc_four: |
2957 | call __ocb_encrypt4 |
2958 | |
2959 | movdqa @offset[3],@offset[5] |
2960 | movups $inout0,`16*0`($out) |
2961 | movups $inout1,`16*1`($out) |
2962 | movups $inout2,`16*2`($out) |
2963 | movups $inout3,`16*3`($out) |
2964 | |
# Common exit: write the checksum and the last offset back to the caller
# and wipe the low vector registers.
# NOTE(review): when blocks is 0 on entry and the starting block number
# is odd, this label is reached without any helper having run, so rndkey0
# has not been loaded by this routine - presumably callers never pass
# 0 blocks; confirm.
2965 | .Locb_enc_done: |
2966 | pxor $rndkey0,@offset[5] # "remove" round[last] |
2967 | movdqu $checksum,($checksum_p) # store checksum |
2968 | movdqu @offset[5],($offset_p) # store last offset_i |
2969 | |
2970 | xorps %xmm0,%xmm0 # clear register bank |
2971 | pxor %xmm1,%xmm1 |
2972 | pxor %xmm2,%xmm2 |
2973 | pxor %xmm3,%xmm3 |
2974 | pxor %xmm4,%xmm4 |
2975 | pxor %xmm5,%xmm5 |
2976 | ___ |
# Non-Win64: clear %xmm6-%xmm15 as well; 0x28 is the size of the five
# pushes, so %rax ends up holding the entry stack pointer again.
2977 | $code.=<<___ if (!$win64); |
2978 | pxor %xmm6,%xmm6 |
2979 | pxor %xmm7,%xmm7 |
2980 | pxor %xmm8,%xmm8 |
2981 | pxor %xmm9,%xmm9 |
2982 | pxor %xmm10,%xmm10 |
2983 | pxor %xmm11,%xmm11 |
2984 | pxor %xmm12,%xmm12 |
2985 | pxor %xmm13,%xmm13 |
2986 | pxor %xmm14,%xmm14 |
2987 | pxor %xmm15,%xmm15 |
384e6de4 | 2988 | lea 0x28(%rsp),%rax |
b84460ad | 2989 | .cfi_def_cfa %rax,8 |
bd30091c AP |
2990 | ___ |
# Win64: reload %xmm6-%xmm15 from the save area, overwriting each slot
# with the zeroed %xmm0 as it is read, then recover the entry stack pointer.
2991 | $code.=<<___ if ($win64); |
2992 | movaps 0x00(%rsp),%xmm6 |
2993 | movaps %xmm0,0x00(%rsp) # clear stack |
2994 | movaps 0x10(%rsp),%xmm7 |
2995 | movaps %xmm0,0x10(%rsp) |
2996 | movaps 0x20(%rsp),%xmm8 |
2997 | movaps %xmm0,0x20(%rsp) |
2998 | movaps 0x30(%rsp),%xmm9 |
2999 | movaps %xmm0,0x30(%rsp) |
3000 | movaps 0x40(%rsp),%xmm10 |
3001 | movaps %xmm0,0x40(%rsp) |
3002 | movaps 0x50(%rsp),%xmm11 |
3003 | movaps %xmm0,0x50(%rsp) |
3004 | movaps 0x60(%rsp),%xmm12 |
3005 | movaps %xmm0,0x60(%rsp) |
3006 | movaps 0x70(%rsp),%xmm13 |
3007 | movaps %xmm0,0x70(%rsp) |
3008 | movaps 0x80(%rsp),%xmm14 |
3009 | movaps %xmm0,0x80(%rsp) |
3010 | movaps 0x90(%rsp),%xmm15 |
3011 | movaps %xmm0,0x90(%rsp) |
3012 | lea 0xa0+0x28(%rsp),%rax |
3013 | .Locb_enc_pop: |
bd30091c AP |
3014 | ___ |
3015 | $code.=<<___; |
# Restore the pushed registers from below the entry stack pointer held
# in %rax (in reverse push order), then restore %rsp itself.
384e6de4 | 3016 | mov -40(%rax),%r14 |
b84460ad | 3017 | .cfi_restore %r14 |
384e6de4 | 3018 | mov -32(%rax),%r13 |
b84460ad | 3019 | .cfi_restore %r13 |
384e6de4 | 3020 | mov -24(%rax),%r12 |
b84460ad | 3021 | .cfi_restore %r12 |
384e6de4 | 3022 | mov -16(%rax),%rbp |
b84460ad | 3023 | .cfi_restore %rbp |
384e6de4 | 3024 | mov -8(%rax),%rbx |
b84460ad | 3025 | .cfi_restore %rbx |
384e6de4 | 3026 | lea (%rax),%rsp |
b84460ad | 3027 | .cfi_def_cfa_register %rsp |
bd30091c AP |
3028 | .Locb_enc_epilogue: |
3029 | ret |
b84460ad | 3030 | .cfi_endproc |
bd30091c AP |
3031 | .size aesni_ocb_encrypt,.-aesni_ocb_encrypt |
3032 | ||
# __ocb_encrypt6: internal helper that encrypts the six blocks held in
# inout0-inout5 and returns the results in the same registers.
# On entry offset[5] carries the running offset and offset[0] carries
# L_0.  Each new offset is the previous one xored with an L_ table
# entry: L_0 for the 1st, 3rd and 5th block, and the entries at table
# offsets i1, i3 and i5 for the 2nd, 4th and 6th.  Every input block is
# folded into checksum before it is masked with its offset.
# While the first rounds run, i1/i3/i5 are recomputed for the next group
# of six and block_num is advanced by 6.
3033 | .type __ocb_encrypt6,\@abi-omnipotent |
3034 | .align 32 |
3035 | __ocb_encrypt6: |
3036 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] |
3037 | movdqu ($L_p,$i1),@offset[1] |
3038 | movdqa @offset[0],@offset[2] |
3039 | movdqu ($L_p,$i3),@offset[3] |
3040 | movdqa @offset[0],@offset[4] |
3041 | pxor @offset[5],@offset[0] |
3042 | movdqu ($L_p,$i5),@offset[5] |
3043 | pxor @offset[0],@offset[1] |
3044 | pxor $inout0,$checksum # accumulate checksum |
3045 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i |
3046 | pxor @offset[1],@offset[2] |
3047 | pxor $inout1,$checksum |
3048 | pxor @offset[1],$inout1 |
3049 | pxor @offset[2],@offset[3] |
3050 | pxor $inout2,$checksum |
3051 | pxor @offset[2],$inout2 |
3052 | pxor @offset[3],@offset[4] |
3053 | pxor $inout3,$checksum |
3054 | pxor @offset[3],$inout3 |
3055 | pxor @offset[4],@offset[5] |
3056 | pxor $inout4,$checksum |
3057 | pxor @offset[4],$inout4 |
3058 | pxor $inout5,$checksum |
3059 | pxor @offset[5],$inout5 |
3060 | $movkey 32($key_),$rndkey0 |
3061 | |
# Prepare the ntz values for the next group of six blocks.  The shifts
# that turn them into table offsets are issued further down, between the
# cipher rounds.
3062 | lea 1($block_num),$i1 # even-numbered blocks |
3063 | lea 3($block_num),$i3 |
3064 | lea 5($block_num),$i5 |
3065 | add \$6,$block_num |
3066 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] |
3067 | bsf $i1,$i1 # ntz(block) |
3068 | bsf $i3,$i3 |
3069 | bsf $i5,$i5 |
3070 | |
# The remaining five offsets are converted the same way as offset[0]
# above, interleaved with the first cipher round.
3071 | aesenc $rndkey1,$inout0 |
3072 | aesenc $rndkey1,$inout1 |
3073 | aesenc $rndkey1,$inout2 |
3074 | aesenc $rndkey1,$inout3 |
3075 | pxor $rndkey0l,@offset[1] |
3076 | pxor $rndkey0l,@offset[2] |
3077 | aesenc $rndkey1,$inout4 |
3078 | pxor $rndkey0l,@offset[3] |
3079 | pxor $rndkey0l,@offset[4] |
3080 | aesenc $rndkey1,$inout5 |
3081 | $movkey 48($key_),$rndkey1 |
3082 | pxor $rndkey0l,@offset[5] |
3083 | |
3084 | aesenc $rndkey0,$inout0 |
3085 | aesenc $rndkey0,$inout1 |
3086 | aesenc $rndkey0,$inout2 |
3087 | aesenc $rndkey0,$inout3 |
3088 | aesenc $rndkey0,$inout4 |
3089 | aesenc $rndkey0,$inout5 |
3090 | $movkey 64($key_),$rndkey0 |
3091 | shl \$4,$i1 # ntz(block) -> table offset |
3092 | shl \$4,$i3 |
3093 | jmp .Locb_enc_loop6 |
3094 | |
# Middle rounds, two per iteration.  %rax is a byte index relative to
# key; it is stepped by 32 and the loop ends when the add leaves it at
# zero (the jnz tests the flags produced by that add).
3095 | .align 32 |
3096 | .Locb_enc_loop6: |
3097 | aesenc $rndkey1,$inout0 |
3098 | aesenc $rndkey1,$inout1 |
3099 | aesenc $rndkey1,$inout2 |
3100 | aesenc $rndkey1,$inout3 |
3101 | aesenc $rndkey1,$inout4 |
3102 | aesenc $rndkey1,$inout5 |
3103 | $movkey ($key,%rax),$rndkey1 |
3104 | add \$32,%rax |
3105 | |
3106 | aesenc $rndkey0,$inout0 |
3107 | aesenc $rndkey0,$inout1 |
3108 | aesenc $rndkey0,$inout2 |
3109 | aesenc $rndkey0,$inout3 |
3110 | aesenc $rndkey0,$inout4 |
3111 | aesenc $rndkey0,$inout5 |
3112 | $movkey -16($key,%rax),$rndkey0 |
3113 | jnz .Locb_enc_loop6 |
3114 | |
3115 | aesenc $rndkey1,$inout0 |
3116 | aesenc $rndkey1,$inout1 |
3117 | aesenc $rndkey1,$inout2 |
3118 | aesenc $rndkey1,$inout3 |
3119 | aesenc $rndkey1,$inout4 |
3120 | aesenc $rndkey1,$inout5 |
3121 | $movkey 16($key_),$rndkey1 |
3122 | shl \$4,$i5 |
3123 | |
# Final round: each block uses its own offset ^ round[last] value as the
# round key, so the last round key and the output mask are applied in a
# single step.  rndkey1 (reloaded above), L_0 in offset[0] and %rax are
# put back into the state this routine expects on entry.
3124 | aesenclast @offset[0],$inout0 |
3125 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks |
3126 | mov %r10,%rax # restore twisted rounds |
3127 | aesenclast @offset[1],$inout1 |
3128 | aesenclast @offset[2],$inout2 |
3129 | aesenclast @offset[3],$inout3 |
3130 | aesenclast @offset[4],$inout4 |
3131 | aesenclast @offset[5],$inout5 |
3132 | ret |
3133 | .size __ocb_encrypt6,.-__ocb_encrypt6 |
3134 | ||
# __ocb_encrypt4: internal helper that encrypts the four blocks held in
# inout0-inout3 and returns the results in the same registers.
# The offsets are chained as in the six-block helper, but only
# offset[0]-offset[3] are produced, and neither the block counter nor
# the i1/i3 table offsets are updated here.
# The two- and three-block tails of aesni_ocb_encrypt call this routine
# with the unused lanes zeroed, which leaves checksum unaffected by them.
3135 | .type __ocb_encrypt4,\@abi-omnipotent |
3136 | .align 32 |
3137 | __ocb_encrypt4: |
3138 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] |
3139 | movdqu ($L_p,$i1),@offset[1] |
3140 | movdqa @offset[0],@offset[2] |
3141 | movdqu ($L_p,$i3),@offset[3] |
3142 | pxor @offset[5],@offset[0] |
3143 | pxor @offset[0],@offset[1] |
3144 | pxor $inout0,$checksum # accumulate checksum |
3145 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i |
3146 | pxor @offset[1],@offset[2] |
3147 | pxor $inout1,$checksum |
3148 | pxor @offset[1],$inout1 |
3149 | pxor @offset[2],@offset[3] |
3150 | pxor $inout2,$checksum |
3151 | pxor @offset[2],$inout2 |
3152 | pxor $inout3,$checksum |
3153 | pxor @offset[3],$inout3 |
3154 | $movkey 32($key_),$rndkey0 |
3155 | |
3156 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] |
3157 | pxor $rndkey0l,@offset[1] |
3158 | pxor $rndkey0l,@offset[2] |
3159 | pxor $rndkey0l,@offset[3] |
3160 | |
3161 | aesenc $rndkey1,$inout0 |
3162 | aesenc $rndkey1,$inout1 |
3163 | aesenc $rndkey1,$inout2 |
3164 | aesenc $rndkey1,$inout3 |
3165 | $movkey 48($key_),$rndkey1 |
3166 | |
3167 | aesenc $rndkey0,$inout0 |
3168 | aesenc $rndkey0,$inout1 |
3169 | aesenc $rndkey0,$inout2 |
3170 | aesenc $rndkey0,$inout3 |
3171 | $movkey 64($key_),$rndkey0 |
3172 | jmp .Locb_enc_loop4 |
3173 | |
# Middle rounds, two per iteration; the loop ends when the add leaves
# the %rax index at zero.
3174 | .align 32 |
3175 | .Locb_enc_loop4: |
3176 | aesenc $rndkey1,$inout0 |
3177 | aesenc $rndkey1,$inout1 |
3178 | aesenc $rndkey1,$inout2 |
3179 | aesenc $rndkey1,$inout3 |
3180 | $movkey ($key,%rax),$rndkey1 |
3181 | add \$32,%rax |
3182 | |
3183 | aesenc $rndkey0,$inout0 |
3184 | aesenc $rndkey0,$inout1 |
3185 | aesenc $rndkey0,$inout2 |
3186 | aesenc $rndkey0,$inout3 |
3187 | $movkey -16($key,%rax),$rndkey0 |
3188 | jnz .Locb_enc_loop4 |
3189 | |
3190 | aesenc $rndkey1,$inout0 |
3191 | aesenc $rndkey1,$inout1 |
3192 | aesenc $rndkey1,$inout2 |
3193 | aesenc $rndkey1,$inout3 |
3194 | $movkey 16($key_),$rndkey1 |
3195 | mov %r10,%rax # restore twisted rounds |
3196 | |
# Final round with the per-block offset ^ round[last] values as round keys.
3197 | aesenclast @offset[0],$inout0 |
3198 | aesenclast @offset[1],$inout1 |
3199 | aesenclast @offset[2],$inout2 |
3200 | aesenclast @offset[3],$inout3 |
3201 | ret |
3202 | .size __ocb_encrypt4,.-__ocb_encrypt4 |
3203 | ||
# __ocb_encrypt1: internal helper that encrypts the single block held in
# inout0 and returns the result in the same register.
# The caller passes the L_ table entry for this block in inout5; it is
# combined here with the running offset in offset[5].  On return inout5
# holds the new offset ^ round[last], which the callers in
# aesni_ocb_encrypt copy back into offset[5].
3204 | .type __ocb_encrypt1,\@abi-omnipotent |
3205 | .align 32 |
3206 | __ocb_encrypt1: |
3207 | pxor @offset[5],$inout5 # offset_i |
3208 | pxor $rndkey0l,$inout5 # offset_i ^ round[0] |
3209 | pxor $inout0,$checksum # accumulate checksum |
3210 | pxor $inout5,$inout0 # input ^ round[0] ^ offset_i |
3211 | $movkey 32($key_),$rndkey0 |
3212 | |
3213 | aesenc $rndkey1,$inout0 |
3214 | $movkey 48($key_),$rndkey1 |
3215 | pxor $rndkey0l,$inout5 # offset_i ^ round[last] |
3216 | |
3217 | aesenc $rndkey0,$inout0 |
3218 | $movkey 64($key_),$rndkey0 |
3219 | jmp .Locb_enc_loop1 |
3220 | |
# Middle rounds, two per iteration; the loop ends when the add leaves
# the %rax index at zero.
3221 | .align 32 |
3222 | .Locb_enc_loop1: |
3223 | aesenc $rndkey1,$inout0 |
3224 | $movkey ($key,%rax),$rndkey1 |
3225 | add \$32,%rax |
3226 | |
3227 | aesenc $rndkey0,$inout0 |
3228 | $movkey -16($key,%rax),$rndkey0 |
3229 | jnz .Locb_enc_loop1 |
3230 | |
3231 | aesenc $rndkey1,$inout0 |
3232 | $movkey 16($key_),$rndkey1 # redundant in tail |
3233 | mov %r10,%rax # restore twisted rounds |
3234 | |
# Final round with offset ^ round[last] as the round key.
3235 | aesenclast $inout5,$inout0 |
3236 | ret |
3237 | .size __ocb_encrypt1,.-__ocb_encrypt1 |
3238 | ||
# aesni_ocb_decrypt: bulk OCB decryption entry point.
# Uses $inp, $out, $blocks, $key, $block_num and $offset_p, plus two further
# arguments ($L_p, $checksum_p) that are fetched from the caller's stack.
# Flow:
#   1. save %rbx, %rbp, %r12-%r14 (and %xmm6-%xmm15 on Win64);
#   2. fold round[last] into round[0] and into the running offset so the
#      helpers can merge whitening xors with the offset xors;
#   3. if the first block number is even, decrypt one block on its own;
#   4. decrypt six blocks per iteration via __ocb_decrypt6;
#   5. decrypt the remaining 1..5 blocks via __ocb_decrypt1/4/6;
#   6. store the updated checksum and offset, wipe the xmm registers and
#      restore the saved registers.
# The checksum is accumulated from the decrypted output blocks.
3239 | .globl aesni_ocb_decrypt | |
3240 | .type aesni_ocb_decrypt,\@function,6 | |
3241 | .align 32 | |
3242 | aesni_ocb_decrypt: | |
b84460ad | 3243 | .cfi_startproc |
bd30091c AP |
3244 | lea (%rsp),%rax |
3245 | push %rbx | |
b84460ad | 3246 | .cfi_push %rbx |
bd30091c | 3247 | push %rbp |
b84460ad | 3248 | .cfi_push %rbp |
bd30091c | 3249 | push %r12 |
b84460ad | 3250 | .cfi_push %r12 |
bd30091c | 3251 | push %r13 |
b84460ad | 3252 | .cfi_push %r13 |
bd30091c | 3253 | push %r14 |
b84460ad | 3254 | .cfi_push %r14 |
bd30091c AP |
3255 | ___ |
# Win64 builds only: spill %xmm6-%xmm15 into a 0xa0-byte stack area; they
# are reloaded (and the area zeroed) in the epilogue.
3256 | $code.=<<___ if ($win64); | |
3257 | lea -0xa0(%rsp),%rsp | |
3258 | movaps %xmm6,0x00(%rsp) # offload everything | |
3259 | movaps %xmm7,0x10(%rsp) | |
3260 | movaps %xmm8,0x20(%rsp) | |
3261 | movaps %xmm9,0x30(%rsp) | |
3262 | movaps %xmm10,0x40(%rsp) | |
3263 | movaps %xmm11,0x50(%rsp) | |
3264 | movaps %xmm12,0x60(%rsp) | |
3265 | movaps %xmm13,0x70(%rsp) | |
3266 | movaps %xmm14,0x80(%rsp) | |
3267 | movaps %xmm15,0x90(%rsp) | |
3268 | .Locb_dec_body: | |
3269 | ___ | |
3270 | $code.=<<___; | |
# %rax still holds the %rsp value seen on entry, so the stack-passed
# arguments are addressed relative to it.
3271 | mov $seventh_arg(%rax),$L_p # 7th argument | |
3272 | mov $seventh_arg+8(%rax),$checksum_p# 8th argument | |
3273 | ||
# $rnds_ = 16*rounds (byte size of the schedule up to the last round key).
3274 | mov 240($key),$rnds_ | |
3275 | mov $key,$key_ | |
3276 | shl \$4,$rnds_ | |
3277 | $movkey ($key),$rndkey0l # round[0] | |
3278 | $movkey 16($key,$rnds_),$rndkey1 # round[last] | |
3279 | ||
3280 | movdqu ($offset_p),@offset[5] # load last offset_i | |
3281 | pxor $rndkey1,$rndkey0l # round[0] ^ round[last] | |
3282 | pxor $rndkey1,@offset[5] # offset_i ^ round[last] | |
3283 | ||
# $key is moved 16 bytes past round[last] and %rax becomes a negative byte
# index relative to it ("twisted" rounds); the helpers count %rax up to zero
# and reload it from %r10. Presumably $rounds aliases %eax and $rnds_
# aliases %r10d - their definitions are outside this excerpt.
3284 | mov \$16+32,$rounds | |
3285 | lea 32($key_,$rnds_),$key | |
3286 | $movkey 16($key_),$rndkey1 # round[1] | |
3287 | sub %r10,%rax # twisted $rounds | |
3288 | mov %rax,%r10 # backup twisted $rounds | |
3289 | ||
3290 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
3291 | movdqu ($checksum_p),$checksum # load checksum | |
3292 | ||
# The bulk code below expects to start on an odd block number. If the first
# block number is even, decrypt that single block here using
# L[ntz(block_num)] (bsf gives the count of trailing zero bits).
3293 | test \$1,$block_num # is first block number odd? | |
3294 | jnz .Locb_dec_odd | |
3295 | ||
3296 | bsf $block_num,$i1 | |
3297 | add \$1,$block_num | |
3298 | shl \$4,$i1 | |
3299 | movdqu ($L_p,$i1),$inout5 # borrow | |
3300 | movdqu ($inp),$inout0 | |
3301 | lea 16($inp),$inp | |
3302 | ||
3303 | call __ocb_decrypt1 | |
3304 | ||
# __ocb_decrypt1 leaves the new offset (xored with round[last]) in $inout5.
3305 | movdqa $inout5,@offset[5] | |
3306 | movups $inout0,($out) | |
3307 | xorps $inout0,$checksum # accumulate checksum | |
3308 | lea 16($out),$out | |
3309 | sub \$1,$blocks | |
3310 | jz .Locb_dec_done | |
3311 | ||
3312 | .Locb_dec_odd: | |
# Pre-compute the $L_p table offsets for the three even-numbered blocks of
# the next group of six and advance $block_num past that group.
3313 | lea 1($block_num),$i1 # even-numbered blocks | |
3314 | lea 3($block_num),$i3 | |
3315 | lea 5($block_num),$i5 | |
3316 | lea 6($block_num),$block_num | |
3317 | bsf $i1,$i1 # ntz(block) | |
3318 | bsf $i3,$i3 | |
3319 | bsf $i5,$i5 | |
3320 | shl \$4,$i1 # ntz(block) -> table offset | |
3321 | shl \$4,$i3 | |
3322 | shl \$4,$i5 | |
3323 | ||
# $blocks is biased by -6 while in the grand loop; a borrow means fewer than
# six blocks are left.
3324 | sub \$6,$blocks | |
3325 | jc .Locb_dec_short | |
3326 | jmp .Locb_dec_grandloop | |
3327 | ||
3328 | .align 32 | |
3329 | .Locb_dec_grandloop: | |
3330 | movdqu `16*0`($inp),$inout0 # load input | |
3331 | movdqu `16*1`($inp),$inout1 | |
3332 | movdqu `16*2`($inp),$inout2 | |
3333 | movdqu `16*3`($inp),$inout3 | |
3334 | movdqu `16*4`($inp),$inout4 | |
3335 | movdqu `16*5`($inp),$inout5 | |
3336 | lea `16*6`($inp),$inp | |
3337 | ||
3338 | call __ocb_decrypt6 | |
3339 | ||
3340 | movups $inout0,`16*0`($out) # store output | |
3341 | pxor $inout0,$checksum # accumulate checksum | |
3342 | movups $inout1,`16*1`($out) | |
3343 | pxor $inout1,$checksum | |
3344 | movups $inout2,`16*2`($out) | |
3345 | pxor $inout2,$checksum | |
3346 | movups $inout3,`16*3`($out) | |
3347 | pxor $inout3,$checksum | |
3348 | movups $inout4,`16*4`($out) | |
3349 | pxor $inout4,$checksum | |
3350 | movups $inout5,`16*5`($out) | |
3351 | pxor $inout5,$checksum | |
3352 | lea `16*6`($out),$out | |
3353 | sub \$6,$blocks | |
3354 | jnc .Locb_dec_grandloop | |
3355 | ||
3356 | .Locb_dec_short: | |
# Undo the -6 bias: $blocks is now the number of blocks (0..5) still to do.
# The cmp results are reused across the intervening loads, which do not
# touch the flags (jb: below, je: equal).
3357 | add \$6,$blocks | |
3358 | jz .Locb_dec_done | |
3359 | ||
3360 | movdqu `16*0`($inp),$inout0 | |
3361 | cmp \$2,$blocks | |
3362 | jb .Locb_dec_one | |
3363 | movdqu `16*1`($inp),$inout1 | |
3364 | je .Locb_dec_two | |
3365 | ||
3366 | movdqu `16*2`($inp),$inout2 | |
3367 | cmp \$4,$blocks | |
3368 | jb .Locb_dec_three | |
3369 | movdqu `16*3`($inp),$inout3 | |
3370 | je .Locb_dec_four | |
3371 | ||
# Five blocks left: run the 6-block helper with a zeroed sixth lane, ignore
# that lane's result and take the fifth block's offset as the running offset.
3372 | movdqu `16*4`($inp),$inout4 | |
3373 | pxor $inout5,$inout5 | |
3374 | ||
3375 | call __ocb_decrypt6 | |
3376 | ||
3377 | movdqa @offset[4],@offset[5] | |
3378 | movups $inout0,`16*0`($out) # store output | |
3379 | pxor $inout0,$checksum # accumulate checksum | |
3380 | movups $inout1,`16*1`($out) | |
3381 | pxor $inout1,$checksum | |
3382 | movups $inout2,`16*2`($out) | |
3383 | pxor $inout2,$checksum | |
3384 | movups $inout3,`16*3`($out) | |
3385 | pxor $inout3,$checksum | |
3386 | movups $inout4,`16*4`($out) | |
3387 | pxor $inout4,$checksum | |
3388 | ||
3389 | jmp .Locb_dec_done | |
3390 | ||
3391 | .align 16 | |
3392 | .Locb_dec_one: | |
# One block left; it is odd-numbered here, so its L value is L_0, which is
# what @offset[0] holds.
3393 | movdqa @offset[0],$inout5 # borrow | |
3394 | ||
3395 | call __ocb_decrypt1 | |
3396 | ||
3397 | movdqa $inout5,@offset[5] | |
3398 | movups $inout0,`16*0`($out) # store output | |
3399 | xorps $inout0,$checksum # accumulate checksum | |
3400 | jmp .Locb_dec_done | |
3401 | ||
3402 | .align 16 | |
3403 | .Locb_dec_two: | |
# Two blocks left: zero the unused lanes of the 4-block helper; the second
# block's offset becomes the running offset.
3404 | pxor $inout2,$inout2 | |
3405 | pxor $inout3,$inout3 | |
3406 | ||
3407 | call __ocb_decrypt4 | |
3408 | ||
3409 | movdqa @offset[1],@offset[5] | |
3410 | movups $inout0,`16*0`($out) # store output | |
3411 | xorps $inout0,$checksum # accumulate checksum | |
3412 | movups $inout1,`16*1`($out) | |
3413 | xorps $inout1,$checksum | |
3414 | ||
3415 | jmp .Locb_dec_done | |
3416 | ||
3417 | .align 16 | |
3418 | .Locb_dec_three: | |
# Three blocks left: as above with one unused lane.
3419 | pxor $inout3,$inout3 | |
3420 | ||
3421 | call __ocb_decrypt4 | |
3422 | ||
3423 | movdqa @offset[2],@offset[5] | |
3424 | movups $inout0,`16*0`($out) # store output | |
3425 | xorps $inout0,$checksum # accumulate checksum | |
3426 | movups $inout1,`16*1`($out) | |
3427 | xorps $inout1,$checksum | |
3428 | movups $inout2,`16*2`($out) | |
3429 | xorps $inout2,$checksum | |
3430 | ||
3431 | jmp .Locb_dec_done | |
3432 | ||
3433 | .align 16 | |
3434 | .Locb_dec_four: | |
3435 | call __ocb_decrypt4 | |
3436 | ||
3437 | movdqa @offset[3],@offset[5] | |
3438 | movups $inout0,`16*0`($out) # store output | |
3439 | pxor $inout0,$checksum # accumulate checksum | |
3440 | movups $inout1,`16*1`($out) | |
3441 | pxor $inout1,$checksum | |
3442 | movups $inout2,`16*2`($out) | |
3443 | pxor $inout2,$checksum | |
3444 | movups $inout3,`16*3`($out) | |
3445 | pxor $inout3,$checksum | |
3446 | ||
3447 | .Locb_dec_done: | |
# Each helper's last key load is -16($key,%rax) with %rax == 0, i.e. the
# round[last] slot, so $rndkey0 holds round[last] after any helper call.
# NOTE(review): with $blocks == 0 and an odd $block_num no helper runs
# before this point, so $rndkey0 would not have been loaded - presumably
# callers never pass zero blocks; confirm.
3448 | pxor $rndkey0,@offset[5] # "remove" round[last] | |
3449 | movdqu $checksum,($checksum_p) # store checksum | |
3450 | movdqu @offset[5],($offset_p) # store last offset_i | |
3451 | ||
# Wipe the xmm registers that held key material and data.
3452 | xorps %xmm0,%xmm0 # clear register bank | |
3453 | pxor %xmm1,%xmm1 | |
3454 | pxor %xmm2,%xmm2 | |
3455 | pxor %xmm3,%xmm3 | |
3456 | pxor %xmm4,%xmm4 | |
3457 | pxor %xmm5,%xmm5 | |
3458 | ___ | |
# Non-Win64: %xmm6-%xmm15 are simply zeroed; %rax is set to point just
# above the five pushed registers (5*8 = 0x28 bytes).
3459 | $code.=<<___ if (!$win64); | |
3460 | pxor %xmm6,%xmm6 | |
3461 | pxor %xmm7,%xmm7 | |
3462 | pxor %xmm8,%xmm8 | |
3463 | pxor %xmm9,%xmm9 | |
3464 | pxor %xmm10,%xmm10 | |
3465 | pxor %xmm11,%xmm11 | |
3466 | pxor %xmm12,%xmm12 | |
3467 | pxor %xmm13,%xmm13 | |
3468 | pxor %xmm14,%xmm14 | |
3469 | pxor %xmm15,%xmm15 | |
384e6de4 | 3470 | lea 0x28(%rsp),%rax |
b84460ad | 3471 | .cfi_def_cfa %rax,8 |
bd30091c AP |
3472 | ___ |
# Win64: reload %xmm6-%xmm15 from the save area and overwrite each slot with
# the already-zeroed %xmm0; %rax is set past the save area and the five
# pushed registers.
3473 | $code.=<<___ if ($win64); | |
3474 | movaps 0x00(%rsp),%xmm6 | |
3475 | movaps %xmm0,0x00(%rsp) # clear stack | |
3476 | movaps 0x10(%rsp),%xmm7 | |
3477 | movaps %xmm0,0x10(%rsp) | |
3478 | movaps 0x20(%rsp),%xmm8 | |
3479 | movaps %xmm0,0x20(%rsp) | |
3480 | movaps 0x30(%rsp),%xmm9 | |
3481 | movaps %xmm0,0x30(%rsp) | |
3482 | movaps 0x40(%rsp),%xmm10 | |
3483 | movaps %xmm0,0x40(%rsp) | |
3484 | movaps 0x50(%rsp),%xmm11 | |
3485 | movaps %xmm0,0x50(%rsp) | |
3486 | movaps 0x60(%rsp),%xmm12 | |
3487 | movaps %xmm0,0x60(%rsp) | |
3488 | movaps 0x70(%rsp),%xmm13 | |
3489 | movaps %xmm0,0x70(%rsp) | |
3490 | movaps 0x80(%rsp),%xmm14 | |
3491 | movaps %xmm0,0x80(%rsp) | |
3492 | movaps 0x90(%rsp),%xmm15 | |
3493 | movaps %xmm0,0x90(%rsp) | |
3494 | lea 0xa0+0x28(%rsp),%rax | |
3495 | .Locb_dec_pop: | |
bd30091c AP |
3496 | ___ |
3497 | $code.=<<___; | |
# Restore the pushed registers relative to %rax (in reverse push order),
# then reset %rsp to its value on entry.
384e6de4 | 3498 | mov -40(%rax),%r14 |
b84460ad | 3499 | .cfi_restore %r14 |
384e6de4 | 3500 | mov -32(%rax),%r13 |
b84460ad | 3501 | .cfi_restore %r13 |
384e6de4 | 3502 | mov -24(%rax),%r12 |
b84460ad | 3503 | .cfi_restore %r12 |
384e6de4 | 3504 | mov -16(%rax),%rbp |
b84460ad | 3505 | .cfi_restore %rbp |
384e6de4 | 3506 | mov -8(%rax),%rbx |
b84460ad | 3507 | .cfi_restore %rbx |
384e6de4 | 3508 | lea (%rax),%rsp |
b84460ad | 3509 | .cfi_def_cfa_register %rsp |
bd30091c AP |
3510 | .Locb_dec_epilogue: |
3511 | ret | |
b84460ad | 3512 | .cfi_endproc |
bd30091c AP |
3513 | .size aesni_ocb_decrypt,.-aesni_ocb_decrypt |
3514 | ||
# __ocb_decrypt6: OCB-decrypt the six blocks held in $inout0..$inout5.
# In:  @offset[5] - running offset xored with round[last]
#      @offset[0] - L_0
#      $i1/$i3/$i5 - byte offsets into the $L_p table for the 2nd, 4th and
#                    6th block of this group
#      $rndkey1 - round[1]; %rax - "twisted" round counter
# Out: $inout0..$inout5 - decrypted blocks
#      @offset[1..5] - per-block offsets xored with round[last]
#                      (@offset[5] is the new running offset)
#      @offset[0] - reloaded with L_0
#      $block_num advanced by 6; $i1/$i3/$i5 recomputed for the next group
#      %rax restored from %r10; $rndkey1 reloaded with round[1]
3515 | .type __ocb_decrypt6,\@abi-omnipotent | |
3516 | .align 32 | |
3517 | __ocb_decrypt6: | |
# Build the six offsets as a chain: each one is the previous offset xored
# with that block's L value (L_0 for the 1st/3rd/5th, table entries for the
# others). All of them carry round[0], so xoring them into the inputs also
# performs the initial AddRoundKey.
3518 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3519 | movdqu ($L_p,$i1),@offset[1] | |
3520 | movdqa @offset[0],@offset[2] | |
3521 | movdqu ($L_p,$i3),@offset[3] | |
3522 | movdqa @offset[0],@offset[4] | |
3523 | pxor @offset[5],@offset[0] | |
3524 | movdqu ($L_p,$i5),@offset[5] | |
3525 | pxor @offset[0],@offset[1] | |
3526 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3527 | pxor @offset[1],@offset[2] | |
3528 | pxor @offset[1],$inout1 | |
3529 | pxor @offset[2],@offset[3] | |
3530 | pxor @offset[2],$inout2 | |
3531 | pxor @offset[3],@offset[4] | |
3532 | pxor @offset[3],$inout3 | |
3533 | pxor @offset[4],@offset[5] | |
3534 | pxor @offset[4],$inout4 | |
3535 | pxor @offset[5],$inout5 | |
3536 | $movkey 32($key_),$rndkey0 | |
3537 | ||
# Table offsets for the next group of six are computed here, interleaved
# with the first AES rounds.
3538 | lea 1($block_num),$i1 # even-numbered blocks | |
3539 | lea 3($block_num),$i3 | |
3540 | lea 5($block_num),$i5 | |
3541 | add \$6,$block_num | |
3542 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3543 | bsf $i1,$i1 # ntz(block) | |
3544 | bsf $i3,$i3 | |
3545 | bsf $i5,$i5 | |
3546 | ||
# While the rounds run, swap round[0] for round[last] in every offset so
# they can serve directly as the final-round operand.
3547 | aesdec $rndkey1,$inout0 | |
3548 | aesdec $rndkey1,$inout1 | |
3549 | aesdec $rndkey1,$inout2 | |
3550 | aesdec $rndkey1,$inout3 | |
3551 | pxor $rndkey0l,@offset[1] | |
3552 | pxor $rndkey0l,@offset[2] | |
3553 | aesdec $rndkey1,$inout4 | |
3554 | pxor $rndkey0l,@offset[3] | |
3555 | pxor $rndkey0l,@offset[4] | |
3556 | aesdec $rndkey1,$inout5 | |
3557 | $movkey 48($key_),$rndkey1 | |
3558 | pxor $rndkey0l,@offset[5] | |
3559 | ||
3560 | aesdec $rndkey0,$inout0 | |
3561 | aesdec $rndkey0,$inout1 | |
3562 | aesdec $rndkey0,$inout2 | |
3563 | aesdec $rndkey0,$inout3 | |
3564 | aesdec $rndkey0,$inout4 | |
3565 | aesdec $rndkey0,$inout5 | |
3566 | $movkey 64($key_),$rndkey0 | |
3567 | shl \$4,$i1 # ntz(block) -> table offset | |
3568 | shl \$4,$i3 | |
3569 | jmp .Locb_dec_loop6 | |
3570 | ||
3571 | .align 32 | |
# Two rounds per iteration; ends when %rax reaches zero (the add sets the
# flags tested by jnz).
3572 | .Locb_dec_loop6: | |
3573 | aesdec $rndkey1,$inout0 | |
3574 | aesdec $rndkey1,$inout1 | |
3575 | aesdec $rndkey1,$inout2 | |
3576 | aesdec $rndkey1,$inout3 | |
3577 | aesdec $rndkey1,$inout4 | |
3578 | aesdec $rndkey1,$inout5 | |
3579 | $movkey ($key,%rax),$rndkey1 | |
3580 | add \$32,%rax | |
3581 | ||
3582 | aesdec $rndkey0,$inout0 | |
3583 | aesdec $rndkey0,$inout1 | |
3584 | aesdec $rndkey0,$inout2 | |
3585 | aesdec $rndkey0,$inout3 | |
3586 | aesdec $rndkey0,$inout4 | |
3587 | aesdec $rndkey0,$inout5 | |
3588 | $movkey -16($key,%rax),$rndkey0 | |
3589 | jnz .Locb_dec_loop6 | |
3590 | ||
3591 | aesdec $rndkey1,$inout0 | |
3592 | aesdec $rndkey1,$inout1 | |
3593 | aesdec $rndkey1,$inout2 | |
3594 | aesdec $rndkey1,$inout3 | |
3595 | aesdec $rndkey1,$inout4 | |
3596 | aesdec $rndkey1,$inout5 | |
3597 | $movkey 16($key_),$rndkey1 | |
3598 | shl \$4,$i5 | |
3599 | ||
# Final round: each offset (already xored with round[last]) is the operand,
# so the last round key and the output offset xor happen together.
# @offset[0] is free once consumed and is reloaded with L_0 for the next call.
3600 | aesdeclast @offset[0],$inout0 | |
3601 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
3602 | mov %r10,%rax # restore twisted rounds | |
3603 | aesdeclast @offset[1],$inout1 | |
3604 | aesdeclast @offset[2],$inout2 | |
3605 | aesdeclast @offset[3],$inout3 | |
3606 | aesdeclast @offset[4],$inout4 | |
3607 | aesdeclast @offset[5],$inout5 | |
3608 | ret | |
3609 | .size __ocb_decrypt6,.-__ocb_decrypt6 | |
3610 | ||
# __ocb_decrypt4: OCB-decrypt the four blocks held in $inout0..$inout3.
# Used only for the tail (2, 3 or 4 remaining blocks; unused lanes are
# zeroed by the caller).
# In:  @offset[5] - running offset xored with round[last]
#      @offset[0] - L_0; $i1/$i3 - byte offsets into the $L_p table
#      $rndkey1 - round[1]; %rax - "twisted" round counter
# Out: $inout0..$inout3 - decrypted blocks
#      @offset[0..3] - per-block offsets xored with round[last]; the caller
#                      copies the one for the last real block to @offset[5]
#      %rax restored from %r10
# Unlike __ocb_decrypt6 this routine does not advance $block_num or
# $i1/$i3 and does not reload @offset[0] with L_0.
3611 | .type __ocb_decrypt4,\@abi-omnipotent | |
3612 | .align 32 | |
3613 | __ocb_decrypt4: | |
# Chain the four offsets: each is the previous offset xored with the
# block's L value; xoring them into the inputs also applies round[0].
3614 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3615 | movdqu ($L_p,$i1),@offset[1] | |
3616 | movdqa @offset[0],@offset[2] | |
3617 | movdqu ($L_p,$i3),@offset[3] | |
3618 | pxor @offset[5],@offset[0] | |
3619 | pxor @offset[0],@offset[1] | |
3620 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3621 | pxor @offset[1],@offset[2] | |
3622 | pxor @offset[1],$inout1 | |
3623 | pxor @offset[2],@offset[3] | |
3624 | pxor @offset[2],$inout2 | |
3625 | pxor @offset[3],$inout3 | |
3626 | $movkey 32($key_),$rndkey0 | |
3627 | ||
3628 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3629 | pxor $rndkey0l,@offset[1] | |
3630 | pxor $rndkey0l,@offset[2] | |
3631 | pxor $rndkey0l,@offset[3] | |
3632 | ||
3633 | aesdec $rndkey1,$inout0 | |
3634 | aesdec $rndkey1,$inout1 | |
3635 | aesdec $rndkey1,$inout2 | |
3636 | aesdec $rndkey1,$inout3 | |
3637 | $movkey 48($key_),$rndkey1 | |
3638 | ||
3639 | aesdec $rndkey0,$inout0 | |
3640 | aesdec $rndkey0,$inout1 | |
3641 | aesdec $rndkey0,$inout2 | |
3642 | aesdec $rndkey0,$inout3 | |
3643 | $movkey 64($key_),$rndkey0 | |
3644 | jmp .Locb_dec_loop4 | |
3645 | ||
3646 | .align 32 | |
# Two rounds per iteration; ends when %rax reaches zero.
3647 | .Locb_dec_loop4: | |
3648 | aesdec $rndkey1,$inout0 | |
3649 | aesdec $rndkey1,$inout1 | |
3650 | aesdec $rndkey1,$inout2 | |
3651 | aesdec $rndkey1,$inout3 | |
3652 | $movkey ($key,%rax),$rndkey1 | |
3653 | add \$32,%rax | |
3654 | ||
3655 | aesdec $rndkey0,$inout0 | |
3656 | aesdec $rndkey0,$inout1 | |
3657 | aesdec $rndkey0,$inout2 | |
3658 | aesdec $rndkey0,$inout3 | |
3659 | $movkey -16($key,%rax),$rndkey0 | |
3660 | jnz .Locb_dec_loop4 | |
3661 | ||
3662 | aesdec $rndkey1,$inout0 | |
3663 | aesdec $rndkey1,$inout1 | |
3664 | aesdec $rndkey1,$inout2 | |
3665 | aesdec $rndkey1,$inout3 | |
3666 | $movkey 16($key_),$rndkey1 | |
3667 | mov %r10,%rax # restore twisted rounds | |
3668 | ||
# Final round with the offsets (xored with round[last]) as operands.
3669 | aesdeclast @offset[0],$inout0 | |
3670 | aesdeclast @offset[1],$inout1 | |
3671 | aesdeclast @offset[2],$inout2 | |
3672 | aesdeclast @offset[3],$inout3 | |
3673 | ret | |
3674 | .size __ocb_decrypt4,.-__ocb_decrypt4 | |
3675 | ||
# __ocb_decrypt1: OCB-decrypt the single block held in $inout0.
# In:  $inout0  - input block
#      $inout5  - L value for this block, "borrowed" by the caller
#      @offset[5] - running offset xored with round[last]
#      $rndkey1 - round[1]; %rax - "twisted" round counter
# Out: $inout0  - decrypted block
#      $inout5  - new offset xored with round[last]; the caller copies it
#                 to @offset[5]
#      %rax restored from %r10
# The checksum is not touched here; the caller accumulates it from the
# decrypted output.
3676 | .type __ocb_decrypt1,\@abi-omnipotent | |
3677 | .align 32 | |
3678 | __ocb_decrypt1: | |
3679 | pxor @offset[5],$inout5 # offset_i | |
3680 | pxor $rndkey0l,$inout5 # offset_i ^ round[0] | |
3681 | pxor $inout5,$inout0 # input ^ round[0] ^ offset_i | |
3682 | $movkey 32($key_),$rndkey0 | |
3683 | ||
3684 | aesdec $rndkey1,$inout0 | |
3685 | $movkey 48($key_),$rndkey1 | |
3686 | pxor $rndkey0l,$inout5 # offset_i ^ round[last] | |
3687 | ||
3688 | aesdec $rndkey0,$inout0 | |
3689 | $movkey 64($key_),$rndkey0 | |
3690 | jmp .Locb_dec_loop1 | |
3691 | ||
3692 | .align 32 | |
# Two rounds per iteration; ends when %rax reaches zero (the add sets the
# flags tested by jnz).
3693 | .Locb_dec_loop1: | |
3694 | aesdec $rndkey1,$inout0 | |
3695 | $movkey ($key,%rax),$rndkey1 | |
3696 | add \$32,%rax | |
3697 | ||
3698 | aesdec $rndkey0,$inout0 | |
3699 | $movkey -16($key,%rax),$rndkey0 | |
3700 | jnz .Locb_dec_loop1 | |
3701 | ||
3702 | aesdec $rndkey1,$inout0 | |
3703 | $movkey 16($key_),$rndkey1 # redundant in tail | |
3704 | mov %r10,%rax # restore twisted rounds | |
3705 | ||
# Final round: $inout5 supplies round[last] and the output offset xor at once.
3706 | aesdeclast $inout5,$inout0 | |
3707 | ret | |
3708 | .size __ocb_decrypt1,.-__ocb_decrypt1 | |
3709 | ___ | |
f8501464 | 3710 | } }} |
d64a7232 | 3711 | \f |
6c83629b | 3712 | ######################################################################## |
d64a7232 AP |
3713 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, |
3714 | # size_t length, const AES_KEY *key, | |
3715 | # unsigned char *ivp,const int enc); | |
f8501464 | 3716 | { |
73325b22 AP |
3717 | my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt |
3718 | my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); | |
73325b22 | 3719 | |
d64a7232 AP |
3720 | $code.=<<___; |
3721 | .globl ${PREFIX}_cbc_encrypt | |
3722 | .type ${PREFIX}_cbc_encrypt,\@function,6 | |
3723 | .align 16 | |
3724 | ${PREFIX}_cbc_encrypt: | |
b84460ad | 3725 | .cfi_startproc |
d64a7232 AP |
3726 | test $len,$len # check length |
3727 | jz .Lcbc_ret | |
d608b4d6 | 3728 | |
f8501464 | 3729 | mov 240($key),$rnds_ # key->rounds |
d64a7232 | 3730 | mov $key,$key_ # backup $key |
d608b4d6 | 3731 | test %r9d,%r9d # 6th argument |
d64a7232 AP |
3732 | jz .Lcbc_decrypt |
3733 | #--------------------------- CBC ENCRYPT ------------------------------# | |
f8501464 | 3734 | movups ($ivp),$inout0 # load iv as initial state |
d608b4d6 | 3735 | mov $rnds_,$rounds |
d7d119a3 | 3736 | cmp \$16,$len |
d64a7232 AP |
3737 | jb .Lcbc_enc_tail |
3738 | sub \$16,$len | |
3739 | jmp .Lcbc_enc_loop | |
d7d119a3 | 3740 | .align 16 |
d64a7232 | 3741 | .Lcbc_enc_loop: |
f8501464 | 3742 | movups ($inp),$inout1 # load input |
d64a7232 | 3743 | lea 16($inp),$inp |
f8501464 | 3744 | #xorps $inout1,$inout0 |
d64a7232 | 3745 | ___ |
f8501464 | 3746 | &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); |
d64a7232 | 3747 | $code.=<<___; |
d608b4d6 AP |
3748 | mov $rnds_,$rounds # restore $rounds |
3749 | mov $key_,$key # restore $key | |
d7d119a3 AP |
3750 | movups $inout0,0($out) # store output |
3751 | lea 16($out),$out | |
3752 | sub \$16,$len | |
d64a7232 AP |
3753 | jnc .Lcbc_enc_loop |
3754 | add \$16,$len | |
3755 | jnz .Lcbc_enc_tail | |
23f6eec7 AP |
3756 | pxor $rndkey0,$rndkey0 # clear register bank |
3757 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 3758 | movups $inout0,($ivp) |
23f6eec7 AP |
3759 | pxor $inout0,$inout0 |
3760 | pxor $inout1,$inout1 | |
d64a7232 AP |
3761 | jmp .Lcbc_ret |
3762 | ||
3763 | .Lcbc_enc_tail: | |
3764 | mov $len,%rcx # zaps $key | |
3765 | xchg $inp,$out # $inp is %rsi and $out is %rdi now | |
3766 | .long 0x9066A4F3 # rep movsb | |
3767 | mov \$16,%ecx # zero tail | |
3768 | sub $len,%rcx | |
3769 | xor %eax,%eax | |
3770 | .long 0x9066AAF3 # rep stosb | |
3771 | lea -16(%rdi),%rdi # rewind $out by 1 block | |
3772 | mov $rnds_,$rounds # restore $rounds | |
3773 | mov %rdi,%rsi # $inp and $out are the same | |
3774 | mov $key_,$key # restore $key | |
3775 | xor $len,$len # len=16 | |
3776 | jmp .Lcbc_enc_loop # one more spin | |
3777 | \f#--------------------------- CBC DECRYPT ------------------------------# | |
3778 | .align 16 | |
3779 | .Lcbc_decrypt: | |
23f6eec7 AP |
3780 | cmp \$16,$len |
3781 | jne .Lcbc_decrypt_bulk | |
3782 | ||
3783 | # handle single block without allocating stack frame, | |
3784 | # useful in ciphertext stealing mode | |
3785 | movdqu ($inp),$inout0 # load input | |
3786 | movdqu ($ivp),$inout1 # load iv | |
3787 | movdqa $inout0,$inout2 # future iv | |
3788 | ___ | |
3789 | &aesni_generate1("dec",$key,$rnds_); | |
3790 | $code.=<<___; | |
3791 | pxor $rndkey0,$rndkey0 # clear register bank | |
3792 | pxor $rndkey1,$rndkey1 | |
3793 | movdqu $inout2,($ivp) # store iv | |
3794 | xorps $inout1,$inout0 # ^=iv | |
3795 | pxor $inout1,$inout1 | |
3796 | movups $inout0,($out) # store output | |
3797 | pxor $inout0,$inout0 | |
3798 | jmp .Lcbc_ret | |
3799 | .align 16 | |
3800 | .Lcbc_decrypt_bulk: | |
384e6de4 | 3801 | lea (%rsp),%r11 # frame pointer |
b84460ad | 3802 | .cfi_def_cfa_register %r11 |
6a40ebe8 | 3803 | push %rbp |
b84460ad | 3804 | .cfi_push %rbp |
6a40ebe8 AP |
3805 | sub \$$frame_size,%rsp |
3806 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
d64a7232 AP |
3807 | ___ |
3808 | $code.=<<___ if ($win64); | |
6a40ebe8 AP |
3809 | movaps %xmm6,0x10(%rsp) |
3810 | movaps %xmm7,0x20(%rsp) | |
3811 | movaps %xmm8,0x30(%rsp) | |
3812 | movaps %xmm9,0x40(%rsp) | |
73325b22 AP |
3813 | movaps %xmm10,0x50(%rsp) |
3814 | movaps %xmm11,0x60(%rsp) | |
3815 | movaps %xmm12,0x70(%rsp) | |
3816 | movaps %xmm13,0x80(%rsp) | |
3817 | movaps %xmm14,0x90(%rsp) | |
3818 | movaps %xmm15,0xa0(%rsp) | |
d608b4d6 | 3819 | .Lcbc_decrypt_body: |
d64a7232 | 3820 | ___ |
384e6de4 AP |
3821 | |
3822 | my $inp_=$key_="%rbp"; # reassign $key_ | |
3823 | ||
d64a7232 | 3824 | $code.=<<___; |
384e6de4 | 3825 | mov $key,$key_ # [re-]backup $key [after reassignment] |
d64a7232 | 3826 | movups ($ivp),$iv |
d608b4d6 | 3827 | mov $rnds_,$rounds |
73325b22 | 3828 | cmp \$0x50,$len |
d608b4d6 | 3829 | jbe .Lcbc_dec_tail |
73325b22 AP |
3830 | |
3831 | $movkey ($key),$rndkey0 | |
3832 | movdqu 0x00($inp),$inout0 # load input | |
3833 | movdqu 0x10($inp),$inout1 | |
3834 | movdqa $inout0,$in0 | |
3835 | movdqu 0x20($inp),$inout2 | |
3836 | movdqa $inout1,$in1 | |
3837 | movdqu 0x30($inp),$inout3 | |
3838 | movdqa $inout2,$in2 | |
3839 | movdqu 0x40($inp),$inout4 | |
3840 | movdqa $inout3,$in3 | |
3841 | movdqu 0x50($inp),$inout5 | |
3842 | movdqa $inout4,$in4 | |
5599c733 | 3843 | mov OPENSSL_ia32cap_P+4(%rip),%r9d |
73325b22 AP |
3844 | cmp \$0x70,$len |
3845 | jbe .Lcbc_dec_six_or_seven | |
3846 | ||
23f6eec7 AP |
3847 | and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE |
3848 | sub \$0x50,$len # $len is biased by -5*16 | |
5599c733 | 3849 | cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE |
23f6eec7 AP |
3850 | je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] |
3851 | sub \$0x20,$len # $len is biased by -7*16 | |
73325b22 | 3852 | lea 0x70($key),$key # size optimization |
f8501464 | 3853 | jmp .Lcbc_dec_loop8_enter |
d7d119a3 | 3854 | .align 16 |
f8501464 | 3855 | .Lcbc_dec_loop8: |
f8501464 AP |
3856 | movups $inout7,($out) |
3857 | lea 0x10($out),$out | |
3858 | .Lcbc_dec_loop8_enter: | |
73325b22 AP |
3859 | movdqu 0x60($inp),$inout6 |
3860 | pxor $rndkey0,$inout0 | |
3861 | movdqu 0x70($inp),$inout7 | |
3862 | pxor $rndkey0,$inout1 | |
3863 | $movkey 0x10-0x70($key),$rndkey1 | |
3864 | pxor $rndkey0,$inout2 | |
384e6de4 | 3865 | mov \$-1,$inp_ |
73325b22 AP |
3866 | cmp \$0x70,$len # is there at least 0x60 bytes ahead? |
3867 | pxor $rndkey0,$inout3 | |
3868 | pxor $rndkey0,$inout4 | |
3869 | pxor $rndkey0,$inout5 | |
3870 | pxor $rndkey0,$inout6 | |
d7d119a3 | 3871 | |
f8501464 | 3872 | aesdec $rndkey1,$inout0 |
73325b22 AP |
3873 | pxor $rndkey0,$inout7 |
3874 | $movkey 0x20-0x70($key),$rndkey0 | |
f8501464 | 3875 | aesdec $rndkey1,$inout1 |
f8501464 | 3876 | aesdec $rndkey1,$inout2 |
f8501464 | 3877 | aesdec $rndkey1,$inout3 |
f8501464 | 3878 | aesdec $rndkey1,$inout4 |
f8501464 | 3879 | aesdec $rndkey1,$inout5 |
f8501464 | 3880 | aesdec $rndkey1,$inout6 |
384e6de4 AP |
3881 | adc \$0,$inp_ |
3882 | and \$128,$inp_ | |
f8501464 | 3883 | aesdec $rndkey1,$inout7 |
73325b22 AP |
3884 | add $inp,$inp_ |
3885 | $movkey 0x30-0x70($key),$rndkey1 | |
3886 | ___ | |
3887 | for($i=1;$i<12;$i++) { | |
3888 | my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; | |
d8ba0dc9 AP |
3889 | $code.=<<___ if ($i==7); |
3890 | cmp \$11,$rounds | |
3891 | ___ | |
73325b22 AP |
3892 | $code.=<<___; |
3893 | aesdec $rndkeyx,$inout0 | |
3894 | aesdec $rndkeyx,$inout1 | |
3895 | aesdec $rndkeyx,$inout2 | |
3896 | aesdec $rndkeyx,$inout3 | |
3897 | aesdec $rndkeyx,$inout4 | |
3898 | aesdec $rndkeyx,$inout5 | |
3899 | aesdec $rndkeyx,$inout6 | |
3900 | aesdec $rndkeyx,$inout7 | |
3901 | $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx | |
3902 | ___ | |
d8ba0dc9 AP |
3903 | $code.=<<___ if ($i<6 || (!($i&1) && $i>7)); |
3904 | nop | |
3905 | ___ | |
73325b22 | 3906 | $code.=<<___ if ($i==7); |
73325b22 AP |
3907 | jb .Lcbc_dec_done |
3908 | ___ | |
3909 | $code.=<<___ if ($i==9); | |
3910 | je .Lcbc_dec_done | |
3911 | ___ | |
d8ba0dc9 AP |
3912 | $code.=<<___ if ($i==11); |
3913 | jmp .Lcbc_dec_done | |
3914 | ___ | |
73325b22 AP |
3915 | } |
3916 | $code.=<<___; | |
d8ba0dc9 | 3917 | .align 16 |
73325b22 AP |
3918 | .Lcbc_dec_done: |
3919 | aesdec $rndkey1,$inout0 | |
73325b22 | 3920 | aesdec $rndkey1,$inout1 |
d8ba0dc9 | 3921 | pxor $rndkey0,$iv |
73325b22 AP |
3922 | pxor $rndkey0,$in0 |
3923 | aesdec $rndkey1,$inout2 | |
73325b22 | 3924 | aesdec $rndkey1,$inout3 |
d8ba0dc9 | 3925 | pxor $rndkey0,$in1 |
73325b22 AP |
3926 | pxor $rndkey0,$in2 |
3927 | aesdec $rndkey1,$inout4 | |
73325b22 | 3928 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 3929 | pxor $rndkey0,$in3 |
73325b22 AP |
3930 | pxor $rndkey0,$in4 |
3931 | aesdec $rndkey1,$inout6 | |
3932 | aesdec $rndkey1,$inout7 | |
3933 | movdqu 0x50($inp),$rndkey1 | |
d64a7232 | 3934 | |
73325b22 AP |
3935 | aesdeclast $iv,$inout0 |
3936 | movdqu 0x60($inp),$iv # borrow $iv | |
3937 | pxor $rndkey0,$rndkey1 | |
3938 | aesdeclast $in0,$inout1 | |
3939 | pxor $rndkey0,$iv | |
3940 | movdqu 0x70($inp),$rndkey0 # next IV | |
73325b22 | 3941 | aesdeclast $in1,$inout2 |
d8ba0dc9 | 3942 | lea 0x80($inp),$inp |
73325b22 AP |
3943 | movdqu 0x00($inp_),$in0 |
3944 | aesdeclast $in2,$inout3 | |
73325b22 | 3945 | aesdeclast $in3,$inout4 |
d8ba0dc9 | 3946 | movdqu 0x10($inp_),$in1 |
73325b22 AP |
3947 | movdqu 0x20($inp_),$in2 |
3948 | aesdeclast $in4,$inout5 | |
73325b22 | 3949 | aesdeclast $rndkey1,$inout6 |
d8ba0dc9 | 3950 | movdqu 0x30($inp_),$in3 |
73325b22 AP |
3951 | movdqu 0x40($inp_),$in4 |
3952 | aesdeclast $iv,$inout7 | |
3953 | movdqa $rndkey0,$iv # return $iv | |
3954 | movdqu 0x50($inp_),$rndkey1 | |
3955 | $movkey -0x70($key),$rndkey0 | |
3956 | ||
3957 | movups $inout0,($out) # store output | |
3958 | movdqa $in0,$inout0 | |
3959 | movups $inout1,0x10($out) | |
3960 | movdqa $in1,$inout1 | |
3961 | movups $inout2,0x20($out) | |
3962 | movdqa $in2,$inout2 | |
3963 | movups $inout3,0x30($out) | |
3964 | movdqa $in3,$inout3 | |
3965 | movups $inout4,0x40($out) | |
3966 | movdqa $in4,$inout4 | |
3967 | movups $inout5,0x50($out) | |
3968 | movdqa $rndkey1,$inout5 | |
3969 | movups $inout6,0x60($out) | |
3970 | lea 0x70($out),$out | |
f8501464 | 3971 | |
f8501464 AP |
3972 | sub \$0x80,$len |
3973 | ja .Lcbc_dec_loop8 | |
3974 | ||
3975 | movaps $inout7,$inout0 | |
73325b22 | 3976 | lea -0x70($key),$key |
f8501464 | 3977 | add \$0x70,$len |
23f6eec7 | 3978 | jle .Lcbc_dec_clear_tail_collected |
73325b22 | 3979 | movups $inout7,($out) |
f8501464 | 3980 | lea 0x10($out),$out |
73325b22 AP |
3981 | cmp \$0x50,$len |
3982 | jbe .Lcbc_dec_tail | |
3983 | ||
3984 | movaps $in0,$inout0 | |
3985 | .Lcbc_dec_six_or_seven: | |
3986 | cmp \$0x60,$len | |
3987 | ja .Lcbc_dec_seven | |
3988 | ||
3989 | movaps $inout5,$inout6 | |
3990 | call _aesni_decrypt6 | |
3991 | pxor $iv,$inout0 # ^= IV | |
3992 | movaps $inout6,$iv | |
3993 | pxor $in0,$inout1 | |
3994 | movdqu $inout0,($out) | |
3995 | pxor $in1,$inout2 | |
3996 | movdqu $inout1,0x10($out) | |
23f6eec7 | 3997 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
3998 | pxor $in2,$inout3 |
3999 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4000 | pxor $inout2,$inout2 |
73325b22 AP |
4001 | pxor $in3,$inout4 |
4002 | movdqu $inout3,0x30($out) | |
23f6eec7 | 4003 | pxor $inout3,$inout3 |
73325b22 AP |
4004 | pxor $in4,$inout5 |
4005 | movdqu $inout4,0x40($out) | |
23f6eec7 | 4006 | pxor $inout4,$inout4 |
73325b22 AP |
4007 | lea 0x50($out),$out |
4008 | movdqa $inout5,$inout0 | |
23f6eec7 | 4009 | pxor $inout5,$inout5 |
73325b22 AP |
4010 | jmp .Lcbc_dec_tail_collected |
4011 | ||
4012 | .align 16 | |
4013 | .Lcbc_dec_seven: | |
4014 | movups 0x60($inp),$inout6 | |
4015 | xorps $inout7,$inout7 | |
4016 | call _aesni_decrypt8 | |
4017 | movups 0x50($inp),$inout7 | |
4018 | pxor $iv,$inout0 # ^= IV | |
4019 | movups 0x60($inp),$iv | |
4020 | pxor $in0,$inout1 | |
4021 | movdqu $inout0,($out) | |
4022 | pxor $in1,$inout2 | |
4023 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4024 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4025 | pxor $in2,$inout3 |
4026 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4027 | pxor $inout2,$inout2 |
73325b22 AP |
4028 | pxor $in3,$inout4 |
4029 | movdqu $inout3,0x30($out) | |
23f6eec7 | 4030 | pxor $inout3,$inout3 |
73325b22 AP |
4031 | pxor $in4,$inout5 |
4032 | movdqu $inout4,0x40($out) | |
23f6eec7 | 4033 | pxor $inout4,$inout4 |
73325b22 AP |
4034 | pxor $inout7,$inout6 |
4035 | movdqu $inout5,0x50($out) | |
23f6eec7 | 4036 | pxor $inout5,$inout5 |
73325b22 AP |
4037 | lea 0x60($out),$out |
4038 | movdqa $inout6,$inout0 | |
23f6eec7 AP |
4039 | pxor $inout6,$inout6 |
4040 | pxor $inout7,$inout7 | |
73325b22 AP |
4041 | jmp .Lcbc_dec_tail_collected |
4042 | ||
5599c733 AP |
4043 | .align 16 |
4044 | .Lcbc_dec_loop6: | |
4045 | movups $inout5,($out) | |
4046 | lea 0x10($out),$out | |
4047 | movdqu 0x00($inp),$inout0 # load input | |
4048 | movdqu 0x10($inp),$inout1 | |
4049 | movdqa $inout0,$in0 | |
4050 | movdqu 0x20($inp),$inout2 | |
4051 | movdqa $inout1,$in1 | |
4052 | movdqu 0x30($inp),$inout3 | |
4053 | movdqa $inout2,$in2 | |
4054 | movdqu 0x40($inp),$inout4 | |
4055 | movdqa $inout3,$in3 | |
4056 | movdqu 0x50($inp),$inout5 | |
4057 | movdqa $inout4,$in4 | |
4058 | .Lcbc_dec_loop6_enter: | |
4059 | lea 0x60($inp),$inp | |
4060 | movdqa $inout5,$inout6 | |
4061 | ||
4062 | call _aesni_decrypt6 | |
4063 | ||
4064 | pxor $iv,$inout0 # ^= IV | |
4065 | movdqa $inout6,$iv | |
4066 | pxor $in0,$inout1 | |
4067 | movdqu $inout0,($out) | |
4068 | pxor $in1,$inout2 | |
4069 | movdqu $inout1,0x10($out) | |
4070 | pxor $in2,$inout3 | |
4071 | movdqu $inout2,0x20($out) | |
4072 | pxor $in3,$inout4 | |
4073 | mov $key_,$key | |
4074 | movdqu $inout3,0x30($out) | |
4075 | pxor $in4,$inout5 | |
4076 | mov $rnds_,$rounds | |
4077 | movdqu $inout4,0x40($out) | |
4078 | lea 0x50($out),$out | |
4079 | sub \$0x60,$len | |
4080 | ja .Lcbc_dec_loop6 | |
4081 | ||
4082 | movdqa $inout5,$inout0 | |
4083 | add \$0x50,$len | |
23f6eec7 | 4084 | jle .Lcbc_dec_clear_tail_collected |
5599c733 AP |
4085 | movups $inout5,($out) |
4086 | lea 0x10($out),$out | |
4087 | ||
6c83629b | 4088 | .Lcbc_dec_tail: |
d64a7232 | 4089 | movups ($inp),$inout0 |
73325b22 | 4090 | sub \$0x10,$len |
23f6eec7 | 4091 | jbe .Lcbc_dec_one # $len is 1*16 or less |
f8501464 | 4092 | |
d64a7232 | 4093 | movups 0x10($inp),$inout1 |
73325b22 AP |
4094 | movaps $inout0,$in0 |
4095 | sub \$0x10,$len | |
23f6eec7 | 4096 | jbe .Lcbc_dec_two # $len is 2*16 or less |
f8501464 | 4097 | |
d64a7232 | 4098 | movups 0x20($inp),$inout2 |
73325b22 AP |
4099 | movaps $inout1,$in1 |
4100 | sub \$0x10,$len | |
23f6eec7 | 4101 | jbe .Lcbc_dec_three # $len is 3*16 or less |
f8501464 | 4102 | |
d64a7232 | 4103 | movups 0x30($inp),$inout3 |
73325b22 AP |
4104 | movaps $inout2,$in2 |
4105 | sub \$0x10,$len | |
23f6eec7 | 4106 | jbe .Lcbc_dec_four # $len is 4*16 or less |
f8501464 | 4107 | |
23f6eec7 | 4108 | movups 0x40($inp),$inout4 # $len is 5*16 or less |
73325b22 AP |
4109 | movaps $inout3,$in3 |
4110 | movaps $inout4,$in4 | |
4111 | xorps $inout5,$inout5 | |
4112 | call _aesni_decrypt6 | |
4113 | pxor $iv,$inout0 | |
4114 | movaps $in4,$iv | |
4115 | pxor $in0,$inout1 | |
4116 | movdqu $inout0,($out) | |
4117 | pxor $in1,$inout2 | |
4118 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4119 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4120 | pxor $in2,$inout3 |
4121 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4122 | pxor $inout2,$inout2 |
73325b22 AP |
4123 | pxor $in3,$inout4 |
4124 | movdqu $inout3,0x30($out) | |
23f6eec7 | 4125 | pxor $inout3,$inout3 |
73325b22 AP |
4126 | lea 0x40($out),$out |
4127 | movdqa $inout4,$inout0 | |
23f6eec7 AP |
4128 | pxor $inout4,$inout4 |
4129 | pxor $inout5,$inout5 | |
73325b22 | 4130 | sub \$0x10,$len |
d64a7232 | 4131 | jmp .Lcbc_dec_tail_collected |
73325b22 | 4132 | |
d64a7232 AP |
4133 | .align 16 |
4134 | .Lcbc_dec_one: | |
73325b22 | 4135 | movaps $inout0,$in0 |
d64a7232 | 4136 | ___ |
d608b4d6 | 4137 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 4138 | $code.=<<___; |
f8501464 | 4139 | xorps $iv,$inout0 |
d64a7232 AP |
4140 | movaps $in0,$iv |
4141 | jmp .Lcbc_dec_tail_collected | |
4142 | .align 16 | |
4143 | .Lcbc_dec_two: | |
73325b22 | 4144 | movaps $inout1,$in1 |
214368ff | 4145 | call _aesni_decrypt2 |
73325b22 | 4146 | pxor $iv,$inout0 |
d64a7232 | 4147 | movaps $in1,$iv |
73325b22 AP |
4148 | pxor $in0,$inout1 |
4149 | movdqu $inout0,($out) | |
4150 | movdqa $inout1,$inout0 | |
23f6eec7 | 4151 | pxor $inout1,$inout1 # clear register bank |
d64a7232 AP |
4152 | lea 0x10($out),$out |
4153 | jmp .Lcbc_dec_tail_collected | |
4154 | .align 16 | |
4155 | .Lcbc_dec_three: | |
73325b22 | 4156 | movaps $inout2,$in2 |
d608b4d6 | 4157 | call _aesni_decrypt3 |
73325b22 | 4158 | pxor $iv,$inout0 |
d64a7232 | 4159 | movaps $in2,$iv |
73325b22 AP |
4160 | pxor $in0,$inout1 |
4161 | movdqu $inout0,($out) | |
4162 | pxor $in1,$inout2 | |
4163 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4164 | pxor $inout1,$inout1 # clear register bank |
73325b22 | 4165 | movdqa $inout2,$inout0 |
23f6eec7 | 4166 | pxor $inout2,$inout2 |
d64a7232 | 4167 | lea 0x20($out),$out |
f8501464 AP |
4168 | jmp .Lcbc_dec_tail_collected |
4169 | .align 16 | |
4170 | .Lcbc_dec_four: | |
73325b22 | 4171 | movaps $inout3,$in3 |
f8501464 | 4172 | call _aesni_decrypt4 |
73325b22 AP |
4173 | pxor $iv,$inout0 |
4174 | movaps $in3,$iv | |
4175 | pxor $in0,$inout1 | |
4176 | movdqu $inout0,($out) | |
4177 | pxor $in1,$inout2 | |
4178 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4179 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4180 | pxor $in2,$inout3 |
4181 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4182 | pxor $inout2,$inout2 |
73325b22 | 4183 | movdqa $inout3,$inout0 |
23f6eec7 | 4184 | pxor $inout3,$inout3 |
f8501464 | 4185 | lea 0x30($out),$out |
d64a7232 | 4186 | jmp .Lcbc_dec_tail_collected |
73325b22 | 4187 | |
d64a7232 | 4188 | .align 16 |
23f6eec7 AP |
4189 | .Lcbc_dec_clear_tail_collected: |
4190 | pxor $inout1,$inout1 # clear register bank | |
4191 | pxor $inout2,$inout2 | |
4192 | pxor $inout3,$inout3 | |
4193 | ___ | |
4194 | $code.=<<___ if (!$win64); | |
4195 | pxor $inout4,$inout4 # %xmm6..9 | |
4196 | pxor $inout5,$inout5 | |
4197 | pxor $inout6,$inout6 | |
4198 | pxor $inout7,$inout7 | |
4199 | ___ | |
4200 | $code.=<<___; | |
d64a7232 | 4201 | .Lcbc_dec_tail_collected: |
d64a7232 | 4202 | movups $iv,($ivp) |
73325b22 | 4203 | and \$15,$len |
d64a7232 | 4204 | jnz .Lcbc_dec_tail_partial |
f8501464 | 4205 | movups $inout0,($out) |
23f6eec7 | 4206 | pxor $inout0,$inout0 |
d64a7232 | 4207 | jmp .Lcbc_dec_ret |
d7d119a3 | 4208 | .align 16 |
d64a7232 | 4209 | .Lcbc_dec_tail_partial: |
6a40ebe8 | 4210 | movaps $inout0,(%rsp) |
23f6eec7 | 4211 | pxor $inout0,$inout0 |
f8501464 | 4212 | mov \$16,%rcx |
d64a7232 | 4213 | mov $out,%rdi |
f8501464 | 4214 | sub $len,%rcx |
6a40ebe8 | 4215 | lea (%rsp),%rsi |
23f6eec7 AP |
4216 | .long 0x9066A4F3 # rep movsb |
4217 | movdqa $inout0,(%rsp) | |
d64a7232 AP |
4218 | |
4219 | .Lcbc_dec_ret: | |
23f6eec7 AP |
4220 | xorps $rndkey0,$rndkey0 # %xmm0 |
4221 | pxor $rndkey1,$rndkey1 | |
d64a7232 AP |
4222 | ___ |
4223 | $code.=<<___ if ($win64); | |
6a40ebe8 | 4224 | movaps 0x10(%rsp),%xmm6 |
23f6eec7 | 4225 | movaps %xmm0,0x10(%rsp) # clear stack |
6a40ebe8 | 4226 | movaps 0x20(%rsp),%xmm7 |
23f6eec7 | 4227 | movaps %xmm0,0x20(%rsp) |
6a40ebe8 | 4228 | movaps 0x30(%rsp),%xmm8 |
23f6eec7 | 4229 | movaps %xmm0,0x30(%rsp) |
6a40ebe8 | 4230 | movaps 0x40(%rsp),%xmm9 |
23f6eec7 | 4231 | movaps %xmm0,0x40(%rsp) |
73325b22 | 4232 | movaps 0x50(%rsp),%xmm10 |
23f6eec7 | 4233 | movaps %xmm0,0x50(%rsp) |
73325b22 | 4234 | movaps 0x60(%rsp),%xmm11 |
23f6eec7 | 4235 | movaps %xmm0,0x60(%rsp) |
73325b22 | 4236 | movaps 0x70(%rsp),%xmm12 |
23f6eec7 | 4237 | movaps %xmm0,0x70(%rsp) |
73325b22 | 4238 | movaps 0x80(%rsp),%xmm13 |
23f6eec7 | 4239 | movaps %xmm0,0x80(%rsp) |
73325b22 | 4240 | movaps 0x90(%rsp),%xmm14 |
23f6eec7 | 4241 | movaps %xmm0,0x90(%rsp) |
73325b22 | 4242 | movaps 0xa0(%rsp),%xmm15 |
23f6eec7 | 4243 | movaps %xmm0,0xa0(%rsp) |
d64a7232 AP |
4244 | ___ |
4245 | $code.=<<___; | |
384e6de4 | 4246 | mov -8(%r11),%rbp |
b84460ad | 4247 | .cfi_restore %rbp |
384e6de4 | 4248 | lea (%r11),%rsp |
b84460ad | 4249 | .cfi_def_cfa_register %rsp |
d64a7232 AP |
4250 | .Lcbc_ret: |
4251 | ret | |
b84460ad | 4252 | .cfi_endproc |
d64a7232 AP |
4253 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt |
4254 | ___ | |
f8501464 | 4255 | } \f |
23f6eec7 | 4256 | # int ${PREFIX}_set_decrypt_key(const unsigned char *inp, |
d608b4d6 | 4257 | # int bits, AES_KEY *key) |
23f6eec7 AP |
4258 | # |
4259 | # input: $inp user-supplied key | |
4260 | # $bits $inp length in bits | |
4261 | # $key pointer to key schedule | |
4262 | # output: %eax 0 denoting success, -1 or -2 - failure (see C) | |
4263 | # *$key key schedule | |
4264 | # | |
d608b4d6 AP |
4265 | { my ($inp,$bits,$key) = @_4args; |
4266 | $bits =~ s/%r/%e/; | |
4267 | ||
d64a7232 AP |
4268 | $code.=<<___; | |
4269 | .globl ${PREFIX}_set_decrypt_key | |
d608b4d6 | 4270 | .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent |
d64a7232 AP |
4271 | .align 16 |
4272 | ${PREFIX}_set_decrypt_key: | |
b84460ad | 4273 | .cfi_startproc |
d608b4d6 | 4274 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
b84460ad | 4275 | .cfi_adjust_cfa_offset 8 |
fb2f3411 | 4276 | call __aesni_set_encrypt_key |
d608b4d6 | 4277 | shl \$4,$bits # $bits holds rounds-1 after __aesni_set_encrypt_key, scale it by 16 to a byte offset |
d64a7232 AP |
4278 | test %eax,%eax |
4279 | jnz .Ldec_key_ret | |
d608b4d6 AP |
4280 | lea 16($key,$bits),$inp # points at the last round key, i.e. the end of key schedule |
4281 | ||
4282 | $movkey ($key),%xmm0 # outermost pair of round keys is just swapped | |
4283 | $movkey ($inp),%xmm1 | |
4284 | $movkey %xmm0,($inp) | |
4285 | $movkey %xmm1,($key) | |
4286 | lea 16($key),$key | |
4287 | lea -16($inp),$inp | |
4288 | ||
d64a7232 | 4289 | .Ldec_key_inverse: |
d608b4d6 AP |
4290 | $movkey ($key),%xmm0 # swap and inverse the remaining pairs, walking inwards from both ends |
4291 | $movkey ($inp),%xmm1 | |
d64a7232 AP |
4292 | aesimc %xmm0,%xmm0 |
4293 | aesimc %xmm1,%xmm1 | |
d608b4d6 AP |
4294 | lea 16($key),$key |
4295 | lea -16($inp),$inp | |
d608b4d6 AP |
4296 | $movkey %xmm0,16($inp) |
4297 | $movkey %xmm1,-16($key) | |
d7d119a3 | 4298 | cmp $key,$inp |
d64a7232 AP |
4299 | ja .Ldec_key_inverse |
4300 | ||
d608b4d6 | 4301 | $movkey ($key),%xmm0 # inverse the middle round key, where both pointers have met |
d64a7232 | 4302 | aesimc %xmm0,%xmm0 |
23f6eec7 | 4303 | pxor %xmm1,%xmm1 |
d608b4d6 | 4304 | $movkey %xmm0,($inp) |
23f6eec7 | 4305 | pxor %xmm0,%xmm0 |
d64a7232 | 4306 | .Ldec_key_ret: |
d608b4d6 | 4307 | add \$8,%rsp |
b84460ad | 4308 | .cfi_adjust_cfa_offset -8 |
d64a7232 | 4309 | ret |
b84460ad | 4310 | .cfi_endproc |
d608b4d6 | 4311 | .LSEH_end_set_decrypt_key: |
d64a7232 AP |
4312 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key |
4313 | ___ | |
4314 | \f | |
e3713c36 RS |
4315 | # This is based on submission from Intel by |
4316 | # Huang Ying | |
4317 | # Vinodh Gopal | |
d64a7232 AP |
4318 | # Kahraman Akdemir |
4319 | # | |
60250017 | 4320 | # Aggressively optimized with respect to aeskeygenassist's critical path |
d64a7232 AP |
4321 | # and is contained in %xmm0-5 to meet Win64 ABI requirement. |
4322 | # | |
23f6eec7 AP |
4323 | # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, |
4324 | # int bits, AES_KEY * const key); | |
4325 | # | |
4326 | # input: $inp user-supplied key | |
4327 | # $bits $inp length in bits | |
4328 | # $key pointer to key schedule | |
4329 | # output: %eax 0 denoting success, -1 or -2 - failure (see C) | |
4330 | # $bits rounds-1 (used in aesni_set_decrypt_key) | |
4331 | # *$key key schedule | |
4332 | # $key pointer to key schedule (used in | |
4333 | # aesni_set_decrypt_key) | |
4334 | # | |
4335 | # Subroutine is frame-less, which means that only volatile registers | |
4336 | # are used. Note that it's declared "abi-omnipotent", which means that | |
4337 | # the number of volatile registers is smaller on Windows. | |
4338 | # | |
d64a7232 | 4339 | $code.=<<___; |
d608b4d6 AP |
4340 | .globl ${PREFIX}_set_encrypt_key |
4341 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent | |
d64a7232 | 4342 | .align 16 |
d608b4d6 | 4343 | ${PREFIX}_set_encrypt_key: |
fb2f3411 | 4344 | __aesni_set_encrypt_key: |
b84460ad | 4345 | .cfi_startproc |
d608b4d6 | 4346 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
b84460ad | 4347 | .cfi_adjust_cfa_offset 8 |
d608b4d6 | 4348 | mov \$-1,%rax |
d7d119a3 | 4349 | test $inp,$inp |
d608b4d6 AP |
4350 | jz .Lenc_key_ret |
4351 | test $key,$key | |
4352 | jz .Lenc_key_ret | |
4353 | ||
23f6eec7 | 4354 | mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits |
d608b4d6 | 4355 | movups ($inp),%xmm0 # pull first 128 bits of *userKey |
f8501464 | 4356 | xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 |
23f6eec7 AP |
4357 | and OPENSSL_ia32cap_P+4(%rip),%r10d |
4358 | lea 16($key),%rax # %rax is used as modifiable copy of $key | |
d608b4d6 | 4359 | cmp \$256,$bits |
d64a7232 | 4360 | je .L14rounds |
d608b4d6 | 4361 | cmp \$192,$bits |
d64a7232 | 4362 | je .L12rounds |
d608b4d6 | 4363 | cmp \$128,$bits |
d64a7232 | 4364 | jne .Lbad_keybits |
d608b4d6 | 4365 | |
d64a7232 | 4366 | .L10rounds: |
d608b4d6 | 4367 | mov \$9,$bits # 10 rounds for 128-bit key |
23f6eec7 AP |
4368 | cmp \$`1<<28`,%r10d # AVX, but no XOP | |
4369 | je .L10rounds_alt | |
4370 | ||
d608b4d6 | 4371 | $movkey %xmm0,($key) # round 0 |
d64a7232 AP |
4372 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 |
4373 | call .Lkey_expansion_128_cold | |
4374 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 | |
4375 | call .Lkey_expansion_128 | |
4376 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 | |
4377 | call .Lkey_expansion_128 | |
4378 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 | |
4379 | call .Lkey_expansion_128 | |
4380 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 | |
4381 | call .Lkey_expansion_128 | |
4382 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 | |
4383 | call .Lkey_expansion_128 | |
4384 | aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 | |
4385 | call .Lkey_expansion_128 | |
4386 | aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 | |
4387 | call .Lkey_expansion_128 | |
4388 | aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 | |
4389 | call .Lkey_expansion_128 | |
4390 | aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 | |
4391 | call .Lkey_expansion_128 | |
d608b4d6 AP |
4392 | $movkey %xmm0,(%rax) |
4393 | mov $bits,80(%rax) # 240($key) | |
d64a7232 | 4394 | xor %eax,%eax |
d608b4d6 | 4395 | jmp .Lenc_key_ret |
d64a7232 | 4396 | |
23f6eec7 AP |
4397 | .align 16 |
4398 | .L10rounds_alt: | |
4399 | movdqa .Lkey_rotate(%rip),%xmm5 | |
4400 | mov \$8,%r10d | |
4401 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4402 | movdqa %xmm0,%xmm2 | |
4403 | movdqu %xmm0,($key) | |
4404 | jmp .Loop_key128 | |
4405 | ||
4406 | .align 16 | |
4407 | .Loop_key128: | |
4408 | pshufb %xmm5,%xmm0 | |
4409 | aesenclast %xmm4,%xmm0 | |
4410 | pslld \$1,%xmm4 | |
4411 | lea 16(%rax),%rax | |
4412 | ||
4413 | movdqa %xmm2,%xmm3 | |
4414 | pslldq \$4,%xmm2 | |
4415 | pxor %xmm2,%xmm3 | |
4416 | pslldq \$4,%xmm2 | |
4417 | pxor %xmm2,%xmm3 | |
4418 | pslldq \$4,%xmm2 | |
4419 | pxor %xmm3,%xmm2 | |
4420 | ||
4421 | pxor %xmm2,%xmm0 | |
4422 | movdqu %xmm0,-16(%rax) | |
4423 | movdqa %xmm0,%xmm2 | |
4424 | ||
4425 | dec %r10d | |
4426 | jnz .Loop_key128 | |
4427 | ||
4428 | movdqa .Lkey_rcon1b(%rip),%xmm4 | |
4429 | ||
4430 | pshufb %xmm5,%xmm0 | |
4431 | aesenclast %xmm4,%xmm0 | |
4432 | pslld \$1,%xmm4 | |
4433 | ||
4434 | movdqa %xmm2,%xmm3 | |
4435 | pslldq \$4,%xmm2 | |
4436 | pxor %xmm2,%xmm3 | |
4437 | pslldq \$4,%xmm2 | |
4438 | pxor %xmm2,%xmm3 | |
4439 | pslldq \$4,%xmm2 | |
4440 | pxor %xmm3,%xmm2 | |
4441 | ||
4442 | pxor %xmm2,%xmm0 | |
4443 | movdqu %xmm0,(%rax) | |
4444 | ||
4445 | movdqa %xmm0,%xmm2 | |
4446 | pshufb %xmm5,%xmm0 | |
4447 | aesenclast %xmm4,%xmm0 | |
4448 | ||
4449 | movdqa %xmm2,%xmm3 | |
4450 | pslldq \$4,%xmm2 | |
4451 | pxor %xmm2,%xmm3 | |
4452 | pslldq \$4,%xmm2 | |
4453 | pxor %xmm2,%xmm3 | |
4454 | pslldq \$4,%xmm2 | |
4455 | pxor %xmm3,%xmm2 | |
4456 | ||
4457 | pxor %xmm2,%xmm0 | |
4458 | movdqu %xmm0,16(%rax) | |
4459 | ||
4460 | mov $bits,96(%rax) # 240($key) | |
4461 | xor %eax,%eax | |
4462 | jmp .Lenc_key_ret | |
4463 | ||
d64a7232 AP |
4464 | .align 16 |
4465 | .L12rounds: | |
d608b4d6 AP |
4466 | movq 16($inp),%xmm2 # remaining 1/3 of *userKey |
4467 | mov \$11,$bits # 12 rounds for 192-bit key | |
23f6eec7 AP |
4468 | cmp \$`1<<28`,%r10d # AVX, but no XOP |
4469 | je .L12rounds_alt | |
4470 | ||
d608b4d6 | 4471 | $movkey %xmm0,($key) # round 0 |
d64a7232 AP |
4472 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 |
4473 | call .Lkey_expansion_192a_cold | |
4474 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 | |
4475 | call .Lkey_expansion_192b | |
4476 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 | |
4477 | call .Lkey_expansion_192a | |
4478 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 | |
4479 | call .Lkey_expansion_192b | |
4480 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 | |
4481 | call .Lkey_expansion_192a | |
4482 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 | |
4483 | call .Lkey_expansion_192b | |
4484 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 | |
4485 | call .Lkey_expansion_192a | |
4486 | aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 | |
4487 | call .Lkey_expansion_192b | |
d608b4d6 AP |
4488 | $movkey %xmm0,(%rax) |
4489 | mov $bits,48(%rax) # 240($key) | |
d64a7232 | 4490 | xor %rax, %rax |
d608b4d6 | 4491 | jmp .Lenc_key_ret |
d64a7232 | 4492 | |
23f6eec7 AP |
4493 | .align 16 |
4494 | .L12rounds_alt: | |
4495 | movdqa .Lkey_rotate192(%rip),%xmm5 | |
4496 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4497 | mov \$8,%r10d | |
4498 | movdqu %xmm0,($key) | |
4499 | jmp .Loop_key192 | |
4500 | ||
4501 | .align 16 | |
4502 | .Loop_key192: | |
4503 | movq %xmm2,0(%rax) | |
4504 | movdqa %xmm2,%xmm1 | |
4505 | pshufb %xmm5,%xmm2 | |
4506 | aesenclast %xmm4,%xmm2 | |
4507 | pslld \$1, %xmm4 | |
4508 | lea 24(%rax),%rax | |
4509 | ||
4510 | movdqa %xmm0,%xmm3 | |
4511 | pslldq \$4,%xmm0 | |
4512 | pxor %xmm0,%xmm3 | |
4513 | pslldq \$4,%xmm0 | |
4514 | pxor %xmm0,%xmm3 | |
4515 | pslldq \$4,%xmm0 | |
4516 | pxor %xmm3,%xmm0 | |
4517 | ||
4518 | pshufd \$0xff,%xmm0,%xmm3 | |
4519 | pxor %xmm1,%xmm3 | |
4520 | pslldq \$4,%xmm1 | |
4521 | pxor %xmm1,%xmm3 | |
4522 | ||
4523 | pxor %xmm2,%xmm0 | |
4524 | pxor %xmm3,%xmm2 | |
4525 | movdqu %xmm0,-16(%rax) | |
4526 | ||
4527 | dec %r10d | |
4528 | jnz .Loop_key192 | |
4529 | ||
4530 | mov $bits,32(%rax) # 240($key) | |
4531 | xor %eax,%eax | |
4532 | jmp .Lenc_key_ret | |
4533 | ||
d64a7232 AP |
4534 | .align 16 |
4535 | .L14rounds: | |
46f4e1be | 4536 | movups 16($inp),%xmm2 # remaining half of *userKey |
d608b4d6 AP |
4537 | mov \$13,$bits # 14 rounds for 256-bit key |
4538 | lea 16(%rax),%rax | |
23f6eec7 AP |
4539 | cmp \$`1<<28`,%r10d # AVX, but no XOP |
4540 | je .L14rounds_alt | |
4541 | ||
d608b4d6 AP |
4542 | $movkey %xmm0,($key) # round 0 |
4543 | $movkey %xmm2,16($key) # round 1 | |
d64a7232 AP |
4544 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 |
4545 | call .Lkey_expansion_256a_cold | |
4546 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 | |
4547 | call .Lkey_expansion_256b | |
4548 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 | |
4549 | call .Lkey_expansion_256a | |
4550 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 | |
4551 | call .Lkey_expansion_256b | |
4552 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 | |
4553 | call .Lkey_expansion_256a | |
4554 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 | |
4555 | call .Lkey_expansion_256b | |
4556 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 | |
4557 | call .Lkey_expansion_256a | |
4558 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 | |
4559 | call .Lkey_expansion_256b | |
4560 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 | |
4561 | call .Lkey_expansion_256a | |
4562 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 | |
4563 | call .Lkey_expansion_256b | |
4564 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 | |
4565 | call .Lkey_expansion_256a | |
4566 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 | |
4567 | call .Lkey_expansion_256b | |
4568 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 | |
4569 | call .Lkey_expansion_256a | |
d608b4d6 AP |
4570 | $movkey %xmm0,(%rax) |
4571 | mov $bits,16(%rax) # 240($key) | |
d64a7232 | 4572 | xor %rax,%rax |
d608b4d6 AP |
4573 | jmp .Lenc_key_ret |
4574 | ||
23f6eec7 AP |
4575 | .align 16 |
4576 | .L14rounds_alt: | |
4577 | movdqa .Lkey_rotate(%rip),%xmm5 | |
4578 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4579 | mov \$7,%r10d | |
4580 | movdqu %xmm0,0($key) | |
4581 | movdqa %xmm2,%xmm1 | |
4582 | movdqu %xmm2,16($key) | |
4583 | jmp .Loop_key256 | |
4584 | ||
4585 | .align 16 | |
4586 | .Loop_key256: | |
4587 | pshufb %xmm5,%xmm2 | |
4588 | aesenclast %xmm4,%xmm2 | |
4589 | ||
4590 | movdqa %xmm0,%xmm3 | |
4591 | pslldq \$4,%xmm0 | |
4592 | pxor %xmm0,%xmm3 | |
4593 | pslldq \$4,%xmm0 | |
4594 | pxor %xmm0,%xmm3 | |
4595 | pslldq \$4,%xmm0 | |
4596 | pxor %xmm3,%xmm0 | |
4597 | pslld \$1,%xmm4 | |
4598 | ||
4599 | pxor %xmm2,%xmm0 | |
4600 | movdqu %xmm0,(%rax) | |
4601 | ||
4602 | dec %r10d | |
4603 | jz .Ldone_key256 | |
4604 | ||
4605 | pshufd \$0xff,%xmm0,%xmm2 | |
4606 | pxor %xmm3,%xmm3 | |
4607 | aesenclast %xmm3,%xmm2 | |
4608 | ||
4609 | movdqa %xmm1,%xmm3 | |
4610 | pslldq \$4,%xmm1 | |
4611 | pxor %xmm1,%xmm3 | |
4612 | pslldq \$4,%xmm1 | |
4613 | pxor %xmm1,%xmm3 | |
4614 | pslldq \$4,%xmm1 | |
4615 | pxor %xmm3,%xmm1 | |
4616 | ||
4617 | pxor %xmm1,%xmm2 | |
4618 | movdqu %xmm2,16(%rax) | |
4619 | lea 32(%rax),%rax | |
4620 | movdqa %xmm2,%xmm1 | |
4621 | ||
4622 | jmp .Loop_key256 | |
4623 | ||
4624 | .Ldone_key256: | |
4625 | mov $bits,16(%rax) # 240($key) | |
4626 | xor %eax,%eax | |
4627 | jmp .Lenc_key_ret | |
4628 | ||
d608b4d6 AP |
4629 | .align 16 |
4630 | .Lbad_keybits: | |
4631 | mov \$-2,%rax | |
4632 | .Lenc_key_ret: | |
23f6eec7 AP |
4633 | pxor %xmm0,%xmm0 |
4634 | pxor %xmm1,%xmm1 | |
4635 | pxor %xmm2,%xmm2 | |
4636 | pxor %xmm3,%xmm3 | |
4637 | pxor %xmm4,%xmm4 | |
4638 | pxor %xmm5,%xmm5 | |
d608b4d6 | 4639 | add \$8,%rsp |
b84460ad | 4640 | .cfi_adjust_cfa_offset -8 |
d608b4d6 | 4641 | ret |
b84460ad | 4642 | .cfi_endproc |
d608b4d6 AP |
4643 | .LSEH_end_set_encrypt_key: |
4644 | \f | |
4645 | .align 16 | |
4646 | .Lkey_expansion_128: | |
4647 | $movkey %xmm0,(%rax) | |
4648 | lea 16(%rax),%rax | |
4649 | .Lkey_expansion_128_cold: | |
4650 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 | 4651 | xorps %xmm4, %xmm0 |
d608b4d6 | 4652 | shufps \$0b10001100,%xmm0,%xmm4 |
f8501464 AP |
4653 | xorps %xmm4, %xmm0 |
4654 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | |
4655 | xorps %xmm1,%xmm0 | |
d608b4d6 AP |
4656 | ret |
4657 | ||
4658 | .align 16 | |
4659 | .Lkey_expansion_192a: | |
4660 | $movkey %xmm0,(%rax) | |
4661 | lea 16(%rax),%rax | |
4662 | .Lkey_expansion_192a_cold: | |
4663 | movaps %xmm2, %xmm5 | |
4664 | .Lkey_expansion_192b_warm: | |
4665 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 AP |
4666 | movdqa %xmm2,%xmm3 |
4667 | xorps %xmm4,%xmm0 | |
d608b4d6 AP |
4668 | shufps \$0b10001100,%xmm0,%xmm4 |
4669 | pslldq \$4,%xmm3 | |
f8501464 | 4670 | xorps %xmm4,%xmm0 |
d608b4d6 AP |
4671 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path |
4672 | pxor %xmm3,%xmm2 | |
4673 | pxor %xmm1,%xmm0 | |
4674 | pshufd \$0b11111111,%xmm0,%xmm3 | |
4675 | pxor %xmm3,%xmm2 | |
d64a7232 AP |
4676 | ret |
4677 | ||
d608b4d6 AP |
4678 | .align 16 |
4679 | .Lkey_expansion_192b: | |
4680 | movaps %xmm0,%xmm3 | |
4681 | shufps \$0b01000100,%xmm0,%xmm5 | |
4682 | $movkey %xmm5,(%rax) | |
4683 | shufps \$0b01001110,%xmm2,%xmm3 | |
4684 | $movkey %xmm3,16(%rax) | |
4685 | lea 32(%rax),%rax | |
4686 | jmp .Lkey_expansion_192b_warm | |
4687 | ||
d64a7232 AP |
4688 | .align 16 |
4689 | .Lkey_expansion_256a: | |
d608b4d6 AP |
4690 | $movkey %xmm2,(%rax) |
4691 | lea 16(%rax),%rax | |
d64a7232 AP |
4692 | .Lkey_expansion_256a_cold: |
4693 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 | 4694 | xorps %xmm4,%xmm0 |
d64a7232 | 4695 | shufps \$0b10001100,%xmm0,%xmm4 |
f8501464 AP |
4696 | xorps %xmm4,%xmm0 |
4697 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | |
4698 | xorps %xmm1,%xmm0 | |
d64a7232 AP |
4699 | ret |
4700 | ||
4701 | .align 16 | |
4702 | .Lkey_expansion_256b: | |
d608b4d6 AP |
4703 | $movkey %xmm0,(%rax) |
4704 | lea 16(%rax),%rax | |
d64a7232 AP |
4705 | |
4706 | shufps \$0b00010000,%xmm2,%xmm4 | |
f8501464 | 4707 | xorps %xmm4,%xmm2 |
d64a7232 | 4708 | shufps \$0b10001100,%xmm2,%xmm4 |
f8501464 AP |
4709 | xorps %xmm4,%xmm2 |
4710 | shufps \$0b10101010,%xmm1,%xmm1 # critical path | |
4711 | xorps %xmm1,%xmm2 | |
d64a7232 | 4712 | ret |
d608b4d6 | 4713 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key |
f8501464 | 4714 | .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key |
d64a7232 AP |
4715 | ___ |
4716 | } | |
4717 | \f | |
4718 | $code.=<<___; | |
6c83629b AP |
4719 | .align 64 |
4720 | .Lbswap_mask: | |
4721 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |
d7d119a3 | 4722 | .Lincrement32: |
f8501464 | 4723 | .long 6,6,6,0 |
d7d119a3 AP |
4724 | .Lincrement64: |
4725 | .long 1,0,0,0 | |
f8501464 AP |
4726 | .Lxts_magic: |
4727 | .long 0x87,0,1,0 | |
9282c335 AP |
4728 | .Lincrement1: |
4729 | .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 | |
23f6eec7 AP |
4730 | .Lkey_rotate: |
4731 | .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d | |
4732 | .Lkey_rotate192: | |
4733 | .long 0x04070605,0x04070605,0x04070605,0x04070605 | |
4734 | .Lkey_rcon1: | |
4735 | .long 1,1,1,1 | |
4736 | .Lkey_rcon1b: | |
4737 | .long 0x1b,0x1b,0x1b,0x1b | |
f8501464 | 4738 | |
d64a7232 AP |
4739 | .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" |
4740 | .align 64 | |
4741 | ___ | |
4742 | ||
d608b4d6 AP |
4743 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
4744 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
4745 | if ($win64) { | |
4746 | $rec="%rcx"; | |
4747 | $frame="%rdx"; | |
4748 | $context="%r8"; | |
4749 | $disp="%r9"; | |
4750 | ||
4751 | $code.=<<___; | |
4752 | .extern __imp_RtlVirtualUnwind | |
6c83629b AP |
4753 | ___ |
4754 | $code.=<<___ if ($PREFIX eq "aesni"); | |
69d5747f | 4755 | .type ecb_ccm64_se_handler,\@abi-omnipotent |
d7d119a3 | 4756 | .align 16 |
69d5747f | 4757 | ecb_ccm64_se_handler: |
d7d119a3 AP |
4758 | push %rsi |
4759 | push %rdi | |
4760 | push %rbx | |
4761 | push %rbp | |
4762 | push %r12 | |
4763 | push %r13 | |
4764 | push %r14 | |
4765 | push %r15 | |
4766 | pushfq | |
4767 | sub \$64,%rsp | |
4768 | ||
4769 | mov 120($context),%rax # pull context->Rax | |
4770 | mov 248($context),%rbx # pull context->Rip | |
4771 | ||
4772 | mov 8($disp),%rsi # disp->ImageBase | |
02f358da | 4773 | mov 56($disp),%r11 # disp->HandlerData |
d7d119a3 AP |
4774 | |
4775 | mov 0(%r11),%r10d # HandlerData[0] | |
4776 | lea (%rsi,%r10),%r10 # prologue label | |
4777 | cmp %r10,%rbx # context->Rip<prologue label | |
f8501464 | 4778 | jb .Lcommon_seh_tail |
d7d119a3 AP |
4779 | |
4780 | mov 152($context),%rax # pull context->Rsp | |
4781 | ||
4782 | mov 4(%r11),%r10d # HandlerData[1] | |
4783 | lea (%rsi,%r10),%r10 # epilogue label | |
4784 | cmp %r10,%rbx # context->Rip>=epilogue label | |
f8501464 | 4785 | jae .Lcommon_seh_tail |
d7d119a3 | 4786 | |
f8501464 | 4787 | lea 0(%rax),%rsi # %xmm save area |
d7d119a3 AP |
4788 | lea 512($context),%rdi # &context.Xmm6 |
4789 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | |
4790 | .long 0xa548f3fc # cld; rep movsq | |
4791 | lea 0x58(%rax),%rax # adjust stack pointer | |
4792 | ||
f8501464 | 4793 | jmp .Lcommon_seh_tail |
69d5747f | 4794 | .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler |
d7d119a3 | 4795 | |
6c79faaa | 4796 | .type ctr_xts_se_handler,\@abi-omnipotent |
6c83629b | 4797 | .align 16 |
6c79faaa | 4798 | ctr_xts_se_handler: |
f8501464 AP |
4799 | push %rsi |
4800 | push %rdi | |
4801 | push %rbx | |
4802 | push %rbp | |
4803 | push %r12 | |
4804 | push %r13 | |
4805 | push %r14 | |
4806 | push %r15 | |
4807 | pushfq | |
4808 | sub \$64,%rsp | |
4809 | ||
4810 | mov 120($context),%rax # pull context->Rax | |
4811 | mov 248($context),%rbx # pull context->Rip | |
4812 | ||
4813 | mov 8($disp),%rsi # disp->ImageBase | |
4814 | mov 56($disp),%r11 # disp->HandlerData | |
4815 | ||
4816 | mov 0(%r11),%r10d # HandlerData[0] | |
4817 | lea (%rsi,%r10),%r10 # prologue label | |
4818 | cmp %r10,%rbx # context->Rip<prologue label | |
4819 | jb .Lcommon_seh_tail | |
4820 | ||
4821 | mov 152($context),%rax # pull context->Rsp | |
4822 | ||
4823 | mov 4(%r11),%r10d # HandlerData[1] | |
4824 | lea (%rsi,%r10),%r10 # epilogue label | |
4825 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4826 | jae .Lcommon_seh_tail | |
4827 | ||
384e6de4 AP |
4828 | mov 208($context),%rax # pull context->R11 |
4829 | ||
4830 | lea -0xa8(%rax),%rsi # %xmm save area | |
f8501464 AP |
4831 | lea 512($context),%rdi # & context.Xmm6 |
4832 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |
4833 | .long 0xa548f3fc # cld; rep movsq | |
f8501464 | 4834 | |
384e6de4 AP |
4835 | mov -8(%rax),%rbp # restore saved %rbp |
4836 | mov %rbp,160($context) # restore context->Rbp | |
4837 | jmp .Lcommon_seh_tail | |
6c79faaa | 4838 | .size ctr_xts_se_handler,.-ctr_xts_se_handler |
bd30091c AP |
4839 | |
4840 | .type ocb_se_handler,\@abi-omnipotent | |
4841 | .align 16 | |
4842 | ocb_se_handler: | |
4843 | push %rsi | |
4844 | push %rdi | |
4845 | push %rbx | |
4846 | push %rbp | |
4847 | push %r12 | |
4848 | push %r13 | |
4849 | push %r14 | |
4850 | push %r15 | |
4851 | pushfq | |
4852 | sub \$64,%rsp | |
4853 | ||
4854 | mov 120($context),%rax # pull context->Rax | |
4855 | mov 248($context),%rbx # pull context->Rip | |
4856 | ||
4857 | mov 8($disp),%rsi # disp->ImageBase | |
4858 | mov 56($disp),%r11 # disp->HandlerData | |
4859 | ||
4860 | mov 0(%r11),%r10d # HandlerData[0] | |
4861 | lea (%rsi,%r10),%r10 # prologue label | |
4862 | cmp %r10,%rbx # context->Rip<prologue label | |
4863 | jb .Lcommon_seh_tail | |
4864 | ||
4865 | mov 4(%r11),%r10d # HandlerData[1] | |
4866 | lea (%rsi,%r10),%r10 # epilogue label | |
4867 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4868 | jae .Lcommon_seh_tail | |
4869 | ||
4870 | mov 8(%r11),%r10d # HandlerData[2] | |
4871 | lea (%rsi,%r10),%r10 | |
4872 | cmp %r10,%rbx # context->Rip>=pop label | |
4873 | jae .Locb_no_xmm | |
4874 | ||
4875 | mov 152($context),%rax # pull context->Rsp | |
4876 | ||
4877 | lea (%rax),%rsi # %xmm save area | |
4878 | lea 512($context),%rdi # & context.Xmm6 | |
4879 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |
4880 | .long 0xa548f3fc # cld; rep movsq | |
4881 | lea 0xa0+0x28(%rax),%rax | |
4882 | ||
4883 | .Locb_no_xmm: | |
4884 | mov -8(%rax),%rbx | |
4885 | mov -16(%rax),%rbp | |
4886 | mov -24(%rax),%r12 | |
4887 | mov -32(%rax),%r13 | |
4888 | mov -40(%rax),%r14 | |
4889 | ||
4890 | mov %rbx,144($context) # restore context->Rbx | |
4891 | mov %rbp,160($context) # restore context->Rbp | |
4892 | mov %r12,216($context) # restore context->R12 | |
4893 | mov %r13,224($context) # restore context->R13 | |
4894 | mov %r14,232($context) # restore context->R14 | |
4895 | ||
4896 | jmp .Lcommon_seh_tail | |
4897 | .size ocb_se_handler,.-ocb_se_handler | |
6c83629b AP |
4898 | ___ |
4899 | $code.=<<___; | |
4900 | .type cbc_se_handler,\@abi-omnipotent | |
d608b4d6 | 4901 | .align 16 |
6c83629b | 4902 | cbc_se_handler: |
d608b4d6 AP |
4903 | push %rsi |
4904 | push %rdi | |
4905 | push %rbx | |
4906 | push %rbp | |
4907 | push %r12 | |
4908 | push %r13 | |
4909 | push %r14 | |
4910 | push %r15 | |
4911 | pushfq | |
4912 | sub \$64,%rsp | |
4913 | ||
4914 | mov 152($context),%rax # pull context->Rsp | |
6c83629b AP |
4915 | mov 248($context),%rbx # pull context->Rip |
4916 | ||
23f6eec7 | 4917 | lea .Lcbc_decrypt_bulk(%rip),%r10 |
6c83629b | 4918 | cmp %r10,%rbx # context->Rip<"prologue" label |
f8501464 | 4919 | jb .Lcommon_seh_tail |
6c83629b | 4920 | |
384e6de4 AP |
4921 | mov 120($context),%rax # pull context->Rax |
4922 | ||
6c83629b AP |
4923 | lea .Lcbc_decrypt_body(%rip),%r10 |
4924 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | |
384e6de4 AP |
4925 | jb .Lcommon_seh_tail |
4926 | ||
4927 | mov 152($context),%rax # pull context->Rsp | |
6c83629b AP |
4928 | |
4929 | lea .Lcbc_ret(%rip),%r10 | |
4930 | cmp %r10,%rbx # context->Rip>="epilogue" label | |
f8501464 | 4931 | jae .Lcommon_seh_tail |
6c83629b | 4932 | |
6a40ebe8 | 4933 | lea 16(%rax),%rsi # %xmm save area |
6c83629b | 4934 | lea 512($context),%rdi # &context.Xmm6 |
73325b22 | 4935 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) |
6c83629b | 4936 | .long 0xa548f3fc # cld; rep movsq |
6a40ebe8 | 4937 | |
384e6de4 | 4938 | mov 208($context),%rax # pull context->R11 |
6c83629b | 4939 | |
384e6de4 AP |
4940 | mov -8(%rax),%rbp # restore saved %rbp |
4941 | mov %rbp,160($context) # restore context->Rbp | |
f8501464 AP |
4942 | |
4943 | .Lcommon_seh_tail: | |
d608b4d6 AP |
4944 | mov 8(%rax),%rdi |
4945 | mov 16(%rax),%rsi | |
6c83629b | 4946 | mov %rax,152($context) # restore context->Rsp |
d608b4d6 AP |
4947 | mov %rsi,168($context) # restore context->Rsi |
4948 | mov %rdi,176($context) # restore context->Rdi | |
4949 | ||
d608b4d6 AP |
4950 | mov 40($disp),%rdi # disp->ContextRecord |
4951 | mov $context,%rsi # context | |
4952 | mov \$154,%ecx # sizeof(CONTEXT) | |
4953 | .long 0xa548f3fc # cld; rep movsq | |
4954 | ||
4955 | mov $disp,%rsi | |
4956 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
4957 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
4958 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
4959 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
4960 | mov 40(%rsi),%r10 # disp->ContextRecord | |
4961 | lea 56(%rsi),%r11 # &disp->HandlerData | |
4962 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
4963 | mov %r10,32(%rsp) # arg5 | |
4964 | mov %r11,40(%rsp) # arg6 | |
4965 | mov %r12,48(%rsp) # arg7 | |
4966 | mov %rcx,56(%rsp) # arg8, (NULL) | |
4967 | call *__imp_RtlVirtualUnwind(%rip) | |
4968 | ||
4969 | mov \$1,%eax # ExceptionContinueSearch | |
4970 | add \$64,%rsp | |
4971 | popfq | |
4972 | pop %r15 | |
4973 | pop %r14 | |
4974 | pop %r13 | |
4975 | pop %r12 | |
4976 | pop %rbp | |
4977 | pop %rbx | |
4978 | pop %rdi | |
4979 | pop %rsi | |
4980 | ret | |
4981 | .size cbc_se_handler,.-cbc_se_handler | |
4982 | ||
4983 | .section .pdata | |
4984 | .align 4 | |
6c83629b AP |
4985 | ___ |
4986 | $code.=<<___ if ($PREFIX eq "aesni"); | |
4987 | .rva .LSEH_begin_aesni_ecb_encrypt | |
4988 | .rva .LSEH_end_aesni_ecb_encrypt | |
d608b4d6 AP |
4989 | .rva .LSEH_info_ecb |
4990 | ||
d7d119a3 AP |
4991 | .rva .LSEH_begin_aesni_ccm64_encrypt_blocks |
4992 | .rva .LSEH_end_aesni_ccm64_encrypt_blocks | |
02f358da | 4993 | .rva .LSEH_info_ccm64_enc |
d7d119a3 AP |
4994 | |
4995 | .rva .LSEH_begin_aesni_ccm64_decrypt_blocks | |
4996 | .rva .LSEH_end_aesni_ccm64_decrypt_blocks | |
02f358da | 4997 | .rva .LSEH_info_ccm64_dec |
d7d119a3 | 4998 | |
6c83629b AP |
4999 | .rva .LSEH_begin_aesni_ctr32_encrypt_blocks |
5000 | .rva .LSEH_end_aesni_ctr32_encrypt_blocks | |
5001 | .rva .LSEH_info_ctr32 | |
f8501464 AP |
5002 | |
5003 | .rva .LSEH_begin_aesni_xts_encrypt | |
5004 | .rva .LSEH_end_aesni_xts_encrypt | |
5005 | .rva .LSEH_info_xts_enc | |
5006 | ||
5007 | .rva .LSEH_begin_aesni_xts_decrypt | |
5008 | .rva .LSEH_end_aesni_xts_decrypt | |
5009 | .rva .LSEH_info_xts_dec | |
bd30091c AP |
5010 | |
5011 | .rva .LSEH_begin_aesni_ocb_encrypt | |
5012 | .rva .LSEH_end_aesni_ocb_encrypt | |
5013 | .rva .LSEH_info_ocb_enc | |
5014 | ||
5015 | .rva .LSEH_begin_aesni_ocb_decrypt | |
5016 | .rva .LSEH_end_aesni_ocb_decrypt | |
5017 | .rva .LSEH_info_ocb_dec | |
6c83629b AP |
5018 | ___ |
5019 | $code.=<<___; | |
d608b4d6 AP |
5020 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt |
5021 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | |
5022 | .rva .LSEH_info_cbc | |
5023 | ||
d608b4d6 AP |
5024 | .rva ${PREFIX}_set_decrypt_key |
5025 | .rva .LSEH_end_set_decrypt_key | |
5026 | .rva .LSEH_info_key | |
c5036d78 AP |
5027 | |
5028 | .rva ${PREFIX}_set_encrypt_key | |
5029 | .rva .LSEH_end_set_encrypt_key | |
5030 | .rva .LSEH_info_key | |
d608b4d6 AP |
5031 | .section .xdata |
5032 | .align 8 | |
6c83629b AP |
5033 | ___ |
5034 | $code.=<<___ if ($PREFIX eq "aesni"); | |
d608b4d6 AP |
5035 | .LSEH_info_ecb: |
5036 | .byte 9,0,0,0 | |
69d5747f AP |
5037 | .rva ecb_ccm64_se_handler |
5038 | .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] | |
02f358da | 5039 | .LSEH_info_ccm64_enc: |
d7d119a3 | 5040 | .byte 9,0,0,0 |
69d5747f | 5041 | .rva ecb_ccm64_se_handler |
02f358da AP |
5042 | .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] |
5043 | .LSEH_info_ccm64_dec: | |
5044 | .byte 9,0,0,0 | |
69d5747f | 5045 | .rva ecb_ccm64_se_handler |
02f358da | 5046 | .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] |
6c83629b AP |
5047 | .LSEH_info_ctr32: |
5048 | .byte 9,0,0,0 | |
6c79faaa AP |
5049 | .rva ctr_xts_se_handler |
5050 | .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] | |
f8501464 AP |
5051 | .LSEH_info_xts_enc: |
5052 | .byte 9,0,0,0 | |
6c79faaa | 5053 | .rva ctr_xts_se_handler |
f8501464 AP |
5054 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] |
5055 | .LSEH_info_xts_dec: | |
5056 | .byte 9,0,0,0 | |
6c79faaa | 5057 | .rva ctr_xts_se_handler |
f8501464 | 5058 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] |
bd30091c AP |
5059 | .LSEH_info_ocb_enc: |
5060 | .byte 9,0,0,0 | |
5061 | .rva ocb_se_handler | |
5062 | .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] | |
5063 | .rva .Locb_enc_pop | |
5064 | .long 0 | |
5065 | .LSEH_info_ocb_dec: | |
5066 | .byte 9,0,0,0 | |
5067 | .rva ocb_se_handler | |
5068 | .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] | |
5069 | .rva .Locb_dec_pop | |
5070 | .long 0 | |
6c83629b AP |
5071 | ___ |
5072 | $code.=<<___; | |
d608b4d6 AP |
5073 | .LSEH_info_cbc: |
5074 | .byte 9,0,0,0 | |
5075 | .rva cbc_se_handler | |
5076 | .LSEH_info_key: | |
5077 | .byte 0x01,0x04,0x01,0x00 | |
d7d119a3 | 5078 | .byte 0x04,0x02,0x00,0x00 # sub rsp,8 |
d608b4d6 AP |
5079 | ___ |
5080 | } | |
5081 | ||
# Append a REX prefix byte to an opcode list when a register operand
# requires one.
#
#   $opcode - reference to the array of opcode bytes being assembled;
#             it is modified in place
#   $dst    - number of the register that goes into ModR/M.reg
#   $src    - number of the register that goes into ModR/M.r/m
#
# Register numbers 8 and above do not fit the 3-bit ModR/M fields, so the
# extra bit is carried in the prefix: 0x04 (REX.R) for $dst and 0x01
# (REX.B) for $src.  Nothing is appended when both numbers are below 8.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if $dst >= 8;    # REX.R
    $rex |= 0x01 if $src >= 8;    # REX.B
    push @{$opcode}, $rex | 0x40 if $rex;
}
5091 | ||
# Translate one AES-NI instruction written in AT&T syntax into an
# equivalent ".byte" directive.
#
# Recognised forms:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast}  %xmmS,%xmmD
#   aes{enc,enclast,dec,declast}      disp(%rsp),%xmmD
#
#   $line - text of the instruction (mnemonic and operands)
#
# Returns the ".byte" directive for a recognised instruction.  Any other
# text, including an "aes*" mnemonic that is not in the tables below, is
# returned unchanged: the result is substituted back into the generated
# code, so returning undef would silently erase the instruction instead
# of letting the assembler see it.
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # every encoding below starts with the 0x66 prefix

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures right away so later code cannot clobber them.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # A leading 0 marks a hex/octal literal, which oct() converts.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: hand the text back untouched.
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }

    if ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: hand the text back untouched.
        return $line if (!defined($opcodelet{$mnemonic}));

        # An omitted displacement is encoded as an explicit zero byte.
        my $disp = length($off) ? ($off =~ /^0/ ? oct($off) : $off) : 0;

        push @opcode, 0x44 if ($dst >= 8);                 # REX.R
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;      # ModR/M, SIB
        # Only the low byte of the displacement is emitted.
        push @opcode, $disp & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
5131 | ||
5599c733 AP |
# Build the ".byte" directive that stands in for "movbe %eax,disp(%rsp)".
#
#   $disp - displacement from %rsp, appended verbatim as the final byte
#
# The fixed part is the 0x0f,0x38,0xf1 opcode followed by ModR/M 0x44 and
# SIB 0x24.  Returns the directive as a string.
sub movbe {
    my ($disp) = @_;
    my $encoding = ".byte 0x0f,0x38,0xf1,0x44,0x24,";

    return $encoding . $disp;
}
5135 | ||
d64a7232 AP |
# Post-process the accumulated assembly text, then emit it.

# Evaluate Perl expressions embedded between back-quotes.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand every line that names an aes* mnemonic with an %xmm operand to
# aesni(); whatever follows the last %xmm register on the line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# Encode "movbe %eax,N(%rsp)" by hand through movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors only surface at close; fail loudly so that a
# truncated assembly file is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";