]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
d64a7232 AP |
9 | # |
10 | # ==================================================================== | |
d8ba0dc9 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
d64a7232 AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # This module implements support for Intel AES-NI extension. In | |
18 | # OpenSSL context it's used with Intel engine, but can also be used as | |
19 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for | |
20 | # details]. | |
d7d119a3 AP |
21 | # |
22 | # Performance. | |
23 | # | |
24 | # Given aes(enc|dec) instructions' latency asymptotic performance for | |
25 | # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte | |
26 | # processed with 128-bit key. And given their throughput asymptotic | |
27 | # performance for parallelizable modes is 1.25 cycles per byte. Being | |
f8501464 | 28 | # asymptotic limit it's not something you commonly achieve in reality, |
d7d119a3 AP |
29 | # but how close does one get? Below are results collected for |
30 | # different modes and block sizes. Pairs of numbers are for en-/ | |
31 | # decryption. | |
32 | # | |
33 | # 16-byte 64-byte 256-byte 1-KB 8-KB | |
34 | # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 | |
35 | # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 | |
36 | # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 | |
609b0852 | 37 | # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 |
d7d119a3 AP |
38 | # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 |
39 | # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 | |
40 | # | |
41 | # ECB, CTR, CBC and CCM results are free from EVP overhead. This means | |
42 | # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni | |
43 | # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. | |
44 | # The results were collected with specially crafted speed.c benchmark | |
45 | # in order to compare them with results reported in "Intel Advanced | |
46 | # Encryption Standard (AES) New Instruction Set" White Paper Revision | |
47 | # 3.0 dated May 2010. All above results are consistently better. This | |
48 | # module also provides better performance for block sizes smaller than | |
49 | # 128 bytes in points *not* represented in the above table. | |
50 | # | |
51 | # Looking at the results for 8-KB buffer. | |
52 | # | |
53 | # CFB and OFB results are far from the limit, because implementation | |
54 | # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on | |
55 | # single-block aesni_encrypt, which is not the most optimal way to go. | |
56 | # CBC encrypt result is unexpectedly high and there is no documented | |
57 | # explanation for it. Seemingly there is a small penalty for feeding | |
58 | # the result back to AES unit the way it's done in CBC mode. There is | |
59 | # nothing one can do and the result appears optimal. CCM result is | |
60 | # identical to CBC, because CBC-MAC is essentially CBC encrypt without | |
61 | # saving output. CCM CTR "stays invisible," because it's neatly | |
62 | # interleaved with CBC-MAC. This provides ~30% improvement over | |
46f4e1be | 63 | # "straightforward" CCM implementation with CTR and CBC-MAC performed |
d7d119a3 AP |
64 | # disjointly. Parallelizable modes practically achieve the theoretical |
65 | # limit. | |
66 | # | |
67 | # Looking at how results vary with buffer size. | |
68 | # | |
69 | # Curves are practically saturated at 1-KB buffer size. In most cases | |
70 | # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. | |
71 | # CTR curve doesn't follow this pattern and is "slowest" changing one | |
72 | # with "256-byte" result being 87% of "8-KB." This is because overhead | |
73 | # in CTR mode is most computationally intensive. Small-block CCM | |
74 | # decrypt is slower than encrypt, because first CTR and last CBC-MAC | |
75 | # iterations can't be interleaved. | |
76 | # | |
77 | # Results for 192- and 256-bit keys. | |
78 | # | |
79 | # EVP-free results were observed to scale perfectly with number of | |
80 | # rounds for larger block sizes, i.e. 192-bit result being 10/12 times | |
81 | # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences | |
82 | # are a tad smaller, because the above mentioned penalty biases all | |
83 | # results by same constant value. In similar way function call | |
84 | # overhead affects small-block performance, as well as OFB and CFB | |
85 | # results. Differences are not large, most common coefficients are | |
86 | # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one | |
02f358da | 87 | # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... |
d64a7232 | 88 | |
f8501464 AP |
89 | # January 2011 |
90 | # | |
91 | # While Westmere processor features 6 cycles latency for aes[enc|dec] | |
92 | # instructions, which can be scheduled every second cycle, Sandy | |
93 | # Bridge spends 8 cycles per instruction, but it can schedule them | |
94 | # every cycle. This means that code targeting Westmere would perform | |
95 | # suboptimally on Sandy Bridge. Therefore this update. | |
96 | # | |
97 | # In addition, non-parallelizable CBC encrypt (as well as CCM) is | |
98 | # optimized. Relative improvement might appear modest, 8% on Westmere, | |
99 | # but in absolute terms it's 3.77 cycles per byte encrypted with | |
100 | # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers | |
101 | # should be compared to asymptotic limits of 3.75 for Westmere and | |
102 | # 5.00 for Sandy Bridge. Actually, the fact that they get this close | |
103 | # to asymptotic limits is quite amazing. Indeed, the limit is | |
104 | # calculated as latency times number of rounds, 10 for 128-bit key, | |
105 | # and divided by 16, the number of bytes in block, or in other words | |
106 | # it accounts *solely* for aesenc instructions. But there are extra | |
107 | # instructions, and numbers so close to the asymptotic limits mean | |
108 | # that it's as if it takes as little as *one* additional cycle to | |
109 | # execute all of them. How is it possible? It is possible thanks to | |
110 | # out-of-order execution logic, which manages to overlap post- | |
111 | # processing of previous block, things like saving the output, with | |
112 | # actual encryption of current block, as well as pre-processing of | |
113 | # current block, things like fetching input and xor-ing it with | |
114 | # 0-round element of the key schedule, with actual encryption of | |
115 | # previous block. Keep this in mind... | |
116 | # | |
117 | # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher | |
118 | # performance is achieved by interleaving instructions working on | |
119 | # independent blocks. In which case asymptotic limit for such modes | |
120 | # can be obtained by dividing above mentioned numbers by AES | |
609b0852 | 121 | # instructions' interleave factor. Westmere can execute at most 3 |
f8501464 AP |
122 | # instructions at a time, meaning that optimal interleave factor is 3, |
123 | # and that's where the "magic" number of 1.25 comes from. "Optimal | |
124 | # interleave factor" means that increase of interleave factor does | |
125 | # not improve performance. The formula has proven to reflect reality | |
126 | # pretty well on Westmere... Sandy Bridge on the other hand can | |
127 | # execute up to 8 AES instructions at a time, so how does varying | |
128 | # interleave factor affect the performance? Here is table for ECB | |
129 | # (numbers are cycles per byte processed with 128-bit key): | |
130 | # | |
131 | # instruction interleave factor 3x 6x 8x | |
132 | # theoretical asymptotic limit 1.67 0.83 0.625 | |
133 | # measured performance for 8KB block 1.05 0.86 0.84 | |
134 | # | |
135 | # "as if" interleave factor 4.7x 5.8x 6.0x | |
136 | # | |
137 | # Further data for other parallelizable modes: | |
138 | # | |
73325b22 | 139 | # CBC decrypt 1.16 0.93 0.74 |
cd54249c | 140 | # CTR 1.14 0.91 0.74 |
f8501464 AP |
141 | # |
142 | # Well, given 3x column it's probably inappropriate to call the limit | |
143 | # asymptotic, if it can be surpassed, isn't it? What happens there? | |
144 | # Rewind to CBC paragraph for the answer. Yes, out-of-order execution | |
145 | # magic is responsible for this. Processor overlaps not only the | |
46f4e1be | 146 | # additional instructions with AES ones, but even AES instructions |
f8501464 AP |
147 | # processing adjacent triplets of independent blocks. In the 6x case |
148 | # additional instructions still claim disproportionally small amount | |
149 | # of additional cycles, but in 8x case number of instructions must be | |
150 | # a tad too high for out-of-order logic to cope with, and AES unit | |
151 | # remains underutilized... As you can see 8x interleave is hardly | |
152 | # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl | |
46f4e1be | 153 | # utilizes 6x interleave because of limited register bank capacity. |
f8501464 AP |
154 | # |
155 | # Higher interleave factors do have negative impact on Westmere | |
156 | # performance. While for ECB mode it's negligible ~1.5%, other | |
157 | # parallelizables perform ~5% worse, which is outweighed by ~25% | |
158 | # improvement on Sandy Bridge. To balance regression on Westmere | |
159 | # CTR mode was implemented with 6x aesenc interleave factor. | |
160 | ||
161 | # April 2011 | |
162 | # | |
36df342f AP |
163 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing |
164 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like | |
f8501464 AP |
165 | # in CTR mode AES instruction interleave factor was chosen to be 6x. |
166 | ||
bd30091c AP |
167 | # November 2015 |
168 | # | |
169 | # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was | |
170 | # chosen to be 6x. | |
171 | ||
d2e18031 | 172 | ###################################################################### |
5599c733 AP |
173 | # Current large-block performance in cycles per byte processed with |
174 | # 128-bit key (less is better). | |
175 | # | |
bd30091c | 176 | # CBC en-/decrypt CTR XTS ECB OCB |
5599c733 | 177 | # Westmere 3.77/1.25 1.25 1.25 1.26 |
bd30091c AP |
178 | # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 |
179 | # Haswell 4.44/0.63 0.63 0.73 0.63 0.70 | |
b7f5503f | 180 | # Skylake 2.62/0.63 0.63 0.63 0.63 |
bd30091c | 181 | # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 |
64d92d74 | 182 | # Knights L 2.54/0.77 0.78 0.85 - 1.50 |
ace05265 | 183 | # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 |
bd30091c | 184 | # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 |
54f8f9a1 | 185 | # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 |
5599c733 | 186 | # |
23f6eec7 AP |
187 | # (*) Atom Silvermont ECB result is suboptimal because of penalties |
188 | # incurred by operations on %xmm8-15. As ECB is not considered | |
5599c733 | 189 | # critical, nothing was done to mitigate the problem. |
d8ba0dc9 | 190 | |
d64a7232 AP |
191 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script |
192 | # generates drop-in replacement for | |
193 | # crypto/aes/asm/aes-x86_64.pl:-) | |
194 | ||
195 | $flavour = shift; | |
196 | $output = shift; | |
197 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
198 | ||
199 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
200 | ||
201 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
202 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
203 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
204 | die "can't locate x86_64-xlate.pl"; | |
205 | ||
cfe1d992 | 206 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
46bf83f0 | 207 | *STDOUT=*OUT; |
d64a7232 | 208 | |
8da721ee | 209 | $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; |
d608b4d6 AP |
210 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
211 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | |
d64a7232 AP |
212 | |
213 | $code=".text\n"; | |
5599c733 | 214 | $code.=".extern OPENSSL_ia32cap_P\n"; |
d64a7232 AP |
215 | |
216 | $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! | |
d608b4d6 | 217 | # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... |
d64a7232 AP |
218 | $inp="%rdi"; |
219 | $out="%rsi"; | |
d64a7232 AP |
220 | $len="%rdx"; |
221 | $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! | |
d7d119a3 | 222 | $ivp="%r8"; # cbc, ctr, ... |
d64a7232 AP |
223 | |
224 | $rnds_="%r10d"; # backup copy for $rounds | |
225 | $key_="%r11"; # backup copy for $key | |
226 | ||
227 | # %xmm register layout | |
f8501464 AP |
228 | $rndkey0="%xmm0"; $rndkey1="%xmm1"; |
229 | $inout0="%xmm2"; $inout1="%xmm3"; | |
230 | $inout2="%xmm4"; $inout3="%xmm5"; | |
231 | $inout4="%xmm6"; $inout5="%xmm7"; | |
232 | $inout6="%xmm8"; $inout7="%xmm9"; | |
233 | ||
234 | $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... | |
235 | $in0="%xmm8"; $iv="%xmm9"; | |
d64a7232 AP |
236 | \f |
237 | # Inline version of internal aesni_[en|de]crypt1. | |
238 | # | |
239 | # Why folded loop? Because aes[enc|dec] is slow enough to accommodate | |
240 | # cycles which take care of loop variables... | |
241 | { my $sn; | |
d608b4d6 | 242 | sub aesni_generate1 { |
f8501464 | 243 | my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); |
d64a7232 AP |
244 | ++$sn; |
245 | $code.=<<___; | |
f8501464 | 246 | $movkey ($key),$rndkey0 |
d64a7232 | 247 | $movkey 16($key),$rndkey1 |
f8501464 AP |
248 | ___ |
249 | $code.=<<___ if (defined($ivec)); | |
250 | xorps $rndkey0,$ivec | |
251 | lea 32($key),$key | |
252 | xorps $ivec,$inout | |
253 | ___ | |
254 | $code.=<<___ if (!defined($ivec)); | |
d608b4d6 | 255 | lea 32($key),$key |
f8501464 AP |
256 | xorps $rndkey0,$inout |
257 | ___ | |
258 | $code.=<<___; | |
d608b4d6 | 259 | .Loop_${p}1_$sn: |
d7d119a3 | 260 | aes${p} $rndkey1,$inout |
d64a7232 | 261 | dec $rounds |
d64a7232 | 262 | $movkey ($key),$rndkey1 |
d64a7232 | 263 | lea 16($key),$key |
d608b4d6 | 264 | jnz .Loop_${p}1_$sn # loop body is 16 bytes |
d7d119a3 | 265 | aes${p}last $rndkey1,$inout |
d64a7232 AP |
266 | ___ |
267 | }} | |
d608b4d6 | 268 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); |
d64a7232 | 269 | # |
d608b4d6 AP |
270 | { my ($inp,$out,$key) = @_4args; |
271 | ||
d64a7232 AP |
272 | $code.=<<___; |
273 | .globl ${PREFIX}_encrypt | |
d608b4d6 | 274 | .type ${PREFIX}_encrypt,\@abi-omnipotent |
d64a7232 AP |
275 | .align 16 |
276 | ${PREFIX}_encrypt: | |
f8501464 AP |
277 | movups ($inp),$inout0 # load input |
278 | mov 240($key),$rounds # key->rounds | |
d64a7232 | 279 | ___ |
d608b4d6 | 280 | &aesni_generate1("enc",$key,$rounds); |
d64a7232 | 281 | $code.=<<___; |
23f6eec7 AP |
282 | pxor $rndkey0,$rndkey0 # clear register bank |
283 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 284 | movups $inout0,($out) # output |
23f6eec7 | 285 | pxor $inout0,$inout0 |
d64a7232 AP |
286 | ret |
287 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | |
d64a7232 | 288 | |
d64a7232 | 289 | .globl ${PREFIX}_decrypt |
d608b4d6 | 290 | .type ${PREFIX}_decrypt,\@abi-omnipotent |
d64a7232 AP |
291 | .align 16 |
292 | ${PREFIX}_decrypt: | |
f8501464 AP |
293 | movups ($inp),$inout0 # load input |
294 | mov 240($key),$rounds # key->rounds | |
d64a7232 | 295 | ___ |
d608b4d6 | 296 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 297 | $code.=<<___; |
23f6eec7 AP |
298 | pxor $rndkey0,$rndkey0 # clear register bank |
299 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 300 | movups $inout0,($out) # output |
23f6eec7 | 301 | pxor $inout0,$inout0 |
d64a7232 AP |
302 | ret |
303 | .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt | |
304 | ___ | |
d608b4d6 | 305 | } |
d64a7232 | 306 | \f |
f8501464 AP |
307 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave |
308 | # factor. Why were 3x subroutines originally used in loops? Even though | |
309 | # aes[enc|dec] latency was originally 6, it could be scheduled only | |
310 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | |
d608b4d6 AP |
311 | # utilization, i.e. when subroutine's throughput is virtually same as |
312 | # of non-interleaved subroutine [for number of input blocks up to 3]. | |
214368ff AP |
313 | # This is why it originally made no sense to implement 2x subroutine. |
314 | # But times change and it became appropriate to spend extra 192 bytes | |
315 | # on 2x subroutine on Atom Silvermont account. For processors that | |
316 | # can schedule aes[enc|dec] every cycle optimal interleave factor | |
317 | # equals the corresponding instruction's latency. 8x is optimal for | |
609b0852 | 318 | # * Bridge and "super-optimal" for other Intel CPUs... |
214368ff AP |
319 | |
320 | sub aesni_generate2 { | |
321 | my $dir=shift; | |
322 | # As already mentioned it takes in $key and $rounds, which are *not* | |
323 | # preserved. $inout[0-1] is cipher/clear text... | |
324 | $code.=<<___; | |
325 | .type _aesni_${dir}rypt2,\@abi-omnipotent | |
326 | .align 16 | |
327 | _aesni_${dir}rypt2: | |
328 | $movkey ($key),$rndkey0 | |
329 | shl \$4,$rounds | |
330 | $movkey 16($key),$rndkey1 | |
331 | xorps $rndkey0,$inout0 | |
332 | xorps $rndkey0,$inout1 | |
333 | $movkey 32($key),$rndkey0 | |
334 | lea 32($key,$rounds),$key | |
335 | neg %rax # $rounds | |
336 | add \$16,%rax | |
337 | ||
338 | .L${dir}_loop2: | |
339 | aes${dir} $rndkey1,$inout0 | |
340 | aes${dir} $rndkey1,$inout1 | |
341 | $movkey ($key,%rax),$rndkey1 | |
342 | add \$32,%rax | |
343 | aes${dir} $rndkey0,$inout0 | |
344 | aes${dir} $rndkey0,$inout1 | |
345 | $movkey -16($key,%rax),$rndkey0 | |
346 | jnz .L${dir}_loop2 | |
347 | ||
348 | aes${dir} $rndkey1,$inout0 | |
349 | aes${dir} $rndkey1,$inout1 | |
350 | aes${dir}last $rndkey0,$inout0 | |
351 | aes${dir}last $rndkey0,$inout1 | |
352 | ret | |
353 | .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 | |
354 | ___ | |
355 | } | |
d608b4d6 | 356 | sub aesni_generate3 { |
d64a7232 AP |
357 | my $dir=shift; |
358 | # As already mentioned it takes in $key and $rounds, which are *not* | |
d608b4d6 | 359 | # preserved. $inout[0-2] is cipher/clear text... |
d64a7232 | 360 | $code.=<<___; |
d608b4d6 | 361 | .type _aesni_${dir}rypt3,\@abi-omnipotent |
d64a7232 | 362 | .align 16 |
d608b4d6 | 363 | _aesni_${dir}rypt3: |
d64a7232 | 364 | $movkey ($key),$rndkey0 |
d8ba0dc9 | 365 | shl \$4,$rounds |
d64a7232 | 366 | $movkey 16($key),$rndkey1 |
f8501464 AP |
367 | xorps $rndkey0,$inout0 |
368 | xorps $rndkey0,$inout1 | |
369 | xorps $rndkey0,$inout2 | |
d8ba0dc9 AP |
370 | $movkey 32($key),$rndkey0 |
371 | lea 32($key,$rounds),$key | |
372 | neg %rax # $rounds | |
373 | add \$16,%rax | |
d608b4d6 AP |
374 | |
375 | .L${dir}_loop3: | |
376 | aes${dir} $rndkey1,$inout0 | |
d608b4d6 | 377 | aes${dir} $rndkey1,$inout1 |
d608b4d6 | 378 | aes${dir} $rndkey1,$inout2 |
d8ba0dc9 AP |
379 | $movkey ($key,%rax),$rndkey1 |
380 | add \$32,%rax | |
d7d119a3 | 381 | aes${dir} $rndkey0,$inout0 |
d608b4d6 | 382 | aes${dir} $rndkey0,$inout1 |
d608b4d6 | 383 | aes${dir} $rndkey0,$inout2 |
d8ba0dc9 | 384 | $movkey -16($key,%rax),$rndkey0 |
d608b4d6 AP |
385 | jnz .L${dir}_loop3 |
386 | ||
387 | aes${dir} $rndkey1,$inout0 | |
d608b4d6 AP |
388 | aes${dir} $rndkey1,$inout1 |
389 | aes${dir} $rndkey1,$inout2 | |
390 | aes${dir}last $rndkey0,$inout0 | |
391 | aes${dir}last $rndkey0,$inout1 | |
392 | aes${dir}last $rndkey0,$inout2 | |
393 | ret | |
394 | .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 | |
395 | ___ | |
396 | } | |
397 | # 4x interleave is implemented to improve small block performance, | |
398 | # most notably [and naturally] 4 block by ~30%. One can argue that one | |
399 | # should have implemented 5x as well, but improvement would be <20%, | |
400 | # so it's not worth it... | |
401 | sub aesni_generate4 { | |
402 | my $dir=shift; | |
403 | # As already mentioned it takes in $key and $rounds, which are *not* | |
404 | # preserved. $inout[0-3] is cipher/clear text... | |
405 | $code.=<<___; | |
406 | .type _aesni_${dir}rypt4,\@abi-omnipotent | |
407 | .align 16 | |
408 | _aesni_${dir}rypt4: | |
409 | $movkey ($key),$rndkey0 | |
d8ba0dc9 | 410 | shl \$4,$rounds |
d608b4d6 | 411 | $movkey 16($key),$rndkey1 |
f8501464 AP |
412 | xorps $rndkey0,$inout0 |
413 | xorps $rndkey0,$inout1 | |
414 | xorps $rndkey0,$inout2 | |
415 | xorps $rndkey0,$inout3 | |
d8ba0dc9 AP |
416 | $movkey 32($key),$rndkey0 |
417 | lea 32($key,$rounds),$key | |
418 | neg %rax # $rounds | |
419 | .byte 0x0f,0x1f,0x00 | |
420 | add \$16,%rax | |
d608b4d6 AP |
421 | |
422 | .L${dir}_loop4: | |
d64a7232 | 423 | aes${dir} $rndkey1,$inout0 |
d64a7232 | 424 | aes${dir} $rndkey1,$inout1 |
d64a7232 AP |
425 | aes${dir} $rndkey1,$inout2 |
426 | aes${dir} $rndkey1,$inout3 | |
d8ba0dc9 AP |
427 | $movkey ($key,%rax),$rndkey1 |
428 | add \$32,%rax | |
d7d119a3 | 429 | aes${dir} $rndkey0,$inout0 |
d64a7232 | 430 | aes${dir} $rndkey0,$inout1 |
d64a7232 AP |
431 | aes${dir} $rndkey0,$inout2 |
432 | aes${dir} $rndkey0,$inout3 | |
d8ba0dc9 | 433 | $movkey -16($key,%rax),$rndkey0 |
d608b4d6 AP |
434 | jnz .L${dir}_loop4 |
435 | ||
d64a7232 | 436 | aes${dir} $rndkey1,$inout0 |
d64a7232 AP |
437 | aes${dir} $rndkey1,$inout1 |
438 | aes${dir} $rndkey1,$inout2 | |
439 | aes${dir} $rndkey1,$inout3 | |
d64a7232 AP |
440 | aes${dir}last $rndkey0,$inout0 |
441 | aes${dir}last $rndkey0,$inout1 | |
442 | aes${dir}last $rndkey0,$inout2 | |
443 | aes${dir}last $rndkey0,$inout3 | |
d64a7232 | 444 | ret |
d608b4d6 | 445 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 |
d64a7232 AP |
446 | ___ |
447 | } | |
f8501464 AP |
448 | sub aesni_generate6 { |
449 | my $dir=shift; | |
450 | # As already mentioned it takes in $key and $rounds, which are *not* | |
451 | # preserved. $inout[0-5] is cipher/clear text... | |
452 | $code.=<<___; | |
453 | .type _aesni_${dir}rypt6,\@abi-omnipotent | |
454 | .align 16 | |
455 | _aesni_${dir}rypt6: | |
456 | $movkey ($key),$rndkey0 | |
d8ba0dc9 | 457 | shl \$4,$rounds |
f8501464 | 458 | $movkey 16($key),$rndkey1 |
f8501464 AP |
459 | xorps $rndkey0,$inout0 |
460 | pxor $rndkey0,$inout1 | |
f8501464 | 461 | pxor $rndkey0,$inout2 |
d8ba0dc9 AP |
462 | aes${dir} $rndkey1,$inout0 |
463 | lea 32($key,$rounds),$key | |
464 | neg %rax # $rounds | |
f8501464 AP |
465 | aes${dir} $rndkey1,$inout1 |
466 | pxor $rndkey0,$inout3 | |
f8501464 | 467 | pxor $rndkey0,$inout4 |
d8ba0dc9 | 468 | aes${dir} $rndkey1,$inout2 |
f8501464 | 469 | pxor $rndkey0,$inout5 |
23f6eec7 | 470 | $movkey ($key,%rax),$rndkey0 |
d8ba0dc9 | 471 | add \$16,%rax |
f8501464 AP |
472 | jmp .L${dir}_loop6_enter |
473 | .align 16 | |
474 | .L${dir}_loop6: | |
475 | aes${dir} $rndkey1,$inout0 | |
476 | aes${dir} $rndkey1,$inout1 | |
f8501464 | 477 | aes${dir} $rndkey1,$inout2 |
23f6eec7 | 478 | .L${dir}_loop6_enter: |
f8501464 AP |
479 | aes${dir} $rndkey1,$inout3 |
480 | aes${dir} $rndkey1,$inout4 | |
481 | aes${dir} $rndkey1,$inout5 | |
d8ba0dc9 AP |
482 | $movkey ($key,%rax),$rndkey1 |
483 | add \$32,%rax | |
f8501464 AP |
484 | aes${dir} $rndkey0,$inout0 |
485 | aes${dir} $rndkey0,$inout1 | |
f8501464 AP |
486 | aes${dir} $rndkey0,$inout2 |
487 | aes${dir} $rndkey0,$inout3 | |
488 | aes${dir} $rndkey0,$inout4 | |
489 | aes${dir} $rndkey0,$inout5 | |
d8ba0dc9 | 490 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
491 | jnz .L${dir}_loop6 |
492 | ||
493 | aes${dir} $rndkey1,$inout0 | |
494 | aes${dir} $rndkey1,$inout1 | |
495 | aes${dir} $rndkey1,$inout2 | |
496 | aes${dir} $rndkey1,$inout3 | |
497 | aes${dir} $rndkey1,$inout4 | |
498 | aes${dir} $rndkey1,$inout5 | |
499 | aes${dir}last $rndkey0,$inout0 | |
500 | aes${dir}last $rndkey0,$inout1 | |
501 | aes${dir}last $rndkey0,$inout2 | |
502 | aes${dir}last $rndkey0,$inout3 | |
503 | aes${dir}last $rndkey0,$inout4 | |
504 | aes${dir}last $rndkey0,$inout5 | |
505 | ret | |
506 | .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 | |
507 | ___ | |
508 | } | |
509 | sub aesni_generate8 { | |
510 | my $dir=shift; | |
511 | # As already mentioned it takes in $key and $rounds, which are *not* | |
512 | # preserved. $inout[0-7] is cipher/clear text... | |
513 | $code.=<<___; | |
514 | .type _aesni_${dir}rypt8,\@abi-omnipotent | |
515 | .align 16 | |
516 | _aesni_${dir}rypt8: | |
517 | $movkey ($key),$rndkey0 | |
d8ba0dc9 | 518 | shl \$4,$rounds |
f8501464 | 519 | $movkey 16($key),$rndkey1 |
f8501464 AP |
520 | xorps $rndkey0,$inout0 |
521 | xorps $rndkey0,$inout1 | |
f8501464 | 522 | pxor $rndkey0,$inout2 |
f8501464 | 523 | pxor $rndkey0,$inout3 |
f8501464 | 524 | pxor $rndkey0,$inout4 |
d8ba0dc9 AP |
525 | lea 32($key,$rounds),$key |
526 | neg %rax # $rounds | |
527 | aes${dir} $rndkey1,$inout0 | |
f8501464 | 528 | pxor $rndkey0,$inout5 |
f8501464 | 529 | pxor $rndkey0,$inout6 |
23f6eec7 | 530 | aes${dir} $rndkey1,$inout1 |
f8501464 | 531 | pxor $rndkey0,$inout7 |
23f6eec7 AP |
532 | $movkey ($key,%rax),$rndkey0 |
533 | add \$16,%rax | |
534 | jmp .L${dir}_loop8_inner | |
f8501464 AP |
535 | .align 16 |
536 | .L${dir}_loop8: | |
537 | aes${dir} $rndkey1,$inout0 | |
538 | aes${dir} $rndkey1,$inout1 | |
23f6eec7 | 539 | .L${dir}_loop8_inner: |
f8501464 AP |
540 | aes${dir} $rndkey1,$inout2 |
541 | aes${dir} $rndkey1,$inout3 | |
542 | aes${dir} $rndkey1,$inout4 | |
543 | aes${dir} $rndkey1,$inout5 | |
544 | aes${dir} $rndkey1,$inout6 | |
545 | aes${dir} $rndkey1,$inout7 | |
d8ba0dc9 AP |
546 | .L${dir}_loop8_enter: |
547 | $movkey ($key,%rax),$rndkey1 | |
548 | add \$32,%rax | |
f8501464 AP |
549 | aes${dir} $rndkey0,$inout0 |
550 | aes${dir} $rndkey0,$inout1 | |
f8501464 AP |
551 | aes${dir} $rndkey0,$inout2 |
552 | aes${dir} $rndkey0,$inout3 | |
553 | aes${dir} $rndkey0,$inout4 | |
554 | aes${dir} $rndkey0,$inout5 | |
555 | aes${dir} $rndkey0,$inout6 | |
556 | aes${dir} $rndkey0,$inout7 | |
d8ba0dc9 | 557 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
558 | jnz .L${dir}_loop8 |
559 | ||
560 | aes${dir} $rndkey1,$inout0 | |
561 | aes${dir} $rndkey1,$inout1 | |
562 | aes${dir} $rndkey1,$inout2 | |
563 | aes${dir} $rndkey1,$inout3 | |
564 | aes${dir} $rndkey1,$inout4 | |
565 | aes${dir} $rndkey1,$inout5 | |
566 | aes${dir} $rndkey1,$inout6 | |
567 | aes${dir} $rndkey1,$inout7 | |
568 | aes${dir}last $rndkey0,$inout0 | |
569 | aes${dir}last $rndkey0,$inout1 | |
570 | aes${dir}last $rndkey0,$inout2 | |
571 | aes${dir}last $rndkey0,$inout3 | |
572 | aes${dir}last $rndkey0,$inout4 | |
573 | aes${dir}last $rndkey0,$inout5 | |
574 | aes${dir}last $rndkey0,$inout6 | |
575 | aes${dir}last $rndkey0,$inout7 | |
576 | ret | |
577 | .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 | |
578 | ___ | |
579 | } | |
214368ff AP |
580 | &aesni_generate2("enc") if ($PREFIX eq "aesni"); |
581 | &aesni_generate2("dec"); | |
d608b4d6 AP |
582 | &aesni_generate3("enc") if ($PREFIX eq "aesni"); |
583 | &aesni_generate3("dec"); | |
584 | &aesni_generate4("enc") if ($PREFIX eq "aesni"); | |
585 | &aesni_generate4("dec"); | |
f8501464 AP |
586 | &aesni_generate6("enc") if ($PREFIX eq "aesni"); |
587 | &aesni_generate6("dec"); | |
588 | &aesni_generate8("enc") if ($PREFIX eq "aesni"); | |
589 | &aesni_generate8("dec"); | |
d64a7232 AP |
590 | \f |
591 | if ($PREFIX eq "aesni") { | |
6c83629b | 592 | ######################################################################## |
d64a7232 AP |
593 | # void aesni_ecb_encrypt (const void *in, void *out, |
594 | # size_t length, const AES_KEY *key, | |
595 | # int enc); | |
596 | $code.=<<___; | |
597 | .globl aesni_ecb_encrypt | |
598 | .type aesni_ecb_encrypt,\@function,5 | |
599 | .align 16 | |
600 | aesni_ecb_encrypt: | |
69d5747f AP |
601 | ___ |
602 | $code.=<<___ if ($win64); | |
603 | lea -0x58(%rsp),%rsp | |
23f6eec7 | 604 | movaps %xmm6,(%rsp) # offload $inout4..7 |
69d5747f AP |
605 | movaps %xmm7,0x10(%rsp) |
606 | movaps %xmm8,0x20(%rsp) | |
607 | movaps %xmm9,0x30(%rsp) | |
608 | .Lecb_enc_body: | |
609 | ___ | |
610 | $code.=<<___; | |
23f6eec7 AP |
611 | and \$-16,$len # if ($len<16) |
612 | jz .Lecb_ret # return | |
f8501464 AP |
613 | |
614 | mov 240($key),$rounds # key->rounds | |
615 | $movkey ($key),$rndkey0 | |
d64a7232 | 616 | mov $key,$key_ # backup $key |
d64a7232 | 617 | mov $rounds,$rnds_ # backup $rounds |
d7d119a3 | 618 | test %r8d,%r8d # 5th argument |
d64a7232 AP |
619 | jz .Lecb_decrypt |
620 | #--------------------------- ECB ENCRYPT ------------------------------# | |
23f6eec7 AP |
621 | cmp \$0x80,$len # if ($len<8*16) |
622 | jb .Lecb_enc_tail # short input | |
f8501464 | 623 | |
23f6eec7 | 624 | movdqu ($inp),$inout0 # load 8 input blocks |
f8501464 AP |
625 | movdqu 0x10($inp),$inout1 |
626 | movdqu 0x20($inp),$inout2 | |
627 | movdqu 0x30($inp),$inout3 | |
628 | movdqu 0x40($inp),$inout4 | |
629 | movdqu 0x50($inp),$inout5 | |
630 | movdqu 0x60($inp),$inout6 | |
631 | movdqu 0x70($inp),$inout7 | |
23f6eec7 AP |
632 | lea 0x80($inp),$inp # $inp+=8*16 |
633 | sub \$0x80,$len # $len-=8*16 (can be zero) | |
f8501464 | 634 | jmp .Lecb_enc_loop8_enter |
d64a7232 | 635 | .align 16 |
f8501464 | 636 | .Lecb_enc_loop8: |
23f6eec7 | 637 | movups $inout0,($out) # store 8 output blocks |
f8501464 | 638 | mov $key_,$key # restore $key |
23f6eec7 | 639 | movdqu ($inp),$inout0 # load 8 input blocks |
d64a7232 | 640 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 641 | movups $inout1,0x10($out) |
f8501464 AP |
642 | movdqu 0x10($inp),$inout1 |
643 | movups $inout2,0x20($out) | |
644 | movdqu 0x20($inp),$inout2 | |
645 | movups $inout3,0x30($out) | |
646 | movdqu 0x30($inp),$inout3 | |
647 | movups $inout4,0x40($out) | |
648 | movdqu 0x40($inp),$inout4 | |
649 | movups $inout5,0x50($out) | |
650 | movdqu 0x50($inp),$inout5 | |
651 | movups $inout6,0x60($out) | |
652 | movdqu 0x60($inp),$inout6 | |
653 | movups $inout7,0x70($out) | |
23f6eec7 | 654 | lea 0x80($out),$out # $out+=8*16 |
f8501464 | 655 | movdqu 0x70($inp),$inout7 |
23f6eec7 | 656 | lea 0x80($inp),$inp # $inp+=8*16 |
f8501464 AP |
657 | .Lecb_enc_loop8_enter: |
658 | ||
659 | call _aesni_encrypt8 | |
660 | ||
661 | sub \$0x80,$len | |
23f6eec7 | 662 | jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow |
f8501464 | 663 | |
23f6eec7 | 664 | movups $inout0,($out) # store 8 output blocks |
d64a7232 | 665 | mov $key_,$key # restore $key |
f8501464 AP |
666 | movups $inout1,0x10($out) |
667 | mov $rnds_,$rounds # restore $rounds | |
d7d119a3 | 668 | movups $inout2,0x20($out) |
f8501464 AP |
669 | movups $inout3,0x30($out) |
670 | movups $inout4,0x40($out) | |
671 | movups $inout5,0x50($out) | |
672 | movups $inout6,0x60($out) | |
673 | movups $inout7,0x70($out) | |
23f6eec7 AP |
674 | lea 0x80($out),$out # $out+=8*16 |
675 | add \$0x80,$len # restore real remaining $len | |
676 | jz .Lecb_ret # done if ($len==0) | |
d64a7232 | 677 | |
23f6eec7 | 678 | .Lecb_enc_tail: # $len is less than 8*16 |
6c83629b | 679 | movups ($inp),$inout0 |
d7d119a3 | 680 | cmp \$0x20,$len |
6c83629b | 681 | jb .Lecb_enc_one |
d64a7232 AP |
682 | movups 0x10($inp),$inout1 |
683 | je .Lecb_enc_two | |
d64a7232 | 684 | movups 0x20($inp),$inout2 |
f8501464 AP |
685 | cmp \$0x40,$len |
686 | jb .Lecb_enc_three | |
d64a7232 | 687 | movups 0x30($inp),$inout3 |
f8501464 AP |
688 | je .Lecb_enc_four |
689 | movups 0x40($inp),$inout4 | |
690 | cmp \$0x60,$len | |
691 | jb .Lecb_enc_five | |
692 | movups 0x50($inp),$inout5 | |
693 | je .Lecb_enc_six | |
694 | movdqu 0x60($inp),$inout6 | |
23f6eec7 | 695 | xorps $inout7,$inout7 |
f8501464 | 696 | call _aesni_encrypt8 |
23f6eec7 | 697 | movups $inout0,($out) # store 7 output blocks |
d64a7232 AP |
698 | movups $inout1,0x10($out) |
699 | movups $inout2,0x20($out) | |
700 | movups $inout3,0x30($out) | |
f8501464 AP |
701 | movups $inout4,0x40($out) |
702 | movups $inout5,0x50($out) | |
703 | movups $inout6,0x60($out) | |
d64a7232 AP |
704 | jmp .Lecb_ret |
705 | .align 16 | |
706 | .Lecb_enc_one: | |
707 | ___ | |
d608b4d6 | 708 | &aesni_generate1("enc",$key,$rounds); |
d64a7232 | 709 | $code.=<<___; |
23f6eec7 | 710 | movups $inout0,($out) # store one output block |
d64a7232 AP |
711 | jmp .Lecb_ret |
712 | .align 16 | |
713 | .Lecb_enc_two: | |
214368ff | 714 | call _aesni_encrypt2 |
23f6eec7 | 715 | movups $inout0,($out) # store 2 output blocks |
d64a7232 AP |
716 | movups $inout1,0x10($out) |
717 | jmp .Lecb_ret | |
718 | .align 16 | |
719 | .Lecb_enc_three: | |
d608b4d6 | 720 | call _aesni_encrypt3 |
23f6eec7 | 721 | movups $inout0,($out) # store 3 output blocks |
d64a7232 AP |
722 | movups $inout1,0x10($out) |
723 | movups $inout2,0x20($out) | |
724 | jmp .Lecb_ret | |
f8501464 AP |
725 | .align 16 |
726 | .Lecb_enc_four: | |
727 | call _aesni_encrypt4 | |
23f6eec7 | 728 | movups $inout0,($out) # store 4 output blocks |
f8501464 AP |
729 | movups $inout1,0x10($out) |
730 | movups $inout2,0x20($out) | |
731 | movups $inout3,0x30($out) | |
732 | jmp .Lecb_ret | |
733 | .align 16 | |
734 | .Lecb_enc_five: | |
735 | xorps $inout5,$inout5 | |
736 | call _aesni_encrypt6 | |
23f6eec7 | 737 | movups $inout0,($out) # store 5 output blocks |
f8501464 AP |
738 | movups $inout1,0x10($out) |
739 | movups $inout2,0x20($out) | |
740 | movups $inout3,0x30($out) | |
741 | movups $inout4,0x40($out) | |
742 | jmp .Lecb_ret | |
743 | .align 16 | |
744 | .Lecb_enc_six: | |
745 | call _aesni_encrypt6 | |
23f6eec7 | 746 | movups $inout0,($out) # store 6 output blocks |
f8501464 AP |
747 | movups $inout1,0x10($out) |
748 | movups $inout2,0x20($out) | |
749 | movups $inout3,0x30($out) | |
750 | movups $inout4,0x40($out) | |
751 | movups $inout5,0x50($out) | |
752 | jmp .Lecb_ret | |
d64a7232 AP |
753 | \f#--------------------------- ECB DECRYPT ------------------------------# |
754 | .align 16 | |
755 | .Lecb_decrypt: | |
23f6eec7 AP |
756 | cmp \$0x80,$len # if ($len<8*16) |
757 | jb .Lecb_dec_tail # short input | |
f8501464 | 758 | |
23f6eec7 | 759 | movdqu ($inp),$inout0 # load 8 input blocks |
f8501464 AP |
760 | movdqu 0x10($inp),$inout1 |
761 | movdqu 0x20($inp),$inout2 | |
762 | movdqu 0x30($inp),$inout3 | |
763 | movdqu 0x40($inp),$inout4 | |
764 | movdqu 0x50($inp),$inout5 | |
765 | movdqu 0x60($inp),$inout6 | |
766 | movdqu 0x70($inp),$inout7 | |
23f6eec7 AP |
767 | lea 0x80($inp),$inp # $inp+=8*16 |
768 | sub \$0x80,$len # $len-=8*16 (can be zero) | |
f8501464 | 769 | jmp .Lecb_dec_loop8_enter |
d64a7232 | 770 | .align 16 |
f8501464 | 771 | .Lecb_dec_loop8: |
23f6eec7 | 772 | movups $inout0,($out) # store 8 output blocks |
f8501464 | 773 | mov $key_,$key # restore $key |
23f6eec7 | 774 | movdqu ($inp),$inout0 # load 8 input blocks |
d64a7232 | 775 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 776 | movups $inout1,0x10($out) |
f8501464 AP |
777 | movdqu 0x10($inp),$inout1 |
778 | movups $inout2,0x20($out) | |
779 | movdqu 0x20($inp),$inout2 | |
780 | movups $inout3,0x30($out) | |
781 | movdqu 0x30($inp),$inout3 | |
782 | movups $inout4,0x40($out) | |
783 | movdqu 0x40($inp),$inout4 | |
784 | movups $inout5,0x50($out) | |
785 | movdqu 0x50($inp),$inout5 | |
786 | movups $inout6,0x60($out) | |
787 | movdqu 0x60($inp),$inout6 | |
788 | movups $inout7,0x70($out) | |
23f6eec7 | 789 | lea 0x80($out),$out # $out+=8*16 |
f8501464 | 790 | movdqu 0x70($inp),$inout7 |
23f6eec7 | 791 | lea 0x80($inp),$inp # $inp+=8*16 |
f8501464 AP |
792 | .Lecb_dec_loop8_enter: |
793 | ||
794 | call _aesni_decrypt8 | |
795 | ||
796 | $movkey ($key_),$rndkey0 | |
797 | sub \$0x80,$len | |
23f6eec7 | 798 | jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow |
f8501464 | 799 | |
23f6eec7 AP |
800 | movups $inout0,($out) # store 8 output blocks |
801 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 802 | mov $key_,$key # restore $key |
f8501464 | 803 | movups $inout1,0x10($out) |
23f6eec7 | 804 | pxor $inout1,$inout1 |
f8501464 | 805 | mov $rnds_,$rounds # restore $rounds |
d7d119a3 | 806 | movups $inout2,0x20($out) |
23f6eec7 | 807 | pxor $inout2,$inout2 |
f8501464 | 808 | movups $inout3,0x30($out) |
23f6eec7 | 809 | pxor $inout3,$inout3 |
f8501464 | 810 | movups $inout4,0x40($out) |
23f6eec7 | 811 | pxor $inout4,$inout4 |
f8501464 | 812 | movups $inout5,0x50($out) |
23f6eec7 | 813 | pxor $inout5,$inout5 |
f8501464 | 814 | movups $inout6,0x60($out) |
23f6eec7 | 815 | pxor $inout6,$inout6 |
f8501464 | 816 | movups $inout7,0x70($out) |
23f6eec7 AP |
817 | pxor $inout7,$inout7 |
818 | lea 0x80($out),$out # $out+=8*16 | |
819 | add \$0x80,$len # restore real remaining $len | |
820 | jz .Lecb_ret # done if ($len==0) | |
d64a7232 | 821 | |
6c83629b | 822 | .Lecb_dec_tail: |
6c83629b | 823 | movups ($inp),$inout0 |
d7d119a3 | 824 | cmp \$0x20,$len |
6c83629b | 825 | jb .Lecb_dec_one |
d64a7232 AP |
826 | movups 0x10($inp),$inout1 |
827 | je .Lecb_dec_two | |
d64a7232 | 828 | movups 0x20($inp),$inout2 |
f8501464 AP |
829 | cmp \$0x40,$len |
830 | jb .Lecb_dec_three | |
d64a7232 | 831 | movups 0x30($inp),$inout3 |
f8501464 AP |
832 | je .Lecb_dec_four |
833 | movups 0x40($inp),$inout4 | |
834 | cmp \$0x60,$len | |
835 | jb .Lecb_dec_five | |
836 | movups 0x50($inp),$inout5 | |
837 | je .Lecb_dec_six | |
838 | movups 0x60($inp),$inout6 | |
839 | $movkey ($key),$rndkey0 | |
23f6eec7 | 840 | xorps $inout7,$inout7 |
f8501464 | 841 | call _aesni_decrypt8 |
23f6eec7 AP |
842 | movups $inout0,($out) # store 7 output blocks |
843 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 844 | movups $inout1,0x10($out) |
23f6eec7 | 845 | pxor $inout1,$inout1 |
d64a7232 | 846 | movups $inout2,0x20($out) |
23f6eec7 | 847 | pxor $inout2,$inout2 |
d64a7232 | 848 | movups $inout3,0x30($out) |
23f6eec7 | 849 | pxor $inout3,$inout3 |
f8501464 | 850 | movups $inout4,0x40($out) |
23f6eec7 | 851 | pxor $inout4,$inout4 |
f8501464 | 852 | movups $inout5,0x50($out) |
23f6eec7 | 853 | pxor $inout5,$inout5 |
f8501464 | 854 | movups $inout6,0x60($out) |
23f6eec7 AP |
855 | pxor $inout6,$inout6 |
856 | pxor $inout7,$inout7 | |
d64a7232 AP |
857 | jmp .Lecb_ret |
858 | .align 16 | |
859 | .Lecb_dec_one: | |
860 | ___ | |
d608b4d6 | 861 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 862 | $code.=<<___; |
23f6eec7 AP |
863 | movups $inout0,($out) # store one output block |
864 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 AP |
865 | jmp .Lecb_ret |
866 | .align 16 | |
867 | .Lecb_dec_two: | |
214368ff | 868 | call _aesni_decrypt2 |
23f6eec7 AP |
869 | movups $inout0,($out) # store 2 output blocks |
870 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 871 | movups $inout1,0x10($out) |
23f6eec7 | 872 | pxor $inout1,$inout1 |
d64a7232 AP |
873 | jmp .Lecb_ret |
874 | .align 16 | |
875 | .Lecb_dec_three: | |
d608b4d6 | 876 | call _aesni_decrypt3 |
23f6eec7 AP |
877 | movups $inout0,($out) # store 3 output blocks |
878 | pxor $inout0,$inout0 # clear register bank | |
d64a7232 | 879 | movups $inout1,0x10($out) |
23f6eec7 | 880 | pxor $inout1,$inout1 |
d64a7232 | 881 | movups $inout2,0x20($out) |
23f6eec7 | 882 | pxor $inout2,$inout2 |
f8501464 AP |
883 | jmp .Lecb_ret |
884 | .align 16 | |
885 | .Lecb_dec_four: | |
886 | call _aesni_decrypt4 | |
23f6eec7 AP |
887 | movups $inout0,($out) # store 4 output blocks |
888 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 889 | movups $inout1,0x10($out) |
23f6eec7 | 890 | pxor $inout1,$inout1 |
f8501464 | 891 | movups $inout2,0x20($out) |
23f6eec7 | 892 | pxor $inout2,$inout2 |
f8501464 | 893 | movups $inout3,0x30($out) |
23f6eec7 | 894 | pxor $inout3,$inout3 |
f8501464 AP |
895 | jmp .Lecb_ret |
896 | .align 16 | |
897 | .Lecb_dec_five: | |
898 | xorps $inout5,$inout5 | |
899 | call _aesni_decrypt6 | |
23f6eec7 AP |
900 | movups $inout0,($out) # store 5 output blocks |
901 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 902 | movups $inout1,0x10($out) |
23f6eec7 | 903 | pxor $inout1,$inout1 |
f8501464 | 904 | movups $inout2,0x20($out) |
23f6eec7 | 905 | pxor $inout2,$inout2 |
f8501464 | 906 | movups $inout3,0x30($out) |
23f6eec7 | 907 | pxor $inout3,$inout3 |
f8501464 | 908 | movups $inout4,0x40($out) |
23f6eec7 AP |
909 | pxor $inout4,$inout4 |
910 | pxor $inout5,$inout5 | |
f8501464 AP |
911 | jmp .Lecb_ret |
912 | .align 16 | |
913 | .Lecb_dec_six: | |
914 | call _aesni_decrypt6 | |
23f6eec7 AP |
915 | movups $inout0,($out) # store 6 output blocks |
916 | pxor $inout0,$inout0 # clear register bank | |
f8501464 | 917 | movups $inout1,0x10($out) |
23f6eec7 | 918 | pxor $inout1,$inout1 |
f8501464 | 919 | movups $inout2,0x20($out) |
23f6eec7 | 920 | pxor $inout2,$inout2 |
f8501464 | 921 | movups $inout3,0x30($out) |
23f6eec7 | 922 | pxor $inout3,$inout3 |
f8501464 | 923 | movups $inout4,0x40($out) |
23f6eec7 | 924 | pxor $inout4,$inout4 |
f8501464 | 925 | movups $inout5,0x50($out) |
23f6eec7 | 926 | pxor $inout5,$inout5 |
d64a7232 AP |
927 | |
928 | .Lecb_ret: | |
23f6eec7 AP |
929 | xorps $rndkey0,$rndkey0 # %xmm0 |
930 | pxor $rndkey1,$rndkey1 | |
69d5747f AP |
931 | ___ |
932 | $code.=<<___ if ($win64); | |
933 | movaps (%rsp),%xmm6 | |
23f6eec7 | 934 | movaps %xmm0,(%rsp) # clear stack |
69d5747f | 935 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 936 | movaps %xmm0,0x10(%rsp) |
69d5747f | 937 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 938 | movaps %xmm0,0x20(%rsp) |
69d5747f | 939 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 940 | movaps %xmm0,0x30(%rsp) |
69d5747f AP |
941 | lea 0x58(%rsp),%rsp |
942 | .Lecb_enc_ret: | |
943 | ___ | |
944 | $code.=<<___; | |
d64a7232 AP |
945 | ret |
946 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt | |
947 | ___ | |
d7d119a3 AP |
948 | \f |
949 | { | |
6c83629b | 950 | ###################################################################### |
d7d119a3 AP |
951 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, |
952 | # size_t blocks, const AES_KEY *key, | |
953 | # const char *ivec,char *cmac); | |
6c83629b | 954 | # |
d7d119a3 AP |
955 | # Handles only complete blocks, operates on 64-bit counter and |
956 | # does not update *ivec! Nor does it finalize CMAC value | |
957 | # (see engine/eng_aesni.c for details) | |
958 | # | |
959 | { | |
# Register aliases private to the two CCM64 routines generated in this block.
960 | my $cmac="%r9"; # 6th argument |
961 | |
# XMM registers for the counter increment, the counter block and the pshufb
# mask.  The Win64-only prologue below spills %xmm6-%xmm9 before they are used.
d8ba0dc9 AP |
962 | my $increment="%xmm9"; |
963 | my $iv="%xmm6"; |
267b481c | 964 | my $bswap_mask="%xmm7"; |
d7d119a3 AP |
965 | |
# aesni_ccm64_encrypt_blocks: symbol, type and alignment directives plus the
# entry label.
966 | $code.=<<___; |
967 | .globl aesni_ccm64_encrypt_blocks |
968 | .type aesni_ccm64_encrypt_blocks,\@function,6 |
969 | .align 16 |
970 | aesni_ccm64_encrypt_blocks: |
971 | ___ |
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there.
972 | $code.=<<___ if ($win64); |
973 | lea -0x58(%rsp),%rsp |
23f6eec7 AP |
974 | movaps %xmm6,(%rsp) # $iv |
975 | movaps %xmm7,0x10(%rsp) # $bswap_mask |
976 | movaps %xmm8,0x20(%rsp) # $in0 |
977 | movaps %xmm9,0x30(%rsp) # $increment |
d7d119a3 AP |
978 | .Lccm64_enc_body: |
979 | ___ |
# Main body.  Each pass of .Lccm64_enc_outer pushes two blocks through the
# cipher side by side: the counter block in $inout0 and the CMAC accumulator
# (xor-ed with the input block) in $inout1.  The encrypted counter is then
# xor-ed into the input block, which is written to the output.
980 | $code.=<<___; |
267b481c | 981 | mov 240($key),$rounds # key->rounds |
d7d119a3 | 982 | movdqu ($ivp),$iv |
d7d119a3 AP |
983 | movdqa .Lincrement64(%rip),$increment |
984 | movdqa .Lbswap_mask(%rip),$bswap_mask |
d7d119a3 | 985 | |
# set up "twisted" rounds: $key is moved to the end of the key schedule and
# %r10 keeps a negative byte offset that the inner loop counts up to zero
# (assumes %rax/%r10 are the 64-bit views of $rounds/$rnds_ -- TODO confirm)
d8ba0dc9 AP |
986 | shl \$4,$rounds |
987 | mov \$16,$rnds_ |
267b481c AP |
988 | lea 0($key),$key_ |
989 | movdqu ($cmac),$inout1 |
d7d119a3 | 990 | movdqa $iv,$inout0 |
d8ba0dc9 | 991 | lea 32($key,$rounds),$key # end of key schedule |
9ee5916d | 992 | pshufb $bswap_mask,$iv |
d8ba0dc9 | 993 | sub %rax,%r10 # twisted $rounds |
267b481c AP |
994 | jmp .Lccm64_enc_outer |
995 | .align 16 |
d7d119a3 | 996 | .Lccm64_enc_outer: |
267b481c | 997 | $movkey ($key_),$rndkey0 |
d8ba0dc9 | 998 | mov %r10,%rax |
267b481c | 999 | movups ($inp),$in0 # load inp |
d7d119a3 | 1000 | |
267b481c AP |
1001 | xorps $rndkey0,$inout0 # counter |
1002 | $movkey 16($key_),$rndkey1 |
1003 | xorps $in0,$rndkey0 |
267b481c | 1004 | xorps $rndkey0,$inout1 # cmac^=inp |
d8ba0dc9 | 1005 | $movkey 32($key_),$rndkey0 |
f8501464 AP |
1006 | |
# inner loop: two rounds per pass on both blocks; the add leaves ZF set once
# the offset in %rax reaches zero and the jnz then falls through
1007 | .Lccm64_enc2_loop: |
1008 | aesenc $rndkey1,$inout0 |
f8501464 | 1009 | aesenc $rndkey1,$inout1 |
d8ba0dc9 AP |
1010 | $movkey ($key,%rax),$rndkey1 |
1011 | add \$32,%rax |
f8501464 | 1012 | aesenc $rndkey0,$inout0 |
f8501464 | 1013 | aesenc $rndkey0,$inout1 |
d8ba0dc9 | 1014 | $movkey -16($key,%rax),$rndkey0 |
f8501464 AP |
1015 | jnz .Lccm64_enc2_loop |
# last two rounds, interleaved with the counter bump and the block countdown;
# the flags written by dec are consumed by the jnz that closes the outer loop,
# since none of the instructions in between write them
1016 | aesenc $rndkey1,$inout0 |
1017 | aesenc $rndkey1,$inout1 |
267b481c | 1018 | paddq $increment,$iv |
23f6eec7 | 1019 | dec $len # $len-- ($len is in blocks) |
f8501464 AP |
1020 | aesenclast $rndkey0,$inout0 |
1021 | aesenclast $rndkey0,$inout1 |
d7d119a3 | 1022 | |
d7d119a3 | 1023 | lea 16($inp),$inp |
f8501464 | 1024 | xorps $inout0,$in0 # inp ^= E(iv) |
d7d119a3 | 1025 | movdqa $iv,$inout0 |
f8501464 | 1026 | movups $in0,($out) # save output |
9ee5916d | 1027 | pshufb $bswap_mask,$inout0 |
23f6eec7 AP |
1028 | lea 16($out),$out # $out+=16 |
1029 | jnz .Lccm64_enc_outer # loop if ($len!=0) |
d7d119a3 | 1030 | |
23f6eec7 AP |
1031 | pxor $rndkey0,$rndkey0 # clear register bank |
1032 | pxor $rndkey1,$rndkey1 |
1033 | pxor $inout0,$inout0 |
1034 | movups $inout1,($cmac) # store resulting mac |
1035 | pxor $inout1,$inout1 |
1036 | pxor $in0,$in0 |
1037 | pxor $iv,$iv |
d7d119a3 AP |
1038 | ___ |
# Win64 only: reload %xmm6-%xmm9 and overwrite each save slot with %xmm0
# (presumably $rndkey0, zeroed just above -- confirm), then drop the frame.
1039 | $code.=<<___ if ($win64); |
1040 | movaps (%rsp),%xmm6 |
23f6eec7 | 1041 | movaps %xmm0,(%rsp) # clear stack |
d7d119a3 | 1042 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 1043 | movaps %xmm0,0x10(%rsp) |
d7d119a3 | 1044 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 1045 | movaps %xmm0,0x20(%rsp) |
d7d119a3 | 1046 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 1047 | movaps %xmm0,0x30(%rsp) |
d7d119a3 AP |
1048 | lea 0x58(%rsp),%rsp |
1049 | .Lccm64_enc_ret: |
1050 | ___ |
# Common return and symbol size directive.
1051 | $code.=<<___; |
1052 | ret |
1053 | .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks |
1054 | ___ |
1055 | ###################################################################### | |
# aesni_ccm64_decrypt_blocks: symbol, type and alignment directives plus the
# entry label.
1056 | $code.=<<___; |
1057 | .globl aesni_ccm64_decrypt_blocks |
1058 | .type aesni_ccm64_decrypt_blocks,\@function,6 |
1059 | .align 16 |
1060 | aesni_ccm64_decrypt_blocks: |
1061 | ___ |
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there.
1062 | $code.=<<___ if ($win64); |
1063 | lea -0x58(%rsp),%rsp |
23f6eec7 AP |
1064 | movaps %xmm6,(%rsp) # $iv |
1065 | movaps %xmm7,0x10(%rsp) # $bswap_mask |
1066 | movaps %xmm8,0x20(%rsp) # $in0 |
1067 | movaps %xmm9,0x30(%rsp) # $increment |
d7d119a3 AP |
1068 | .Lccm64_dec_body: |
1069 | ___ |
# Load the round count, counter block and running CMAC, and keep copies of
# $rounds and $key in $rnds_ and $key_ ahead of the aesni_generate1 call
# (presumably because that code advances them -- confirm against its definition).
1070 | $code.=<<___; |
267b481c AP |
1071 | mov 240($key),$rounds # key->rounds |
1072 | movups ($ivp),$iv |
d7d119a3 AP |
1073 | movdqu ($cmac),$inout1 |
1074 | movdqa .Lincrement64(%rip),$increment |
1075 | movdqa .Lbswap_mask(%rip),$bswap_mask |
1076 | |
267b481c | 1077 | movaps $iv,$inout0 |
d7d119a3 AP |
1078 | mov $rounds,$rnds_ |
1079 | mov $key,$key_ |
267b481c | 1080 | pshufb $bswap_mask,$iv |
d7d119a3 AP |
1081 | ___ |
# Encrypt the first counter block on its own (aesni_generate1 is defined
# elsewhere in this file; assumed to operate on $inout0 by default -- TODO confirm).
1082 | &aesni_generate1("enc",$key,$rounds); |
# Main loop.  .Lccm64_dec_outer xors the previously encrypted counter into the
# input block and stores it; unless that was the last block, the next counter
# ($inout0) and the CMAC accumulator xor-ed with the block just produced
# ($inout1) are then encrypted together, and the next input block is loaded
# before the final rounds complete.
1083 | $code.=<<___; |
# set up "twisted" rounds: $key is pointed at the end of the key schedule and
# %r10 keeps the negative byte offset the inner loop counts up to zero
# (assumes %rax/%r10 are the 64-bit views of $rounds/$rnds_ -- TODO confirm)
d8ba0dc9 AP |
1084 | shl \$4,$rnds_ |
1085 | mov \$16,$rounds |
f8501464 | 1086 | movups ($inp),$in0 # load inp |
267b481c | 1087 | paddq $increment,$iv |
23f6eec7 | 1088 | lea 16($inp),$inp # $inp+=16 |
d8ba0dc9 AP |
1089 | sub %r10,%rax # twisted $rounds |
1090 | lea 32($key_,$rnds_),$key # end of key schedule |
1091 | mov %rax,%r10 |
267b481c AP |
1092 | jmp .Lccm64_dec_outer |
1093 | .align 16 |
1094 | .Lccm64_dec_outer: |
1095 | xorps $inout0,$in0 # inp ^= E(iv) |
1096 | movdqa $iv,$inout0 |
267b481c | 1097 | movups $in0,($out) # save output |
23f6eec7 | 1098 | lea 16($out),$out # $out+=16 |
9ee5916d | 1099 | pshufb $bswap_mask,$inout0 |
d7d119a3 | 1100 | |
23f6eec7 AP |
1101 | sub \$1,$len # $len-- ($len is in blocks) |
1102 | jz .Lccm64_dec_break # if ($len==0) break |
d7d119a3 | 1103 | |
267b481c | 1104 | $movkey ($key_),$rndkey0 |
d8ba0dc9 | 1105 | mov %r10,%rax |
267b481c | 1106 | $movkey 16($key_),$rndkey1 |
f8501464 | 1107 | xorps $rndkey0,$in0 |
f8501464 AP |
1108 | xorps $rndkey0,$inout0 |
1109 | xorps $in0,$inout1 # cmac^=out |
d8ba0dc9 AP |
1110 | $movkey 32($key_),$rndkey0 |
1111 | jmp .Lccm64_dec2_loop |
1112 | .align 16 |
# inner loop: two rounds per pass on both blocks; the add leaves ZF set once
# the offset in %rax reaches zero and the jnz then falls through
f8501464 AP |
1113 | .Lccm64_dec2_loop: |
1114 | aesenc $rndkey1,$inout0 |
f8501464 | 1115 | aesenc $rndkey1,$inout1 |
d8ba0dc9 AP |
1116 | $movkey ($key,%rax),$rndkey1 |
1117 | add \$32,%rax |
f8501464 | 1118 | aesenc $rndkey0,$inout0 |
f8501464 | 1119 | aesenc $rndkey0,$inout1 |
d8ba0dc9 | 1120 | $movkey -16($key,%rax),$rndkey0 |
f8501464 | 1121 | jnz .Lccm64_dec2_loop |
23f6eec7 | 1122 | movups ($inp),$in0 # load input |
267b481c | 1123 | paddq $increment,$iv |
f8501464 AP |
1124 | aesenc $rndkey1,$inout0 |
1125 | aesenc $rndkey1,$inout1 |
1126 | aesenclast $rndkey0,$inout0 |
267b481c | 1127 | aesenclast $rndkey0,$inout1 |
23f6eec7 | 1128 | lea 16($inp),$inp # $inp+=16 |
d7d119a3 AP |
1129 | jmp .Lccm64_dec_outer |
1130 | |
1131 | .align 16 |
1132 | .Lccm64_dec_break: |
267b481c | 1133 | #xorps $in0,$inout1 # cmac^=out |
d8ba0dc9 | 1134 | mov 240($key_),$rounds |
d7d119a3 | 1135 | ___ |
# Final CMAC block: the explicit xor is commented out above and $in0 is
# passed to aesni_generate1 instead (presumably folded into the first-round
# xor there -- confirm against aesni_generate1).
267b481c | 1136 | &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); |
d7d119a3 | 1137 | $code.=<<___; |
23f6eec7 AP |
1138 | pxor $rndkey0,$rndkey0 # clear register bank |
1139 | pxor $rndkey1,$rndkey1 |
1140 | pxor $inout0,$inout0 |
1141 | movups $inout1,($cmac) # store resulting mac |
1142 | pxor $inout1,$inout1 |
1143 | pxor $in0,$in0 |
1144 | pxor $iv,$iv |
d7d119a3 AP |
1145 | ___ |
# Win64 only: reload %xmm6-%xmm9 and overwrite each save slot with %xmm0
# (presumably $rndkey0, zeroed just above -- confirm), then drop the frame.
1146 | $code.=<<___ if ($win64); |
1147 | movaps (%rsp),%xmm6 |
23f6eec7 | 1148 | movaps %xmm0,(%rsp) # clear stack |
d7d119a3 | 1149 | movaps 0x10(%rsp),%xmm7 |
23f6eec7 | 1150 | movaps %xmm0,0x10(%rsp) |
d7d119a3 | 1151 | movaps 0x20(%rsp),%xmm8 |
23f6eec7 | 1152 | movaps %xmm0,0x20(%rsp) |
d7d119a3 | 1153 | movaps 0x30(%rsp),%xmm9 |
23f6eec7 | 1154 | movaps %xmm0,0x30(%rsp) |
d7d119a3 AP |
1155 | lea 0x58(%rsp),%rsp |
1156 | .Lccm64_dec_ret: |
1157 | ___ |
# Common return and symbol size directive.
1158 | $code.=<<___; |
1159 | ret |
f8501464 AP |
1160 | .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks |
1161 | ___ |
1162 | }\f | |
1163 | ###################################################################### | |
1164 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | |
1165 | # size_t blocks, const AES_KEY *key, | |
1166 | # const char *ivec); | |
1167 | # | |
1168 | # Handles only complete blocks, operates on 32-bit counter and | |
6c79faaa | 1169 | # does not update *ivec! (see crypto/modes/ctr128.c for details) |
f8501464 | 1170 | # |
6c79faaa | 1171 | # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, |
b4a9d5bf | 1172 | # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. |
6c79faaa AP |
1173 | # Keywords are full unroll and modulo-schedule counter calculations |
1174 | # with zero-round key xor. | |
f8501464 | 1175 | { |
6c79faaa | 1176 | my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); |
384e6de4 | 1177 | my ($key0,$ctr)=("%ebp","${ivp}d"); |
6c79faaa | 1178 | my $frame_size = 0x80 + ($win64?160:0); |
f8501464 AP |
1179 | |
1180 | $code.=<<___; | |
1181 | .globl aesni_ctr32_encrypt_blocks | |
1182 | .type aesni_ctr32_encrypt_blocks,\@function,5 | |
1183 | .align 16 | |
1184 | aesni_ctr32_encrypt_blocks: | |
b84460ad | 1185 | .cfi_startproc |
23f6eec7 AP |
1186 | cmp \$1,$len |
1187 | jne .Lctr32_bulk | |
1188 | ||
1189 | # handle single block without allocating stack frame, | |
1190 | # useful when handling edges | |
1191 | movups ($ivp),$inout0 | |
1192 | movups ($inp),$inout1 | |
1193 | mov 240($key),%edx # key->rounds | |
1194 | ___ | |
1195 | &aesni_generate1("enc",$key,"%edx"); | |
1196 | $code.=<<___; | |
1197 | pxor $rndkey0,$rndkey0 # clear register bank | |
1198 | pxor $rndkey1,$rndkey1 | |
1199 | xorps $inout1,$inout0 | |
1200 | pxor $inout1,$inout1 | |
1201 | movups $inout0,($out) | |
1202 | xorps $inout0,$inout0 | |
1203 | jmp .Lctr32_epilogue | |
1204 | ||
1205 | .align 16 | |
1206 | .Lctr32_bulk: | |
384e6de4 | 1207 | lea (%rsp),$key_ # use $key_ as frame pointer |
b84460ad | 1208 | .cfi_def_cfa_register $key_ |
6c79faaa | 1209 | push %rbp |
b84460ad | 1210 | .cfi_push %rbp |
6c79faaa AP |
1211 | sub \$$frame_size,%rsp |
1212 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
f8501464 AP |
1213 | ___ |
1214 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1215 | movaps %xmm6,-0xa8($key_) # offload everything |
1216 | movaps %xmm7,-0x98($key_) | |
1217 | movaps %xmm8,-0x88($key_) | |
1218 | movaps %xmm9,-0x78($key_) | |
1219 | movaps %xmm10,-0x68($key_) | |
1220 | movaps %xmm11,-0x58($key_) | |
1221 | movaps %xmm12,-0x48($key_) | |
1222 | movaps %xmm13,-0x38($key_) | |
1223 | movaps %xmm14,-0x28($key_) | |
1224 | movaps %xmm15,-0x18($key_) | |
f8501464 AP |
1225 | .Lctr32_body: |
1226 | ___ | |
1227 | $code.=<<___; | |
6c79faaa | 1228 | |
23f6eec7 AP |
1229 | # 8 16-byte words on top of stack are counter values |
1230 | # xor-ed with zero-round key | |
f8501464 | 1231 | |
6c79faaa AP |
1232 | movdqu ($ivp),$inout0 |
1233 | movdqu ($key),$rndkey0 | |
1234 | mov 12($ivp),$ctr # counter LSB | |
1235 | pxor $rndkey0,$inout0 | |
1236 | mov 12($key),$key0 # 0-round key LSB | |
1237 | movdqa $inout0,0x00(%rsp) # populate counter block | |
1238 | bswap $ctr | |
b4a9d5bf AP |
1239 | movdqa $inout0,$inout1 |
1240 | movdqa $inout0,$inout2 | |
1241 | movdqa $inout0,$inout3 | |
6c79faaa AP |
1242 | movdqa $inout0,0x40(%rsp) |
1243 | movdqa $inout0,0x50(%rsp) | |
1244 | movdqa $inout0,0x60(%rsp) | |
23f6eec7 | 1245 | mov %rdx,%r10 # about to borrow %rdx |
6c79faaa AP |
1246 | movdqa $inout0,0x70(%rsp) |
1247 | ||
d8ba0dc9 AP |
1248 | lea 1($ctr),%rax |
1249 | lea 2($ctr),%rdx | |
1250 | bswap %eax | |
1251 | bswap %edx | |
1252 | xor $key0,%eax | |
1253 | xor $key0,%edx | |
1254 | pinsrd \$3,%eax,$inout1 | |
1255 | lea 3($ctr),%rax | |
b4a9d5bf | 1256 | movdqa $inout1,0x10(%rsp) |
d8ba0dc9 AP |
1257 | pinsrd \$3,%edx,$inout2 |
1258 | bswap %eax | |
1259 | mov %r10,%rdx # restore %rdx | |
6c79faaa | 1260 | lea 4($ctr),%r10 |
b4a9d5bf | 1261 | movdqa $inout2,0x20(%rsp) |
d8ba0dc9 | 1262 | xor $key0,%eax |
6c79faaa | 1263 | bswap %r10d |
d8ba0dc9 | 1264 | pinsrd \$3,%eax,$inout3 |
6c79faaa | 1265 | xor $key0,%r10d |
b4a9d5bf | 1266 | movdqa $inout3,0x30(%rsp) |
6c79faaa AP |
1267 | lea 5($ctr),%r9 |
1268 | mov %r10d,0x40+12(%rsp) | |
1269 | bswap %r9d | |
1270 | lea 6($ctr),%r10 | |
d8ba0dc9 | 1271 | mov 240($key),$rounds # key->rounds |
6c79faaa AP |
1272 | xor $key0,%r9d |
1273 | bswap %r10d | |
1274 | mov %r9d,0x50+12(%rsp) | |
1275 | xor $key0,%r10d | |
1276 | lea 7($ctr),%r9 | |
1277 | mov %r10d,0x60+12(%rsp) | |
1278 | bswap %r9d | |
609b0852 | 1279 | mov OPENSSL_ia32cap_P+4(%rip),%r10d |
6c79faaa | 1280 | xor $key0,%r9d |
5599c733 | 1281 | and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE |
6c79faaa AP |
1282 | mov %r9d,0x70+12(%rsp) |
1283 | ||
1284 | $movkey 0x10($key),$rndkey1 | |
1285 | ||
6c79faaa AP |
1286 | movdqa 0x40(%rsp),$inout4 |
1287 | movdqa 0x50(%rsp),$inout5 | |
9282c335 | 1288 | |
23f6eec7 AP |
1289 | cmp \$8,$len # $len is in blocks |
1290 | jb .Lctr32_tail # short input if ($len<8) | |
9282c335 | 1291 | |
23f6eec7 | 1292 | sub \$6,$len # $len is biased by -6 |
5599c733 | 1293 | cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE |
23f6eec7 | 1294 | je .Lctr32_6x # [which denotes Atom Silvermont] |
5599c733 | 1295 | |
6c79faaa | 1296 | lea 0x80($key),$key # size optimization |
23f6eec7 | 1297 | sub \$2,$len # $len is biased by -8 |
9282c335 | 1298 | jmp .Lctr32_loop8 |
f8501464 | 1299 | |
5599c733 AP |
1300 | .align 16 |
1301 | .Lctr32_6x: | |
1302 | shl \$4,$rounds | |
1303 | mov \$48,$rnds_ | |
1304 | bswap $key0 | |
1305 | lea 32($key,$rounds),$key # end of key schedule | |
1306 | sub %rax,%r10 # twisted $rounds | |
1307 | jmp .Lctr32_loop6 | |
1308 | ||
1309 | .align 16 | |
1310 | .Lctr32_loop6: | |
23f6eec7 | 1311 | add \$6,$ctr # next counter value |
5599c733 AP |
1312 | $movkey -48($key,$rnds_),$rndkey0 |
1313 | aesenc $rndkey1,$inout0 | |
1314 | mov $ctr,%eax | |
1315 | xor $key0,%eax | |
1316 | aesenc $rndkey1,$inout1 | |
23f6eec7 | 1317 | movbe %eax,`0x00+12`(%rsp) # store next counter value |
5599c733 AP |
1318 | lea 1($ctr),%eax |
1319 | aesenc $rndkey1,$inout2 | |
1320 | xor $key0,%eax | |
1321 | movbe %eax,`0x10+12`(%rsp) | |
1322 | aesenc $rndkey1,$inout3 | |
1323 | lea 2($ctr),%eax | |
1324 | xor $key0,%eax | |
1325 | aesenc $rndkey1,$inout4 | |
1326 | movbe %eax,`0x20+12`(%rsp) | |
1327 | lea 3($ctr),%eax | |
1328 | aesenc $rndkey1,$inout5 | |
1329 | $movkey -32($key,$rnds_),$rndkey1 | |
1330 | xor $key0,%eax | |
1331 | ||
1332 | aesenc $rndkey0,$inout0 | |
1333 | movbe %eax,`0x30+12`(%rsp) | |
1334 | lea 4($ctr),%eax | |
1335 | aesenc $rndkey0,$inout1 | |
1336 | xor $key0,%eax | |
1337 | movbe %eax,`0x40+12`(%rsp) | |
1338 | aesenc $rndkey0,$inout2 | |
1339 | lea 5($ctr),%eax | |
1340 | xor $key0,%eax | |
1341 | aesenc $rndkey0,$inout3 | |
1342 | movbe %eax,`0x50+12`(%rsp) | |
1343 | mov %r10,%rax # mov $rnds_,$rounds | |
1344 | aesenc $rndkey0,$inout4 | |
1345 | aesenc $rndkey0,$inout5 | |
1346 | $movkey -16($key,$rnds_),$rndkey0 | |
1347 | ||
1348 | call .Lenc_loop6 | |
1349 | ||
23f6eec7 | 1350 | movdqu ($inp),$inout6 # load 6 input blocks |
5599c733 AP |
1351 | movdqu 0x10($inp),$inout7 |
1352 | movdqu 0x20($inp),$in0 | |
1353 | movdqu 0x30($inp),$in1 | |
1354 | movdqu 0x40($inp),$in2 | |
1355 | movdqu 0x50($inp),$in3 | |
23f6eec7 | 1356 | lea 0x60($inp),$inp # $inp+=6*16 |
5599c733 | 1357 | $movkey -64($key,$rnds_),$rndkey1 |
23f6eec7 AP |
1358 | pxor $inout0,$inout6 # inp^=E(ctr) |
1359 | movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] | |
5599c733 AP |
1360 | pxor $inout1,$inout7 |
1361 | movaps 0x10(%rsp),$inout1 | |
1362 | pxor $inout2,$in0 | |
1363 | movaps 0x20(%rsp),$inout2 | |
1364 | pxor $inout3,$in1 | |
1365 | movaps 0x30(%rsp),$inout3 | |
1366 | pxor $inout4,$in2 | |
1367 | movaps 0x40(%rsp),$inout4 | |
1368 | pxor $inout5,$in3 | |
1369 | movaps 0x50(%rsp),$inout5 | |
23f6eec7 | 1370 | movdqu $inout6,($out) # store 6 output blocks |
5599c733 AP |
1371 | movdqu $inout7,0x10($out) |
1372 | movdqu $in0,0x20($out) | |
1373 | movdqu $in1,0x30($out) | |
1374 | movdqu $in2,0x40($out) | |
1375 | movdqu $in3,0x50($out) | |
23f6eec7 AP |
1376 | lea 0x60($out),$out # $out+=6*16 |
1377 | ||
5599c733 | 1378 | sub \$6,$len |
23f6eec7 | 1379 | jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow |
5599c733 | 1380 | |
23f6eec7 AP |
1381 | add \$6,$len # restore real remaining $len |
1382 | jz .Lctr32_done # done if ($len==0) | |
5599c733 AP |
1383 | |
1384 | lea -48($rnds_),$rounds | |
1385 | lea -80($key,$rnds_),$key # restore $key | |
1386 | neg $rounds | |
1387 | shr \$4,$rounds # restore $rounds | |
1388 | jmp .Lctr32_tail | |
1389 | ||
6c79faaa | 1390 | .align 32 |
9282c335 | 1391 | .Lctr32_loop8: |
23f6eec7 | 1392 | add \$8,$ctr # next counter value |
6c79faaa AP |
1393 | movdqa 0x60(%rsp),$inout6 |
1394 | aesenc $rndkey1,$inout0 | |
1395 | mov $ctr,%r9d | |
1396 | movdqa 0x70(%rsp),$inout7 | |
1397 | aesenc $rndkey1,$inout1 | |
1398 | bswap %r9d | |
1399 | $movkey 0x20-0x80($key),$rndkey0 | |
1400 | aesenc $rndkey1,$inout2 | |
1401 | xor $key0,%r9d | |
d8ba0dc9 | 1402 | nop |
6c79faaa | 1403 | aesenc $rndkey1,$inout3 |
23f6eec7 | 1404 | mov %r9d,0x00+12(%rsp) # store next counter value |
6c79faaa AP |
1405 | lea 1($ctr),%r9 |
1406 | aesenc $rndkey1,$inout4 | |
1407 | aesenc $rndkey1,$inout5 | |
1408 | aesenc $rndkey1,$inout6 | |
1409 | aesenc $rndkey1,$inout7 | |
1410 | $movkey 0x30-0x80($key),$rndkey1 | |
1411 | ___ | |
1412 | for($i=2;$i<8;$i++) { | |
1413 | my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; | |
1414 | $code.=<<___; | |
d8ba0dc9 | 1415 | bswap %r9d |
6c79faaa AP |
1416 | aesenc $rndkeyx,$inout0 |
1417 | aesenc $rndkeyx,$inout1 | |
6c79faaa | 1418 | xor $key0,%r9d |
d8ba0dc9 AP |
1419 | .byte 0x66,0x90 |
1420 | aesenc $rndkeyx,$inout2 | |
6c79faaa AP |
1421 | aesenc $rndkeyx,$inout3 |
1422 | mov %r9d,`0x10*($i-1)`+12(%rsp) | |
1423 | lea $i($ctr),%r9 | |
1424 | aesenc $rndkeyx,$inout4 | |
1425 | aesenc $rndkeyx,$inout5 | |
1426 | aesenc $rndkeyx,$inout6 | |
1427 | aesenc $rndkeyx,$inout7 | |
1428 | $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx | |
1429 | ___ | |
1430 | } | |
1431 | $code.=<<___; | |
d8ba0dc9 | 1432 | bswap %r9d |
6c79faaa AP |
1433 | aesenc $rndkey0,$inout0 |
1434 | aesenc $rndkey0,$inout1 | |
6c79faaa AP |
1435 | aesenc $rndkey0,$inout2 |
1436 | xor $key0,%r9d | |
23f6eec7 | 1437 | movdqu 0x00($inp),$in0 # start loading input |
6c79faaa AP |
1438 | aesenc $rndkey0,$inout3 |
1439 | mov %r9d,0x70+12(%rsp) | |
d8ba0dc9 | 1440 | cmp \$11,$rounds |
6c79faaa AP |
1441 | aesenc $rndkey0,$inout4 |
1442 | aesenc $rndkey0,$inout5 | |
1443 | aesenc $rndkey0,$inout6 | |
6c79faaa AP |
1444 | aesenc $rndkey0,$inout7 |
1445 | $movkey 0xa0-0x80($key),$rndkey0 | |
1446 | ||
6c79faaa AP |
1447 | jb .Lctr32_enc_done |
1448 | ||
1449 | aesenc $rndkey1,$inout0 | |
1450 | aesenc $rndkey1,$inout1 | |
1451 | aesenc $rndkey1,$inout2 | |
1452 | aesenc $rndkey1,$inout3 | |
1453 | aesenc $rndkey1,$inout4 | |
1454 | aesenc $rndkey1,$inout5 | |
1455 | aesenc $rndkey1,$inout6 | |
1456 | aesenc $rndkey1,$inout7 | |
1457 | $movkey 0xb0-0x80($key),$rndkey1 | |
1bc4d009 AP |
1458 | |
1459 | aesenc $rndkey0,$inout0 | |
1460 | aesenc $rndkey0,$inout1 | |
1bc4d009 | 1461 | aesenc $rndkey0,$inout2 |
1bc4d009 | 1462 | aesenc $rndkey0,$inout3 |
1bc4d009 | 1463 | aesenc $rndkey0,$inout4 |
1bc4d009 | 1464 | aesenc $rndkey0,$inout5 |
1bc4d009 | 1465 | aesenc $rndkey0,$inout6 |
1bc4d009 | 1466 | aesenc $rndkey0,$inout7 |
6c79faaa AP |
1467 | $movkey 0xc0-0x80($key),$rndkey0 |
1468 | je .Lctr32_enc_done | |
9282c335 | 1469 | |
1bc4d009 AP |
1470 | aesenc $rndkey1,$inout0 |
1471 | aesenc $rndkey1,$inout1 | |
1bc4d009 AP |
1472 | aesenc $rndkey1,$inout2 |
1473 | aesenc $rndkey1,$inout3 | |
1474 | aesenc $rndkey1,$inout4 | |
1475 | aesenc $rndkey1,$inout5 | |
1476 | aesenc $rndkey1,$inout6 | |
1477 | aesenc $rndkey1,$inout7 | |
6c79faaa | 1478 | $movkey 0xd0-0x80($key),$rndkey1 |
9282c335 | 1479 | |
1bc4d009 AP |
1480 | aesenc $rndkey0,$inout0 |
1481 | aesenc $rndkey0,$inout1 | |
1bc4d009 AP |
1482 | aesenc $rndkey0,$inout2 |
1483 | aesenc $rndkey0,$inout3 | |
1484 | aesenc $rndkey0,$inout4 | |
1485 | aesenc $rndkey0,$inout5 | |
1486 | aesenc $rndkey0,$inout6 | |
1487 | aesenc $rndkey0,$inout7 | |
6c79faaa | 1488 | $movkey 0xe0-0x80($key),$rndkey0 |
d8ba0dc9 | 1489 | jmp .Lctr32_enc_done |
1bc4d009 | 1490 | |
d8ba0dc9 | 1491 | .align 16 |
6c79faaa | 1492 | .Lctr32_enc_done: |
6c79faaa | 1493 | movdqu 0x10($inp),$in1 |
23f6eec7 | 1494 | pxor $rndkey0,$in0 # input^=round[last] |
6c79faaa | 1495 | movdqu 0x20($inp),$in2 |
1bc4d009 | 1496 | pxor $rndkey0,$in1 |
6c79faaa | 1497 | movdqu 0x30($inp),$in3 |
1bc4d009 | 1498 | pxor $rndkey0,$in2 |
6c79faaa | 1499 | movdqu 0x40($inp),$in4 |
1bc4d009 | 1500 | pxor $rndkey0,$in3 |
6c79faaa AP |
1501 | movdqu 0x50($inp),$in5 |
1502 | pxor $rndkey0,$in4 | |
6c79faaa | 1503 | pxor $rndkey0,$in5 |
d8ba0dc9 | 1504 | aesenc $rndkey1,$inout0 |
cd54249c AP |
1505 | aesenc $rndkey1,$inout1 |
1506 | aesenc $rndkey1,$inout2 | |
1507 | aesenc $rndkey1,$inout3 | |
1508 | aesenc $rndkey1,$inout4 | |
1509 | aesenc $rndkey1,$inout5 | |
1bc4d009 AP |
1510 | aesenc $rndkey1,$inout6 |
1511 | aesenc $rndkey1,$inout7 | |
23f6eec7 AP |
1512 | movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] |
1513 | lea 0x80($inp),$inp # $inp+=8*16 | |
6c79faaa | 1514 | |
23f6eec7 AP |
1515 | aesenclast $in0,$inout0 # $inN is inp[N]^round[last] |
1516 | pxor $rndkey0,$rndkey1 # borrowed $rndkey | |
d8ba0dc9 | 1517 | movdqu 0x70-0x80($inp),$in0 |
1bc4d009 | 1518 | aesenclast $in1,$inout1 |
1bc4d009 | 1519 | pxor $rndkey0,$in0 |
6c79faaa | 1520 | movdqa 0x00(%rsp),$in1 # load next counter block |
1bc4d009 | 1521 | aesenclast $in2,$inout2 |
1bc4d009 | 1522 | aesenclast $in3,$inout3 |
d8ba0dc9 | 1523 | movdqa 0x10(%rsp),$in2 |
6c79faaa AP |
1524 | movdqa 0x20(%rsp),$in3 |
1525 | aesenclast $in4,$inout4 | |
6c79faaa | 1526 | aesenclast $in5,$inout5 |
d8ba0dc9 | 1527 | movdqa 0x30(%rsp),$in4 |
6c79faaa AP |
1528 | movdqa 0x40(%rsp),$in5 |
1529 | aesenclast $rndkey1,$inout6 | |
1530 | movdqa 0x50(%rsp),$rndkey0 | |
23f6eec7 | 1531 | $movkey 0x10-0x80($key),$rndkey1#real 1st-round key |
d8ba0dc9 | 1532 | aesenclast $in0,$inout7 |
1bc4d009 | 1533 | |
23f6eec7 | 1534 | movups $inout0,($out) # store 8 output blocks |
6c79faaa | 1535 | movdqa $in1,$inout0 |
9282c335 | 1536 | movups $inout1,0x10($out) |
6c79faaa | 1537 | movdqa $in2,$inout1 |
9282c335 | 1538 | movups $inout2,0x20($out) |
6c79faaa | 1539 | movdqa $in3,$inout2 |
9282c335 | 1540 | movups $inout3,0x30($out) |
6c79faaa | 1541 | movdqa $in4,$inout3 |
9282c335 | 1542 | movups $inout4,0x40($out) |
6c79faaa | 1543 | movdqa $in5,$inout4 |
9282c335 | 1544 | movups $inout5,0x50($out) |
1bc4d009 | 1545 | movdqa $rndkey0,$inout5 |
9282c335 AP |
1546 | movups $inout6,0x60($out) |
1547 | movups $inout7,0x70($out) | |
23f6eec7 AP |
1548 | lea 0x80($out),$out # $out+=8*16 |
1549 | ||
9282c335 | 1550 | sub \$8,$len |
23f6eec7 | 1551 | jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow |
f8501464 | 1552 | |
46f4e1be | 1553 | add \$8,$len # restore real remaining $len |
23f6eec7 | 1554 | jz .Lctr32_done # done if ($len==0) |
6c79faaa | 1555 | lea -0x80($key),$key |
f8501464 AP |
1556 | |
1557 | .Lctr32_tail: | |
23f6eec7 | 1558 | # note that at this point $inout0..5 are populated with |
609b0852 | 1559 | # counter values xor-ed with 0-round key |
6c79faaa | 1560 | lea 16($key),$key |
f8501464 | 1561 | cmp \$4,$len |
b4a9d5bf AP |
1562 | jb .Lctr32_loop3 |
1563 | je .Lctr32_loop4 | |
f8501464 | 1564 | |
23f6eec7 | 1565 | # if ($len>4) compute 7 E(counter) |
d8ba0dc9 | 1566 | shl \$4,$rounds |
6c79faaa | 1567 | movdqa 0x60(%rsp),$inout6 |
b4a9d5bf | 1568 | pxor $inout7,$inout7 |
f8501464 | 1569 | |
6c79faaa AP |
1570 | $movkey 16($key),$rndkey0 |
1571 | aesenc $rndkey1,$inout0 | |
6c79faaa | 1572 | aesenc $rndkey1,$inout1 |
23f6eec7 | 1573 | lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter |
d8ba0dc9 | 1574 | neg %rax |
6c79faaa | 1575 | aesenc $rndkey1,$inout2 |
23f6eec7 | 1576 | add \$16,%rax # prepare for .Lenc_loop8_enter |
b4a9d5bf | 1577 | movups ($inp),$in0 |
d8ba0dc9 | 1578 | aesenc $rndkey1,$inout3 |
6c79faaa | 1579 | aesenc $rndkey1,$inout4 |
23f6eec7 | 1580 | movups 0x10($inp),$in1 # pre-load input |
b4a9d5bf | 1581 | movups 0x20($inp),$in2 |
d8ba0dc9 | 1582 | aesenc $rndkey1,$inout5 |
6c79faaa | 1583 | aesenc $rndkey1,$inout6 |
f8501464 | 1584 | |
6c79faaa | 1585 | call .Lenc_loop8_enter |
f8501464 | 1586 | |
73325b22 AP |
1587 | movdqu 0x30($inp),$in3 |
1588 | pxor $in0,$inout0 | |
1589 | movdqu 0x40($inp),$in0 | |
1590 | pxor $in1,$inout1 | |
23f6eec7 | 1591 | movdqu $inout0,($out) # store output |
73325b22 AP |
1592 | pxor $in2,$inout2 |
1593 | movdqu $inout1,0x10($out) | |
1594 | pxor $in3,$inout3 | |
1595 | movdqu $inout2,0x20($out) | |
1596 | pxor $in0,$inout4 | |
1597 | movdqu $inout3,0x30($out) | |
1598 | movdqu $inout4,0x40($out) | |
6c79faaa | 1599 | cmp \$6,$len |
23f6eec7 | 1600 | jb .Lctr32_done # $len was 5, stop store |
9282c335 | 1601 | |
6c79faaa AP |
1602 | movups 0x50($inp),$in1 |
1603 | xorps $in1,$inout5 | |
1604 | movups $inout5,0x50($out) | |
23f6eec7 | 1605 | je .Lctr32_done # $len was 6, stop store |
9282c335 | 1606 | |
6c79faaa AP |
1607 | movups 0x60($inp),$in2 |
1608 | xorps $in2,$inout6 | |
1609 | movups $inout6,0x60($out) | |
23f6eec7 | 1610 | jmp .Lctr32_done # $len was 7, stop store |
f8501464 | 1611 | |
6c79faaa AP |
1612 | .align 32 |
1613 | .Lctr32_loop4: | |
1614 | aesenc $rndkey1,$inout0 | |
1615 | lea 16($key),$key | |
d8ba0dc9 | 1616 | dec $rounds |
6c79faaa AP |
1617 | aesenc $rndkey1,$inout1 |
1618 | aesenc $rndkey1,$inout2 | |
1619 | aesenc $rndkey1,$inout3 | |
1620 | $movkey ($key),$rndkey1 | |
6c79faaa AP |
1621 | jnz .Lctr32_loop4 |
1622 | aesenclast $rndkey1,$inout0 | |
1623 | aesenclast $rndkey1,$inout1 | |
23f6eec7 | 1624 | movups ($inp),$in0 # load input |
b4a9d5bf | 1625 | movups 0x10($inp),$in1 |
6c79faaa AP |
1626 | aesenclast $rndkey1,$inout2 |
1627 | aesenclast $rndkey1,$inout3 | |
d8ba0dc9 | 1628 | movups 0x20($inp),$in2 |
b4a9d5bf AP |
1629 | movups 0x30($inp),$in3 |
1630 | ||
1631 | xorps $in0,$inout0 | |
23f6eec7 | 1632 | movups $inout0,($out) # store output |
b4a9d5bf AP |
1633 | xorps $in1,$inout1 |
1634 | movups $inout1,0x10($out) | |
73325b22 AP |
1635 | pxor $in2,$inout2 |
1636 | movdqu $inout2,0x20($out) | |
1637 | pxor $in3,$inout3 | |
1638 | movdqu $inout3,0x30($out) | |
23f6eec7 | 1639 | jmp .Lctr32_done # $len was 4, stop store |
b4a9d5bf AP |
1640 | |
1641 | .align 32 | |
1642 | .Lctr32_loop3: | |
1643 | aesenc $rndkey1,$inout0 | |
1644 | lea 16($key),$key | |
d8ba0dc9 | 1645 | dec $rounds |
b4a9d5bf AP |
1646 | aesenc $rndkey1,$inout1 |
1647 | aesenc $rndkey1,$inout2 | |
1648 | $movkey ($key),$rndkey1 | |
b4a9d5bf AP |
1649 | jnz .Lctr32_loop3 |
1650 | aesenclast $rndkey1,$inout0 | |
1651 | aesenclast $rndkey1,$inout1 | |
1652 | aesenclast $rndkey1,$inout2 | |
6c79faaa | 1653 | |
23f6eec7 | 1654 | movups ($inp),$in0 # load input |
9282c335 | 1655 | xorps $in0,$inout0 |
23f6eec7 | 1656 | movups $inout0,($out) # store output |
6c79faaa | 1657 | cmp \$2,$len |
23f6eec7 | 1658 | jb .Lctr32_done # $len was 1, stop store |
f8501464 | 1659 | |
6c79faaa | 1660 | movups 0x10($inp),$in1 |
9282c335 | 1661 | xorps $in1,$inout1 |
9282c335 | 1662 | movups $inout1,0x10($out) |
23f6eec7 | 1663 | je .Lctr32_done # $len was 2, stop store |
f8501464 | 1664 | |
6c79faaa | 1665 | movups 0x20($inp),$in2 |
9282c335 | 1666 | xorps $in2,$inout2 |
23f6eec7 | 1667 | movups $inout2,0x20($out) # $len was 3, stop store |
9282c335 | 1668 | |
f8501464 | 1669 | .Lctr32_done: |
46f4e1be | 1670 | xorps %xmm0,%xmm0 # clear register bank |
23f6eec7 AP |
1671 | xor $key0,$key0 |
1672 | pxor %xmm1,%xmm1 | |
1673 | pxor %xmm2,%xmm2 | |
1674 | pxor %xmm3,%xmm3 | |
1675 | pxor %xmm4,%xmm4 | |
1676 | pxor %xmm5,%xmm5 | |
1677 | ___ | |
1678 | $code.=<<___ if (!$win64); | |
1679 | pxor %xmm6,%xmm6 | |
1680 | pxor %xmm7,%xmm7 | |
1681 | movaps %xmm0,0x00(%rsp) # clear stack | |
1682 | pxor %xmm8,%xmm8 | |
1683 | movaps %xmm0,0x10(%rsp) | |
1684 | pxor %xmm9,%xmm9 | |
1685 | movaps %xmm0,0x20(%rsp) | |
1686 | pxor %xmm10,%xmm10 | |
1687 | movaps %xmm0,0x30(%rsp) | |
1688 | pxor %xmm11,%xmm11 | |
1689 | movaps %xmm0,0x40(%rsp) | |
1690 | pxor %xmm12,%xmm12 | |
1691 | movaps %xmm0,0x50(%rsp) | |
1692 | pxor %xmm13,%xmm13 | |
1693 | movaps %xmm0,0x60(%rsp) | |
1694 | pxor %xmm14,%xmm14 | |
1695 | movaps %xmm0,0x70(%rsp) | |
1696 | pxor %xmm15,%xmm15 | |
f8501464 AP |
1697 | ___ |
1698 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1699 | movaps -0xa8($key_),%xmm6 |
1700 | movaps %xmm0,-0xa8($key_) # clear stack | |
1701 | movaps -0x98($key_),%xmm7 | |
1702 | movaps %xmm0,-0x98($key_) | |
1703 | movaps -0x88($key_),%xmm8 | |
1704 | movaps %xmm0,-0x88($key_) | |
1705 | movaps -0x78($key_),%xmm9 | |
1706 | movaps %xmm0,-0x78($key_) | |
1707 | movaps -0x68($key_),%xmm10 | |
1708 | movaps %xmm0,-0x68($key_) | |
1709 | movaps -0x58($key_),%xmm11 | |
1710 | movaps %xmm0,-0x58($key_) | |
1711 | movaps -0x48($key_),%xmm12 | |
1712 | movaps %xmm0,-0x48($key_) | |
1713 | movaps -0x38($key_),%xmm13 | |
1714 | movaps %xmm0,-0x38($key_) | |
1715 | movaps -0x28($key_),%xmm14 | |
1716 | movaps %xmm0,-0x28($key_) | |
1717 | movaps -0x18($key_),%xmm15 | |
1718 | movaps %xmm0,-0x18($key_) | |
23f6eec7 AP |
1719 | movaps %xmm0,0x00(%rsp) |
1720 | movaps %xmm0,0x10(%rsp) | |
1721 | movaps %xmm0,0x20(%rsp) | |
1722 | movaps %xmm0,0x30(%rsp) | |
1723 | movaps %xmm0,0x40(%rsp) | |
1724 | movaps %xmm0,0x50(%rsp) | |
1725 | movaps %xmm0,0x60(%rsp) | |
1726 | movaps %xmm0,0x70(%rsp) | |
f8501464 AP |
1727 | ___ |
1728 | $code.=<<___; | |
384e6de4 | 1729 | mov -8($key_),%rbp |
b84460ad | 1730 | .cfi_restore %rbp |
384e6de4 | 1731 | lea ($key_),%rsp |
b84460ad | 1732 | .cfi_def_cfa_register %rsp |
6c79faaa | 1733 | .Lctr32_epilogue: |
f8501464 | 1734 | ret |
b84460ad | 1735 | .cfi_endproc |
f8501464 AP |
1736 | .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks |
1737 | ___ | |
1738 | } | |
1739 | \f | |
1740 | ###################################################################### | |
1741 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | |
1742 | # const AES_KEY *key1, const AES_KEY *key2, | |
1743 | # const unsigned char iv[16]); | |
1744 | # | |
1745 | { | |
1746 | my @tweak=map("%xmm$_",(10..15)); | |
1747 | my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); | |
1748 | my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); | |
36df342f | 1749 | my $frame_size = 0x70 + ($win64?160:0); |
384e6de4 | 1750 | my $key_ = "%rbp"; # override so that we can use %r11 as FP |
f8501464 AP |
1751 | |
1752 | $code.=<<___; | |
1753 | .globl aesni_xts_encrypt | |
1754 | .type aesni_xts_encrypt,\@function,6 | |
1755 | .align 16 | |
1756 | aesni_xts_encrypt: | |
b84460ad | 1757 | .cfi_startproc |
384e6de4 | 1758 | lea (%rsp),%r11 # frame pointer |
b84460ad | 1759 | .cfi_def_cfa_register %r11 |
6a40ebe8 | 1760 | push %rbp |
b84460ad | 1761 | .cfi_push %rbp |
6a40ebe8 AP |
1762 | sub \$$frame_size,%rsp |
1763 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
f8501464 AP |
1764 | ___ |
1765 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1766 | movaps %xmm6,-0xa8(%r11) # offload everything |
1767 | movaps %xmm7,-0x98(%r11) | |
1768 | movaps %xmm8,-0x88(%r11) | |
1769 | movaps %xmm9,-0x78(%r11) | |
1770 | movaps %xmm10,-0x68(%r11) | |
1771 | movaps %xmm11,-0x58(%r11) | |
1772 | movaps %xmm12,-0x48(%r11) | |
1773 | movaps %xmm13,-0x38(%r11) | |
1774 | movaps %xmm14,-0x28(%r11) | |
1775 | movaps %xmm15,-0x18(%r11) | |
f8501464 AP |
1776 | .Lxts_enc_body: |
1777 | ___ | |
1778 | $code.=<<___; | |
d8ba0dc9 | 1779 | movups ($ivp),$inout0 # load clear-text tweak |
f8501464 AP |
1780 | mov 240(%r8),$rounds # key2->rounds |
1781 | mov 240($key),$rnds_ # key1->rounds | |
1782 | ___ | |
1783 | # generate the tweak | |
d8ba0dc9 | 1784 | &aesni_generate1("enc",$key2,$rounds,$inout0); |
f8501464 | 1785 | $code.=<<___; |
36df342f | 1786 | $movkey ($key),$rndkey0 # zero round key |
f8501464 AP |
1787 | mov $key,$key_ # backup $key |
1788 | mov $rnds_,$rounds # backup $rounds | |
36df342f | 1789 | shl \$4,$rnds_ |
f8501464 AP |
1790 | mov $len,$len_ # backup $len |
1791 | and \$-16,$len | |
1792 | ||
36df342f | 1793 | $movkey 16($key,$rnds_),$rndkey1 # last round key |
36df342f | 1794 | |
f8501464 | 1795 | movdqa .Lxts_magic(%rip),$twmask |
d8ba0dc9 AP |
1796 | movdqa $inout0,@tweak[5] |
1797 | pshufd \$0x5f,$inout0,$twres | |
36df342f | 1798 | pxor $rndkey0,$rndkey1 |
f8501464 | 1799 | ___ |
36df342f AP |
1800 | # alternative tweak calculation algorithm is based on suggestions |
1801 | # by Shay Gueron. psrad doesn't conflict with AES-NI instructions | |
1802 | # and should help in the future... | |
f8501464 AP |
1803 | for ($i=0;$i<4;$i++) { |
1804 | $code.=<<___; | |
36df342f AP |
1805 | movdqa $twres,$twtmp |
1806 | paddd $twres,$twres | |
f8501464 | 1807 | movdqa @tweak[5],@tweak[$i] |
36df342f AP |
1808 | psrad \$31,$twtmp # broadcast upper bits |
1809 | paddq @tweak[5],@tweak[5] | |
1810 | pand $twmask,$twtmp | |
1811 | pxor $rndkey0,@tweak[$i] | |
1812 | pxor $twtmp,@tweak[5] | |
f8501464 AP |
1813 | ___ |
1814 | } | |
1815 | $code.=<<___; | |
36df342f AP |
1816 | movdqa @tweak[5],@tweak[4] |
1817 | psrad \$31,$twres | |
1818 | paddq @tweak[5],@tweak[5] | |
1819 | pand $twmask,$twres | |
1820 | pxor $rndkey0,@tweak[4] | |
1821 | pxor $twres,@tweak[5] | |
1822 | movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] | |
1823 | ||
f8501464 | 1824 | sub \$16*6,$len |
23f6eec7 | 1825 | jc .Lxts_enc_short # if $len-=6*16 borrowed |
f8501464 | 1826 | |
d8ba0dc9 AP |
1827 | mov \$16+96,$rounds |
1828 | lea 32($key_,$rnds_),$key # end of key schedule | |
1829 | sub %r10,%rax # twisted $rounds | |
36df342f | 1830 | $movkey 16($key_),$rndkey1 |
d8ba0dc9 | 1831 | mov %rax,%r10 # backup twisted $rounds |
36df342f | 1832 | lea .Lxts_magic(%rip),%r8 |
f8501464 AP |
1833 | jmp .Lxts_enc_grandloop |
1834 | ||
36df342f | 1835 | .align 32 |
f8501464 | 1836 | .Lxts_enc_grandloop: |
f8501464 | 1837 | movdqu `16*0`($inp),$inout0 # load input |
36df342f | 1838 | movdqa $rndkey0,$twmask |
f8501464 | 1839 | movdqu `16*1`($inp),$inout1 |
23f6eec7 | 1840 | pxor @tweak[0],$inout0 # input^=tweak^round[0] |
f8501464 | 1841 | movdqu `16*2`($inp),$inout2 |
f8501464 | 1842 | pxor @tweak[1],$inout1 |
36df342f AP |
1843 | aesenc $rndkey1,$inout0 |
1844 | movdqu `16*3`($inp),$inout3 | |
f8501464 | 1845 | pxor @tweak[2],$inout2 |
36df342f AP |
1846 | aesenc $rndkey1,$inout1 |
1847 | movdqu `16*4`($inp),$inout4 | |
f8501464 | 1848 | pxor @tweak[3],$inout3 |
36df342f AP |
1849 | aesenc $rndkey1,$inout2 |
1850 | movdqu `16*5`($inp),$inout5 | |
1851 | pxor @tweak[5],$twmask # round[0]^=tweak[5] | |
1852 | movdqa 0x60(%rsp),$twres # load round[0]^round[last] | |
f8501464 | 1853 | pxor @tweak[4],$inout4 |
36df342f AP |
1854 | aesenc $rndkey1,$inout3 |
1855 | $movkey 32($key_),$rndkey0 | |
1856 | lea `16*6`($inp),$inp | |
1857 | pxor $twmask,$inout5 | |
f8501464 | 1858 | |
46f4e1be | 1859 | pxor $twres,@tweak[0] # calculate tweaks^round[last] |
f8501464 | 1860 | aesenc $rndkey1,$inout4 |
36df342f | 1861 | pxor $twres,@tweak[1] |
23f6eec7 | 1862 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] |
f8501464 | 1863 | aesenc $rndkey1,$inout5 |
36df342f | 1864 | $movkey 48($key_),$rndkey1 |
d8ba0dc9 | 1865 | pxor $twres,@tweak[2] |
f8501464 | 1866 | |
36df342f | 1867 | aesenc $rndkey0,$inout0 |
d8ba0dc9 | 1868 | pxor $twres,@tweak[3] |
36df342f AP |
1869 | movdqa @tweak[1],`16*1`(%rsp) |
1870 | aesenc $rndkey0,$inout1 | |
d8ba0dc9 | 1871 | pxor $twres,@tweak[4] |
36df342f AP |
1872 | movdqa @tweak[2],`16*2`(%rsp) |
1873 | aesenc $rndkey0,$inout2 | |
36df342f AP |
1874 | aesenc $rndkey0,$inout3 |
1875 | pxor $twres,$twmask | |
1876 | movdqa @tweak[4],`16*4`(%rsp) | |
1877 | aesenc $rndkey0,$inout4 | |
36df342f AP |
1878 | aesenc $rndkey0,$inout5 |
1879 | $movkey 64($key_),$rndkey0 | |
d8ba0dc9 | 1880 | movdqa $twmask,`16*5`(%rsp) |
36df342f AP |
1881 | pshufd \$0x5f,@tweak[5],$twres |
1882 | jmp .Lxts_enc_loop6 | |
1883 | .align 32 | |
f8501464 AP |
1884 | .Lxts_enc_loop6: |
1885 | aesenc $rndkey1,$inout0 | |
1886 | aesenc $rndkey1,$inout1 | |
f8501464 AP |
1887 | aesenc $rndkey1,$inout2 |
1888 | aesenc $rndkey1,$inout3 | |
1889 | aesenc $rndkey1,$inout4 | |
1890 | aesenc $rndkey1,$inout5 | |
d8ba0dc9 AP |
1891 | $movkey -64($key,%rax),$rndkey1 |
1892 | add \$32,%rax | |
36df342f | 1893 | |
f8501464 AP |
1894 | aesenc $rndkey0,$inout0 |
1895 | aesenc $rndkey0,$inout1 | |
f8501464 AP |
1896 | aesenc $rndkey0,$inout2 |
1897 | aesenc $rndkey0,$inout3 | |
1898 | aesenc $rndkey0,$inout4 | |
1899 | aesenc $rndkey0,$inout5 | |
d8ba0dc9 | 1900 | $movkey -80($key,%rax),$rndkey0 |
f8501464 AP |
1901 | jnz .Lxts_enc_loop6 |
1902 | ||
23f6eec7 | 1903 | movdqa (%r8),$twmask # start calculating next tweak |
36df342f AP |
1904 | movdqa $twres,$twtmp |
1905 | paddd $twres,$twres | |
f8501464 | 1906 | aesenc $rndkey1,$inout0 |
36df342f AP |
1907 | paddq @tweak[5],@tweak[5] |
1908 | psrad \$31,$twtmp | |
f8501464 | 1909 | aesenc $rndkey1,$inout1 |
36df342f AP |
1910 | pand $twmask,$twtmp |
1911 | $movkey ($key_),@tweak[0] # load round[0] | |
f8501464 | 1912 | aesenc $rndkey1,$inout2 |
f8501464 AP |
1913 | aesenc $rndkey1,$inout3 |
1914 | aesenc $rndkey1,$inout4 | |
d8ba0dc9 | 1915 | pxor $twtmp,@tweak[5] |
36df342f | 1916 | movaps @tweak[0],@tweak[1] # copy round[0] |
f8501464 | 1917 | aesenc $rndkey1,$inout5 |
d8ba0dc9 | 1918 | $movkey -64($key),$rndkey1 |
f8501464 | 1919 | |
36df342f | 1920 | movdqa $twres,$twtmp |
f8501464 | 1921 | aesenc $rndkey0,$inout0 |
d8ba0dc9 | 1922 | paddd $twres,$twres |
36df342f | 1923 | pxor @tweak[5],@tweak[0] |
f8501464 | 1924 | aesenc $rndkey0,$inout1 |
d8ba0dc9 | 1925 | psrad \$31,$twtmp |
36df342f | 1926 | paddq @tweak[5],@tweak[5] |
f8501464 | 1927 | aesenc $rndkey0,$inout2 |
f8501464 | 1928 | aesenc $rndkey0,$inout3 |
d8ba0dc9 | 1929 | pand $twmask,$twtmp |
36df342f | 1930 | movaps @tweak[1],@tweak[2] |
d8ba0dc9 AP |
1931 | aesenc $rndkey0,$inout4 |
1932 | pxor $twtmp,@tweak[5] | |
1933 | movdqa $twres,$twtmp | |
f8501464 | 1934 | aesenc $rndkey0,$inout5 |
d8ba0dc9 | 1935 | $movkey -48($key),$rndkey0 |
f8501464 | 1936 | |
36df342f | 1937 | paddd $twres,$twres |
f8501464 | 1938 | aesenc $rndkey1,$inout0 |
36df342f AP |
1939 | pxor @tweak[5],@tweak[1] |
1940 | psrad \$31,$twtmp | |
f8501464 | 1941 | aesenc $rndkey1,$inout1 |
36df342f AP |
1942 | paddq @tweak[5],@tweak[5] |
1943 | pand $twmask,$twtmp | |
f8501464 | 1944 | aesenc $rndkey1,$inout2 |
f8501464 | 1945 | aesenc $rndkey1,$inout3 |
d8ba0dc9 | 1946 | movdqa @tweak[3],`16*3`(%rsp) |
36df342f | 1947 | pxor $twtmp,@tweak[5] |
f8501464 | 1948 | aesenc $rndkey1,$inout4 |
36df342f | 1949 | movaps @tweak[2],@tweak[3] |
d8ba0dc9 | 1950 | movdqa $twres,$twtmp |
f8501464 | 1951 | aesenc $rndkey1,$inout5 |
d8ba0dc9 | 1952 | $movkey -32($key),$rndkey1 |
f8501464 | 1953 | |
36df342f AP |
1954 | paddd $twres,$twres |
1955 | aesenc $rndkey0,$inout0 | |
1956 | pxor @tweak[5],@tweak[2] | |
1957 | psrad \$31,$twtmp | |
1958 | aesenc $rndkey0,$inout1 | |
1959 | paddq @tweak[5],@tweak[5] | |
1960 | pand $twmask,$twtmp | |
1961 | aesenc $rndkey0,$inout2 | |
1962 | aesenc $rndkey0,$inout3 | |
36df342f | 1963 | aesenc $rndkey0,$inout4 |
d8ba0dc9 | 1964 | pxor $twtmp,@tweak[5] |
36df342f AP |
1965 | movaps @tweak[3],@tweak[4] |
1966 | aesenc $rndkey0,$inout5 | |
1967 | ||
1968 | movdqa $twres,$rndkey0 | |
1969 | paddd $twres,$twres | |
1970 | aesenc $rndkey1,$inout0 | |
1971 | pxor @tweak[5],@tweak[3] | |
1972 | psrad \$31,$rndkey0 | |
1973 | aesenc $rndkey1,$inout1 | |
1974 | paddq @tweak[5],@tweak[5] | |
1975 | pand $twmask,$rndkey0 | |
1976 | aesenc $rndkey1,$inout2 | |
1977 | aesenc $rndkey1,$inout3 | |
1978 | pxor $rndkey0,@tweak[5] | |
1979 | $movkey ($key_),$rndkey0 | |
1980 | aesenc $rndkey1,$inout4 | |
1981 | aesenc $rndkey1,$inout5 | |
1982 | $movkey 16($key_),$rndkey1 | |
1983 | ||
1984 | pxor @tweak[5],@tweak[4] | |
36df342f | 1985 | aesenclast `16*0`(%rsp),$inout0 |
d8ba0dc9 | 1986 | psrad \$31,$twres |
36df342f | 1987 | paddq @tweak[5],@tweak[5] |
36df342f AP |
1988 | aesenclast `16*1`(%rsp),$inout1 |
1989 | aesenclast `16*2`(%rsp),$inout2 | |
d8ba0dc9 AP |
1990 | pand $twmask,$twres |
1991 | mov %r10,%rax # restore $rounds | |
36df342f AP |
1992 | aesenclast `16*3`(%rsp),$inout3 |
1993 | aesenclast `16*4`(%rsp),$inout4 | |
1994 | aesenclast `16*5`(%rsp),$inout5 | |
d8ba0dc9 | 1995 | pxor $twres,@tweak[5] |
f8501464 | 1996 | |
23f6eec7 AP |
1997 | lea `16*6`($out),$out # $out+=6*16 |
1998 | movups $inout0,`-16*6`($out) # store 6 output blocks | |
36df342f AP |
1999 | movups $inout1,`-16*5`($out) |
2000 | movups $inout2,`-16*4`($out) | |
2001 | movups $inout3,`-16*3`($out) | |
2002 | movups $inout4,`-16*2`($out) | |
2003 | movups $inout5,`-16*1`($out) | |
f8501464 | 2004 | sub \$16*6,$len |
23f6eec7 | 2005 | jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow |
f8501464 | 2006 | |
d8ba0dc9 AP |
2007 | mov \$16+96,$rounds |
2008 | sub $rnds_,$rounds | |
f8501464 | 2009 | mov $key_,$key # restore $key |
d8ba0dc9 | 2010 | shr \$4,$rounds # restore original value |
f8501464 AP |
2011 | |
2012 | .Lxts_enc_short: | |
23f6eec7 | 2013 | # at this point @tweak[0..5] are populated with tweak values |
d8ba0dc9 | 2014 | mov $rounds,$rnds_ # backup $rounds |
36df342f | 2015 | pxor $rndkey0,@tweak[0] |
23f6eec7 AP |
2016 | add \$16*6,$len # restore real remaining $len |
2017 | jz .Lxts_enc_done # done if ($len==0) | |
f8501464 | 2018 | |
36df342f | 2019 | pxor $rndkey0,@tweak[1] |
f8501464 | 2020 | cmp \$0x20,$len |
23f6eec7 | 2021 | jb .Lxts_enc_one # $len is 1*16 |
36df342f | 2022 | pxor $rndkey0,@tweak[2] |
23f6eec7 | 2023 | je .Lxts_enc_two # $len is 2*16 |
f8501464 | 2024 | |
36df342f | 2025 | pxor $rndkey0,@tweak[3] |
f8501464 | 2026 | cmp \$0x40,$len |
23f6eec7 | 2027 | jb .Lxts_enc_three # $len is 3*16 |
36df342f | 2028 | pxor $rndkey0,@tweak[4] |
23f6eec7 | 2029 | je .Lxts_enc_four # $len is 4*16 |
f8501464 | 2030 | |
23f6eec7 | 2031 | movdqu ($inp),$inout0 # $len is 5*16 |
36df342f | 2032 | movdqu 16*1($inp),$inout1 |
f8501464 AP |
2033 | movdqu 16*2($inp),$inout2 |
2034 | pxor @tweak[0],$inout0 | |
2035 | movdqu 16*3($inp),$inout3 | |
2036 | pxor @tweak[1],$inout1 | |
2037 | movdqu 16*4($inp),$inout4 | |
23f6eec7 | 2038 | lea 16*5($inp),$inp # $inp+=5*16 |
f8501464 AP |
2039 | pxor @tweak[2],$inout2 |
2040 | pxor @tweak[3],$inout3 | |
2041 | pxor @tweak[4],$inout4 | |
23f6eec7 | 2042 | pxor $inout5,$inout5 |
f8501464 AP |
2043 | |
2044 | call _aesni_encrypt6 | |
2045 | ||
2046 | xorps @tweak[0],$inout0 | |
2047 | movdqa @tweak[5],@tweak[0] | |
2048 | xorps @tweak[1],$inout1 | |
2049 | xorps @tweak[2],$inout2 | |
23f6eec7 | 2050 | movdqu $inout0,($out) # store 5 output blocks |
f8501464 AP |
2051 | xorps @tweak[3],$inout3 |
2052 | movdqu $inout1,16*1($out) | |
2053 | xorps @tweak[4],$inout4 | |
2054 | movdqu $inout2,16*2($out) | |
2055 | movdqu $inout3,16*3($out) | |
2056 | movdqu $inout4,16*4($out) | |
23f6eec7 | 2057 | lea 16*5($out),$out # $out+=5*16 |
f8501464 AP |
2058 | jmp .Lxts_enc_done |
2059 | ||
2060 | .align 16 | |
2061 | .Lxts_enc_one: | |
2062 | movups ($inp),$inout0 | |
23f6eec7 | 2063 | lea 16*1($inp),$inp # inp+=1*16 |
f8501464 AP |
2064 | xorps @tweak[0],$inout0 |
2065 | ___ | |
2066 | &aesni_generate1("enc",$key,$rounds); | |
2067 | $code.=<<___; | |
2068 | xorps @tweak[0],$inout0 | |
2069 | movdqa @tweak[1],@tweak[0] | |
23f6eec7 AP |
2070 | movups $inout0,($out) # store one output block |
2071 | lea 16*1($out),$out # $out+=1*16 | |
f8501464 AP |
2072 | jmp .Lxts_enc_done |
2073 | ||
2074 | .align 16 | |
2075 | .Lxts_enc_two: | |
2076 | movups ($inp),$inout0 | |
2077 | movups 16($inp),$inout1 | |
23f6eec7 | 2078 | lea 32($inp),$inp # $inp+=2*16 |
f8501464 AP |
2079 | xorps @tweak[0],$inout0 |
2080 | xorps @tweak[1],$inout1 | |
2081 | ||
214368ff | 2082 | call _aesni_encrypt2 |
f8501464 AP |
2083 | |
2084 | xorps @tweak[0],$inout0 | |
2085 | movdqa @tweak[2],@tweak[0] | |
2086 | xorps @tweak[1],$inout1 | |
23f6eec7 | 2087 | movups $inout0,($out) # store 2 output blocks |
f8501464 | 2088 | movups $inout1,16*1($out) |
23f6eec7 | 2089 | lea 16*2($out),$out # $out+=2*16 |
f8501464 AP |
2090 | jmp .Lxts_enc_done |
2091 | ||
2092 | .align 16 | |
2093 | .Lxts_enc_three: | |
2094 | movups ($inp),$inout0 | |
2095 | movups 16*1($inp),$inout1 | |
2096 | movups 16*2($inp),$inout2 | |
23f6eec7 | 2097 | lea 16*3($inp),$inp # $inp+=3*16 |
f8501464 AP |
2098 | xorps @tweak[0],$inout0 |
2099 | xorps @tweak[1],$inout1 | |
2100 | xorps @tweak[2],$inout2 | |
2101 | ||
2102 | call _aesni_encrypt3 | |
2103 | ||
2104 | xorps @tweak[0],$inout0 | |
2105 | movdqa @tweak[3],@tweak[0] | |
2106 | xorps @tweak[1],$inout1 | |
2107 | xorps @tweak[2],$inout2 | |
23f6eec7 | 2108 | movups $inout0,($out) # store 3 output blocks |
f8501464 AP |
2109 | movups $inout1,16*1($out) |
2110 | movups $inout2,16*2($out) | |
23f6eec7 | 2111 | lea 16*3($out),$out # $out+=3*16 |
f8501464 AP |
2112 | jmp .Lxts_enc_done |
2113 | ||
2114 | .align 16 | |
2115 | .Lxts_enc_four: | |
2116 | movups ($inp),$inout0 | |
2117 | movups 16*1($inp),$inout1 | |
2118 | movups 16*2($inp),$inout2 | |
2119 | xorps @tweak[0],$inout0 | |
2120 | movups 16*3($inp),$inout3 | |
23f6eec7 | 2121 | lea 16*4($inp),$inp # $inp+=4*16 |
f8501464 AP |
2122 | xorps @tweak[1],$inout1 |
2123 | xorps @tweak[2],$inout2 | |
2124 | xorps @tweak[3],$inout3 | |
2125 | ||
2126 | call _aesni_encrypt4 | |
2127 | ||
36df342f AP |
2128 | pxor @tweak[0],$inout0 |
2129 | movdqa @tweak[4],@tweak[0] | |
2130 | pxor @tweak[1],$inout1 | |
2131 | pxor @tweak[2],$inout2 | |
23f6eec7 | 2132 | movdqu $inout0,($out) # store 4 output blocks |
36df342f AP |
2133 | pxor @tweak[3],$inout3 |
2134 | movdqu $inout1,16*1($out) | |
2135 | movdqu $inout2,16*2($out) | |
2136 | movdqu $inout3,16*3($out) | |
23f6eec7 | 2137 | lea 16*4($out),$out # $out+=4*16 |
f8501464 AP |
2138 | jmp .Lxts_enc_done |
2139 | ||
2140 | .align 16 | |
2141 | .Lxts_enc_done: | |
23f6eec7 | 2142 | and \$15,$len_ # see if $len%16 is 0 |
f8501464 AP |
2143 | jz .Lxts_enc_ret |
2144 | mov $len_,$len | |
2145 | ||
2146 | .Lxts_enc_steal: | |
2147 | movzb ($inp),%eax # borrow $rounds ... | |
2148 | movzb -16($out),%ecx # ... and $key | |
2149 | lea 1($inp),$inp | |
2150 | mov %al,-16($out) | |
2151 | mov %cl,0($out) | |
2152 | lea 1($out),$out | |
2153 | sub \$1,$len | |
2154 | jnz .Lxts_enc_steal | |
2155 | ||
2156 | sub $len_,$out # rewind $out | |
2157 | mov $key_,$key # restore $key | |
2158 | mov $rnds_,$rounds # restore $rounds | |
2159 | ||
2160 | movups -16($out),$inout0 | |
2161 | xorps @tweak[0],$inout0 | |
2162 | ___ | |
2163 | &aesni_generate1("enc",$key,$rounds); | |
2164 | $code.=<<___; | |
2165 | xorps @tweak[0],$inout0 | |
2166 | movups $inout0,-16($out) | |
2167 | ||
2168 | .Lxts_enc_ret: | |
23f6eec7 AP |
2169 | xorps %xmm0,%xmm0 # clear register bank |
2170 | pxor %xmm1,%xmm1 | |
2171 | pxor %xmm2,%xmm2 | |
2172 | pxor %xmm3,%xmm3 | |
2173 | pxor %xmm4,%xmm4 | |
2174 | pxor %xmm5,%xmm5 | |
2175 | ___ | |
2176 | $code.=<<___ if (!$win64); | |
2177 | pxor %xmm6,%xmm6 | |
2178 | pxor %xmm7,%xmm7 | |
2179 | movaps %xmm0,0x00(%rsp) # clear stack | |
2180 | pxor %xmm8,%xmm8 | |
2181 | movaps %xmm0,0x10(%rsp) | |
2182 | pxor %xmm9,%xmm9 | |
2183 | movaps %xmm0,0x20(%rsp) | |
2184 | pxor %xmm10,%xmm10 | |
2185 | movaps %xmm0,0x30(%rsp) | |
2186 | pxor %xmm11,%xmm11 | |
2187 | movaps %xmm0,0x40(%rsp) | |
2188 | pxor %xmm12,%xmm12 | |
2189 | movaps %xmm0,0x50(%rsp) | |
2190 | pxor %xmm13,%xmm13 | |
2191 | movaps %xmm0,0x60(%rsp) | |
2192 | pxor %xmm14,%xmm14 | |
2193 | pxor %xmm15,%xmm15 | |
f8501464 AP |
2194 | ___ |
2195 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2196 | movaps -0xa8(%r11),%xmm6 |
2197 | movaps %xmm0,-0xa8(%r11) # clear stack | |
2198 | movaps -0x98(%r11),%xmm7 | |
2199 | movaps %xmm0,-0x98(%r11) | |
2200 | movaps -0x88(%r11),%xmm8 | |
2201 | movaps %xmm0,-0x88(%r11) | |
2202 | movaps -0x78(%r11),%xmm9 | |
2203 | movaps %xmm0,-0x78(%r11) | |
2204 | movaps -0x68(%r11),%xmm10 | |
2205 | movaps %xmm0,-0x68(%r11) | |
2206 | movaps -0x58(%r11),%xmm11 | |
2207 | movaps %xmm0,-0x58(%r11) | |
2208 | movaps -0x48(%r11),%xmm12 | |
2209 | movaps %xmm0,-0x48(%r11) | |
2210 | movaps -0x38(%r11),%xmm13 | |
2211 | movaps %xmm0,-0x38(%r11) | |
2212 | movaps -0x28(%r11),%xmm14 | |
2213 | movaps %xmm0,-0x28(%r11) | |
2214 | movaps -0x18(%r11),%xmm15 | |
2215 | movaps %xmm0,-0x18(%r11) | |
23f6eec7 AP |
2216 | movaps %xmm0,0x00(%rsp) |
2217 | movaps %xmm0,0x10(%rsp) | |
2218 | movaps %xmm0,0x20(%rsp) | |
2219 | movaps %xmm0,0x30(%rsp) | |
2220 | movaps %xmm0,0x40(%rsp) | |
2221 | movaps %xmm0,0x50(%rsp) | |
2222 | movaps %xmm0,0x60(%rsp) | |
f8501464 AP |
2223 | ___ |
2224 | $code.=<<___; | |
384e6de4 | 2225 | mov -8(%r11),%rbp |
b84460ad | 2226 | .cfi_restore %rbp |
384e6de4 | 2227 | lea (%r11),%rsp |
b84460ad | 2228 | .cfi_def_cfa_register %rsp |
f8501464 AP |
2229 | .Lxts_enc_epilogue: |
2230 | ret | |
b84460ad | 2231 | .cfi_endproc |
f8501464 | 2232 | .size aesni_xts_encrypt,.-aesni_xts_encrypt |
d7d119a3 | 2233 | ___ |
6c83629b AP |
2234 | |
2235 | $code.=<<___; | |
f8501464 AP |
2236 | .globl aesni_xts_decrypt |
2237 | .type aesni_xts_decrypt,\@function,6 | |
6c83629b | 2238 | .align 16 |
f8501464 | 2239 | aesni_xts_decrypt: |
b84460ad | 2240 | .cfi_startproc |
384e6de4 | 2241 | lea (%rsp),%r11 # frame pointer |
b84460ad | 2242 | .cfi_def_cfa_register %r11 |
6a40ebe8 | 2243 | push %rbp |
b84460ad | 2244 | .cfi_push %rbp |
6a40ebe8 AP |
2245 | sub \$$frame_size,%rsp |
2246 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
6c83629b AP |
2247 | ___ |
2248 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2249 | movaps %xmm6,-0xa8(%r11) # offload everything |
2250 | movaps %xmm7,-0x98(%r11) | |
2251 | movaps %xmm8,-0x88(%r11) | |
2252 | movaps %xmm9,-0x78(%r11) | |
2253 | movaps %xmm10,-0x68(%r11) | |
2254 | movaps %xmm11,-0x58(%r11) | |
2255 | movaps %xmm12,-0x48(%r11) | |
2256 | movaps %xmm13,-0x38(%r11) | |
2257 | movaps %xmm14,-0x28(%r11) | |
2258 | movaps %xmm15,-0x18(%r11) | |
f8501464 | 2259 | .Lxts_dec_body: |
6c83629b AP |
2260 | ___ |
2261 | $code.=<<___; | |
d8ba0dc9 | 2262 | movups ($ivp),$inout0 # load clear-text tweak |
f8501464 AP |
2263 | mov 240($key2),$rounds # key2->rounds |
2264 | mov 240($key),$rnds_ # key1->rounds | |
2265 | ___ | |
2266 | # generate the tweak | |
d8ba0dc9 | 2267 | &aesni_generate1("enc",$key2,$rounds,$inout0); |
f8501464 AP |
2268 | $code.=<<___; |
2269 | xor %eax,%eax # if ($len%16) len-=16; | |
2270 | test \$15,$len | |
2271 | setnz %al | |
2272 | shl \$4,%rax | |
2273 | sub %rax,$len | |
2274 | ||
36df342f | 2275 | $movkey ($key),$rndkey0 # zero round key |
f8501464 AP |
2276 | mov $key,$key_ # backup $key |
2277 | mov $rnds_,$rounds # backup $rounds | |
36df342f | 2278 | shl \$4,$rnds_ |
f8501464 AP |
2279 | mov $len,$len_ # backup $len |
2280 | and \$-16,$len | |
6c83629b | 2281 | |
36df342f | 2282 | $movkey 16($key,$rnds_),$rndkey1 # last round key |
36df342f | 2283 | |
f8501464 | 2284 | movdqa .Lxts_magic(%rip),$twmask |
d8ba0dc9 AP |
2285 | movdqa $inout0,@tweak[5] |
2286 | pshufd \$0x5f,$inout0,$twres | |
36df342f | 2287 | pxor $rndkey0,$rndkey1 |
f8501464 AP |
2288 | ___ |
2289 | for ($i=0;$i<4;$i++) { | |
2290 | $code.=<<___; | |
36df342f AP |
2291 | movdqa $twres,$twtmp |
2292 | paddd $twres,$twres | |
f8501464 | 2293 | movdqa @tweak[5],@tweak[$i] |
36df342f AP |
2294 | psrad \$31,$twtmp # broadcast upper bits |
2295 | paddq @tweak[5],@tweak[5] | |
2296 | pand $twmask,$twtmp | |
2297 | pxor $rndkey0,@tweak[$i] | |
2298 | pxor $twtmp,@tweak[5] | |
f8501464 AP |
2299 | ___ |
2300 | } | |
2301 | $code.=<<___; | |
36df342f AP |
2302 | movdqa @tweak[5],@tweak[4] |
2303 | psrad \$31,$twres | |
2304 | paddq @tweak[5],@tweak[5] | |
2305 | pand $twmask,$twres | |
2306 | pxor $rndkey0,@tweak[4] | |
2307 | pxor $twres,@tweak[5] | |
2308 | movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] | |
2309 | ||
f8501464 | 2310 | sub \$16*6,$len |
23f6eec7 | 2311 | jc .Lxts_dec_short # if $len-=6*16 borrowed |
6c83629b | 2312 | |
d8ba0dc9 AP |
2313 | mov \$16+96,$rounds |
2314 | lea 32($key_,$rnds_),$key # end of key schedule | |
2315 | sub %r10,%rax # twisted $rounds | |
36df342f | 2316 | $movkey 16($key_),$rndkey1 |
d8ba0dc9 | 2317 | mov %rax,%r10 # backup twisted $rounds |
36df342f | 2318 | lea .Lxts_magic(%rip),%r8 |
f8501464 | 2319 | jmp .Lxts_dec_grandloop |
6c83629b | 2320 | |
36df342f | 2321 | .align 32 |
f8501464 | 2322 | .Lxts_dec_grandloop: |
f8501464 | 2323 | movdqu `16*0`($inp),$inout0 # load input |
36df342f | 2324 | movdqa $rndkey0,$twmask |
f8501464 | 2325 | movdqu `16*1`($inp),$inout1 |
23f6eec7 | 2326 | pxor @tweak[0],$inout0 # input^=tweak^round[0] |
f8501464 | 2327 | movdqu `16*2`($inp),$inout2 |
f8501464 | 2328 | pxor @tweak[1],$inout1 |
36df342f AP |
2329 | aesdec $rndkey1,$inout0 |
2330 | movdqu `16*3`($inp),$inout3 | |
f8501464 | 2331 | pxor @tweak[2],$inout2 |
36df342f AP |
2332 | aesdec $rndkey1,$inout1 |
2333 | movdqu `16*4`($inp),$inout4 | |
f8501464 | 2334 | pxor @tweak[3],$inout3 |
36df342f AP |
2335 | aesdec $rndkey1,$inout2 |
2336 | movdqu `16*5`($inp),$inout5 | |
2337 | pxor @tweak[5],$twmask # round[0]^=tweak[5] | |
2338 | movdqa 0x60(%rsp),$twres # load round[0]^round[last] | |
f8501464 | 2339 | pxor @tweak[4],$inout4 |
36df342f AP |
2340 | aesdec $rndkey1,$inout3 |
2341 | $movkey 32($key_),$rndkey0 | |
2342 | lea `16*6`($inp),$inp | |
2343 | pxor $twmask,$inout5 | |
f8501464 | 2344 | |
46f4e1be | 2345 | pxor $twres,@tweak[0] # calculate tweaks^round[last] |
f8501464 | 2346 | aesdec $rndkey1,$inout4 |
36df342f AP |
2347 | pxor $twres,@tweak[1] |
2348 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key | |
f8501464 | 2349 | aesdec $rndkey1,$inout5 |
36df342f | 2350 | $movkey 48($key_),$rndkey1 |
d8ba0dc9 | 2351 | pxor $twres,@tweak[2] |
6c83629b | 2352 | |
36df342f | 2353 | aesdec $rndkey0,$inout0 |
d8ba0dc9 | 2354 | pxor $twres,@tweak[3] |
36df342f AP |
2355 | movdqa @tweak[1],`16*1`(%rsp) |
2356 | aesdec $rndkey0,$inout1 | |
d8ba0dc9 | 2357 | pxor $twres,@tweak[4] |
36df342f AP |
2358 | movdqa @tweak[2],`16*2`(%rsp) |
2359 | aesdec $rndkey0,$inout2 | |
36df342f AP |
2360 | aesdec $rndkey0,$inout3 |
2361 | pxor $twres,$twmask | |
2362 | movdqa @tweak[4],`16*4`(%rsp) | |
2363 | aesdec $rndkey0,$inout4 | |
36df342f AP |
2364 | aesdec $rndkey0,$inout5 |
2365 | $movkey 64($key_),$rndkey0 | |
d8ba0dc9 | 2366 | movdqa $twmask,`16*5`(%rsp) |
36df342f AP |
2367 | pshufd \$0x5f,@tweak[5],$twres |
2368 | jmp .Lxts_dec_loop6 | |
2369 | .align 32 | |
f8501464 AP |
2370 | .Lxts_dec_loop6: |
2371 | aesdec $rndkey1,$inout0 | |
2372 | aesdec $rndkey1,$inout1 | |
f8501464 AP |
2373 | aesdec $rndkey1,$inout2 |
2374 | aesdec $rndkey1,$inout3 | |
2375 | aesdec $rndkey1,$inout4 | |
2376 | aesdec $rndkey1,$inout5 | |
d8ba0dc9 AP |
2377 | $movkey -64($key,%rax),$rndkey1 |
2378 | add \$32,%rax | |
36df342f | 2379 | |
f8501464 AP |
2380 | aesdec $rndkey0,$inout0 |
2381 | aesdec $rndkey0,$inout1 | |
f8501464 AP |
2382 | aesdec $rndkey0,$inout2 |
2383 | aesdec $rndkey0,$inout3 | |
2384 | aesdec $rndkey0,$inout4 | |
2385 | aesdec $rndkey0,$inout5 | |
d8ba0dc9 | 2386 | $movkey -80($key,%rax),$rndkey0 |
f8501464 AP |
2387 | jnz .Lxts_dec_loop6 |
2388 | ||
23f6eec7 | 2389 | movdqa (%r8),$twmask # start calculating next tweak |
36df342f AP |
2390 | movdqa $twres,$twtmp |
2391 | paddd $twres,$twres | |
f8501464 | 2392 | aesdec $rndkey1,$inout0 |
36df342f AP |
2393 | paddq @tweak[5],@tweak[5] |
2394 | psrad \$31,$twtmp | |
f8501464 | 2395 | aesdec $rndkey1,$inout1 |
36df342f AP |
2396 | pand $twmask,$twtmp |
2397 | $movkey ($key_),@tweak[0] # load round[0] | |
f8501464 | 2398 | aesdec $rndkey1,$inout2 |
f8501464 AP |
2399 | aesdec $rndkey1,$inout3 |
2400 | aesdec $rndkey1,$inout4 | |
d8ba0dc9 | 2401 | pxor $twtmp,@tweak[5] |
36df342f | 2402 | movaps @tweak[0],@tweak[1] # copy round[0] |
f8501464 | 2403 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 2404 | $movkey -64($key),$rndkey1 |
f8501464 | 2405 | |
36df342f | 2406 | movdqa $twres,$twtmp |
f8501464 | 2407 | aesdec $rndkey0,$inout0 |
d8ba0dc9 | 2408 | paddd $twres,$twres |
36df342f | 2409 | pxor @tweak[5],@tweak[0] |
f8501464 | 2410 | aesdec $rndkey0,$inout1 |
d8ba0dc9 | 2411 | psrad \$31,$twtmp |
36df342f | 2412 | paddq @tweak[5],@tweak[5] |
f8501464 | 2413 | aesdec $rndkey0,$inout2 |
f8501464 | 2414 | aesdec $rndkey0,$inout3 |
d8ba0dc9 | 2415 | pand $twmask,$twtmp |
36df342f | 2416 | movaps @tweak[1],@tweak[2] |
d8ba0dc9 AP |
2417 | aesdec $rndkey0,$inout4 |
2418 | pxor $twtmp,@tweak[5] | |
2419 | movdqa $twres,$twtmp | |
f8501464 | 2420 | aesdec $rndkey0,$inout5 |
d8ba0dc9 | 2421 | $movkey -48($key),$rndkey0 |
f8501464 | 2422 | |
36df342f | 2423 | paddd $twres,$twres |
f8501464 | 2424 | aesdec $rndkey1,$inout0 |
36df342f AP |
2425 | pxor @tweak[5],@tweak[1] |
2426 | psrad \$31,$twtmp | |
f8501464 | 2427 | aesdec $rndkey1,$inout1 |
36df342f AP |
2428 | paddq @tweak[5],@tweak[5] |
2429 | pand $twmask,$twtmp | |
f8501464 | 2430 | aesdec $rndkey1,$inout2 |
f8501464 | 2431 | aesdec $rndkey1,$inout3 |
d8ba0dc9 | 2432 | movdqa @tweak[3],`16*3`(%rsp) |
36df342f | 2433 | pxor $twtmp,@tweak[5] |
f8501464 | 2434 | aesdec $rndkey1,$inout4 |
36df342f | 2435 | movaps @tweak[2],@tweak[3] |
d8ba0dc9 | 2436 | movdqa $twres,$twtmp |
f8501464 | 2437 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 2438 | $movkey -32($key),$rndkey1 |
f8501464 | 2439 | |
36df342f AP |
2440 | paddd $twres,$twres |
2441 | aesdec $rndkey0,$inout0 | |
2442 | pxor @tweak[5],@tweak[2] | |
2443 | psrad \$31,$twtmp | |
2444 | aesdec $rndkey0,$inout1 | |
2445 | paddq @tweak[5],@tweak[5] | |
2446 | pand $twmask,$twtmp | |
2447 | aesdec $rndkey0,$inout2 | |
2448 | aesdec $rndkey0,$inout3 | |
36df342f | 2449 | aesdec $rndkey0,$inout4 |
d8ba0dc9 | 2450 | pxor $twtmp,@tweak[5] |
36df342f AP |
2451 | movaps @tweak[3],@tweak[4] |
2452 | aesdec $rndkey0,$inout5 | |
2453 | ||
2454 | movdqa $twres,$rndkey0 | |
2455 | paddd $twres,$twres | |
2456 | aesdec $rndkey1,$inout0 | |
2457 | pxor @tweak[5],@tweak[3] | |
2458 | psrad \$31,$rndkey0 | |
2459 | aesdec $rndkey1,$inout1 | |
2460 | paddq @tweak[5],@tweak[5] | |
2461 | pand $twmask,$rndkey0 | |
2462 | aesdec $rndkey1,$inout2 | |
2463 | aesdec $rndkey1,$inout3 | |
2464 | pxor $rndkey0,@tweak[5] | |
2465 | $movkey ($key_),$rndkey0 | |
2466 | aesdec $rndkey1,$inout4 | |
2467 | aesdec $rndkey1,$inout5 | |
2468 | $movkey 16($key_),$rndkey1 | |
2469 | ||
2470 | pxor @tweak[5],@tweak[4] | |
36df342f | 2471 | aesdeclast `16*0`(%rsp),$inout0 |
d8ba0dc9 | 2472 | psrad \$31,$twres |
36df342f | 2473 | paddq @tweak[5],@tweak[5] |
36df342f AP |
2474 | aesdeclast `16*1`(%rsp),$inout1 |
2475 | aesdeclast `16*2`(%rsp),$inout2 | |
d8ba0dc9 AP |
2476 | pand $twmask,$twres |
2477 | mov %r10,%rax # restore $rounds | |
36df342f AP |
2478 | aesdeclast `16*3`(%rsp),$inout3 |
2479 | aesdeclast `16*4`(%rsp),$inout4 | |
2480 | aesdeclast `16*5`(%rsp),$inout5 | |
d8ba0dc9 | 2481 | pxor $twres,@tweak[5] |
f8501464 | 2482 | |
23f6eec7 AP |
2483 | lea `16*6`($out),$out # $out+=6*16 |
2484 | movups $inout0,`-16*6`($out) # store 6 output blocks | |
36df342f AP |
2485 | movups $inout1,`-16*5`($out) |
2486 | movups $inout2,`-16*4`($out) | |
2487 | movups $inout3,`-16*3`($out) | |
2488 | movups $inout4,`-16*2`($out) | |
2489 | movups $inout5,`-16*1`($out) | |
f8501464 | 2490 | sub \$16*6,$len |
23f6eec7 | 2491 | jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow |
f8501464 | 2492 | |
d8ba0dc9 AP |
2493 | mov \$16+96,$rounds |
2494 | sub $rnds_,$rounds | |
f8501464 | 2495 | mov $key_,$key # restore $key |
d8ba0dc9 | 2496 | shr \$4,$rounds # restore original value |
f8501464 AP |
2497 | |
2498 | .Lxts_dec_short: | |
23f6eec7 | 2499 | # at this point @tweak[0..5] are populated with tweak values |
d8ba0dc9 | 2500 | mov $rounds,$rnds_ # backup $rounds |
36df342f AP |
2501 | pxor $rndkey0,@tweak[0] |
2502 | pxor $rndkey0,@tweak[1] | |
23f6eec7 AP |
2503 | add \$16*6,$len # restore real remaining $len |
2504 | jz .Lxts_dec_done # done if ($len==0) | |
d7d119a3 | 2505 | |
36df342f | 2506 | pxor $rndkey0,@tweak[2] |
f8501464 | 2507 | cmp \$0x20,$len |
23f6eec7 | 2508 | jb .Lxts_dec_one # $len is 1*16 |
36df342f | 2509 | pxor $rndkey0,@tweak[3] |
23f6eec7 | 2510 | je .Lxts_dec_two # $len is 2*16 |
d7d119a3 | 2511 | |
36df342f | 2512 | pxor $rndkey0,@tweak[4] |
f8501464 | 2513 | cmp \$0x40,$len |
23f6eec7 AP |
2514 | jb .Lxts_dec_three # $len is 3*16 |
2515 | je .Lxts_dec_four # $len is 4*16 | |
f8501464 | 2516 | |
23f6eec7 | 2517 | movdqu ($inp),$inout0 # $len is 5*16 |
36df342f | 2518 | movdqu 16*1($inp),$inout1 |
f8501464 AP |
2519 | movdqu 16*2($inp),$inout2 |
2520 | pxor @tweak[0],$inout0 | |
2521 | movdqu 16*3($inp),$inout3 | |
2522 | pxor @tweak[1],$inout1 | |
2523 | movdqu 16*4($inp),$inout4 | |
23f6eec7 | 2524 | lea 16*5($inp),$inp # $inp+=5*16 |
f8501464 AP |
2525 | pxor @tweak[2],$inout2 |
2526 | pxor @tweak[3],$inout3 | |
2527 | pxor @tweak[4],$inout4 | |
2528 | ||
2529 | call _aesni_decrypt6 | |
2530 | ||
2531 | xorps @tweak[0],$inout0 | |
2532 | xorps @tweak[1],$inout1 | |
2533 | xorps @tweak[2],$inout2 | |
23f6eec7 | 2534 | movdqu $inout0,($out) # store 5 output blocks |
f8501464 AP |
2535 | xorps @tweak[3],$inout3 |
2536 | movdqu $inout1,16*1($out) | |
2537 | xorps @tweak[4],$inout4 | |
2538 | movdqu $inout2,16*2($out) | |
2539 | pxor $twtmp,$twtmp | |
2540 | movdqu $inout3,16*3($out) | |
2541 | pcmpgtd @tweak[5],$twtmp | |
2542 | movdqu $inout4,16*4($out) | |
23f6eec7 | 2543 | lea 16*5($out),$out # $out+=5*16 |
f8501464 AP |
2544 | pshufd \$0x13,$twtmp,@tweak[1] # $twres |
2545 | and \$15,$len_ | |
2546 | jz .Lxts_dec_ret | |
2547 | ||
2548 | movdqa @tweak[5],@tweak[0] | |
2549 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | |
2550 | pand $twmask,@tweak[1] # isolate carry and residue | |
2551 | pxor @tweak[5],@tweak[1] | |
2552 | jmp .Lxts_dec_done2 | |
d7d119a3 | 2553 | |
f8501464 AP |
2554 | .align 16 |
2555 | .Lxts_dec_one: | |
2556 | movups ($inp),$inout0 | |
23f6eec7 | 2557 | lea 16*1($inp),$inp # $inp+=1*16 |
f8501464 AP |
2558 | xorps @tweak[0],$inout0 |
2559 | ___ | |
2560 | &aesni_generate1("dec",$key,$rounds); | |
2561 | $code.=<<___; | |
2562 | xorps @tweak[0],$inout0 | |
2563 | movdqa @tweak[1],@tweak[0] | |
23f6eec7 | 2564 | movups $inout0,($out) # store one output block |
f8501464 | 2565 | movdqa @tweak[2],@tweak[1] |
23f6eec7 | 2566 | lea 16*1($out),$out # $out+=1*16 |
f8501464 | 2567 | jmp .Lxts_dec_done |
6c83629b | 2568 | |
f8501464 AP |
2569 | .align 16 |
2570 | .Lxts_dec_two: | |
2571 | movups ($inp),$inout0 | |
2572 | movups 16($inp),$inout1 | |
23f6eec7 | 2573 | lea 32($inp),$inp # $inp+=2*16 |
f8501464 AP |
2574 | xorps @tweak[0],$inout0 |
2575 | xorps @tweak[1],$inout1 | |
6c83629b | 2576 | |
214368ff | 2577 | call _aesni_decrypt2 |
6c83629b | 2578 | |
f8501464 AP |
2579 | xorps @tweak[0],$inout0 |
2580 | movdqa @tweak[2],@tweak[0] | |
2581 | xorps @tweak[1],$inout1 | |
2582 | movdqa @tweak[3],@tweak[1] | |
23f6eec7 | 2583 | movups $inout0,($out) # store 2 output blocks |
f8501464 | 2584 | movups $inout1,16*1($out) |
23f6eec7 | 2585 | lea 16*2($out),$out # $out+=2*16 |
f8501464 | 2586 | jmp .Lxts_dec_done |
6c83629b | 2587 | |
f8501464 AP |
2588 | .align 16 |
2589 | .Lxts_dec_three: | |
2590 | movups ($inp),$inout0 | |
2591 | movups 16*1($inp),$inout1 | |
2592 | movups 16*2($inp),$inout2 | |
23f6eec7 | 2593 | lea 16*3($inp),$inp # $inp+=3*16 |
f8501464 AP |
2594 | xorps @tweak[0],$inout0 |
2595 | xorps @tweak[1],$inout1 | |
2596 | xorps @tweak[2],$inout2 | |
6c83629b | 2597 | |
f8501464 | 2598 | call _aesni_decrypt3 |
6c83629b | 2599 | |
f8501464 AP |
2600 | xorps @tweak[0],$inout0 |
2601 | movdqa @tweak[3],@tweak[0] | |
2602 | xorps @tweak[1],$inout1 | |
36df342f | 2603 | movdqa @tweak[4],@tweak[1] |
f8501464 | 2604 | xorps @tweak[2],$inout2 |
23f6eec7 | 2605 | movups $inout0,($out) # store 3 output blocks |
f8501464 AP |
2606 | movups $inout1,16*1($out) |
2607 | movups $inout2,16*2($out) | |
23f6eec7 | 2608 | lea 16*3($out),$out # $out+=3*16 |
f8501464 | 2609 | jmp .Lxts_dec_done |
6c83629b AP |
2610 | |
2611 | .align 16 | |
f8501464 | 2612 | .Lxts_dec_four: |
36df342f AP |
2613 | movups ($inp),$inout0 |
2614 | movups 16*1($inp),$inout1 | |
f8501464 AP |
2615 | movups 16*2($inp),$inout2 |
2616 | xorps @tweak[0],$inout0 | |
2617 | movups 16*3($inp),$inout3 | |
23f6eec7 | 2618 | lea 16*4($inp),$inp # $inp+=4*16 |
f8501464 AP |
2619 | xorps @tweak[1],$inout1 |
2620 | xorps @tweak[2],$inout2 | |
2621 | xorps @tweak[3],$inout3 | |
2622 | ||
2623 | call _aesni_decrypt4 | |
2624 | ||
36df342f | 2625 | pxor @tweak[0],$inout0 |
f8501464 | 2626 | movdqa @tweak[4],@tweak[0] |
36df342f | 2627 | pxor @tweak[1],$inout1 |
f8501464 | 2628 | movdqa @tweak[5],@tweak[1] |
36df342f | 2629 | pxor @tweak[2],$inout2 |
23f6eec7 | 2630 | movdqu $inout0,($out) # store 4 output blocks |
36df342f AP |
2631 | pxor @tweak[3],$inout3 |
2632 | movdqu $inout1,16*1($out) | |
2633 | movdqu $inout2,16*2($out) | |
2634 | movdqu $inout3,16*3($out) | |
23f6eec7 | 2635 | lea 16*4($out),$out # $out+=4*16 |
f8501464 | 2636 | jmp .Lxts_dec_done |
6c83629b AP |
2637 | |
2638 | .align 16 | |
f8501464 | 2639 | .Lxts_dec_done: |
23f6eec7 | 2640 | and \$15,$len_ # see if $len%16 is 0 |
f8501464 AP |
2641 | jz .Lxts_dec_ret |
2642 | .Lxts_dec_done2: | |
2643 | mov $len_,$len | |
2644 | mov $key_,$key # restore $key | |
2645 | mov $rnds_,$rounds # restore $rounds | |
6c83629b | 2646 | |
f8501464 AP |
2647 | movups ($inp),$inout0 |
2648 | xorps @tweak[1],$inout0 | |
2649 | ___ | |
2650 | &aesni_generate1("dec",$key,$rounds); | |
2651 | $code.=<<___; | |
2652 | xorps @tweak[1],$inout0 | |
2653 | movups $inout0,($out) | |
2654 | ||
2655 | .Lxts_dec_steal: | |
2656 | movzb 16($inp),%eax # borrow $rounds ... | |
2657 | movzb ($out),%ecx # ... and $key | |
2658 | lea 1($inp),$inp | |
2659 | mov %al,($out) | |
2660 | mov %cl,16($out) | |
2661 | lea 1($out),$out | |
2662 | sub \$1,$len | |
2663 | jnz .Lxts_dec_steal | |
2664 | ||
2665 | sub $len_,$out # rewind $out | |
2666 | mov $key_,$key # restore $key | |
2667 | mov $rnds_,$rounds # restore $rounds | |
2668 | ||
2669 | movups ($out),$inout0 | |
2670 | xorps @tweak[0],$inout0 | |
6c83629b | 2671 | ___ |
f8501464 AP |
2672 | &aesni_generate1("dec",$key,$rounds); |
2673 | $code.=<<___; | |
2674 | xorps @tweak[0],$inout0 | |
2675 | movups $inout0,($out) | |
6c83629b | 2676 | |
f8501464 | 2677 | .Lxts_dec_ret: |
23f6eec7 AP |
2678 | xorps %xmm0,%xmm0 # clear register bank |
2679 | pxor %xmm1,%xmm1 | |
2680 | pxor %xmm2,%xmm2 | |
2681 | pxor %xmm3,%xmm3 | |
2682 | pxor %xmm4,%xmm4 | |
2683 | pxor %xmm5,%xmm5 | |
2684 | ___ | |
2685 | $code.=<<___ if (!$win64); | |
2686 | pxor %xmm6,%xmm6 | |
2687 | pxor %xmm7,%xmm7 | |
2688 | movaps %xmm0,0x00(%rsp) # clear stack | |
2689 | pxor %xmm8,%xmm8 | |
2690 | movaps %xmm0,0x10(%rsp) | |
2691 | pxor %xmm9,%xmm9 | |
2692 | movaps %xmm0,0x20(%rsp) | |
2693 | pxor %xmm10,%xmm10 | |
2694 | movaps %xmm0,0x30(%rsp) | |
2695 | pxor %xmm11,%xmm11 | |
2696 | movaps %xmm0,0x40(%rsp) | |
2697 | pxor %xmm12,%xmm12 | |
2698 | movaps %xmm0,0x50(%rsp) | |
2699 | pxor %xmm13,%xmm13 | |
2700 | movaps %xmm0,0x60(%rsp) | |
2701 | pxor %xmm14,%xmm14 | |
2702 | pxor %xmm15,%xmm15 | |
f8501464 | 2703 | ___ |
6c83629b | 2704 | $code.=<<___ if ($win64); |
384e6de4 AP |
2705 | movaps -0xa8(%r11),%xmm6 |
2706 | movaps %xmm0,-0xa8(%r11) # clear stack | |
2707 | movaps -0x98(%r11),%xmm7 | |
2708 | movaps %xmm0,-0x98(%r11) | |
2709 | movaps -0x88(%r11),%xmm8 | |
2710 | movaps %xmm0,-0x88(%r11) | |
2711 | movaps -0x78(%r11),%xmm9 | |
2712 | movaps %xmm0,-0x78(%r11) | |
2713 | movaps -0x68(%r11),%xmm10 | |
2714 | movaps %xmm0,-0x68(%r11) | |
2715 | movaps -0x58(%r11),%xmm11 | |
2716 | movaps %xmm0,-0x58(%r11) | |
2717 | movaps -0x48(%r11),%xmm12 | |
2718 | movaps %xmm0,-0x48(%r11) | |
2719 | movaps -0x38(%r11),%xmm13 | |
2720 | movaps %xmm0,-0x38(%r11) | |
2721 | movaps -0x28(%r11),%xmm14 | |
2722 | movaps %xmm0,-0x28(%r11) | |
2723 | movaps -0x18(%r11),%xmm15 | |
2724 | movaps %xmm0,-0x18(%r11) | |
23f6eec7 AP |
2725 | movaps %xmm0,0x00(%rsp) |
2726 | movaps %xmm0,0x10(%rsp) | |
2727 | movaps %xmm0,0x20(%rsp) | |
2728 | movaps %xmm0,0x30(%rsp) | |
2729 | movaps %xmm0,0x40(%rsp) | |
2730 | movaps %xmm0,0x50(%rsp) | |
2731 | movaps %xmm0,0x60(%rsp) | |
6c83629b AP |
2732 | ___ |
2733 | $code.=<<___; | |
384e6de4 | 2734 | mov -8(%r11),%rbp |
b84460ad | 2735 | .cfi_restore %rbp |
384e6de4 | 2736 | lea (%r11),%rsp |
b84460ad | 2737 | .cfi_def_cfa_register %rsp |
f8501464 | 2738 | .Lxts_dec_epilogue: |
6c83629b | 2739 | ret |
b84460ad | 2740 | .cfi_endproc |
f8501464 | 2741 | .size aesni_xts_decrypt,.-aesni_xts_decrypt |
6c83629b | 2742 | ___ |
bd30091c AP |
2743 | } |
2744 | \f | |
2745 | ###################################################################### | |
2746 | # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, | |
2747 | # const AES_KEY *key, unsigned int start_block_num, | |
2748 | # unsigned char offset_i[16], const unsigned char L_[][16], | |
2749 | # unsigned char checksum[16]); | |
2750 | # | |
2751 | { | |
# Register assignments shared by the OCB routines generated below.
#   @offset      %xmm10..%xmm15, per-block OCB offsets for up to six blocks;
#                @offset[5] carries the most recent offset between calls.
#   $checksum    %xmm8, running checksum accumulator.
#   $rndkey0l    %xmm9, loaded in the prologue with round[0]^round[last].
#   $block_num,  %r8 and %r9, the 5th and 6th arguments.
#   $offset_p
#   $L_p,        %rbx and %rbp, filled from the 7th and 8th (stack)
#   $checksum_p  arguments in the prologue.
#   $i1,$i3,$i5  %r12..%r14, byte offsets into the L_ table.
#   $seventh_arg displacement of the 7th argument from the entry %rsp
#                (56 when $win64 is set, 8 otherwise).
#   $blocks      alias of $len; in these routines it counts blocks.
2752 | my @offset=map("%xmm$_",(10..15)); | |
2753 | my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); | |
2754 | my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments | |
2755 | my ($L_p,$checksum_p) = ("%rbx","%rbp"); | |
2756 | my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); | |
2757 | my $seventh_arg = $win64 ? 56 : 8; | |
2758 | my $blocks = $len; | |
2759 | |
# aesni_ocb_encrypt: OCB-encrypt a run of whole 16-byte blocks.
# Flow of the generated code:
#   1. push %rbx, %rbp, %r12-%r14 (and, when $win64 is set, spill
#      %xmm6-%xmm15 to the stack);
#   2. fetch the 7th/8th arguments, compute round[0]^round[last], load the
#      last offset_i (xored with round[last]), L_0 and the checksum;
#   3. if the starting block number is even, encrypt one block with
#      __ocb_encrypt1 so that the remaining run starts on an odd number;
#   4. encrypt six blocks per iteration with __ocb_encrypt6, then the 1..5
#      leftover blocks via __ocb_encrypt1/__ocb_encrypt4/__ocb_encrypt6;
#   5. write the checksum and offset_i back, zero the xmm registers that
#      are not restored, and pop the saved general-purpose registers.
# NOTE(review): $rounds/$rnds_ are defined outside this excerpt; the code
# mixes them with explicit %rax/%r10, so presumably they alias those
# registers -- confirm against the declarations at the top of the file.
2760 | $code.=<<___; | |
2761 | .globl aesni_ocb_encrypt | |
2762 | .type aesni_ocb_encrypt,\@function,6 | |
2763 | .align 32 | |
2764 | aesni_ocb_encrypt: | |
b84460ad | 2765 | .cfi_startproc |
bd30091c AP |
2766 | lea (%rsp),%rax |
2767 | push %rbx | |
b84460ad | 2768 | .cfi_push %rbx |
bd30091c | 2769 | push %rbp |
b84460ad | 2770 | .cfi_push %rbp |
bd30091c | 2771 | push %r12 |
b84460ad | 2772 | .cfi_push %r12 |
bd30091c | 2773 | push %r13 |
b84460ad | 2774 | .cfi_push %r13 |
bd30091c | 2775 | push %r14 |
b84460ad | 2776 | .cfi_push %r14 |
bd30091c AP |
2777 | ___ |
# Win64 only: reserve 0xa0 bytes (ten 16-byte slots) below the pushed
# registers and save %xmm6-%xmm15 there.
2778 | $code.=<<___ if ($win64); | |
2779 | lea -0xa0(%rsp),%rsp | |
2780 | movaps %xmm6,0x00(%rsp) # offload everything | |
2781 | movaps %xmm7,0x10(%rsp) | |
2782 | movaps %xmm8,0x20(%rsp) | |
2783 | movaps %xmm9,0x30(%rsp) | |
2784 | movaps %xmm10,0x40(%rsp) | |
2785 | movaps %xmm11,0x50(%rsp) | |
2786 | movaps %xmm12,0x60(%rsp) | |
2787 | movaps %xmm13,0x70(%rsp) | |
2788 | movaps %xmm14,0x80(%rsp) | |
2789 | movaps %xmm15,0x90(%rsp) | |
2790 | .Locb_enc_body: | |
2791 | ___ | |
# Common body: %rax still holds the %rsp value from function entry, so the
# stack arguments are addressed relative to it.
2792 | $code.=<<___; | |
2793 | mov $seventh_arg(%rax),$L_p # 7th argument | |
2794 | mov $seventh_arg+8(%rax),$checksum_p# 8th argument | |
2795 | |
2796 | mov 240($key),$rnds_ | |
2797 | mov $key,$key_ | |
2798 | shl \$4,$rnds_ | |
2799 | $movkey ($key),$rndkey0l # round[0] | |
2800 | $movkey 16($key,$rnds_),$rndkey1 # round[last] | |
2801 | |
2802 | movdqu ($offset_p),@offset[5] # load last offset_i | |
2803 | pxor $rndkey1,$rndkey0l # round[0] ^ round[last] | |
2804 | pxor $rndkey1,@offset[5] # offset_i ^ round[last] | |
2805 | |
2806 | mov \$16+32,$rounds | |
2807 | lea 32($key_,$rnds_),$key | |
2808 | $movkey 16($key_),$rndkey1 # round[1] | |
2809 | sub %r10,%rax # twisted $rounds | |
2810 | mov %rax,%r10 # backup twisted $rounds | |
2811 | |
2812 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
2813 | movdqu ($checksum_p),$checksum # load checksum | |
2814 | |
2815 | test \$1,$block_num # is first block number odd? | |
2816 | jnz .Locb_enc_odd | |
2817 | |
# even starting block number: handle that single block on its own so the
# six-block code below always begins with an odd-numbered block
2818 | bsf $block_num,$i1 | |
2819 | add \$1,$block_num | |
2820 | shl \$4,$i1 | |
2821 | movdqu ($L_p,$i1),$inout5 # borrow | |
2822 | movdqu ($inp),$inout0 | |
2823 | lea 16($inp),$inp | |
2824 | |
2825 | call __ocb_encrypt1 | |
2826 | |
2827 | movdqa $inout5,@offset[5] | |
2828 | movups $inout0,($out) | |
2829 | lea 16($out),$out | |
2830 | sub \$1,$blocks | |
2831 | jz .Locb_enc_done | |
2832 | |
2833 | .Locb_enc_odd: | |
2834 | lea 1($block_num),$i1 # even-numbered blocks | |
2835 | lea 3($block_num),$i3 | |
2836 | lea 5($block_num),$i5 | |
2837 | lea 6($block_num),$block_num | |
2838 | bsf $i1,$i1 # ntz(block) | |
2839 | bsf $i3,$i3 | |
2840 | bsf $i5,$i5 | |
2841 | shl \$4,$i1 # ntz(block) -> table offset | |
2842 | shl \$4,$i3 | |
2843 | shl \$4,$i5 | |
2844 | |
2845 | sub \$6,$blocks | |
2846 | jc .Locb_enc_short | |
2847 | jmp .Locb_enc_grandloop | |
2848 | |
2849 | .align 32 | |
2850 | .Locb_enc_grandloop: | |
2851 | movdqu `16*0`($inp),$inout0 # load input | |
2852 | movdqu `16*1`($inp),$inout1 | |
2853 | movdqu `16*2`($inp),$inout2 | |
2854 | movdqu `16*3`($inp),$inout3 | |
2855 | movdqu `16*4`($inp),$inout4 | |
2856 | movdqu `16*5`($inp),$inout5 | |
2857 | lea `16*6`($inp),$inp | |
2858 | |
2859 | call __ocb_encrypt6 | |
2860 | |
2861 | movups $inout0,`16*0`($out) # store output | |
2862 | movups $inout1,`16*1`($out) | |
2863 | movups $inout2,`16*2`($out) | |
2864 | movups $inout3,`16*3`($out) | |
2865 | movups $inout4,`16*4`($out) | |
2866 | movups $inout5,`16*5`($out) | |
2867 | lea `16*6`($out),$out | |
2868 | sub \$6,$blocks | |
2869 | jnc .Locb_enc_grandloop | |
2870 | |
# fewer than six blocks remain: undo the last subtraction of 6 and dispatch
# on the real remaining count (0 means finished, otherwise 1 to 5)
2871 | .Locb_enc_short: | |
2872 | add \$6,$blocks | |
2873 | jz .Locb_enc_done | |
2874 | |
2875 | movdqu `16*0`($inp),$inout0 | |
2876 | cmp \$2,$blocks | |
2877 | jb .Locb_enc_one | |
2878 | movdqu `16*1`($inp),$inout1 | |
2879 | je .Locb_enc_two | |
2880 | |
2881 | movdqu `16*2`($inp),$inout2 | |
2882 | cmp \$4,$blocks | |
2883 | jb .Locb_enc_three | |
2884 | movdqu `16*3`($inp),$inout3 | |
2885 | je .Locb_enc_four | |
2886 | |
# five blocks: the sixth lane is zeroed so it adds nothing to the checksum,
# and its output is never stored
2887 | movdqu `16*4`($inp),$inout4 | |
2888 | pxor $inout5,$inout5 | |
2889 | |
2890 | call __ocb_encrypt6 | |
2891 | |
# the offset of the fifth (last real) block becomes the saved offset_i
2892 | movdqa @offset[4],@offset[5] | |
2893 | movups $inout0,`16*0`($out) | |
2894 | movups $inout1,`16*1`($out) | |
2895 | movups $inout2,`16*2`($out) | |
2896 | movups $inout3,`16*3`($out) | |
2897 | movups $inout4,`16*4`($out) | |
2898 | |
2899 | jmp .Locb_enc_done | |
2900 | |
2901 | .align 16 | |
2902 | .Locb_enc_one: | |
2903 | movdqa @offset[0],$inout5 # borrow | |
2904 | |
2905 | call __ocb_encrypt1 | |
2906 | |
2907 | movdqa $inout5,@offset[5] | |
2908 | movups $inout0,`16*0`($out) | |
2909 | jmp .Locb_enc_done | |
2910 | |
# two or three blocks: unused lanes are zeroed before the four-block helper
# runs, and the offset of the last real block is kept as offset_i
2911 | .align 16 | |
2912 | .Locb_enc_two: | |
2913 | pxor $inout2,$inout2 | |
2914 | pxor $inout3,$inout3 | |
2915 | |
2916 | call __ocb_encrypt4 | |
2917 | |
2918 | movdqa @offset[1],@offset[5] | |
2919 | movups $inout0,`16*0`($out) | |
2920 | movups $inout1,`16*1`($out) | |
2921 | |
2922 | jmp .Locb_enc_done | |
2923 | |
2924 | .align 16 | |
2925 | .Locb_enc_three: | |
2926 | pxor $inout3,$inout3 | |
2927 | |
2928 | call __ocb_encrypt4 | |
2929 | |
2930 | movdqa @offset[2],@offset[5] | |
2931 | movups $inout0,`16*0`($out) | |
2932 | movups $inout1,`16*1`($out) | |
2933 | movups $inout2,`16*2`($out) | |
2934 | |
2935 | jmp .Locb_enc_done | |
2936 | |
2937 | .align 16 | |
2938 | .Locb_enc_four: | |
2939 | call __ocb_encrypt4 | |
2940 | |
2941 | movdqa @offset[3],@offset[5] | |
2942 | movups $inout0,`16*0`($out) | |
2943 | movups $inout1,`16*1`($out) | |
2944 | movups $inout2,`16*2`($out) | |
2945 | movups $inout3,`16*3`($out) | |
2946 | |
# NOTE(review): the first pxor below assumes the register it reads still
# holds round[last], which is what the last key load of every encrypt
# helper leaves behind; presumably callers never pass a zero block
# count -- confirm.
2947 | .Locb_enc_done: | |
2948 | pxor $rndkey0,@offset[5] # "remove" round[last] | |
2949 | movdqu $checksum,($checksum_p) # store checksum | |
2950 | movdqu @offset[5],($offset_p) # store last offset_i | |
2951 | |
2952 | xorps %xmm0,%xmm0 # clear register bank | |
2953 | pxor %xmm1,%xmm1 | |
2954 | pxor %xmm2,%xmm2 | |
2955 | pxor %xmm3,%xmm3 | |
2956 | pxor %xmm4,%xmm4 | |
2957 | pxor %xmm5,%xmm5 | |
2958 | ___ | |
# Non-Win64: no xmm registers were spilled, so zero %xmm6-%xmm15 as well;
# 0x28 is the size of the five 8-byte pushes made in the prologue.
2959 | $code.=<<___ if (!$win64); | |
2960 | pxor %xmm6,%xmm6 | |
2961 | pxor %xmm7,%xmm7 | |
2962 | pxor %xmm8,%xmm8 | |
2963 | pxor %xmm9,%xmm9 | |
2964 | pxor %xmm10,%xmm10 | |
2965 | pxor %xmm11,%xmm11 | |
2966 | pxor %xmm12,%xmm12 | |
2967 | pxor %xmm13,%xmm13 | |
2968 | pxor %xmm14,%xmm14 | |
2969 | pxor %xmm15,%xmm15 | |
384e6de4 | 2970 | lea 0x28(%rsp),%rax |
b84460ad | 2971 | .cfi_def_cfa %rax,8 |
bd30091c AP |
2972 | ___ |
# Win64: reload %xmm6-%xmm15 from the 0xa0-byte save area, overwriting each
# slot with the zeroed %xmm0 right after it has been read.
2973 | $code.=<<___ if ($win64); | |
2974 | movaps 0x00(%rsp),%xmm6 | |
2975 | movaps %xmm0,0x00(%rsp) # clear stack | |
2976 | movaps 0x10(%rsp),%xmm7 | |
2977 | movaps %xmm0,0x10(%rsp) | |
2978 | movaps 0x20(%rsp),%xmm8 | |
2979 | movaps %xmm0,0x20(%rsp) | |
2980 | movaps 0x30(%rsp),%xmm9 | |
2981 | movaps %xmm0,0x30(%rsp) | |
2982 | movaps 0x40(%rsp),%xmm10 | |
2983 | movaps %xmm0,0x40(%rsp) | |
2984 | movaps 0x50(%rsp),%xmm11 | |
2985 | movaps %xmm0,0x50(%rsp) | |
2986 | movaps 0x60(%rsp),%xmm12 | |
2987 | movaps %xmm0,0x60(%rsp) | |
2988 | movaps 0x70(%rsp),%xmm13 | |
2989 | movaps %xmm0,0x70(%rsp) | |
2990 | movaps 0x80(%rsp),%xmm14 | |
2991 | movaps %xmm0,0x80(%rsp) | |
2992 | movaps 0x90(%rsp),%xmm15 | |
2993 | movaps %xmm0,0x90(%rsp) | |
2994 | lea 0xa0+0x28(%rsp),%rax | |
2995 | .Locb_enc_pop: | |
bd30091c AP |
2996 | ___ |
# Shared epilogue: %rax points just above the five pushed registers, which
# are reloaded at negative displacements before %rsp is reset to %rax.
2997 | $code.=<<___; | |
384e6de4 | 2998 | mov -40(%rax),%r14 |
b84460ad | 2999 | .cfi_restore %r14 |
384e6de4 | 3000 | mov -32(%rax),%r13 |
b84460ad | 3001 | .cfi_restore %r13 |
384e6de4 | 3002 | mov -24(%rax),%r12 |
b84460ad | 3003 | .cfi_restore %r12 |
384e6de4 | 3004 | mov -16(%rax),%rbp |
b84460ad | 3005 | .cfi_restore %rbp |
384e6de4 | 3006 | mov -8(%rax),%rbx |
b84460ad | 3007 | .cfi_restore %rbx |
384e6de4 | 3008 | lea (%rax),%rsp |
b84460ad | 3009 | .cfi_def_cfa_register %rsp |
bd30091c AP |
3010 | .Locb_enc_epilogue: |
3011 | ret | |
b84460ad | 3012 | .cfi_endproc |
bd30091c AP |
3013 | .size aesni_ocb_encrypt,.-aesni_ocb_encrypt |
3014 | ||
# __ocb_encrypt6 - encrypt the six blocks held in the six inout registers.
# Expected on entry (as arranged by aesni_ocb_encrypt): offset[0] = L_0,
# offset[5] = previous offset_i ^ round[last], i1/i3/i5 = byte offsets into
# the L_ table for the three even-numbered blocks, rndkey1 = round[1], and
# rax = round-loop counter (a copy is kept in r10).
# On return: ciphertext is in the inout registers, offset[5] holds the
# newest offset_i ^ round[last], the checksum has absorbed all six inputs,
# and offset[0], i1/i3/i5, block_num, rndkey1 and rax are set up again for
# the next call.
3015 | .type __ocb_encrypt6,\@abi-omnipotent | |
3016 | .align 32 | |
3017 | __ocb_encrypt6: | |
# Offsets are chained by XOR: lanes 0/2/4 add L_0, lanes 1/3/5 add the L_
# entry selected by i1/i3/i5. Each offset still includes round[0] here, so
# xoring it into the input also performs the initial key whitening.
3018 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3019 | movdqu ($L_p,$i1),@offset[1] | |
3020 | movdqa @offset[0],@offset[2] | |
3021 | movdqu ($L_p,$i3),@offset[3] | |
3022 | movdqa @offset[0],@offset[4] | |
3023 | pxor @offset[5],@offset[0] | |
3024 | movdqu ($L_p,$i5),@offset[5] | |
3025 | pxor @offset[0],@offset[1] | |
3026 | pxor $inout0,$checksum # accumulate checksum | |
3027 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3028 | pxor @offset[1],@offset[2] | |
3029 | pxor $inout1,$checksum | |
3030 | pxor @offset[1],$inout1 | |
3031 | pxor @offset[2],@offset[3] | |
3032 | pxor $inout2,$checksum | |
3033 | pxor @offset[2],$inout2 | |
3034 | pxor @offset[3],@offset[4] | |
3035 | pxor $inout3,$checksum | |
3036 | pxor @offset[3],$inout3 | |
3037 | pxor @offset[4],@offset[5] | |
3038 | pxor $inout4,$checksum | |
3039 | pxor @offset[4],$inout4 | |
3040 | pxor $inout5,$checksum | |
3041 | pxor @offset[5],$inout5 | |
3042 | $movkey 32($key_),$rndkey0 | |
3043 | |
# Table offsets for the NEXT group of six blocks are computed here, in
# between the AES rounds of the current group.
3044 | lea 1($block_num),$i1 # even-numbered blocks | |
3045 | lea 3($block_num),$i3 | |
3046 | lea 5($block_num),$i5 | |
3047 | add \$6,$block_num | |
3048 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3049 | bsf $i1,$i1 # ntz(block) | |
3050 | bsf $i3,$i3 | |
3051 | bsf $i5,$i5 | |
3052 | |
3053 | aesenc $rndkey1,$inout0 | |
3054 | aesenc $rndkey1,$inout1 | |
3055 | aesenc $rndkey1,$inout2 | |
3056 | aesenc $rndkey1,$inout3 | |
3057 | pxor $rndkey0l,@offset[1] | |
3058 | pxor $rndkey0l,@offset[2] | |
3059 | aesenc $rndkey1,$inout4 | |
3060 | pxor $rndkey0l,@offset[3] | |
3061 | pxor $rndkey0l,@offset[4] | |
3062 | aesenc $rndkey1,$inout5 | |
3063 | $movkey 48($key_),$rndkey1 | |
3064 | pxor $rndkey0l,@offset[5] | |
3065 | |
3066 | aesenc $rndkey0,$inout0 | |
3067 | aesenc $rndkey0,$inout1 | |
3068 | aesenc $rndkey0,$inout2 | |
3069 | aesenc $rndkey0,$inout3 | |
3070 | aesenc $rndkey0,$inout4 | |
3071 | aesenc $rndkey0,$inout5 | |
3072 | $movkey 64($key_),$rndkey0 | |
3073 | shl \$4,$i1 # ntz(block) -> table offset | |
3074 | shl \$4,$i3 | |
3075 | jmp .Locb_enc_loop6 | |
3076 | |
# Two rounds per iteration; rax is stepped by 32 and the loop ends when it
# reaches zero (the key load after the add does not change the flags).
3077 | .align 32 | |
3078 | .Locb_enc_loop6: | |
3079 | aesenc $rndkey1,$inout0 | |
3080 | aesenc $rndkey1,$inout1 | |
3081 | aesenc $rndkey1,$inout2 | |
3082 | aesenc $rndkey1,$inout3 | |
3083 | aesenc $rndkey1,$inout4 | |
3084 | aesenc $rndkey1,$inout5 | |
3085 | $movkey ($key,%rax),$rndkey1 | |
3086 | add \$32,%rax | |
3087 | |
3088 | aesenc $rndkey0,$inout0 | |
3089 | aesenc $rndkey0,$inout1 | |
3090 | aesenc $rndkey0,$inout2 | |
3091 | aesenc $rndkey0,$inout3 | |
3092 | aesenc $rndkey0,$inout4 | |
3093 | aesenc $rndkey0,$inout5 | |
3094 | $movkey -16($key,%rax),$rndkey0 | |
3095 | jnz .Locb_enc_loop6 | |
3096 | |
3097 | aesenc $rndkey1,$inout0 | |
3098 | aesenc $rndkey1,$inout1 | |
3099 | aesenc $rndkey1,$inout2 | |
3100 | aesenc $rndkey1,$inout3 | |
3101 | aesenc $rndkey1,$inout4 | |
3102 | aesenc $rndkey1,$inout5 | |
3103 | $movkey 16($key_),$rndkey1 | |
3104 | shl \$4,$i5 | |
3105 | |
# The offsets now hold offset_i ^ round[last], so using them as the final
# round key applies the last round key and the output offset together.
3106 | aesenclast @offset[0],$inout0 | |
3107 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
3108 | mov %r10,%rax # restore twisted rounds | |
3109 | aesenclast @offset[1],$inout1 | |
3110 | aesenclast @offset[2],$inout2 | |
3111 | aesenclast @offset[3],$inout3 | |
3112 | aesenclast @offset[4],$inout4 | |
3113 | aesenclast @offset[5],$inout5 | |
3114 | ret | |
3115 | .size __ocb_encrypt6,.-__ocb_encrypt6 | |
3116 | ||
# __ocb_encrypt4 - encrypt the blocks held in the first four inout
# registers. Entry state matches __ocb_encrypt6 (offset[0] = L_0,
# offset[5] = previous offset_i ^ round[last], i1/i3 = L_ table offsets),
# but only offset[0..3] are produced. Unlike the six-block routine, this
# one does not recompute i1/i3, advance block_num or reload offset[0];
# the callers visible in this file use it only for the final 2..4 blocks
# and pick the offset of the last real block themselves.
3117 | .type __ocb_encrypt4,\@abi-omnipotent | |
3118 | .align 32 | |
3119 | __ocb_encrypt4: | |
3120 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3121 | movdqu ($L_p,$i1),@offset[1] | |
3122 | movdqa @offset[0],@offset[2] | |
3123 | movdqu ($L_p,$i3),@offset[3] | |
3124 | pxor @offset[5],@offset[0] | |
3125 | pxor @offset[0],@offset[1] | |
3126 | pxor $inout0,$checksum # accumulate checksum | |
3127 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3128 | pxor @offset[1],@offset[2] | |
3129 | pxor $inout1,$checksum | |
3130 | pxor @offset[1],$inout1 | |
3131 | pxor @offset[2],@offset[3] | |
3132 | pxor $inout2,$checksum | |
3133 | pxor @offset[2],$inout2 | |
3134 | pxor $inout3,$checksum | |
3135 | pxor @offset[3],$inout3 | |
3136 | $movkey 32($key_),$rndkey0 | |
3137 | |
3138 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3139 | pxor $rndkey0l,@offset[1] | |
3140 | pxor $rndkey0l,@offset[2] | |
3141 | pxor $rndkey0l,@offset[3] | |
3142 | |
3143 | aesenc $rndkey1,$inout0 | |
3144 | aesenc $rndkey1,$inout1 | |
3145 | aesenc $rndkey1,$inout2 | |
3146 | aesenc $rndkey1,$inout3 | |
3147 | $movkey 48($key_),$rndkey1 | |
3148 | |
3149 | aesenc $rndkey0,$inout0 | |
3150 | aesenc $rndkey0,$inout1 | |
3151 | aesenc $rndkey0,$inout2 | |
3152 | aesenc $rndkey0,$inout3 | |
3153 | $movkey 64($key_),$rndkey0 | |
3154 | jmp .Locb_enc_loop4 | |
3155 | |
# Two rounds per iteration; rax is stepped by 32 and the loop ends when it
# reaches zero (the key load after the add does not change the flags).
3156 | .align 32 | |
3157 | .Locb_enc_loop4: | |
3158 | aesenc $rndkey1,$inout0 | |
3159 | aesenc $rndkey1,$inout1 | |
3160 | aesenc $rndkey1,$inout2 | |
3161 | aesenc $rndkey1,$inout3 | |
3162 | $movkey ($key,%rax),$rndkey1 | |
3163 | add \$32,%rax | |
3164 | |
3165 | aesenc $rndkey0,$inout0 | |
3166 | aesenc $rndkey0,$inout1 | |
3167 | aesenc $rndkey0,$inout2 | |
3168 | aesenc $rndkey0,$inout3 | |
3169 | $movkey -16($key,%rax),$rndkey0 | |
3170 | jnz .Locb_enc_loop4 | |
3171 | |
3172 | aesenc $rndkey1,$inout0 | |
3173 | aesenc $rndkey1,$inout1 | |
3174 | aesenc $rndkey1,$inout2 | |
3175 | aesenc $rndkey1,$inout3 | |
3176 | $movkey 16($key_),$rndkey1 | |
3177 | mov %r10,%rax # restore twisted rounds | |
3178 | |
# The offsets hold offset_i ^ round[last], so they serve as the final round
# key and apply the output offset in the same instruction.
3179 | aesenclast @offset[0],$inout0 | |
3180 | aesenclast @offset[1],$inout1 | |
3181 | aesenclast @offset[2],$inout2 | |
3182 | aesenclast @offset[3],$inout3 | |
3183 | ret | |
3184 | .size __ocb_encrypt4,.-__ocb_encrypt4 | |
3185 | ||
# __ocb_encrypt1 - encrypt the single block held in inout0.
# On entry the caller has placed this block's L_ table entry in inout5 (a
# "borrowed" register) and offset[5] holds the previous
# offset_i ^ round[last]. On return inout0 is the ciphertext and inout5
# holds the new offset_i ^ round[last], which the callers copy back into
# offset[5]. The input block is also folded into the checksum.
3186 | .type __ocb_encrypt1,\@abi-omnipotent | |
3187 | .align 32 | |
3188 | __ocb_encrypt1: | |
3189 | pxor @offset[5],$inout5 # offset_i | |
3190 | pxor $rndkey0l,$inout5 # offset_i ^ round[0] | |
3191 | pxor $inout0,$checksum # accumulate checksum | |
3192 | pxor $inout5,$inout0 # input ^ round[0] ^ offset_i | |
3193 | $movkey 32($key_),$rndkey0 | |
3194 | |
3195 | aesenc $rndkey1,$inout0 | |
3196 | $movkey 48($key_),$rndkey1 | |
3197 | pxor $rndkey0l,$inout5 # offset_i ^ round[last] | |
3198 | |
3199 | aesenc $rndkey0,$inout0 | |
3200 | $movkey 64($key_),$rndkey0 | |
3201 | jmp .Locb_enc_loop1 | |
3202 | |
# Two rounds per iteration; rax is stepped by 32 and the loop ends when it
# reaches zero (the key load after the add does not change the flags).
3203 | .align 32 | |
3204 | .Locb_enc_loop1: | |
3205 | aesenc $rndkey1,$inout0 | |
3206 | $movkey ($key,%rax),$rndkey1 | |
3207 | add \$32,%rax | |
3208 | |
3209 | aesenc $rndkey0,$inout0 | |
3210 | $movkey -16($key,%rax),$rndkey0 | |
3211 | jnz .Locb_enc_loop1 | |
3212 | |
3213 | aesenc $rndkey1,$inout0 | |
3214 | $movkey 16($key_),$rndkey1 # redundant in tail | |
3215 | mov %r10,%rax # restore twisted rounds | |
3216 | |
# inout5 is offset_i ^ round[last]: final round key and output offset at once
3217 | aesenclast $inout5,$inout0 | |
3218 | ret | |
3219 | .size __ocb_encrypt1,.-__ocb_encrypt1 | |
3220 | ||
# aesni_ocb_decrypt -- OCB bulk-decrypt entry point.
# Declared with six register arguments (\@function,6); a 7th and an 8th
# argument are read from the caller's stack into $L_p and $checksum_p.
# Flow:
#   1. save %rbx, %rbp, %r12-%r14 (plus %xmm6-%xmm15 on Win64);
#   2. prepare key-schedule pointers, the round[0]^round[last] mask and the
#      running offset;
#   3. if $block_num is even, decrypt one leading block on its own;
#   4. decrypt six blocks per pass in .Locb_dec_grandloop;
#   5. decrypt a remainder of 1..5 blocks;
#   6. store the checksum and the last offset, scrub the XMM registers,
#      restore the saved registers and return.
3221 | .globl aesni_ocb_decrypt | |
3222 | .type aesni_ocb_decrypt,\@function,6 | |
3223 | .align 32 | |
3224 | aesni_ocb_decrypt: | |
b84460ad | 3225 | .cfi_startproc |
bd30091c AP |
# %rax keeps the entry %rsp so the stack arguments can still be addressed
# after the pushes below.
3226 | lea (%rsp),%rax |
3227 | push %rbx | |
b84460ad | 3228 | .cfi_push %rbx |
bd30091c | 3229 | push %rbp |
b84460ad | 3230 | .cfi_push %rbp |
bd30091c | 3231 | push %r12 |
b84460ad | 3232 | .cfi_push %r12 |
bd30091c | 3233 | push %r13 |
b84460ad | 3234 | .cfi_push %r13 |
bd30091c | 3235 | push %r14 |
b84460ad | 3236 | .cfi_push %r14 |
bd30091c AP |
3237 | ___ |
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
3238 | $code.=<<___ if ($win64); | |
3239 | lea -0xa0(%rsp),%rsp | |
3240 | movaps %xmm6,0x00(%rsp) # offload everything | |
3241 | movaps %xmm7,0x10(%rsp) | |
3242 | movaps %xmm8,0x20(%rsp) | |
3243 | movaps %xmm9,0x30(%rsp) | |
3244 | movaps %xmm10,0x40(%rsp) | |
3245 | movaps %xmm11,0x50(%rsp) | |
3246 | movaps %xmm12,0x60(%rsp) | |
3247 | movaps %xmm13,0x70(%rsp) | |
3248 | movaps %xmm14,0x80(%rsp) | |
3249 | movaps %xmm15,0x90(%rsp) | |
3250 | .Locb_dec_body: | |
3251 | ___ | |
3252 | $code.=<<___; | |
3253 | mov $seventh_arg(%rax),$L_p # 7th argument | |
3254 | mov $seventh_arg+8(%rax),$checksum_p# 8th argument | |
3255 | ||
# $rnds_ becomes 16 * (the value stored at 240($key)); $key_ keeps the base
# of the key schedule for the rest of the function.
3256 | mov 240($key),$rnds_ | |
3257 | mov $key,$key_ | |
3258 | shl \$4,$rnds_ | |
3259 | $movkey ($key),$rndkey0l # round[0] | |
3260 | $movkey 16($key,$rnds_),$rndkey1 # round[last] | |
3261 | ||
3262 | movdqu ($offset_p),@offset[5] # load last offset_i | |
3263 | pxor $rndkey1,$rndkey0l # round[0] ^ round[last] | |
3264 | pxor $rndkey1,@offset[5] # offset_i ^ round[last] | |
3265 | ||
# $key is re-pointed one 16-byte slot past round[last]; the helpers index the
# remaining round keys from it with %rax.
# NOTE(review): the sub/mov below operate on %rax and %r10 directly --
# presumably these alias $rounds and $rnds_; confirm against the variable
# declarations, which are outside this block.
3266 | mov \$16+32,$rounds | |
3267 | lea 32($key_,$rnds_),$key | |
3268 | $movkey 16($key_),$rndkey1 # round[1] | |
3269 | sub %r10,%rax # twisted $rounds | |
3270 | mov %rax,%r10 # backup twisted $rounds | |
3271 | ||
3272 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
3273 | movdqu ($checksum_p),$checksum # load checksum | |
3274 | ||
3275 | test \$1,$block_num # is first block number odd? | |
3276 | jnz .Locb_dec_odd | |
3277 | ||
# Even $block_num: handle one block on its own. The table entry is selected
# by bsf($block_num) scaled by 16, and $block_num is bumped so that what
# follows starts on an odd block number.
3278 | bsf $block_num,$i1 | |
3279 | add \$1,$block_num | |
3280 | shl \$4,$i1 | |
3281 | movdqu ($L_p,$i1),$inout5 # borrow | |
3282 | movdqu ($inp),$inout0 | |
3283 | lea 16($inp),$inp | |
3284 | ||
3285 | call __ocb_decrypt1 | |
3286 | ||
# The helper leaves the new running offset in $inout5; the checksum is
# accumulated over the decrypted output.
3287 | movdqa $inout5,@offset[5] | |
3288 | movups $inout0,($out) | |
3289 | xorps $inout0,$checksum # accumulate checksum | |
3290 | lea 16($out),$out | |
3291 | sub \$1,$blocks | |
3292 | jz .Locb_dec_done | |
3293 | ||
# Pre-compute the table byte offsets for the three even-numbered blocks of
# the coming group of six and advance $block_num by 6.
3294 | .Locb_dec_odd: | |
3295 | lea 1($block_num),$i1 # even-numbered blocks | |
3296 | lea 3($block_num),$i3 | |
3297 | lea 5($block_num),$i5 | |
3298 | lea 6($block_num),$block_num | |
3299 | bsf $i1,$i1 # ntz(block) | |
3300 | bsf $i3,$i3 | |
3301 | bsf $i5,$i5 | |
3302 | shl \$4,$i1 # ntz(block) -> table offset | |
3303 | shl \$4,$i3 | |
3304 | shl \$4,$i5 | |
3305 | ||
# From here $blocks is biased by -6; a borrow means fewer than six remain.
3306 | sub \$6,$blocks | |
3307 | jc .Locb_dec_short | |
3308 | jmp .Locb_dec_grandloop | |
3309 | ||
# Main loop: six blocks per pass. __ocb_decrypt6 also prepares the table
# offsets and $block_num for the next pass.
3310 | .align 32 | |
3311 | .Locb_dec_grandloop: | |
3312 | movdqu `16*0`($inp),$inout0 # load input | |
3313 | movdqu `16*1`($inp),$inout1 | |
3314 | movdqu `16*2`($inp),$inout2 | |
3315 | movdqu `16*3`($inp),$inout3 | |
3316 | movdqu `16*4`($inp),$inout4 | |
3317 | movdqu `16*5`($inp),$inout5 | |
3318 | lea `16*6`($inp),$inp | |
3319 | ||
3320 | call __ocb_decrypt6 | |
3321 | ||
3322 | movups $inout0,`16*0`($out) # store output | |
3323 | pxor $inout0,$checksum # accumulate checksum | |
3324 | movups $inout1,`16*1`($out) | |
3325 | pxor $inout1,$checksum | |
3326 | movups $inout2,`16*2`($out) | |
3327 | pxor $inout2,$checksum | |
3328 | movups $inout3,`16*3`($out) | |
3329 | pxor $inout3,$checksum | |
3330 | movups $inout4,`16*4`($out) | |
3331 | pxor $inout4,$checksum | |
3332 | movups $inout5,`16*5`($out) | |
3333 | pxor $inout5,$checksum | |
3334 | lea `16*6`($out),$out | |
3335 | sub \$6,$blocks | |
3336 | jnc .Locb_dec_grandloop | |
3337 | ||
# Remainder: undo the -6 bias, leaving 0..5 blocks. Each cmp below feeds both
# the jb and the following je; the movdqu in between is relied upon not to
# disturb the flags.
3338 | .Locb_dec_short: | |
3339 | add \$6,$blocks | |
3340 | jz .Locb_dec_done | |
3341 | ||
3342 | movdqu `16*0`($inp),$inout0 | |
3343 | cmp \$2,$blocks | |
3344 | jb .Locb_dec_one | |
3345 | movdqu `16*1`($inp),$inout1 | |
3346 | je .Locb_dec_two | |
3347 | ||
3348 | movdqu `16*2`($inp),$inout2 | |
3349 | cmp \$4,$blocks | |
3350 | jb .Locb_dec_three | |
3351 | movdqu `16*3`($inp),$inout3 | |
3352 | je .Locb_dec_four | |
3353 | ||
# Five blocks left: zero the sixth lane and reuse the six-block helper. Only
# five results are stored and @offset[4] becomes the running offset.
3354 | movdqu `16*4`($inp),$inout4 | |
3355 | pxor $inout5,$inout5 | |
3356 | ||
3357 | call __ocb_decrypt6 | |
3358 | ||
3359 | movdqa @offset[4],@offset[5] | |
3360 | movups $inout0,`16*0`($out) # store output | |
3361 | pxor $inout0,$checksum # accumulate checksum | |
3362 | movups $inout1,`16*1`($out) | |
3363 | pxor $inout1,$checksum | |
3364 | movups $inout2,`16*2`($out) | |
3365 | pxor $inout2,$checksum | |
3366 | movups $inout3,`16*3`($out) | |
3367 | pxor $inout3,$checksum | |
3368 | movups $inout4,`16*4`($out) | |
3369 | pxor $inout4,$checksum | |
3370 | ||
3371 | jmp .Locb_dec_done | |
3372 | ||
# One block left: @offset[0] is handed to the single-block helper in $inout5,
# and the helper's $inout5 becomes the running offset.
3373 | .align 16 | |
3374 | .Locb_dec_one: | |
3375 | movdqa @offset[0],$inout5 # borrow | |
3376 | ||
3377 | call __ocb_decrypt1 | |
3378 | ||
3379 | movdqa $inout5,@offset[5] | |
3380 | movups $inout0,`16*0`($out) # store output | |
3381 | xorps $inout0,$checksum # accumulate checksum | |
3382 | jmp .Locb_dec_done | |
3383 | ||
# Two blocks left: zero the unused lanes, run the four-block helper, keep
# @offset[1] as the running offset.
3384 | .align 16 | |
3385 | .Locb_dec_two: | |
3386 | pxor $inout2,$inout2 | |
3387 | pxor $inout3,$inout3 | |
3388 | ||
3389 | call __ocb_decrypt4 | |
3390 | ||
3391 | movdqa @offset[1],@offset[5] | |
3392 | movups $inout0,`16*0`($out) # store output | |
3393 | xorps $inout0,$checksum # accumulate checksum | |
3394 | movups $inout1,`16*1`($out) | |
3395 | xorps $inout1,$checksum | |
3396 | ||
3397 | jmp .Locb_dec_done | |
3398 | ||
# Three blocks left: as above with one unused lane; @offset[2] is kept.
3399 | .align 16 | |
3400 | .Locb_dec_three: | |
3401 | pxor $inout3,$inout3 | |
3402 | ||
3403 | call __ocb_decrypt4 | |
3404 | ||
3405 | movdqa @offset[2],@offset[5] | |
3406 | movups $inout0,`16*0`($out) # store output | |
3407 | xorps $inout0,$checksum # accumulate checksum | |
3408 | movups $inout1,`16*1`($out) | |
3409 | xorps $inout1,$checksum | |
3410 | movups $inout2,`16*2`($out) | |
3411 | xorps $inout2,$checksum | |
3412 | ||
3413 | jmp .Locb_dec_done | |
3414 | ||
# Four blocks left: all lanes of the four-block helper are used; @offset[3]
# is kept. Falls through into .Locb_dec_done.
3415 | .align 16 | |
3416 | .Locb_dec_four: | |
3417 | call __ocb_decrypt4 | |
3418 | ||
3419 | movdqa @offset[3],@offset[5] | |
3420 | movups $inout0,`16*0`($out) # store output | |
3421 | pxor $inout0,$checksum # accumulate checksum | |
3422 | movups $inout1,`16*1`($out) | |
3423 | pxor $inout1,$checksum | |
3424 | movups $inout2,`16*2`($out) | |
3425 | pxor $inout2,$checksum | |
3426 | movups $inout3,`16*3`($out) | |
3427 | pxor $inout3,$checksum | |
3428 | ||
# NOTE(review): the pxor below relies on $rndkey0 holding round[last]. Every
# helper leaves it that way (its last `$movkey -16($key,%rax)` runs with
# %rax == 0, which addresses the slot loaded as round[last] during set-up),
# but no helper runs if this function is entered with zero blocks and an odd
# $block_num -- confirm callers never do that.
3429 | .Locb_dec_done: | |
3430 | pxor $rndkey0,@offset[5] # "remove" round[last] | |
3431 | movdqu $checksum,($checksum_p) # store checksum | |
3432 | movdqu @offset[5],($offset_p) # store last offset_i | |
3433 | ||
3434 | xorps %xmm0,%xmm0 # clear register bank | |
3435 | pxor %xmm1,%xmm1 | |
3436 | pxor %xmm2,%xmm2 | |
3437 | pxor %xmm3,%xmm3 | |
3438 | pxor %xmm4,%xmm4 | |
3439 | pxor %xmm5,%xmm5 | |
3440 | ___ | |
# Non-Win64: %xmm6-%xmm15 were not saved, so they are simply zeroed. %rax is
# then set to %rsp + 0x28, i.e. just above the five 8-byte registers pushed
# at entry.
3441 | $code.=<<___ if (!$win64); | |
3442 | pxor %xmm6,%xmm6 | |
3443 | pxor %xmm7,%xmm7 | |
3444 | pxor %xmm8,%xmm8 | |
3445 | pxor %xmm9,%xmm9 | |
3446 | pxor %xmm10,%xmm10 | |
3447 | pxor %xmm11,%xmm11 | |
3448 | pxor %xmm12,%xmm12 | |
3449 | pxor %xmm13,%xmm13 | |
3450 | pxor %xmm14,%xmm14 | |
3451 | pxor %xmm15,%xmm15 | |
384e6de4 | 3452 | lea 0x28(%rsp),%rax |
b84460ad | 3453 | .cfi_def_cfa %rax,8 |
bd30091c AP |
3454 | ___ |
# Win64: reload %xmm6-%xmm15 from the save area and overwrite each slot with
# %xmm0, which was zeroed above. %rax then skips the 0xa0-byte area plus the
# five pushed registers.
3455 | $code.=<<___ if ($win64); | |
3456 | movaps 0x00(%rsp),%xmm6 | |
3457 | movaps %xmm0,0x00(%rsp) # clear stack | |
3458 | movaps 0x10(%rsp),%xmm7 | |
3459 | movaps %xmm0,0x10(%rsp) | |
3460 | movaps 0x20(%rsp),%xmm8 | |
3461 | movaps %xmm0,0x20(%rsp) | |
3462 | movaps 0x30(%rsp),%xmm9 | |
3463 | movaps %xmm0,0x30(%rsp) | |
3464 | movaps 0x40(%rsp),%xmm10 | |
3465 | movaps %xmm0,0x40(%rsp) | |
3466 | movaps 0x50(%rsp),%xmm11 | |
3467 | movaps %xmm0,0x50(%rsp) | |
3468 | movaps 0x60(%rsp),%xmm12 | |
3469 | movaps %xmm0,0x60(%rsp) | |
3470 | movaps 0x70(%rsp),%xmm13 | |
3471 | movaps %xmm0,0x70(%rsp) | |
3472 | movaps 0x80(%rsp),%xmm14 | |
3473 | movaps %xmm0,0x80(%rsp) | |
3474 | movaps 0x90(%rsp),%xmm15 | |
3475 | movaps %xmm0,0x90(%rsp) | |
3476 | lea 0xa0+0x28(%rsp),%rax | |
3477 | .Locb_dec_pop: | |
bd30091c AP |
3478 | ___ |
# Common tail: reload the registers pushed at entry through %rax (in reverse
# push order), then make %rax the stack pointer again and return.
3479 | $code.=<<___; | |
384e6de4 | 3480 | mov -40(%rax),%r14 |
b84460ad | 3481 | .cfi_restore %r14 |
384e6de4 | 3482 | mov -32(%rax),%r13 |
b84460ad | 3483 | .cfi_restore %r13 |
384e6de4 | 3484 | mov -24(%rax),%r12 |
b84460ad | 3485 | .cfi_restore %r12 |
384e6de4 | 3486 | mov -16(%rax),%rbp |
b84460ad | 3487 | .cfi_restore %rbp |
384e6de4 | 3488 | mov -8(%rax),%rbx |
b84460ad | 3489 | .cfi_restore %rbx |
384e6de4 | 3490 | lea (%rax),%rsp |
b84460ad | 3491 | .cfi_def_cfa_register %rsp |
bd30091c AP |
3492 | .Locb_dec_epilogue: |
3493 | ret | |
b84460ad | 3494 | .cfi_endproc |
bd30091c AP |
3495 | .size aesni_ocb_decrypt,.-aesni_ocb_decrypt |
3496 | ||
# __ocb_decrypt6 -- internal helper that decrypts the six blocks held in
# $inout0..$inout5 side by side.
# On entry: @offset[0] holds L_0 and @offset[5] the running offset (already
#           combined with round[last] by the caller); $i1/$i3/$i5 are byte
#           offsets into the table at $L_p; $rndkey1 holds the key at
#           16($key_); %rax is the key-schedule index and %r10 its backup.
# On exit:  $inout0..$inout5 hold the aesdeclast results; @offset[1..5] hold
#           the per-block masks that were fed to aesdeclast; @offset[0] is
#           reloaded with L_0; $rndkey1 is reloaded from 16($key_); %rax is
#           restored from %r10; $i1/$i3/$i5 and $block_num are advanced for
#           the next group of six.
3497 | .type __ocb_decrypt6,\@abi-omnipotent | |
3498 | .align 32 | |
3499 | __ocb_decrypt6: | |
# The six offsets are chained: each is the previous one xored with L_0
# (lanes 0, 2, 4) or with a table entry (lanes 1, 3, 5). @offset[2] and
# @offset[4] take their copy of L_0 before @offset[0] is overwritten.
3500 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3501 | movdqu ($L_p,$i1),@offset[1] | |
3502 | movdqa @offset[0],@offset[2] | |
3503 | movdqu ($L_p,$i3),@offset[3] | |
3504 | movdqa @offset[0],@offset[4] | |
3505 | pxor @offset[5],@offset[0] | |
3506 | movdqu ($L_p,$i5),@offset[5] | |
3507 | pxor @offset[0],@offset[1] | |
3508 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3509 | pxor @offset[1],@offset[2] | |
3510 | pxor @offset[1],$inout1 | |
3511 | pxor @offset[2],@offset[3] | |
3512 | pxor @offset[2],$inout2 | |
3513 | pxor @offset[3],@offset[4] | |
3514 | pxor @offset[3],$inout3 | |
3515 | pxor @offset[4],@offset[5] | |
3516 | pxor @offset[4],$inout4 | |
3517 | pxor @offset[5],$inout5 | |
3518 | $movkey 32($key_),$rndkey0 | |
3519 | ||
# The old $i1/$i3/$i5 have been consumed above, so they are recomputed here
# for the next group; this is interleaved with the first AES rounds. Each
# offset is also xored with $rndkey0l once more to become the aesdeclast
# operand.
3520 | lea 1($block_num),$i1 # even-numbered blocks | |
3521 | lea 3($block_num),$i3 | |
3522 | lea 5($block_num),$i5 | |
3523 | add \$6,$block_num | |
3524 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3525 | bsf $i1,$i1 # ntz(block) | |
3526 | bsf $i3,$i3 | |
3527 | bsf $i5,$i5 | |
3528 | ||
3529 | aesdec $rndkey1,$inout0 | |
3530 | aesdec $rndkey1,$inout1 | |
3531 | aesdec $rndkey1,$inout2 | |
3532 | aesdec $rndkey1,$inout3 | |
3533 | pxor $rndkey0l,@offset[1] | |
3534 | pxor $rndkey0l,@offset[2] | |
3535 | aesdec $rndkey1,$inout4 | |
3536 | pxor $rndkey0l,@offset[3] | |
3537 | pxor $rndkey0l,@offset[4] | |
3538 | aesdec $rndkey1,$inout5 | |
3539 | $movkey 48($key_),$rndkey1 | |
3540 | pxor $rndkey0l,@offset[5] | |
3541 | ||
3542 | aesdec $rndkey0,$inout0 | |
3543 | aesdec $rndkey0,$inout1 | |
3544 | aesdec $rndkey0,$inout2 | |
3545 | aesdec $rndkey0,$inout3 | |
3546 | aesdec $rndkey0,$inout4 | |
3547 | aesdec $rndkey0,$inout5 | |
3548 | $movkey 64($key_),$rndkey0 | |
3549 | shl \$4,$i1 # ntz(block) -> table offset | |
3550 | shl \$4,$i3 | |
3551 | jmp .Locb_dec_loop6 | |
3552 | ||
# Two rounds per pass over all six lanes. %rax is advanced by 32 each pass
# and the jnz relies on the flags left by that add, so the loop ends once
# %rax reaches zero.
3553 | .align 32 | |
3554 | .Locb_dec_loop6: | |
3555 | aesdec $rndkey1,$inout0 | |
3556 | aesdec $rndkey1,$inout1 | |
3557 | aesdec $rndkey1,$inout2 | |
3558 | aesdec $rndkey1,$inout3 | |
3559 | aesdec $rndkey1,$inout4 | |
3560 | aesdec $rndkey1,$inout5 | |
3561 | $movkey ($key,%rax),$rndkey1 | |
3562 | add \$32,%rax | |
3563 | ||
3564 | aesdec $rndkey0,$inout0 | |
3565 | aesdec $rndkey0,$inout1 | |
3566 | aesdec $rndkey0,$inout2 | |
3567 | aesdec $rndkey0,$inout3 | |
3568 | aesdec $rndkey0,$inout4 | |
3569 | aesdec $rndkey0,$inout5 | |
3570 | $movkey -16($key,%rax),$rndkey0 | |
3571 | jnz .Locb_dec_loop6 | |
3572 | ||
# The scaling of $i5 was deferred until here; the shifts of $i1 and $i3 were
# issued before the loop.
3573 | aesdec $rndkey1,$inout0 | |
3574 | aesdec $rndkey1,$inout1 | |
3575 | aesdec $rndkey1,$inout2 | |
3576 | aesdec $rndkey1,$inout3 | |
3577 | aesdec $rndkey1,$inout4 | |
3578 | aesdec $rndkey1,$inout5 | |
3579 | $movkey 16($key_),$rndkey1 | |
3580 | shl \$4,$i5 | |
3581 | ||
# Final round per lane, using each lane's mask as the round-key operand.
# @offset[0] is free once lane 0 has consumed it, so it is reloaded with L_0
# straight away.
3582 | aesdeclast @offset[0],$inout0 | |
3583 | movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks | |
3584 | mov %r10,%rax # restore twisted rounds | |
3585 | aesdeclast @offset[1],$inout1 | |
3586 | aesdeclast @offset[2],$inout2 | |
3587 | aesdeclast @offset[3],$inout3 | |
3588 | aesdeclast @offset[4],$inout4 | |
3589 | aesdeclast @offset[5],$inout5 | |
3590 | ret | |
3591 | .size __ocb_decrypt6,.-__ocb_decrypt6 | |
3592 | ||
# __ocb_decrypt4 -- internal helper that decrypts the four blocks held in
# $inout0..$inout3 side by side.
# Register inputs are the same as for the six-block helper, except that
# only $i1 and $i3 are used as table offsets.
# Unlike the six-block helper it does not touch $block_num or the $i*
# indices, and it does not reload @offset[0] with L_0.
# On exit: $inout0..$inout3 hold the aesdeclast results, @offset[0..3] hold
# the per-block masks fed to aesdeclast, and %rax is restored from %r10.
3593 | .type __ocb_decrypt4,\@abi-omnipotent | |
3594 | .align 32 | |
3595 | __ocb_decrypt4: | |
# Offsets are chained as in the six-block helper: lanes 0 and 2 add L_0
# (@offset[2] copies it before @offset[0] is overwritten), lanes 1 and 3 add
# a table entry.
3596 | pxor $rndkey0l,@offset[5] # offset_i ^ round[0] | |
3597 | movdqu ($L_p,$i1),@offset[1] | |
3598 | movdqa @offset[0],@offset[2] | |
3599 | movdqu ($L_p,$i3),@offset[3] | |
3600 | pxor @offset[5],@offset[0] | |
3601 | pxor @offset[0],@offset[1] | |
3602 | pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i | |
3603 | pxor @offset[1],@offset[2] | |
3604 | pxor @offset[1],$inout1 | |
3605 | pxor @offset[2],@offset[3] | |
3606 | pxor @offset[2],$inout2 | |
3607 | pxor @offset[3],$inout3 | |
3608 | $movkey 32($key_),$rndkey0 | |
3609 | ||
# A second xor with $rndkey0l turns each offset into its aesdeclast operand.
3610 | pxor $rndkey0l,@offset[0] # offset_i ^ round[last] | |
3611 | pxor $rndkey0l,@offset[1] | |
3612 | pxor $rndkey0l,@offset[2] | |
3613 | pxor $rndkey0l,@offset[3] | |
3614 | ||
3615 | aesdec $rndkey1,$inout0 | |
3616 | aesdec $rndkey1,$inout1 | |
3617 | aesdec $rndkey1,$inout2 | |
3618 | aesdec $rndkey1,$inout3 | |
3619 | $movkey 48($key_),$rndkey1 | |
3620 | ||
3621 | aesdec $rndkey0,$inout0 | |
3622 | aesdec $rndkey0,$inout1 | |
3623 | aesdec $rndkey0,$inout2 | |
3624 | aesdec $rndkey0,$inout3 | |
3625 | $movkey 64($key_),$rndkey0 | |
3626 | jmp .Locb_dec_loop4 | |
3627 | ||
# Two rounds per pass over four lanes. %rax is advanced by 32 each pass and
# the jnz relies on the flags left by that add.
3628 | .align 32 | |
3629 | .Locb_dec_loop4: | |
3630 | aesdec $rndkey1,$inout0 | |
3631 | aesdec $rndkey1,$inout1 | |
3632 | aesdec $rndkey1,$inout2 | |
3633 | aesdec $rndkey1,$inout3 | |
3634 | $movkey ($key,%rax),$rndkey1 | |
3635 | add \$32,%rax | |
3636 | ||
3637 | aesdec $rndkey0,$inout0 | |
3638 | aesdec $rndkey0,$inout1 | |
3639 | aesdec $rndkey0,$inout2 | |
3640 | aesdec $rndkey0,$inout3 | |
3641 | $movkey -16($key,%rax),$rndkey0 | |
3642 | jnz .Locb_dec_loop4 | |
3643 | ||
3644 | aesdec $rndkey1,$inout0 | |
3645 | aesdec $rndkey1,$inout1 | |
3646 | aesdec $rndkey1,$inout2 | |
3647 | aesdec $rndkey1,$inout3 | |
3648 | $movkey 16($key_),$rndkey1 | |
3649 | mov %r10,%rax # restore twisted rounds | |
3650 | ||
# Final round per lane, using each lane's mask as the round-key operand.
3651 | aesdeclast @offset[0],$inout0 | |
3652 | aesdeclast @offset[1],$inout1 | |
3653 | aesdeclast @offset[2],$inout2 | |
3654 | aesdeclast @offset[3],$inout3 | |
3655 | ret | |
3656 | .size __ocb_decrypt4,.-__ocb_decrypt4 | |
3657 | ||
# __ocb_decrypt1 -- internal helper that decrypts the single block in
# $inout0.
# On entry: $inout5 holds the table value the caller "borrowed" for this
#           block, @offset[5] the running offset, $rndkey1 the key at
#           16($key_); %rax is the key-schedule index and %r10 its backup.
# On exit:  $inout0 holds the aesdeclast result and $inout5 the mask that was
#           fed to it (the caller copies it into @offset[5]); %rax is
#           restored from %r10.
# Unlike the encrypt counterpart it does not touch $checksum; the caller
# accumulates the checksum from the output.
3658 | .type __ocb_decrypt1,\@abi-omnipotent | |
3659 | .align 32 | |
3660 | __ocb_decrypt1: | |
3661 | pxor @offset[5],$inout5 # offset_i | |
3662 | pxor $rndkey0l,$inout5 # offset_i ^ round[0] | |
3663 | pxor $inout5,$inout0 # input ^ round[0] ^ offset_i | |
3664 | $movkey 32($key_),$rndkey0 | |
3665 | ||
3666 | aesdec $rndkey1,$inout0 | |
3667 | $movkey 48($key_),$rndkey1 | |
# A second xor with $rndkey0l turns the mask into the aesdeclast operand.
3668 | pxor $rndkey0l,$inout5 # offset_i ^ round[last] | |
3669 | ||
3670 | aesdec $rndkey0,$inout0 | |
3671 | $movkey 64($key_),$rndkey0 | |
3672 | jmp .Locb_dec_loop1 | |
3673 | ||
# Two rounds per pass. %rax is advanced by 32 each pass and the jnz relies on
# the flags left by that add, so the loop ends once %rax reaches zero.
3674 | .align 32 | |
3675 | .Locb_dec_loop1: | |
3676 | aesdec $rndkey1,$inout0 | |
3677 | $movkey ($key,%rax),$rndkey1 | |
3678 | add \$32,%rax | |
3679 | ||
3680 | aesdec $rndkey0,$inout0 | |
3681 | $movkey -16($key,%rax),$rndkey0 | |
3682 | jnz .Locb_dec_loop1 | |
3683 | ||
3684 | aesdec $rndkey1,$inout0 | |
3685 | $movkey 16($key_),$rndkey1 # redundant in tail | |
3686 | mov %r10,%rax # restore twisted rounds | |
3687 | ||
# Final round: $inout5 is used as the round-key operand.
3688 | aesdeclast $inout5,$inout0 | |
3689 | ret | |
3690 | .size __ocb_decrypt1,.-__ocb_decrypt1 | |
3691 | ___ | |
f8501464 | 3692 | } }} |
d64a7232 | 3693 | \f |
6c83629b | 3694 | ######################################################################## |
d64a7232 AP |
3695 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, |
3696 | # size_t length, const AES_KEY *key, | |
3697 | # unsigned char *ivp,const int enc); | |
f8501464 | 3698 | { |
73325b22 AP |
3699 | my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt |
3700 | my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); | |
73325b22 | 3701 | |
d64a7232 AP |
3702 | $code.=<<___; |
3703 | .globl ${PREFIX}_cbc_encrypt | |
3704 | .type ${PREFIX}_cbc_encrypt,\@function,6 | |
3705 | .align 16 | |
3706 | ${PREFIX}_cbc_encrypt: | |
b84460ad | 3707 | .cfi_startproc |
d64a7232 AP |
3708 | test $len,$len # check length |
3709 | jz .Lcbc_ret | |
d608b4d6 | 3710 | |
f8501464 | 3711 | mov 240($key),$rnds_ # key->rounds |
d64a7232 | 3712 | mov $key,$key_ # backup $key |
d608b4d6 | 3713 | test %r9d,%r9d # 6th argument |
d64a7232 AP |
3714 | jz .Lcbc_decrypt |
3715 | #--------------------------- CBC ENCRYPT ------------------------------# | |
f8501464 | 3716 | movups ($ivp),$inout0 # load iv as initial state |
d608b4d6 | 3717 | mov $rnds_,$rounds |
d7d119a3 | 3718 | cmp \$16,$len |
d64a7232 AP |
3719 | jb .Lcbc_enc_tail |
3720 | sub \$16,$len | |
3721 | jmp .Lcbc_enc_loop | |
d7d119a3 | 3722 | .align 16 |
d64a7232 | 3723 | .Lcbc_enc_loop: |
f8501464 | 3724 | movups ($inp),$inout1 # load input |
d64a7232 | 3725 | lea 16($inp),$inp |
f8501464 | 3726 | #xorps $inout1,$inout0 |
d64a7232 | 3727 | ___ |
f8501464 | 3728 | &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); |
d64a7232 | 3729 | $code.=<<___; |
d608b4d6 AP |
3730 | mov $rnds_,$rounds # restore $rounds |
3731 | mov $key_,$key # restore $key | |
d7d119a3 AP |
3732 | movups $inout0,0($out) # store output |
3733 | lea 16($out),$out | |
3734 | sub \$16,$len | |
d64a7232 AP |
3735 | jnc .Lcbc_enc_loop |
3736 | add \$16,$len | |
3737 | jnz .Lcbc_enc_tail | |
23f6eec7 AP |
3738 | pxor $rndkey0,$rndkey0 # clear register bank |
3739 | pxor $rndkey1,$rndkey1 | |
d608b4d6 | 3740 | movups $inout0,($ivp) |
23f6eec7 AP |
3741 | pxor $inout0,$inout0 |
3742 | pxor $inout1,$inout1 | |
d64a7232 AP |
3743 | jmp .Lcbc_ret |
3744 | ||
3745 | .Lcbc_enc_tail: | |
3746 | mov $len,%rcx # zaps $key | |
3747 | xchg $inp,$out # $inp is %rsi and $out is %rdi now | |
3748 | .long 0x9066A4F3 # rep movsb | |
3749 | mov \$16,%ecx # zero tail | |
3750 | sub $len,%rcx | |
3751 | xor %eax,%eax | |
3752 | .long 0x9066AAF3 # rep stosb | |
3753 | lea -16(%rdi),%rdi # rewind $out by 1 block | |
3754 | mov $rnds_,$rounds # restore $rounds | |
3755 | mov %rdi,%rsi # $inp and $out are the same | |
3756 | mov $key_,$key # restore $key | |
3757 | xor $len,$len # len=16 | |
3758 | jmp .Lcbc_enc_loop # one more spin | |
3759 | \f#--------------------------- CBC DECRYPT ------------------------------# | |
3760 | .align 16 | |
3761 | .Lcbc_decrypt: | |
23f6eec7 AP |
3762 | cmp \$16,$len |
3763 | jne .Lcbc_decrypt_bulk | |
3764 | ||
3765 | # handle single block without allocating stack frame, | |
3766 | # useful in ciphertext stealing mode | |
3767 | movdqu ($inp),$inout0 # load input | |
3768 | movdqu ($ivp),$inout1 # load iv | |
3769 | movdqa $inout0,$inout2 # future iv | |
3770 | ___ | |
3771 | &aesni_generate1("dec",$key,$rnds_); | |
3772 | $code.=<<___; | |
3773 | pxor $rndkey0,$rndkey0 # clear register bank | |
3774 | pxor $rndkey1,$rndkey1 | |
3775 | movdqu $inout2,($ivp) # store iv | |
3776 | xorps $inout1,$inout0 # ^=iv | |
3777 | pxor $inout1,$inout1 | |
3778 | movups $inout0,($out) # store output | |
3779 | pxor $inout0,$inout0 | |
3780 | jmp .Lcbc_ret | |
3781 | .align 16 | |
3782 | .Lcbc_decrypt_bulk: | |
384e6de4 | 3783 | lea (%rsp),%r11 # frame pointer |
b84460ad | 3784 | .cfi_def_cfa_register %r11 |
6a40ebe8 | 3785 | push %rbp |
b84460ad | 3786 | .cfi_push %rbp |
6a40ebe8 AP |
3787 | sub \$$frame_size,%rsp |
3788 | and \$-16,%rsp # Linux kernel stack can be incorrectly seeded | |
d64a7232 AP |
3789 | ___ |
3790 | $code.=<<___ if ($win64); | |
6a40ebe8 AP |
3791 | movaps %xmm6,0x10(%rsp) |
3792 | movaps %xmm7,0x20(%rsp) | |
3793 | movaps %xmm8,0x30(%rsp) | |
3794 | movaps %xmm9,0x40(%rsp) | |
73325b22 AP |
3795 | movaps %xmm10,0x50(%rsp) |
3796 | movaps %xmm11,0x60(%rsp) | |
3797 | movaps %xmm12,0x70(%rsp) | |
3798 | movaps %xmm13,0x80(%rsp) | |
3799 | movaps %xmm14,0x90(%rsp) | |
3800 | movaps %xmm15,0xa0(%rsp) | |
d608b4d6 | 3801 | .Lcbc_decrypt_body: |
d64a7232 | 3802 | ___ |
384e6de4 AP |
3803 | |
3804 | my $inp_=$key_="%rbp"; # reassign $key_ | |
3805 | ||
d64a7232 | 3806 | $code.=<<___; |
384e6de4 | 3807 | mov $key,$key_ # [re-]backup $key [after reassignment] |
d64a7232 | 3808 | movups ($ivp),$iv |
d608b4d6 | 3809 | mov $rnds_,$rounds |
73325b22 | 3810 | cmp \$0x50,$len |
d608b4d6 | 3811 | jbe .Lcbc_dec_tail |
73325b22 AP |
3812 | |
3813 | $movkey ($key),$rndkey0 | |
3814 | movdqu 0x00($inp),$inout0 # load input | |
3815 | movdqu 0x10($inp),$inout1 | |
3816 | movdqa $inout0,$in0 | |
3817 | movdqu 0x20($inp),$inout2 | |
3818 | movdqa $inout1,$in1 | |
3819 | movdqu 0x30($inp),$inout3 | |
3820 | movdqa $inout2,$in2 | |
3821 | movdqu 0x40($inp),$inout4 | |
3822 | movdqa $inout3,$in3 | |
3823 | movdqu 0x50($inp),$inout5 | |
3824 | movdqa $inout4,$in4 | |
5599c733 | 3825 | mov OPENSSL_ia32cap_P+4(%rip),%r9d |
73325b22 AP |
3826 | cmp \$0x70,$len |
3827 | jbe .Lcbc_dec_six_or_seven | |
3828 | ||
23f6eec7 AP |
3829 | and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE |
3830 | sub \$0x50,$len # $len is biased by -5*16 | |
5599c733 | 3831 | cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE |
23f6eec7 AP |
3832 | je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] |
3833 | sub \$0x20,$len # $len is biased by -7*16 | |
73325b22 | 3834 | lea 0x70($key),$key # size optimization |
f8501464 | 3835 | jmp .Lcbc_dec_loop8_enter |
d7d119a3 | 3836 | .align 16 |
f8501464 | 3837 | .Lcbc_dec_loop8: |
f8501464 AP |
3838 | movups $inout7,($out) |
3839 | lea 0x10($out),$out | |
3840 | .Lcbc_dec_loop8_enter: | |
73325b22 AP |
3841 | movdqu 0x60($inp),$inout6 |
3842 | pxor $rndkey0,$inout0 | |
3843 | movdqu 0x70($inp),$inout7 | |
3844 | pxor $rndkey0,$inout1 | |
3845 | $movkey 0x10-0x70($key),$rndkey1 | |
3846 | pxor $rndkey0,$inout2 | |
384e6de4 | 3847 | mov \$-1,$inp_ |
73325b22 AP |
3848 | cmp \$0x70,$len # is there at least 0x60 bytes ahead? |
3849 | pxor $rndkey0,$inout3 | |
3850 | pxor $rndkey0,$inout4 | |
3851 | pxor $rndkey0,$inout5 | |
3852 | pxor $rndkey0,$inout6 | |
d7d119a3 | 3853 | |
f8501464 | 3854 | aesdec $rndkey1,$inout0 |
73325b22 AP |
3855 | pxor $rndkey0,$inout7 |
3856 | $movkey 0x20-0x70($key),$rndkey0 | |
f8501464 | 3857 | aesdec $rndkey1,$inout1 |
f8501464 | 3858 | aesdec $rndkey1,$inout2 |
f8501464 | 3859 | aesdec $rndkey1,$inout3 |
f8501464 | 3860 | aesdec $rndkey1,$inout4 |
f8501464 | 3861 | aesdec $rndkey1,$inout5 |
f8501464 | 3862 | aesdec $rndkey1,$inout6 |
384e6de4 AP |
3863 | adc \$0,$inp_ |
3864 | and \$128,$inp_ | |
f8501464 | 3865 | aesdec $rndkey1,$inout7 |
73325b22 AP |
3866 | add $inp,$inp_ |
3867 | $movkey 0x30-0x70($key),$rndkey1 | |
3868 | ___ | |
3869 | for($i=1;$i<12;$i++) { | |
3870 | my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; | |
d8ba0dc9 AP |
3871 | $code.=<<___ if ($i==7); |
3872 | cmp \$11,$rounds | |
3873 | ___ | |
73325b22 AP |
3874 | $code.=<<___; |
3875 | aesdec $rndkeyx,$inout0 | |
3876 | aesdec $rndkeyx,$inout1 | |
3877 | aesdec $rndkeyx,$inout2 | |
3878 | aesdec $rndkeyx,$inout3 | |
3879 | aesdec $rndkeyx,$inout4 | |
3880 | aesdec $rndkeyx,$inout5 | |
3881 | aesdec $rndkeyx,$inout6 | |
3882 | aesdec $rndkeyx,$inout7 | |
3883 | $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx | |
3884 | ___ | |
d8ba0dc9 AP |
3885 | $code.=<<___ if ($i<6 || (!($i&1) && $i>7)); |
3886 | nop | |
3887 | ___ | |
73325b22 | 3888 | $code.=<<___ if ($i==7); |
73325b22 AP |
3889 | jb .Lcbc_dec_done |
3890 | ___ | |
3891 | $code.=<<___ if ($i==9); | |
3892 | je .Lcbc_dec_done | |
3893 | ___ | |
d8ba0dc9 AP |
3894 | $code.=<<___ if ($i==11); |
3895 | jmp .Lcbc_dec_done | |
3896 | ___ | |
73325b22 AP |
3897 | } |
3898 | $code.=<<___; | |
d8ba0dc9 | 3899 | .align 16 |
73325b22 AP |
3900 | .Lcbc_dec_done: |
3901 | aesdec $rndkey1,$inout0 | |
73325b22 | 3902 | aesdec $rndkey1,$inout1 |
d8ba0dc9 | 3903 | pxor $rndkey0,$iv |
73325b22 AP |
3904 | pxor $rndkey0,$in0 |
3905 | aesdec $rndkey1,$inout2 | |
73325b22 | 3906 | aesdec $rndkey1,$inout3 |
d8ba0dc9 | 3907 | pxor $rndkey0,$in1 |
73325b22 AP |
3908 | pxor $rndkey0,$in2 |
3909 | aesdec $rndkey1,$inout4 | |
73325b22 | 3910 | aesdec $rndkey1,$inout5 |
d8ba0dc9 | 3911 | pxor $rndkey0,$in3 |
73325b22 AP |
3912 | pxor $rndkey0,$in4 |
3913 | aesdec $rndkey1,$inout6 | |
3914 | aesdec $rndkey1,$inout7 | |
3915 | movdqu 0x50($inp),$rndkey1 | |
d64a7232 | 3916 | |
73325b22 AP |
3917 | aesdeclast $iv,$inout0 |
3918 | movdqu 0x60($inp),$iv # borrow $iv | |
3919 | pxor $rndkey0,$rndkey1 | |
3920 | aesdeclast $in0,$inout1 | |
3921 | pxor $rndkey0,$iv | |
3922 | movdqu 0x70($inp),$rndkey0 # next IV | |
73325b22 | 3923 | aesdeclast $in1,$inout2 |
d8ba0dc9 | 3924 | lea 0x80($inp),$inp |
73325b22 AP |
3925 | movdqu 0x00($inp_),$in0 |
3926 | aesdeclast $in2,$inout3 | |
73325b22 | 3927 | aesdeclast $in3,$inout4 |
d8ba0dc9 | 3928 | movdqu 0x10($inp_),$in1 |
73325b22 AP |
3929 | movdqu 0x20($inp_),$in2 |
3930 | aesdeclast $in4,$inout5 | |
73325b22 | 3931 | aesdeclast $rndkey1,$inout6 |
d8ba0dc9 | 3932 | movdqu 0x30($inp_),$in3 |
73325b22 AP |
3933 | movdqu 0x40($inp_),$in4 |
3934 | aesdeclast $iv,$inout7 | |
3935 | movdqa $rndkey0,$iv # return $iv | |
3936 | movdqu 0x50($inp_),$rndkey1 | |
3937 | $movkey -0x70($key),$rndkey0 | |
3938 | ||
3939 | movups $inout0,($out) # store output | |
3940 | movdqa $in0,$inout0 | |
3941 | movups $inout1,0x10($out) | |
3942 | movdqa $in1,$inout1 | |
3943 | movups $inout2,0x20($out) | |
3944 | movdqa $in2,$inout2 | |
3945 | movups $inout3,0x30($out) | |
3946 | movdqa $in3,$inout3 | |
3947 | movups $inout4,0x40($out) | |
3948 | movdqa $in4,$inout4 | |
3949 | movups $inout5,0x50($out) | |
3950 | movdqa $rndkey1,$inout5 | |
3951 | movups $inout6,0x60($out) | |
3952 | lea 0x70($out),$out | |
f8501464 | 3953 | |
f8501464 AP |
3954 | sub \$0x80,$len |
3955 | ja .Lcbc_dec_loop8 | |
3956 | ||
3957 | movaps $inout7,$inout0 | |
73325b22 | 3958 | lea -0x70($key),$key |
f8501464 | 3959 | add \$0x70,$len |
23f6eec7 | 3960 | jle .Lcbc_dec_clear_tail_collected |
73325b22 | 3961 | movups $inout7,($out) |
f8501464 | 3962 | lea 0x10($out),$out |
73325b22 AP |
3963 | cmp \$0x50,$len |
3964 | jbe .Lcbc_dec_tail | |
3965 | ||
3966 | movaps $in0,$inout0 | |
3967 | .Lcbc_dec_six_or_seven: | |
3968 | cmp \$0x60,$len | |
3969 | ja .Lcbc_dec_seven | |
3970 | ||
3971 | movaps $inout5,$inout6 | |
3972 | call _aesni_decrypt6 | |
3973 | pxor $iv,$inout0 # ^= IV | |
3974 | movaps $inout6,$iv | |
3975 | pxor $in0,$inout1 | |
3976 | movdqu $inout0,($out) | |
3977 | pxor $in1,$inout2 | |
3978 | movdqu $inout1,0x10($out) | |
23f6eec7 | 3979 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
3980 | pxor $in2,$inout3 |
3981 | movdqu $inout2,0x20($out) | |
23f6eec7 | 3982 | pxor $inout2,$inout2 |
73325b22 AP |
3983 | pxor $in3,$inout4 |
3984 | movdqu $inout3,0x30($out) | |
23f6eec7 | 3985 | pxor $inout3,$inout3 |
73325b22 AP |
3986 | pxor $in4,$inout5 |
3987 | movdqu $inout4,0x40($out) | |
23f6eec7 | 3988 | pxor $inout4,$inout4 |
73325b22 AP |
3989 | lea 0x50($out),$out |
3990 | movdqa $inout5,$inout0 | |
23f6eec7 | 3991 | pxor $inout5,$inout5 |
73325b22 AP |
3992 | jmp .Lcbc_dec_tail_collected |
3993 | ||
3994 | .align 16 | |
3995 | .Lcbc_dec_seven: | |
3996 | movups 0x60($inp),$inout6 | |
3997 | xorps $inout7,$inout7 | |
3998 | call _aesni_decrypt8 | |
3999 | movups 0x50($inp),$inout7 | |
4000 | pxor $iv,$inout0 # ^= IV | |
4001 | movups 0x60($inp),$iv | |
4002 | pxor $in0,$inout1 | |
4003 | movdqu $inout0,($out) | |
4004 | pxor $in1,$inout2 | |
4005 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4006 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4007 | pxor $in2,$inout3 |
4008 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4009 | pxor $inout2,$inout2 |
73325b22 AP |
4010 | pxor $in3,$inout4 |
4011 | movdqu $inout3,0x30($out) | |
23f6eec7 | 4012 | pxor $inout3,$inout3 |
73325b22 AP |
4013 | pxor $in4,$inout5 |
4014 | movdqu $inout4,0x40($out) | |
23f6eec7 | 4015 | pxor $inout4,$inout4 |
73325b22 AP |
4016 | pxor $inout7,$inout6 |
4017 | movdqu $inout5,0x50($out) | |
23f6eec7 | 4018 | pxor $inout5,$inout5 |
73325b22 AP |
4019 | lea 0x60($out),$out |
4020 | movdqa $inout6,$inout0 | |
23f6eec7 AP |
4021 | pxor $inout6,$inout6 |
4022 | pxor $inout7,$inout7 | |
73325b22 AP |
4023 | jmp .Lcbc_dec_tail_collected |
4024 | ||
5599c733 AP |
4025 | .align 16 |
4026 | .Lcbc_dec_loop6: | |
4027 | movups $inout5,($out) | |
4028 | lea 0x10($out),$out | |
4029 | movdqu 0x00($inp),$inout0 # load input | |
4030 | movdqu 0x10($inp),$inout1 | |
4031 | movdqa $inout0,$in0 | |
4032 | movdqu 0x20($inp),$inout2 | |
4033 | movdqa $inout1,$in1 | |
4034 | movdqu 0x30($inp),$inout3 | |
4035 | movdqa $inout2,$in2 | |
4036 | movdqu 0x40($inp),$inout4 | |
4037 | movdqa $inout3,$in3 | |
4038 | movdqu 0x50($inp),$inout5 | |
4039 | movdqa $inout4,$in4 | |
4040 | .Lcbc_dec_loop6_enter: | |
4041 | lea 0x60($inp),$inp | |
4042 | movdqa $inout5,$inout6 | |
4043 | ||
4044 | call _aesni_decrypt6 | |
4045 | ||
4046 | pxor $iv,$inout0 # ^= IV | |
4047 | movdqa $inout6,$iv | |
4048 | pxor $in0,$inout1 | |
4049 | movdqu $inout0,($out) | |
4050 | pxor $in1,$inout2 | |
4051 | movdqu $inout1,0x10($out) | |
4052 | pxor $in2,$inout3 | |
4053 | movdqu $inout2,0x20($out) | |
4054 | pxor $in3,$inout4 | |
4055 | mov $key_,$key | |
4056 | movdqu $inout3,0x30($out) | |
4057 | pxor $in4,$inout5 | |
4058 | mov $rnds_,$rounds | |
4059 | movdqu $inout4,0x40($out) | |
4060 | lea 0x50($out),$out | |
4061 | sub \$0x60,$len | |
4062 | ja .Lcbc_dec_loop6 | |
4063 | ||
4064 | movdqa $inout5,$inout0 | |
4065 | add \$0x50,$len | |
23f6eec7 | 4066 | jle .Lcbc_dec_clear_tail_collected |
5599c733 AP |
4067 | movups $inout5,($out) |
4068 | lea 0x10($out),$out | |
4069 | ||
6c83629b | 4070 | .Lcbc_dec_tail: |
d64a7232 | 4071 | movups ($inp),$inout0 |
73325b22 | 4072 | sub \$0x10,$len |
23f6eec7 | 4073 | jbe .Lcbc_dec_one # $len is 1*16 or less |
f8501464 | 4074 | |
d64a7232 | 4075 | movups 0x10($inp),$inout1 |
73325b22 AP |
4076 | movaps $inout0,$in0 |
4077 | sub \$0x10,$len | |
23f6eec7 | 4078 | jbe .Lcbc_dec_two # $len is 2*16 or less |
f8501464 | 4079 | |
d64a7232 | 4080 | movups 0x20($inp),$inout2 |
73325b22 AP |
4081 | movaps $inout1,$in1 |
4082 | sub \$0x10,$len | |
23f6eec7 | 4083 | jbe .Lcbc_dec_three # $len is 3*16 or less |
f8501464 | 4084 | |
d64a7232 | 4085 | movups 0x30($inp),$inout3 |
73325b22 AP |
4086 | movaps $inout2,$in2 |
4087 | sub \$0x10,$len | |
23f6eec7 | 4088 | jbe .Lcbc_dec_four # $len is 4*16 or less |
f8501464 | 4089 | |
23f6eec7 | 4090 | movups 0x40($inp),$inout4 # $len is 5*16 or less |
73325b22 AP |
4091 | movaps $inout3,$in3 |
4092 | movaps $inout4,$in4 | |
4093 | xorps $inout5,$inout5 | |
4094 | call _aesni_decrypt6 | |
4095 | pxor $iv,$inout0 | |
4096 | movaps $in4,$iv | |
4097 | pxor $in0,$inout1 | |
4098 | movdqu $inout0,($out) | |
4099 | pxor $in1,$inout2 | |
4100 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4101 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4102 | pxor $in2,$inout3 |
4103 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4104 | pxor $inout2,$inout2 |
73325b22 AP |
4105 | pxor $in3,$inout4 |
4106 | movdqu $inout3,0x30($out) | |
23f6eec7 | 4107 | pxor $inout3,$inout3 |
73325b22 AP |
4108 | lea 0x40($out),$out |
4109 | movdqa $inout4,$inout0 | |
23f6eec7 AP |
4110 | pxor $inout4,$inout4 |
4111 | pxor $inout5,$inout5 | |
73325b22 | 4112 | sub \$0x10,$len |
d64a7232 | 4113 | jmp .Lcbc_dec_tail_collected |
73325b22 | 4114 | |
d64a7232 AP |
4115 | .align 16 |
4116 | .Lcbc_dec_one: | |
73325b22 | 4117 | movaps $inout0,$in0 |
d64a7232 | 4118 | ___ |
d608b4d6 | 4119 | &aesni_generate1("dec",$key,$rounds); |
d64a7232 | 4120 | $code.=<<___; |
f8501464 | 4121 | xorps $iv,$inout0 |
d64a7232 AP |
4122 | movaps $in0,$iv |
4123 | jmp .Lcbc_dec_tail_collected | |
4124 | .align 16 | |
4125 | .Lcbc_dec_two: | |
73325b22 | 4126 | movaps $inout1,$in1 |
214368ff | 4127 | call _aesni_decrypt2 |
73325b22 | 4128 | pxor $iv,$inout0 |
d64a7232 | 4129 | movaps $in1,$iv |
73325b22 AP |
4130 | pxor $in0,$inout1 |
4131 | movdqu $inout0,($out) | |
4132 | movdqa $inout1,$inout0 | |
23f6eec7 | 4133 | pxor $inout1,$inout1 # clear register bank |
d64a7232 AP |
4134 | lea 0x10($out),$out |
4135 | jmp .Lcbc_dec_tail_collected | |
4136 | .align 16 | |
4137 | .Lcbc_dec_three: | |
73325b22 | 4138 | movaps $inout2,$in2 |
d608b4d6 | 4139 | call _aesni_decrypt3 |
73325b22 | 4140 | pxor $iv,$inout0 |
d64a7232 | 4141 | movaps $in2,$iv |
73325b22 AP |
4142 | pxor $in0,$inout1 |
4143 | movdqu $inout0,($out) | |
4144 | pxor $in1,$inout2 | |
4145 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4146 | pxor $inout1,$inout1 # clear register bank |
73325b22 | 4147 | movdqa $inout2,$inout0 |
23f6eec7 | 4148 | pxor $inout2,$inout2 |
d64a7232 | 4149 | lea 0x20($out),$out |
f8501464 AP |
4150 | jmp .Lcbc_dec_tail_collected |
4151 | .align 16 | |
4152 | .Lcbc_dec_four: | |
73325b22 | 4153 | movaps $inout3,$in3 |
f8501464 | 4154 | call _aesni_decrypt4 |
73325b22 AP |
4155 | pxor $iv,$inout0 |
4156 | movaps $in3,$iv | |
4157 | pxor $in0,$inout1 | |
4158 | movdqu $inout0,($out) | |
4159 | pxor $in1,$inout2 | |
4160 | movdqu $inout1,0x10($out) | |
23f6eec7 | 4161 | pxor $inout1,$inout1 # clear register bank |
73325b22 AP |
4162 | pxor $in2,$inout3 |
4163 | movdqu $inout2,0x20($out) | |
23f6eec7 | 4164 | pxor $inout2,$inout2 |
73325b22 | 4165 | movdqa $inout3,$inout0 |
23f6eec7 | 4166 | pxor $inout3,$inout3 |
f8501464 | 4167 | lea 0x30($out),$out |
d64a7232 | 4168 | jmp .Lcbc_dec_tail_collected |
73325b22 | 4169 | |
d64a7232 | 4170 | .align 16 |
23f6eec7 AP |
4171 | .Lcbc_dec_clear_tail_collected: |
4172 | pxor $inout1,$inout1 # clear register bank | |
4173 | pxor $inout2,$inout2 | |
4174 | pxor $inout3,$inout3 | |
4175 | ___ | |
4176 | $code.=<<___ if (!$win64); | |
4177 | pxor $inout4,$inout4 # %xmm6..9 | |
4178 | pxor $inout5,$inout5 | |
4179 | pxor $inout6,$inout6 | |
4180 | pxor $inout7,$inout7 | |
4181 | ___ | |
4182 | $code.=<<___; | |
d64a7232 | 4183 | .Lcbc_dec_tail_collected: |
d64a7232 | 4184 | movups $iv,($ivp) |
73325b22 | 4185 | and \$15,$len |
d64a7232 | 4186 | jnz .Lcbc_dec_tail_partial |
f8501464 | 4187 | movups $inout0,($out) |
23f6eec7 | 4188 | pxor $inout0,$inout0 |
d64a7232 | 4189 | jmp .Lcbc_dec_ret |
d7d119a3 | 4190 | .align 16 |
d64a7232 | 4191 | .Lcbc_dec_tail_partial: |
6a40ebe8 | 4192 | movaps $inout0,(%rsp) |
23f6eec7 | 4193 | pxor $inout0,$inout0 |
f8501464 | 4194 | mov \$16,%rcx |
d64a7232 | 4195 | mov $out,%rdi |
f8501464 | 4196 | sub $len,%rcx |
6a40ebe8 | 4197 | lea (%rsp),%rsi |
23f6eec7 AP |
4198 | .long 0x9066A4F3 # rep movsb |
4199 | movdqa $inout0,(%rsp) | |
d64a7232 AP |
4200 | |
4201 | .Lcbc_dec_ret: | |
23f6eec7 AP |
4202 | xorps $rndkey0,$rndkey0 # %xmm0 |
4203 | pxor $rndkey1,$rndkey1 | |
d64a7232 AP |
4204 | ___ |
4205 | $code.=<<___ if ($win64); | |
6a40ebe8 | 4206 | movaps 0x10(%rsp),%xmm6 |
23f6eec7 | 4207 | movaps %xmm0,0x10(%rsp) # clear stack |
6a40ebe8 | 4208 | movaps 0x20(%rsp),%xmm7 |
23f6eec7 | 4209 | movaps %xmm0,0x20(%rsp) |
6a40ebe8 | 4210 | movaps 0x30(%rsp),%xmm8 |
23f6eec7 | 4211 | movaps %xmm0,0x30(%rsp) |
6a40ebe8 | 4212 | movaps 0x40(%rsp),%xmm9 |
23f6eec7 | 4213 | movaps %xmm0,0x40(%rsp) |
73325b22 | 4214 | movaps 0x50(%rsp),%xmm10 |
23f6eec7 | 4215 | movaps %xmm0,0x50(%rsp) |
73325b22 | 4216 | movaps 0x60(%rsp),%xmm11 |
23f6eec7 | 4217 | movaps %xmm0,0x60(%rsp) |
73325b22 | 4218 | movaps 0x70(%rsp),%xmm12 |
23f6eec7 | 4219 | movaps %xmm0,0x70(%rsp) |
73325b22 | 4220 | movaps 0x80(%rsp),%xmm13 |
23f6eec7 | 4221 | movaps %xmm0,0x80(%rsp) |
73325b22 | 4222 | movaps 0x90(%rsp),%xmm14 |
23f6eec7 | 4223 | movaps %xmm0,0x90(%rsp) |
73325b22 | 4224 | movaps 0xa0(%rsp),%xmm15 |
23f6eec7 | 4225 | movaps %xmm0,0xa0(%rsp) |
d64a7232 AP |
4226 | ___ |
4227 | $code.=<<___; | |
384e6de4 | 4228 | mov -8(%r11),%rbp |
b84460ad | 4229 | .cfi_restore %rbp |
384e6de4 | 4230 | lea (%r11),%rsp |
b84460ad | 4231 | .cfi_def_cfa_register %rsp |
d64a7232 AP |
4232 | .Lcbc_ret: |
4233 | ret | |
b84460ad | 4234 | .cfi_endproc |
d64a7232 AP |
4235 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt |
4236 | ___ | |
f8501464 | 4237 | } \f |
23f6eec7 | 4238 | # int ${PREFIX}_set_decrypt_key(const unsigned char *inp, |
d608b4d6 | 4239 | # int bits, AES_KEY *key) |
23f6eec7 AP |
4240 | # |
4241 | # input: $inp user-supplied key | |
4242 | # $bits $inp length in bits | |
4243 | # $key pointer to key schedule | |
4244 | # output: %eax 0 denoting success, -1 or -2 - failure (see C) | |
4245 | # *$key key schedule | |
4246 | # | |
# Implementation note: the encryption schedule is generated first by
# calling __aesni_set_encrypt_key. On success the round keys are then
# reversed in place: the two outermost keys are only swapped, every other
# key is passed through aesimc while being swapped, and the middle key is
# transformed where it stands.
d608b4d6 AP |
4247 | { my ($inp,$bits,$key) = @_4args; |
4248 | $bits =~ s/%r/%e/; | |
4249 | ||
d64a7232 AP |
4250 | $code.=<<___; |
4251 | .globl ${PREFIX}_set_decrypt_key | |
d608b4d6 | 4252 | .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent |
d64a7232 AP |
4253 | .align 16 |
4254 | ${PREFIX}_set_decrypt_key: | |
b84460ad | 4255 | .cfi_startproc |
d608b4d6 | 4256 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
b84460ad | 4257 | .cfi_adjust_cfa_offset 8 |
fb2f3411 | 4258 | call __aesni_set_encrypt_key |
d608b4d6 | 4259 | shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key |
d64a7232 AP |
4260 | test %eax,%eax |
4261 | jnz .Ldec_key_ret | |
d608b4d6 AP |
4262 | lea 16($key,$bits),$inp # points at the end of key schedule |
4263 | ||
4264 | $movkey ($key),%xmm0 # just swap | |
4265 | $movkey ($inp),%xmm1 | |
4266 | $movkey %xmm0,($inp) | |
4267 | $movkey %xmm1,($key) | |
4268 | lea 16($key),$key | |
4269 | lea -16($inp),$inp | |
4270 | ||
# the two pointers walk towards each other; the loop ends once they meet
d64a7232 | 4271 | .Ldec_key_inverse: |
d608b4d6 AP |
4272 | $movkey ($key),%xmm0 # swap and inverse |
4273 | $movkey ($inp),%xmm1 | |
d64a7232 AP |
4274 | aesimc %xmm0,%xmm0 |
4275 | aesimc %xmm1,%xmm1 | |
d608b4d6 AP |
4276 | lea 16($key),$key |
4277 | lea -16($inp),$inp | |
d608b4d6 AP |
4278 | $movkey %xmm0,16($inp) |
4279 | $movkey %xmm1,-16($key) | |
d7d119a3 | 4280 | cmp $key,$inp |
d64a7232 AP |
4281 | ja .Ldec_key_inverse |
4282 | ||
d608b4d6 | 4283 | $movkey ($key),%xmm0 # inverse middle |
d64a7232 | 4284 | aesimc %xmm0,%xmm0 |
23f6eec7 | 4285 | pxor %xmm1,%xmm1 |
d608b4d6 | 4286 | $movkey %xmm0,($inp) |
23f6eec7 | 4287 | pxor %xmm0,%xmm0 |
d64a7232 | 4288 | .Ldec_key_ret: |
d608b4d6 | 4289 | add \$8,%rsp |
b84460ad | 4290 | .cfi_adjust_cfa_offset -8 |
d64a7232 | 4291 | ret |
b84460ad | 4292 | .cfi_endproc |
d608b4d6 | 4293 | .LSEH_end_set_decrypt_key: |
d64a7232 AP |
4294 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key |
4295 | ___ | |
4296 | \f | |
e3713c36 RS |
4297 | # This is based on submission from Intel by |
4298 | # Huang Ying | |
4299 | # Vinodh Gopal | |
d64a7232 AP |
4300 | # Kahraman Akdemir |
4301 | # | |
60250017 | 4302 | # Aggressively optimized with respect to aeskeygenassist's critical path |
d64a7232 AP |
4303 | # and is contained in %xmm0-5 to meet Win64 ABI requirement. |
4304 | # | |
23f6eec7 AP |
4305 | # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, |
4306 | # int bits, AES_KEY * const key); | |
4307 | # | |
4308 | # input: $inp user-supplied key | |
4309 | # $bits $inp length in bits | |
4310 | # $key pointer to key schedule | |
4311 | # output: %eax 0 denoting success, -1 or -2 - failure (see C) | |
4312 | # $bits rounds-1 (used in aesni_set_decrypt_key) | |
4313 | # *$key key schedule | |
4314 | # $key pointer to key schedule (used in | |
4315 | # aesni_set_decrypt_key) | |
4316 | # | |
4317 | # Subroutine is frame-less, which means that only volatile registers | |
4318 | # are used. Note that it's declared "abi-omnipotent", which means that | |
4319 | # the number of volatile registers is smaller on Windows. | |
4320 | # | |
d64a7232 | 4321 | $code.=<<___; |
d608b4d6 AP |
4322 | .globl ${PREFIX}_set_encrypt_key |
4323 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent | |
d64a7232 | 4324 | .align 16 |
d608b4d6 | 4325 | ${PREFIX}_set_encrypt_key: |
fb2f3411 | 4326 | __aesni_set_encrypt_key: |
b84460ad | 4327 | .cfi_startproc |
d608b4d6 | 4328 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
b84460ad | 4329 | .cfi_adjust_cfa_offset 8 |
# -1 stays the return value if either pointer argument is null
d608b4d6 | 4330 | mov \$-1,%rax |
d7d119a3 | 4331 | test $inp,$inp |
d608b4d6 AP |
4332 | jz .Lenc_key_ret |
4333 | test $key,$key | |
4334 | jz .Lenc_key_ret | |
4335 | ||
# r10d keeps only the AVX (bit 28) and XOP (bit 11) flags of the second
# capability word; each key size branches to its _alt schedule when AVX
# is set and XOP is clear
23f6eec7 | 4336 | mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits |
d608b4d6 | 4337 | movups ($inp),%xmm0 # pull first 128 bits of *userKey |
f8501464 | 4338 | xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 |
23f6eec7 AP |
4339 | and OPENSSL_ia32cap_P+4(%rip),%r10d |
4340 | lea 16($key),%rax # %rax is used as modifiable copy of $key | |
d608b4d6 | 4341 | cmp \$256,$bits |
d64a7232 | 4342 | je .L14rounds |
d608b4d6 | 4343 | cmp \$192,$bits |
d64a7232 | 4344 | je .L12rounds |
d608b4d6 | 4345 | cmp \$128,$bits |
d64a7232 | 4346 | jne .Lbad_keybits |
d608b4d6 | 4347 | |
d64a7232 | 4348 | .L10rounds: |
d608b4d6 | 4349 | mov \$9,$bits # 10 rounds for 128-bit key |
23f6eec7 AP |
4350 | cmp \$`1<<28`,%r10d # AVX, but no XOP |
4351 | je .L10rounds_alt | |
4352 | ||
d608b4d6 | 4353 | $movkey %xmm0,($key) # round 0 |
d64a7232 AP |
4354 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 |
4355 | call .Lkey_expansion_128_cold | |
4356 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 | |
4357 | call .Lkey_expansion_128 | |
4358 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 | |
4359 | call .Lkey_expansion_128 | |
4360 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 | |
4361 | call .Lkey_expansion_128 | |
4362 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 | |
4363 | call .Lkey_expansion_128 | |
4364 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 | |
4365 | call .Lkey_expansion_128 | |
4366 | aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 | |
4367 | call .Lkey_expansion_128 | |
4368 | aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 | |
4369 | call .Lkey_expansion_128 | |
4370 | aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 | |
4371 | call .Lkey_expansion_128 | |
4372 | aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 | |
4373 | call .Lkey_expansion_128 | |
d608b4d6 AP |
4374 | $movkey %xmm0,(%rax) |
4375 | mov $bits,80(%rax) # 240(%rdx) | |
d64a7232 | 4376 | xor %eax,%eax |
d608b4d6 | 4377 | jmp .Lenc_key_ret |
d64a7232 | 4378 | |
# Alternative 128-bit schedule that does not use aeskeygenassist: each
# round key is derived with pshufb (.Lkey_rotate mask) followed by
# aesenclast against the round constant held in xmm4. The loop below
# stores 8 round keys, shifting the constant left by one bit per pass;
# the last two keys are produced by the unrolled code after the loop,
# which reloads the constant from .Lkey_rcon1b.
23f6eec7 AP |
4379 | .align 16 |
4380 | .L10rounds_alt: | |
4381 | movdqa .Lkey_rotate(%rip),%xmm5 | |
4382 | mov \$8,%r10d | |
4383 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4384 | movdqa %xmm0,%xmm2 | |
4385 | movdqu %xmm0,($key) | |
4386 | jmp .Loop_key128 | |
4387 | ||
4388 | .align 16 | |
4389 | .Loop_key128: | |
4390 | pshufb %xmm5,%xmm0 | |
4391 | aesenclast %xmm4,%xmm0 | |
4392 | pslld \$1,%xmm4 | |
4393 | lea 16(%rax),%rax | |
4394 | ||
4395 | movdqa %xmm2,%xmm3 | |
4396 | pslldq \$4,%xmm2 | |
4397 | pxor %xmm2,%xmm3 | |
4398 | pslldq \$4,%xmm2 | |
4399 | pxor %xmm2,%xmm3 | |
4400 | pslldq \$4,%xmm2 | |
4401 | pxor %xmm3,%xmm2 | |
4402 | ||
4403 | pxor %xmm2,%xmm0 | |
4404 | movdqu %xmm0,-16(%rax) | |
4405 | movdqa %xmm0,%xmm2 | |
4406 | ||
4407 | dec %r10d | |
4408 | jnz .Loop_key128 | |
4409 | ||
4410 | movdqa .Lkey_rcon1b(%rip),%xmm4 | |
4411 | ||
4412 | pshufb %xmm5,%xmm0 | |
4413 | aesenclast %xmm4,%xmm0 | |
4414 | pslld \$1,%xmm4 | |
4415 | ||
4416 | movdqa %xmm2,%xmm3 | |
4417 | pslldq \$4,%xmm2 | |
4418 | pxor %xmm2,%xmm3 | |
4419 | pslldq \$4,%xmm2 | |
4420 | pxor %xmm2,%xmm3 | |
4421 | pslldq \$4,%xmm2 | |
4422 | pxor %xmm3,%xmm2 | |
4423 | ||
4424 | pxor %xmm2,%xmm0 | |
4425 | movdqu %xmm0,(%rax) | |
4426 | ||
4427 | movdqa %xmm0,%xmm2 | |
4428 | pshufb %xmm5,%xmm0 | |
4429 | aesenclast %xmm4,%xmm0 | |
4430 | ||
4431 | movdqa %xmm2,%xmm3 | |
4432 | pslldq \$4,%xmm2 | |
4433 | pxor %xmm2,%xmm3 | |
4434 | pslldq \$4,%xmm2 | |
4435 | pxor %xmm2,%xmm3 | |
4436 | pslldq \$4,%xmm2 | |
4437 | pxor %xmm3,%xmm2 | |
4438 | ||
4439 | pxor %xmm2,%xmm0 | |
4440 | movdqu %xmm0,16(%rax) | |
4441 | ||
4442 | mov $bits,96(%rax) # 240($key) | |
4443 | xor %eax,%eax | |
4444 | jmp .Lenc_key_ret | |
4445 | ||
# 192-bit key: xmm0 already holds the first 128 bits, the remaining 64
# bits are loaded into xmm2. The two helpers alternate because they store
# different amounts: 192a writes one 16-byte round key per call, 192b
# writes two.
d64a7232 AP |
4446 | .align 16 |
4447 | .L12rounds: | |
d608b4d6 AP |
4448 | movq 16($inp),%xmm2 # remaining 1/3 of *userKey |
4449 | mov \$11,$bits # 12 rounds for 192 | |
23f6eec7 AP |
4450 | cmp \$`1<<28`,%r10d # AVX, but no XOP |
4451 | je .L12rounds_alt | |
4452 | ||
d608b4d6 | 4453 | $movkey %xmm0,($key) # round 0 |
d64a7232 AP |
4454 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 |
4455 | call .Lkey_expansion_192a_cold | |
4456 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 | |
4457 | call .Lkey_expansion_192b | |
4458 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 | |
4459 | call .Lkey_expansion_192a | |
4460 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 | |
4461 | call .Lkey_expansion_192b | |
4462 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 | |
4463 | call .Lkey_expansion_192a | |
4464 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 | |
4465 | call .Lkey_expansion_192b | |
4466 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 | |
4467 | call .Lkey_expansion_192a | |
4468 | aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 | |
4469 | call .Lkey_expansion_192b | |
d608b4d6 AP |
4470 | $movkey %xmm0,(%rax) |
4471 | mov $bits,48(%rax) # 240(%rdx) | |
d64a7232 | 4472 | xor %rax, %rax |
d608b4d6 | 4473 | jmp .Lenc_key_ret |
d64a7232 | 4474 | |
# Alternative 192-bit schedule without aeskeygenassist (pshufb with the
# .Lkey_rotate192 mask plus aesenclast against the round constant in
# xmm4). The loop runs 8 times and writes 24 bytes of schedule per pass:
# 8 bytes with movq before the pointer is advanced, 16 bytes with movdqu
# after it.
23f6eec7 AP |
4475 | .align 16 |
4476 | .L12rounds_alt: | |
4477 | movdqa .Lkey_rotate192(%rip),%xmm5 | |
4478 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4479 | mov \$8,%r10d | |
4480 | movdqu %xmm0,($key) | |
4481 | jmp .Loop_key192 | |
4482 | ||
4483 | .align 16 | |
4484 | .Loop_key192: | |
4485 | movq %xmm2,0(%rax) | |
4486 | movdqa %xmm2,%xmm1 | |
4487 | pshufb %xmm5,%xmm2 | |
4488 | aesenclast %xmm4,%xmm2 | |
4489 | pslld \$1, %xmm4 | |
4490 | lea 24(%rax),%rax | |
4491 | ||
4492 | movdqa %xmm0,%xmm3 | |
4493 | pslldq \$4,%xmm0 | |
4494 | pxor %xmm0,%xmm3 | |
4495 | pslldq \$4,%xmm0 | |
4496 | pxor %xmm0,%xmm3 | |
4497 | pslldq \$4,%xmm0 | |
4498 | pxor %xmm3,%xmm0 | |
4499 | ||
4500 | pshufd \$0xff,%xmm0,%xmm3 | |
4501 | pxor %xmm1,%xmm3 | |
4502 | pslldq \$4,%xmm1 | |
4503 | pxor %xmm1,%xmm3 | |
4504 | ||
4505 | pxor %xmm2,%xmm0 | |
4506 | pxor %xmm3,%xmm2 | |
4507 | movdqu %xmm0,-16(%rax) | |
4508 | ||
4509 | dec %r10d | |
4510 | jnz .Loop_key192 | |
4511 | ||
4512 | mov $bits,32(%rax) # 240($key) | |
4513 | xor %eax,%eax | |
4514 | jmp .Lenc_key_ret | |
4515 | ||
# 256-bit key: xmm0 holds the first half of the user key and xmm2 the
# second half. The helpers alternate: 256a computes the next key into
# xmm0, 256b computes the next key into xmm2.
d64a7232 AP |
4516 | .align 16 |
4517 | .L14rounds: | |
46f4e1be | 4518 | movups 16($inp),%xmm2 # remaining half of *userKey |
d608b4d6 AP |
4519 | mov \$13,$bits # 14 rounds for 256 |
4520 | lea 16(%rax),%rax | |
23f6eec7 AP |
4521 | cmp \$`1<<28`,%r10d # AVX, but no XOP |
4522 | je .L14rounds_alt | |
4523 | ||
d608b4d6 AP |
4524 | $movkey %xmm0,($key) # round 0 |
4525 | $movkey %xmm2,16($key) # round 1 | |
d64a7232 AP |
4526 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 |
4527 | call .Lkey_expansion_256a_cold | |
4528 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 | |
4529 | call .Lkey_expansion_256b | |
4530 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 | |
4531 | call .Lkey_expansion_256a | |
4532 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 | |
4533 | call .Lkey_expansion_256b | |
4534 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 | |
4535 | call .Lkey_expansion_256a | |
4536 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 | |
4537 | call .Lkey_expansion_256b | |
4538 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 | |
4539 | call .Lkey_expansion_256a | |
4540 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 | |
4541 | call .Lkey_expansion_256b | |
4542 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 | |
4543 | call .Lkey_expansion_256a | |
4544 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 | |
4545 | call .Lkey_expansion_256b | |
4546 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 | |
4547 | call .Lkey_expansion_256a | |
4548 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 | |
4549 | call .Lkey_expansion_256b | |
4550 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 | |
4551 | call .Lkey_expansion_256a | |
d608b4d6 AP |
4552 | $movkey %xmm0,(%rax) |
4553 | mov $bits,16(%rax) # 240(%rdx) | |
d64a7232 | 4554 | xor %rax,%rax |
d608b4d6 AP |
4555 | jmp .Lenc_key_ret |
4556 | ||
# Alternative 256-bit schedule without aeskeygenassist. The loop counter
# starts at 7; every pass stores one round key at (rax), and every pass
# except the last stores a second one at 16(rax) before advancing rax by
# 32. The second key of a pass is derived with pshufd 0xff and aesenclast
# against an all-zero xmm3, i.e. without the rotate mask or round
# constant.
23f6eec7 AP |
4557 | .align 16 |
4558 | .L14rounds_alt: | |
4559 | movdqa .Lkey_rotate(%rip),%xmm5 | |
4560 | movdqa .Lkey_rcon1(%rip),%xmm4 | |
4561 | mov \$7,%r10d | |
4562 | movdqu %xmm0,0($key) | |
4563 | movdqa %xmm2,%xmm1 | |
4564 | movdqu %xmm2,16($key) | |
4565 | jmp .Loop_key256 | |
4566 | ||
4567 | .align 16 | |
4568 | .Loop_key256: | |
4569 | pshufb %xmm5,%xmm2 | |
4570 | aesenclast %xmm4,%xmm2 | |
4571 | ||
4572 | movdqa %xmm0,%xmm3 | |
4573 | pslldq \$4,%xmm0 | |
4574 | pxor %xmm0,%xmm3 | |
4575 | pslldq \$4,%xmm0 | |
4576 | pxor %xmm0,%xmm3 | |
4577 | pslldq \$4,%xmm0 | |
4578 | pxor %xmm3,%xmm0 | |
4579 | pslld \$1,%xmm4 | |
4580 | ||
4581 | pxor %xmm2,%xmm0 | |
4582 | movdqu %xmm0,(%rax) | |
4583 | ||
4584 | dec %r10d | |
4585 | jz .Ldone_key256 | |
4586 | ||
4587 | pshufd \$0xff,%xmm0,%xmm2 | |
4588 | pxor %xmm3,%xmm3 | |
4589 | aesenclast %xmm3,%xmm2 | |
4590 | ||
4591 | movdqa %xmm1,%xmm3 | |
4592 | pslldq \$4,%xmm1 | |
4593 | pxor %xmm1,%xmm3 | |
4594 | pslldq \$4,%xmm1 | |
4595 | pxor %xmm1,%xmm3 | |
4596 | pslldq \$4,%xmm1 | |
4597 | pxor %xmm3,%xmm1 | |
4598 | ||
4599 | pxor %xmm1,%xmm2 | |
4600 | movdqu %xmm2,16(%rax) | |
4601 | lea 32(%rax),%rax | |
4602 | movdqa %xmm2,%xmm1 | |
4603 | ||
4604 | jmp .Loop_key256 | |
4605 | ||
4606 | .Ldone_key256: | |
4607 | mov $bits,16(%rax) # 240($key) | |
4608 | xor %eax,%eax | |
4609 | jmp .Lenc_key_ret | |
4610 | ||
# Common exit. .Lbad_keybits is reached when the key length is none of
# 128, 192 or 256 and sets the return value to -2. Every path then passes
# through .Lenc_key_ret, which zeroes xmm0-xmm5 (the only vector registers
# this routine uses, presumably so that no key material is left behind)
# and releases the 8 bytes reserved at entry.
d608b4d6 AP |
4611 | .align 16 |
4612 | .Lbad_keybits: | |
4613 | mov \$-2,%rax | |
4614 | .Lenc_key_ret: | |
23f6eec7 AP |
4615 | pxor %xmm0,%xmm0 |
4616 | pxor %xmm1,%xmm1 | |
4617 | pxor %xmm2,%xmm2 | |
4618 | pxor %xmm3,%xmm3 | |
4619 | pxor %xmm4,%xmm4 | |
4620 | pxor %xmm5,%xmm5 | |
d608b4d6 | 4621 | add \$8,%rsp |
b84460ad | 4622 | .cfi_adjust_cfa_offset -8 |
d608b4d6 | 4623 | ret |
b84460ad | 4624 | .cfi_endproc |
d608b4d6 AP |
4625 | .LSEH_end_set_encrypt_key: |
4626 | \f | |
# Key-expansion helpers, reached by call from the aeskeygenassist
# sequences of the non-alt schedules. rax is the running output pointer
# and xmm1 carries the aeskeygenassist result. Entering at a plain label
# first stores the most recently completed round key and advances rax;
# entering at the matching _cold label skips that store, which is what the
# very first call of each schedule does.
4627 | .align 16 | |
4628 | .Lkey_expansion_128: | |
4629 | $movkey %xmm0,(%rax) | |
4630 | lea 16(%rax),%rax | |
4631 | .Lkey_expansion_128_cold: | |
4632 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 | 4633 | xorps %xmm4, %xmm0 |
d608b4d6 | 4634 | shufps \$0b10001100,%xmm0,%xmm4 |
f8501464 AP |
4635 | xorps %xmm4, %xmm0 |
4636 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | |
4637 | xorps %xmm1,%xmm0 | |
d608b4d6 AP |
4638 | ret |
4639 | ||
# 192a stores one 16-byte round key, keeps a copy of xmm2 in xmm5 for the
# following 192b call, and falls into the body shared with 192b
4640 | .align 16 | |
4641 | .Lkey_expansion_192a: | |
4642 | $movkey %xmm0,(%rax) | |
4643 | lea 16(%rax),%rax | |
4644 | .Lkey_expansion_192a_cold: | |
4645 | movaps %xmm2, %xmm5 | |
4646 | .Lkey_expansion_192b_warm: | |
4647 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 AP |
4648 | movdqa %xmm2,%xmm3 |
4649 | xorps %xmm4,%xmm0 | |
d608b4d6 AP |
4650 | shufps \$0b10001100,%xmm0,%xmm4 |
4651 | pslldq \$4,%xmm3 | |
f8501464 | 4652 | xorps %xmm4,%xmm0 |
d608b4d6 AP |
4653 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path |
4654 | pxor %xmm3,%xmm2 | |
4655 | pxor %xmm1,%xmm0 | |
4656 | pshufd \$0b11111111,%xmm0,%xmm3 | |
4657 | pxor %xmm3,%xmm2 | |
d64a7232 AP |
4658 | ret |
4659 | ||
# 192b stores 32 bytes of schedule, assembled with shufps from xmm5, xmm0
# and xmm2, then jumps into the shared body above
d608b4d6 AP |
4660 | .align 16 |
4661 | .Lkey_expansion_192b: | |
4662 | movaps %xmm0,%xmm3 | |
4663 | shufps \$0b01000100,%xmm0,%xmm5 | |
4664 | $movkey %xmm5,(%rax) | |
4665 | shufps \$0b01001110,%xmm2,%xmm3 | |
4666 | $movkey %xmm3,16(%rax) | |
4667 | lea 32(%rax),%rax | |
4668 | jmp .Lkey_expansion_192b_warm | |
4669 | ||
# 256a stores xmm2 and derives the next key in xmm0
d64a7232 AP |
4670 | .align 16 |
4671 | .Lkey_expansion_256a: | |
d608b4d6 AP |
4672 | $movkey %xmm2,(%rax) |
4673 | lea 16(%rax),%rax | |
d64a7232 AP |
4674 | .Lkey_expansion_256a_cold: |
4675 | shufps \$0b00010000,%xmm0,%xmm4 | |
f8501464 | 4676 | xorps %xmm4,%xmm0 |
d64a7232 | 4677 | shufps \$0b10001100,%xmm0,%xmm4 |
f8501464 AP |
4678 | xorps %xmm4,%xmm0 |
4679 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | |
4680 | xorps %xmm1,%xmm0 | |
d64a7232 AP |
4681 | ret |
4682 | ||
# 256b stores xmm0 and derives the next key in xmm2; note the different
# dword of xmm1 that is broadcast (0b10101010 instead of 0b11111111)
4683 | .align 16 | |
4684 | .Lkey_expansion_256b: | |
d608b4d6 AP |
4685 | $movkey %xmm0,(%rax) |
4686 | lea 16(%rax),%rax | |
d64a7232 AP |
4687 | |
4688 | shufps \$0b00010000,%xmm2,%xmm4 | |
f8501464 | 4689 | xorps %xmm4,%xmm2 |
d64a7232 | 4690 | shufps \$0b10001100,%xmm2,%xmm4 |
f8501464 AP |
4691 | xorps %xmm4,%xmm2 |
4692 | shufps \$0b10101010,%xmm1,%xmm1 # critical path | |
4693 | xorps %xmm1,%xmm2 | |
d64a7232 | 4694 | ret |
d608b4d6 | 4695 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key |
f8501464 | 4696 | .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key |
d64a7232 AP |
4697 | ___ |
4698 | } | |
4699 | \f | |
# Read-only constant pool, aligned to 64 bytes. The four .Lkey_* entries
# are loaded by the alternative (AVX, no XOP) key schedules:
# .Lkey_rotate and .Lkey_rotate192 are pshufb masks, .Lkey_rcon1 and
# .Lkey_rcon1b are round constants replicated across all four dwords.
# The other labels are referenced by code outside this part of the file.
4700 | $code.=<<___; | |
6c83629b AP |
4701 | .align 64 |
4702 | .Lbswap_mask: | |
4703 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |
d7d119a3 | 4704 | .Lincrement32: |
f8501464 | 4705 | .long 6,6,6,0 |
d7d119a3 AP |
4706 | .Lincrement64: |
4707 | .long 1,0,0,0 | |
f8501464 AP |
4708 | .Lxts_magic: |
4709 | .long 0x87,0,1,0 | |
9282c335 AP |
4710 | .Lincrement1: |
4711 | .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 | |
23f6eec7 AP |
4712 | .Lkey_rotate: |
4713 | .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d | |
4714 | .Lkey_rotate192: | |
4715 | .long 0x04070605,0x04070605,0x04070605,0x04070605 | |
4716 | .Lkey_rcon1: | |
4717 | .long 1,1,1,1 | |
4718 | .Lkey_rcon1b: | |
4719 | .long 0x1b,0x1b,0x1b,0x1b | |
f8501464 | 4720 | |
d64a7232 AP |
4721 | .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" |
4722 | .align 64 | |
4723 | ___ | |
4724 | ||
# Win64 only: exception handlers and, further down, the .pdata/.xdata
# tables that register them. Each handler works out the stack pointer and
# saved registers that were live at the faulting instruction, writes them
# into the CONTEXT record and then continues at .Lcommon_seh_tail, which
# calls RtlVirtualUnwind and returns ExceptionContinueSearch.
#
# ecb_ccm64_se_handler: HandlerData holds two RVAs (body label, ret
# label). When Rip lies between them, 4 xmm registers (8 qwords) are
# copied from the save area at context->Rsp into the context starting at
# Xmm6 and the recovered stack pointer is advanced by 0x58.
d608b4d6 AP |
4725 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
4726 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
4727 | if ($win64) { | |
4728 | $rec="%rcx"; | |
4729 | $frame="%rdx"; | |
4730 | $context="%r8"; | |
4731 | $disp="%r9"; | |
4732 | ||
4733 | $code.=<<___; | |
4734 | .extern __imp_RtlVirtualUnwind | |
6c83629b AP |
4735 | ___ |
4736 | $code.=<<___ if ($PREFIX eq "aesni"); | |
69d5747f | 4737 | .type ecb_ccm64_se_handler,\@abi-omnipotent |
d7d119a3 | 4738 | .align 16 |
69d5747f | 4739 | ecb_ccm64_se_handler: |
d7d119a3 AP |
4740 | push %rsi |
4741 | push %rdi | |
4742 | push %rbx | |
4743 | push %rbp | |
4744 | push %r12 | |
4745 | push %r13 | |
4746 | push %r14 | |
4747 | push %r15 | |
4748 | pushfq | |
4749 | sub \$64,%rsp | |
4750 | ||
4751 | mov 120($context),%rax # pull context->Rax | |
4752 | mov 248($context),%rbx # pull context->Rip | |
4753 | ||
4754 | mov 8($disp),%rsi # disp->ImageBase | |
02f358da | 4755 | mov 56($disp),%r11 # disp->HandlerData |
d7d119a3 AP |
4756 | |
4757 | mov 0(%r11),%r10d # HandlerData[0] | |
4758 | lea (%rsi,%r10),%r10 # prologue label | |
4759 | cmp %r10,%rbx # context->Rip<prologue label | |
f8501464 | 4760 | jb .Lcommon_seh_tail |
d7d119a3 AP |
4761 | |
4762 | mov 152($context),%rax # pull context->Rsp | |
4763 | ||
4764 | mov 4(%r11),%r10d # HandlerData[1] | |
4765 | lea (%rsi,%r10),%r10 # epilogue label | |
4766 | cmp %r10,%rbx # context->Rip>=epilogue label | |
f8501464 | 4767 | jae .Lcommon_seh_tail |
d7d119a3 | 4768 | |
f8501464 | 4769 | lea 0(%rax),%rsi # %xmm save area |
d7d119a3 AP |
4770 | lea 512($context),%rdi # &context.Xmm6 |
4771 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | |
4772 | .long 0xa548f3fc # cld; rep movsq | |
4773 | lea 0x58(%rax),%rax # adjust stack pointer | |
4774 | ||
f8501464 | 4775 | jmp .Lcommon_seh_tail |
69d5747f | 4776 | .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler |
d7d119a3 | 4777 | |
# ctr_xts_se_handler: HandlerData holds two RVAs (body label, epilogue
# label). When Rip lies between them the frame is located through
# context->R11: 10 xmm registers (20 qwords) are copied from R11-0xa8
# into the context starting at Xmm6, the saved rbp is read from R11-8,
# and R11 itself becomes the recovered stack pointer.
6c79faaa | 4778 | .type ctr_xts_se_handler,\@abi-omnipotent |
6c83629b | 4779 | .align 16 |
6c79faaa | 4780 | ctr_xts_se_handler: |
f8501464 AP |
4781 | push %rsi |
4782 | push %rdi | |
4783 | push %rbx | |
4784 | push %rbp | |
4785 | push %r12 | |
4786 | push %r13 | |
4787 | push %r14 | |
4788 | push %r15 | |
4789 | pushfq | |
4790 | sub \$64,%rsp | |
4791 | ||
4792 | mov 120($context),%rax # pull context->Rax | |
4793 | mov 248($context),%rbx # pull context->Rip | |
4794 | ||
4795 | mov 8($disp),%rsi # disp->ImageBase | |
4796 | mov 56($disp),%r11 # disp->HandlerData | |
4797 | ||
4798 | mov 0(%r11),%r10d # HandlerData[0] | |
4799 | lea (%rsi,%r10),%r10 # prologue label | |
4800 | cmp %r10,%rbx # context->Rip<prologue label | |
4801 | jb .Lcommon_seh_tail | |
4802 | ||
4803 | mov 152($context),%rax # pull context->Rsp | |
4804 | ||
4805 | mov 4(%r11),%r10d # HandlerData[1] | |
4806 | lea (%rsi,%r10),%r10 # epilogue label | |
4807 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4808 | jae .Lcommon_seh_tail | |
4809 | ||
384e6de4 AP |
4810 | mov 208($context),%rax # pull context->R11 |
4811 | ||
4812 | lea -0xa8(%rax),%rsi # %xmm save area | |
f8501464 AP |
4813 | lea 512($context),%rdi # & context.Xmm6 |
4814 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |
4815 | .long 0xa548f3fc # cld; rep movsq | |
f8501464 | 4816 | |
384e6de4 AP |
4817 | mov -8(%rax),%rbp # restore saved %rbp |
4818 | mov %rbp,160($context) # restore context->Rbp | |
4819 | jmp .Lcommon_seh_tail | |
6c79faaa | 4820 | .size ctr_xts_se_handler,.-ctr_xts_se_handler |
bd30091c AP |
4821 | |
# ocb_se_handler: HandlerData holds three RVAs (body, epilogue and pop
# labels). For Rip between body and epilogue there are two cases. Before
# the pop label, 10 xmm registers are restored from the save area at
# context->Rsp and rax is advanced by 0xa0+0x28. From the pop label on,
# the xmm copy is skipped and rax keeps the value read from context->Rax.
# In both cases rbx, rbp, r12, r13 and r14 are then reloaded from the five
# qwords below rax.
4822 | .type ocb_se_handler,\@abi-omnipotent | |
4823 | .align 16 | |
4824 | ocb_se_handler: | |
4825 | push %rsi | |
4826 | push %rdi | |
4827 | push %rbx | |
4828 | push %rbp | |
4829 | push %r12 | |
4830 | push %r13 | |
4831 | push %r14 | |
4832 | push %r15 | |
4833 | pushfq | |
4834 | sub \$64,%rsp | |
4835 | ||
4836 | mov 120($context),%rax # pull context->Rax | |
4837 | mov 248($context),%rbx # pull context->Rip | |
4838 | ||
4839 | mov 8($disp),%rsi # disp->ImageBase | |
4840 | mov 56($disp),%r11 # disp->HandlerData | |
4841 | ||
4842 | mov 0(%r11),%r10d # HandlerData[0] | |
4843 | lea (%rsi,%r10),%r10 # prologue label | |
4844 | cmp %r10,%rbx # context->Rip<prologue label | |
4845 | jb .Lcommon_seh_tail | |
4846 | ||
4847 | mov 4(%r11),%r10d # HandlerData[1] | |
4848 | lea (%rsi,%r10),%r10 # epilogue label | |
4849 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4850 | jae .Lcommon_seh_tail | |
4851 | ||
4852 | mov 8(%r11),%r10d # HandlerData[2] | |
4853 | lea (%rsi,%r10),%r10 | |
4854 | cmp %r10,%rbx # context->Rip>=pop label | |
4855 | jae .Locb_no_xmm | |
4856 | ||
4857 | mov 152($context),%rax # pull context->Rsp | |
4858 | ||
4859 | lea (%rax),%rsi # %xmm save area | |
4860 | lea 512($context),%rdi # & context.Xmm6 | |
4861 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |
4862 | .long 0xa548f3fc # cld; rep movsq | |
4863 | lea 0xa0+0x28(%rax),%rax | |
4864 | ||
4865 | .Locb_no_xmm: | |
4866 | mov -8(%rax),%rbx | |
4867 | mov -16(%rax),%rbp | |
4868 | mov -24(%rax),%r12 | |
4869 | mov -32(%rax),%r13 | |
4870 | mov -40(%rax),%r14 | |
4871 | ||
4872 | mov %rbx,144($context) # restore context->Rbx | |
4873 | mov %rbp,160($context) # restore context->Rbp | |
4874 | mov %r12,216($context) # restore context->R12 | |
4875 | mov %r13,224($context) # restore context->R13 | |
4876 | mov %r14,232($context) # restore context->R14 | |
4877 | ||
4878 | jmp .Lcommon_seh_tail | |
4879 | .size ocb_se_handler,.-ocb_se_handler | |
6c83629b AP |
4880 | ___ |
# cbc_se_handler is emitted for every PREFIX and compares Rip against
# fixed labels instead of HandlerData:
#   Rip <  .Lcbc_decrypt_bulk   stack pointer = context->Rsp, nothing else
#   Rip <  .Lcbc_decrypt_body   stack pointer = context->Rax
#   Rip >= .Lcbc_ret            stack pointer = context->Rsp
#   otherwise                   10 xmm registers are restored from Rsp+16,
#                               then the stack pointer and the saved rbp
#                               are recovered through context->R11
# It then falls into .Lcommon_seh_tail, which all handlers share.
4881 | $code.=<<___; | |
4882 | .type cbc_se_handler,\@abi-omnipotent | |
d608b4d6 | 4883 | .align 16 |
6c83629b | 4884 | cbc_se_handler: |
d608b4d6 AP |
4885 | push %rsi |
4886 | push %rdi | |
4887 | push %rbx | |
4888 | push %rbp | |
4889 | push %r12 | |
4890 | push %r13 | |
4891 | push %r14 | |
4892 | push %r15 | |
4893 | pushfq | |
4894 | sub \$64,%rsp | |
4895 | ||
4896 | mov 152($context),%rax # pull context->Rsp | |
6c83629b AP |
4897 | mov 248($context),%rbx # pull context->Rip |
4898 | ||
23f6eec7 | 4899 | lea .Lcbc_decrypt_bulk(%rip),%r10 |
6c83629b | 4900 | cmp %r10,%rbx # context->Rip<"prologue" label |
f8501464 | 4901 | jb .Lcommon_seh_tail |
6c83629b | 4902 | |
384e6de4 AP |
4903 | mov 120($context),%rax # pull context->Rax |
4904 | ||
6c83629b AP |
4905 | lea .Lcbc_decrypt_body(%rip),%r10 |
4906 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | |
384e6de4 AP |
4907 | jb .Lcommon_seh_tail |
4908 | ||
4909 | mov 152($context),%rax # pull context->Rsp | |
6c83629b AP |
4910 | |
4911 | lea .Lcbc_ret(%rip),%r10 | |
4912 | cmp %r10,%rbx # context->Rip>="epilogue" label | |
f8501464 | 4913 | jae .Lcommon_seh_tail |
6c83629b | 4914 | |
6a40ebe8 | 4915 | lea 16(%rax),%rsi # %xmm save area |
6c83629b | 4916 | lea 512($context),%rdi # &context.Xmm6 |
73325b22 | 4917 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) |
6c83629b | 4918 | .long 0xa548f3fc # cld; rep movsq |
6a40ebe8 | 4919 | |
384e6de4 | 4920 | mov 208($context),%rax # pull context->R11 |
6c83629b | 4921 | |
384e6de4 AP |
4922 | mov -8(%rax),%rbp # restore saved %rbp |
4923 | mov %rbp,160($context) # restore context->Rbp | |
# Shared tail: on entry rax holds the recovered stack pointer. rdi and rsi
# are reloaded from 8(rax) and 16(rax), the updated context is copied to
# disp->ContextRecord, RtlVirtualUnwind is called, and the handler returns
# ExceptionContinueSearch after popping what its own prologue pushed.
f8501464 AP |
4924 | |
4925 | .Lcommon_seh_tail: | |
d608b4d6 AP |
4926 | mov 8(%rax),%rdi |
4927 | mov 16(%rax),%rsi | |
6c83629b | 4928 | mov %rax,152($context) # restore context->Rsp |
d608b4d6 AP |
4929 | mov %rsi,168($context) # restore context->Rsi |
4930 | mov %rdi,176($context) # restore context->Rdi | |
4931 | ||
d608b4d6 AP |
4932 | mov 40($disp),%rdi # disp->ContextRecord |
4933 | mov $context,%rsi # context | |
4934 | mov \$154,%ecx # sizeof(CONTEXT)/sizeof(%rax) | |
4935 | .long 0xa548f3fc # cld; rep movsq | |
4936 | ||
4937 | mov $disp,%rsi | |
4938 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
4939 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
4940 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
4941 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
4942 | mov 40(%rsi),%r10 # disp->ContextRecord | |
4943 | lea 56(%rsi),%r11 # &disp->HandlerData | |
4944 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
4945 | mov %r10,32(%rsp) # arg5 | |
4946 | mov %r11,40(%rsp) # arg6 | |
4947 | mov %r12,48(%rsp) # arg7 | |
4948 | mov %rcx,56(%rsp) # arg8, (NULL) | |
4949 | call *__imp_RtlVirtualUnwind(%rip) | |
4950 | ||
4951 | mov \$1,%eax # ExceptionContinueSearch | |
4952 | add \$64,%rsp | |
4953 | popfq | |
4954 | pop %r15 | |
4955 | pop %r14 | |
4956 | pop %r13 | |
4957 | pop %r12 | |
4958 | pop %rbp | |
4959 | pop %rbx | |
4960 | pop %rdi | |
4961 | pop %rsi | |
4962 | ret | |
4963 | .size cbc_se_handler,.-cbc_se_handler | |
4964 | ||
# .pdata: one entry of three RVAs per covered function - start of the
# function, end of the function, and its unwind-info record in .xdata.
# The entries between the two PREFIX checks exist only in the "aesni"
# build; cbc_encrypt and the two key-setup routines are always listed.
# .xdata: each .LSEH_info_* record that names a handler is followed by the
# HandlerData RVAs which that handler reads (none for cbc, three for ocb).
4965 | .section .pdata | |
4966 | .align 4 | |
6c83629b AP |
4967 | ___ |
4968 | $code.=<<___ if ($PREFIX eq "aesni"); | |
4969 | .rva .LSEH_begin_aesni_ecb_encrypt | |
4970 | .rva .LSEH_end_aesni_ecb_encrypt | |
d608b4d6 AP |
4971 | .rva .LSEH_info_ecb |
4972 | ||
d7d119a3 AP |
4973 | .rva .LSEH_begin_aesni_ccm64_encrypt_blocks |
4974 | .rva .LSEH_end_aesni_ccm64_encrypt_blocks | |
02f358da | 4975 | .rva .LSEH_info_ccm64_enc |
d7d119a3 AP |
4976 | |
4977 | .rva .LSEH_begin_aesni_ccm64_decrypt_blocks | |
4978 | .rva .LSEH_end_aesni_ccm64_decrypt_blocks | |
02f358da | 4979 | .rva .LSEH_info_ccm64_dec |
d7d119a3 | 4980 | |
6c83629b AP |
4981 | .rva .LSEH_begin_aesni_ctr32_encrypt_blocks |
4982 | .rva .LSEH_end_aesni_ctr32_encrypt_blocks | |
4983 | .rva .LSEH_info_ctr32 | |
f8501464 AP |
4984 | |
4985 | .rva .LSEH_begin_aesni_xts_encrypt | |
4986 | .rva .LSEH_end_aesni_xts_encrypt | |
4987 | .rva .LSEH_info_xts_enc | |
4988 | ||
4989 | .rva .LSEH_begin_aesni_xts_decrypt | |
4990 | .rva .LSEH_end_aesni_xts_decrypt | |
4991 | .rva .LSEH_info_xts_dec | |
bd30091c AP |
4992 | |
4993 | .rva .LSEH_begin_aesni_ocb_encrypt | |
4994 | .rva .LSEH_end_aesni_ocb_encrypt | |
4995 | .rva .LSEH_info_ocb_enc | |
4996 | ||
4997 | .rva .LSEH_begin_aesni_ocb_decrypt | |
4998 | .rva .LSEH_end_aesni_ocb_decrypt | |
4999 | .rva .LSEH_info_ocb_dec | |
6c83629b AP |
5000 | ___ |
5001 | $code.=<<___; | |
d608b4d6 AP |
5002 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt |
5003 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | |
5004 | .rva .LSEH_info_cbc | |
5005 | ||
d608b4d6 AP |
5006 | .rva ${PREFIX}_set_decrypt_key |
5007 | .rva .LSEH_end_set_decrypt_key | |
5008 | .rva .LSEH_info_key | |
c5036d78 AP |
5009 | |
5010 | .rva ${PREFIX}_set_encrypt_key | |
5011 | .rva .LSEH_end_set_encrypt_key | |
5012 | .rva .LSEH_info_key | |
d608b4d6 AP |
5013 | .section .xdata |
5014 | .align 8 | |
6c83629b AP |
5015 | ___ |
5016 | $code.=<<___ if ($PREFIX eq "aesni"); | |
d608b4d6 AP |
5017 | .LSEH_info_ecb: |
5018 | .byte 9,0,0,0 | |
69d5747f AP |
5019 | .rva ecb_ccm64_se_handler |
5020 | .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] | |
02f358da | 5021 | .LSEH_info_ccm64_enc: |
d7d119a3 | 5022 | .byte 9,0,0,0 |
69d5747f | 5023 | .rva ecb_ccm64_se_handler |
02f358da AP |
5024 | .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] |
5025 | .LSEH_info_ccm64_dec: | |
5026 | .byte 9,0,0,0 | |
69d5747f | 5027 | .rva ecb_ccm64_se_handler |
02f358da | 5028 | .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] |
6c83629b AP |
5029 | .LSEH_info_ctr32: |
5030 | .byte 9,0,0,0 | |
6c79faaa AP |
5031 | .rva ctr_xts_se_handler |
5032 | .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] | |
f8501464 AP |
5033 | .LSEH_info_xts_enc: |
5034 | .byte 9,0,0,0 | |
6c79faaa | 5035 | .rva ctr_xts_se_handler |
f8501464 AP |
5036 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] |
5037 | .LSEH_info_xts_dec: | |
5038 | .byte 9,0,0,0 | |
6c79faaa | 5039 | .rva ctr_xts_se_handler |
f8501464 | 5040 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] |
bd30091c AP |
5041 | .LSEH_info_ocb_enc: |
5042 | .byte 9,0,0,0 | |
5043 | .rva ocb_se_handler | |
5044 | .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] | |
5045 | .rva .Locb_enc_pop | |
5046 | .long 0 | |
5047 | .LSEH_info_ocb_dec: | |
5048 | .byte 9,0,0,0 | |
5049 | .rva ocb_se_handler | |
5050 | .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] | |
5051 | .rva .Locb_dec_pop | |
5052 | .long 0 | |
6c83629b AP |
5053 | ___ |
5054 | $code.=<<___; | |
d608b4d6 AP |
5055 | .LSEH_info_cbc: |
5056 | .byte 9,0,0,0 | |
5057 | .rva cbc_se_handler | |
5058 | .LSEH_info_key: | |
5059 | .byte 0x01,0x04,0x01,0x00 | |
d7d119a3 | 5060 | .byte 0x04,0x02,0x00,0x00 # sub rsp,8 |
d608b4d6 AP |
5061 | ___ |
5062 | } | |
5063 | ||
# Append a REX prefix byte to an opcode sequence when one is required.
#
# Arguments:
#   1. reference to the array of opcode bytes being built (modified in place)
#   2. $dst - number of the register encoded in the ModR/M "reg" field
#   3. $src - number of the register encoded in the ModR/M "r/m" field
#
# A register number of 8 or more does not fit in the 3-bit ModR/M field, so
# its high bit is carried in the prefix: 0x04 (REX.R) for $dst and 0x01
# (REX.B) for $src.  Nothing is appended when both registers are below 8.
sub rex {
    my $bytes = shift;
    my ($dst, $src) = @_;

    my $ext_bits = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    push @{$bytes}, 0x40 | $ext_bits if ($ext_bits);
}
5073 | ||
# Translate one AES-NI instruction into an equivalent ".byte" directive so
# that the output can be assembled by tools that lack AES-NI mnemonics.
#
# Argument:
#   $line - instruction text in AT&T syntax, e.g. "aesenc %xmm1,%xmm2"
#
# Returns the ".byte\t..." directive (bytes printed in decimal) for the
# three forms recognised below.  Anything else - including an aes*
# mnemonic that has no encoding in the matching form's table - is returned
# unchanged, so that the assembler sees it and can either accept it or
# report it, instead of the instruction silently vanishing from the output.
#
# Recognised forms:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # all forms handled here start with this prefix

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures right away so later matches cannot clobber them.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # A leading "0" marks a hex/octal literal; otherwise it is decimal.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: leave the text alone rather than dropping it.
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: leave the text alone rather than dropping it.
        return $line if (!defined($opcodelet{$mnemonic}));

        # An omitted offset means 0(%rsp).
        my $disp = 0;
        if ($off ne "") {
            $disp = $off =~ /^0/ ? oct($off) : $off;
        }

        # The encoding below uses an 8-bit displacement, which the CPU
        # sign-extends.  An offset above 0x7f would therefore be encoded as
        # a negative one; hand such lines to the assembler untouched.
        return $line if ($disp > 0x7f);

        push @opcode, 0x44 if ($dst >= 8);                   # REX.R only
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;        # ModR/M, SIB
        push @opcode, $disp & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
5113 | ||
5599c733 AP |
# Build the ".byte" directive that encodes "movbe %eax,<disp>(%rsp)".
#
# Argument:
#   displacement text, appended verbatim as the final byte of the directive
#
# Returns the directive as a string.  No range check is done on the
# displacement; the caller's pattern only supplies decimal digits.
sub movbe {
    my $disp = shift;

    return join(',', ".byte 0x0f,0x38,0xf1,0x44,0x24", $disp);
}
5117 | ||
d64a7232 AP |
# Post-process the accumulated assembly text, then write it out.

# Replace every `...` span with the result of evaluating it as Perl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Hand each AES instruction that ends in an %xmm operand to aesni(), which
# returns a raw ".byte" encoding for the forms it knows.  Anything that
# follows the last %xmm operand on the line is discarded by this pattern.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact

# Likewise encode "movbe %eax,<disp>(%rsp)" by hand via movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Output is buffered, so a write failure (full disk, closed pipe) may only
# surface here.  Fail loudly instead of leaving a truncated assembly file
# behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";