1 #! /usr/bin/env perl
2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Authors: Emilia Käsper and Peter Schwabe                 ###
15 ### Date: 2009-03-19 ###
16 ### Public domain ###
17 ### ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
21 #
22 # September 2011.
23 #
24 # Started as a transliteration to "perlasm", the original code has
25 # undergone the following changes:
26 #
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - the above was possible thanks to a mixcolumns() modification that
31 #   allowed its output to be fed back to aesenc[last]; this was
32 #   achieved at the cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement a key setup subroutine; instead it
35 #   relies on converting the "conventional" key schedule as returned
36 #   by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which allowed
38 #   skipping one shiftrows(), reducing the bit-sliced key schedule and
39 #   speeding up the conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
41 #
42 # The resulting performance, in CPU cycles spent to encrypt one byte
43 # of a 4096-byte buffer with a 128-bit key, is:
44 #
45 # Emilia's this(*) difference
46 #
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
49 # Atom 17.1 16.4 +4%
50 # Silvermont - 12.9
51 #
52 # (*)	Comparison is not completely fair, because "this" is ECB,
53 #	i.e. no extra processing such as counter value calculation and
54 #	xor-ing of the input, as in Emilia's CTR implementation, is
55 #	performed. However, the CTR calculations account for no more than
56 #	1% of the total time, so the comparison is still rather fair.
57 #
58 # (**) Results were collected on Westmere, which is considered to
59 # be equivalent to Nehalem for this code.
60 #
61 # As for the key schedule conversion subroutine: the interface to
62 # OpenSSL relies on per-invocation on-the-fly conversion, which
63 # naturally has an impact on performance, especially for short inputs.
64 # Conversion time in CPU cycles, and its ratio to the CPU cycles spent
65 # in the 8x block function, is:
66 #
67 # conversion conversion/8x block
68 # Core 2 240 0.22
69 # Nehalem 180 0.20
70 # Atom 430 0.20
71 #
72 # The ratio values mean that 128-byte blocks will be processed
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74 # etc. Keep in mind also that input sizes not divisible by 128 are
75 # *effectively* slower, especially the shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet despite all these "shortcomings"
78 # it's still faster than the ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
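#
# To make the conversion table concrete: on Core 2 the 0.22 ratio means
# one 8x block call costs roughly 240/0.22 ~= 1090 cycles, so a one-shot
# 128-byte operation spends ~240 cycles converting out of ~1330 total,
# i.e. ~18% - the upper end of the 16-18% figure above - while 256 bytes
# give 240/(240+2180) ~= 10%, 384 bytes ~7%, and so on.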
80 #
81 # October 2011.
82 #
83 # Add a decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte of a 4096-byte buffer with a 128-bit key is:
85 #
86 # Core 2 9.98
87 # Nehalem 7.80
88 # Atom 17.9
89 # Silvermont 14.0
90 #
91 # November 2011.
92 #
93 # Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80 bytes
94 # is suboptimal, but XTS is meant to be used with larger blocks...
95 #
96 # <appro@openssl.org>
97
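# Like the rest of the x86_64 perlasm pack, this script takes the output
# flavour as its first argument and the output file as its second, e.g.
# (file names here are illustrative):
#
#	perl bsaes-x86_64.pl elf bsaes-x86_64.s
#	perl bsaes-x86_64.pl nasm bsaes-x86_64.asm
#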
98 $flavour = shift;
99 $output = shift;
100 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
101
102 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
103
104 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
105 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
106 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
107 die "can't locate x86_64-xlate.pl";
108
109 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
110 *STDOUT=*OUT;
111
112 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
113 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
114 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
115
116 {
117 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
118
119 sub Sbox {
120 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
121 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
122 my @b=@_[0..7];
123 my @t=@_[8..11];
124 my @s=@_[12..15];
125 &InBasisChange (@b);
126 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
127 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
128 }
129
130 sub InBasisChange {
131 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
132 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
133 my @b=@_[0..7];
134 $code.=<<___;
135 pxor @b[6], @b[5]
136 pxor @b[1], @b[2]
137 pxor @b[0], @b[3]
138 pxor @b[2], @b[6]
139 pxor @b[0], @b[5]
140
141 pxor @b[3], @b[6]
142 pxor @b[7], @b[3]
143 pxor @b[5], @b[7]
144 pxor @b[4], @b[3]
145 pxor @b[5], @b[4]
146 pxor @b[1], @b[3]
147
148 pxor @b[7], @b[2]
149 pxor @b[5], @b[1]
150 ___
151 }
152
153 sub OutBasisChange {
154 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
155 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
156 my @b=@_[0..7];
157 $code.=<<___;
158 pxor @b[6], @b[0]
159 pxor @b[4], @b[1]
160 pxor @b[0], @b[2]
161 pxor @b[6], @b[4]
162 pxor @b[1], @b[6]
163
164 pxor @b[5], @b[1]
165 pxor @b[3], @b[5]
166 pxor @b[7], @b[3]
167 pxor @b[5], @b[7]
168 pxor @b[5], @b[2]
169
170 pxor @b[7], @b[4]
171 ___
172 }
173
174 sub InvSbox {
175 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
176 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
177 my @b=@_[0..7];
178 my @t=@_[8..11];
179 my @s=@_[12..15];
180 &InvInBasisChange (@b);
181 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
182 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
183 }
184
185 sub InvInBasisChange { # OutBasisChange in reverse
186 my @b=@_[5,1,2,6,3,7,0,4];
187 $code.=<<___;
188 pxor @b[7], @b[4]
189
190 pxor @b[5], @b[7]
191 pxor @b[5], @b[2]
192 pxor @b[7], @b[3]
193 pxor @b[3], @b[5]
194 pxor @b[5], @b[1]
195
196 pxor @b[1], @b[6]
197 pxor @b[0], @b[2]
198 pxor @b[6], @b[4]
199 pxor @b[6], @b[0]
200 pxor @b[4], @b[1]
201 ___
202 }
203
204 sub InvOutBasisChange { # InBasisChange in reverse
205 my @b=@_[2,5,7,3,6,1,0,4];
206 $code.=<<___;
207 pxor @b[5], @b[1]
208 pxor @b[7], @b[2]
209
210 pxor @b[1], @b[3]
211 pxor @b[5], @b[4]
212 pxor @b[5], @b[7]
213 pxor @b[4], @b[3]
214 pxor @b[0], @b[5]
215 pxor @b[7], @b[3]
216 pxor @b[2], @b[6]
217 pxor @b[1], @b[2]
218 pxor @b[3], @b[6]
219
220 pxor @b[0], @b[3]
221 pxor @b[6], @b[5]
222 ___
223 }
224
225 sub Mul_GF4 {
226 #;*************************************************************
227 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
228 #;*************************************************************
229 my ($x0,$x1,$y0,$y1,$t0)=@_;
230 $code.=<<___;
231 movdqa $y0, $t0
232 pxor $y1, $t0
233 pand $x0, $t0
234 pxor $x1, $x0
235 pand $y0, $x1
236 pand $y1, $x0
237 pxor $x1, $x0
238 pxor $t0, $x1
239 ___
240 }
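
# A scalar model of the dataflow above, one bit per variable instead of
# 128 per register. The Käsper-Schwabe construction keeps the GF(2^2)
# subfield elements in a normal basis, so (1,1) encodes the
# multiplicative identity; the check below (not used at build time)
# verifies that property against the exact xor/and sequence emitted:
if (0) {
	sub mul_gf4_model {
	    my ($x0,$x1,$y0,$y1)=@_;
	    my $t0 = ($y0 ^ $y1) & $x0;
	    my $n0 = (($x0 ^ $x1) & $y1) ^ ($x1 & $y0);	# new $x0
	    my $n1 = ($x1 & $y0) ^ $t0;			# new $x1
	    return ($n0,$n1);
	}
	for my $a (0..3) {
	    my ($a0,$a1) = ($a & 1, ($a >> 1) & 1);
	    my ($r0,$r1) = mul_gf4_model($a0,$a1,1,1);	# a * 1
	    die "identity is broken" if ($r0 != $a0 || $r1 != $a1);
	}
}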
241
242 sub Mul_GF4_N { # not used, see next subroutine
243 # multiply and scale by N
244 my ($x0,$x1,$y0,$y1,$t0)=@_;
245 $code.=<<___;
246 movdqa $y0, $t0
247 pxor $y1, $t0
248 pand $x0, $t0
249 pxor $x1, $x0
250 pand $y0, $x1
251 pand $y1, $x0
252 pxor $x0, $x1
253 pxor $t0, $x0
254 ___
255 }
256
257 sub Mul_GF4_N_GF4 {
258 # interleaved Mul_GF4_N and Mul_GF4
259 my ($x0,$x1,$y0,$y1,$t0,
260 $x2,$x3,$y2,$y3,$t1)=@_;
261 $code.=<<___;
262 movdqa $y0, $t0
263 movdqa $y2, $t1
264 pxor $y1, $t0
265 pxor $y3, $t1
266 pand $x0, $t0
267 pand $x2, $t1
268 pxor $x1, $x0
269 pxor $x3, $x2
270 pand $y0, $x1
271 pand $y2, $x3
272 pand $y1, $x0
273 pand $y3, $x2
274 pxor $x0, $x1
275 pxor $x3, $x2
276 pxor $t0, $x0
277 pxor $t1, $x3
278 ___
279 }
280 sub Mul_GF16_2 {
281 my @x=@_[0..7];
282 my @y=@_[8..11];
283 my @t=@_[12..15];
284 $code.=<<___;
285 movdqa @x[0], @t[0]
286 movdqa @x[1], @t[1]
287 ___
288 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
289 $code.=<<___;
290 pxor @x[2], @t[0]
291 pxor @x[3], @t[1]
292 pxor @y[2], @y[0]
293 pxor @y[3], @y[1]
294 ___
295 	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
296 @x[2], @x[3], @y[2], @y[3], @t[2]);
297 $code.=<<___;
298 pxor @t[0], @x[0]
299 pxor @t[0], @x[2]
300 pxor @t[1], @x[1]
301 pxor @t[1], @x[3]
302
303 movdqa @x[4], @t[0]
304 movdqa @x[5], @t[1]
305 pxor @x[6], @t[0]
306 pxor @x[7], @t[1]
307 ___
308 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
309 @x[6], @x[7], @y[2], @y[3], @t[2]);
310 $code.=<<___;
311 pxor @y[2], @y[0]
312 pxor @y[3], @y[1]
313 ___
314 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
315 $code.=<<___;
316 pxor @t[0], @x[4]
317 pxor @t[0], @x[6]
318 pxor @t[1], @x[5]
319 pxor @t[1], @x[7]
320 ___
321 }
322 sub Inv_GF256 {
323 #;********************************************************************
324 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
325 #;********************************************************************
326 my @x=@_[0..7];
327 my @t=@_[8..11];
328 my @s=@_[12..15];
329 # direct optimizations from hardware
330 $code.=<<___;
331 movdqa @x[4], @t[3]
332 movdqa @x[5], @t[2]
333 movdqa @x[1], @t[1]
334 movdqa @x[7], @s[1]
335 movdqa @x[0], @s[0]
336
337 pxor @x[6], @t[3]
338 pxor @x[7], @t[2]
339 pxor @x[3], @t[1]
340 movdqa @t[3], @s[2]
341 pxor @x[6], @s[1]
342 movdqa @t[2], @t[0]
343 pxor @x[2], @s[0]
344 movdqa @t[3], @s[3]
345
346 por @t[1], @t[2]
347 por @s[0], @t[3]
348 pxor @t[0], @s[3]
349 pand @s[0], @s[2]
350 pxor @t[1], @s[0]
351 pand @t[1], @t[0]
352 pand @s[0], @s[3]
353 movdqa @x[3], @s[0]
354 pxor @x[2], @s[0]
355 pand @s[0], @s[1]
356 pxor @s[1], @t[3]
357 pxor @s[1], @t[2]
358 movdqa @x[4], @s[1]
359 movdqa @x[1], @s[0]
360 pxor @x[5], @s[1]
361 pxor @x[0], @s[0]
362 movdqa @s[1], @t[1]
363 pand @s[0], @s[1]
364 por @s[0], @t[1]
365 pxor @s[1], @t[0]
366 pxor @s[3], @t[3]
367 pxor @s[2], @t[2]
368 pxor @s[3], @t[1]
369 movdqa @x[7], @s[0]
370 pxor @s[2], @t[0]
371 movdqa @x[6], @s[1]
372 pxor @s[2], @t[1]
373 movdqa @x[5], @s[2]
374 pand @x[3], @s[0]
375 movdqa @x[4], @s[3]
376 pand @x[2], @s[1]
377 pand @x[1], @s[2]
378 por @x[0], @s[3]
379 pxor @s[0], @t[3]
380 pxor @s[1], @t[2]
381 pxor @s[2], @t[1]
382 pxor @s[3], @t[0]
383
384 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
385
386 # new smaller inversion
387
388 movdqa @t[3], @s[0]
389 pand @t[1], @t[3]
390 pxor @t[2], @s[0]
391
392 movdqa @t[0], @s[2]
393 movdqa @s[0], @s[3]
394 pxor @t[3], @s[2]
395 pand @s[2], @s[3]
396
397 movdqa @t[1], @s[1]
398 pxor @t[2], @s[3]
399 pxor @t[0], @s[1]
400
401 pxor @t[2], @t[3]
402
403 pand @t[3], @s[1]
404
405 movdqa @s[2], @t[2]
406 pxor @t[0], @s[1]
407
408 pxor @s[1], @t[2]
409 pxor @s[1], @t[1]
410
411 pand @t[0], @t[2]
412
413 pxor @t[2], @s[2]
414 pxor @t[2], @t[1]
415
416 pand @s[3], @s[2]
417
418 pxor @s[0], @s[2]
419 ___
420 # output in s3, s2, s1, t1
421
422 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
423
424 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
425 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
426
427 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
428 }
429
430 # AES linear components
431
432 sub ShiftRows {	# also xors in the bit-sliced round key and advances $key
433 my @x=@_[0..7];
434 my $mask=pop;
435 $code.=<<___;
436 pxor 0x00($key),@x[0]
437 pxor 0x10($key),@x[1]
438 pxor 0x20($key),@x[2]
439 pxor 0x30($key),@x[3]
440 pshufb $mask,@x[0]
441 pshufb $mask,@x[1]
442 pxor 0x40($key),@x[4]
443 pxor 0x50($key),@x[5]
444 pshufb $mask,@x[2]
445 pshufb $mask,@x[3]
446 pxor 0x60($key),@x[6]
447 pxor 0x70($key),@x[7]
448 pshufb $mask,@x[4]
449 pshufb $mask,@x[5]
450 pshufb $mask,@x[6]
451 pshufb $mask,@x[7]
452 lea 0x80($key),$key
453 ___
454 }
455
456 sub MixColumns {
457 # modified to emit output in an order suitable for feeding back to aesenc[last]
458 my @x=@_[0..7];
459 my @t=@_[8..15];
460 my $inv=@_[16]; # optional
461 $code.=<<___;
462 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
463 pshufd \$0x93, @x[1], @t[1]
464 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
465 pshufd \$0x93, @x[2], @t[2]
466 pxor @t[1], @x[1]
467 pshufd \$0x93, @x[3], @t[3]
468 pxor @t[2], @x[2]
469 pshufd \$0x93, @x[4], @t[4]
470 pxor @t[3], @x[3]
471 pshufd \$0x93, @x[5], @t[5]
472 pxor @t[4], @x[4]
473 pshufd \$0x93, @x[6], @t[6]
474 pxor @t[5], @x[5]
475 pshufd \$0x93, @x[7], @t[7]
476 pxor @t[6], @x[6]
477 pxor @t[7], @x[7]
478
479 pxor @x[0], @t[1]
480 pxor @x[7], @t[0]
481 pxor @x[7], @t[1]
482 	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64
483 pxor @x[1], @t[2]
484 pshufd \$0x4E, @x[1], @x[1]
485 pxor @x[4], @t[5]
486 pxor @t[0], @x[0]
487 pxor @x[5], @t[6]
488 pxor @t[1], @x[1]
489 pxor @x[3], @t[4]
490 pshufd \$0x4E, @x[4], @t[0]
491 pxor @x[6], @t[7]
492 pshufd \$0x4E, @x[5], @t[1]
493 pxor @x[2], @t[3]
494 pshufd \$0x4E, @x[3], @x[4]
495 pxor @x[7], @t[3]
496 pshufd \$0x4E, @x[7], @x[5]
497 pxor @x[7], @t[4]
498 pshufd \$0x4E, @x[6], @x[3]
499 pxor @t[4], @t[0]
500 pshufd \$0x4E, @x[2], @x[6]
501 pxor @t[5], @t[1]
502 ___
503 $code.=<<___ if (!$inv);
504 pxor @t[3], @x[4]
505 pxor @t[7], @x[5]
506 pxor @t[6], @x[3]
507 movdqa @t[0], @x[2]
508 pxor @t[2], @x[6]
509 movdqa @t[1], @x[7]
510 ___
511 $code.=<<___ if ($inv);
512 pxor @x[4], @t[3]
513 pxor @t[7], @x[5]
514 pxor @x[3], @t[6]
515 movdqa @t[0], @x[3]
516 pxor @t[2], @x[6]
517 movdqa @t[6], @x[2]
518 movdqa @t[1], @x[7]
519 movdqa @x[6], @x[4]
520 movdqa @t[3], @x[6]
521 ___
522 }
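
# For reference, the two pshufd immediates used throughout: 0x93 rotates
# the four dwords up by one lane, which on a little-endian 128-bit value
# is "<<< 32", while 0x4E swaps the two qwords, i.e. "<<< 64". A scalar
# model (not used at build time):
if (0) {
	sub pshufd_model {	# dst dword i = src dword ((imm >> 2*i) & 3)
	    my ($imm,@src)=@_;
	    return map { $src[($imm >> 2*$_) & 3] } (0..3);
	}
	# dword 0 is least significant, so this is a rotate left by 32...
	die unless join(",",pshufd_model(0x93,"a","b","c","d")) eq "d,a,b,c";
	# ... and this swaps the 64-bit halves:
	die unless join(",",pshufd_model(0x4E,"a","b","c","d")) eq "c,d,a,b";
}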
523
524 sub InvMixColumns_orig {
525 my @x=@_[0..7];
526 my @t=@_[8..15];
527
528 $code.=<<___;
529 # multiplication by 0x0e
530 pshufd \$0x93, @x[7], @t[7]
531 movdqa @x[2], @t[2]
532 pxor @x[5], @x[7] # 7 5
533 pxor @x[5], @x[2] # 2 5
534 pshufd \$0x93, @x[0], @t[0]
535 movdqa @x[5], @t[5]
536 pxor @x[0], @x[5] # 5 0 [1]
537 pxor @x[1], @x[0] # 0 1
538 pshufd \$0x93, @x[1], @t[1]
539 pxor @x[2], @x[1] # 1 25
540 pxor @x[6], @x[0] # 01 6 [2]
541 pxor @x[3], @x[1] # 125 3 [4]
542 pshufd \$0x93, @x[3], @t[3]
543 pxor @x[0], @x[2] # 25 016 [3]
544 pxor @x[7], @x[3] # 3 75
545 pxor @x[6], @x[7] # 75 6 [0]
546 pshufd \$0x93, @x[6], @t[6]
547 movdqa @x[4], @t[4]
548 pxor @x[4], @x[6] # 6 4
549 pxor @x[3], @x[4] # 4 375 [6]
550 pxor @x[7], @x[3] # 375 756=36
551 pxor @t[5], @x[6] # 64 5 [7]
552 pxor @t[2], @x[3] # 36 2
553 pxor @t[4], @x[3] # 362 4 [5]
554 pshufd \$0x93, @t[5], @t[5]
555 ___
556 my @y = @x[7,5,0,2,1,3,4,6];
557 $code.=<<___;
558 # multiplication by 0x0b
559 pxor @y[0], @y[1]
560 pxor @t[0], @y[0]
561 pxor @t[1], @y[1]
562 pshufd \$0x93, @t[2], @t[2]
563 pxor @t[5], @y[0]
564 pxor @t[6], @y[1]
565 pxor @t[7], @y[0]
566 pshufd \$0x93, @t[4], @t[4]
567 pxor @t[6], @t[7] # clobber t[7]
568 pxor @y[0], @y[1]
569
570 pxor @t[0], @y[3]
571 pshufd \$0x93, @t[0], @t[0]
572 pxor @t[1], @y[2]
573 pxor @t[1], @y[4]
574 pxor @t[2], @y[2]
575 pshufd \$0x93, @t[1], @t[1]
576 pxor @t[2], @y[3]
577 pxor @t[2], @y[5]
578 pxor @t[7], @y[2]
579 pshufd \$0x93, @t[2], @t[2]
580 pxor @t[3], @y[3]
581 pxor @t[3], @y[6]
582 pxor @t[3], @y[4]
583 pshufd \$0x93, @t[3], @t[3]
584 pxor @t[4], @y[7]
585 pxor @t[4], @y[5]
586 pxor @t[7], @y[7]
587 pxor @t[5], @y[3]
588 pxor @t[4], @y[4]
589 pxor @t[5], @t[7] # clobber t[7] even more
590
591 pxor @t[7], @y[5]
592 pshufd \$0x93, @t[4], @t[4]
593 pxor @t[7], @y[6]
594 pxor @t[7], @y[4]
595
596 pxor @t[5], @t[7]
597 pshufd \$0x93, @t[5], @t[5]
598 pxor @t[6], @t[7] # restore t[7]
599
600 # multiplication by 0x0d
601 pxor @y[7], @y[4]
602 pxor @t[4], @y[7]
603 pshufd \$0x93, @t[6], @t[6]
604 pxor @t[0], @y[2]
605 pxor @t[5], @y[7]
606 pxor @t[2], @y[2]
607 pshufd \$0x93, @t[7], @t[7]
608
609 pxor @y[1], @y[3]
610 pxor @t[1], @y[1]
611 pxor @t[0], @y[0]
612 pxor @t[0], @y[3]
613 pxor @t[5], @y[1]
614 pxor @t[5], @y[0]
615 pxor @t[7], @y[1]
616 pshufd \$0x93, @t[0], @t[0]
617 pxor @t[6], @y[0]
618 pxor @y[1], @y[3]
619 pxor @t[1], @y[4]
620 pshufd \$0x93, @t[1], @t[1]
621
622 pxor @t[7], @y[7]
623 pxor @t[2], @y[4]
624 pxor @t[2], @y[5]
625 pshufd \$0x93, @t[2], @t[2]
626 pxor @t[6], @y[2]
627 pxor @t[3], @t[6] # clobber t[6]
628 pxor @y[7], @y[4]
629 pxor @t[6], @y[3]
630
631 pxor @t[6], @y[6]
632 pxor @t[5], @y[5]
633 pxor @t[4], @y[6]
634 pshufd \$0x93, @t[4], @t[4]
635 pxor @t[6], @y[5]
636 pxor @t[7], @y[6]
637 pxor @t[3], @t[6] # restore t[6]
638
639 pshufd \$0x93, @t[5], @t[5]
640 pshufd \$0x93, @t[6], @t[6]
641 pshufd \$0x93, @t[7], @t[7]
642 pshufd \$0x93, @t[3], @t[3]
643
644 # multiplication by 0x09
645 pxor @y[1], @y[4]
646 pxor @y[1], @t[1] # t[1]=y[1]
647 pxor @t[5], @t[0] # clobber t[0]
648 pxor @t[5], @t[1]
649 pxor @t[0], @y[3]
650 pxor @y[0], @t[0] # t[0]=y[0]
651 pxor @t[6], @t[1]
652 pxor @t[7], @t[6] # clobber t[6]
653 pxor @t[1], @y[4]
654 pxor @t[4], @y[7]
655 pxor @y[4], @t[4] # t[4]=y[4]
656 pxor @t[3], @y[6]
657 pxor @y[3], @t[3] # t[3]=y[3]
658 pxor @t[2], @y[5]
659 pxor @y[2], @t[2] # t[2]=y[2]
660 pxor @t[7], @t[3]
661 pxor @y[5], @t[5] # t[5]=y[5]
662 pxor @t[6], @t[2]
663 pxor @t[6], @t[5]
664 pxor @y[6], @t[6] # t[6]=y[6]
665 pxor @y[7], @t[7] # t[7]=y[7]
666
667 movdqa @t[0],@XMM[0]
668 movdqa @t[1],@XMM[1]
669 movdqa @t[2],@XMM[2]
670 movdqa @t[3],@XMM[3]
671 movdqa @t[4],@XMM[4]
672 movdqa @t[5],@XMM[5]
673 movdqa @t[6],@XMM[6]
674 movdqa @t[7],@XMM[7]
675 ___
676 }
677
678 sub InvMixColumns {
679 my @x=@_[0..7];
680 my @t=@_[8..15];
681
682 # Thanks to Jussi Kivilinna for providing a pointer to
683 #
684 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
685 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
686 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
687 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
688
689 $code.=<<___;
690 # multiplication by 0x05-0x00-0x04-0x00
691 pshufd \$0x4E, @x[0], @t[0]
692 pshufd \$0x4E, @x[6], @t[6]
693 pxor @x[0], @t[0]
694 pshufd \$0x4E, @x[7], @t[7]
695 pxor @x[6], @t[6]
696 pshufd \$0x4E, @x[1], @t[1]
697 pxor @x[7], @t[7]
698 pshufd \$0x4E, @x[2], @t[2]
699 pxor @x[1], @t[1]
700 pshufd \$0x4E, @x[3], @t[3]
701 pxor @x[2], @t[2]
702 pxor @t[6], @x[0]
703 pxor @t[6], @x[1]
704 pshufd \$0x4E, @x[4], @t[4]
705 pxor @x[3], @t[3]
706 pxor @t[0], @x[2]
707 pxor @t[1], @x[3]
708 pshufd \$0x4E, @x[5], @t[5]
709 pxor @x[4], @t[4]
710 pxor @t[7], @x[1]
711 pxor @t[2], @x[4]
712 pxor @x[5], @t[5]
713
714 pxor @t[7], @x[2]
715 pxor @t[6], @x[3]
716 pxor @t[6], @x[4]
717 pxor @t[3], @x[5]
718 pxor @t[4], @x[6]
719 pxor @t[7], @x[4]
720 pxor @t[7], @x[5]
721 pxor @t[5], @x[7]
722 ___
723 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
724 }
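
# A quick way to convince oneself of the factorization above: multiply
# the two circulant matrices over GF(2^8) and compare the result with
# the textbook InvMixColumns row [0e 0b 0d 09]. One-off check, not used
# at build time:
if (0) {
	sub gmul {		# GF(2^8) multiplication modulo 0x11b
	    my ($a,$b)=@_; my $p=0;
	    for (1..8) {
		$p ^= $a if ($b & 1);
		$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
		$b >>= 1;
	    }
	    return $p;
	}
	my @mc  = (0x02,0x03,0x01,0x01);	# MixColumns generator row
	my @f   = (0x05,0x00,0x04,0x00);	# right-hand factor row
	my @imc = (0x0e,0x0b,0x0d,0x09);	# InvMixColumns row
	for my $k (0..3) {			# circulant product, row 0
	    my $c = 0;
	    $c ^= gmul($mc[$_], $f[($k-$_) % 4]) for (0..3);
	    die "factorization is broken" if ($c != $imc[$k]);
	}
}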
725
726 sub aesenc { # not used
727 my @b=@_[0..7];
728 my @t=@_[8..15];
729 $code.=<<___;
730 movdqa 0x30($const),@t[0] # .LSR
731 ___
732 &ShiftRows (@b,@t[0]);
733 &Sbox (@b,@t);
734 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
735 }
736
737 sub aesenclast { # not used
738 my @b=@_[0..7];
739 my @t=@_[8..15];
740 $code.=<<___;
741 movdqa 0x40($const),@t[0] # .LSRM0
742 ___
743 &ShiftRows (@b,@t[0]);
744 &Sbox (@b,@t);
745 $code.=<<___;
746 pxor 0x00($key),@b[0]
747 pxor 0x10($key),@b[1]
748 pxor 0x20($key),@b[4]
749 pxor 0x30($key),@b[6]
750 pxor 0x40($key),@b[3]
751 pxor 0x50($key),@b[7]
752 pxor 0x60($key),@b[2]
753 pxor 0x70($key),@b[5]
754 ___
755 }
756
757 sub swapmove {
758 my ($a,$b,$n,$mask,$t)=@_;
759 $code.=<<___;
760 movdqa $b,$t
761 psrlq \$$n,$b
762 pxor $a,$b
763 pand $mask,$b
764 pxor $b,$a
765 psllq \$$n,$b
766 pxor $t,$b
767 ___
768 }
769 sub swapmove2x {
770 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
771 $code.=<<___;
772 movdqa $b0,$t0
773 psrlq \$$n,$b0
774 movdqa $b1,$t1
775 psrlq \$$n,$b1
776 pxor $a0,$b0
777 pxor $a1,$b1
778 pand $mask,$b0
779 pand $mask,$b1
780 pxor $b0,$a0
781 psllq \$$n,$b0
782 pxor $b1,$a1
783 psllq \$$n,$b1
784 pxor $t0,$b0
785 pxor $t1,$b1
786 ___
787 }
788
789 sub bitslice {
790 my @x=reverse(@_[0..7]);
791 my ($t0,$t1,$t2,$t3)=@_[8..11];
792 $code.=<<___;
793 movdqa 0x00($const),$t0 # .LBS0
794 movdqa 0x10($const),$t1 # .LBS1
795 ___
796 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
797 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
798 $code.=<<___;
799 movdqa 0x20($const),$t0 # .LBS2
800 ___
801 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
802 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
803
804 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
805 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
806 }
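
# The three passes above, masks 0x55.../0x33.../0x0f... with shift
# counts 1/2/4, transpose every aligned group of 8 bit positions across
# the 8 registers - an 8x8 bit-matrix transposition, which is precisely
# the conversion from 8 packed AES states to 8 bit slices. The same
# network on 8-bit rows, as a sanity check (not used at build time):
if (0) {
	sub swapmove_model {	# same dataflow as swapmove() above
	    my ($a,$b,$n,$mask)=@_;
	    my $t = (($b >> $n) ^ $a) & $mask;
	    return ($a ^ $t, $b ^ ($t << $n));
	}
	my @m = map { int(rand(256)) } (0..7);	# rows of an 8x8 bit matrix
	my @r = @m;
	($r[$_+1],$r[$_]) = swapmove_model($r[$_+1],$r[$_],1,0x55) for (0,2,4,6);
	($r[$_+2],$r[$_]) = swapmove_model($r[$_+2],$r[$_],2,0x33) for (0,1,4,5);
	($r[$_+4],$r[$_]) = swapmove_model($r[$_+4],$r[$_],4,0x0f) for (0..3);
	for my $i (0..7) { for my $j (0..7) {
	    die "transpose is broken"
		if ((($r[$i] >> $j) & 1) != (($m[$j] >> $i) & 1));
	}}
}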
807
808 $code.=<<___;
809 .text
810
811 .extern asm_AES_encrypt
812 .extern asm_AES_decrypt
813
814 .type _bsaes_encrypt8,\@abi-omnipotent
815 .align 64
816 _bsaes_encrypt8:
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pxor @XMM[9], @XMM[2]
825 pxor @XMM[9], @XMM[3]
826 pshufb @XMM[8], @XMM[0]
827 pshufb @XMM[8], @XMM[1]
828 pxor @XMM[9], @XMM[4]
829 pxor @XMM[9], @XMM[5]
830 pshufb @XMM[8], @XMM[2]
831 pshufb @XMM[8], @XMM[3]
832 pxor @XMM[9], @XMM[6]
833 pxor @XMM[9], @XMM[7]
834 pshufb @XMM[8], @XMM[4]
835 pshufb @XMM[8], @XMM[5]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838 _bsaes_encrypt8_bitslice:
839 ___
840 &bitslice (@XMM[0..7, 8..11]);
841 $code.=<<___;
842 dec $rounds
843 jmp .Lenc_sbox
844 .align 16
845 .Lenc_loop:
846 ___
847 &ShiftRows (@XMM[0..7, 8]);
848 $code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
850 $code.=<<___;
851 dec $rounds
852 jl .Lenc_done
853 ___
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855 $code.=<<___;
856 movdqa 0x30($const), @XMM[8] # .LSR
857 jnz .Lenc_loop
858 movdqa 0x40($const), @XMM[8] # .LSRM0
859 jmp .Lenc_loop
860 .align 16
861 .Lenc_done:
862 ___
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865 $code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876 .size _bsaes_encrypt8,.-_bsaes_encrypt8
877
878 .type _bsaes_decrypt8,\@abi-omnipotent
879 .align 64
880 _bsaes_decrypt8:
881 lea .LBS0(%rip), $const # constants table
882
883 movdqa ($key), @XMM[9] # round 0 key
884 lea 0x10($key), $key
885 movdqa -0x30($const), @XMM[8] # .LM0ISR
886 pxor @XMM[9], @XMM[0] # xor with round0 key
887 pxor @XMM[9], @XMM[1]
888 pxor @XMM[9], @XMM[2]
889 pxor @XMM[9], @XMM[3]
890 pshufb @XMM[8], @XMM[0]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[4]
893 pxor @XMM[9], @XMM[5]
894 pshufb @XMM[8], @XMM[2]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[6]
897 pxor @XMM[9], @XMM[7]
898 pshufb @XMM[8], @XMM[4]
899 pshufb @XMM[8], @XMM[5]
900 pshufb @XMM[8], @XMM[6]
901 pshufb @XMM[8], @XMM[7]
902 ___
903 &bitslice (@XMM[0..7, 8..11]);
904 $code.=<<___;
905 dec $rounds
906 jmp .Ldec_sbox
907 .align 16
908 .Ldec_loop:
909 ___
910 &ShiftRows (@XMM[0..7, 8]);
911 $code.=".Ldec_sbox:\n";
912 &InvSbox (@XMM[0..7, 8..15]);
913 $code.=<<___;
914 dec $rounds
915 jl .Ldec_done
916 ___
917 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
918 $code.=<<___;
919 movdqa -0x10($const), @XMM[8] # .LISR
920 jnz .Ldec_loop
921 movdqa -0x20($const), @XMM[8] # .LISRM0
922 jmp .Ldec_loop
923 .align 16
924 .Ldec_done:
925 ___
926 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
927 $code.=<<___;
928 movdqa ($key), @XMM[8] # last round key
929 pxor @XMM[8], @XMM[6]
930 pxor @XMM[8], @XMM[4]
931 pxor @XMM[8], @XMM[2]
932 pxor @XMM[8], @XMM[7]
933 pxor @XMM[8], @XMM[3]
934 pxor @XMM[8], @XMM[5]
935 pxor @XMM[8], @XMM[0]
936 pxor @XMM[8], @XMM[1]
937 ret
938 .size _bsaes_decrypt8,.-_bsaes_decrypt8
939 ___
940 }
941 {
942 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
943
944 sub bitslice_key {
945 my @x=reverse(@_[0..7]);
946 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
947
948 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
949 $code.=<<___;
950 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
951 movdqa @x[0], @x[2]
952 movdqa @x[1], @x[3]
953 ___
954 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
955
956 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
957 $code.=<<___;
958 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
959 movdqa @x[0], @x[4]
960 movdqa @x[2], @x[6]
961 movdqa @x[1], @x[5]
962 movdqa @x[3], @x[7]
963 ___
964 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
965 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
966 }
967
968 $code.=<<___;
969 .type _bsaes_key_convert,\@abi-omnipotent
970 .align 16
971 _bsaes_key_convert:
972 lea .Lmasks(%rip), $const
973 movdqu ($inp), %xmm7 # load round 0 key
974 lea 0x10($inp), $inp
975 movdqa 0x00($const), %xmm0 # 0x01...
976 movdqa 0x10($const), %xmm1 # 0x02...
977 movdqa 0x20($const), %xmm2 # 0x04...
978 movdqa 0x30($const), %xmm3 # 0x08...
979 movdqa 0x40($const), %xmm4 # .LM0
980 pcmpeqd %xmm5, %xmm5 # .LNOT
981
982 movdqu ($inp), %xmm6 # load round 1 key
983 movdqa %xmm7, ($out) # save round 0 key
984 lea 0x10($out), $out
985 dec $rounds
986 jmp .Lkey_loop
987 .align 16
988 .Lkey_loop:
989 pshufb %xmm4, %xmm6 # .LM0
990
991 movdqa %xmm0, %xmm8
992 movdqa %xmm1, %xmm9
993
994 pand %xmm6, %xmm8
995 pand %xmm6, %xmm9
996 movdqa %xmm2, %xmm10
997 pcmpeqb %xmm0, %xmm8
998 psllq \$4, %xmm0 # 0x10...
999 movdqa %xmm3, %xmm11
1000 pcmpeqb %xmm1, %xmm9
1001 psllq \$4, %xmm1 # 0x20...
1002
1003 pand %xmm6, %xmm10
1004 pand %xmm6, %xmm11
1005 movdqa %xmm0, %xmm12
1006 pcmpeqb %xmm2, %xmm10
1007 psllq \$4, %xmm2 # 0x40...
1008 movdqa %xmm1, %xmm13
1009 pcmpeqb %xmm3, %xmm11
1010 psllq \$4, %xmm3 # 0x80...
1011
1012 movdqa %xmm2, %xmm14
1013 movdqa %xmm3, %xmm15
1014 pxor %xmm5, %xmm8 # "pnot"
1015 pxor %xmm5, %xmm9
1016
1017 pand %xmm6, %xmm12
1018 pand %xmm6, %xmm13
1019 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1020 pcmpeqb %xmm0, %xmm12
1021 psrlq \$4, %xmm0 # 0x01...
1022 movdqa %xmm9, 0x10($out)
1023 pcmpeqb %xmm1, %xmm13
1024 psrlq \$4, %xmm1 # 0x02...
1025 lea 0x10($inp), $inp
1026
1027 pand %xmm6, %xmm14
1028 pand %xmm6, %xmm15
1029 movdqa %xmm10, 0x20($out)
1030 pcmpeqb %xmm2, %xmm14
1031 psrlq \$4, %xmm2 # 0x04...
1032 movdqa %xmm11, 0x30($out)
1033 pcmpeqb %xmm3, %xmm15
1034 psrlq \$4, %xmm3 # 0x08...
1035 movdqu ($inp), %xmm6 # load next round key
1036
1037 pxor %xmm5, %xmm13 # "pnot"
1038 pxor %xmm5, %xmm14
1039 movdqa %xmm12, 0x40($out)
1040 movdqa %xmm13, 0x50($out)
1041 movdqa %xmm14, 0x60($out)
1042 movdqa %xmm15, 0x70($out)
1043 lea 0x80($out),$out
1044 dec $rounds
1045 jnz .Lkey_loop
1046
1047 movdqa 0x50($const), %xmm7 # .L63
1048 #movdqa %xmm6, ($out) # don't save last round key
1049 ret
1050 .size _bsaes_key_convert,.-_bsaes_key_convert
1051 ___
1052 }
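
# In scalar terms, the loop above broadcasts every key bit into a whole
# byte lane, 0x00 or 0xff, via pand+pcmpeqb, and complements the slices
# for bits 0, 1, 5 and 6 - exactly the bits set in the AES S-box affine
# constant 0x63, which this conversion folds into the key schedule (cf.
# the .L63 fix-up of the last round key). A sketch of the per-byte
# transform (not used at build time):
if (0) {
	my %pnot = map { $_ => 1 } (0,1,5,6);	# bits of 0x63
	my $kbyte = 0xc3;			# any key byte
	for my $bit (0..7) {
	    my $lane = (($kbyte >> $bit) & 1) ? 0xff : 0x00;	# pand+pcmpeqb
	    $lane ^= 0xff if ($pnot{$bit});			# "pnot"
	    printf("key bit %d -> slice byte 0x%02x\n",$bit,$lane);
	}
}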
1053
1054 if (0 && !$win64) {	# the following four functions are an unsupported
1055			# interface, used only for benchmarking...
1056 $code.=<<___;
1057 .globl bsaes_enc_key_convert
1058 .type bsaes_enc_key_convert,\@function,2
1059 .align 16
1060 bsaes_enc_key_convert:
1061 mov 240($inp),%r10d # pass rounds
1062 mov $inp,%rcx # pass key
1063 mov $out,%rax # pass key schedule
1064 call _bsaes_key_convert
1065 pxor %xmm6,%xmm7 # fix up last round key
1066 movdqa %xmm7,(%rax) # save last round key
1067 ret
1068 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1069
1070 .globl bsaes_encrypt_128
1071 .type bsaes_encrypt_128,\@function,4
1072 .align 16
1073 bsaes_encrypt_128:
1074 .Lenc128_loop:
1075 movdqu 0x00($inp), @XMM[0] # load input
1076 movdqu 0x10($inp), @XMM[1]
1077 movdqu 0x20($inp), @XMM[2]
1078 movdqu 0x30($inp), @XMM[3]
1079 movdqu 0x40($inp), @XMM[4]
1080 movdqu 0x50($inp), @XMM[5]
1081 movdqu 0x60($inp), @XMM[6]
1082 movdqu 0x70($inp), @XMM[7]
1083 mov $key, %rax # pass the $key
1084 lea 0x80($inp), $inp
1085 mov \$10,%r10d
1086
1087 call _bsaes_encrypt8
1088
1089 movdqu @XMM[0], 0x00($out) # write output
1090 movdqu @XMM[1], 0x10($out)
1091 movdqu @XMM[4], 0x20($out)
1092 movdqu @XMM[6], 0x30($out)
1093 movdqu @XMM[3], 0x40($out)
1094 movdqu @XMM[7], 0x50($out)
1095 movdqu @XMM[2], 0x60($out)
1096 movdqu @XMM[5], 0x70($out)
1097 lea 0x80($out), $out
1098 sub \$0x80,$len
1099 ja .Lenc128_loop
1100 ret
1101 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1102
1103 .globl bsaes_dec_key_convert
1104 .type bsaes_dec_key_convert,\@function,2
1105 .align 16
1106 bsaes_dec_key_convert:
1107 mov 240($inp),%r10d # pass rounds
1108 mov $inp,%rcx # pass key
1109 mov $out,%rax # pass key schedule
1110 call _bsaes_key_convert
1111 pxor ($out),%xmm7 # fix up round 0 key
1112 movdqa %xmm6,(%rax) # save last round key
1113 movdqa %xmm7,($out)
1114 ret
1115 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1116
1117 .globl bsaes_decrypt_128
1118 .type bsaes_decrypt_128,\@function,4
1119 .align 16
1120 bsaes_decrypt_128:
1121 .Ldec128_loop:
1122 movdqu 0x00($inp), @XMM[0] # load input
1123 movdqu 0x10($inp), @XMM[1]
1124 movdqu 0x20($inp), @XMM[2]
1125 movdqu 0x30($inp), @XMM[3]
1126 movdqu 0x40($inp), @XMM[4]
1127 movdqu 0x50($inp), @XMM[5]
1128 movdqu 0x60($inp), @XMM[6]
1129 movdqu 0x70($inp), @XMM[7]
1130 mov $key, %rax # pass the $key
1131 lea 0x80($inp), $inp
1132 mov \$10,%r10d
1133
1134 call _bsaes_decrypt8
1135
1136 movdqu @XMM[0], 0x00($out) # write output
1137 movdqu @XMM[1], 0x10($out)
1138 movdqu @XMM[6], 0x20($out)
1139 movdqu @XMM[4], 0x30($out)
1140 movdqu @XMM[2], 0x40($out)
1141 movdqu @XMM[7], 0x50($out)
1142 movdqu @XMM[3], 0x60($out)
1143 movdqu @XMM[5], 0x70($out)
1144 lea 0x80($out), $out
1145 sub \$0x80,$len
1146 ja .Ldec128_loop
1147 ret
1148 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1149 ___
1150 }
1151 {
1152 ######################################################################
1153 #
1154 # OpenSSL interface
1155 #
1156 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1157 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1158 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1159
1160 if ($ecb) {
1161 $code.=<<___;
1162 .globl bsaes_ecb_encrypt_blocks
1163 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1164 .align 16
1165 bsaes_ecb_encrypt_blocks:
1166 mov %rsp, %rax
1167 .Lecb_enc_prologue:
1168 push %rbp
1169 push %rbx
1170 push %r12
1171 push %r13
1172 push %r14
1173 push %r15
1174 lea -0x48(%rsp),%rsp
1175 ___
1176 $code.=<<___ if ($win64);
1177 lea -0xa0(%rsp), %rsp
1178 movaps %xmm6, 0x40(%rsp)
1179 movaps %xmm7, 0x50(%rsp)
1180 movaps %xmm8, 0x60(%rsp)
1181 movaps %xmm9, 0x70(%rsp)
1182 movaps %xmm10, 0x80(%rsp)
1183 movaps %xmm11, 0x90(%rsp)
1184 movaps %xmm12, 0xa0(%rsp)
1185 movaps %xmm13, 0xb0(%rsp)
1186 movaps %xmm14, 0xc0(%rsp)
1187 movaps %xmm15, 0xd0(%rsp)
1188 .Lecb_enc_body:
1189 ___
1190 $code.=<<___;
1191 mov %rsp,%rbp # backup %rsp
1192 mov 240($arg4),%eax # rounds
1193 mov $arg1,$inp # backup arguments
1194 mov $arg2,$out
1195 mov $arg3,$len
1196 mov $arg4,$key
1197 cmp \$8,$arg3
1198 jb .Lecb_enc_short
1199
1200 mov %eax,%ebx # backup rounds
1201 shl \$7,%rax # 128 bytes per inner round key
1202 sub \$`128-32`,%rax # size of bit-sliced key schedule
1203 sub %rax,%rsp
1204 mov %rsp,%rax # pass key schedule
1205 mov $key,%rcx # pass key
1206 mov %ebx,%r10d # pass rounds
1207 call _bsaes_key_convert
1208 pxor %xmm6,%xmm7 # fix up last round key
1209 movdqa %xmm7,(%rax) # save last round key
1210
1211 sub \$8,$len
1212 .Lecb_enc_loop:
1213 movdqu 0x00($inp), @XMM[0] # load input
1214 movdqu 0x10($inp), @XMM[1]
1215 movdqu 0x20($inp), @XMM[2]
1216 movdqu 0x30($inp), @XMM[3]
1217 movdqu 0x40($inp), @XMM[4]
1218 movdqu 0x50($inp), @XMM[5]
1219 mov %rsp, %rax # pass key schedule
1220 movdqu 0x60($inp), @XMM[6]
1221 mov %ebx,%r10d # pass rounds
1222 movdqu 0x70($inp), @XMM[7]
1223 lea 0x80($inp), $inp
1224
1225 call _bsaes_encrypt8
1226
1227 movdqu @XMM[0], 0x00($out) # write output
1228 movdqu @XMM[1], 0x10($out)
1229 movdqu @XMM[4], 0x20($out)
1230 movdqu @XMM[6], 0x30($out)
1231 movdqu @XMM[3], 0x40($out)
1232 movdqu @XMM[7], 0x50($out)
1233 movdqu @XMM[2], 0x60($out)
1234 movdqu @XMM[5], 0x70($out)
1235 lea 0x80($out), $out
1236 sub \$8,$len
1237 jnc .Lecb_enc_loop
1238
1239 add \$8,$len
1240 jz .Lecb_enc_done
1241
1242 movdqu 0x00($inp), @XMM[0] # load input
1243 mov %rsp, %rax # pass key schedule
1244 mov %ebx,%r10d # pass rounds
1245 cmp \$2,$len
1246 jb .Lecb_enc_one
1247 movdqu 0x10($inp), @XMM[1]
1248 je .Lecb_enc_two
1249 movdqu 0x20($inp), @XMM[2]
1250 cmp \$4,$len
1251 jb .Lecb_enc_three
1252 movdqu 0x30($inp), @XMM[3]
1253 je .Lecb_enc_four
1254 movdqu 0x40($inp), @XMM[4]
1255 cmp \$6,$len
1256 jb .Lecb_enc_five
1257 movdqu 0x50($inp), @XMM[5]
1258 je .Lecb_enc_six
1259 movdqu 0x60($inp), @XMM[6]
1260 call _bsaes_encrypt8
1261 movdqu @XMM[0], 0x00($out) # write output
1262 movdqu @XMM[1], 0x10($out)
1263 movdqu @XMM[4], 0x20($out)
1264 movdqu @XMM[6], 0x30($out)
1265 movdqu @XMM[3], 0x40($out)
1266 movdqu @XMM[7], 0x50($out)
1267 movdqu @XMM[2], 0x60($out)
1268 jmp .Lecb_enc_done
1269 .align 16
1270 .Lecb_enc_six:
1271 call _bsaes_encrypt8
1272 movdqu @XMM[0], 0x00($out) # write output
1273 movdqu @XMM[1], 0x10($out)
1274 movdqu @XMM[4], 0x20($out)
1275 movdqu @XMM[6], 0x30($out)
1276 movdqu @XMM[3], 0x40($out)
1277 movdqu @XMM[7], 0x50($out)
1278 jmp .Lecb_enc_done
1279 .align 16
1280 .Lecb_enc_five:
1281 call _bsaes_encrypt8
1282 movdqu @XMM[0], 0x00($out) # write output
1283 movdqu @XMM[1], 0x10($out)
1284 movdqu @XMM[4], 0x20($out)
1285 movdqu @XMM[6], 0x30($out)
1286 movdqu @XMM[3], 0x40($out)
1287 jmp .Lecb_enc_done
1288 .align 16
1289 .Lecb_enc_four:
1290 call _bsaes_encrypt8
1291 movdqu @XMM[0], 0x00($out) # write output
1292 movdqu @XMM[1], 0x10($out)
1293 movdqu @XMM[4], 0x20($out)
1294 movdqu @XMM[6], 0x30($out)
1295 jmp .Lecb_enc_done
1296 .align 16
1297 .Lecb_enc_three:
1298 call _bsaes_encrypt8
1299 movdqu @XMM[0], 0x00($out) # write output
1300 movdqu @XMM[1], 0x10($out)
1301 movdqu @XMM[4], 0x20($out)
1302 jmp .Lecb_enc_done
1303 .align 16
1304 .Lecb_enc_two:
1305 call _bsaes_encrypt8
1306 movdqu @XMM[0], 0x00($out) # write output
1307 movdqu @XMM[1], 0x10($out)
1308 jmp .Lecb_enc_done
1309 .align 16
1310 .Lecb_enc_one:
1311 call _bsaes_encrypt8
1312 movdqu @XMM[0], 0x00($out) # write output
1313 jmp .Lecb_enc_done
1314 .align 16
1315 .Lecb_enc_short:
1316 lea ($inp), $arg1
1317 lea ($out), $arg2
1318 lea ($key), $arg3
1319 call asm_AES_encrypt
1320 lea 16($inp), $inp
1321 lea 16($out), $out
1322 dec $len
1323 jnz .Lecb_enc_short
1324
1325 .Lecb_enc_done:
1326 lea (%rsp),%rax
1327 pxor %xmm0, %xmm0
1328 .Lecb_enc_bzero: # wipe key schedule [if any]
1329 movdqa %xmm0, 0x00(%rax)
1330 movdqa %xmm0, 0x10(%rax)
1331 lea 0x20(%rax), %rax
1332 cmp %rax, %rbp
1333 jb .Lecb_enc_bzero
1334
1335 lea (%rbp),%rsp # restore %rsp
1336 ___
1337 $code.=<<___ if ($win64);
1338 movaps 0x40(%rbp), %xmm6
1339 movaps 0x50(%rbp), %xmm7
1340 movaps 0x60(%rbp), %xmm8
1341 movaps 0x70(%rbp), %xmm9
1342 movaps 0x80(%rbp), %xmm10
1343 movaps 0x90(%rbp), %xmm11
1344 movaps 0xa0(%rbp), %xmm12
1345 movaps 0xb0(%rbp), %xmm13
1346 movaps 0xc0(%rbp), %xmm14
1347 movaps 0xd0(%rbp), %xmm15
1348 lea 0xa0(%rbp), %rsp
1349 ___
1350 $code.=<<___;
1351 mov 0x48(%rsp), %r15
1352 mov 0x50(%rsp), %r14
1353 mov 0x58(%rsp), %r13
1354 mov 0x60(%rsp), %r12
1355 mov 0x68(%rsp), %rbx
1356 mov 0x70(%rsp), %rax
1357 lea 0x78(%rsp), %rsp
1358 mov %rax, %rbp
1359 .Lecb_enc_epilogue:
1360 ret
1361 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1362
1363 .globl bsaes_ecb_decrypt_blocks
1364 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1365 .align 16
1366 bsaes_ecb_decrypt_blocks:
1367 mov %rsp, %rax
1368 .Lecb_dec_prologue:
1369 push %rbp
1370 push %rbx
1371 push %r12
1372 push %r13
1373 push %r14
1374 push %r15
1375 lea -0x48(%rsp),%rsp
1376 ___
1377 $code.=<<___ if ($win64);
1378 lea -0xa0(%rsp), %rsp
1379 movaps %xmm6, 0x40(%rsp)
1380 movaps %xmm7, 0x50(%rsp)
1381 movaps %xmm8, 0x60(%rsp)
1382 movaps %xmm9, 0x70(%rsp)
1383 movaps %xmm10, 0x80(%rsp)
1384 movaps %xmm11, 0x90(%rsp)
1385 movaps %xmm12, 0xa0(%rsp)
1386 movaps %xmm13, 0xb0(%rsp)
1387 movaps %xmm14, 0xc0(%rsp)
1388 movaps %xmm15, 0xd0(%rsp)
1389 .Lecb_dec_body:
1390 ___
1391 $code.=<<___;
1392 mov %rsp,%rbp # backup %rsp
1393 mov 240($arg4),%eax # rounds
1394 mov $arg1,$inp # backup arguments
1395 mov $arg2,$out
1396 mov $arg3,$len
1397 mov $arg4,$key
1398 cmp \$8,$arg3
1399 jb .Lecb_dec_short
1400
1401 mov %eax,%ebx # backup rounds
1402 shl \$7,%rax # 128 bytes per inner round key
1403 sub \$`128-32`,%rax # size of bit-sliced key schedule
1404 sub %rax,%rsp
1405 mov %rsp,%rax # pass key schedule
1406 mov $key,%rcx # pass key
1407 mov %ebx,%r10d # pass rounds
1408 call _bsaes_key_convert
1409 	pxor	(%rsp),%xmm7		# fix up round 0 key
1410 movdqa %xmm6,(%rax) # save last round key
1411 movdqa %xmm7,(%rsp)
1412
1413 sub \$8,$len
1414 .Lecb_dec_loop:
1415 movdqu 0x00($inp), @XMM[0] # load input
1416 movdqu 0x10($inp), @XMM[1]
1417 movdqu 0x20($inp), @XMM[2]
1418 movdqu 0x30($inp), @XMM[3]
1419 movdqu 0x40($inp), @XMM[4]
1420 movdqu 0x50($inp), @XMM[5]
1421 mov %rsp, %rax # pass key schedule
1422 movdqu 0x60($inp), @XMM[6]
1423 mov %ebx,%r10d # pass rounds
1424 movdqu 0x70($inp), @XMM[7]
1425 lea 0x80($inp), $inp
1426
1427 call _bsaes_decrypt8
1428
1429 movdqu @XMM[0], 0x00($out) # write output
1430 movdqu @XMM[1], 0x10($out)
1431 movdqu @XMM[6], 0x20($out)
1432 movdqu @XMM[4], 0x30($out)
1433 movdqu @XMM[2], 0x40($out)
1434 movdqu @XMM[7], 0x50($out)
1435 movdqu @XMM[3], 0x60($out)
1436 movdqu @XMM[5], 0x70($out)
1437 lea 0x80($out), $out
1438 sub \$8,$len
1439 jnc .Lecb_dec_loop
1440
1441 add \$8,$len
1442 jz .Lecb_dec_done
1443
1444 movdqu 0x00($inp), @XMM[0] # load input
1445 mov %rsp, %rax # pass key schedule
1446 mov %ebx,%r10d # pass rounds
1447 cmp \$2,$len
1448 jb .Lecb_dec_one
1449 movdqu 0x10($inp), @XMM[1]
1450 je .Lecb_dec_two
1451 movdqu 0x20($inp), @XMM[2]
1452 cmp \$4,$len
1453 jb .Lecb_dec_three
1454 movdqu 0x30($inp), @XMM[3]
1455 je .Lecb_dec_four
1456 movdqu 0x40($inp), @XMM[4]
1457 cmp \$6,$len
1458 jb .Lecb_dec_five
1459 movdqu 0x50($inp), @XMM[5]
1460 je .Lecb_dec_six
1461 movdqu 0x60($inp), @XMM[6]
1462 call _bsaes_decrypt8
1463 movdqu @XMM[0], 0x00($out) # write output
1464 movdqu @XMM[1], 0x10($out)
1465 movdqu @XMM[6], 0x20($out)
1466 movdqu @XMM[4], 0x30($out)
1467 movdqu @XMM[2], 0x40($out)
1468 movdqu @XMM[7], 0x50($out)
1469 movdqu @XMM[3], 0x60($out)
1470 jmp .Lecb_dec_done
1471 .align 16
1472 .Lecb_dec_six:
1473 call _bsaes_decrypt8
1474 movdqu @XMM[0], 0x00($out) # write output
1475 movdqu @XMM[1], 0x10($out)
1476 movdqu @XMM[6], 0x20($out)
1477 movdqu @XMM[4], 0x30($out)
1478 movdqu @XMM[2], 0x40($out)
1479 movdqu @XMM[7], 0x50($out)
1480 jmp .Lecb_dec_done
1481 .align 16
1482 .Lecb_dec_five:
1483 call _bsaes_decrypt8
1484 movdqu @XMM[0], 0x00($out) # write output
1485 movdqu @XMM[1], 0x10($out)
1486 movdqu @XMM[6], 0x20($out)
1487 movdqu @XMM[4], 0x30($out)
1488 movdqu @XMM[2], 0x40($out)
1489 jmp .Lecb_dec_done
1490 .align 16
1491 .Lecb_dec_four:
1492 call _bsaes_decrypt8
1493 movdqu @XMM[0], 0x00($out) # write output
1494 movdqu @XMM[1], 0x10($out)
1495 movdqu @XMM[6], 0x20($out)
1496 movdqu @XMM[4], 0x30($out)
1497 jmp .Lecb_dec_done
1498 .align 16
1499 .Lecb_dec_three:
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 movdqu @XMM[6], 0x20($out)
1504 jmp .Lecb_dec_done
1505 .align 16
1506 .Lecb_dec_two:
1507 call _bsaes_decrypt8
1508 movdqu @XMM[0], 0x00($out) # write output
1509 movdqu @XMM[1], 0x10($out)
1510 jmp .Lecb_dec_done
1511 .align 16
1512 .Lecb_dec_one:
1513 call _bsaes_decrypt8
1514 movdqu @XMM[0], 0x00($out) # write output
1515 jmp .Lecb_dec_done
1516 .align 16
1517 .Lecb_dec_short:
1518 lea ($inp), $arg1
1519 lea ($out), $arg2
1520 lea ($key), $arg3
1521 call asm_AES_decrypt
1522 lea 16($inp), $inp
1523 lea 16($out), $out
1524 dec $len
1525 jnz .Lecb_dec_short
1526
1527 .Lecb_dec_done:
1528 lea (%rsp),%rax
1529 pxor %xmm0, %xmm0
1530 .Lecb_dec_bzero: # wipe key schedule [if any]
1531 movdqa %xmm0, 0x00(%rax)
1532 movdqa %xmm0, 0x10(%rax)
1533 lea 0x20(%rax), %rax
1534 cmp %rax, %rbp
1535 jb .Lecb_dec_bzero
1536
1537 lea (%rbp),%rsp # restore %rsp
1538 ___
1539 $code.=<<___ if ($win64);
1540 movaps 0x40(%rbp), %xmm6
1541 movaps 0x50(%rbp), %xmm7
1542 movaps 0x60(%rbp), %xmm8
1543 movaps 0x70(%rbp), %xmm9
1544 movaps 0x80(%rbp), %xmm10
1545 movaps 0x90(%rbp), %xmm11
1546 movaps 0xa0(%rbp), %xmm12
1547 movaps 0xb0(%rbp), %xmm13
1548 movaps 0xc0(%rbp), %xmm14
1549 movaps 0xd0(%rbp), %xmm15
1550 lea 0xa0(%rbp), %rsp
1551 ___
1552 $code.=<<___;
1553 mov 0x48(%rsp), %r15
1554 mov 0x50(%rsp), %r14
1555 mov 0x58(%rsp), %r13
1556 mov 0x60(%rsp), %r12
1557 mov 0x68(%rsp), %rbx
1558 mov 0x70(%rsp), %rax
1559 lea 0x78(%rsp), %rsp
1560 mov %rax, %rbp
1561 .Lecb_dec_epilogue:
1562 ret
1563 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1564 ___
1565 }
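######################################################################
# void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#			size_t length, const AES_KEY *key,
#			unsigned char ivec[16], int enc);
#
# This mirrors the C-side declaration (compare the bsaes_xts_* comment
# further below); only the enc=0, i.e. decrypt, path is bit-sliced,
# everything else is delegated to asm_AES_cbc_encrypt.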
1566 $code.=<<___;
1567 .extern asm_AES_cbc_encrypt
1568 .globl bsaes_cbc_encrypt
1569 .type bsaes_cbc_encrypt,\@abi-omnipotent
1570 .align 16
1571 bsaes_cbc_encrypt:
1572 ___
1573 $code.=<<___ if ($win64);
1574 mov 48(%rsp),$arg6 # pull direction flag
1575 ___
1576 $code.=<<___;
1577 cmp \$0,$arg6
1578 jne asm_AES_cbc_encrypt
1579 cmp \$128,$arg3
1580 jb asm_AES_cbc_encrypt
1581
1582 mov %rsp, %rax
1583 .Lcbc_dec_prologue:
1584 push %rbp
1585 push %rbx
1586 push %r12
1587 push %r13
1588 push %r14
1589 push %r15
1590 lea -0x48(%rsp), %rsp
1591 ___
1592 $code.=<<___ if ($win64);
1593 mov 0xa0(%rsp),$arg5 # pull ivp
1594 lea -0xa0(%rsp), %rsp
1595 movaps %xmm6, 0x40(%rsp)
1596 movaps %xmm7, 0x50(%rsp)
1597 movaps %xmm8, 0x60(%rsp)
1598 movaps %xmm9, 0x70(%rsp)
1599 movaps %xmm10, 0x80(%rsp)
1600 movaps %xmm11, 0x90(%rsp)
1601 movaps %xmm12, 0xa0(%rsp)
1602 movaps %xmm13, 0xb0(%rsp)
1603 movaps %xmm14, 0xc0(%rsp)
1604 movaps %xmm15, 0xd0(%rsp)
1605 .Lcbc_dec_body:
1606 ___
1607 $code.=<<___;
1608 mov %rsp, %rbp # backup %rsp
1609 mov 240($arg4), %eax # rounds
1610 mov $arg1, $inp # backup arguments
1611 mov $arg2, $out
1612 mov $arg3, $len
1613 mov $arg4, $key
1614 mov $arg5, %rbx
1615 shr \$4, $len # bytes to blocks
1616
1617 mov %eax, %edx # rounds
1618 shl \$7, %rax # 128 bytes per inner round key
1619 sub \$`128-32`, %rax # size of bit-sliced key schedule
1620 sub %rax, %rsp
1621
1622 mov %rsp, %rax # pass key schedule
1623 mov $key, %rcx # pass key
1624 mov %edx, %r10d # pass rounds
1625 call _bsaes_key_convert
1626 	pxor	(%rsp),%xmm7		# fix up round 0 key
1627 movdqa %xmm6,(%rax) # save last round key
1628 movdqa %xmm7,(%rsp)
1629
1630 movdqu (%rbx), @XMM[15] # load IV
1631 sub \$8,$len
1632 .Lcbc_dec_loop:
1633 movdqu 0x00($inp), @XMM[0] # load input
1634 movdqu 0x10($inp), @XMM[1]
1635 movdqu 0x20($inp), @XMM[2]
1636 movdqu 0x30($inp), @XMM[3]
1637 movdqu 0x40($inp), @XMM[4]
1638 movdqu 0x50($inp), @XMM[5]
1639 mov %rsp, %rax # pass key schedule
1640 movdqu 0x60($inp), @XMM[6]
1641 mov %edx,%r10d # pass rounds
1642 movdqu 0x70($inp), @XMM[7]
1643 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1644
1645 call _bsaes_decrypt8
1646
1647 pxor 0x20(%rbp), @XMM[0] # ^= IV
1648 movdqu 0x00($inp), @XMM[8] # re-load input
1649 movdqu 0x10($inp), @XMM[9]
1650 pxor @XMM[8], @XMM[1]
1651 movdqu 0x20($inp), @XMM[10]
1652 pxor @XMM[9], @XMM[6]
1653 movdqu 0x30($inp), @XMM[11]
1654 pxor @XMM[10], @XMM[4]
1655 movdqu 0x40($inp), @XMM[12]
1656 pxor @XMM[11], @XMM[2]
1657 movdqu 0x50($inp), @XMM[13]
1658 pxor @XMM[12], @XMM[7]
1659 movdqu 0x60($inp), @XMM[14]
1660 pxor @XMM[13], @XMM[3]
1661 movdqu 0x70($inp), @XMM[15] # IV
1662 pxor @XMM[14], @XMM[5]
1663 movdqu @XMM[0], 0x00($out) # write output
1664 lea 0x80($inp), $inp
1665 movdqu @XMM[1], 0x10($out)
1666 movdqu @XMM[6], 0x20($out)
1667 movdqu @XMM[4], 0x30($out)
1668 movdqu @XMM[2], 0x40($out)
1669 movdqu @XMM[7], 0x50($out)
1670 movdqu @XMM[3], 0x60($out)
1671 movdqu @XMM[5], 0x70($out)
1672 lea 0x80($out), $out
1673 sub \$8,$len
1674 jnc .Lcbc_dec_loop
1675
1676 add \$8,$len
1677 jz .Lcbc_dec_done
1678
1679 movdqu 0x00($inp), @XMM[0] # load input
1680 mov %rsp, %rax # pass key schedule
1681 mov %edx, %r10d # pass rounds
1682 cmp \$2,$len
1683 jb .Lcbc_dec_one
1684 movdqu 0x10($inp), @XMM[1]
1685 je .Lcbc_dec_two
1686 movdqu 0x20($inp), @XMM[2]
1687 cmp \$4,$len
1688 jb .Lcbc_dec_three
1689 movdqu 0x30($inp), @XMM[3]
1690 je .Lcbc_dec_four
1691 movdqu 0x40($inp), @XMM[4]
1692 cmp \$6,$len
1693 jb .Lcbc_dec_five
1694 movdqu 0x50($inp), @XMM[5]
1695 je .Lcbc_dec_six
1696 movdqu 0x60($inp), @XMM[6]
1697 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1699 pxor 0x20(%rbp), @XMM[0] # ^= IV
1700 movdqu 0x00($inp), @XMM[8] # re-load input
1701 movdqu 0x10($inp), @XMM[9]
1702 pxor @XMM[8], @XMM[1]
1703 movdqu 0x20($inp), @XMM[10]
1704 pxor @XMM[9], @XMM[6]
1705 movdqu 0x30($inp), @XMM[11]
1706 pxor @XMM[10], @XMM[4]
1707 movdqu 0x40($inp), @XMM[12]
1708 pxor @XMM[11], @XMM[2]
1709 movdqu 0x50($inp), @XMM[13]
1710 pxor @XMM[12], @XMM[7]
1711 movdqu 0x60($inp), @XMM[15] # IV
1712 pxor @XMM[13], @XMM[3]
1713 movdqu @XMM[0], 0x00($out) # write output
1714 movdqu @XMM[1], 0x10($out)
1715 movdqu @XMM[6], 0x20($out)
1716 movdqu @XMM[4], 0x30($out)
1717 movdqu @XMM[2], 0x40($out)
1718 movdqu @XMM[7], 0x50($out)
1719 movdqu @XMM[3], 0x60($out)
1720 jmp .Lcbc_dec_done
1721 .align 16
1722 .Lcbc_dec_six:
1723 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1724 call _bsaes_decrypt8
1725 pxor 0x20(%rbp), @XMM[0] # ^= IV
1726 movdqu 0x00($inp), @XMM[8] # re-load input
1727 movdqu 0x10($inp), @XMM[9]
1728 pxor @XMM[8], @XMM[1]
1729 movdqu 0x20($inp), @XMM[10]
1730 pxor @XMM[9], @XMM[6]
1731 movdqu 0x30($inp), @XMM[11]
1732 pxor @XMM[10], @XMM[4]
1733 movdqu 0x40($inp), @XMM[12]
1734 pxor @XMM[11], @XMM[2]
1735 movdqu 0x50($inp), @XMM[15] # IV
1736 pxor @XMM[12], @XMM[7]
1737 movdqu @XMM[0], 0x00($out) # write output
1738 movdqu @XMM[1], 0x10($out)
1739 movdqu @XMM[6], 0x20($out)
1740 movdqu @XMM[4], 0x30($out)
1741 movdqu @XMM[2], 0x40($out)
1742 movdqu @XMM[7], 0x50($out)
1743 jmp .Lcbc_dec_done
1744 .align 16
1745 .Lcbc_dec_five:
1746 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1747 call _bsaes_decrypt8
1748 pxor 0x20(%rbp), @XMM[0] # ^= IV
1749 movdqu 0x00($inp), @XMM[8] # re-load input
1750 movdqu 0x10($inp), @XMM[9]
1751 pxor @XMM[8], @XMM[1]
1752 movdqu 0x20($inp), @XMM[10]
1753 pxor @XMM[9], @XMM[6]
1754 movdqu 0x30($inp), @XMM[11]
1755 pxor @XMM[10], @XMM[4]
1756 movdqu 0x40($inp), @XMM[15] # IV
1757 pxor @XMM[11], @XMM[2]
1758 movdqu @XMM[0], 0x00($out) # write output
1759 movdqu @XMM[1], 0x10($out)
1760 movdqu @XMM[6], 0x20($out)
1761 movdqu @XMM[4], 0x30($out)
1762 movdqu @XMM[2], 0x40($out)
1763 jmp .Lcbc_dec_done
1764 .align 16
1765 .Lcbc_dec_four:
1766 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1767 call _bsaes_decrypt8
1768 pxor 0x20(%rbp), @XMM[0] # ^= IV
1769 movdqu 0x00($inp), @XMM[8] # re-load input
1770 movdqu 0x10($inp), @XMM[9]
1771 pxor @XMM[8], @XMM[1]
1772 movdqu 0x20($inp), @XMM[10]
1773 pxor @XMM[9], @XMM[6]
1774 movdqu 0x30($inp), @XMM[15] # IV
1775 pxor @XMM[10], @XMM[4]
1776 movdqu @XMM[0], 0x00($out) # write output
1777 movdqu @XMM[1], 0x10($out)
1778 movdqu @XMM[6], 0x20($out)
1779 movdqu @XMM[4], 0x30($out)
1780 jmp .Lcbc_dec_done
1781 .align 16
1782 .Lcbc_dec_three:
1783 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1784 call _bsaes_decrypt8
1785 pxor 0x20(%rbp), @XMM[0] # ^= IV
1786 movdqu 0x00($inp), @XMM[8] # re-load input
1787 movdqu 0x10($inp), @XMM[9]
1788 pxor @XMM[8], @XMM[1]
1789 movdqu 0x20($inp), @XMM[15] # IV
1790 pxor @XMM[9], @XMM[6]
1791 movdqu @XMM[0], 0x00($out) # write output
1792 movdqu @XMM[1], 0x10($out)
1793 movdqu @XMM[6], 0x20($out)
1794 jmp .Lcbc_dec_done
1795 .align 16
1796 .Lcbc_dec_two:
1797 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1798 call _bsaes_decrypt8
1799 pxor 0x20(%rbp), @XMM[0] # ^= IV
1800 movdqu 0x00($inp), @XMM[8] # re-load input
1801 movdqu 0x10($inp), @XMM[15] # IV
1802 pxor @XMM[8], @XMM[1]
1803 movdqu @XMM[0], 0x00($out) # write output
1804 movdqu @XMM[1], 0x10($out)
1805 jmp .Lcbc_dec_done
1806 .align 16
1807 .Lcbc_dec_one:
1808 lea ($inp), $arg1
1809 lea 0x20(%rbp), $arg2 # buffer output
1810 lea ($key), $arg3
1811 call asm_AES_decrypt # doesn't touch %xmm
1812 pxor 0x20(%rbp), @XMM[15] # ^= IV
1813 movdqu @XMM[15], ($out) # write output
1814 movdqa @XMM[0], @XMM[15] # IV
1815
1816 .Lcbc_dec_done:
1817 movdqu @XMM[15], (%rbx) # return IV
1818 lea (%rsp), %rax
1819 pxor %xmm0, %xmm0
1820 .Lcbc_dec_bzero: # wipe key schedule [if any]
1821 movdqa %xmm0, 0x00(%rax)
1822 movdqa %xmm0, 0x10(%rax)
1823 lea 0x20(%rax), %rax
1824 cmp %rax, %rbp
1825 ja .Lcbc_dec_bzero
1826
1827 lea (%rbp),%rsp # restore %rsp
1828 ___
1829 $code.=<<___ if ($win64);
1830 movaps 0x40(%rbp), %xmm6
1831 movaps 0x50(%rbp), %xmm7
1832 movaps 0x60(%rbp), %xmm8
1833 movaps 0x70(%rbp), %xmm9
1834 movaps 0x80(%rbp), %xmm10
1835 movaps 0x90(%rbp), %xmm11
1836 movaps 0xa0(%rbp), %xmm12
1837 movaps 0xb0(%rbp), %xmm13
1838 movaps 0xc0(%rbp), %xmm14
1839 movaps 0xd0(%rbp), %xmm15
1840 lea 0xa0(%rbp), %rsp
1841 ___
1842 $code.=<<___;
1843 mov 0x48(%rsp), %r15
1844 mov 0x50(%rsp), %r14
1845 mov 0x58(%rsp), %r13
1846 mov 0x60(%rsp), %r12
1847 mov 0x68(%rsp), %rbx
1848 mov 0x70(%rsp), %rax
1849 lea 0x78(%rsp), %rsp
1850 mov %rax, %rbp
1851 .Lcbc_dec_epilogue:
1852 ret
1853 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1854
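######################################################################
# void bsaes_ctr32_encrypt_blocks(const unsigned char *in,
#			unsigned char *out, size_t len,
#			const AES_KEY *key, const unsigned char ivec[16]);
#
# Unlike bsaes_cbc_encrypt above, len counts 16-byte blocks here, and,
# as the name suggests, only the low 32 bits of the counter move.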
1855 .globl bsaes_ctr32_encrypt_blocks
1856 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1857 .align 16
1858 bsaes_ctr32_encrypt_blocks:
1859 mov %rsp, %rax
1860 .Lctr_enc_prologue:
1861 push %rbp
1862 push %rbx
1863 push %r12
1864 push %r13
1865 push %r14
1866 push %r15
1867 lea -0x48(%rsp), %rsp
1868 ___
1869 $code.=<<___ if ($win64);
1870 mov 0xa0(%rsp),$arg5 # pull ivp
1871 lea -0xa0(%rsp), %rsp
1872 movaps %xmm6, 0x40(%rsp)
1873 movaps %xmm7, 0x50(%rsp)
1874 movaps %xmm8, 0x60(%rsp)
1875 movaps %xmm9, 0x70(%rsp)
1876 movaps %xmm10, 0x80(%rsp)
1877 movaps %xmm11, 0x90(%rsp)
1878 movaps %xmm12, 0xa0(%rsp)
1879 movaps %xmm13, 0xb0(%rsp)
1880 movaps %xmm14, 0xc0(%rsp)
1881 movaps %xmm15, 0xd0(%rsp)
1882 .Lctr_enc_body:
1883 ___
1884 $code.=<<___;
1885 mov %rsp, %rbp # backup %rsp
1886 movdqu ($arg5), %xmm0 # load counter
1887 mov 240($arg4), %eax # rounds
1888 mov $arg1, $inp # backup arguments
1889 mov $arg2, $out
1890 mov $arg3, $len
1891 mov $arg4, $key
1892 movdqa %xmm0, 0x20(%rbp) # copy counter
1893 cmp \$8, $arg3
1894 jb .Lctr_enc_short
1895
1896 mov %eax, %ebx # rounds
1897 shl \$7, %rax # 128 bytes per inner round key
1898 sub \$`128-32`, %rax # size of bit-sliced key schedule
1899 sub %rax, %rsp
1900
1901 mov %rsp, %rax # pass key schedule
1902 mov $key, %rcx # pass key
1903 mov %ebx, %r10d # pass rounds
1904 call _bsaes_key_convert
1905 pxor %xmm6,%xmm7 # fix up last round key
1906 movdqa %xmm7,(%rax) # save last round key
1907
1908 movdqa (%rsp), @XMM[9] # load round0 key
1909 lea .LADD1(%rip), %r11
1910 movdqa 0x20(%rbp), @XMM[0] # counter copy
1911 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1912 pshufb @XMM[8], @XMM[9] # byte swap upper part
1913 pshufb @XMM[8], @XMM[0]
1914 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1915 jmp .Lctr_enc_loop
1916 .align 16
1917 .Lctr_enc_loop:
1918 movdqa @XMM[0], 0x20(%rbp) # save counter
1919 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1920 movdqa @XMM[0], @XMM[2]
1921 paddd 0x00(%r11), @XMM[1] # .LADD1
1922 movdqa @XMM[0], @XMM[3]
1923 paddd 0x10(%r11), @XMM[2] # .LADD2
1924 movdqa @XMM[0], @XMM[4]
1925 paddd 0x20(%r11), @XMM[3] # .LADD3
1926 movdqa @XMM[0], @XMM[5]
1927 paddd 0x30(%r11), @XMM[4] # .LADD4
1928 movdqa @XMM[0], @XMM[6]
1929 paddd 0x40(%r11), @XMM[5] # .LADD5
1930 movdqa @XMM[0], @XMM[7]
1931 paddd 0x50(%r11), @XMM[6] # .LADD6
1932 paddd 0x60(%r11), @XMM[7] # .LADD7
1933
1934	# Borrow the prologue from _bsaes_encrypt8 to take the opportunity
1935	# to flip byte order in the 32-bit counter
1936 movdqa (%rsp), @XMM[9] # round 0 key
1937 lea 0x10(%rsp), %rax # pass key schedule
1938 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1939 pxor @XMM[9], @XMM[0] # xor with round0 key
1940 pxor @XMM[9], @XMM[1]
1941 pxor @XMM[9], @XMM[2]
1942 pxor @XMM[9], @XMM[3]
1943 pshufb @XMM[8], @XMM[0]
1944 pshufb @XMM[8], @XMM[1]
1945 pxor @XMM[9], @XMM[4]
1946 pxor @XMM[9], @XMM[5]
1947 pshufb @XMM[8], @XMM[2]
1948 pshufb @XMM[8], @XMM[3]
1949 pxor @XMM[9], @XMM[6]
1950 pxor @XMM[9], @XMM[7]
1951 pshufb @XMM[8], @XMM[4]
1952 pshufb @XMM[8], @XMM[5]
1953 pshufb @XMM[8], @XMM[6]
1954 pshufb @XMM[8], @XMM[7]
1955 lea .LBS0(%rip), %r11 # constants table
1956 mov %ebx,%r10d # pass rounds
1957
1958 call _bsaes_encrypt8_bitslice
1959
1960 sub \$8,$len
1961 jc .Lctr_enc_loop_done
1962
1963 movdqu 0x00($inp), @XMM[8] # load input
1964 movdqu 0x10($inp), @XMM[9]
1965 movdqu 0x20($inp), @XMM[10]
1966 movdqu 0x30($inp), @XMM[11]
1967 movdqu 0x40($inp), @XMM[12]
1968 movdqu 0x50($inp), @XMM[13]
1969 movdqu 0x60($inp), @XMM[14]
1970 movdqu 0x70($inp), @XMM[15]
1971 lea 0x80($inp),$inp
1972 pxor @XMM[0], @XMM[8]
1973 movdqa 0x20(%rbp), @XMM[0] # load counter
1974 pxor @XMM[9], @XMM[1]
1975 movdqu @XMM[8], 0x00($out) # write output
1976 pxor @XMM[10], @XMM[4]
1977 movdqu @XMM[1], 0x10($out)
1978 pxor @XMM[11], @XMM[6]
1979 movdqu @XMM[4], 0x20($out)
1980 pxor @XMM[12], @XMM[3]
1981 movdqu @XMM[6], 0x30($out)
1982 pxor @XMM[13], @XMM[7]
1983 movdqu @XMM[3], 0x40($out)
1984 pxor @XMM[14], @XMM[2]
1985 movdqu @XMM[7], 0x50($out)
1986 pxor @XMM[15], @XMM[5]
1987 movdqu @XMM[2], 0x60($out)
1988 lea .LADD1(%rip), %r11
1989 movdqu @XMM[5], 0x70($out)
1990 lea 0x80($out), $out
1991 paddd 0x70(%r11), @XMM[0] # .LADD8
1992 jnz .Lctr_enc_loop
1993
1994 jmp .Lctr_enc_done
1995 .align 16
1996 .Lctr_enc_loop_done:
1997 add \$8, $len
1998 movdqu 0x00($inp), @XMM[8] # load input
1999 pxor @XMM[8], @XMM[0]
2000 movdqu @XMM[0], 0x00($out) # write output
2001 cmp \$2,$len
2002 jb .Lctr_enc_done
2003 movdqu 0x10($inp), @XMM[9]
2004 pxor @XMM[9], @XMM[1]
2005 movdqu @XMM[1], 0x10($out)
2006 je .Lctr_enc_done
2007 movdqu 0x20($inp), @XMM[10]
2008 pxor @XMM[10], @XMM[4]
2009 movdqu @XMM[4], 0x20($out)
2010 cmp \$4,$len
2011 jb .Lctr_enc_done
2012 movdqu 0x30($inp), @XMM[11]
2013 pxor @XMM[11], @XMM[6]
2014 movdqu @XMM[6], 0x30($out)
2015 je .Lctr_enc_done
2016 movdqu 0x40($inp), @XMM[12]
2017 pxor @XMM[12], @XMM[3]
2018 movdqu @XMM[3], 0x40($out)
2019 cmp \$6,$len
2020 jb .Lctr_enc_done
2021 movdqu 0x50($inp), @XMM[13]
2022 pxor @XMM[13], @XMM[7]
2023 movdqu @XMM[7], 0x50($out)
2024 je .Lctr_enc_done
2025 movdqu 0x60($inp), @XMM[14]
2026 pxor @XMM[14], @XMM[2]
2027 movdqu @XMM[2], 0x60($out)
2028 jmp .Lctr_enc_done
2029
2030 .align 16
2031 .Lctr_enc_short:
2032 lea 0x20(%rbp), $arg1
2033 lea 0x30(%rbp), $arg2
2034 lea ($key), $arg3
2035 call asm_AES_encrypt
2036 movdqu ($inp), @XMM[1]
2037 lea 16($inp), $inp
2038 mov 0x2c(%rbp), %eax # load 32-bit counter
2039 bswap %eax
2040 pxor 0x30(%rbp), @XMM[1]
2041 inc %eax # increment
2042 movdqu @XMM[1], ($out)
2043 bswap %eax
2044 lea 16($out), $out
2045 	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp equals %rbp here)
2046 dec $len
2047 jnz .Lctr_enc_short
2048
2049 .Lctr_enc_done:
2050 lea (%rsp), %rax
2051 pxor %xmm0, %xmm0
2052 .Lctr_enc_bzero: # wipe key schedule [if any]
2053 movdqa %xmm0, 0x00(%rax)
2054 movdqa %xmm0, 0x10(%rax)
2055 lea 0x20(%rax), %rax
2056 cmp %rax, %rbp
2057 ja .Lctr_enc_bzero
2058
2059 lea (%rbp),%rsp # restore %rsp
2060 ___
2061 $code.=<<___ if ($win64);
2062 movaps 0x40(%rbp), %xmm6
2063 movaps 0x50(%rbp), %xmm7
2064 movaps 0x60(%rbp), %xmm8
2065 movaps 0x70(%rbp), %xmm9
2066 movaps 0x80(%rbp), %xmm10
2067 movaps 0x90(%rbp), %xmm11
2068 movaps 0xa0(%rbp), %xmm12
2069 movaps 0xb0(%rbp), %xmm13
2070 movaps 0xc0(%rbp), %xmm14
2071 movaps 0xd0(%rbp), %xmm15
2072 lea 0xa0(%rbp), %rsp
2073 ___
2074 $code.=<<___;
2075 mov 0x48(%rsp), %r15
2076 mov 0x50(%rsp), %r14
2077 mov 0x58(%rsp), %r13
2078 mov 0x60(%rsp), %r12
2079 mov 0x68(%rsp), %rbx
2080 mov 0x70(%rsp), %rax
2081 lea 0x78(%rsp), %rsp
2082 mov %rax, %rbp
2083 .Lctr_enc_epilogue:
2084 ret
2085 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2086 ___
2087 ######################################################################
2088 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2089 # const AES_KEY *key1, const AES_KEY *key2,
2090 # const unsigned char iv[16]);
2091 #
2092 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2093 $arg6=~s/d$//;		# $arg6 holds a pointer here, use the full register
2094
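# The tweak scheduling below multiplies the tweak by x in GF(2^128) with
# the XTS reduction polynomial x^128+x^7+x^2+x+1: paddq doubles the two
# 64-bit halves, while the pcmpgtd/pshufd/pand sequence recovers the bit
# that paddq drops between the halves together with the 0x87 residue
# folded in when bit 127 falls off. A scalar sketch of that update,
# assuming 64-bit perl (not used at build time):
if (0) {
	sub xts_mul_x {
	    my ($lo,$hi) = @_;			# tweak as two 64-bit limbs
	    my $carry = ($hi >> 63) & 1;	# bit 127 falls off the top
	    $hi = (($hi << 1) & 0xffffffffffffffff) | (($lo >> 63) & 1);
	    $lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
	    return ($lo,$hi);
	}
	my ($lo,$hi) = (1,0);			# tweak = 1
	($lo,$hi) = xts_mul_x($lo,$hi) for (1..8);
	die "xts_mul_x is broken" if ($lo != 0x100 || $hi != 0);
}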
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,\@abi-omnipotent
.align 16
bsaes_xts_encrypt:
	mov %rsp, %rax
.Lxts_enc_prologue:
	push %rbp
	push %rbx
	push %r12
	push %r13
	push %r14
	push %r15
	lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov 0xa0(%rsp),$arg5 # pull key2
	mov 0xa8(%rsp),$arg6 # pull ivp
	lea -0xa0(%rsp), %rsp
	movaps %xmm6, 0x40(%rsp)
	movaps %xmm7, 0x50(%rsp)
	movaps %xmm8, 0x60(%rsp)
	movaps %xmm9, 0x70(%rsp)
	movaps %xmm10, 0x80(%rsp)
	movaps %xmm11, 0x90(%rsp)
	movaps %xmm12, 0xa0(%rsp)
	movaps %xmm13, 0xb0(%rsp)
	movaps %xmm14, 0xc0(%rsp)
	movaps %xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov %rsp, %rbp # backup %rsp
	mov $arg1, $inp # backup arguments
	mov $arg2, $out
	mov $arg3, $len
	mov $arg4, $key

	lea ($arg6), $arg1
	lea 0x20(%rbp), $arg2
	lea ($arg5), $arg3
	call asm_AES_encrypt # generate initial tweak

	mov 240($key), %eax # rounds
	mov $len, %rbx # backup $len

	mov %eax, %edx # rounds
	shl \$7, %rax # 128 bytes per inner round key
	sub \$`128-32`, %rax # size of bit-sliced key schedule
	sub %rax, %rsp

	mov %rsp, %rax # pass key schedule
	mov $key, %rcx # pass key
	mov %edx, %r10d # pass rounds
	call _bsaes_key_convert
	pxor %xmm6, %xmm7 # fix up last round key
	movdqa %xmm7, (%rax) # save last round key

	and \$-16, $len
	sub \$0x80, %rsp # place for tweak[8]
	movdqa 0x20(%rbp), @XMM[7] # initial tweak

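	# Tweak doubling without 128-bit shifts: pcmpgtd against zero
	# broadcasts each dword's sign bit, pshufd 0x13 routes the two
	# carry masks to where they are consumed, and the .Lxts_magic mask
	# keeps just the inter-qword carry and the reduction constant 0x87
	# (the polynomial x^7+x^2+x+1) before they are xored back in.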
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits

	sub \$0x80, $len
	jc .Lxts_enc_short
	jmp .Lxts_enc_loop

.align 16
.Lxts_enc_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
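# Scalar reference for the vectorized tweak update above: multiply the
# 128-bit tweak (16 little-endian bytes) by x modulo x^128+x^7+x^2+x+1,
# which is what the pshufd/pand/paddq/pxor sequence with .Lxts_magic
# computes. For illustration only (never called):
sub xts_mul_x_ref {
	my @t = unpack("C16", shift);		# tweak bytes, least significant first
	my $carry = $t[15] >> 7;		# bit shifted out at the top
	for (my $i = 15; $i > 0; $i--) {
		$t[$i] = (($t[$i] << 1) | ($t[$i-1] >> 7)) & 0xff;
	}
	$t[0] = (($t[0] << 1) & 0xff) ^ ($carry ? 0x87 : 0);
	return pack("C16", @t);
}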
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqu 0x70($inp), @XMM[8+7]
	lea 0x80($inp), $inp
	movdqa @XMM[7], 0x70(%rsp)
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	pxor @XMM[8+7], @XMM[7]
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor 0x60(%rsp), @XMM[2]
	movdqu @XMM[7], 0x50($out)
	pxor 0x70(%rsp), @XMM[5]
	movdqu @XMM[2], 0x60($out)
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out

	movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]

	sub \$0x80,$len
	jnc .Lxts_enc_loop

.Lxts_enc_short:
	add \$0x80, $len
	jz .Lxts_enc_done
___
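# Short path: generate and save one tweak per remaining block, then
# dispatch on the exact block count; .Lxts_enc_1 handles a single block
# with plain AES, .Lxts_enc_2..7 go through the 8x bit-sliced core.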
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp \$`0x10*$i`,$len
	je .Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqa @XMM[7], 0x70(%rsp)
	lea 0x70($inp), $inp
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor 0x60(%rsp), @XMM[2]
	movdqu @XMM[7], 0x50($out)
	movdqu @XMM[2], 0x60($out)
	lea 0x70($out), $out

	movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_6:
	pxor @XMM[8+4], @XMM[4]
	lea 0x60($inp), $inp
	pxor @XMM[8+5], @XMM[5]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	lea 0x60($out), $out

	movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_5:
	pxor @XMM[8+3], @XMM[3]
	lea 0x50($inp), $inp
	pxor @XMM[8+4], @XMM[4]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	movdqu @XMM[3], 0x40($out)
	lea 0x50($out), $out

	movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_4:
	pxor @XMM[8+2], @XMM[2]
	lea 0x40($inp), $inp
	pxor @XMM[8+3], @XMM[3]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	movdqu @XMM[6], 0x30($out)
	lea 0x40($out), $out

	movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_3:
	pxor @XMM[8+1], @XMM[1]
	lea 0x30($inp), $inp
	pxor @XMM[8+2], @XMM[2]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[4], 0x20($out)
	lea 0x30($out), $out

	movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_2:
	pxor @XMM[8+0], @XMM[0]
	lea 0x20($inp), $inp
	pxor @XMM[8+1], @XMM[1]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_encrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	lea 0x20($out), $out

	movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_1:
	pxor @XMM[0], @XMM[8]
	lea 0x10($inp), $inp
	movdqa @XMM[8], 0x20(%rbp)
	lea 0x20(%rbp), $arg1
	lea 0x20(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_encrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
	#pxor @XMM[8], @XMM[0]
	#lea 0x80(%rsp), %rax # pass key schedule
	#mov %edx, %r10d # pass rounds
	#call _bsaes_encrypt8
	#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	movdqu @XMM[0], 0x00($out) # write output
	lea 0x10($out), $out

	movdqa 0x10(%rsp), @XMM[7] # next iteration tweak

.Lxts_enc_done:
	and \$15, %ebx
	jz .Lxts_enc_ret
	mov $out, %rdx

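	# Ciphertext stealing: move the trailing partial plaintext into the
	# head of the last full ciphertext block, emit the displaced
	# ciphertext bytes as the short final block, then re-encrypt the
	# merged block in place under the last tweak.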
.Lxts_enc_steal:
	movzb ($inp), %eax
	movzb -16(%rdx), %ecx
	lea 1($inp), $inp
	mov %al, -16(%rdx)
	mov %cl, 0(%rdx)
	lea 1(%rdx), %rdx
	sub \$1,%ebx
	jnz .Lxts_enc_steal

	movdqu -16($out), @XMM[0]
	lea 0x20(%rbp), $arg1
	pxor @XMM[7], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_encrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[7]
	movdqu @XMM[7], -16($out)

.Lxts_enc_ret:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lxts_enc_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lxts_enc_bzero

	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lxts_enc_epilogue:
	ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,\@abi-omnipotent
.align 16
bsaes_xts_decrypt:
	mov %rsp, %rax
.Lxts_dec_prologue:
	push %rbp
	push %rbx
	push %r12
	push %r13
	push %r14
	push %r15
	lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov 0xa0(%rsp),$arg5 # pull key2
	mov 0xa8(%rsp),$arg6 # pull ivp
	lea -0xa0(%rsp), %rsp
	movaps %xmm6, 0x40(%rsp)
	movaps %xmm7, 0x50(%rsp)
	movaps %xmm8, 0x60(%rsp)
	movaps %xmm9, 0x70(%rsp)
	movaps %xmm10, 0x80(%rsp)
	movaps %xmm11, 0x90(%rsp)
	movaps %xmm12, 0xa0(%rsp)
	movaps %xmm13, 0xb0(%rsp)
	movaps %xmm14, 0xc0(%rsp)
	movaps %xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov %rsp, %rbp # backup %rsp
	mov $arg1, $inp # backup arguments
	mov $arg2, $out
	mov $arg3, $len
	mov $arg4, $key

	lea ($arg6), $arg1
	lea 0x20(%rbp), $arg2
	lea ($arg5), $arg3
	call asm_AES_encrypt # generate initial tweak

	mov 240($key), %eax # rounds
	mov $len, %rbx # backup $len

	mov %eax, %edx # rounds
	shl \$7, %rax # 128 bytes per inner round key
	sub \$`128-32`, %rax # size of bit-sliced key schedule
	sub %rax, %rsp

	mov %rsp, %rax # pass key schedule
	mov $key, %rcx # pass key
	mov %edx, %r10d # pass rounds
	call _bsaes_key_convert
	pxor (%rsp), %xmm7 # fix up round 0 key
	movdqa %xmm6, (%rax) # save last round key
	movdqa %xmm7, (%rsp)

	xor %eax, %eax # if ($len%16) len-=16;
	and \$-16, $len
	test \$15, %ebx
	setnz %al
	shl \$4, %rax
	sub %rax, $len

	sub \$0x80, %rsp # place for tweak[8]
	movdqa 0x20(%rbp), @XMM[7] # initial tweak

	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits

	sub \$0x80, $len
	jc .Lxts_dec_short
	jmp .Lxts_dec_loop

.align 16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqu 0x70($inp), @XMM[8+7]
	lea 0x80($inp), $inp
	movdqa @XMM[7], 0x70(%rsp)
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	pxor @XMM[8+7], @XMM[7]
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	pxor 0x60(%rsp), @XMM[3]
	movdqu @XMM[7], 0x50($out)
	pxor 0x70(%rsp), @XMM[5]
	movdqu @XMM[3], 0x60($out)
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out

	movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]

	sub \$0x80,$len
	jnc .Lxts_dec_loop

.Lxts_dec_short:
	add \$0x80, $len
	jz .Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp \$`0x10*$i`,$len
	je .Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqa @XMM[7], 0x70(%rsp)
	lea 0x70($inp), $inp
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	pxor 0x60(%rsp), @XMM[3]
	movdqu @XMM[7], 0x50($out)
	movdqu @XMM[3], 0x60($out)
	lea 0x70($out), $out

	movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_6:
	pxor @XMM[8+4], @XMM[4]
	lea 0x60($inp), $inp
	pxor @XMM[8+5], @XMM[5]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	lea 0x60($out), $out

	movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_5:
	pxor @XMM[8+3], @XMM[3]
	lea 0x50($inp), $inp
	pxor @XMM[8+4], @XMM[4]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	movdqu @XMM[2], 0x40($out)
	lea 0x50($out), $out

	movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_4:
	pxor @XMM[8+2], @XMM[2]
	lea 0x40($inp), $inp
	pxor @XMM[8+3], @XMM[3]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	lea 0x40($out), $out

	movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_3:
	pxor @XMM[8+1], @XMM[1]
	lea 0x30($inp), $inp
	pxor @XMM[8+2], @XMM[2]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	lea 0x30($out), $out

	movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_2:
	pxor @XMM[8+0], @XMM[0]
	lea 0x20($inp), $inp
	pxor @XMM[8+1], @XMM[1]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds

	call _bsaes_decrypt8

	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	lea 0x20($out), $out

	movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_1:
	pxor @XMM[0], @XMM[8]
	lea 0x10($inp), $inp
	movdqa @XMM[8], 0x20(%rbp)
	lea 0x20(%rbp), $arg1
	lea 0x20(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
	#pxor @XMM[8], @XMM[0]
	#lea 0x80(%rsp), %rax # pass key schedule
	#mov %edx, %r10d # pass rounds
	#call _bsaes_decrypt8
	#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	movdqu @XMM[0], 0x00($out) # write output
	lea 0x10($out), $out

	movdqa 0x10(%rsp), @XMM[7] # next iteration tweak

.Lxts_dec_done:
	and \$15, %ebx
	jz .Lxts_dec_ret

	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	movdqa @XMM[7], @XMM[6]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	movdqu ($inp), @XMM[0]
	pxor $twres, @XMM[7]

	lea 0x20(%rbp), $arg1
	pxor @XMM[7], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[7]
	mov $out, %rdx
	movdqu @XMM[7], ($out)

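	# Ciphertext stealing, decryption order: the block just decrypted
	# above used the next tweak; the steal loop folds the partial
	# ciphertext back into its head while emitting the displaced bytes
	# as the short final output, and the merged block is then decrypted
	# under the tweak saved earlier.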
.Lxts_dec_steal:
	movzb 16($inp), %eax
	movzb (%rdx), %ecx
	lea 1($inp), $inp
	mov %al, (%rdx)
	mov %cl, 16(%rdx)
	lea 1(%rdx), %rdx
	sub \$1,%ebx
	jnz .Lxts_dec_steal

	movdqu ($out), @XMM[0]
	lea 0x20(%rbp), $arg1
	pxor @XMM[6], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[6]
	movdqu @XMM[6], ($out)

.Lxts_dec_ret:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lxts_dec_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lxts_dec_bzero

	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lxts_dec_epilogue:
	ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type _bsaes_const,\@object
.align 64
_bsaes_const:
.LM0ISR: # InvShiftRows constants
	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0: # bit-slice constants
	.quad 0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad 0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR: # shiftrows constants
	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP: # byte-swap upper dword
	.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad 0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1: # counter increment constants
	.quad 0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad 0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad 0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad 0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad 0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad 0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad 0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad 0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long 0x87,0,1,0
.Lmasks:
	.quad 0x0101010101010101, 0x0101010101010101
	.quad 0x0202020202020202, 0x0202020202020202
	.quad 0x0404040404040404, 0x0404040404040404
	.quad 0x0808080808080808, 0x0808080808080808
.LM0:
	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad 0x6363636363636363, 0x6363636363636363
.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align 64
.size _bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#	CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

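# se_handler decides what to unwind from HandlerData, which carries the
# RVAs of each function's body and epilogue labels: outside of that range
# the frame is not yet (or no longer) set up, while inside it the saved
# %xmm and general-purpose registers are copied back into the CONTEXT
# before RtlVirtualUnwind finishes the job.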
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp

	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip

	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData

	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lin_prologue

	mov 152($context),%rax # pull context->Rsp

	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lin_prologue

	mov 160($context),%rax # pull context->Rbp

	lea 0x40(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
	lea 0xa0(%rax),%rax # adjust stack pointer

	mov 0x70(%rax),%rbp
	mov 0x68(%rax),%rbx
	mov 0x60(%rax),%r12
	mov 0x58(%rax),%r13
	mov 0x50(%rax),%r14
	mov 0x48(%rax),%r15
	lea 0x78(%rax),%rax # adjust stack pointer
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15

.Lin_prologue:
	mov %rax,152($context) # restore context->Rsp

	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$`1232/8`,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq

	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)

	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler

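# Each .pdata triplet below registers one function's prologue..epilogue
# range with the Windows unwinder; the matching .xdata records point the
# range at se_handler and supply the body/epilogue labels as HandlerData.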
.section .pdata
.align 4
___
$code.=<<___ if ($ecb);
	.rva .Lecb_enc_prologue
	.rva .Lecb_enc_epilogue
	.rva .Lecb_enc_info

	.rva .Lecb_dec_prologue
	.rva .Lecb_dec_epilogue
	.rva .Lecb_dec_info
___
$code.=<<___;
	.rva .Lcbc_dec_prologue
	.rva .Lcbc_dec_epilogue
	.rva .Lcbc_dec_info

	.rva .Lctr_enc_prologue
	.rva .Lctr_enc_epilogue
	.rva .Lctr_enc_info

	.rva .Lxts_enc_prologue
	.rva .Lxts_enc_epilogue
	.rva .Lxts_enc_info

	.rva .Lxts_dec_prologue
	.rva .Lxts_dec_epilogue
	.rva .Lxts_dec_info

.section .xdata
.align 8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
.Lecb_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
.Lctr_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
.Lxts_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.Lxts_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;	# constant-fold the `...` expressions

print $code;

close STDOUT or die "error closing STDOUT: $!";