#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Having started as a transliteration to "perlasm", the original code
# has undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction,
#   from 12.5KB to 2.2KB;
# - the above was made possible by a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was achieved
#   at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of a "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.88		+11%
# Atom		17.1		16.4		+4%
# Silvermont	-		12.9
# Goldmont	-		8.85
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations stand for not more
#	than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles, and its ratio to the CPU cycles spent
# in the 8x block function, is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272-byte ones - 29%, 400-byte ones - 22%, etc. Yet, despite all
# these "shortcomings" it's still faster than the
# ["hyper-threading-safe" code path in] aes-x86_64.pl on all lengths
# above 64 bytes...
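#
# As a rough model (an illustrative sketch, not part of the module):
# conversion runs once per call, so for an input of 128*n bytes the
# slowdown is approximately ratio/n, which slightly overestimates the
# measured figures above:
#
#	my $ratio = 0.22;		# Core 2, from the table above
#	printf "%4d bytes: ~%.0f%% slower\n", 128*$_, 100*$ratio/$_
#	    for 1..3;			# ~22%, ~11%, ~7%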
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
# Goldmont	10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than
# 80 bytes is suboptimal, but XTS is meant to be used with larger
# blocks anyway...
#
#						<appro@openssl.org>

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
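# The S-box is evaluated Käsper-Schwabe style: an affine change of
# basis into the tower field GF(((2^2)^2)^2) (InBasisChange), inversion
# there (Inv_GF256), then a change of basis back that is merged with
# the S-box's own affine transformation (OutBasisChange).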
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
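
# A scalar model of the sequence above, treating each of the 128 bit
# positions of the xmm registers independently (an illustrative,
# standalone sketch; mul_gf4_model is not part of this module). In
# the normal-basis representation used by this S-box the bit pair
# (1,1) acts as the multiplicative identity, which the model confirms:
#
#	sub mul_gf4_model {
#		my ($x0,$x1,$y0,$y1) = @_;		# one bit each
#		my $t0 = $x0 & ($y0 ^ $y1);
#		my $r0 = (($x0 ^ $x1) & $y1) ^ ($x1 & $y0);
#		my $r1 = ($x1 & $y0) ^ $t0;
#		return ($r0, $r1);			# product, same basis
#	}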

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
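# Two GF(16) multiplications sharing the factor in (y0..y3), each
# decomposed Karatsuba-style over GF(4): a high*high and a low*low
# product plus a scaled (high+low)*(high+low) middle term computed by
# Mul_GF4_N_GF4 (a descriptive summary of the calls below).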
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
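# AddRoundKey is folded in here: each bit-plane is XORed with the
# corresponding plane of the bit-sliced round key, after which
# ShiftRows reduces to one and the same byte permutation applied to
# every plane, i.e. a single pshufb per register.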
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
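#
# The identity is easy to double-check with a few lines of standalone
# Perl (illustrative only, not used by this module):
#
#	sub gmul {				# GF(2^8) mult, AES polynomial
#		my ($a,$b,$p) = (@_,0);
#		for (0..7) {
#			$p ^= $a if $b & 1;
#			$a = ($a & 0x80) ? (($a << 1) ^ 0x11b) : ($a << 1);
#			$b >>= 1;
#		}
#		return $p;
#	}
#	my @mc = (0x02,0x03,0x01,0x01);
#	my @d  = (0x05,0x00,0x04,0x00);
#	for my $j (0..3) {			# row 0 of the circulant product
#		my $c = 0;
#		$c ^= gmul($mc[$_], $d[($j-$_) % 4]) for 0..3;
#		printf "%02x ", $c;		# prints: 0e 0b 0d 09
#	}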

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
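
# Scalar model of the sequence above (an illustrative sketch, not part
# of the module): this is the classic "SWAPMOVE" bit-permutation step,
# exchanging the bits of $a selected by $mask with the bits of $b
# selected by $mask << $n:
#
#	sub swapmove_model {
#		my ($a,$b,$n,$mask) = @_;
#		my $m = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $m, $b ^ ($m << $n));
#	}
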
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
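# Three swapmove passes with strides 1, 2 and 4 transpose the 8x8 bit
# matrix formed by the eight registers, so that register i ends up
# holding bit i of every byte of the eight input blocks (the masks
# .LBS0/.LBS1/.LBS2 are the usual 0x55../0x33../0x0f.. patterns).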
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

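# _bsaes_key_convert turns a conventional AES key schedule into its
# bit-sliced form: each round key is byte-shuffled (.LM0), then every
# bit position is spread into a full 0x00/0xff byte mask via pand and
# pcmpeqb against the walking 0x01/0x02/... constants, yielding eight
# 128-bit planes per round key. Planes 0, 1, 5 and 6 are stored
# complemented ("pnot"), which amounts to XORing 0x63 into every key
# byte and folds the S-box's affine constant into the key addition
# (a descriptive summary of the code below).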
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
.cfi_startproc
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.cfi_endproc
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}

if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_enc_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
.cfi_startproc
	endbranch
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lcbc_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lcbc_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1919
1920 .globl bsaes_ctr32_encrypt_blocks
1921 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1922 .align 16
1923 bsaes_ctr32_encrypt_blocks:
1924 .cfi_startproc
1925 endbranch
1926 mov %rsp, %rax
1927 .Lctr_enc_prologue:
1928 push %rbp
1929 .cfi_push %rbp
1930 push %rbx
1931 .cfi_push %rbx
1932 push %r12
1933 .cfi_push %r12
1934 push %r13
1935 .cfi_push %r13
1936 push %r14
1937 .cfi_push %r14
1938 push %r15
1939 .cfi_push %r15
1940 lea -0x48(%rsp), %rsp
1941 .cfi_adjust_cfa_offset 0x48
1942 ___
1943 $code.=<<___ if ($win64);
1944 mov 0xa0(%rsp),$arg5 # pull ivp
1945 lea -0xa0(%rsp), %rsp
1946 movaps %xmm6, 0x40(%rsp)
1947 movaps %xmm7, 0x50(%rsp)
1948 movaps %xmm8, 0x60(%rsp)
1949 movaps %xmm9, 0x70(%rsp)
1950 movaps %xmm10, 0x80(%rsp)
1951 movaps %xmm11, 0x90(%rsp)
1952 movaps %xmm12, 0xa0(%rsp)
1953 movaps %xmm13, 0xb0(%rsp)
1954 movaps %xmm14, 0xc0(%rsp)
1955 movaps %xmm15, 0xd0(%rsp)
1956 .Lctr_enc_body:
1957 ___
1958 $code.=<<___;
1959 mov %rsp, %rbp # backup %rsp
1960 .cfi_def_cfa_register %rbp
1961 movdqu ($arg5), %xmm0 # load counter
1962 mov 240($arg4), %eax # rounds
1963 mov $arg1, $inp # backup arguments
1964 mov $arg2, $out
1965 mov $arg3, $len
1966 mov $arg4, $key
1967 movdqa %xmm0, 0x20(%rbp) # copy counter
1968 cmp \$8, $arg3
1969 jb .Lctr_enc_short
1970
1971 mov %eax, %ebx # rounds
1972 shl \$7, %rax # 128 bytes per inner round key
1973 sub \$`128-32`, %rax # size of bit-sliced key schedule
1974 sub %rax, %rsp
1975
1976 mov %rsp, %rax # pass key schedule
1977 mov $key, %rcx # pass key
1978 mov %ebx, %r10d # pass rounds
1979 call _bsaes_key_convert
1980 pxor %xmm6,%xmm7 # fix up last round key
1981 movdqa %xmm7,(%rax) # save last round key
1982
1983 movdqa (%rsp), @XMM[9] # load round0 key
1984 lea .LADD1(%rip), %r11
1985 movdqa 0x20(%rbp), @XMM[0] # counter copy
1986 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1987 pshufb @XMM[8], @XMM[9] # byte swap upper part
1988 pshufb @XMM[8], @XMM[0]
1989 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1990 jmp .Lctr_enc_loop
1991 .align 16
1992 .Lctr_enc_loop:
1993 movdqa @XMM[0], 0x20(%rbp) # save counter
1994 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1995 movdqa @XMM[0], @XMM[2]
1996 paddd 0x00(%r11), @XMM[1] # .LADD1
1997 movdqa @XMM[0], @XMM[3]
1998 paddd 0x10(%r11), @XMM[2] # .LADD2
1999 movdqa @XMM[0], @XMM[4]
2000 paddd 0x20(%r11), @XMM[3] # .LADD3
2001 movdqa @XMM[0], @XMM[5]
2002 paddd 0x30(%r11), @XMM[4] # .LADD4
2003 movdqa @XMM[0], @XMM[6]
2004 paddd 0x40(%r11), @XMM[5] # .LADD5
2005 movdqa @XMM[0], @XMM[7]
2006 paddd 0x50(%r11), @XMM[6] # .LADD6
2007 paddd 0x60(%r11), @XMM[7] # .LADD7
2008
2009 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
2010 # to flip byte order in 32-bit counter
2011 movdqa (%rsp), @XMM[9] # round 0 key
2012 lea 0x10(%rsp), %rax # pass key schedule
2013 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
2014 pxor @XMM[9], @XMM[0] # xor with round0 key
2015 pxor @XMM[9], @XMM[1]
2016 pxor @XMM[9], @XMM[2]
2017 pxor @XMM[9], @XMM[3]
2018 pshufb @XMM[8], @XMM[0]
2019 pshufb @XMM[8], @XMM[1]
2020 pxor @XMM[9], @XMM[4]
2021 pxor @XMM[9], @XMM[5]
2022 pshufb @XMM[8], @XMM[2]
2023 pshufb @XMM[8], @XMM[3]
2024 pxor @XMM[9], @XMM[6]
2025 pxor @XMM[9], @XMM[7]
2026 pshufb @XMM[8], @XMM[4]
2027 pshufb @XMM[8], @XMM[5]
2028 pshufb @XMM[8], @XMM[6]
2029 pshufb @XMM[8], @XMM[7]
2030 lea .LBS0(%rip), %r11 # constants table
2031 mov %ebx,%r10d # pass rounds
2032
2033 call _bsaes_encrypt8_bitslice
2034
2035 sub \$8,$len
2036 jc .Lctr_enc_loop_done
2037
2038 movdqu 0x00($inp), @XMM[8] # load input
2039 movdqu 0x10($inp), @XMM[9]
2040 movdqu 0x20($inp), @XMM[10]
2041 movdqu 0x30($inp), @XMM[11]
2042 movdqu 0x40($inp), @XMM[12]
2043 movdqu 0x50($inp), @XMM[13]
2044 movdqu 0x60($inp), @XMM[14]
2045 movdqu 0x70($inp), @XMM[15]
2046 lea 0x80($inp),$inp
2047 pxor @XMM[0], @XMM[8]
2048 movdqa 0x20(%rbp), @XMM[0] # load counter
2049 pxor @XMM[9], @XMM[1]
2050 movdqu @XMM[8], 0x00($out) # write output
2051 pxor @XMM[10], @XMM[4]
2052 movdqu @XMM[1], 0x10($out)
2053 pxor @XMM[11], @XMM[6]
2054 movdqu @XMM[4], 0x20($out)
2055 pxor @XMM[12], @XMM[3]
2056 movdqu @XMM[6], 0x30($out)
2057 pxor @XMM[13], @XMM[7]
2058 movdqu @XMM[3], 0x40($out)
2059 pxor @XMM[14], @XMM[2]
2060 movdqu @XMM[7], 0x50($out)
2061 pxor @XMM[15], @XMM[5]
2062 movdqu @XMM[2], 0x60($out)
2063 lea .LADD1(%rip), %r11
2064 movdqu @XMM[5], 0x70($out)
2065 lea 0x80($out), $out
2066 paddd 0x70(%r11), @XMM[0] # .LADD8
2067 jnz .Lctr_enc_loop
2068
2069 jmp .Lctr_enc_done
2070 .align 16
2071 .Lctr_enc_loop_done:
2072 add \$8, $len
2073 movdqu 0x00($inp), @XMM[8] # load input
2074 pxor @XMM[8], @XMM[0]
2075 movdqu @XMM[0], 0x00($out) # write output
2076 cmp \$2,$len
2077 jb .Lctr_enc_done
2078 movdqu 0x10($inp), @XMM[9]
2079 pxor @XMM[9], @XMM[1]
2080 movdqu @XMM[1], 0x10($out)
2081 je .Lctr_enc_done
2082 movdqu 0x20($inp), @XMM[10]
2083 pxor @XMM[10], @XMM[4]
2084 movdqu @XMM[4], 0x20($out)
2085 cmp \$4,$len
2086 jb .Lctr_enc_done
2087 movdqu 0x30($inp), @XMM[11]
2088 pxor @XMM[11], @XMM[6]
2089 movdqu @XMM[6], 0x30($out)
2090 je .Lctr_enc_done
2091 movdqu 0x40($inp), @XMM[12]
2092 pxor @XMM[12], @XMM[3]
2093 movdqu @XMM[3], 0x40($out)
2094 cmp \$6,$len
2095 jb .Lctr_enc_done
2096 movdqu 0x50($inp), @XMM[13]
2097 pxor @XMM[13], @XMM[7]
2098 movdqu @XMM[7], 0x50($out)
2099 je .Lctr_enc_done
2100 movdqu 0x60($inp), @XMM[14]
2101 pxor @XMM[14], @XMM[2]
2102 movdqu @XMM[2], 0x60($out)
2103 jmp .Lctr_enc_done
2104
2105 .align 16
2106 .Lctr_enc_short:
2107 lea 0x20(%rbp), $arg1
2108 lea 0x30(%rbp), $arg2
2109 lea ($key), $arg3
2110 call asm_AES_encrypt
2111 movdqu ($inp), @XMM[1]
2112 lea 16($inp), $inp
2113 mov 0x2c(%rbp), %eax # load 32-bit counter
2114 bswap %eax
2115 pxor 0x30(%rbp), @XMM[1]
2116 inc %eax # increment
2117 movdqu @XMM[1], ($out)
2118 bswap %eax
2119 lea 16($out), $out
2120 mov %eax, 0x2c(%rsp) # save 32-bit counter (%rsp == %rbp on this path)
2121 dec $len
2122 jnz .Lctr_enc_short
2123
2124 .Lctr_enc_done:
2125 lea (%rsp), %rax
2126 pxor %xmm0, %xmm0
2127 .Lctr_enc_bzero: # wipe key schedule [if any]
2128 movdqa %xmm0, 0x00(%rax)
2129 movdqa %xmm0, 0x10(%rax)
2130 lea 0x20(%rax), %rax
2131 cmp %rax, %rbp
2132 ja .Lctr_enc_bzero
2133
2134 lea 0x78(%rbp),%rax
2135 .cfi_def_cfa %rax,8
2136 ___
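# The CTR loop above implements "ctr32" semantics: only the last 32-bit
# word of the counter block is treated as a big-endian counter, which is
# why eight lanes can be prepared with plain paddd against .LADD1..LADD8
# once the counter words have been byte-swapped.  A reference model, for
# illustration only (this helper is an editorial sketch and is never
# called by the generator):
sub _ctr32_add_ref {
	my ($block, $n) = @_;		# 16-byte counter block, block index
	my $ctr = unpack("N", substr($block, 12, 4));	# big-endian word
	substr($block, 12, 4) = pack("N", ($ctr + $n) & 0xffffffff);
	return $block;
}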
2137 $code.=<<___ if ($win64);
2138 movaps 0x40(%rbp), %xmm6
2139 movaps 0x50(%rbp), %xmm7
2140 movaps 0x60(%rbp), %xmm8
2141 movaps 0x70(%rbp), %xmm9
2142 movaps 0x80(%rbp), %xmm10
2143 movaps 0x90(%rbp), %xmm11
2144 movaps 0xa0(%rbp), %xmm12
2145 movaps 0xb0(%rbp), %xmm13
2146 movaps 0xc0(%rbp), %xmm14
2147 movaps 0xd0(%rbp), %xmm15
2148 lea 0xa0(%rax), %rax
2149 .Lctr_enc_tail:
2150 ___
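# .Lctr_enc_tail (and the matching .L*_tail labels below) mark the start
# of the register-restoring tail; on Win64 they are passed to se_handler
# as HandlerData[2] so an unwind that lands mid-epilogue can be completed.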
2151 $code.=<<___;
2152 mov -48(%rax), %r15
2153 .cfi_restore %r15
2154 mov -40(%rax), %r14
2155 .cfi_restore %r14
2156 mov -32(%rax), %r13
2157 .cfi_restore %r13
2158 mov -24(%rax), %r12
2159 .cfi_restore %r12
2160 mov -16(%rax), %rbx
2161 .cfi_restore %rbx
2162 mov -8(%rax), %rbp
2163 .cfi_restore %rbp
2164 lea (%rax), %rsp # restore %rsp
2165 .cfi_def_cfa_register %rsp
2166 .Lctr_enc_epilogue:
2167 ret
2168 .cfi_endproc
2169 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2170 ___
2171 ######################################################################
2172 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2173 # const AES_KEY *key1, const AES_KEY *key2,
2174 # const unsigned char iv[16]);
2175 #
2176 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2177 $arg6=~s/d$//; # $arg6 carries the ivp pointer here, so use the full 64-bit register
2178
2179 $code.=<<___;
2180 .globl bsaes_xts_encrypt
2181 .type bsaes_xts_encrypt,\@abi-omnipotent
2182 .align 16
2183 bsaes_xts_encrypt:
2184 .cfi_startproc
2185 mov %rsp, %rax
2186 .Lxts_enc_prologue:
2187 push %rbp
2188 .cfi_push %rbp
2189 push %rbx
2190 .cfi_push %rbx
2191 push %r12
2192 .cfi_push %r12
2193 push %r13
2194 .cfi_push %r13
2195 push %r14
2196 .cfi_push %r14
2197 push %r15
2198 .cfi_push %r15
2199 lea -0x48(%rsp), %rsp
2200 .cfi_adjust_cfa_offset 0x48
2201 ___
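# Win64 note: the 5th and 6th arguments arrive on the stack at 0x28 and
# 0x30 from the entry %rsp; after six pushes (0x30 bytes) and the
# 0x48-byte frame above they sit at 0x28+0x30+0x48 = 0xa0 and 0xa8 --
# hence the two loads below, here and in bsaes_xts_decrypt further down.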
2202 $code.=<<___ if ($win64);
2203 mov 0xa0(%rsp),$arg5 # pull key2
2204 mov 0xa8(%rsp),$arg6 # pull ivp
2205 lea -0xa0(%rsp), %rsp
2206 movaps %xmm6, 0x40(%rsp)
2207 movaps %xmm7, 0x50(%rsp)
2208 movaps %xmm8, 0x60(%rsp)
2209 movaps %xmm9, 0x70(%rsp)
2210 movaps %xmm10, 0x80(%rsp)
2211 movaps %xmm11, 0x90(%rsp)
2212 movaps %xmm12, 0xa0(%rsp)
2213 movaps %xmm13, 0xb0(%rsp)
2214 movaps %xmm14, 0xc0(%rsp)
2215 movaps %xmm15, 0xd0(%rsp)
2216 .Lxts_enc_body:
2217 ___
2218 $code.=<<___;
2219 mov %rsp, %rbp # backup %rsp
2220 .cfi_def_cfa_register %rbp
2221 mov $arg1, $inp # backup arguments
2222 mov $arg2, $out
2223 mov $arg3, $len
2224 mov $arg4, $key
2225
2226 lea ($arg6), $arg1
2227 lea 0x20(%rbp), $arg2
2228 lea ($arg5), $arg3
2229 call asm_AES_encrypt # generate initial tweak
2230
2231 mov 240($key), %eax # rounds
2232 mov $len, %rbx # backup $len
2233
2234 mov %eax, %edx # rounds
2235 shl \$7, %rax # 128 bytes per inner round key
2236 sub \$`128-32`, %rax # size of bit-sliced key schedule
2237 sub %rax, %rsp
2238
2239 mov %rsp, %rax # pass key schedule
2240 mov $key, %rcx # pass key
2241 mov %edx, %r10d # pass rounds
2242 call _bsaes_key_convert
2243 pxor %xmm6, %xmm7 # fix up last round key
2244 movdqa %xmm7, (%rax) # save last round key
2245
2246 and \$-16, $len
2247 sub \$0x80, %rsp # place for tweak[8]
2248 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2249
2250 pxor $twtmp, $twtmp
2251 movdqa .Lxts_magic(%rip), $twmask
2252 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2253
2254 sub \$0x80, $len
2255 jc .Lxts_enc_short
2256 jmp .Lxts_enc_loop
2257
2258 .align 16
2259 .Lxts_enc_loop:
2260 ___
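# Each step of the loop below multiplies the tweak by x in GF(2^128)
# modulo x^128 + x^7 + x^2 + x + 1: pcmpgtd against zero broadcasts the
# sign bit of every dword, pshufd 0x13 + pand .Lxts_magic ((0x87,0,1,0)
# as dwords) turn the bit-127 carry into 0x87 for the low qword and the
# bit-63 carry into 1 for the high qword, paddq shifts both halves left
# by one, and pxor folds the carries back in.  A reference model, for
# illustration only (editorial sketch, never called by the generator;
# assumes a 64-bit perl):
sub _xts_tweak_double_ref {
	my ($lo, $hi) = @_;	# tweak as two little-endian 64-bit halves
	my $c127 = ($hi >> 63) & 1;		# carry out of the tweak
	my $c63  = ($lo >> 63) & 1;		# carry into the high half
	$hi = (($hi << 1) | $c63) & 0xffffffffffffffff;
	$lo = (($lo << 1) & 0xffffffffffffffff) ^ ($c127 ? 0x87 : 0);
	return ($lo, $hi);
}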
2261 for ($i=0;$i<7;$i++) {
2262 $code.=<<___;
2263 pshufd \$0x13, $twtmp, $twres
2264 pxor $twtmp, $twtmp
2265 movdqa @XMM[7], @XMM[$i]
2266 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2267 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2268 pand $twmask, $twres # isolate carry and residue
2269 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2270 pxor $twres, @XMM[7]
2271 ___
2272 $code.=<<___ if ($i>=1);
2273 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2274 ___
2275 $code.=<<___ if ($i>=2);
2276 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2277 ___
2278 }
2279 $code.=<<___;
2280 movdqu 0x60($inp), @XMM[8+6]
2281 pxor @XMM[8+5], @XMM[5]
2282 movdqu 0x70($inp), @XMM[8+7]
2283 lea 0x80($inp), $inp
2284 movdqa @XMM[7], 0x70(%rsp)
2285 pxor @XMM[8+6], @XMM[6]
2286 lea 0x80(%rsp), %rax # pass key schedule
2287 pxor @XMM[8+7], @XMM[7]
2288 mov %edx, %r10d # pass rounds
2289
2290 call _bsaes_encrypt8
2291
2292 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2293 pxor 0x10(%rsp), @XMM[1]
2294 movdqu @XMM[0], 0x00($out) # write output
2295 pxor 0x20(%rsp), @XMM[4]
2296 movdqu @XMM[1], 0x10($out)
2297 pxor 0x30(%rsp), @XMM[6]
2298 movdqu @XMM[4], 0x20($out)
2299 pxor 0x40(%rsp), @XMM[3]
2300 movdqu @XMM[6], 0x30($out)
2301 pxor 0x50(%rsp), @XMM[7]
2302 movdqu @XMM[3], 0x40($out)
2303 pxor 0x60(%rsp), @XMM[2]
2304 movdqu @XMM[7], 0x50($out)
2305 pxor 0x70(%rsp), @XMM[5]
2306 movdqu @XMM[2], 0x60($out)
2307 movdqu @XMM[5], 0x70($out)
2308 lea 0x80($out), $out
2309
2310 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2311 pxor $twtmp, $twtmp
2312 movdqa .Lxts_magic(%rip), $twmask
2313 pcmpgtd @XMM[7], $twtmp
2314 pshufd \$0x13, $twtmp, $twres
2315 pxor $twtmp, $twtmp
2316 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2317 pand $twmask, $twres # isolate carry and residue
2318 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2319 pxor $twres, @XMM[7]
2320
2321 sub \$0x80,$len
2322 jnc .Lxts_enc_loop
2323
2324 .Lxts_enc_short:
2325 add \$0x80, $len
2326 jz .Lxts_enc_done
2327 ___
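# The short path replays the same tweak schedule but peels off after
# exactly as many blocks as remain: $i in the "cmp"/"je .Lxts_enc_$i"
# pairs below is interpolated when the heredoc is built, and the
# backtick'ed arithmetic is evaluated by the substitution at the end of
# this file, leaving literal displacements and labels.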
2328 for ($i=0;$i<7;$i++) {
2329 $code.=<<___;
2330 pshufd \$0x13, $twtmp, $twres
2331 pxor $twtmp, $twtmp
2332 movdqa @XMM[7], @XMM[$i]
2333 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2334 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2335 pand $twmask, $twres # isolate carry and residue
2336 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2337 pxor $twres, @XMM[7]
2338 ___
2339 $code.=<<___ if ($i>=1);
2340 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2341 cmp \$`0x10*$i`,$len
2342 je .Lxts_enc_$i
2343 ___
2344 $code.=<<___ if ($i>=2);
2345 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2346 ___
2347 }
2348 $code.=<<___;
2349 movdqu 0x60($inp), @XMM[8+6]
2350 pxor @XMM[8+5], @XMM[5]
2351 movdqa @XMM[7], 0x70(%rsp)
2352 lea 0x70($inp), $inp
2353 pxor @XMM[8+6], @XMM[6]
2354 lea 0x80(%rsp), %rax # pass key schedule
2355 mov %edx, %r10d # pass rounds
2356
2357 call _bsaes_encrypt8
2358
2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2360 pxor 0x10(%rsp), @XMM[1]
2361 movdqu @XMM[0], 0x00($out) # write output
2362 pxor 0x20(%rsp), @XMM[4]
2363 movdqu @XMM[1], 0x10($out)
2364 pxor 0x30(%rsp), @XMM[6]
2365 movdqu @XMM[4], 0x20($out)
2366 pxor 0x40(%rsp), @XMM[3]
2367 movdqu @XMM[6], 0x30($out)
2368 pxor 0x50(%rsp), @XMM[7]
2369 movdqu @XMM[3], 0x40($out)
2370 pxor 0x60(%rsp), @XMM[2]
2371 movdqu @XMM[7], 0x50($out)
2372 movdqu @XMM[2], 0x60($out)
2373 lea 0x70($out), $out
2374
2375 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2376 jmp .Lxts_enc_done
2377 .align 16
2378 .Lxts_enc_6:
2379 pxor @XMM[8+4], @XMM[4]
2380 lea 0x60($inp), $inp
2381 pxor @XMM[8+5], @XMM[5]
2382 lea 0x80(%rsp), %rax # pass key schedule
2383 mov %edx, %r10d # pass rounds
2384
2385 call _bsaes_encrypt8
2386
2387 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2388 pxor 0x10(%rsp), @XMM[1]
2389 movdqu @XMM[0], 0x00($out) # write output
2390 pxor 0x20(%rsp), @XMM[4]
2391 movdqu @XMM[1], 0x10($out)
2392 pxor 0x30(%rsp), @XMM[6]
2393 movdqu @XMM[4], 0x20($out)
2394 pxor 0x40(%rsp), @XMM[3]
2395 movdqu @XMM[6], 0x30($out)
2396 pxor 0x50(%rsp), @XMM[7]
2397 movdqu @XMM[3], 0x40($out)
2398 movdqu @XMM[7], 0x50($out)
2399 lea 0x60($out), $out
2400
2401 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2402 jmp .Lxts_enc_done
2403 .align 16
2404 .Lxts_enc_5:
2405 pxor @XMM[8+3], @XMM[3]
2406 lea 0x50($inp), $inp
2407 pxor @XMM[8+4], @XMM[4]
2408 lea 0x80(%rsp), %rax # pass key schedule
2409 mov %edx, %r10d # pass rounds
2410
2411 call _bsaes_encrypt8
2412
2413 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2414 pxor 0x10(%rsp), @XMM[1]
2415 movdqu @XMM[0], 0x00($out) # write output
2416 pxor 0x20(%rsp), @XMM[4]
2417 movdqu @XMM[1], 0x10($out)
2418 pxor 0x30(%rsp), @XMM[6]
2419 movdqu @XMM[4], 0x20($out)
2420 pxor 0x40(%rsp), @XMM[3]
2421 movdqu @XMM[6], 0x30($out)
2422 movdqu @XMM[3], 0x40($out)
2423 lea 0x50($out), $out
2424
2425 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2426 jmp .Lxts_enc_done
2427 .align 16
2428 .Lxts_enc_4:
2429 pxor @XMM[8+2], @XMM[2]
2430 lea 0x40($inp), $inp
2431 pxor @XMM[8+3], @XMM[3]
2432 lea 0x80(%rsp), %rax # pass key schedule
2433 mov %edx, %r10d # pass rounds
2434
2435 call _bsaes_encrypt8
2436
2437 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2438 pxor 0x10(%rsp), @XMM[1]
2439 movdqu @XMM[0], 0x00($out) # write output
2440 pxor 0x20(%rsp), @XMM[4]
2441 movdqu @XMM[1], 0x10($out)
2442 pxor 0x30(%rsp), @XMM[6]
2443 movdqu @XMM[4], 0x20($out)
2444 movdqu @XMM[6], 0x30($out)
2445 lea 0x40($out), $out
2446
2447 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2448 jmp .Lxts_enc_done
2449 .align 16
2450 .Lxts_enc_3:
2451 pxor @XMM[8+1], @XMM[1]
2452 lea 0x30($inp), $inp
2453 pxor @XMM[8+2], @XMM[2]
2454 lea 0x80(%rsp), %rax # pass key schedule
2455 mov %edx, %r10d # pass rounds
2456
2457 call _bsaes_encrypt8
2458
2459 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2460 pxor 0x10(%rsp), @XMM[1]
2461 movdqu @XMM[0], 0x00($out) # write output
2462 pxor 0x20(%rsp), @XMM[4]
2463 movdqu @XMM[1], 0x10($out)
2464 movdqu @XMM[4], 0x20($out)
2465 lea 0x30($out), $out
2466
2467 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2468 jmp .Lxts_enc_done
2469 .align 16
2470 .Lxts_enc_2:
2471 pxor @XMM[8+0], @XMM[0]
2472 lea 0x20($inp), $inp
2473 pxor @XMM[8+1], @XMM[1]
2474 lea 0x80(%rsp), %rax # pass key schedule
2475 mov %edx, %r10d # pass rounds
2476
2477 call _bsaes_encrypt8
2478
2479 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2480 pxor 0x10(%rsp), @XMM[1]
2481 movdqu @XMM[0], 0x00($out) # write output
2482 movdqu @XMM[1], 0x10($out)
2483 lea 0x20($out), $out
2484
2485 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2486 jmp .Lxts_enc_done
2487 .align 16
2488 .Lxts_enc_1:
2489 pxor @XMM[0], @XMM[8]
2490 lea 0x10($inp), $inp
2491 movdqa @XMM[8], 0x20(%rbp)
2492 lea 0x20(%rbp), $arg1
2493 lea 0x20(%rbp), $arg2
2494 lea ($key), $arg3
2495 call asm_AES_encrypt # doesn't touch %xmm
2496 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2497 #pxor @XMM[8], @XMM[0]
2498 #lea 0x80(%rsp), %rax # pass key schedule
2499 #mov %edx, %r10d # pass rounds
2500 #call _bsaes_encrypt8
2501 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2502 movdqu @XMM[0], 0x00($out) # write output
2503 lea 0x10($out), $out
2504
2505 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2506
2507 .Lxts_enc_done:
2508 and \$15, %ebx
2509 jz .Lxts_enc_ret
2510 mov $out, %rdx
2511
2512 .Lxts_enc_steal:
2513 movzb ($inp), %eax
2514 movzb -16(%rdx), %ecx
2515 lea 1($inp), $inp
2516 mov %al, -16(%rdx)
2517 mov %cl, 0(%rdx)
2518 lea 1(%rdx), %rdx
2519 sub \$1,%ebx
2520 jnz .Lxts_enc_steal
2521
2522 movdqu -16($out), @XMM[0]
2523 lea 0x20(%rbp), $arg1
2524 pxor @XMM[7], @XMM[0]
2525 lea 0x20(%rbp), $arg2
2526 movdqa @XMM[0], 0x20(%rbp)
2527 lea ($key), $arg3
2528 call asm_AES_encrypt # doesn't touch %xmm
2529 pxor 0x20(%rbp), @XMM[7]
2530 movdqu @XMM[7], -16($out)
2531
2532 .Lxts_enc_ret:
2533 lea (%rsp), %rax
2534 pxor %xmm0, %xmm0
2535 .Lxts_enc_bzero: # wipe key schedule [if any]
2536 movdqa %xmm0, 0x00(%rax)
2537 movdqa %xmm0, 0x10(%rax)
2538 lea 0x20(%rax), %rax
2539 cmp %rax, %rbp
2540 ja .Lxts_enc_bzero
2541
2542 lea 0x78(%rbp),%rax
2543 .cfi_def_cfa %rax,8
2544 ___
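# Ciphertext stealing, as performed by .Lxts_enc_steal above: the final
# partial plaintext is swapped byte-for-byte into the front of the last
# full ciphertext block, whose displaced bytes become the short final
# output, and the merged block is re-encrypted under the next tweak
# (@XMM[7]).  A byte-level reference, for illustration only (editorial
# sketch, never called by the generator; the coderef is a stand-in for
# one tweaked AES block operation):
sub _xts_enc_steal_ref {
	my ($last_ct, $pt_tail, $enc_block) = @_; # 16 bytes, 1..15 bytes, coderef
	my $n = length($pt_tail);
	my $short = substr($last_ct, 0, $n);	  # stolen final output
	my $merged = $pt_tail . substr($last_ct, $n);
	return ($enc_block->($merged), $short);	  # full block, short block
}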
2545 $code.=<<___ if ($win64);
2546 movaps 0x40(%rbp), %xmm6
2547 movaps 0x50(%rbp), %xmm7
2548 movaps 0x60(%rbp), %xmm8
2549 movaps 0x70(%rbp), %xmm9
2550 movaps 0x80(%rbp), %xmm10
2551 movaps 0x90(%rbp), %xmm11
2552 movaps 0xa0(%rbp), %xmm12
2553 movaps 0xb0(%rbp), %xmm13
2554 movaps 0xc0(%rbp), %xmm14
2555 movaps 0xd0(%rbp), %xmm15
2556 lea 0xa0(%rax), %rax
2557 .Lxts_enc_tail:
2558 ___
2559 $code.=<<___;
2560 mov -48(%rax), %r15
2561 .cfi_restore %r15
2562 mov -40(%rax), %r14
2563 .cfi_restore %r14
2564 mov -32(%rax), %r13
2565 .cfi_restore %r13
2566 mov -24(%rax), %r12
2567 .cfi_restore %r12
2568 mov -16(%rax), %rbx
2569 .cfi_restore %rbx
2570 mov -8(%rax), %rbp
2571 .cfi_restore %rbp
2572 lea (%rax), %rsp # restore %rsp
2573 .cfi_def_cfa_register %rsp
2574 .Lxts_enc_epilogue:
2575 ret
2576 .cfi_endproc
2577 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2578
2579 .globl bsaes_xts_decrypt
2580 .type bsaes_xts_decrypt,\@abi-omnipotent
2581 .align 16
2582 bsaes_xts_decrypt:
2583 .cfi_startproc
2584 mov %rsp, %rax
2585 .Lxts_dec_prologue:
2586 push %rbp
2587 .cfi_push %rbp
2588 push %rbx
2589 .cfi_push %rbx
2590 push %r12
2591 .cfi_push %r12
2592 push %r13
2593 .cfi_push %r13
2594 push %r14
2595 .cfi_push %r14
2596 push %r15
2597 .cfi_push %r15
2598 lea -0x48(%rsp), %rsp
2599 .cfi_adjust_cfa_offset 0x48
2600 ___
2601 $code.=<<___ if ($win64);
2602 mov 0xa0(%rsp),$arg5 # pull key2
2603 mov 0xa8(%rsp),$arg6 # pull ivp
2604 lea -0xa0(%rsp), %rsp
2605 movaps %xmm6, 0x40(%rsp)
2606 movaps %xmm7, 0x50(%rsp)
2607 movaps %xmm8, 0x60(%rsp)
2608 movaps %xmm9, 0x70(%rsp)
2609 movaps %xmm10, 0x80(%rsp)
2610 movaps %xmm11, 0x90(%rsp)
2611 movaps %xmm12, 0xa0(%rsp)
2612 movaps %xmm13, 0xb0(%rsp)
2613 movaps %xmm14, 0xc0(%rsp)
2614 movaps %xmm15, 0xd0(%rsp)
2615 .Lxts_dec_body:
2616 ___
2617 $code.=<<___;
2618 mov %rsp, %rbp # backup %rsp
.cfi_def_cfa_register %rbp
2619 mov $arg1, $inp # backup arguments
2620 mov $arg2, $out
2621 mov $arg3, $len
2622 mov $arg4, $key
2623
2624 lea ($arg6), $arg1
2625 lea 0x20(%rbp), $arg2
2626 lea ($arg5), $arg3
2627 call asm_AES_encrypt # generate initial tweak
2628
2629 mov 240($key), %eax # rounds
2630 mov $len, %rbx # backup $len
2631
2632 mov %eax, %edx # rounds
2633 shl \$7, %rax # 128 bytes per inner round key
2634 sub \$`128-32`, %rax # size of bit-sliced key schedule
2635 sub %rax, %rsp
2636
2637 mov %rsp, %rax # pass key schedule
2638 mov $key, %rcx # pass key
2639 mov %edx, %r10d # pass rounds
2640 call _bsaes_key_convert
2641 pxor (%rsp), %xmm7 # fix up round 0 key
2642 movdqa %xmm6, (%rax) # save last round key
2643 movdqa %xmm7, (%rsp)
2644
2645 xor %eax, %eax # if ($len%16) len-=16;
2646 and \$-16, $len
2647 test \$15, %ebx
2648 setnz %al
2649 shl \$4, %rax
2650 sub %rax, $len
2651
2652 sub \$0x80, %rsp # place for tweak[8]
2653 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2654
2655 pxor $twtmp, $twtmp
2656 movdqa .Lxts_magic(%rip), $twmask
2657 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2658
2659 sub \$0x80, $len
2660 jc .Lxts_dec_short
2661 jmp .Lxts_dec_loop
2662
2663 .align 16
2664 .Lxts_dec_loop:
2665 ___
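# Tweak generation is identical to the encrypt path; see the reference
# model ahead of .Lxts_enc_loop above.  Only the 8x core differs
# (_bsaes_decrypt8), together with its output lane order.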
2666 for ($i=0;$i<7;$i++) {
2667 $code.=<<___;
2668 pshufd \$0x13, $twtmp, $twres
2669 pxor $twtmp, $twtmp
2670 movdqa @XMM[7], @XMM[$i]
2671 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2672 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2673 pand $twmask, $twres # isolate carry and residue
2674 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2675 pxor $twres, @XMM[7]
2676 ___
2677 $code.=<<___ if ($i>=1);
2678 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2679 ___
2680 $code.=<<___ if ($i>=2);
2681 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2682 ___
2683 }
2684 $code.=<<___;
2685 movdqu 0x60($inp), @XMM[8+6]
2686 pxor @XMM[8+5], @XMM[5]
2687 movdqu 0x70($inp), @XMM[8+7]
2688 lea 0x80($inp), $inp
2689 movdqa @XMM[7], 0x70(%rsp)
2690 pxor @XMM[8+6], @XMM[6]
2691 lea 0x80(%rsp), %rax # pass key schedule
2692 pxor @XMM[8+7], @XMM[7]
2693 mov %edx, %r10d # pass rounds
2694
2695 call _bsaes_decrypt8
2696
2697 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2698 pxor 0x10(%rsp), @XMM[1]
2699 movdqu @XMM[0], 0x00($out) # write output
2700 pxor 0x20(%rsp), @XMM[6]
2701 movdqu @XMM[1], 0x10($out)
2702 pxor 0x30(%rsp), @XMM[4]
2703 movdqu @XMM[6], 0x20($out)
2704 pxor 0x40(%rsp), @XMM[2]
2705 movdqu @XMM[4], 0x30($out)
2706 pxor 0x50(%rsp), @XMM[7]
2707 movdqu @XMM[2], 0x40($out)
2708 pxor 0x60(%rsp), @XMM[3]
2709 movdqu @XMM[7], 0x50($out)
2710 pxor 0x70(%rsp), @XMM[5]
2711 movdqu @XMM[3], 0x60($out)
2712 movdqu @XMM[5], 0x70($out)
2713 lea 0x80($out), $out
2714
2715 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2716 pxor $twtmp, $twtmp
2717 movdqa .Lxts_magic(%rip), $twmask
2718 pcmpgtd @XMM[7], $twtmp
2719 pshufd \$0x13, $twtmp, $twres
2720 pxor $twtmp, $twtmp
2721 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2722 pand $twmask, $twres # isolate carry and residue
2723 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2724 pxor $twres, @XMM[7]
2725
2726 sub \$0x80,$len
2727 jnc .Lxts_dec_loop
2728
2729 .Lxts_dec_short:
2730 add \$0x80, $len
2731 jz .Lxts_dec_done
2732 ___
2733 for ($i=0;$i<7;$i++) {
2734 $code.=<<___;
2735 pshufd \$0x13, $twtmp, $twres
2736 pxor $twtmp, $twtmp
2737 movdqa @XMM[7], @XMM[$i]
2738 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2739 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2740 pand $twmask, $twres # isolate carry and residue
2741 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2742 pxor $twres, @XMM[7]
2743 ___
2744 $code.=<<___ if ($i>=1);
2745 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2746 cmp \$`0x10*$i`,$len
2747 je .Lxts_dec_$i
2748 ___
2749 $code.=<<___ if ($i>=2);
2750 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2751 ___
2752 }
2753 $code.=<<___;
2754 movdqu 0x60($inp), @XMM[8+6]
2755 pxor @XMM[8+5], @XMM[5]
2756 movdqa @XMM[7], 0x70(%rsp)
2757 lea 0x70($inp), $inp
2758 pxor @XMM[8+6], @XMM[6]
2759 lea 0x80(%rsp), %rax # pass key schedule
2760 mov %edx, %r10d # pass rounds
2761
2762 call _bsaes_decrypt8
2763
2764 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2765 pxor 0x10(%rsp), @XMM[1]
2766 movdqu @XMM[0], 0x00($out) # write output
2767 pxor 0x20(%rsp), @XMM[6]
2768 movdqu @XMM[1], 0x10($out)
2769 pxor 0x30(%rsp), @XMM[4]
2770 movdqu @XMM[6], 0x20($out)
2771 pxor 0x40(%rsp), @XMM[2]
2772 movdqu @XMM[4], 0x30($out)
2773 pxor 0x50(%rsp), @XMM[7]
2774 movdqu @XMM[2], 0x40($out)
2775 pxor 0x60(%rsp), @XMM[3]
2776 movdqu @XMM[7], 0x50($out)
2777 movdqu @XMM[3], 0x60($out)
2778 lea 0x70($out), $out
2779
2780 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2781 jmp .Lxts_dec_done
2782 .align 16
2783 .Lxts_dec_6:
2784 pxor @XMM[8+4], @XMM[4]
2785 lea 0x60($inp), $inp
2786 pxor @XMM[8+5], @XMM[5]
2787 lea 0x80(%rsp), %rax # pass key schedule
2788 mov %edx, %r10d # pass rounds
2789
2790 call _bsaes_decrypt8
2791
2792 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2793 pxor 0x10(%rsp), @XMM[1]
2794 movdqu @XMM[0], 0x00($out) # write output
2795 pxor 0x20(%rsp), @XMM[6]
2796 movdqu @XMM[1], 0x10($out)
2797 pxor 0x30(%rsp), @XMM[4]
2798 movdqu @XMM[6], 0x20($out)
2799 pxor 0x40(%rsp), @XMM[2]
2800 movdqu @XMM[4], 0x30($out)
2801 pxor 0x50(%rsp), @XMM[7]
2802 movdqu @XMM[2], 0x40($out)
2803 movdqu @XMM[7], 0x50($out)
2804 lea 0x60($out), $out
2805
2806 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2807 jmp .Lxts_dec_done
2808 .align 16
2809 .Lxts_dec_5:
2810 pxor @XMM[8+3], @XMM[3]
2811 lea 0x50($inp), $inp
2812 pxor @XMM[8+4], @XMM[4]
2813 lea 0x80(%rsp), %rax # pass key schedule
2814 mov %edx, %r10d # pass rounds
2815
2816 call _bsaes_decrypt8
2817
2818 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2819 pxor 0x10(%rsp), @XMM[1]
2820 movdqu @XMM[0], 0x00($out) # write output
2821 pxor 0x20(%rsp), @XMM[6]
2822 movdqu @XMM[1], 0x10($out)
2823 pxor 0x30(%rsp), @XMM[4]
2824 movdqu @XMM[6], 0x20($out)
2825 pxor 0x40(%rsp), @XMM[2]
2826 movdqu @XMM[4], 0x30($out)
2827 movdqu @XMM[2], 0x40($out)
2828 lea 0x50($out), $out
2829
2830 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2831 jmp .Lxts_dec_done
2832 .align 16
2833 .Lxts_dec_4:
2834 pxor @XMM[8+2], @XMM[2]
2835 lea 0x40($inp), $inp
2836 pxor @XMM[8+3], @XMM[3]
2837 lea 0x80(%rsp), %rax # pass key schedule
2838 mov %edx, %r10d # pass rounds
2839
2840 call _bsaes_decrypt8
2841
2842 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2843 pxor 0x10(%rsp), @XMM[1]
2844 movdqu @XMM[0], 0x00($out) # write output
2845 pxor 0x20(%rsp), @XMM[6]
2846 movdqu @XMM[1], 0x10($out)
2847 pxor 0x30(%rsp), @XMM[4]
2848 movdqu @XMM[6], 0x20($out)
2849 movdqu @XMM[4], 0x30($out)
2850 lea 0x40($out), $out
2851
2852 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2853 jmp .Lxts_dec_done
2854 .align 16
2855 .Lxts_dec_3:
2856 pxor @XMM[8+1], @XMM[1]
2857 lea 0x30($inp), $inp
2858 pxor @XMM[8+2], @XMM[2]
2859 lea 0x80(%rsp), %rax # pass key schedule
2860 mov %edx, %r10d # pass rounds
2861
2862 call _bsaes_decrypt8
2863
2864 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2865 pxor 0x10(%rsp), @XMM[1]
2866 movdqu @XMM[0], 0x00($out) # write output
2867 pxor 0x20(%rsp), @XMM[6]
2868 movdqu @XMM[1], 0x10($out)
2869 movdqu @XMM[6], 0x20($out)
2870 lea 0x30($out), $out
2871
2872 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2873 jmp .Lxts_dec_done
2874 .align 16
2875 .Lxts_dec_2:
2876 pxor @XMM[8+0], @XMM[0]
2877 lea 0x20($inp), $inp
2878 pxor @XMM[8+1], @XMM[1]
2879 lea 0x80(%rsp), %rax # pass key schedule
2880 mov %edx, %r10d # pass rounds
2881
2882 call _bsaes_decrypt8
2883
2884 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2885 pxor 0x10(%rsp), @XMM[1]
2886 movdqu @XMM[0], 0x00($out) # write output
2887 movdqu @XMM[1], 0x10($out)
2888 lea 0x20($out), $out
2889
2890 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2891 jmp .Lxts_dec_done
2892 .align 16
2893 .Lxts_dec_1:
2894 pxor @XMM[0], @XMM[8]
2895 lea 0x10($inp), $inp
2896 movdqa @XMM[8], 0x20(%rbp)
2897 lea 0x20(%rbp), $arg1
2898 lea 0x20(%rbp), $arg2
2899 lea ($key), $arg3
2900 call asm_AES_decrypt # doesn't touch %xmm
2901 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2902 #pxor @XMM[8], @XMM[0]
2903 #lea 0x80(%rsp), %rax # pass key schedule
2904 #mov %edx, %r10d # pass rounds
2905 #call _bsaes_decrypt8
2906 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2907 movdqu @XMM[0], 0x00($out) # write output
2908 lea 0x10($out), $out
2909
2910 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2911
2912 .Lxts_dec_done:
2913 and \$15, %ebx
2914 jz .Lxts_dec_ret
2915
2916 pxor $twtmp, $twtmp
2917 movdqa .Lxts_magic(%rip), $twmask
2918 pcmpgtd @XMM[7], $twtmp
2919 pshufd \$0x13, $twtmp, $twres
2920 movdqa @XMM[7], @XMM[6]
2921 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2922 pand $twmask, $twres # isolate carry and residue
2923 movdqu ($inp), @XMM[0]
2924 pxor $twres, @XMM[7]
2925
2926 lea 0x20(%rbp), $arg1
2927 pxor @XMM[7], @XMM[0]
2928 lea 0x20(%rbp), $arg2
2929 movdqa @XMM[0], 0x20(%rbp)
2930 lea ($key), $arg3
2931 call asm_AES_decrypt # doesn't touch %xmm
2932 pxor 0x20(%rbp), @XMM[7]
2933 mov $out, %rdx
2934 movdqu @XMM[7], ($out)
2935
2936 .Lxts_dec_steal:
2937 movzb 16($inp), %eax
2938 movzb (%rdx), %ecx
2939 lea 1($inp), $inp
2940 mov %al, (%rdx)
2941 mov %cl, 16(%rdx)
2942 lea 1(%rdx), %rdx
2943 sub \$1,%ebx
2944 jnz .Lxts_dec_steal
2945
2946 movdqu ($out), @XMM[0]
2947 lea 0x20(%rbp), $arg1
2948 pxor @XMM[6], @XMM[0]
2949 lea 0x20(%rbp), $arg2
2950 movdqa @XMM[0], 0x20(%rbp)
2951 lea ($key), $arg3
2952 call asm_AES_decrypt # doesn't touch %xmm
2953 pxor 0x20(%rbp), @XMM[6]
2954 movdqu @XMM[6], ($out)
2955
2956 .Lxts_dec_ret:
2957 lea (%rsp), %rax
2958 pxor %xmm0, %xmm0
2959 .Lxts_dec_bzero: # wipe key schedule [if any]
2960 movdqa %xmm0, 0x00(%rax)
2961 movdqa %xmm0, 0x10(%rax)
2962 lea 0x20(%rax), %rax
2963 cmp %rax, %rbp
2964 ja .Lxts_dec_bzero
2965
2966 lea 0x78(%rbp),%rax
2967 .cfi_def_cfa %rax,8
2968 ___
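# Decrypt-side stealing handles the two final blocks in the opposite
# tweak order: the last full ciphertext block is decrypted under the
# further-advanced tweak (@XMM[7], doubled once more at .Lxts_dec_done),
# the byte swap is performed, and the merged block is decrypted under
# the earlier tweak, which was parked in @XMM[6] for that purpose.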
2969 $code.=<<___ if ($win64);
2970 movaps 0x40(%rbp), %xmm6
2971 movaps 0x50(%rbp), %xmm7
2972 movaps 0x60(%rbp), %xmm8
2973 movaps 0x70(%rbp), %xmm9
2974 movaps 0x80(%rbp), %xmm10
2975 movaps 0x90(%rbp), %xmm11
2976 movaps 0xa0(%rbp), %xmm12
2977 movaps 0xb0(%rbp), %xmm13
2978 movaps 0xc0(%rbp), %xmm14
2979 movaps 0xd0(%rbp), %xmm15
2980 lea 0xa0(%rax), %rax
2981 .Lxts_dec_tail:
2982 ___
2983 $code.=<<___;
2984 mov -48(%rax), %r15
2985 .cfi_restore %r15
2986 mov -40(%rax), %r14
2987 .cfi_restore %r14
2988 mov -32(%rax), %r13
2989 .cfi_restore %r13
2990 mov -24(%rax), %r12
2991 .cfi_restore %r12
2992 mov -16(%rax), %rbx
2993 .cfi_restore %rbx
2994 mov -8(%rax), %rbp
2995 .cfi_restore %rbp
2996 lea (%rax), %rsp # restore %rsp
2997 .cfi_def_cfa_register %rsp
2998 .Lxts_dec_epilogue:
2999 ret
3000 .cfi_endproc
3001 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
3002 ___
3003 }
3004 $code.=<<___;
3005 .type _bsaes_const,\@object
3006 .align 64
3007 _bsaes_const:
3008 .LM0ISR: # InvShiftRows constants
3009 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
3010 .LISRM0:
3011 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
3012 .LISR:
3013 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
3014 .LBS0: # bit-slice constants
3015 .quad 0x5555555555555555, 0x5555555555555555
3016 .LBS1:
3017 .quad 0x3333333333333333, 0x3333333333333333
3018 .LBS2:
3019 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3020 .LSR: # shiftrows constants
3021 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
3022 .LSRM0:
3023 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
3024 .LM0SR:
3025 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
3026 .LSWPUP: # byte-swap upper dword
3027 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
3028 .LSWPUPM0SR:
3029 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
3030 .LADD1: # counter increment constants
3031 .quad 0x0000000000000000, 0x0000000100000000
3032 .LADD2:
3033 .quad 0x0000000000000000, 0x0000000200000000
3034 .LADD3:
3035 .quad 0x0000000000000000, 0x0000000300000000
3036 .LADD4:
3037 .quad 0x0000000000000000, 0x0000000400000000
3038 .LADD5:
3039 .quad 0x0000000000000000, 0x0000000500000000
3040 .LADD6:
3041 .quad 0x0000000000000000, 0x0000000600000000
3042 .LADD7:
3043 .quad 0x0000000000000000, 0x0000000700000000
3044 .LADD8:
3045 .quad 0x0000000000000000, 0x0000000800000000
3046 .Lxts_magic:
3047 .long 0x87,0,1,0
3048 .Lmasks:
3049 .quad 0x0101010101010101, 0x0101010101010101
3050 .quad 0x0202020202020202, 0x0202020202020202
3051 .quad 0x0404040404040404, 0x0404040404040404
3052 .quad 0x0808080808080808, 0x0808080808080808
3053 .LM0:
3054 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
3055 .L63:
3056 .quad 0x6363636363636363, 0x6363636363636363
3057 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3058 .align 64
3059 .size _bsaes_const,.-_bsaes_const
3060 ___
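# Layout notes: every .LADDi entry is the quad pair (0, i<<32), i.e. the
# 32-bit value i sitting in the most significant dword of the lane,
# which is where the byte-swapped counter word lives during the CTR
# loop.  .Lxts_magic is the dword vector (0x87,0,1,0) consumed by the
# pshufd/pand carry trick in the XTS tweak update.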
3061
3062 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3063 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3064 if ($win64) {
3065 $rec="%rcx";
3066 $frame="%rdx";
3067 $context="%r8";
3068 $disp="%r9";
3069
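# Arguments arrive per the Win64 calling convention: rec in %rcx, frame
# in %rdx, context in %r8 and disp in %r9, as aliased above.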
3070 $code.=<<___;
3071 .extern __imp_RtlVirtualUnwind
3072 .type se_handler,\@abi-omnipotent
3073 .align 16
3074 se_handler:
3075 push %rsi
3076 push %rdi
3077 push %rbx
3078 push %rbp
3079 push %r12
3080 push %r13
3081 push %r14
3082 push %r15
3083 pushfq
3084 sub \$64,%rsp
3085
3086 mov 120($context),%rax # pull context->Rax
3087 mov 248($context),%rbx # pull context->Rip
3088
3089 mov 8($disp),%rsi # disp->ImageBase
3090 mov 56($disp),%r11 # disp->HandlerData
3091
3092 mov 0(%r11),%r10d # HandlerData[0]
3093 lea (%rsi,%r10),%r10 # prologue label
3094 cmp %r10,%rbx # context->Rip<=prologue label
3095 jbe .Lin_prologue
3096
3097 mov 4(%r11),%r10d # HandlerData[1]
3098 lea (%rsi,%r10),%r10 # epilogue label
3099 cmp %r10,%rbx # context->Rip>=epilogue label
3100 jae .Lin_prologue
3101
3102 mov 8(%r11),%r10d # HandlerData[2]
3103 lea (%rsi,%r10),%r10 # tail label
3104 cmp %r10,%rbx # context->Rip>=tail label
3105 jae .Lin_tail
3106
3107 mov 160($context),%rax # pull context->Rbp
3108
3109 lea 0x40(%rax),%rsi # %xmm save area
3110 lea 512($context),%rdi # &context.Xmm6
3111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3112 .long 0xa548f3fc # cld; rep movsq
3113 lea 0xa0+0x78(%rax),%rax # adjust stack pointer
3114
3115 .Lin_tail:
3116 mov -48(%rax),%r15
3117 mov -40(%rax),%r14
3118 mov -32(%rax),%r13
3119 mov -24(%rax),%r12
3120 mov -16(%rax),%rbx
3121 mov -8(%rax),%rbp
3122 mov %rbx,144($context) # restore context->Rbx
3123 mov %rbp,160($context) # restore context->Rbp
3124 mov %r12,216($context) # restore context->R12
3125 mov %r13,224($context) # restore context->R13
3126 mov %r14,232($context) # restore context->R14
3127 mov %r15,240($context) # restore context->R15
3128
3129 .Lin_prologue:
3130 mov %rax,152($context) # restore context->Rsp
3131
3132 mov 40($disp),%rdi # disp->ContextRecord
3133 mov $context,%rsi # context
3134 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3135 .long 0xa548f3fc # cld; rep movsq
3136
3137 mov $disp,%rsi
3138 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3139 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3140 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3141 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3142 mov 40(%rsi),%r10 # disp->ContextRecord
3143 lea 56(%rsi),%r11 # &disp->HandlerData
3144 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3145 mov %r10,32(%rsp) # arg5
3146 mov %r11,40(%rsp) # arg6
3147 mov %r12,48(%rsp) # arg7
3148 mov %rcx,56(%rsp) # arg8, (NULL)
3149 call *__imp_RtlVirtualUnwind(%rip)
3150
3151 mov \$1,%eax # ExceptionContinueSearch
3152 add \$64,%rsp
3153 popfq
3154 pop %r15
3155 pop %r14
3156 pop %r13
3157 pop %r12
3158 pop %rbp
3159 pop %rbx
3160 pop %rdi
3161 pop %rsi
3162 ret
3163 .size se_handler,.-se_handler
3164
3165 .section .pdata
3166 .align 4
3167 ___
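# Each .pdata entry is a RUNTIME_FUNCTION triple of rvas: function
# start, function end, and the unwind-info record in .xdata below.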
3168 $code.=<<___ if ($ecb);
3169 .rva .Lecb_enc_prologue
3170 .rva .Lecb_enc_epilogue
3171 .rva .Lecb_enc_info
3172
3173 .rva .Lecb_dec_prologue
3174 .rva .Lecb_dec_epilogue
3175 .rva .Lecb_dec_info
3176 ___
3177 $code.=<<___;
3178 .rva .Lcbc_dec_prologue
3179 .rva .Lcbc_dec_epilogue
3180 .rva .Lcbc_dec_info
3181
3182 .rva .Lctr_enc_prologue
3183 .rva .Lctr_enc_epilogue
3184 .rva .Lctr_enc_info
3185
3186 .rva .Lxts_enc_prologue
3187 .rva .Lxts_enc_epilogue
3188 .rva .Lxts_enc_info
3189
3190 .rva .Lxts_dec_prologue
3191 .rva .Lxts_dec_epilogue
3192 .rva .Lxts_dec_info
3193
3194 .section .xdata
3195 .align 8
3196 ___
3197 $code.=<<___ if ($ecb);
3198 .Lecb_enc_info:
3199 .byte 9,0,0,0
3200 .rva se_handler
3201 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3202 .rva .Lecb_enc_tail
3203 .long 0
3204 .Lecb_dec_info:
3205 .byte 9,0,0,0
3206 .rva se_handler
3207 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3208 .rva .Lecb_dec_tail
3209 .long 0
3210 ___
3211 $code.=<<___;
3212 .Lcbc_dec_info:
3213 .byte 9,0,0,0
3214 .rva se_handler
3215 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3216 .rva .Lcbc_dec_tail
3217 .long 0
3218 .Lctr_enc_info:
3219 .byte 9,0,0,0
3220 .rva se_handler
3221 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3222 .rva .Lctr_enc_tail
3223 .long 0
3224 .Lxts_enc_info:
3225 .byte 9,0,0,0
3226 .rva se_handler
3227 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3228 .rva .Lxts_enc_tail
3229 .long 0
3230 .Lxts_dec_info:
3231 .byte 9,0,0,0
3232 .rva se_handler
3233 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3234 .rva .Lxts_dec_tail
3235 .long 0
3236 ___
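# In each unwind-info record above, .byte 9,0,0,0 encodes version 1 with
# UNW_FLAG_EHANDLER set (1 | 1<<3) and an empty prologue description;
# the three rvas after the handler are its HandlerData[0..2]: the end of
# the prologue (.L*_body), the epilogue label and the tail label tested
# by se_handler.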
3237 }
3238
3239 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
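# The substitution above evaluates every `...` span at generation time,
# turning expressions such as 0x10*$i or 128-32 into literal constants
# before the assembly is printed.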
3240
3241 print $code;
3242
3243 close STDOUT or die "error closing STDOUT: $!";