#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of a "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.98		+9%
# Atom		17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations stand for not more
#	than 1% of total time, so comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	The slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration, sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While it resulted in a nominal
#	4% improvement on Atom, it hurt Westmere by more than a 2x
#	factor.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short
# inputs. Conversion time in CPU cycles and its ratio to CPU cycles
# spent in the 8x block function is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.83
# Nehalem	7.74
# Atom		18.9 (estimated, not measured yet)
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than
# 80 bytes is suboptimal, but XTS is meant to be used with larger
# blocks anyway...
#
#						<appro@openssl.org>
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
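
# A note on the structure above (an observation, not generated code):
# Inv_GF256 follows the tower-field approach of the original bitsliced
# design - GF(2^8) inversion is reduced to a 4-bit inversion (the "new
# smaller inversion" block) sandwiched between GF(16) multiplications
# done by Mul_GF16_2. The payoff is that the whole S-box is computed
# with boolean instructions only, i.e. with no table lookups and hence
# no data-dependent memory access.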

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}
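
# In bitsliced representation ShiftRows is the same byte permutation in
# every bit plane, so a single pshufb mask applied to all eight state
# registers implements it; the per-round key XOR is interleaved into
# the same pass above to save a separate sweep over the registers.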

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
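
# The pshufd immediates above are word rotations of each packed column
# register: 0x93 maps words (w0,w1,w2,w3) to (w3,w0,w1,w2), i.e. the
# "x0 <<< 32" in the comments, and 0x4E swaps the 64-bit halves, the
# "<<< 64" step. With the state held column-wise per bit plane, the
# whole of MixColumns collapses to these rotations plus XORs.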

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
#
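# As a worked illustration (top row only), the factorization checks out
# in GF(2^8): [02 03 01 01] times the columns of the right-hand matrix
# gives 02*05 ^ 01*04 = 0a ^ 04 = 0e, 03*05 ^ 01*04 = 0f ^ 04 = 0b,
# 02*04 ^ 01*05 = 08 ^ 05 = 0d and 03*04 ^ 01*05 = 0c ^ 05 = 09, i.e.
# exactly [0e 0b 0d 09]. So InvMixColumns reduces to the cheap
# 05-00-04-00 step below followed by a regular MixColumns pass.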
$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
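
# For illustration only - a scalar model of the swapmove primitive
# above (not used by the generator). It exchanges the bits of $a
# selected by $mask with the bits of $b sitting $n positions higher;
# the SIMD version does the same per 64-bit lane with psrlq/psllq.
sub swapmove_scalar {
my ($a,$b,$n,$mask)=@_;
	my $t = (($b>>$n) ^ $a) & $mask;	# XOR difference of the two bit groups
	return ($a^$t, $b^($t<<$n));		# apply it to both operands
}
# e.g. swapmove_scalar(0b0000,0b1111,1,0b0101) returns (0b0101,0b0101):
# bits 1 and 3 of $b land in bits 0 and 2 of $a and vice versa.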
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
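
# The three swapmove2x passes above (shift distances 1, 2 and 4 with
# the .LBS0/.LBS1/.LBS2 masks) amount to transposing an 8x8 bit matrix
# within every byte position: afterwards register i holds bit i of
# every state byte, which is the form Sbox()/InvSbox() operate on.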

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
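
# In scalar terms (an explanatory note, not generated code), the loop
# above turns one conventional round key into eight 128-bit masks: the
# key is byte-permuted by .LM0, then pand/pcmpeqb against the 0x01,
# 0x02, ..., 0x80 masks replicates bit i of every key byte into a full
# 0x00/0xff byte, ready to be XORed into bit plane i of the state.
# Planes 0, 1, 5 and 6 are complemented ("pnot"), matching the 0x63
# affine constant of the S-box (0x63 has exactly bits 0, 1, 5 and 6
# set); that is also why callers fix up the last round key with the
# .L63 constant returned in %xmm7.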

if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
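
# Tail handling in both ECB entry points above: a remainder of 1..7
# blocks is still pushed through the 8x core - the unused register
# slots simply process stale data and their results are never stored -
# while inputs shorter than 8 blocks skip bit-slicing altogether and
# fall back to asm_AES_[en|de]crypt one block at a time.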
$code.=<<___;
.extern	asm_AES_cbc_encrypt
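#
# CBC decryption below follows P[i] = D(C[i]) ^ C[i-1]: eight blocks
# are decrypted with the bitsliced core, then the original ciphertext
# blocks are re-loaded and XORed in, the last one doubling as the IV
# for the next batch. CBC encryption is inherently sequential and
# gains nothing from bit-slicing, so it is routed to
# asm_AES_cbc_encrypt, as are inputs shorter than 128 bytes.
#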
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

4ec93a10
AP
1854.globl bsaes_ctr32_encrypt_blocks
1855.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1856.align 16
1857bsaes_ctr32_encrypt_blocks:
fe068648
AP
1858 mov %rsp, %rax
1859.Lctr_enc_prologue:
4ec93a10
AP
1860 push %rbp
1861 push %rbx
1862 push %r12
1863 push %r13
1864 push %r14
1865 push %r15
1866 lea -0x48(%rsp), %rsp
1867___
1868$code.=<<___ if ($win64);
1869 mov 0xa0(%rsp),$arg5 # pull ivp
1870 lea -0xa0(%rsp), %rsp
1871 movaps %xmm6, 0x40(%rsp)
1872 movaps %xmm7, 0x50(%rsp)
1873 movaps %xmm8, 0x60(%rsp)
1874 movaps %xmm9, 0x70(%rsp)
1875 movaps %xmm10, 0x80(%rsp)
1876 movaps %xmm11, 0x90(%rsp)
1877 movaps %xmm12, 0xa0(%rsp)
1878 movaps %xmm13, 0xb0(%rsp)
1879 movaps %xmm14, 0xc0(%rsp)
1880 movaps %xmm15, 0xd0(%rsp)
1881.Lctr_enc_body:
1882___
1883$code.=<<___;
1884 mov %rsp, %rbp # backup %rsp
1885 movdqu ($arg5), %xmm0 # load counter
1886 mov 240($arg4), %eax # rounds
1887 mov $arg1, $inp # backup arguments
1888 mov $arg2, $out
1889 mov $arg3, $len
1890 mov $arg4, $key
1891 movdqa %xmm0, 0x20(%rbp) # copy counter
1892 cmp \$8, $arg3
1893 jb .Lctr_enc_short
1894
1895 mov %eax, %ebx # rounds
1896 shl \$7, %rax # 128 bytes per inner round key
1897 sub \$`128-32`, %rax # size of bit-sliced key schedule
1898 sub %rax, %rsp
1899
1900 mov %rsp, %rax # pass key schedule
1901 mov $key, %rcx # pass key
1902 mov %ebx, %r10d # pass rounds
28507577
AP
1903 call _bsaes_key_convert
1904 pxor %xmm6,%xmm7 # fix up last round key
1905 movdqa %xmm7,(%rax) # save last round key
4ec93a10
AP
1906
1907 movdqa (%rsp), @XMM[9] # load round0 key
1908 lea .LADD1(%rip), %r11
1909 movdqa 0x20(%rbp), @XMM[0] # counter copy
1910 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1911 pshufb @XMM[8], @XMM[9] # byte swap upper part
1912 pshufb @XMM[8], @XMM[0]
1913 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1914 jmp .Lctr_enc_loop
1915.align 16
1916.Lctr_enc_loop:
1917 movdqa @XMM[0], 0x20(%rbp) # save counter
1918 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1919 movdqa @XMM[0], @XMM[2]
1920 paddd 0x00(%r11), @XMM[1] # .LADD1
1921 movdqa @XMM[0], @XMM[3]
1922 paddd 0x10(%r11), @XMM[2] # .LADD2
1923 movdqa @XMM[0], @XMM[4]
1924 paddd 0x20(%r11), @XMM[3] # .LADD3
1925 movdqa @XMM[0], @XMM[5]
1926 paddd 0x30(%r11), @XMM[4] # .LADD4
1927 movdqa @XMM[0], @XMM[6]
1928 paddd 0x40(%r11), @XMM[5] # .LADD5
1929 movdqa @XMM[0], @XMM[7]
1930 paddd 0x50(%r11), @XMM[6] # .LADD6
1931 paddd 0x60(%r11), @XMM[7] # .LADD7
1932
1933 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1934 # to flip byte order in 32-bit counter
1935 movdqa (%rsp), @XMM[9] # round 0 key
1936 lea 0x10(%rsp), %rax # pass key schedule
1937 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1938 pxor @XMM[9], @XMM[0] # xor with round0 key
1939 pxor @XMM[9], @XMM[1]
1940 pshufb @XMM[8], @XMM[0]
1941 pxor @XMM[9], @XMM[2]
1942 pshufb @XMM[8], @XMM[1]
1943 pxor @XMM[9], @XMM[3]
1944 pshufb @XMM[8], @XMM[2]
1945 pxor @XMM[9], @XMM[4]
1946 pshufb @XMM[8], @XMM[3]
1947 pxor @XMM[9], @XMM[5]
1948 pshufb @XMM[8], @XMM[4]
1949 pxor @XMM[9], @XMM[6]
1950 pshufb @XMM[8], @XMM[5]
1951 pxor @XMM[9], @XMM[7]
1952 pshufb @XMM[8], @XMM[6]
1953 lea .LBS0(%rip), %r11 # constants table
1954 pshufb @XMM[8], @XMM[7]
1955 mov %ebx,%r10d # pass rounds
1956
1957 call _bsaes_encrypt8_bitslice
1958
1959 sub \$8,$len
1960 jc .Lctr_enc_loop_done
1961
1962 movdqu 0x00($inp), @XMM[8] # load input
1963 movdqu 0x10($inp), @XMM[9]
1964 movdqu 0x20($inp), @XMM[10]
1965 movdqu 0x30($inp), @XMM[11]
1966 movdqu 0x40($inp), @XMM[12]
1967 movdqu 0x50($inp), @XMM[13]
1968 movdqu 0x60($inp), @XMM[14]
1969 movdqu 0x70($inp), @XMM[15]
1970 lea 0x80($inp),$inp
1971 pxor @XMM[0], @XMM[8]
1972 movdqa 0x20(%rbp), @XMM[0] # load counter
1973 pxor @XMM[9], @XMM[1]
1974 movdqu @XMM[8], 0x00($out) # write output
1975 pxor @XMM[10], @XMM[4]
1976 movdqu @XMM[1], 0x10($out)
1977 pxor @XMM[11], @XMM[6]
1978 movdqu @XMM[4], 0x20($out)
1979 pxor @XMM[12], @XMM[3]
1980 movdqu @XMM[6], 0x30($out)
1981 pxor @XMM[13], @XMM[7]
1982 movdqu @XMM[3], 0x40($out)
1983 pxor @XMM[14], @XMM[2]
1984 movdqu @XMM[7], 0x50($out)
1985 pxor @XMM[15], @XMM[5]
1986 movdqu @XMM[2], 0x60($out)
1987 lea .LADD1(%rip), %r11
1988 movdqu @XMM[5], 0x70($out)
1989 lea 0x80($out), $out
1990 paddd 0x70(%r11), @XMM[0] # .LADD8
1991 jnz .Lctr_enc_loop
1992
1993 jmp .Lctr_enc_done
1994.align 16
1995.Lctr_enc_loop_done:
d127ef78 1996 add \$8, $len
4ec93a10
AP
1997 movdqu 0x00($inp), @XMM[8] # load input
1998 pxor @XMM[8], @XMM[0]
1999 movdqu @XMM[0], 0x00($out) # write output
2000 cmp \$2,$len
2001 jb .Lctr_enc_done
2002 movdqu 0x10($inp), @XMM[9]
2003 pxor @XMM[9], @XMM[1]
2004 movdqu @XMM[1], 0x10($out)
2005 je .Lctr_enc_done
2006 movdqu 0x20($inp), @XMM[10]
2007 pxor @XMM[10], @XMM[4]
2008 movdqu @XMM[4], 0x20($out)
2009 cmp \$4,$len
2010 jb .Lctr_enc_done
2011 movdqu 0x30($inp), @XMM[11]
2012 pxor @XMM[11], @XMM[6]
2013 movdqu @XMM[6], 0x30($out)
2014 je .Lctr_enc_done
2015 movdqu 0x40($inp), @XMM[12]
2016 pxor @XMM[12], @XMM[3]
2017 movdqu @XMM[3], 0x40($out)
2018 cmp \$6,$len
2019 jb .Lctr_enc_done
2020 movdqu 0x50($inp), @XMM[13]
2021 pxor @XMM[13], @XMM[7]
2022 movdqu @XMM[7], 0x50($out)
2023 je .Lctr_enc_done
2024 movdqu 0x60($inp), @XMM[14]
2025 pxor @XMM[14], @XMM[2]
2026 movdqu @XMM[2], 0x60($out)
2027 jmp .Lctr_enc_done
2028
2029.align 16
2030.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
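	# (The reservation above is rounds*128-96 bytes. For AES-128,
	# i.e. 10 rounds, that is nine middle round keys of 128 bytes
	# each in bit-sliced form, plus a 16-byte slot apiece for the
	# first and last round keys, which stay as single 128-bit
	# values: 9*128 + 32 = 10*128 - 96.)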

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

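	# Tweak schedule: each step multiplies the tweak by x in
	# GF(2^128) modulo x^128+x^7+x^2+x+1. .Lxts_magic supplies the
	# 0x87 residue and the inter-qword carry bit; pcmpgtd/pshufd
	# broadcast the sign bits, pand keeps carry and residue, paddq
	# doubles both halves, pxor folds the carries back in. Over two
	# 64-bit halves this is, as a C sketch:
	#
	#	uint64_t lo = t[0], hi = t[1];
	#	t[1] = (hi << 1) | (lo >> 63);
	#	t[0] = (lo << 1) ^ ((hi >> 63) ? 0x87 : 0);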
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
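# Each unrolled iteration below interleaves three streams: saving
# tweak[$i] and advancing the tweak as described above, loading input
# block $i-1, and xor-ing block $i-2 with its tweak - a small
# software pipeline that hides the movdqu latency.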
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

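	# Ciphertext stealing for the trailing partial block: the byte
	# loop below copies the leftover plaintext over the head of the
	# last full ciphertext block, moving the displaced ciphertext
	# bytes out as the final partial block; the merged block is then
	# re-encrypted with the next tweak (@XMM[7]).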
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

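	# Decrypt-side ciphertext stealing: the last full ciphertext
	# block must be decrypted with the *next* tweak, so one more
	# GF(2^128) doubling is done here, while the current tweak is
	# parked in @XMM[6] for the stolen partial block afterwards.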
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

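# se_handler compares context->Rip against the [body,epilogue) range
# recorded as HandlerData (the .rva pairs emitted in .xdata below).
# Roughly: if the fault hit a function body, it replays the epilogue
# from the saved frame pointer - copying %xmm6-15 back from the save
# area at 0x40(%rbp) into the CONTEXT, reloading the six non-volatile
# GPRs, and reporting the adjusted stack pointer - before handing off
# to RtlVirtualUnwind to continue the unwind.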
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;