#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Having started as a transliteration to "perlasm", the original code
# has undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was achieved
#   at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.88		+11%
# Atom		17.1		16.4		+4%
# Silvermont	-		12.9
# Goldmont	-		8.85
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations stand for not more
#	than 1% of total time, so comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
# function is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
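# A quick model of those ratios: if one key conversion costs r times one
# 8x-block call, then k 128-byte chunks run in (k+r)/k of the ideal time,
# i.e. throughput drops by r/(k+r). A minimal illustrative sketch (this
# snippet is commentary only, not part of the module):
#
#	my $r = 0.22;				# conversion/8x block, Core 2
#	for my $k (1 .. 3) {			# 128-, 256-, 384-byte inputs
#	    printf "%4d bytes: %2.0f%% slower\n",
#		   128*$k, 100*$r/($k+$r);	# ~18%, ~10%, ~7%
#	}
#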
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
# Goldmont	10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
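
# For reference: through all the basis changes, the network above is a
# field inversion, which in the AES polynomial basis is the map x^254
# over GF(2^8). A naive scalar model, handy for cross-checking un-sliced
# output (gf_mul/gf_inv are illustrative helpers, not used by the
# assembly generated from this file):

sub gf_mul {		# GF(2^8) multiplication modulo x^8+x^4+x^3+x+1
my ($a,$b)=@_;
my $r=0;
	for (0..7) {
		$r ^= $a if ($b & 1);
		$b >>= 1;
		$a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0);
	}
	return $r;
}

sub gf_inv {		# x^{-1} = x^254 in GF(2^8), with 0 mapped to 0
my $x=shift;
my $r=1;
	$r = gf_mul($r,$x) for (1..254);
	return $r;
}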

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
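
# For reference, the rotate-and-xor network above computes the textbook
# MixColumns; per 4-byte column it is equivalent to this scalar model
# (illustrative only, reusing the gf_mul helper sketched earlier):

sub mixcolumn_ref {
my @s=@_;		# one column, 4 bytes
	return map {
		gf_mul(0x02,$s[$_]) ^ gf_mul(0x03,$s[($_+1)%4]) ^
		$s[($_+2)%4] ^ $s[($_+3)%4]
	} (0..3);
}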

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
	my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
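
# The factorization quoted above is easy to check by brute force: with
# M = circ(02,03,01,01), the MixColumns matrix, it says that
# circ(0e,0b,0d,09) = M x circ(05,00,04,00), i.e. InvMixColumns is
# MixColumns followed by a "multiplication by 0x05-0x00-0x04-0x00".
# A throwaway verification, reusing gf_mul from above (illustrative
# helpers, not used by the generated code):

sub circ {		# 4x4 circulant matrix from its first row
my @r=@_;
	return map { my $i=$_; [ map { $r[($_-$i)%4] } (0..3) ] } (0..3);
}

sub mat_mul {		# 4x4 matrix product over GF(2^8)
my ($A,$B)=@_;
my @C;
	for my $i (0..3) {
	    for my $j (0..3) {
		my $c = 0;
		$c ^= gf_mul($A->[$i][$_], $B->[$_][$j]) for (0..3);
		$C[$i][$j] = $c;
	    }
	}
	return @C;
}

# my @P = mat_mul([circ(0x02,0x03,0x01,0x01)], [circ(0x05,0x00,0x04,0x00)]);
# my @I = circ(0x0e,0x0b,0x0d,0x09);
# "@{$P[$_]}" eq "@{$I[$_]}" or die "factorization mismatch" for (0..3);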

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}
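
# swapmove is the classic bit-matrix-transpose primitive: it trades the
# bits of $a selected by $mask for the bits of $b that sit $n positions
# to their left, in one and, one shift pair and three xors. A scalar
# model (an illustrative helper, not used by the generated code):

sub swapmove_ref {
my ($a,$b,$n,$mask)=@_;
my $t = (($b >> $n) ^ $a) & $mask;
	return ($a ^ $t, $b ^ ($t << $n));
}

# e.g. swapmove_ref(0x00, 0xff, 1, 0x55) returns (0x55, 0x55): the even
# bits of $a are traded for the odd bits of $b. bitslice() below applies
# this with the .LBS0-2 masks (0x55..., 0x33..., 0x0f...) to transpose
# the 8x8 bit matrix held in eight registers.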

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
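
# What _bsaes_key_convert above emits for every round key is eight
# 16-byte masks, one per bit position: byte j of plane i is 0xff iff
# bit i of the (.LM0-permuted) key byte j is set, and planes 0, 1, 5
# and 6 are complemented so that the 0x63 affine constant of the S-box
# is folded into the schedule (which is also why callers fix up the
# last round key with the .L63 constant). A simplified model that skips
# the .LM0 byte permutation (illustrative helper, not used here):

sub key_to_bitplanes {
my @k=@_;		# 16 round key bytes
my @planes;
	for my $bit (0..7) {
		my $flip = (0x63 >> $bit) & 1;	# complement planes 0,1,5,6
		push @planes,
		     [ map { ((($_>>$bit)&1) ^ $flip) ? 0xff : 0x00 } @k ];
	}
	return @planes;
}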

if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_enc_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
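
# Both ECB paths above eat the input in bundles of eight blocks, then
# dispatch on a 1..7 block tail (the unloaded registers are simply never
# stored), and fall back to asm_AES_* for inputs shorter than 8 blocks.
# The length split amounts to (illustrative sketch only):
#
#	sub ecb_split {
#	my $blocks=shift;			# total 16-byte blocks
#		return "asm_AES path" if ($blocks < 8);
#		my @bundles = (8) x int($blocks/8);
#		push @bundles, $blocks % 8 if ($blocks % 8);
#		return @bundles;
#	}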
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
.cfi_startproc
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lcbc_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lcbc_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
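# bsaes_ctr32_encrypt_blocks implements the "32-bit counter" CTR
# flavour: only the last four bytes of the IV act as a big-endian
# counter, and each iteration below materializes eight successive
# counter blocks, with the byte swapping fused into the bitslicing
# shuffles (.LSWPUP/.LSWPUPM0SR) and the increments done by .LADD1-8.
# A scalar model of the block schedule (illustrative helper, not used
# by this module):

sub ctr32_blocks {
my ($ivec,$n)=@_;	# 16-byte IV string, number of blocks
my $c = unpack("N", substr($ivec,12,4));	# 32-bit big-endian counter
	return map {
		substr($ivec,0,12) . pack("N", ($c+$_) & 0xffffffff)
	} (0..$n-1);
}
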
1948$code.=<<___;
1949 mov %rsp, %rbp # backup %rsp
b84460ad 1950.cfi_def_cfa_register %rbp
4ec93a10
AP
1951 movdqu ($arg5), %xmm0 # load counter
1952 mov 240($arg4), %eax # rounds
1953 mov $arg1, $inp # backup arguments
1954 mov $arg2, $out
1955 mov $arg3, $len
1956 mov $arg4, $key
1957 movdqa %xmm0, 0x20(%rbp) # copy counter
1958 cmp \$8, $arg3
1959 jb .Lctr_enc_short
1960
1961 mov %eax, %ebx # rounds
1962 shl \$7, %rax # 128 bytes per inner round key
1963 sub \$`128-32`, %rax # size of bit-sliced key schedule
1964 sub %rax, %rsp
1965
	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
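	#
	# (A hedged aside: AES-CTR keeps its 32-bit counter big-endian,
	# while paddd works on little-endian lanes. The .LSWPUP shuffle
	# above byte-swapped the counter word once at load time so the
	# eight paddd-s can step it as a native integer; .LSWPUPM0SR
	# below swaps it back while also applying the bit-slice input
	# permutation. Per batch, in C terms, with illustrative names:
	#
	#	for (i = 0; i < 8; i++)
	#	    block_ctr[i] = bswap32(bswap32(ctr) + i);
	#
	# where bswap32 is a 32-bit byte swap.)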
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

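	# (Note: the stores below run in register order 0,1,4,6,3,7,2,5
	# because _bsaes_encrypt8_bitslice returns the eight blocks in
	# that permuted order; the sequence is positional, batch block i
	# landing at output offset 0x10*i.)
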
	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short
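	# (On this short-input path no bit-sliced key schedule has been
	# allocated, so %rsp still equals %rbp and the 0x2c(%rsp) store
	# above updates the same counter copy that 0x2c(%rbp) reads.)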

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lctr_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lctr_enc_epilogue:
	ret
.cfi_endproc
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
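# A hedged C-level sketch of a caller (buffer and key names are
# illustrative, not from this file; key2 always holds an *encryption*
# schedule, since the tweak is encrypted in both directions, while
# key1 matches the data direction):
#
#	AES_KEY key1, key2;
#	unsigned char iv[16];		/* tweak seed, e.g. sector number */
#	AES_set_encrypt_key(data_key, 128, &key1);
#	AES_set_encrypt_key(tweak_key, 128, &key2);
#	bsaes_xts_encrypt(in, out, len, &key1, &key2, iv);
#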
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;			# strip the "d": the 6th argument is a pointer here

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
.cfi_startproc
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

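	# (A hedged aside on the SSE tweak update used throughout: XTS
	# multiplies the tweak by x in GF(2^128) for every block. The
	# pcmpgtd against zero broadcasts each dword's sign bit; after
	# pshufd and pand with .Lxts_magic only two lanes survive: 0x87
	# xor-ed into byte 0 when bit 127 was set, and a 1 at bit 64
	# standing in for the bit-63 carry that paddq, being two
	# independent 64-bit shifts, drops. A scalar C sketch of the
	# same operation, names illustrative:
	#
	#	void xts_double(uint64_t t[2])
	#	{
	#	    uint64_t c = (uint64_t)((int64_t)t[1] >> 63);
	#	    t[1] = (t[1] << 1) | (t[0] >> 63);
	#	    t[0] = (t[0] << 1) ^ (c & 0x87);
	#	}
	#
	# The vector code takes the cross-lane carries from the saved
	# pcmpgtd mask instead, since SSE2 has no 128-bit shift.)
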
	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

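	# (Hedged recap of the stealing just performed, names
	# illustrative; r = len mod 16, and the last full ciphertext
	# block C[m-1] sat at out-16 before the loop:
	#
	#	memcpy(out, out - 16, r);      /* steal head of C[m-1]   */
	#	memcpy(out - 16, in, r);       /* splice in partial P[m] */
	#	encrypt_block(out - 16);       /* below, tweak in xmm7   */
	#
	# The byte loop above did both copies in a single pass.)
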
	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
.cfi_startproc
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len
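	# (The branch-free adjustment above, as a C sketch: when the
	# total length is not a multiple of 16, ciphertext stealing will
	# need the last *full* block decrypted separately, so it is held
	# back from the bulk loop:
	#
	#	len &= ~15;
	#	if (orig_len % 16)
	#	    len -= 16;
	#
	# setnz and shl turn the "partial block present" flag into 0 or
	# 16 without a branch; the original length lives in %rbx.)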

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

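	# (Hedged note: with a partial tail the decrypt-side tweak order
	# inverts. The last full ciphertext block is decrypted with
	# tweak m+1, just doubled into xmm7 above, and the spliced final
	# block with tweak m, saved in xmm6, mirroring the encrypt-side
	# steal.)
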
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
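
	# (Hedged note on the .LBS0/.LBS1/.LBS2 masks above: the
	# bit-slice transpose is built from the classic swapmove step,
	# roughly
	#
	#	t  = ((a >> n) ^ b) & mask;
	#	b ^= t;
	#	a ^= t << n;
	#
	# with (n, mask) = (1, 0x55..), (2, 0x33..), (4, 0x0f..),
	# exchanging single bits, bit pairs and nibbles between register
	# pairs; variable names are illustrative.)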
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
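#
# A hedged sketch of the HandlerData each .xdata entry below hands to
# se_handler (field names illustrative): three image-relative labels,
# consumed at 0(%r11), 4(%r11) and 8(%r11) respectively,
#
#	struct handler_data {
#	    uint32_t body;	/* prologue end: frame fully set up   */
#	    uint32_t epilogue;	/* %rsp already restored past here    */
#	    uint32_t tail;	/* %xmm reloaded, GPRs not yet popped */
#	};
#
# letting the handler decide whether to unwind from %rsp as-is, from
# the frame pointer, or from the partially torn-down tail state.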
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# tail label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-8(%rax),%rbp		# slots match the push order:
	mov	-16(%rax),%rbx		# %rbp was pushed first (-8),
	mov	-24(%rax),%r12		# %r15 last (-48), mirroring
	mov	-32(%rax),%r13		# the functions' own epilogues
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;