]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/bsaes-x86_64.pl
Fix some typos
[thirdparty/openssl.git] / crypto / aes / asm / bsaes-x86_64.pl
CommitLineData
6aa36e8e
RS
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
#   allowed to feed its output back to aesenc[last], this was
#   achieved at cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine, instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   to skip one shiftrows(), reduce bit-sliced key schedule and
#   speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#               Emilia's        this(*)         difference
#
# Core 2        9.30            8.69            +7%
# Nehalem(**)   7.63            6.88            +11%
# Atom          17.1            16.4            +4%
# Silvermont    -               12.9
# Goldmont      -               8.85
#
# (*)  Comparison is not completely fair, because "this" is ECB,
#      i.e. no extra processing such as counter values calculation
#      and xor-ing input as in Emilia's CTR implementation is
#      performed. However, the CTR calculations stand for not more
#      than 1% of total time, so comparison is *rather* fair.
#
# (**) Results were collected on Westmere, which is considered to
#      be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
# function is:
#
#               conversion      conversion/8x block
# Core 2        240             0.22
# Nehalem       180             0.20
# Atom          430             0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2        9.98
# Nehalem       7.80
# Atom          17.9
# Silvermont    14.0
# Goldmont      10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>
99
1aa89a7a
RL
# Command-line handling and output plumbing.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 is selected either by flavour or by a ".asm" output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl next to this script or under ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used by the generated code.  Only four values are supplied
# for the five names, so $ivp is left undefined at this point.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
4ec93a10
AP
119
120{
121my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122
sub Sbox {
# Emit the S-box step on eight bit-sliced registers: input basis change,
# Inv_GF256, then output basis change.
#
# Arguments: 8 state registers (b0..b7), 4 temporaries (t), 4 temporaries (s).
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
133
sub InBasisChange {
# Append the xor network performing the S-box input basis change to $code.
#
# Arguments: the 8 bit-sliced state registers.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}
156
sub OutBasisChange {
# Append the xor network performing the S-box output basis change to $code.
#
# Arguments: the 8 bit-sliced state registers.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}
177
28507577
AP
sub InvSbox {
# Emit the inverse S-box step on eight bit-sliced registers: inverse input
# basis change, Inv_GF256, then inverse output basis change.
#
# Arguments: 8 state registers (b0..b7), 4 temporaries (t), 4 temporaries (s).
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}
188
sub InvInBasisChange {		# OutBasisChange in reverse
# Append the inverse-S-box input basis change to $code.  The incoming
# registers are first permuted, then the xor sequence of OutBasisChange is
# emitted in reverse order.
#
# Arguments: the 8 bit-sliced state registers.
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
207
sub InvOutBasisChange {		# InBasisChange in reverse
# Append the inverse-S-box output basis change to $code.  The incoming
# registers are first permuted, then the xor sequence of InBasisChange is
# emitted in reverse order.
#
# Arguments: the 8 bit-sliced state registers.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}
228
4ec93a10
AP
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
# Appends 8 instructions to $code.  y0/y1 are only read; x0/x1 receive the
# result and t0 is clobbered.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
245
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same instruction sequence as Mul_GF4 except for the final two xors, which
# target the opposite registers.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}
260
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# The first five arguments (x0,x1,y0,y1,t0) get the Mul_GF4_N sequence, the
# last five (x2,x3,y2,y3,t1) the Mul_GF4 sequence; the two instruction
# streams are emitted alternately.
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
# Emit code that processes x[0..3] and then x[4..7] against the common
# operand y[0..3], built from Mul_GF4 and Mul_GF4_N_GF4.
#
# Arguments: 8 x registers (results are written back to them), 4 y registers,
# 4 temporaries.  y[0] and y[1] are xor-ed with y[2]/y[3] part-way through
# and xor-ed back before the last Mul_GF4; the temporaries are clobbered.
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
#;********************************************************************
# Arguments: 8 x registers, 4 t temporaries, 4 s temporaries.
# The heredoc below leaves an intermediate result in s3, s2, s1, t1, which
# is then passed to Mul_GF16_2 together with x0-x7.
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
433
434# AES linear components
435
# AES linear components

sub ShiftRows {
# Emit: xor the 128 bytes at ($key) into the eight state registers, shuffle
# each register with pshufb using $mask, then advance $key by 0x80.  The key
# xors and shuffles are interleaved.
#
# Arguments: 8 state registers followed by the mask register (taken with pop).
# $key comes from the enclosing scope.
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}
459
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
#
# Arguments: 8 state registers (x), 8 temporaries (t), and an optional 17th
# argument $inv.  When $inv is true a different final sequence is emitted
# (the variant InvMixColumns requests); otherwise the regular tail is used.
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=$_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
527
sub InvMixColumns_orig {
# Straightforward InvMixColumns: emits the multiplications by 0x0e, 0x0b,
# 0x0d and 0x09 one after another, then copies the result from the
# temporaries into @XMM[0..7] (taken from the enclosing scope).
# Not called anywhere in the portion of the file visible here; InvMixColumns
# is used in the decryption loop instead.
#
# Arguments: 8 state registers (x), 8 temporaries (t).
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
	my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
681
6f6a6130
AP
sub InvMixColumns {
# Emit InvMixColumns as a pre-multiplication by the 05-00-04-00 matrix
# followed by the regular MixColumns (called with its $inv flag set).
#
# Arguments: 8 state registers (x), 8 temporaries (t).
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
729
4ec93a10
AP
sub aesenc {				# not used
# One full encryption round on bit-sliced state: load the .LSR shuffle mask
# into the first temporary, then ShiftRows, Sbox and MixColumns.
#
# Arguments: 8 state registers (b), 8 temporaries (t).  $const comes from
# the enclosing scope.
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}
740
sub aesenclast {			# not used
# Final encryption round on bit-sliced state: load the .LSRM0 shuffle mask,
# ShiftRows, Sbox, then xor the next 128 bytes at ($key) into the registers
# in the S-box output order [b0, b1, b4, b6, b3, b7, b2, b5].
#
# Arguments: 8 state registers (b), 8 temporaries (t).  $const and $key come
# from the enclosing scope.
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
760
sub swapmove {
# Emit a 7-instruction masked exchange between two registers: the bits of
# the second register selected by ($mask << $n) are exchanged with the bits
# of the first register selected by $mask.
#
# Arguments: first register, second register, shift count $n, mask register,
# temporary register (clobbered).
my ($ra,$rb,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$rb,$t
	psrlq	\$$n,$rb
	pxor	$ra,$rb
	pand	$mask,$rb
	pxor	$rb,$ra
	psllq	\$$n,$rb
	pxor	$t,$rb
___
}
sub swapmove2x {
# Two swapmove sequences, on the pairs (a0,b0) and (a1,b1), interleaved and
# sharing the same shift count and mask.
#
# Arguments: a0, b0, a1, b1, shift count $n, mask register, two temporaries
# (both clobbered).
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}
792
sub bitslice {
# Emit the conversion of eight registers to/from bit-sliced form as three
# rounds of swapmove2x with shifts 1, 2 and 4, using the .LBS0/.LBS1/.LBS2
# masks loaded from the constants table at $const (enclosing scope).
#
# Arguments: 8 data registers (processed in reverse order), 4 temporaries.
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
811
# Emit _bsaes_encrypt8 and _bsaes_decrypt8.  Both take the key schedule
# pointer in $key and the round count in $rounds (see the enclosing scope),
# work on the eight blocks in @XMM[0..7], and share the same skeleton:
# xor with round 0 key + initial shuffle, bitslice, round loop, un-bitslice,
# xor with last round key.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
948}
949{
950my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
951
sub bitslice_key {
# Reduced variant of bitslice: where bitslice would run a swapmove on further
# register pairs, this version copies already-processed registers with
# movdqa instead (the replaced calls are kept as comments).
# Not called anywhere in the portion of the file visible here.
#
# Arguments: 8 data registers (processed in reverse order), three mask
# registers ($bs0,$bs1,$bs2) and two temporaries ($t2,$t3).
# NOTE(review): the commented-out lines inside the heredocs still interpolate
# $t0/$t1, which are not declared in this sub - harmless in an asm comment,
# but confirm before enabling strict.
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
975
# Emit _bsaes_key_convert: reads a conventional key schedule from $inp and
# writes the converted schedule to $out.  Round 0 key is stored as is; each
# following round key processed by the loop is expanded into eight 16-byte
# bit masks (0x80 bytes per round).  The last round key is left in %xmm6
# (not stored) and the .L63 constant is returned in %xmm7 for the caller to
# apply.  $inp, $out, $rounds and $const come from the enclosing scope.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
.cfi_startproc
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.cfi_endproc
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
1062}
1063
# The "0 &&" makes this condition always false, so nothing below is emitted.
# The block keeps four 128-bit-key helper entry points around for reference:
# bsaes_enc_key_convert, bsaes_encrypt_128, bsaes_dec_key_convert and
# bsaes_decrypt_128.
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
1161{
1162######################################################################
1163#
1164# OpenSSL interface
1165#
a75a52a4
AP
# Registers holding the six incoming arguments, chosen per calling
# convention. In the Win64 list the fifth and sixth entries (%r10, %r11d)
# are filled by the routines themselves from the caller's stack (see the
# "pull ..." loads in the win64-only fragments). The sixth entry is a
# 32-bit register name in both lists.
1166my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1167 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
4ec93a10
AP
# Working registers that each routine copies its first four arguments into.
1168my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1169
fe068648 1170if ($ecb) {
4ec93a10
AP
# bsaes_ecb_encrypt_blocks (emitted only when $ecb is set): entry, frame
# set-up and key schedule conversion. The four arguments are copied to the
# working registers; the third one is a count of 16-byte blocks.
1171$code.=<<___;
1172.globl bsaes_ecb_encrypt_blocks
1173.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1174.align 16
1175bsaes_ecb_encrypt_blocks:
b84460ad 1176.cfi_startproc
fe068648
AP
1177 mov %rsp, %rax
1178.Lecb_enc_prologue:
4ec93a10 1179 push %rbp
b84460ad 1180.cfi_push %rbp
4ec93a10 1181 push %rbx
b84460ad 1182.cfi_push %rbx
4ec93a10 1183 push %r12
b84460ad 1184.cfi_push %r12
4ec93a10 1185 push %r13
b84460ad 1186.cfi_push %r13
4ec93a10 1187 push %r14
b84460ad 1188.cfi_push %r14
4ec93a10 1189 push %r15
b84460ad 1190.cfi_push %r15
4ec93a10 1191 lea -0x48(%rsp),%rsp
b84460ad 1192.cfi_adjust_cfa_offset 0x48
4ec93a10
AP
1193___
# Win64 only: grow the frame by another 0xa0 bytes and save %xmm6-%xmm15.
1194$code.=<<___ if ($win64);
1195 lea -0xa0(%rsp), %rsp
1196 movaps %xmm6, 0x40(%rsp)
1197 movaps %xmm7, 0x50(%rsp)
1198 movaps %xmm8, 0x60(%rsp)
1199 movaps %xmm9, 0x70(%rsp)
1200 movaps %xmm10, 0x80(%rsp)
1201 movaps %xmm11, 0x90(%rsp)
1202 movaps %xmm12, 0xa0(%rsp)
1203 movaps %xmm13, 0xb0(%rsp)
1204 movaps %xmm14, 0xc0(%rsp)
1205 movaps %xmm15, 0xd0(%rsp)
1206.Lecb_enc_body:
1207___
1208$code.=<<___;
1209 mov %rsp,%rbp # backup %rsp
b84460ad 1210.cfi_def_cfa_register %rbp
4ec93a10
AP
1211 mov 240($arg4),%eax # rounds
1212 mov $arg1,$inp # backup arguments
1213 mov $arg2,$out
1214 mov $arg3,$len
1215 mov $arg4,$key
# Fewer than 8 blocks: skip the bit-sliced path (and the key conversion)
# and process the blocks one at a time.
1216 cmp \$8,$arg3
1217 jb .Lecb_enc_short
1218
# Carve rounds*128-96 bytes out of the stack for the bit-sliced schedule;
# %rbp keeps the frame base so the epilogue can find it again.
1219 mov %eax,%ebx # backup rounds
1220 shl \$7,%rax # 128 bytes per inner round key
1221 sub \$`128-32`,%rax # size of bit-sliced key schedule
1222 sub %rax,%rsp
1223 mov %rsp,%rax # pass key schedule
1224 mov $key,%rcx # pass key
1225 mov %ebx,%r10d # pass rounds
28507577
AP
1226 call _bsaes_key_convert
1227 pxor %xmm6,%xmm7 # fix up last round key
1228 movdqa %xmm7,(%rax) # save last round key
4ec93a10
AP
1229
# Main ECB encrypt loop: eight blocks (0x80 bytes) per iteration. The count
# is biased by -8 before the loop so that the borrow from the sub at the
# bottom signals that fewer than 8 blocks remain.
1230 sub \$8,$len
1231.Lecb_enc_loop:
1232 movdqu 0x00($inp), @XMM[0] # load input
1233 movdqu 0x10($inp), @XMM[1]
1234 movdqu 0x20($inp), @XMM[2]
1235 movdqu 0x30($inp), @XMM[3]
1236 movdqu 0x40($inp), @XMM[4]
1237 movdqu 0x50($inp), @XMM[5]
1238 mov %rsp, %rax # pass key schedule
1239 movdqu 0x60($inp), @XMM[6]
1240 mov %ebx,%r10d # pass rounds
1241 movdqu 0x70($inp), @XMM[7]
1242 lea 0x80($inp), $inp
1243
1244 call _bsaes_encrypt8
1245
# Results are stored from registers 0,1,4,6,3,7,2,5 in that order; the
# same order is used after every _bsaes_encrypt8 call in this file.
1246 movdqu @XMM[0], 0x00($out) # write output
1247 movdqu @XMM[1], 0x10($out)
1248 movdqu @XMM[4], 0x20($out)
1249 movdqu @XMM[6], 0x30($out)
1250 movdqu @XMM[3], 0x40($out)
1251 movdqu @XMM[7], 0x50($out)
1252 movdqu @XMM[2], 0x60($out)
1253 movdqu @XMM[5], 0x70($out)
1254 lea 0x80($out), $out
1255 sub \$8,$len
1256 jnc .Lecb_enc_loop
1257
# Remainder handling for ECB encrypt: undo the -8 bias, then encrypt the
# last 1..7 blocks. Only as many input blocks as remain are loaded, the
# 8-way routine is still called, and only the matching number of results
# is stored. Each je below reuses the flags of the preceding cmp; the
# movdqu in between does not modify them.
1258 add \$8,$len
1259 jz .Lecb_enc_done
1260
1261 movdqu 0x00($inp), @XMM[0] # load input
1262 mov %rsp, %rax # pass key schedule
1263 mov %ebx,%r10d # pass rounds
1264 cmp \$2,$len
1265 jb .Lecb_enc_one
1266 movdqu 0x10($inp), @XMM[1]
1267 je .Lecb_enc_two
1268 movdqu 0x20($inp), @XMM[2]
1269 cmp \$4,$len
1270 jb .Lecb_enc_three
1271 movdqu 0x30($inp), @XMM[3]
1272 je .Lecb_enc_four
1273 movdqu 0x40($inp), @XMM[4]
1274 cmp \$6,$len
1275 jb .Lecb_enc_five
1276 movdqu 0x50($inp), @XMM[5]
1277 je .Lecb_enc_six
# Fall-through: seven blocks remain.
1278 movdqu 0x60($inp), @XMM[6]
1279 call _bsaes_encrypt8
1280 movdqu @XMM[0], 0x00($out) # write output
1281 movdqu @XMM[1], 0x10($out)
1282 movdqu @XMM[4], 0x20($out)
1283 movdqu @XMM[6], 0x30($out)
1284 movdqu @XMM[3], 0x40($out)
1285 movdqu @XMM[7], 0x50($out)
1286 movdqu @XMM[2], 0x60($out)
1287 jmp .Lecb_enc_done
1288.align 16
1289.Lecb_enc_six:
1290 call _bsaes_encrypt8
1291 movdqu @XMM[0], 0x00($out) # write output
1292 movdqu @XMM[1], 0x10($out)
1293 movdqu @XMM[4], 0x20($out)
1294 movdqu @XMM[6], 0x30($out)
1295 movdqu @XMM[3], 0x40($out)
1296 movdqu @XMM[7], 0x50($out)
1297 jmp .Lecb_enc_done
1298.align 16
1299.Lecb_enc_five:
1300 call _bsaes_encrypt8
1301 movdqu @XMM[0], 0x00($out) # write output
1302 movdqu @XMM[1], 0x10($out)
1303 movdqu @XMM[4], 0x20($out)
1304 movdqu @XMM[6], 0x30($out)
1305 movdqu @XMM[3], 0x40($out)
1306 jmp .Lecb_enc_done
1307.align 16
1308.Lecb_enc_four:
1309 call _bsaes_encrypt8
1310 movdqu @XMM[0], 0x00($out) # write output
1311 movdqu @XMM[1], 0x10($out)
1312 movdqu @XMM[4], 0x20($out)
1313 movdqu @XMM[6], 0x30($out)
1314 jmp .Lecb_enc_done
1315.align 16
1316.Lecb_enc_three:
1317 call _bsaes_encrypt8
1318 movdqu @XMM[0], 0x00($out) # write output
1319 movdqu @XMM[1], 0x10($out)
1320 movdqu @XMM[4], 0x20($out)
1321 jmp .Lecb_enc_done
1322.align 16
1323.Lecb_enc_two:
1324 call _bsaes_encrypt8
1325 movdqu @XMM[0], 0x00($out) # write output
1326 movdqu @XMM[1], 0x10($out)
1327 jmp .Lecb_enc_done
1328.align 16
1329.Lecb_enc_one:
1330 call _bsaes_encrypt8
1331 movdqu @XMM[0], 0x00($out) # write output
1332 jmp .Lecb_enc_done
# Short-input path (fewer than 8 blocks in total): no bit-sliced schedule
# is built; each block goes through asm_AES_encrypt with the current
# input pointer, output pointer and the original key as its arguments.
# NOTE(review): a block count of zero would wrap on the dec below and keep
# looping - presumably callers never pass zero; confirm.
1333.align 16
1334.Lecb_enc_short:
1335 lea ($inp), $arg1
1336 lea ($out), $arg2
1337 lea ($key), $arg3
fe068648 1338 call asm_AES_encrypt
4ec93a10
AP
1339 lea 16($inp), $inp
1340 lea 16($out), $out
1341 dec $len
1342 jnz .Lecb_enc_short
1343
# Common exit for ECB encrypt: zero the stack between %rsp and %rbp (the
# bit-sliced key schedule, if one was allocated) in 32-byte steps before
# the frame is torn down. On the short path %rsp equals %rbp, so exactly
# one 32-byte step is written and the loop exits.
1344.Lecb_enc_done:
1345 lea (%rsp),%rax
1346 pxor %xmm0, %xmm0
1347.Lecb_enc_bzero: # wipe key schedule [if any]
1348 movdqa %xmm0, 0x00(%rax)
1349 movdqa %xmm0, 0x10(%rax)
1350 lea 0x20(%rax), %rax
# AT&T operand order: the flags reflect %rbp - %rax, so "ja" keeps looping
# while the cursor is still below the frame base. The previous "jb" tested
# the opposite condition: it stopped after the first 32 bytes when a
# schedule was present and never stopped on the short path.
1351 cmp %rax, %rbp
1352 ja .Lecb_enc_bzero
1353
# Epilogue for ECB encrypt. 0x78 is the six 8-byte pushes (0x30) plus the
# 0x48 bytes reserved in the prologue, so %rax ends up at the stack
# pointer value from function entry (after the extra 0xa0 adjustment on
# Win64) and the saved registers sit at negative offsets from it.
384e6de4 1354 lea 0x78(%rbp),%rax
b84460ad 1355.cfi_def_cfa %rax,8
4ec93a10
AP
1356___
# Win64 only: reload %xmm6-%xmm15 and step over the extra 0xa0 bytes.
1357$code.=<<___ if ($win64);
1358 movaps 0x40(%rbp), %xmm6
1359 movaps 0x50(%rbp), %xmm7
1360 movaps 0x60(%rbp), %xmm8
1361 movaps 0x70(%rbp), %xmm9
1362 movaps 0x80(%rbp), %xmm10
1363 movaps 0x90(%rbp), %xmm11
1364 movaps 0xa0(%rbp), %xmm12
1365 movaps 0xb0(%rbp), %xmm13
1366 movaps 0xc0(%rbp), %xmm14
1367 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
1368 lea 0xa0(%rax), %rax
1369.Lecb_enc_tail:
4ec93a10
AP
1370___
1371$code.=<<___;
384e6de4 1372 mov -48(%rax), %r15
b84460ad 1373.cfi_restore %r15
384e6de4 1374 mov -40(%rax), %r14
b84460ad 1375.cfi_restore %r14
384e6de4 1376 mov -32(%rax), %r13
b84460ad 1377.cfi_restore %r13
384e6de4 1378 mov -24(%rax), %r12
b84460ad 1379.cfi_restore %r12
384e6de4 1380 mov -16(%rax), %rbx
b84460ad 1381.cfi_restore %rbx
384e6de4 1382 mov -8(%rax), %rbp
b84460ad 1383.cfi_restore %rbp
384e6de4 1384 lea (%rax), %rsp # restore %rsp
b84460ad 1385.cfi_def_cfa_register %rsp
4ec93a10
AP
1386.Lecb_enc_epilogue:
1387 ret
b84460ad 1388.cfi_endproc
4ec93a10
AP
1389.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1390
a75a52a4
AP
# bsaes_ecb_decrypt_blocks: entry, frame set-up and key schedule
# conversion for ECB decryption. The four arguments are copied to the
# working registers; the third one is a count of 16-byte blocks.
1391.globl bsaes_ecb_decrypt_blocks
1392.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1393.align 16
1394bsaes_ecb_decrypt_blocks:
b84460ad 1395.cfi_startproc
fe068648
AP
1396 mov %rsp, %rax
1397.Lecb_dec_prologue:
a75a52a4 1398 push %rbp
b84460ad 1399.cfi_push %rbp
a75a52a4 1400 push %rbx
b84460ad 1401.cfi_push %rbx
a75a52a4 1402 push %r12
b84460ad 1403.cfi_push %r12
a75a52a4 1404 push %r13
b84460ad 1405.cfi_push %r13
a75a52a4 1406 push %r14
b84460ad 1407.cfi_push %r14
a75a52a4 1408 push %r15
b84460ad 1409.cfi_push %r15
a75a52a4 1410 lea -0x48(%rsp),%rsp
b84460ad 1411.cfi_adjust_cfa_offset 0x48
a75a52a4
AP
1412___
# Win64 only: grow the frame by another 0xa0 bytes and save %xmm6-%xmm15.
1413$code.=<<___ if ($win64);
1414 lea -0xa0(%rsp), %rsp
1415 movaps %xmm6, 0x40(%rsp)
1416 movaps %xmm7, 0x50(%rsp)
1417 movaps %xmm8, 0x60(%rsp)
1418 movaps %xmm9, 0x70(%rsp)
1419 movaps %xmm10, 0x80(%rsp)
1420 movaps %xmm11, 0x90(%rsp)
1421 movaps %xmm12, 0xa0(%rsp)
1422 movaps %xmm13, 0xb0(%rsp)
1423 movaps %xmm14, 0xc0(%rsp)
1424 movaps %xmm15, 0xd0(%rsp)
1425.Lecb_dec_body:
1426___
1427$code.=<<___;
1428 mov %rsp,%rbp # backup %rsp
b84460ad 1429.cfi_def_cfa_register %rbp
a75a52a4
AP
1430 mov 240($arg4),%eax # rounds
1431 mov $arg1,$inp # backup arguments
1432 mov $arg2,$out
1433 mov $arg3,$len
1434 mov $arg4,$key
# Fewer than 8 blocks: skip the bit-sliced path (and the key conversion)
# and process the blocks one at a time.
1435 cmp \$8,$arg3
1436 jb .Lecb_dec_short
1437
# Carve rounds*128-96 bytes out of the stack for the bit-sliced schedule;
# %rbp keeps the frame base so the epilogue can find it again.
1438 mov %eax,%ebx # backup rounds
1439 shl \$7,%rax # 128 bytes per inner round key
1440 sub \$`128-32`,%rax # size of bit-sliced key schedule
1441 sub %rax,%rsp
1442 mov %rsp,%rax # pass key schedule
1443 mov $key,%rcx # pass key
1444 mov %ebx,%r10d # pass rounds
1445 call _bsaes_key_convert
# Decrypt-side fix-up: unlike the encrypt entry points, the adjustment is
# applied to the first schedule entry at (%rsp), and %xmm6 is stored
# unmodified at the last slot.
1446 pxor (%rsp),%xmm7 # fix up 0 round key
1447 movdqa %xmm6,(%rax) # save last round key
1448 movdqa %xmm7,(%rsp)
1449
# Main ECB decrypt loop: eight blocks (0x80 bytes) per iteration. The count
# is biased by -8 before the loop so that the borrow from the sub at the
# bottom signals that fewer than 8 blocks remain.
1450 sub \$8,$len
1451.Lecb_dec_loop:
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 movdqu 0x10($inp), @XMM[1]
1454 movdqu 0x20($inp), @XMM[2]
1455 movdqu 0x30($inp), @XMM[3]
1456 movdqu 0x40($inp), @XMM[4]
1457 movdqu 0x50($inp), @XMM[5]
1458 mov %rsp, %rax # pass key schedule
1459 movdqu 0x60($inp), @XMM[6]
1460 mov %ebx,%r10d # pass rounds
1461 movdqu 0x70($inp), @XMM[7]
1462 lea 0x80($inp), $inp
1463
1464 call _bsaes_decrypt8
1465
# Results are stored from registers 0,1,6,4,2,7,3,5 in that order; the
# same order is used after every _bsaes_decrypt8 call in this file.
1466 movdqu @XMM[0], 0x00($out) # write output
1467 movdqu @XMM[1], 0x10($out)
1468 movdqu @XMM[6], 0x20($out)
1469 movdqu @XMM[4], 0x30($out)
1470 movdqu @XMM[2], 0x40($out)
1471 movdqu @XMM[7], 0x50($out)
1472 movdqu @XMM[3], 0x60($out)
1473 movdqu @XMM[5], 0x70($out)
1474 lea 0x80($out), $out
1475 sub \$8,$len
1476 jnc .Lecb_dec_loop
1477
# Remainder handling for ECB decrypt: undo the -8 bias, then decrypt the
# last 1..7 blocks. Only as many input blocks as remain are loaded, the
# 8-way routine is still called, and only the matching number of results
# is stored. Each je below reuses the flags of the preceding cmp; the
# movdqu in between does not modify them.
1478 add \$8,$len
1479 jz .Lecb_dec_done
1480
1481 movdqu 0x00($inp), @XMM[0] # load input
1482 mov %rsp, %rax # pass key schedule
1483 mov %ebx,%r10d # pass rounds
1484 cmp \$2,$len
1485 jb .Lecb_dec_one
1486 movdqu 0x10($inp), @XMM[1]
1487 je .Lecb_dec_two
1488 movdqu 0x20($inp), @XMM[2]
1489 cmp \$4,$len
1490 jb .Lecb_dec_three
1491 movdqu 0x30($inp), @XMM[3]
1492 je .Lecb_dec_four
1493 movdqu 0x40($inp), @XMM[4]
1494 cmp \$6,$len
1495 jb .Lecb_dec_five
1496 movdqu 0x50($inp), @XMM[5]
1497 je .Lecb_dec_six
# Fall-through: seven blocks remain.
1498 movdqu 0x60($inp), @XMM[6]
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1503 movdqu @XMM[4], 0x30($out)
1504 movdqu @XMM[2], 0x40($out)
1505 movdqu @XMM[7], 0x50($out)
1506 movdqu @XMM[3], 0x60($out)
1507 jmp .Lecb_dec_done
1508.align 16
1509.Lecb_dec_six:
1510 call _bsaes_decrypt8
1511 movdqu @XMM[0], 0x00($out) # write output
1512 movdqu @XMM[1], 0x10($out)
1513 movdqu @XMM[6], 0x20($out)
1514 movdqu @XMM[4], 0x30($out)
1515 movdqu @XMM[2], 0x40($out)
1516 movdqu @XMM[7], 0x50($out)
1517 jmp .Lecb_dec_done
1518.align 16
1519.Lecb_dec_five:
1520 call _bsaes_decrypt8
1521 movdqu @XMM[0], 0x00($out) # write output
1522 movdqu @XMM[1], 0x10($out)
1523 movdqu @XMM[6], 0x20($out)
1524 movdqu @XMM[4], 0x30($out)
1525 movdqu @XMM[2], 0x40($out)
1526 jmp .Lecb_dec_done
1527.align 16
1528.Lecb_dec_four:
1529 call _bsaes_decrypt8
1530 movdqu @XMM[0], 0x00($out) # write output
1531 movdqu @XMM[1], 0x10($out)
1532 movdqu @XMM[6], 0x20($out)
1533 movdqu @XMM[4], 0x30($out)
1534 jmp .Lecb_dec_done
1535.align 16
1536.Lecb_dec_three:
1537 call _bsaes_decrypt8
1538 movdqu @XMM[0], 0x00($out) # write output
1539 movdqu @XMM[1], 0x10($out)
1540 movdqu @XMM[6], 0x20($out)
1541 jmp .Lecb_dec_done
1542.align 16
1543.Lecb_dec_two:
1544 call _bsaes_decrypt8
1545 movdqu @XMM[0], 0x00($out) # write output
1546 movdqu @XMM[1], 0x10($out)
1547 jmp .Lecb_dec_done
1548.align 16
1549.Lecb_dec_one:
1550 call _bsaes_decrypt8
1551 movdqu @XMM[0], 0x00($out) # write output
1552 jmp .Lecb_dec_done
# Short-input path (fewer than 8 blocks in total): no bit-sliced schedule
# is built; each block goes through asm_AES_decrypt with the current
# input pointer, output pointer and the original key as its arguments.
# NOTE(review): a block count of zero would wrap on the dec below and keep
# looping - presumably callers never pass zero; confirm.
1553.align 16
1554.Lecb_dec_short:
1555 lea ($inp), $arg1
1556 lea ($out), $arg2
1557 lea ($key), $arg3
fe068648 1558 call asm_AES_decrypt
a75a52a4
AP
1559 lea 16($inp), $inp
1560 lea 16($out), $out
1561 dec $len
1562 jnz .Lecb_dec_short
1563
# Common exit for ECB decrypt: zero the stack between %rsp and %rbp (the
# bit-sliced key schedule, if one was allocated) in 32-byte steps before
# the frame is torn down. On the short path %rsp equals %rbp, so exactly
# one 32-byte step is written and the loop exits.
1564.Lecb_dec_done:
1565 lea (%rsp),%rax
1566 pxor %xmm0, %xmm0
1567.Lecb_dec_bzero: # wipe key schedule [if any]
1568 movdqa %xmm0, 0x00(%rax)
1569 movdqa %xmm0, 0x10(%rax)
1570 lea 0x20(%rax), %rax
# AT&T operand order: the flags reflect %rbp - %rax, so "ja" keeps looping
# while the cursor is still below the frame base. The previous "jb" tested
# the opposite condition: it stopped after the first 32 bytes when a
# schedule was present and never stopped on the short path.
1571 cmp %rax, %rbp
1572 ja .Lecb_dec_bzero
1573
# Epilogue for ECB decrypt. 0x78 is the six 8-byte pushes (0x30) plus the
# 0x48 bytes reserved in the prologue, so %rax ends up at the stack
# pointer value from function entry (after the extra 0xa0 adjustment on
# Win64) and the saved registers sit at negative offsets from it.
384e6de4 1574 lea 0x78(%rbp),%rax
b84460ad 1575.cfi_def_cfa %rax,8
a75a52a4
AP
1576___
# Win64 only: reload %xmm6-%xmm15 and step over the extra 0xa0 bytes.
1577$code.=<<___ if ($win64);
1578 movaps 0x40(%rbp), %xmm6
1579 movaps 0x50(%rbp), %xmm7
1580 movaps 0x60(%rbp), %xmm8
1581 movaps 0x70(%rbp), %xmm9
1582 movaps 0x80(%rbp), %xmm10
1583 movaps 0x90(%rbp), %xmm11
1584 movaps 0xa0(%rbp), %xmm12
1585 movaps 0xb0(%rbp), %xmm13
1586 movaps 0xc0(%rbp), %xmm14
1587 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
1588 lea 0xa0(%rax), %rax
1589.Lecb_dec_tail:
a75a52a4
AP
1590___
1591$code.=<<___;
384e6de4 1592 mov -48(%rax), %r15
b84460ad 1593.cfi_restore %r15
384e6de4 1594 mov -40(%rax), %r14
b84460ad 1595.cfi_restore %r14
384e6de4 1596 mov -32(%rax), %r13
b84460ad 1597.cfi_restore %r13
384e6de4 1598 mov -24(%rax), %r12
b84460ad 1599.cfi_restore %r12
384e6de4 1600 mov -16(%rax), %rbx
b84460ad 1601.cfi_restore %rbx
384e6de4 1602 mov -8(%rax), %rbp
b84460ad 1603.cfi_restore %rbp
384e6de4 1604 lea (%rax), %rsp # restore %rsp
b84460ad 1605.cfi_def_cfa_register %rsp
a75a52a4
AP
1606.Lecb_dec_epilogue:
1607 ret
b84460ad 1608.cfi_endproc
a75a52a4
AP
1609.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1610___
1611}
# bsaes_cbc_encrypt: dispatch, frame set-up and key schedule conversion.
# Calls with a non-zero sixth argument (the direction flag) or with fewer
# than 128 bytes are handed straight to asm_AES_cbc_encrypt by a jump made
# before any stack frame exists; everything after that point is the
# bit-sliced CBC decrypt path (labels .Lcbc_dec_*).
1612$code.=<<___;
fe068648 1613.extern asm_AES_cbc_encrypt
a75a52a4
AP
1614.globl bsaes_cbc_encrypt
1615.type bsaes_cbc_encrypt,\@abi-omnipotent
1616.align 16
1617bsaes_cbc_encrypt:
b84460ad 1618.cfi_startproc
a75a52a4
AP
1619___
1620$code.=<<___ if ($win64);
1621 mov 48(%rsp),$arg6 # pull direction flag
1622___
1623$code.=<<___;
1624 cmp \$0,$arg6
fe068648 1625 jne asm_AES_cbc_encrypt
a75a52a4 1626 cmp \$128,$arg3
fe068648 1627 jb asm_AES_cbc_encrypt
a75a52a4 1628
fe068648
AP
1629 mov %rsp, %rax
1630.Lcbc_dec_prologue:
a75a52a4 1631 push %rbp
b84460ad 1632.cfi_push %rbp
a75a52a4 1633 push %rbx
b84460ad 1634.cfi_push %rbx
a75a52a4 1635 push %r12
b84460ad 1636.cfi_push %r12
a75a52a4 1637 push %r13
b84460ad 1638.cfi_push %r13
a75a52a4 1639 push %r14
b84460ad 1640.cfi_push %r14
a75a52a4 1641 push %r15
b84460ad 1642.cfi_push %r15
a75a52a4 1643 lea -0x48(%rsp), %rsp
b84460ad 1644.cfi_adjust_cfa_offset 0x48
a75a52a4
AP
1645___
# Win64 only: fetch the fifth argument from the caller's stack, then grow
# the frame by another 0xa0 bytes and save %xmm6-%xmm15.
1646$code.=<<___ if ($win64);
1647 mov 0xa0(%rsp),$arg5 # pull ivp
1648 lea -0xa0(%rsp), %rsp
1649 movaps %xmm6, 0x40(%rsp)
1650 movaps %xmm7, 0x50(%rsp)
1651 movaps %xmm8, 0x60(%rsp)
1652 movaps %xmm9, 0x70(%rsp)
1653 movaps %xmm10, 0x80(%rsp)
1654 movaps %xmm11, 0x90(%rsp)
1655 movaps %xmm12, 0xa0(%rsp)
1656 movaps %xmm13, 0xb0(%rsp)
1657 movaps %xmm14, 0xc0(%rsp)
1658 movaps %xmm15, 0xd0(%rsp)
1659.Lcbc_dec_body:
1660___
1661$code.=<<___;
1662 mov %rsp, %rbp # backup %rsp
b84460ad 1663.cfi_def_cfa_register %rbp
a75a52a4
AP
1664 mov 240($arg4), %eax # rounds
1665 mov $arg1, $inp # backup arguments
1666 mov $arg2, $out
1667 mov $arg3, $len
1668 mov $arg4, $key
# %rbx keeps the IV pointer for the whole routine, so the round count
# lives in %edx here (the ECB and CTR routines keep it in %ebx instead).
60d4e99c 1669 mov $arg5, %rbx
a75a52a4
AP
1670 shr \$4, $len # bytes to blocks
1671
60d4e99c 1672 mov %eax, %edx # rounds
a75a52a4
AP
1673 shl \$7, %rax # 128 bytes per inner round key
1674 sub \$`128-32`, %rax # size of bit-sliced key schedule
1675 sub %rax, %rsp
1676
1677 mov %rsp, %rax # pass key schedule
1678 mov $key, %rcx # pass key
60d4e99c 1679 mov %edx, %r10d # pass rounds
a75a52a4
AP
1680 call _bsaes_key_convert
1681 pxor (%rsp),%xmm7 # fix up 0 round key
1682 movdqa %xmm6,(%rax) # save last round key
1683 movdqa %xmm7,(%rsp)
1684
60d4e99c 1685 movdqu (%rbx), @XMM[15] # load IV
a75a52a4
AP
# Main CBC decrypt loop: eight blocks per iteration. The current IV is
# parked at 0x20(%rbp) across the call; afterwards result block 0 is XORed
# with it and result block i with ciphertext block i-1, which is re-read
# from the input. The eighth ciphertext block becomes the next IV. The
# input pointer is advanced only after the re-loads.
1686 sub \$8,$len
1687.Lcbc_dec_loop:
1688 movdqu 0x00($inp), @XMM[0] # load input
1689 movdqu 0x10($inp), @XMM[1]
1690 movdqu 0x20($inp), @XMM[2]
1691 movdqu 0x30($inp), @XMM[3]
1692 movdqu 0x40($inp), @XMM[4]
1693 movdqu 0x50($inp), @XMM[5]
1694 mov %rsp, %rax # pass key schedule
1695 movdqu 0x60($inp), @XMM[6]
60d4e99c 1696 mov %edx,%r10d # pass rounds
a75a52a4
AP
1697 movdqu 0x70($inp), @XMM[7]
1698 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1699
1700 call _bsaes_decrypt8
1701
1702 pxor 0x20(%rbp), @XMM[0] # ^= IV
1703 movdqu 0x00($inp), @XMM[8] # re-load input
1704 movdqu 0x10($inp), @XMM[9]
1705 pxor @XMM[8], @XMM[1]
1706 movdqu 0x20($inp), @XMM[10]
1707 pxor @XMM[9], @XMM[6]
1708 movdqu 0x30($inp), @XMM[11]
1709 pxor @XMM[10], @XMM[4]
1710 movdqu 0x40($inp), @XMM[12]
1711 pxor @XMM[11], @XMM[2]
1712 movdqu 0x50($inp), @XMM[13]
1713 pxor @XMM[12], @XMM[7]
1714 movdqu 0x60($inp), @XMM[14]
1715 pxor @XMM[13], @XMM[3]
1716 movdqu 0x70($inp), @XMM[15] # IV
1717 pxor @XMM[14], @XMM[5]
1718 movdqu @XMM[0], 0x00($out) # write output
1719 lea 0x80($inp), $inp
1720 movdqu @XMM[1], 0x10($out)
1721 movdqu @XMM[6], 0x20($out)
1722 movdqu @XMM[4], 0x30($out)
1723 movdqu @XMM[2], 0x40($out)
1724 movdqu @XMM[7], 0x50($out)
1725 movdqu @XMM[3], 0x60($out)
1726 movdqu @XMM[5], 0x70($out)
1727 lea 0x80($out), $out
1728 sub \$8,$len
1729 jnc .Lcbc_dec_loop
1730
# Remainder handling for CBC decrypt: undo the -8 bias, then process the
# last 1..7 blocks. For two or more blocks the 8-way routine is used and
# the chaining XORs mirror the main loop, with the last ciphertext block
# of the remainder loaded as the IV to hand back. A single block takes
# the scalar asm_AES_decrypt route at .Lcbc_dec_one. Each je below reuses
# the flags of the preceding cmp; the movdqu in between does not modify
# them.
1731 add \$8,$len
1732 jz .Lcbc_dec_done
1733
1734 movdqu 0x00($inp), @XMM[0] # load input
1735 mov %rsp, %rax # pass key schedule
60d4e99c 1736 mov %edx, %r10d # pass rounds
a75a52a4
AP
1737 cmp \$2,$len
1738 jb .Lcbc_dec_one
1739 movdqu 0x10($inp), @XMM[1]
1740 je .Lcbc_dec_two
1741 movdqu 0x20($inp), @XMM[2]
1742 cmp \$4,$len
1743 jb .Lcbc_dec_three
1744 movdqu 0x30($inp), @XMM[3]
1745 je .Lcbc_dec_four
1746 movdqu 0x40($inp), @XMM[4]
1747 cmp \$6,$len
1748 jb .Lcbc_dec_five
1749 movdqu 0x50($inp), @XMM[5]
1750 je .Lcbc_dec_six
# Fall-through: seven blocks remain.
1751 movdqu 0x60($inp), @XMM[6]
1752 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1753 call _bsaes_decrypt8
1754 pxor 0x20(%rbp), @XMM[0] # ^= IV
1755 movdqu 0x00($inp), @XMM[8] # re-load input
1756 movdqu 0x10($inp), @XMM[9]
1757 pxor @XMM[8], @XMM[1]
1758 movdqu 0x20($inp), @XMM[10]
1759 pxor @XMM[9], @XMM[6]
1760 movdqu 0x30($inp), @XMM[11]
1761 pxor @XMM[10], @XMM[4]
1762 movdqu 0x40($inp), @XMM[12]
1763 pxor @XMM[11], @XMM[2]
1764 movdqu 0x50($inp), @XMM[13]
1765 pxor @XMM[12], @XMM[7]
1766 movdqu 0x60($inp), @XMM[15] # IV
1767 pxor @XMM[13], @XMM[3]
1768 movdqu @XMM[0], 0x00($out) # write output
1769 movdqu @XMM[1], 0x10($out)
1770 movdqu @XMM[6], 0x20($out)
1771 movdqu @XMM[4], 0x30($out)
1772 movdqu @XMM[2], 0x40($out)
1773 movdqu @XMM[7], 0x50($out)
1774 movdqu @XMM[3], 0x60($out)
1775 jmp .Lcbc_dec_done
1776.align 16
1777.Lcbc_dec_six:
1778 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1779 call _bsaes_decrypt8
1780 pxor 0x20(%rbp), @XMM[0] # ^= IV
1781 movdqu 0x00($inp), @XMM[8] # re-load input
1782 movdqu 0x10($inp), @XMM[9]
1783 pxor @XMM[8], @XMM[1]
1784 movdqu 0x20($inp), @XMM[10]
1785 pxor @XMM[9], @XMM[6]
1786 movdqu 0x30($inp), @XMM[11]
1787 pxor @XMM[10], @XMM[4]
1788 movdqu 0x40($inp), @XMM[12]
1789 pxor @XMM[11], @XMM[2]
1790 movdqu 0x50($inp), @XMM[15] # IV
1791 pxor @XMM[12], @XMM[7]
1792 movdqu @XMM[0], 0x00($out) # write output
1793 movdqu @XMM[1], 0x10($out)
1794 movdqu @XMM[6], 0x20($out)
1795 movdqu @XMM[4], 0x30($out)
1796 movdqu @XMM[2], 0x40($out)
1797 movdqu @XMM[7], 0x50($out)
1798 jmp .Lcbc_dec_done
1799.align 16
1800.Lcbc_dec_five:
1801 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1802 call _bsaes_decrypt8
1803 pxor 0x20(%rbp), @XMM[0] # ^= IV
1804 movdqu 0x00($inp), @XMM[8] # re-load input
1805 movdqu 0x10($inp), @XMM[9]
1806 pxor @XMM[8], @XMM[1]
1807 movdqu 0x20($inp), @XMM[10]
1808 pxor @XMM[9], @XMM[6]
1809 movdqu 0x30($inp), @XMM[11]
1810 pxor @XMM[10], @XMM[4]
1811 movdqu 0x40($inp), @XMM[15] # IV
1812 pxor @XMM[11], @XMM[2]
1813 movdqu @XMM[0], 0x00($out) # write output
1814 movdqu @XMM[1], 0x10($out)
1815 movdqu @XMM[6], 0x20($out)
1816 movdqu @XMM[4], 0x30($out)
1817 movdqu @XMM[2], 0x40($out)
1818 jmp .Lcbc_dec_done
1819.align 16
1820.Lcbc_dec_four:
1821 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1822 call _bsaes_decrypt8
1823 pxor 0x20(%rbp), @XMM[0] # ^= IV
1824 movdqu 0x00($inp), @XMM[8] # re-load input
1825 movdqu 0x10($inp), @XMM[9]
1826 pxor @XMM[8], @XMM[1]
1827 movdqu 0x20($inp), @XMM[10]
1828 pxor @XMM[9], @XMM[6]
1829 movdqu 0x30($inp), @XMM[15] # IV
1830 pxor @XMM[10], @XMM[4]
1831 movdqu @XMM[0], 0x00($out) # write output
1832 movdqu @XMM[1], 0x10($out)
1833 movdqu @XMM[6], 0x20($out)
1834 movdqu @XMM[4], 0x30($out)
1835 jmp .Lcbc_dec_done
1836.align 16
1837.Lcbc_dec_three:
1838 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1839 call _bsaes_decrypt8
1840 pxor 0x20(%rbp), @XMM[0] # ^= IV
1841 movdqu 0x00($inp), @XMM[8] # re-load input
1842 movdqu 0x10($inp), @XMM[9]
1843 pxor @XMM[8], @XMM[1]
1844 movdqu 0x20($inp), @XMM[15] # IV
1845 pxor @XMM[9], @XMM[6]
1846 movdqu @XMM[0], 0x00($out) # write output
1847 movdqu @XMM[1], 0x10($out)
1848 movdqu @XMM[6], 0x20($out)
1849 jmp .Lcbc_dec_done
1850.align 16
1851.Lcbc_dec_two:
1852 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1853 call _bsaes_decrypt8
1854 pxor 0x20(%rbp), @XMM[0] # ^= IV
1855 movdqu 0x00($inp), @XMM[8] # re-load input
1856 movdqu 0x10($inp), @XMM[15] # IV
1857 pxor @XMM[8], @XMM[1]
1858 movdqu @XMM[0], 0x00($out) # write output
1859 movdqu @XMM[1], 0x10($out)
1860 jmp .Lcbc_dec_done
1861.align 16
# Single remaining block: here 0x20(%rbp) receives the raw decryption
# instead of the IV. The IV register is XORed with it and written out, and
# the ciphertext block loaded before the dispatch becomes the IV to return.
# This relies on the callee leaving the xmm registers untouched, as the
# comment on the call states.
1862.Lcbc_dec_one:
60d4e99c
AP
1863 lea ($inp), $arg1
1864 lea 0x20(%rbp), $arg2 # buffer output
1865 lea ($key), $arg3
fe068648 1866 call asm_AES_decrypt # doesn't touch %xmm
60d4e99c
AP
1867 pxor 0x20(%rbp), @XMM[15] # ^= IV
1868 movdqu @XMM[15], ($out) # write output
1869 movdqa @XMM[0], @XMM[15] # IV
a75a52a4
AP
1870
# Common exit for CBC decrypt: write the updated IV back through the
# pointer kept in %rbx, then zero the stack from %rsp up to %rbp (the
# bit-sliced key schedule) in 32-byte steps.
1871.Lcbc_dec_done:
60d4e99c 1872 movdqu @XMM[15], (%rbx) # return IV
a75a52a4
AP
1873 lea (%rsp), %rax
1874 pxor %xmm0, %xmm0
1875.Lcbc_dec_bzero: # wipe key schedule [if any]
1876 movdqa %xmm0, 0x00(%rax)
1877 movdqa %xmm0, 0x10(%rax)
1878 lea 0x20(%rax), %rax
# AT&T operand order: flags reflect %rbp - %rax; loop while %rax < %rbp.
1879 cmp %rax, %rbp
1880 ja .Lcbc_dec_bzero
1881
# Epilogue for the CBC decrypt path. 0x78 is the six 8-byte pushes (0x30)
# plus the 0x48 bytes reserved in the prologue, so %rax ends up at the
# stack pointer value from function entry (after the extra 0xa0 adjustment
# on Win64) and the saved registers sit at negative offsets from it.
384e6de4 1882 lea 0x78(%rbp),%rax
b84460ad 1883.cfi_def_cfa %rax,8
a75a52a4
AP
1884___
# Win64 only: reload %xmm6-%xmm15 and step over the extra 0xa0 bytes.
1885$code.=<<___ if ($win64);
1886 movaps 0x40(%rbp), %xmm6
1887 movaps 0x50(%rbp), %xmm7
1888 movaps 0x60(%rbp), %xmm8
1889 movaps 0x70(%rbp), %xmm9
1890 movaps 0x80(%rbp), %xmm10
1891 movaps 0x90(%rbp), %xmm11
1892 movaps 0xa0(%rbp), %xmm12
1893 movaps 0xb0(%rbp), %xmm13
1894 movaps 0xc0(%rbp), %xmm14
1895 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
1896 lea 0xa0(%rax), %rax
1897.Lcbc_dec_tail:
a75a52a4
AP
1898___
1899$code.=<<___;
384e6de4 1900 mov -48(%rax), %r15
b84460ad 1901.cfi_restore %r15
384e6de4 1902 mov -40(%rax), %r14
b84460ad 1903.cfi_restore %r14
384e6de4 1904 mov -32(%rax), %r13
b84460ad 1905.cfi_restore %r13
384e6de4 1906 mov -24(%rax), %r12
b84460ad 1907.cfi_restore %r12
384e6de4 1908 mov -16(%rax), %rbx
b84460ad 1909.cfi_restore %rbx
384e6de4 1910 mov -8(%rax), %rbp
b84460ad 1911.cfi_restore %rbp
384e6de4 1912 lea (%rax), %rsp # restore %rsp
b84460ad 1913.cfi_def_cfa_register %rsp
a75a52a4
AP
1914.Lcbc_dec_epilogue:
1915 ret
b84460ad 1916.cfi_endproc
a75a52a4
AP
1917.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1918
4ec93a10
AP
# bsaes_ctr32_encrypt_blocks: entry, frame set-up, key schedule conversion
# and counter preparation. The third argument is a count of 16-byte
# blocks; the fifth points at the 16-byte counter block, which is copied
# to 0x20(%rbp) before the short/long decision is made.
1919.globl bsaes_ctr32_encrypt_blocks
1920.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1921.align 16
1922bsaes_ctr32_encrypt_blocks:
b84460ad 1923.cfi_startproc
fe068648
AP
1924 mov %rsp, %rax
1925.Lctr_enc_prologue:
4ec93a10 1926 push %rbp
b84460ad 1927.cfi_push %rbp
4ec93a10 1928 push %rbx
b84460ad 1929.cfi_push %rbx
4ec93a10 1930 push %r12
b84460ad 1931.cfi_push %r12
4ec93a10 1932 push %r13
b84460ad 1933.cfi_push %r13
4ec93a10 1934 push %r14
b84460ad 1935.cfi_push %r14
4ec93a10 1936 push %r15
b84460ad 1937.cfi_push %r15
4ec93a10 1938 lea -0x48(%rsp), %rsp
b84460ad 1939.cfi_adjust_cfa_offset 0x48
4ec93a10
AP
1940___
# Win64 only: fetch the fifth argument from the caller's stack, then grow
# the frame by another 0xa0 bytes and save %xmm6-%xmm15.
1941$code.=<<___ if ($win64);
1942 mov 0xa0(%rsp),$arg5 # pull ivp
1943 lea -0xa0(%rsp), %rsp
1944 movaps %xmm6, 0x40(%rsp)
1945 movaps %xmm7, 0x50(%rsp)
1946 movaps %xmm8, 0x60(%rsp)
1947 movaps %xmm9, 0x70(%rsp)
1948 movaps %xmm10, 0x80(%rsp)
1949 movaps %xmm11, 0x90(%rsp)
1950 movaps %xmm12, 0xa0(%rsp)
1951 movaps %xmm13, 0xb0(%rsp)
1952 movaps %xmm14, 0xc0(%rsp)
1953 movaps %xmm15, 0xd0(%rsp)
1954.Lctr_enc_body:
1955___
1956$code.=<<___;
1957 mov %rsp, %rbp # backup %rsp
b84460ad 1958.cfi_def_cfa_register %rbp
4ec93a10
AP
1959 movdqu ($arg5), %xmm0 # load counter
1960 mov 240($arg4), %eax # rounds
1961 mov $arg1, $inp # backup arguments
1962 mov $arg2, $out
1963 mov $arg3, $len
1964 mov $arg4, $key
1965 movdqa %xmm0, 0x20(%rbp) # copy counter
# Fewer than 8 blocks: use the one-block-at-a-time path instead.
1966 cmp \$8, $arg3
1967 jb .Lctr_enc_short
1968
# Carve rounds*128-96 bytes out of the stack for the bit-sliced schedule.
1969 mov %eax, %ebx # rounds
1970 shl \$7, %rax # 128 bytes per inner round key
1971 sub \$`128-32`, %rax # size of bit-sliced key schedule
1972 sub %rax, %rsp
1973
1974 mov %rsp, %rax # pass key schedule
1975 mov $key, %rcx # pass key
1976 mov %ebx, %r10d # pass rounds
28507577
AP
1977 call _bsaes_key_convert
1978 pxor %xmm6,%xmm7 # fix up last round key
1979 movdqa %xmm7,(%rax) # save last round key
4ec93a10
AP
1980
# The same shuffle mask (stored 0x20 bytes before .LADD1) is applied to
# both the round-0 key and the counter, and the adjusted key is written
# back over the first schedule entry.
1981 movdqa (%rsp), @XMM[9] # load round0 key
1982 lea .LADD1(%rip), %r11
1983 movdqa 0x20(%rbp), @XMM[0] # counter copy
1984 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1985 pshufb @XMM[8], @XMM[9] # byte swap upper part
1986 pshufb @XMM[8], @XMM[0]
1987 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1988 jmp .Lctr_enc_loop
# Main CTR loop: derive eight consecutive counter values from the saved
# one, run them through the bit-sliced cipher and XOR the result with
# eight input blocks. %r11 points at .LADD1 on entry and is repointed to
# .LBS0 before the call, which is why it is reloaded near the bottom.
1989.align 16
1990.Lctr_enc_loop:
1991 movdqa @XMM[0], 0x20(%rbp) # save counter
1992 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1993 movdqa @XMM[0], @XMM[2]
1994 paddd 0x00(%r11), @XMM[1] # .LADD1
1995 movdqa @XMM[0], @XMM[3]
1996 paddd 0x10(%r11), @XMM[2] # .LADD2
1997 movdqa @XMM[0], @XMM[4]
1998 paddd 0x20(%r11), @XMM[3] # .LADD3
1999 movdqa @XMM[0], @XMM[5]
2000 paddd 0x30(%r11), @XMM[4] # .LADD4
2001 movdqa @XMM[0], @XMM[6]
2002 paddd 0x40(%r11), @XMM[5] # .LADD5
2003 movdqa @XMM[0], @XMM[7]
2004 paddd 0x50(%r11), @XMM[6] # .LADD6
2005 paddd 0x60(%r11), @XMM[7] # .LADD7
2006
2007 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
2008 # to flip byte order in 32-bit counter
2009 movdqa (%rsp), @XMM[9] # round 0 key
2010 lea 0x10(%rsp), %rax # pass key schedule
2011 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
2012 pxor @XMM[9], @XMM[0] # xor with round0 key
2013 pxor @XMM[9], @XMM[1]
4ec93a10 2014 pxor @XMM[9], @XMM[2]
4ec93a10 2015 pxor @XMM[9], @XMM[3]
558ff0f0
AP
2016 pshufb @XMM[8], @XMM[0]
2017 pshufb @XMM[8], @XMM[1]
4ec93a10 2018 pxor @XMM[9], @XMM[4]
4ec93a10 2019 pxor @XMM[9], @XMM[5]
558ff0f0
AP
2020 pshufb @XMM[8], @XMM[2]
2021 pshufb @XMM[8], @XMM[3]
4ec93a10 2022 pxor @XMM[9], @XMM[6]
4ec93a10 2023 pxor @XMM[9], @XMM[7]
558ff0f0
AP
2024 pshufb @XMM[8], @XMM[4]
2025 pshufb @XMM[8], @XMM[5]
4ec93a10 2026 pshufb @XMM[8], @XMM[6]
4ec93a10 2027 pshufb @XMM[8], @XMM[7]
558ff0f0 2028 lea .LBS0(%rip), %r11 # constants table
4ec93a10
AP
2029 mov %ebx,%r10d # pass rounds
2030
2031 call _bsaes_encrypt8_bitslice
2032
# A borrow here means fewer than 8 blocks were left, so only part of the
# keystream just produced is consumed (at .Lctr_enc_loop_done).
2033 sub \$8,$len
2034 jc .Lctr_enc_loop_done
2035
2036 movdqu 0x00($inp), @XMM[8] # load input
2037 movdqu 0x10($inp), @XMM[9]
2038 movdqu 0x20($inp), @XMM[10]
2039 movdqu 0x30($inp), @XMM[11]
2040 movdqu 0x40($inp), @XMM[12]
2041 movdqu 0x50($inp), @XMM[13]
2042 movdqu 0x60($inp), @XMM[14]
2043 movdqu 0x70($inp), @XMM[15]
2044 lea 0x80($inp),$inp
# The first block is XORed into the input register rather than the
# keystream register, freeing register 0 to reload the saved counter.
2045 pxor @XMM[0], @XMM[8]
2046 movdqa 0x20(%rbp), @XMM[0] # load counter
2047 pxor @XMM[9], @XMM[1]
2048 movdqu @XMM[8], 0x00($out) # write output
2049 pxor @XMM[10], @XMM[4]
2050 movdqu @XMM[1], 0x10($out)
2051 pxor @XMM[11], @XMM[6]
2052 movdqu @XMM[4], 0x20($out)
2053 pxor @XMM[12], @XMM[3]
2054 movdqu @XMM[6], 0x30($out)
2055 pxor @XMM[13], @XMM[7]
2056 movdqu @XMM[3], 0x40($out)
2057 pxor @XMM[14], @XMM[2]
2058 movdqu @XMM[7], 0x50($out)
2059 pxor @XMM[15], @XMM[5]
2060 movdqu @XMM[2], 0x60($out)
2061 lea .LADD1(%rip), %r11
2062 movdqu @XMM[5], 0x70($out)
2063 lea 0x80($out), $out
2064 paddd 0x70(%r11), @XMM[0] # .LADD8
# The jnz still tests the result of the "sub 8" above: the SSE moves, xors
# and adds and the lea instructions in between leave the flags unchanged.
2065 jnz .Lctr_enc_loop
2066
2067 jmp .Lctr_enc_done
# Final partial batch for CTR: the keystream for eight blocks is already
# in registers; undo the -8 and XOR only as many input blocks as remain.
# The count is 1..7 here (the loop is only re-entered with a non-zero
# count), so the first block is handled unconditionally. Each je below
# reuses the flags of the preceding cmp; the SSE instructions in between
# do not modify them.
2068.align 16
2069.Lctr_enc_loop_done:
d127ef78 2070 add \$8, $len
4ec93a10
AP
2071 movdqu 0x00($inp), @XMM[8] # load input
2072 pxor @XMM[8], @XMM[0]
2073 movdqu @XMM[0], 0x00($out) # write output
2074 cmp \$2,$len
2075 jb .Lctr_enc_done
2076 movdqu 0x10($inp), @XMM[9]
2077 pxor @XMM[9], @XMM[1]
2078 movdqu @XMM[1], 0x10($out)
2079 je .Lctr_enc_done
2080 movdqu 0x20($inp), @XMM[10]
2081 pxor @XMM[10], @XMM[4]
2082 movdqu @XMM[4], 0x20($out)
2083 cmp \$4,$len
2084 jb .Lctr_enc_done
2085 movdqu 0x30($inp), @XMM[11]
2086 pxor @XMM[11], @XMM[6]
2087 movdqu @XMM[6], 0x30($out)
2088 je .Lctr_enc_done
2089 movdqu 0x40($inp), @XMM[12]
2090 pxor @XMM[12], @XMM[3]
2091 movdqu @XMM[3], 0x40($out)
2092 cmp \$6,$len
2093 jb .Lctr_enc_done
2094 movdqu 0x50($inp), @XMM[13]
2095 pxor @XMM[13], @XMM[7]
2096 movdqu @XMM[7], 0x50($out)
2097 je .Lctr_enc_done
2098 movdqu 0x60($inp), @XMM[14]
2099 pxor @XMM[14], @XMM[2]
2100 movdqu @XMM[2], 0x60($out)
2101 jmp .Lctr_enc_done
2102
# Short-input path for CTR (fewer than 8 blocks in total): encrypt the
# counter block at 0x20(%rbp) into 0x30(%rbp) with asm_AES_encrypt, XOR
# that with one input block, then increment the last 32-bit word of the
# counter (byte-swapped around the inc).
# NOTE(review): a block count of zero would wrap on the dec below and keep
# looping - presumably callers never pass zero; confirm.
2103.align 16
2104.Lctr_enc_short:
2105 lea 0x20(%rbp), $arg1
2106 lea 0x30(%rbp), $arg2
2107 lea ($key), $arg3
fe068648 2108 call asm_AES_encrypt
4ec93a10
AP
2109 movdqu ($inp), @XMM[1]
2110 lea 16($inp), $inp
2111 mov 0x2c(%rbp), %eax # load 32-bit counter
2112 bswap %eax
2113 pxor 0x30(%rbp), @XMM[1]
2114 inc %eax # increment
2115 movdqu @XMM[1], ($out)
2116 bswap %eax
2117 lea 16($out), $out
# No key schedule was allocated on this path, so %rsp still equals %rbp
# and this store hits the same word that was loaded through %rbp above.
2118 mov %eax, 0x2c(%rsp) # save 32-bit counter
2119 dec $len
2120 jnz .Lctr_enc_short
2121
# Common exit for CTR: zero the stack from %rsp up to %rbp in 32-byte
# steps. On the short path %rsp equals %rbp, so a single step is written.
2122.Lctr_enc_done:
2123 lea (%rsp), %rax
2124 pxor %xmm0, %xmm0
2125.Lctr_enc_bzero: # wipe key schedule [if any]
2126 movdqa %xmm0, 0x00(%rax)
2127 movdqa %xmm0, 0x10(%rax)
2128 lea 0x20(%rax), %rax
# AT&T operand order: flags reflect %rbp - %rax; loop while %rax < %rbp.
2129 cmp %rax, %rbp
2130 ja .Lctr_enc_bzero
2131
# Epilogue for CTR. 0x78 is the six 8-byte pushes (0x30) plus the 0x48
# bytes reserved in the prologue, so %rax ends up at the stack pointer
# value from function entry (after the extra 0xa0 adjustment on Win64) and
# the saved registers sit at negative offsets from it.
384e6de4 2132 lea 0x78(%rbp),%rax
b84460ad 2133.cfi_def_cfa %rax,8
4ec93a10
AP
2134___
# Win64 only: reload %xmm6-%xmm15 and step over the extra 0xa0 bytes.
2135$code.=<<___ if ($win64);
2136 movaps 0x40(%rbp), %xmm6
2137 movaps 0x50(%rbp), %xmm7
2138 movaps 0x60(%rbp), %xmm8
2139 movaps 0x70(%rbp), %xmm9
2140 movaps 0x80(%rbp), %xmm10
2141 movaps 0x90(%rbp), %xmm11
2142 movaps 0xa0(%rbp), %xmm12
2143 movaps 0xb0(%rbp), %xmm13
2144 movaps 0xc0(%rbp), %xmm14
2145 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
2146 lea 0xa0(%rax), %rax
2147.Lctr_enc_tail:
4ec93a10
AP
2148___
2149$code.=<<___;
384e6de4 2150 mov -48(%rax), %r15
b84460ad 2151.cfi_restore %r15
384e6de4 2152 mov -40(%rax), %r14
b84460ad 2153.cfi_restore %r14
384e6de4 2154 mov -32(%rax), %r13
b84460ad 2155.cfi_restore %r13
384e6de4 2156 mov -24(%rax), %r12
b84460ad 2157.cfi_restore %r12
384e6de4 2158 mov -16(%rax), %rbx
b84460ad 2159.cfi_restore %rbx
384e6de4 2160 mov -8(%rax), %rbp
b84460ad 2161.cfi_restore %rbp
384e6de4 2162 lea (%rax), %rsp # restore %rsp
b84460ad 2163.cfi_def_cfa_register %rsp
4ec93a10
AP
2164.Lctr_enc_epilogue:
2165 ret
b84460ad 2166.cfi_endproc
4ec93a10
AP
2167.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2168___
60d4e99c
AP
2169######################################################################
2170# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2171# const AES_KEY *key1, const AES_KEY *key2,
2172# const unsigned char iv[16]);
2173#
# Aliases for the last three vector registers, used by the XTS tweak
# computation (mask constant, carry/residue scratch, sign-broadcast temp).
2174my ($twmask,$twres,$twtmp)=@XMM[13..15];
7e1e3334
AP
# Drop the trailing "d" so $arg6 names the full 64-bit register (%r9 or
# %r11); the XTS routines use it to carry a pointer (the IV).
2175$arg6=~s/d$//;
2176
60d4e99c
AP
2177$code.=<<___;
2178.globl bsaes_xts_encrypt
2179.type bsaes_xts_encrypt,\@abi-omnipotent
2180.align 16
2181bsaes_xts_encrypt:
b84460ad 2182.cfi_startproc
fe068648
AP
2183 mov %rsp, %rax
2184.Lxts_enc_prologue:
60d4e99c 2185 push %rbp
b84460ad 2186.cfi_push %rbp
60d4e99c 2187 push %rbx
b84460ad 2188.cfi_push %rbx
60d4e99c 2189 push %r12
b84460ad 2190.cfi_push %r12
60d4e99c 2191 push %r13
b84460ad 2192.cfi_push %r13
60d4e99c 2193 push %r14
b84460ad 2194.cfi_push %r14
60d4e99c 2195 push %r15
b84460ad 2196.cfi_push %r15
60d4e99c 2197 lea -0x48(%rsp), %rsp
b84460ad 2198.cfi_adjust_cfa_offset 0x48
60d4e99c
AP
2199___
2200$code.=<<___ if ($win64);
2201 mov 0xa0(%rsp),$arg5 # pull key2
2202 mov 0xa8(%rsp),$arg6 # pull ivp
2203 lea -0xa0(%rsp), %rsp
2204 movaps %xmm6, 0x40(%rsp)
2205 movaps %xmm7, 0x50(%rsp)
2206 movaps %xmm8, 0x60(%rsp)
2207 movaps %xmm9, 0x70(%rsp)
2208 movaps %xmm10, 0x80(%rsp)
2209 movaps %xmm11, 0x90(%rsp)
2210 movaps %xmm12, 0xa0(%rsp)
2211 movaps %xmm13, 0xb0(%rsp)
2212 movaps %xmm14, 0xc0(%rsp)
2213 movaps %xmm15, 0xd0(%rsp)
2214.Lxts_enc_body:
2215___
2216$code.=<<___;
2217 mov %rsp, %rbp # backup %rsp
b84460ad 2218.cfi_def_cfa_register %rbp
60d4e99c
AP
2219 mov $arg1, $inp # backup arguments
2220 mov $arg2, $out
2221 mov $arg3, $len
2222 mov $arg4, $key
2223
2224 lea ($arg6), $arg1
2225 lea 0x20(%rbp), $arg2
2226 lea ($arg5), $arg3
fe068648 2227 call asm_AES_encrypt # generate initial tweak
60d4e99c
AP
2228
2229 mov 240($key), %eax # rounds
2230 mov $len, %rbx # backup $len
2231
2232 mov %eax, %edx # rounds
2233 shl \$7, %rax # 128 bytes per inner round key
2234 sub \$`128-32`, %rax # size of bit-sliced key schedule
2235 sub %rax, %rsp
2236
2237 mov %rsp, %rax # pass key schedule
2238 mov $key, %rcx # pass key
2239 mov %edx, %r10d # pass rounds
2240 call _bsaes_key_convert
2241 pxor %xmm6, %xmm7 # fix up last round key
2242 movdqa %xmm7, (%rax) # save last round key
2243
2244 and \$-16, $len
2245 sub \$0x80, %rsp # place for tweak[8]
2246 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2247
2248 pxor $twtmp, $twtmp
2249 movdqa .Lxts_magic(%rip), $twmask
2250 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2251
2252 sub \$0x80, $len
2253 jc .Lxts_enc_short
2254 jmp .Lxts_enc_loop
2255
2256.align 16
2257.Lxts_enc_loop:
2258___
2259 for ($i=0;$i<7;$i++) {
2260 $code.=<<___;
2261 pshufd \$0x13, $twtmp, $twres
2262 pxor $twtmp, $twtmp
2263 movdqa @XMM[7], @XMM[$i]
2264 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2265 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2266 pand $twmask, $twres # isolate carry and residue
2267 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2268 pxor $twres, @XMM[7]
2269___
2270 $code.=<<___ if ($i>=1);
2271 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2272___
2273 $code.=<<___ if ($i>=2);
2274 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2275___
2276 }
2277$code.=<<___;
2278 movdqu 0x60($inp), @XMM[8+6]
2279 pxor @XMM[8+5], @XMM[5]
2280 movdqu 0x70($inp), @XMM[8+7]
2281 lea 0x80($inp), $inp
2282 movdqa @XMM[7], 0x70(%rsp)
2283 pxor @XMM[8+6], @XMM[6]
2284 lea 0x80(%rsp), %rax # pass key schedule
2285 pxor @XMM[8+7], @XMM[7]
2286 mov %edx, %r10d # pass rounds
2287
2288 call _bsaes_encrypt8
2289
2290 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2291 pxor 0x10(%rsp), @XMM[1]
2292 movdqu @XMM[0], 0x00($out) # write output
2293 pxor 0x20(%rsp), @XMM[4]
2294 movdqu @XMM[1], 0x10($out)
2295 pxor 0x30(%rsp), @XMM[6]
2296 movdqu @XMM[4], 0x20($out)
2297 pxor 0x40(%rsp), @XMM[3]
2298 movdqu @XMM[6], 0x30($out)
2299 pxor 0x50(%rsp), @XMM[7]
2300 movdqu @XMM[3], 0x40($out)
2301 pxor 0x60(%rsp), @XMM[2]
2302 movdqu @XMM[7], 0x50($out)
2303 pxor 0x70(%rsp), @XMM[5]
2304 movdqu @XMM[2], 0x60($out)
2305 movdqu @XMM[5], 0x70($out)
2306 lea 0x80($out), $out
2307
2308 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2309 pxor $twtmp, $twtmp
2310 movdqa .Lxts_magic(%rip), $twmask
2311 pcmpgtd @XMM[7], $twtmp
2312 pshufd \$0x13, $twtmp, $twres
2313 pxor $twtmp, $twtmp
2314 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2315 pand $twmask, $twres # isolate carry and residue
2316 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2317 pxor $twres, @XMM[7]
2318
2319 sub \$0x80,$len
2320 jnc .Lxts_enc_loop
2321
2322.Lxts_enc_short:
2323 add \$0x80, $len
2324 jz .Lxts_enc_done
2325___
2326 for ($i=0;$i<7;$i++) {
2327 $code.=<<___;
2328 pshufd \$0x13, $twtmp, $twres
2329 pxor $twtmp, $twtmp
2330 movdqa @XMM[7], @XMM[$i]
2331 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2332 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2333 pand $twmask, $twres # isolate carry and residue
2334 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2335 pxor $twres, @XMM[7]
2336___
2337 $code.=<<___ if ($i>=1);
2338 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2339 cmp \$`0x10*$i`,$len
2340 je .Lxts_enc_$i
2341___
2342 $code.=<<___ if ($i>=2);
2343 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2344___
2345 }
2346$code.=<<___;
2347 movdqu 0x60($inp), @XMM[8+6]
2348 pxor @XMM[8+5], @XMM[5]
2349 movdqa @XMM[7], 0x70(%rsp)
2350 lea 0x70($inp), $inp
2351 pxor @XMM[8+6], @XMM[6]
2352 lea 0x80(%rsp), %rax # pass key schedule
2353 mov %edx, %r10d # pass rounds
2354
2355 call _bsaes_encrypt8
2356
2357 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2358 pxor 0x10(%rsp), @XMM[1]
2359 movdqu @XMM[0], 0x00($out) # write output
2360 pxor 0x20(%rsp), @XMM[4]
2361 movdqu @XMM[1], 0x10($out)
2362 pxor 0x30(%rsp), @XMM[6]
2363 movdqu @XMM[4], 0x20($out)
2364 pxor 0x40(%rsp), @XMM[3]
2365 movdqu @XMM[6], 0x30($out)
2366 pxor 0x50(%rsp), @XMM[7]
2367 movdqu @XMM[3], 0x40($out)
2368 pxor 0x60(%rsp), @XMM[2]
2369 movdqu @XMM[7], 0x50($out)
2370 movdqu @XMM[2], 0x60($out)
2371 lea 0x70($out), $out
2372
2373 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2374 jmp .Lxts_enc_done
2375.align 16
2376.Lxts_enc_6:
2377 pxor @XMM[8+4], @XMM[4]
2378 lea 0x60($inp), $inp
2379 pxor @XMM[8+5], @XMM[5]
2380 lea 0x80(%rsp), %rax # pass key schedule
2381 mov %edx, %r10d # pass rounds
2382
2383 call _bsaes_encrypt8
2384
2385 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2386 pxor 0x10(%rsp), @XMM[1]
2387 movdqu @XMM[0], 0x00($out) # write output
2388 pxor 0x20(%rsp), @XMM[4]
2389 movdqu @XMM[1], 0x10($out)
2390 pxor 0x30(%rsp), @XMM[6]
2391 movdqu @XMM[4], 0x20($out)
2392 pxor 0x40(%rsp), @XMM[3]
2393 movdqu @XMM[6], 0x30($out)
2394 pxor 0x50(%rsp), @XMM[7]
2395 movdqu @XMM[3], 0x40($out)
2396 movdqu @XMM[7], 0x50($out)
2397 lea 0x60($out), $out
2398
2399 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2400 jmp .Lxts_enc_done
2401.align 16
2402.Lxts_enc_5:
2403 pxor @XMM[8+3], @XMM[3]
2404 lea 0x50($inp), $inp
2405 pxor @XMM[8+4], @XMM[4]
2406 lea 0x80(%rsp), %rax # pass key schedule
2407 mov %edx, %r10d # pass rounds
2408
2409 call _bsaes_encrypt8
2410
2411 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2412 pxor 0x10(%rsp), @XMM[1]
2413 movdqu @XMM[0], 0x00($out) # write output
2414 pxor 0x20(%rsp), @XMM[4]
2415 movdqu @XMM[1], 0x10($out)
2416 pxor 0x30(%rsp), @XMM[6]
2417 movdqu @XMM[4], 0x20($out)
2418 pxor 0x40(%rsp), @XMM[3]
2419 movdqu @XMM[6], 0x30($out)
2420 movdqu @XMM[3], 0x40($out)
2421 lea 0x50($out), $out
2422
2423 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2424 jmp .Lxts_enc_done
2425.align 16
2426.Lxts_enc_4:
2427 pxor @XMM[8+2], @XMM[2]
2428 lea 0x40($inp), $inp
2429 pxor @XMM[8+3], @XMM[3]
2430 lea 0x80(%rsp), %rax # pass key schedule
2431 mov %edx, %r10d # pass rounds
2432
2433 call _bsaes_encrypt8
2434
2435 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2436 pxor 0x10(%rsp), @XMM[1]
2437 movdqu @XMM[0], 0x00($out) # write output
2438 pxor 0x20(%rsp), @XMM[4]
2439 movdqu @XMM[1], 0x10($out)
2440 pxor 0x30(%rsp), @XMM[6]
2441 movdqu @XMM[4], 0x20($out)
2442 movdqu @XMM[6], 0x30($out)
2443 lea 0x40($out), $out
2444
2445 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2446 jmp .Lxts_enc_done
2447.align 16
2448.Lxts_enc_3:
2449 pxor @XMM[8+1], @XMM[1]
2450 lea 0x30($inp), $inp
2451 pxor @XMM[8+2], @XMM[2]
2452 lea 0x80(%rsp), %rax # pass key schedule
2453 mov %edx, %r10d # pass rounds
2454
2455 call _bsaes_encrypt8
2456
2457 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2458 pxor 0x10(%rsp), @XMM[1]
2459 movdqu @XMM[0], 0x00($out) # write output
2460 pxor 0x20(%rsp), @XMM[4]
2461 movdqu @XMM[1], 0x10($out)
2462 movdqu @XMM[4], 0x20($out)
2463 lea 0x30($out), $out
2464
2465 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2466 jmp .Lxts_enc_done
2467.align 16
2468.Lxts_enc_2:
2469 pxor @XMM[8+0], @XMM[0]
2470 lea 0x20($inp), $inp
2471 pxor @XMM[8+1], @XMM[1]
2472 lea 0x80(%rsp), %rax # pass key schedule
2473 mov %edx, %r10d # pass rounds
2474
2475 call _bsaes_encrypt8
2476
2477 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2478 pxor 0x10(%rsp), @XMM[1]
2479 movdqu @XMM[0], 0x00($out) # write output
2480 movdqu @XMM[1], 0x10($out)
2481 lea 0x20($out), $out
2482
2483 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2484 jmp .Lxts_enc_done
2485.align 16
2486.Lxts_enc_1:
2487 pxor @XMM[0], @XMM[8]
2488 lea 0x10($inp), $inp
2489 movdqa @XMM[8], 0x20(%rbp)
2490 lea 0x20(%rbp), $arg1
2491 lea 0x20(%rbp), $arg2
2492 lea ($key), $arg3
fe068648 2493 call asm_AES_encrypt # doesn't touch %xmm
60d4e99c
AP
2494 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2495 #pxor @XMM[8], @XMM[0]
2496 #lea 0x80(%rsp), %rax # pass key schedule
2497 #mov %edx, %r10d # pass rounds
2498 #call _bsaes_encrypt8
2499 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2500 movdqu @XMM[0], 0x00($out) # write output
2501 lea 0x10($out), $out
2502
2503 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2504
2505.Lxts_enc_done:
2506 and \$15, %ebx # %rbx was loaded with len on entry; keep len%16
2507 jz .Lxts_enc_ret # no partial tail block, skip stealing
2508 mov $out, %rdx # %rdx walks the partial output block
2509
2510.Lxts_enc_steal: # ciphertext stealing, one byte per iteration
2511 movzb ($inp), %eax # next byte of the partial input tail
2512 movzb -16(%rdx), %ecx # matching byte of the last full output block
2513 lea 1($inp), $inp
2514 mov %al, -16(%rdx) # tail input byte takes its place in that block
2515 mov %cl, 0(%rdx) # displaced byte becomes partial-block output
2516 lea 1(%rdx), %rdx
2517 sub \$1,%ebx # %ebx counts the remaining tail bytes
2518 jnz .Lxts_enc_steal
2519
2520 movdqu -16($out), @XMM[0] # reload the merged block
2521 lea 0x20(%rbp), $arg1
2522 pxor @XMM[7], @XMM[0] # pre-whiten with the pending tweak
2523 lea 0x20(%rbp), $arg2
2524 movdqa @XMM[0], 0x20(%rbp) # scratch slot is both input and output
2525 lea ($key), $arg3
fe068648 2526 call asm_AES_encrypt # doesn't touch %xmm
60d4e99c
AP
2527 pxor 0x20(%rbp), @XMM[7] # post-whiten with the same tweak
2528 movdqu @XMM[7], -16($out) # overwrite the last full output block
2529
2530.Lxts_enc_ret:
2531 lea (%rsp), %rax
2532 pxor %xmm0, %xmm0
2533.Lxts_enc_bzero: # wipe key schedule [if any]
2534 movdqa %xmm0, 0x00(%rax)
2535 movdqa %xmm0, 0x10(%rax)
2536 lea 0x20(%rax), %rax
2537 cmp %rax, %rbp
2538 ja .Lxts_enc_bzero
2539
384e6de4 2540 lea 0x78(%rbp),%rax
b84460ad 2541.cfi_def_cfa %rax,8
60d4e99c
AP
2542___
2543$code.=<<___ if ($win64);
2544 movaps 0x40(%rbp), %xmm6
2545 movaps 0x50(%rbp), %xmm7
2546 movaps 0x60(%rbp), %xmm8
2547 movaps 0x70(%rbp), %xmm9
2548 movaps 0x80(%rbp), %xmm10
2549 movaps 0x90(%rbp), %xmm11
2550 movaps 0xa0(%rbp), %xmm12
2551 movaps 0xb0(%rbp), %xmm13
2552 movaps 0xc0(%rbp), %xmm14
2553 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
2554 lea 0xa0(%rax), %rax
2555.Lxts_enc_tail:
60d4e99c
AP
2556___
2557$code.=<<___;
384e6de4 2558 mov -48(%rax), %r15
b84460ad 2559.cfi_restore %r15
384e6de4 2560 mov -40(%rax), %r14
b84460ad 2561.cfi_restore %r14
384e6de4 2562 mov -32(%rax), %r13
b84460ad 2563.cfi_restore %r13
384e6de4 2564 mov -24(%rax), %r12
b84460ad 2565.cfi_restore %r12
384e6de4 2566 mov -16(%rax), %rbx
b84460ad 2567.cfi_restore %rbx
384e6de4 2568 mov -8(%rax), %rbp
b84460ad 2569.cfi_restore %rbp
384e6de4 2570 lea (%rax), %rsp # restore %rsp
b84460ad 2571.cfi_def_cfa_register %rsp
60d4e99c
AP
2572.Lxts_enc_epilogue:
2573 ret
b84460ad 2574.cfi_endproc
60d4e99c
AP
2575.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2576
2577.globl bsaes_xts_decrypt
2578.type bsaes_xts_decrypt,\@abi-omnipotent
2579.align 16
2580bsaes_xts_decrypt:
b84460ad 2581.cfi_startproc
fe068648
AP
2582 mov %rsp, %rax
2583.Lxts_dec_prologue:
60d4e99c 2584 push %rbp
b84460ad 2585.cfi_push %rbp
60d4e99c 2586 push %rbx
b84460ad 2587.cfi_push %rbx
60d4e99c 2588 push %r12
b84460ad 2589.cfi_push %r12
60d4e99c 2590 push %r13
b84460ad 2591.cfi_push %r13
60d4e99c 2592 push %r14
b84460ad 2593.cfi_push %r14
60d4e99c 2594 push %r15
b84460ad 2595.cfi_push %r15
60d4e99c 2596 lea -0x48(%rsp), %rsp
b84460ad 2597.cfi_adjust_cfa_offset 0x48
60d4e99c
AP
2598___
2599$code.=<<___ if ($win64);
2600 mov 0xa0(%rsp),$arg5 # pull key2
2601 mov 0xa8(%rsp),$arg6 # pull ivp
2602 lea -0xa0(%rsp), %rsp
2603 movaps %xmm6, 0x40(%rsp)
2604 movaps %xmm7, 0x50(%rsp)
2605 movaps %xmm8, 0x60(%rsp)
2606 movaps %xmm9, 0x70(%rsp)
2607 movaps %xmm10, 0x80(%rsp)
2608 movaps %xmm11, 0x90(%rsp)
2609 movaps %xmm12, 0xa0(%rsp)
2610 movaps %xmm13, 0xb0(%rsp)
2611 movaps %xmm14, 0xc0(%rsp)
2612 movaps %xmm15, 0xd0(%rsp)
2613.Lxts_dec_body:
2614___
2615$code.=<<___;
2616 mov %rsp, %rbp # backup %rsp
.cfi_def_cfa_register %rbp # %rsp moves below (key schedule, tweaks); track CFA via %rbp like bsaes_xts_encrypt
2617 mov $arg1, $inp # backup arguments
2618 mov $arg2, $out
2619 mov $arg3, $len
2620 mov $arg4, $key
2621
2622 lea ($arg6), $arg1 # input: caller's iv
2623 lea 0x20(%rbp), $arg2 # output: scratch slot in the frame
2624 lea ($arg5), $arg3 # key2
fe068648 2625 call asm_AES_encrypt # generate initial tweak
60d4e99c
AP
2626
2627 mov 240($key), %eax # rounds
2628 mov $len, %rbx # backup $len
2629
2630 mov %eax, %edx # rounds
2631 shl \$7, %rax # 128 bytes per inner round key
2632 sub \$`128-32`, %rax # size of bit-sliced key schedule
2633 sub %rax, %rsp
2634
2635 mov %rsp, %rax # pass key schedule
2636 mov $key, %rcx # pass key
2637 mov %edx, %r10d # pass rounds
2638 call _bsaes_key_convert
2639 pxor (%rsp), %xmm7 # fix up round 0 key
2640 movdqa %xmm6, (%rax) # save last round key
2641 movdqa %xmm7, (%rsp)
2642
2643 xor %eax, %eax # if ($len%16) len-=16;
2644 and \$-16, $len
2645 test \$15, %ebx
2646 setnz %al
2647 shl \$4, %rax
2648 sub %rax, $len
2649
2650 sub \$0x80, %rsp # place for tweak[8]
2651 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2652
2653 pxor $twtmp, $twtmp
2654 movdqa .Lxts_magic(%rip), $twmask
2655 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2656
2657 sub \$0x80, $len
2658 jc .Lxts_dec_short
2659 jmp .Lxts_dec_loop
2660
2661.align 16
2662.Lxts_dec_loop:
2663___
2664 for ($i=0;$i<7;$i++) {
2665 $code.=<<___;
2666 pshufd \$0x13, $twtmp, $twres
2667 pxor $twtmp, $twtmp
2668 movdqa @XMM[7], @XMM[$i]
2669 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2670 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2671 pand $twmask, $twres # isolate carry and residue
2672 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2673 pxor $twres, @XMM[7]
2674___
2675 $code.=<<___ if ($i>=1);
2676 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2677___
2678 $code.=<<___ if ($i>=2);
2679 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2680___
2681 }
2682$code.=<<___;
2683 movdqu 0x60($inp), @XMM[8+6]
2684 pxor @XMM[8+5], @XMM[5]
2685 movdqu 0x70($inp), @XMM[8+7]
2686 lea 0x80($inp), $inp
2687 movdqa @XMM[7], 0x70(%rsp)
2688 pxor @XMM[8+6], @XMM[6]
2689 lea 0x80(%rsp), %rax # pass key schedule
2690 pxor @XMM[8+7], @XMM[7]
2691 mov %edx, %r10d # pass rounds
2692
2693 call _bsaes_decrypt8
2694
2695 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2696 pxor 0x10(%rsp), @XMM[1]
2697 movdqu @XMM[0], 0x00($out) # write output
2698 pxor 0x20(%rsp), @XMM[6]
2699 movdqu @XMM[1], 0x10($out)
2700 pxor 0x30(%rsp), @XMM[4]
2701 movdqu @XMM[6], 0x20($out)
2702 pxor 0x40(%rsp), @XMM[2]
2703 movdqu @XMM[4], 0x30($out)
2704 pxor 0x50(%rsp), @XMM[7]
2705 movdqu @XMM[2], 0x40($out)
2706 pxor 0x60(%rsp), @XMM[3]
2707 movdqu @XMM[7], 0x50($out)
2708 pxor 0x70(%rsp), @XMM[5]
2709 movdqu @XMM[3], 0x60($out)
2710 movdqu @XMM[5], 0x70($out)
2711 lea 0x80($out), $out
2712
2713 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2714 pxor $twtmp, $twtmp
2715 movdqa .Lxts_magic(%rip), $twmask
2716 pcmpgtd @XMM[7], $twtmp
2717 pshufd \$0x13, $twtmp, $twres
2718 pxor $twtmp, $twtmp
2719 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2720 pand $twmask, $twres # isolate carry and residue
2721 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2722 pxor $twres, @XMM[7]
2723
2724 sub \$0x80,$len
2725 jnc .Lxts_dec_loop
2726
2727.Lxts_dec_short:
2728 add \$0x80, $len
2729 jz .Lxts_dec_done
2730___
2731 for ($i=0;$i<7;$i++) {
2732 $code.=<<___;
2733 pshufd \$0x13, $twtmp, $twres
2734 pxor $twtmp, $twtmp
2735 movdqa @XMM[7], @XMM[$i]
2736 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2737 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2738 pand $twmask, $twres # isolate carry and residue
2739 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2740 pxor $twres, @XMM[7]
2741___
2742 $code.=<<___ if ($i>=1);
2743 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2744 cmp \$`0x10*$i`,$len
2745 je .Lxts_dec_$i
2746___
2747 $code.=<<___ if ($i>=2);
2748 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2749___
2750 }
2751$code.=<<___;
2752 movdqu 0x60($inp), @XMM[8+6]
2753 pxor @XMM[8+5], @XMM[5]
2754 movdqa @XMM[7], 0x70(%rsp)
2755 lea 0x70($inp), $inp
2756 pxor @XMM[8+6], @XMM[6]
2757 lea 0x80(%rsp), %rax # pass key schedule
2758 mov %edx, %r10d # pass rounds
2759
2760 call _bsaes_decrypt8
2761
2762 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2763 pxor 0x10(%rsp), @XMM[1]
2764 movdqu @XMM[0], 0x00($out) # write output
2765 pxor 0x20(%rsp), @XMM[6]
2766 movdqu @XMM[1], 0x10($out)
2767 pxor 0x30(%rsp), @XMM[4]
2768 movdqu @XMM[6], 0x20($out)
2769 pxor 0x40(%rsp), @XMM[2]
2770 movdqu @XMM[4], 0x30($out)
2771 pxor 0x50(%rsp), @XMM[7]
2772 movdqu @XMM[2], 0x40($out)
2773 pxor 0x60(%rsp), @XMM[3]
2774 movdqu @XMM[7], 0x50($out)
2775 movdqu @XMM[3], 0x60($out)
2776 lea 0x70($out), $out
2777
2778 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2779 jmp .Lxts_dec_done
2780.align 16
2781.Lxts_dec_6:
2782 pxor @XMM[8+4], @XMM[4]
2783 lea 0x60($inp), $inp
2784 pxor @XMM[8+5], @XMM[5]
2785 lea 0x80(%rsp), %rax # pass key schedule
2786 mov %edx, %r10d # pass rounds
2787
2788 call _bsaes_decrypt8
2789
2790 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2791 pxor 0x10(%rsp), @XMM[1]
2792 movdqu @XMM[0], 0x00($out) # write output
2793 pxor 0x20(%rsp), @XMM[6]
2794 movdqu @XMM[1], 0x10($out)
2795 pxor 0x30(%rsp), @XMM[4]
2796 movdqu @XMM[6], 0x20($out)
2797 pxor 0x40(%rsp), @XMM[2]
2798 movdqu @XMM[4], 0x30($out)
2799 pxor 0x50(%rsp), @XMM[7]
2800 movdqu @XMM[2], 0x40($out)
2801 movdqu @XMM[7], 0x50($out)
2802 lea 0x60($out), $out
2803
2804 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2805 jmp .Lxts_dec_done
2806.align 16
2807.Lxts_dec_5:
2808 pxor @XMM[8+3], @XMM[3]
2809 lea 0x50($inp), $inp
2810 pxor @XMM[8+4], @XMM[4]
2811 lea 0x80(%rsp), %rax # pass key schedule
2812 mov %edx, %r10d # pass rounds
2813
2814 call _bsaes_decrypt8
2815
2816 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2817 pxor 0x10(%rsp), @XMM[1]
2818 movdqu @XMM[0], 0x00($out) # write output
2819 pxor 0x20(%rsp), @XMM[6]
2820 movdqu @XMM[1], 0x10($out)
2821 pxor 0x30(%rsp), @XMM[4]
2822 movdqu @XMM[6], 0x20($out)
2823 pxor 0x40(%rsp), @XMM[2]
2824 movdqu @XMM[4], 0x30($out)
2825 movdqu @XMM[2], 0x40($out)
2826 lea 0x50($out), $out
2827
2828 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2829 jmp .Lxts_dec_done
2830.align 16
2831.Lxts_dec_4:
2832 pxor @XMM[8+2], @XMM[2]
2833 lea 0x40($inp), $inp
2834 pxor @XMM[8+3], @XMM[3]
2835 lea 0x80(%rsp), %rax # pass key schedule
2836 mov %edx, %r10d # pass rounds
2837
2838 call _bsaes_decrypt8
2839
2840 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2841 pxor 0x10(%rsp), @XMM[1]
2842 movdqu @XMM[0], 0x00($out) # write output
2843 pxor 0x20(%rsp), @XMM[6]
2844 movdqu @XMM[1], 0x10($out)
2845 pxor 0x30(%rsp), @XMM[4]
2846 movdqu @XMM[6], 0x20($out)
2847 movdqu @XMM[4], 0x30($out)
2848 lea 0x40($out), $out
2849
2850 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2851 jmp .Lxts_dec_done
2852.align 16
2853.Lxts_dec_3:
2854 pxor @XMM[8+1], @XMM[1]
2855 lea 0x30($inp), $inp
2856 pxor @XMM[8+2], @XMM[2]
2857 lea 0x80(%rsp), %rax # pass key schedule
2858 mov %edx, %r10d # pass rounds
2859
2860 call _bsaes_decrypt8
2861
2862 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2863 pxor 0x10(%rsp), @XMM[1]
2864 movdqu @XMM[0], 0x00($out) # write output
2865 pxor 0x20(%rsp), @XMM[6]
2866 movdqu @XMM[1], 0x10($out)
2867 movdqu @XMM[6], 0x20($out)
2868 lea 0x30($out), $out
2869
2870 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2871 jmp .Lxts_dec_done
2872.align 16
2873.Lxts_dec_2:
2874 pxor @XMM[8+0], @XMM[0]
2875 lea 0x20($inp), $inp
2876 pxor @XMM[8+1], @XMM[1]
2877 lea 0x80(%rsp), %rax # pass key schedule
2878 mov %edx, %r10d # pass rounds
2879
2880 call _bsaes_decrypt8
2881
2882 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2883 pxor 0x10(%rsp), @XMM[1]
2884 movdqu @XMM[0], 0x00($out) # write output
2885 movdqu @XMM[1], 0x10($out)
2886 lea 0x20($out), $out
2887
2888 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2889 jmp .Lxts_dec_done
2890.align 16
2891.Lxts_dec_1:
2892 pxor @XMM[0], @XMM[8]
2893 lea 0x10($inp), $inp
2894 movdqa @XMM[8], 0x20(%rbp)
2895 lea 0x20(%rbp), $arg1
2896 lea 0x20(%rbp), $arg2
2897 lea ($key), $arg3
fe068648 2898 call asm_AES_decrypt # doesn't touch %xmm
60d4e99c
AP
2899 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2900 #pxor @XMM[8], @XMM[0]
2901 #lea 0x80(%rsp), %rax # pass key schedule
2902 #mov %edx, %r10d # pass rounds
2903 #call _bsaes_decrypt8
2904 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2905 movdqu @XMM[0], 0x00($out) # write output
2906 lea 0x10($out), $out
2907
2908 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2909
2910.Lxts_dec_done:
2911 and \$15, %ebx # %rbx was loaded with len on entry; keep len%16
2912 jz .Lxts_dec_ret # no partial tail, nothing to steal
2913
2914 pxor $twtmp, $twtmp
2915 movdqa .Lxts_magic(%rip), $twmask
2916 pcmpgtd @XMM[7], $twtmp
2917 pshufd \$0x13, $twtmp, $twres
2918 movdqa @XMM[7], @XMM[6] # keep the current tweak for the final block
2919 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2920 pand $twmask, $twres # isolate carry and residue
2921 movdqu ($inp), @XMM[0] # last full input block (held back earlier when len%16 != 0)
2922 pxor $twres, @XMM[7] # following tweak, applied to this block first
2923
2924 lea 0x20(%rbp), $arg1
2925 pxor @XMM[7], @XMM[0] # pre-whiten with the following tweak
2926 lea 0x20(%rbp), $arg2
2927 movdqa @XMM[0], 0x20(%rbp) # scratch slot is both input and output
2928 lea ($key), $arg3
fe068648 2929 call asm_AES_decrypt # doesn't touch %xmm
60d4e99c
AP
2930 pxor 0x20(%rbp), @XMM[7] # post-whiten with the same tweak
2931 mov $out, %rdx # %rdx walks the output for the byte swap
2932 movdqu @XMM[7], ($out) # provisional block; its head is replaced below
2933
2934.Lxts_dec_steal: # ciphertext stealing, one byte per iteration
2935 movzb 16($inp), %eax # next byte of the partial input tail
2936 movzb (%rdx), %ecx # matching byte of the provisional block
2937 lea 1($inp), $inp
2938 mov %al, (%rdx) # tail input byte takes its place
2939 mov %cl, 16(%rdx) # displaced byte becomes partial-block output
2940 lea 1(%rdx), %rdx
2941 sub \$1,%ebx # %ebx counts the remaining tail bytes
2942 jnz .Lxts_dec_steal
2943
2944 movdqu ($out), @XMM[0] # reload the merged block
2945 lea 0x20(%rbp), $arg1
2946 pxor @XMM[6], @XMM[0] # pre-whiten with the saved current tweak
2947 lea 0x20(%rbp), $arg2
2948 movdqa @XMM[0], 0x20(%rbp)
2949 lea ($key), $arg3
fe068648 2950 call asm_AES_decrypt # doesn't touch %xmm
60d4e99c
AP
2951 pxor 0x20(%rbp), @XMM[6] # post-whiten with the same tweak
2952 movdqu @XMM[6], ($out) # final value of the last full output block
2953
2953
2954.Lxts_dec_ret:
2955 lea (%rsp), %rax
2956 pxor %xmm0, %xmm0
2957.Lxts_dec_bzero: # wipe key schedule [if any]
2958 movdqa %xmm0, 0x00(%rax)
2959 movdqa %xmm0, 0x10(%rax)
2960 lea 0x20(%rax), %rax
2961 cmp %rax, %rbp
2962 ja .Lxts_dec_bzero
2963
384e6de4 2964 lea 0x78(%rbp),%rax
b84460ad 2965.cfi_def_cfa %rax,8
60d4e99c
AP
2966___
2967$code.=<<___ if ($win64);
2968 movaps 0x40(%rbp), %xmm6
2969 movaps 0x50(%rbp), %xmm7
2970 movaps 0x60(%rbp), %xmm8
2971 movaps 0x70(%rbp), %xmm9
2972 movaps 0x80(%rbp), %xmm10
2973 movaps 0x90(%rbp), %xmm11
2974 movaps 0xa0(%rbp), %xmm12
2975 movaps 0xb0(%rbp), %xmm13
2976 movaps 0xc0(%rbp), %xmm14
2977 movaps 0xd0(%rbp), %xmm15
384e6de4
AP
2978 lea 0xa0(%rax), %rax
2979.Lxts_dec_tail:
60d4e99c
AP
2980___
2981$code.=<<___;
384e6de4 2982 mov -48(%rax), %r15
b84460ad 2983.cfi_restore %r15
384e6de4 2984 mov -40(%rax), %r14
b84460ad 2985.cfi_restore %r14
384e6de4 2986 mov -32(%rax), %r13
b84460ad 2987.cfi_restore %r13
384e6de4 2988 mov -24(%rax), %r12
b84460ad 2989.cfi_restore %r12
384e6de4 2990 mov -16(%rax), %rbx
b84460ad 2991.cfi_restore %rbx
384e6de4 2992 mov -8(%rax), %rbp
b84460ad 2993.cfi_restore %rbp
384e6de4 2994 lea (%rax), %rsp # restore %rsp
b84460ad 2995.cfi_def_cfa_register %rsp
60d4e99c
AP
2996.Lxts_dec_epilogue:
2997 ret
b84460ad 2998.cfi_endproc
60d4e99c
AP
2999.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
3000___
4ec93a10
AP
3001}
3002$code.=<<___;
a75a52a4 3003.type _bsaes_const,\@object
4ec93a10 3004.align 64
a75a52a4 3005_bsaes_const:
28507577
AP
3006.LM0ISR: # InvShiftRows constants
3007 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
3008.LISRM0:
3009 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
3010.LISR:
3011 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
5a326467
AP
3012.LBS0: # bit-slice constants
3013 .quad 0x5555555555555555, 0x5555555555555555
3014.LBS1:
3015 .quad 0x3333333333333333, 0x3333333333333333
3016.LBS2:
3017 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3018.LSR: # shiftrows constants
3019 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
3020.LSRM0:
3021 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
5a326467
AP
3022.LM0SR:
3023 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
5a326467 3024.LSWPUP: # byte-swap upper dword
4ec93a10
AP
3025 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
3026.LSWPUPM0SR:
3027 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
5a326467
AP
3028.LADD1: # counter increment constants
3029 .quad 0x0000000000000000, 0x0000000100000000
3030.LADD2:
3031 .quad 0x0000000000000000, 0x0000000200000000
3032.LADD3:
3033 .quad 0x0000000000000000, 0x0000000300000000
3034.LADD4:
3035 .quad 0x0000000000000000, 0x0000000400000000
3036.LADD5:
3037 .quad 0x0000000000000000, 0x0000000500000000
3038.LADD6:
3039 .quad 0x0000000000000000, 0x0000000600000000
3040.LADD7:
3041 .quad 0x0000000000000000, 0x0000000700000000
3042.LADD8:
3043 .quad 0x0000000000000000, 0x0000000800000000
60d4e99c
AP
3044.Lxts_magic: # mask pand-ed into the tweak doubling step: residue 0x87, carry 1
3045 .long 0x87,0,1,0
f9ef874a
AP
3046.Lmasks:
3047 .quad 0x0101010101010101, 0x0101010101010101
3048 .quad 0x0202020202020202, 0x0202020202020202
3049 .quad 0x0404040404040404, 0x0404040404040404
3050 .quad 0x0808080808080808, 0x0808080808080808
3051.LM0:
3052 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
3053.L63:
3054 .quad 0x6363636363636363, 0x6363636363636363
a75a52a4 3055.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
4ec93a10 3056.align 64
a75a52a4 3057.size _bsaes_const,.-_bsaes_const
4ec93a10
AP
3058___
3059
fe068648
AP
3060# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3061# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3062if ($win64) {
3063$rec="%rcx";
3064$frame="%rdx";
3065$context="%r8";
3066$disp="%r9";
3067
3068$code.=<<___;
3069.extern __imp_RtlVirtualUnwind
3070.type se_handler,\@abi-omnipotent
3071.align 16
3072se_handler:
3073 push %rsi
3074 push %rdi
3075 push %rbx
3076 push %rbp
3077 push %r12
3078 push %r13
3079 push %r14
3080 push %r15
3081 pushfq
3082 sub \$64,%rsp
3083
3084 mov 120($context),%rax # pull context->Rax
3085 mov 248($context),%rbx # pull context->Rip
3086
3087 mov 8($disp),%rsi # disp->ImageBase
3088 mov 56($disp),%r11 # disp->HandlerData
3089
3090 mov 0(%r11),%r10d # HandlerData[0]
3091 lea (%rsi,%r10),%r10 # prologue label
384e6de4
AP
3092 cmp %r10,%rbx # context->Rip<=prologue label
3093 jbe .Lin_prologue
fe068648
AP
3094
3095 mov 4(%r11),%r10d # HandlerData[1]
3096 lea (%rsi,%r10),%r10 # epilogue label
3097 cmp %r10,%rbx # context->Rip>=epilogue label
3098 jae .Lin_prologue
3099
384e6de4
AP
3100 mov 8(%r11),%r10d # HandlerData[2]
3101 lea (%rsi,%r10),%r10 # tail label
3102 cmp %r10,%rbx # context->Rip>=tail label
3103 jae .Lin_tail # %xmm already reloaded, context->Rax is the frame top
3104
fe068648
AP
3105 mov 160($context),%rax # pull context->Rbp
3106
3107 lea 0x40(%rax),%rsi # %xmm save area
3108 lea 512($context),%rdi # &context.Xmm6
3109 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3110 .long 0xa548f3fc # cld; rep movsq
384e6de4
AP
3111 lea 0xa0+0x78(%rax),%rax # adjust stack pointer
3112
3113.Lin_tail: # %rax = stack pointer value before the prologue pushes
3114 mov -48(%rax),%r15 # slots mirror push order: rbp,rbx,r12,r13,r14,r15
3115 mov -40(%rax),%r14
3116 mov -32(%rax),%r13
3117 mov -24(%rax),%r12
3118 mov -16(%rax),%rbx
3119 mov -8(%rax),%rbp # first push sits right below the frame top
fe068648
AP
3120 mov %rbx,144($context) # restore context->Rbx
3121 mov %rbp,160($context) # restore context->Rbp
3122 mov %r12,216($context) # restore context->R12
3123 mov %r13,224($context) # restore context->R13
3124 mov %r14,232($context) # restore context->R14
3125 mov %r15,240($context) # restore context->R15
3126
3127.Lin_prologue:
3128 mov %rax,152($context) # restore context->Rsp
3129
3130 mov 40($disp),%rdi # disp->ContextRecord
3131 mov $context,%rsi # context
3132 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3133 .long 0xa548f3fc # cld; rep movsq
3134
3135 mov $disp,%rsi
3136 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3137 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3138 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3139 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3140 mov 40(%rsi),%r10 # disp->ContextRecord
3141 lea 56(%rsi),%r11 # &disp->HandlerData
3142 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3143 mov %r10,32(%rsp) # arg5
3144 mov %r11,40(%rsp) # arg6
3145 mov %r12,48(%rsp) # arg7
3146 mov %rcx,56(%rsp) # arg8, (NULL)
3147 call *__imp_RtlVirtualUnwind(%rip)
3148
3149 mov \$1,%eax # ExceptionContinueSearch
3150 add \$64,%rsp
3151 popfq
3152 pop %r15
3153 pop %r14
3154 pop %r13
3155 pop %r12
3156 pop %rbp
3157 pop %rbx
3158 pop %rdi
3159 pop %rsi
3160 ret
3161.size se_handler,.-se_handler
3162
3163.section .pdata
3164.align 4
3165___
3166$code.=<<___ if ($ecb);
3167 .rva .Lecb_enc_prologue
3168 .rva .Lecb_enc_epilogue
3169 .rva .Lecb_enc_info
3170
3171 .rva .Lecb_dec_prologue
3172 .rva .Lecb_dec_epilogue
3173 .rva .Lecb_dec_info
3174___
3175$code.=<<___;
3176 .rva .Lcbc_dec_prologue
3177 .rva .Lcbc_dec_epilogue
3178 .rva .Lcbc_dec_info
3179
3180 .rva .Lctr_enc_prologue
3181 .rva .Lctr_enc_epilogue
3182 .rva .Lctr_enc_info
3183
3184 .rva .Lxts_enc_prologue
3185 .rva .Lxts_enc_epilogue
3186 .rva .Lxts_enc_info
3187
3188 .rva .Lxts_dec_prologue
3189 .rva .Lxts_dec_epilogue
3190 .rva .Lxts_dec_info
3191
3192.section .xdata
3193.align 8
3194___
3195$code.=<<___ if ($ecb);
3196.Lecb_enc_info:
3197 .byte 9,0,0,0
3198 .rva se_handler
3199 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
384e6de4
AP
3200 .rva .Lecb_enc_tail
3201 .long 0
fe068648
AP
3202.Lecb_dec_info:
3203 .byte 9,0,0,0
3204 .rva se_handler
3205 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
384e6de4
AP
3206 .rva .Lecb_dec_tail
3207 .long 0
fe068648
AP
3208___
3209$code.=<<___;
3210.Lcbc_dec_info:
3211 .byte 9,0,0,0
3212 .rva se_handler
3213 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
384e6de4
AP
3214 .rva .Lcbc_dec_tail
3215 .long 0
fe068648
AP
3216.Lctr_enc_info:
3217 .byte 9,0,0,0
3218 .rva se_handler
3219 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
384e6de4
AP
3220 .rva .Lctr_enc_tail
3221 .long 0
fe068648
AP
3222.Lxts_enc_info:
3223 .byte 9,0,0,0
3224 .rva se_handler
3225 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
384e6de4
AP
3226 .rva .Lxts_enc_tail
3227 .long 0
fe068648
AP
3228.Lxts_dec_info:
3229 .byte 9,0,0,0
3230 .rva se_handler
3231 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
384e6de4
AP
3232 .rva .Lxts_dec_tail
3233 .long 0
fe068648
AP
3234___
3235}
3236
4ec93a10
AP
# Expand every back-ticked expression in the accumulated assembly text
# (e.g. constant arithmetic) by evaluating it as Perl.
3237$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3238
3239print $code;
3240
# Buffered write errors only surface at close; fail loudly rather than
# leave a silently truncated assembly file behind.
3241close STDOUT or die "error closing STDOUT: $!";