]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/bsaes-x86_64.pl
Add missing algorithms to disable, and in particular, disable
[thirdparty/openssl.git] / crypto / aes / asm / bsaes-x86_64.pl
CommitLineData
4ec93a10
AP
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
11272648 7### Author: Emilia Käsper and Peter Schwabe ###
4ec93a10
AP
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possible thanks to mixcolumns() modification that
24# allowed to feed its output back to aesenc[last], this was
25# achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31# to skip one shiftrows(), reduce bit-sliced key schedule and
32# speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter values calculation
46# and xor-ing input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations stand for not more
48# than 1% of total time, so comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) Slowdown on Atom is rather strange per se, because original
54# implementation has a number of 9+-bytes instructions, which
55# are bad for Atom front-end, and which I eliminated completely.
56# In attempt to address deterioration sbox() was tested in FP
57# SIMD "domain" (movaps instead of movdqa, xorps instead of
58# pxor, etc.). While it resulted in nominal 4% improvement on
59# Atom, it hurt Westmere by more than a factor of 2.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 410 0.37
69# Nehalem 310 0.35
70# Atom 570 0.26
71#
72# The ratio values mean that 128-byte blocks will be processed
3b7c14bb 73# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
4ec93a10
AP
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
28507577
AP
81# October 2011.
82#
b08259cd
AP
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
28507577 88#
4ec93a10
AP
89# <appro@openssl.org>
90
# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name, in which case no flavour was
# given.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected for nasm/masm/mingw64 flavours or a *.asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The interpreter and script paths are quoted so that
# directories containing spaces survive the shell, and a failure to start
# the pipe is fatal instead of silently discarding the generated code.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
103
104my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
105my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
106
107{
108my ($key,$rounds,$const)=("%rax","%r10d","%r11");
109
sub Sbox {
# Emit the forward S-box as three stages: input basis change, GF(2^8)
# inversion, output basis change.
#   arguments 0..7  : the eight bit-plane registers b0..b7
#   arguments 8..15 : eight scratch registers handed on to Inv_GF256
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @plane   = @_[0..7];
my @scratch = @_[8..15];
	&InBasisChange	(@plane);
	&Inv_GF256	(@plane[6,5,0,3,7,1,4,2], @scratch);
	&OutBasisChange	(@plane[7,1,4,2,6,5,0,3]);
}
120
sub InBasisChange {
# Append the pxor sequence for the S-box input basis change to $code.
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @v = @_[0..7];
# Each entry is a (source, destination) index pair for one pxor; an empty
# entry emits the blank line that separates instruction groups.
my @steps = (
	[6,5], [1,2], [0,3], [2,6], [0,5],
	[],
	[3,6], [7,3], [5,7], [4,3], [5,4], [1,3],
	[],
	[7,2], [5,1],
);
foreach my $step (@steps) {
	if (@$step) {
		my ($src, $dst) = @$step;
		$code .= " pxor $v[$src], $v[$dst]\n";
	} else {
		$code .= "\n";
	}
}
}
143
sub OutBasisChange {
# Append the pxor sequence for the S-box output basis change to $code.
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @v = @_[0..7];
# (source, destination) index pairs, one pxor each; an empty entry emits
# the blank separator line.
my @steps = (
	[6,0], [4,1], [0,2], [6,4], [1,6],
	[],
	[5,1], [3,5], [7,3], [5,7], [5,2],
	[],
	[7,4],
);
foreach my $step (@steps) {
	if (@$step) {
		my ($src, $dst) = @$step;
		$code .= " pxor $v[$src], $v[$dst]\n";
	} else {
		$code .= "\n";
	}
}
}
164
28507577
AP
sub InvSbox {
# Emit the inverse S-box as three stages: inverse input basis change,
# GF(2^8) inversion, inverse output basis change.
#   arguments 0..7  : the eight bit-plane registers b0..b7
#   arguments 8..15 : eight scratch registers handed on to Inv_GF256
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @plane   = @_[0..7];
my @scratch = @_[8..15];
	&InvInBasisChange	(@plane);
	&Inv_GF256		(@plane[5,1,2,6,3,7,0,4], @scratch);
	&InvOutBasisChange	(@plane[3,7,0,4,5,1,2,6]);
}
175
sub InvInBasisChange {		# OutBasisChange in reverse
# The eight arguments are first permuted into the order
# [5,1,2,6,3,7,0,4]; the pxor steps of OutBasisChange are then emitted
# in reverse order on the permuted registers.
my @v = @_[5,1,2,6,3,7,0,4];
# (source, destination) index pairs, one pxor each; an empty entry emits
# the blank separator line.
my @steps = (
	[7,4],
	[],
	[5,7], [5,2], [7,3], [3,5], [5,1],
	[],
	[1,6], [0,2], [6,4], [6,0], [4,1],
);
foreach my $step (@steps) {
	if (@$step) {
		my ($src, $dst) = @$step;
		$code .= " pxor $v[$src], $v[$dst]\n";
	} else {
		$code .= "\n";
	}
}
}
194
sub InvOutBasisChange {		# InBasisChange in reverse
# The eight arguments are first permuted into the order
# [2,5,7,3,6,1,0,4]; the pxor steps of InBasisChange are then emitted in
# reverse order on the permuted registers.
my @v = @_[2,5,7,3,6,1,0,4];
# (source, destination) index pairs, one pxor each; an empty entry emits
# the blank separator line.
my @steps = (
	[5,1], [7,2],
	[],
	[1,3], [5,4], [5,7], [4,3], [0,5], [7,3], [2,6], [1,2], [3,6],
	[],
	[0,3], [6,5],
);
foreach my $step (@steps) {
	if (@$step) {
		my ($src, $dst) = @$step;
		$code .= " pxor $v[$src], $v[$dst]\n";
	} else {
		$code .= "\n";
	}
}
}
215
4ec93a10
AP
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)      *
#;*************************************************************
# Arguments, in order: x0, x1 (overwritten with the result), y0, y1
# (only read) and one temporary register.
my ($lo, $hi, $mlo, $mhi, $tmp) = @_;
foreach my $insn (
	"movdqa $mlo, $tmp",
	"pxor $mhi, $tmp",
	"pand $lo, $tmp",
	"pxor $hi, $lo",
	"pand $mlo, $hi",
	"pand $mhi, $lo",
	"pxor $hi, $lo",
	"pxor $tmp, $hi",
) {
	$code .= " $insn\n";
}
}
232
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same argument order as Mul_GF4 (x0, x1, y0, y1, temporary); only the
# last two pxor instructions differ from Mul_GF4.
my ($lo, $hi, $mlo, $mhi, $tmp) = @_;
foreach my $insn (
	"movdqa $mlo, $tmp",
	"pxor $mhi, $tmp",
	"pand $lo, $tmp",
	"pxor $hi, $lo",
	"pand $mlo, $hi",
	"pand $mhi, $lo",
	"pxor $lo, $hi",
	"pxor $tmp, $lo",
) {
	$code .= " $insn\n";
}
}
247
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# The first five arguments (x0, x1, y0, y1, t0) feed the Mul_GF4_N half,
# the last five (x2, x3, y2, y3, t1) the Mul_GF4 half; the two
# instruction streams are emitted alternately.
my ($p0, $p1, $u0, $u1, $tp,
    $q0, $q1, $v0, $v1, $tq) = @_;
$code.=<<___;
 movdqa $u0, $tp
 movdqa $v0, $tq
 pxor $u1, $tp
 pxor $v1, $tq
 pand $p0, $tp
 pand $q0, $tq
 pxor $p1, $p0
 pxor $q1, $q0
 pand $u0, $p1
 pand $v0, $q1
 pand $u1, $p0
 pand $v1, $q0
 pxor $p0, $p1
 pxor $q1, $q0
 pxor $tp, $p0
 pxor $tq, $q1
___
}
sub Mul_GF16_2 {
# Emit two GF(2^4)-level multiplications that share one multiplier,
# assembled from the Mul_GF4 and Mul_GF4_N_GF4 generators.
#   arguments 0..7   : v0-v3 and v4-v7, the two operands updated in place
#   arguments 8..11  : m0-m3, the shared multiplier; m0/m1 are modified
#                      along the way by the emitted pxor instructions
#   arguments 12..15 : four temporaries
my @v = @_[0..7];
my @m = @_[8..11];
my @w = @_[12..15];
$code.=<<___;
 movdqa $v[0], $w[0]
 movdqa $v[1], $w[1]
___
	&Mul_GF4	($v[0], $v[1], $m[0], $m[1], $w[2]);
$code.=<<___;
 pxor $v[2], $w[0]
 pxor $v[3], $w[1]
 pxor $m[2], $m[0]
 pxor $m[3], $m[1]
___
	&Mul_GF4_N_GF4	($w[0], $w[1], $m[0], $m[1], $w[3],
			 $v[2], $v[3], $m[2], $m[3], $w[2]);
$code.=<<___;
 pxor $w[0], $v[0]
 pxor $w[0], $v[2]
 pxor $w[1], $v[1]
 pxor $w[1], $v[3]

 movdqa $v[4], $w[0]
 movdqa $v[5], $w[1]
 pxor $v[6], $w[0]
 pxor $v[7], $w[1]
___
	&Mul_GF4_N_GF4	($w[0], $w[1], $m[0], $m[1], $w[3],
			 $v[6], $v[7], $m[2], $m[3], $w[2]);
$code.=<<___;
 pxor $m[2], $m[0]
 pxor $m[3], $m[1]
___
	&Mul_GF4	($v[4], $v[5], $m[0], $m[1], $w[3]);
$code.=<<___;
 pxor $w[0], $v[4]
 pxor $w[0], $v[6]
 pxor $w[1], $v[5]
 pxor $w[1], $v[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# Arguments 0..7 are x0-x7, 8..11 are t0-t3 and 12..15 are s0-s3.
# Locally these are @v (x), @p (t) and @q (s).  The instruction order is
# deliberately interleaved; do not reorder.
my @v = @_[0..7];
my @p = @_[8..11];
my @q = @_[12..15];
# direct optimizations from hardware
$code.=<<___;
 movdqa $v[4], $p[3]
 movdqa $v[5], $p[2]
 movdqa $v[1], $p[1]
 movdqa $v[7], $q[1]
 movdqa $v[0], $q[0]

 pxor $v[6], $p[3]
 pxor $v[7], $p[2]
 pxor $v[3], $p[1]
 movdqa $p[3], $q[2]
 pxor $v[6], $q[1]
 movdqa $p[2], $p[0]
 pxor $v[2], $q[0]
 movdqa $p[3], $q[3]

 por $p[1], $p[2]
 por $q[0], $p[3]
 pxor $p[0], $q[3]
 pand $q[0], $q[2]
 pxor $p[1], $q[0]
 pand $p[1], $p[0]
 pand $q[0], $q[3]
 movdqa $v[3], $q[0]
 pxor $v[2], $q[0]
 pand $q[0], $q[1]
 pxor $q[1], $p[3]
 pxor $q[1], $p[2]
 movdqa $v[4], $q[1]
 movdqa $v[1], $q[0]
 pxor $v[5], $q[1]
 pxor $v[0], $q[0]
 movdqa $q[1], $p[1]
 pand $q[0], $q[1]
 por $q[0], $p[1]
 pxor $q[1], $p[0]
 pxor $q[3], $p[3]
 pxor $q[2], $p[2]
 pxor $q[3], $p[1]
 movdqa $v[7], $q[0]
 pxor $q[2], $p[0]
 movdqa $v[6], $q[1]
 pxor $q[2], $p[1]
 movdqa $v[5], $q[2]
 pand $v[3], $q[0]
 movdqa $v[4], $q[3]
 pand $v[2], $q[1]
 pand $v[1], $q[2]
 por $v[0], $q[3]
 pxor $q[0], $p[3]
 pxor $q[1], $p[2]
 pxor $q[2], $p[1]
 pxor $q[3], $p[0]

 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

 # new smaller inversion

 movdqa $p[3], $q[0]
 pand $p[1], $p[3]
 pxor $p[2], $q[0]

 movdqa $p[0], $q[2]
 movdqa $q[0], $q[3]
 pxor $p[3], $q[2]
 pand $q[2], $q[3]

 movdqa $p[1], $q[1]
 pxor $p[2], $q[3]
 pxor $p[0], $q[1]

 pxor $p[2], $p[3]

 pand $p[3], $q[1]

 movdqa $q[2], $p[2]
 pxor $p[0], $q[1]

 pxor $q[1], $p[2]
 pxor $q[1], $p[1]

 pand $p[0], $p[2]

 pxor $p[2], $q[2]
 pxor $p[2], $p[1]

 pand $q[3], $q[2]

 pxor $q[0], $q[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@v, @q[3,2,1], $p[1], $q[0], @p[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
420
421# AES linear components
422
sub ShiftRows {
# Emit code that xors the eight registers given as arguments 0..7 with
# the eight 16-byte values at 0x00..0x70($key), shuffles each register
# with pshufb using the mask register given as the LAST argument, and
# advances $key by 0x80.  $key comes from the enclosing scope.
my @reg  = @_[0..7];
my $perm = pop @_;
$code.=<<___;
 pxor 0x00($key),$reg[0]
 pxor 0x10($key),$reg[1]
 pshufb $perm,$reg[0]
 pxor 0x20($key),$reg[2]
 pshufb $perm,$reg[1]
 pxor 0x30($key),$reg[3]
 pshufb $perm,$reg[2]
 pxor 0x40($key),$reg[4]
 pshufb $perm,$reg[3]
 pxor 0x50($key),$reg[5]
 pshufb $perm,$reg[4]
 pxor 0x60($key),$reg[6]
 pshufb $perm,$reg[5]
 pxor 0x70($key),$reg[7]
 pshufb $perm,$reg[6]
 lea 0x80($key),$key
 pshufb $perm,$reg[7]
___
}
446
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
# Arguments 0..7 are the eight state registers (updated in place),
# arguments 8..15 are eight temporaries.  The pshufd/pxor interleaving
# is intentional; keep the instruction order.
my @r = @_[0..7];
my @w = @_[8..15];
$code.=<<___;
 pshufd \$0x93, $r[0], $w[0] # x0 <<< 32
 pshufd \$0x93, $r[1], $w[1]
 pxor $w[0], $r[0] # x0 ^ (x0 <<< 32)
 pshufd \$0x93, $r[2], $w[2]
 pxor $w[1], $r[1]
 pshufd \$0x93, $r[3], $w[3]
 pxor $w[2], $r[2]
 pshufd \$0x93, $r[4], $w[4]
 pxor $w[3], $r[3]
 pshufd \$0x93, $r[5], $w[5]
 pxor $w[4], $r[4]
 pshufd \$0x93, $r[6], $w[6]
 pxor $w[5], $r[5]
 pshufd \$0x93, $r[7], $w[7]
 pxor $w[6], $r[6]
 pxor $w[7], $r[7]

 pxor $r[0], $w[1]
 pxor $r[7], $w[0]
 pxor $r[7], $w[1]
 pshufd \$0x4E, $r[0], $r[0] # (x0 ^ (x0 <<< 32)) <<< 64)
 pxor $r[1], $w[2]
 pshufd \$0x4E, $r[1], $r[1]
 pxor $r[4], $w[5]
 pxor $w[0], $r[0]
 pxor $r[5], $w[6]
 pxor $w[1], $r[1]
 pxor $r[3], $w[4]
 pshufd \$0x4E, $r[4], $w[0]
 pxor $r[6], $w[7]
 pshufd \$0x4E, $r[5], $w[1]
 pxor $r[2], $w[3]
 pshufd \$0x4E, $r[3], $r[4]
 pxor $r[7], $w[3]
 pshufd \$0x4E, $r[7], $r[5]
 pxor $r[7], $w[4]
 pshufd \$0x4E, $r[6], $r[3]
 pxor $w[4], $w[0]
 pshufd \$0x4E, $r[2], $r[6]
 pxor $w[5], $w[1]

 pxor $w[3], $r[4]
 pxor $w[7], $r[5]
 pxor $w[6], $r[3]
 movdqa $w[0], $r[2]
 pxor $w[2], $r[6]
 movdqa $w[1], $r[7]
___
}
501
28507577
AP
sub InvMixColumns {
# Emit the inverse MixColumns step in four stages, labelled in the
# generated code as multiplication by 0x0e, 0x0b, 0x0d and 0x09.
#   arguments 0..7  : the eight state registers
#   arguments 8..15 : eight temporaries
# The final results are copied from the temporaries into @XMM[0..7]
# (the file-level register list), not into the argument registers.
my @r = @_[0..7];
my @w = @_[8..15];

$code.=<<___;
 # multiplication by 0x0e
 pshufd \$0x93, $r[7], $w[7]
 movdqa $r[2], $w[2]
 pxor $r[5], $r[7] # 7 5
 pxor $r[5], $r[2] # 2 5
 pshufd \$0x93, $r[0], $w[0]
 movdqa $r[5], $w[5]
 pxor $r[0], $r[5] # 5 0 [1]
 pxor $r[1], $r[0] # 0 1
 pshufd \$0x93, $r[1], $w[1]
 pxor $r[2], $r[1] # 1 25
 pxor $r[6], $r[0] # 01 6 [2]
 pxor $r[3], $r[1] # 125 3 [4]
 pshufd \$0x93, $r[3], $w[3]
 pxor $r[0], $r[2] # 25 016 [3]
 pxor $r[7], $r[3] # 3 75
 pxor $r[6], $r[7] # 75 6 [0]
 pshufd \$0x93, $r[6], $w[6]
 movdqa $r[4], $w[4]
 pxor $r[4], $r[6] # 6 4
 pxor $r[3], $r[4] # 4 375 [6]
 pxor $r[7], $r[3] # 375 756=36
 pxor $w[5], $r[6] # 64 5 [7]
 pxor $w[2], $r[3] # 36 2
 pxor $w[4], $r[3] # 362 4 [5]
 pshufd \$0x93, $w[5], $w[5]
___
	# The bracketed numbers in the emitted comments above give the
	# position each register takes in the permuted view used below.
	my @z = @r[7,5,0,2,1,3,4,6];
$code.=<<___;
 # multiplication by 0x0b
 pxor $z[0], $z[1]
 pxor $w[0], $z[0]
 pxor $w[1], $z[1]
 pshufd \$0x93, $w[2], $w[2]
 pxor $w[5], $z[0]
 pxor $w[6], $z[1]
 pxor $w[7], $z[0]
 pshufd \$0x93, $w[4], $w[4]
 pxor $w[6], $w[7] # clobber t[7]
 pxor $z[0], $z[1]

 pxor $w[0], $z[3]
 pshufd \$0x93, $w[0], $w[0]
 pxor $w[1], $z[2]
 pxor $w[1], $z[4]
 pxor $w[2], $z[2]
 pshufd \$0x93, $w[1], $w[1]
 pxor $w[2], $z[3]
 pxor $w[2], $z[5]
 pxor $w[7], $z[2]
 pshufd \$0x93, $w[2], $w[2]
 pxor $w[3], $z[3]
 pxor $w[3], $z[6]
 pxor $w[3], $z[4]
 pshufd \$0x93, $w[3], $w[3]
 pxor $w[4], $z[7]
 pxor $w[4], $z[5]
 pxor $w[7], $z[7]
 pxor $w[5], $z[3]
 pxor $w[4], $z[4]
 pxor $w[5], $w[7] # clobber t[7] even more

 pxor $w[7], $z[5]
 pshufd \$0x93, $w[4], $w[4]
 pxor $w[7], $z[6]
 pxor $w[7], $z[4]

 pxor $w[5], $w[7]
 pshufd \$0x93, $w[5], $w[5]
 pxor $w[6], $w[7] # restore t[7]

 # multiplication by 0x0d
 pxor $z[7], $z[4]
 pxor $w[4], $z[7]
 pshufd \$0x93, $w[6], $w[6]
 pxor $w[0], $z[2]
 pxor $w[5], $z[7]
 pxor $w[2], $z[2]
 pshufd \$0x93, $w[7], $w[7]

 pxor $z[1], $z[3]
 pxor $w[1], $z[1]
 pxor $w[0], $z[0]
 pxor $w[0], $z[3]
 pxor $w[5], $z[1]
 pxor $w[5], $z[0]
 pxor $w[7], $z[1]
 pshufd \$0x93, $w[0], $w[0]
 pxor $w[6], $z[0]
 pxor $z[1], $z[3]
 pxor $w[1], $z[4]
 pshufd \$0x93, $w[1], $w[1]

 pxor $w[7], $z[7]
 pxor $w[2], $z[4]
 pxor $w[2], $z[5]
 pshufd \$0x93, $w[2], $w[2]
 pxor $w[6], $z[2]
 pxor $w[3], $w[6] # clobber t[6]
 pxor $z[7], $z[4]
 pxor $w[6], $z[3]

 pxor $w[6], $z[6]
 pxor $w[5], $z[5]
 pxor $w[4], $z[6]
 pshufd \$0x93, $w[4], $w[4]
 pxor $w[6], $z[5]
 pxor $w[7], $z[6]
 pxor $w[3], $w[6] # restore t[6]

 pshufd \$0x93, $w[5], $w[5]
 pshufd \$0x93, $w[6], $w[6]
 pshufd \$0x93, $w[7], $w[7]
 pshufd \$0x93, $w[3], $w[3]

 # multiplication by 0x09
 pxor $z[1], $z[4]
 pxor $z[1], $w[1] # t[1]=y[1]
 pxor $w[5], $w[0] # clobber t[0]
 pxor $w[5], $w[1]
 pxor $w[0], $z[3]
 pxor $z[0], $w[0] # t[0]=y[0]
 pxor $w[6], $w[1]
 pxor $w[7], $w[6] # clobber t[6]
 pxor $w[1], $z[4]
 pxor $w[4], $z[7]
 pxor $z[4], $w[4] # t[4]=y[4]
 pxor $w[3], $z[6]
 pxor $z[3], $w[3] # t[3]=y[3]
 pxor $w[2], $z[5]
 pxor $z[2], $w[2] # t[2]=y[2]
 pxor $w[7], $w[3]
 pxor $z[5], $w[5] # t[5]=y[5]
 pxor $w[6], $w[2]
 pxor $w[6], $w[5]
 pxor $z[6], $w[6] # t[6]=y[6]
 pxor $z[7], $w[7] # t[7]=y[7]

 movdqa $w[0],@XMM[0]
 movdqa $w[1],@XMM[1]
 movdqa $w[2],@XMM[2]
 movdqa $w[3],@XMM[3]
 movdqa $w[4],@XMM[4]
 movdqa $w[5],@XMM[5]
 movdqa $w[6],@XMM[6]
 movdqa $w[7],@XMM[7]
___
}
655
4ec93a10
AP
sub aesenc {				# not used
# Emit one full round: load the shuffle mask stored at 0x30($const)
# (labelled .LSR) into the first temporary, then ShiftRows, Sbox and
# MixColumns.
#   arguments 0..7  : state registers
#   arguments 8..15 : temporaries
my @state   = @_[0..7];
my @scratch = @_[8..15];
my $mask    = $scratch[0];
$code .= " movdqa 0x30($const),$mask # .LSR\n";
	&ShiftRows	(@state, $mask);
	&Sbox		(@state, @scratch);
	&MixColumns	(@state[0,1,4,6,3,7,2,5], @scratch);
}
666
sub aesenclast {			# not used
# Emit the final round: load the shuffle mask stored at 0x40($const)
# (labelled .LSRM0), ShiftRows, Sbox, then xor the state with the eight
# 16-byte values at 0x00..0x70($key).
#   arguments 0..7  : state registers
#   arguments 8..15 : temporaries
my @state   = @_[0..7];
my @scratch = @_[8..15];
my $mask    = $scratch[0];
$code .= " movdqa 0x40($const),$mask # .LSRM0\n";
	&ShiftRows	(@state, $mask);
	&Sbox		(@state, @scratch);
# Registers are visited in the order Sbox leaves its output in.
my $offset = 0;
foreach my $idx (0,1,4,6,3,7,2,5) {
	$code .= sprintf(" pxor 0x%02x(%s),%s\n", $offset, $key, $state[$idx]);
	$offset += 0x10;
}
}
686
sub swapmove {
# Emit a masked exchange between two registers: the bits of the second
# register shifted right by the given count are swapped, under the mask,
# with the corresponding bits of the first register.
# Arguments: first register, second register, shift count, mask
# register, temporary register.
my ($lhs, $rhs, $shift, $mask, $tmp) = @_;
$code.=<<___;
 movdqa $rhs,$tmp
 psrlq \$$shift,$rhs
 pxor $lhs,$rhs
 pand $mask,$rhs
 pxor $rhs,$lhs
 psllq \$$shift,$rhs
 pxor $tmp,$rhs
___
}
sub swapmove2x {
# Two swapmove sequences on independent register pairs, emitted
# interleaved and sharing the same shift count and mask.
# Arguments: first pair, second pair, shift count, mask register, and one
# temporary register per pair.
my ($l0, $r0, $l1, $r1, $shift, $mask, $tmp0, $tmp1) = @_;
$code.=<<___;
 movdqa $r0,$tmp0
 psrlq \$$shift,$r0
 movdqa $r1,$tmp1
 psrlq \$$shift,$r1
 pxor $l0,$r0
 pxor $l1,$r1
 pand $mask,$r0
 pand $mask,$r1
 pxor $r0,$l0
 psllq \$$shift,$r0
 pxor $r1,$l1
 psllq \$$shift,$r1
 pxor $tmp0,$r0
 pxor $tmp1,$r1
___
}
718
sub bitslice {
# Emit three rounds of swapmove2x exchanges (shift counts 1, 2 and 4)
# over the eight registers given as arguments 0..7, taken in reverse
# order.  Arguments 8..11 are four scratch registers: the first two hold
# the masks loaded from $const (.LBS0/.LBS2 and .LBS1), the last two are
# the swapmove temporaries.
my @v = reverse(@_[0..7]);
my ($m0, $m1, $s0, $s1) = @_[8..11];

$code .= " movdqa 0x00($const),$m0 # .LBS0\n";
$code .= " movdqa 0x10($const),$m1 # .LBS1\n";
foreach my $quad ([0,1,2,3], [4,5,6,7]) {
	&swapmove2x(@v[@$quad], 1, $m0, $s0, $s1);
}

# .LBS0 is no longer needed, so its register is reused for .LBS2.
$code .= " movdqa 0x20($const),$m0 # .LBS2\n";
foreach my $quad ([0,2,1,3], [4,6,5,7]) {
	&swapmove2x(@v[@$quad], 2, $m1, $s0, $s1);
}
foreach my $quad ([0,4,1,5], [2,6,3,7]) {
	&swapmove2x(@v[@$quad], 4, $m0, $s0, $s1);
}
}
737
738$code.=<<___;
739.text
740
741.extern AES_encrypt
28507577 742.extern AES_decrypt
4ec93a10
AP
743
744.type _bsaes_encrypt8,\@abi-omnipotent
745.align 64
746_bsaes_encrypt8:
747 lea .LBS0(%rip), $const # constants table
748
749 movdqa ($key), @XMM[9] # round 0 key
750 lea 0x10($key), $key
751 movdqa 0x60($const), @XMM[8] # .LM0SR
752 pxor @XMM[9], @XMM[0] # xor with round0 key
753 pxor @XMM[9], @XMM[1]
754 pshufb @XMM[8], @XMM[0]
755 pxor @XMM[9], @XMM[2]
756 pshufb @XMM[8], @XMM[1]
757 pxor @XMM[9], @XMM[3]
758 pshufb @XMM[8], @XMM[2]
759 pxor @XMM[9], @XMM[4]
760 pshufb @XMM[8], @XMM[3]
761 pxor @XMM[9], @XMM[5]
762 pshufb @XMM[8], @XMM[4]
763 pxor @XMM[9], @XMM[6]
764 pshufb @XMM[8], @XMM[5]
765 pxor @XMM[9], @XMM[7]
766 pshufb @XMM[8], @XMM[6]
767 pshufb @XMM[8], @XMM[7]
768_bsaes_encrypt8_bitslice:
769___
770 &bitslice (@XMM[0..7, 8..11]);
771$code.=<<___;
772 dec $rounds
773 jmp .Lenc_sbox
774.align 16
775.Lenc_loop:
776___
28507577 777 &ShiftRows (@XMM[0..7, 8]);
4ec93a10 778$code.=".Lenc_sbox:\n";
28507577 779 &Sbox (@XMM[0..7, 8..15]);
4ec93a10
AP
780$code.=<<___;
781 dec $rounds
782 jl .Lenc_done
783___
28507577 784 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
4ec93a10
AP
785$code.=<<___;
786 movdqa 0x30($const), @XMM[8] # .LSR
787 jnz .Lenc_loop
788 movdqa 0x40($const), @XMM[8] # .LSRM0
789 jmp .Lenc_loop
790.align 16
791.Lenc_done:
792___
793 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
794 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
795$code.=<<___;
796 movdqa ($key), @XMM[8] # last round key
4ec93a10
AP
797 pxor @XMM[8], @XMM[4]
798 pxor @XMM[8], @XMM[6]
799 pxor @XMM[8], @XMM[3]
800 pxor @XMM[8], @XMM[7]
801 pxor @XMM[8], @XMM[2]
802 pxor @XMM[8], @XMM[5]
28507577
AP
803 pxor @XMM[8], @XMM[0]
804 pxor @XMM[8], @XMM[1]
4ec93a10
AP
805 ret
806.size _bsaes_encrypt8,.-_bsaes_encrypt8
28507577
AP
807
808.type _bsaes_decrypt8,\@abi-omnipotent
809.align 64
810_bsaes_decrypt8:
811 lea .LBS0(%rip), $const # constants table
812
813 movdqa ($key), @XMM[9] # round 0 key
814 lea 0x10($key), $key
815 movdqa -0x30($const), @XMM[8] # .LM0ISR
816 pxor @XMM[9], @XMM[0] # xor with round0 key
817 pxor @XMM[9], @XMM[1]
818 pshufb @XMM[8], @XMM[0]
819 pxor @XMM[9], @XMM[2]
820 pshufb @XMM[8], @XMM[1]
821 pxor @XMM[9], @XMM[3]
822 pshufb @XMM[8], @XMM[2]
823 pxor @XMM[9], @XMM[4]
824 pshufb @XMM[8], @XMM[3]
825 pxor @XMM[9], @XMM[5]
826 pshufb @XMM[8], @XMM[4]
827 pxor @XMM[9], @XMM[6]
828 pshufb @XMM[8], @XMM[5]
829 pxor @XMM[9], @XMM[7]
830 pshufb @XMM[8], @XMM[6]
831 pshufb @XMM[8], @XMM[7]
832___
833 &bitslice (@XMM[0..7, 8..11]);
834$code.=<<___;
835 dec $rounds
836 jmp .Ldec_sbox
837.align 16
838.Ldec_loop:
839___
840 &ShiftRows (@XMM[0..7, 8]);
841$code.=".Ldec_sbox:\n";
842 &InvSbox (@XMM[0..7, 8..15]);
843$code.=<<___;
844 dec $rounds
845 jl .Ldec_done
846___
847 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
848$code.=<<___;
849 movdqa -0x10($const), @XMM[8] # .LISR
850 jnz .Ldec_loop
851 movdqa -0x20($const), @XMM[8] # .LISRM0
852 jmp .Ldec_loop
853.align 16
854.Ldec_done:
855___
856 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
857$code.=<<___;
858 movdqa ($key), @XMM[8] # last round key
859 pxor @XMM[8], @XMM[6]
860 pxor @XMM[8], @XMM[4]
861 pxor @XMM[8], @XMM[2]
862 pxor @XMM[8], @XMM[7]
863 pxor @XMM[8], @XMM[3]
864 pxor @XMM[8], @XMM[5]
865 pxor @XMM[8], @XMM[0]
866 pxor @XMM[8], @XMM[1]
867 ret
868.size _bsaes_decrypt8,.-_bsaes_decrypt8
4ec93a10
AP
869___
870}
871{
872my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
873
sub bitslice_key {
# Cut-down variant of bitslice: exchanges whose two inputs would be
# copies of the same data are replaced by plain register copies.
#   arguments 0..7  : registers, taken in reverse order
#   arguments 8..10 : registers holding the three masks
#   arguments 11,12 : temporaries
# NOTE(review): the "#&swapmove..." lines sit inside the here-documents,
# so they are interpolated and emitted into the generated assembly as
# comments; $t0 and $t1 there are not lexicals of this sub.  They are
# kept verbatim so that the output does not change.
my @k = reverse(@_[0..7]);
my ($mask1, $mask2, $mask4, $t2, $t3) = @_[8..12];

	&swapmove	(@k[0,1], 1, $mask1, $t2, $t3);
$code.=<<___;
 #&swapmove(@k[2,3],1,$t0,$t2,$t3);
 movdqa $k[0], $k[2]
 movdqa $k[1], $k[3]
___
	#&swapmove2x(@k[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@k[0,2,1,3], 2, $mask2, $t2, $t3);
$code.=<<___;
 #&swapmove2x(@k[4,6,5,7],2,$t1,$t2,$t3);
 movdqa $k[0], $k[4]
 movdqa $k[2], $k[6]
 movdqa $k[1], $k[5]
 movdqa $k[3], $k[7]
___
	&swapmove2x	(@k[0,4,1,5], 4, $mask4, $t2, $t3);
	&swapmove2x	(@k[2,6,3,7], 4, $mask4, $t2, $t3);
}
897
898$code.=<<___;
28507577 899.type _bsaes_key_convert,\@abi-omnipotent
4ec93a10 900.align 16
28507577 901_bsaes_key_convert:
4ec93a10
AP
902 lea .LBS1(%rip), $const
903 movdqu ($inp), %xmm7 # load round 0 key
904 movdqa -0x10($const), %xmm8 # .LBS0
905 movdqa 0x00($const), %xmm9 # .LBS1
906 movdqa 0x10($const), %xmm10 # .LBS2
907 movdqa 0x40($const), %xmm13 # .LM0
28507577 908 movdqa 0x60($const), %xmm14 # .LNOT
4ec93a10
AP
909
910 movdqu 0x10($inp), %xmm6 # load round 1 key
911 lea 0x10($inp), $inp
912 movdqa %xmm7, ($out) # save round 0 key
913 lea 0x10($out), $out
914 dec $rounds
915 jmp .Lkey_loop
916.align 16
917.Lkey_loop:
28507577 918 pshufb %xmm13, %xmm6 # .LM0
4ec93a10
AP
919 movdqa %xmm6, %xmm7
920___
921 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
922$code.=<<___;
923 pxor %xmm14, %xmm5 # "pnot"
924 pxor %xmm14, %xmm6
925 pxor %xmm14, %xmm0
926 pxor %xmm14, %xmm1
927 lea 0x10($inp), $inp
928 movdqa %xmm0, 0x00($out) # write bit-sliced round key
929 movdqa %xmm1, 0x10($out)
930 movdqa %xmm2, 0x20($out)
931 movdqa %xmm3, 0x30($out)
932 movdqa %xmm4, 0x40($out)
933 movdqa %xmm5, 0x50($out)
934 movdqa %xmm6, 0x60($out)
935 movdqa %xmm7, 0x70($out)
936 lea 0x80($out),$out
937 movdqu ($inp), %xmm6 # load next round key
938 dec $rounds
939 jnz .Lkey_loop
940
28507577
AP
941 movdqa 0x70($const), %xmm7 # .L63
942 #movdqa %xmm6, ($out) # don't save last round key
4ec93a10 943 ret
28507577 944.size _bsaes_key_convert,.-_bsaes_key_convert
4ec93a10
AP
945___
946}
947
28507577 948if (1 && !$win64) { # following four functions are unsupported interface
11272648 949 # used for benchmarking...
4ec93a10
AP
950$code.=<<___;
951.globl bsaes_enc_key_convert
952.type bsaes_enc_key_convert,\@function,2
953.align 16
954bsaes_enc_key_convert:
955 mov 240($inp),%r10d # pass rounds
956 mov $inp,%rcx # pass key
957 mov $out,%rax # pass key schedule
28507577
AP
958 call _bsaes_key_convert
959 pxor %xmm6,%xmm7 # fix up last round key
960 movdqa %xmm7,(%rax) # save last round key
4ec93a10
AP
961 ret
962.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
963
964.globl bsaes_encrypt_128
965.type bsaes_encrypt_128,\@function,4
966.align 16
967bsaes_encrypt_128:
968.Lenc128_loop:
969 movdqu 0x00($inp), @XMM[0] # load input
970 movdqu 0x10($inp), @XMM[1]
971 movdqu 0x20($inp), @XMM[2]
972 movdqu 0x30($inp), @XMM[3]
973 movdqu 0x40($inp), @XMM[4]
974 movdqu 0x50($inp), @XMM[5]
975 movdqu 0x60($inp), @XMM[6]
976 movdqu 0x70($inp), @XMM[7]
977 mov $key, %rax # pass the $key
978 lea 0x80($inp), $inp
979 mov \$10,%r10d
980
981 call _bsaes_encrypt8
982
983 movdqu @XMM[0], 0x00($out) # write output
984 movdqu @XMM[1], 0x10($out)
985 movdqu @XMM[4], 0x20($out)
986 movdqu @XMM[6], 0x30($out)
987 movdqu @XMM[3], 0x40($out)
988 movdqu @XMM[7], 0x50($out)
989 movdqu @XMM[2], 0x60($out)
990 movdqu @XMM[5], 0x70($out)
991 lea 0x80($out), $out
992 sub \$0x80,$len
993 ja .Lenc128_loop
994 ret
995.size bsaes_encrypt_128,.-bsaes_encrypt_128
28507577
AP
996
997.globl bsaes_dec_key_convert
998.type bsaes_dec_key_convert,\@function,2
999.align 16
1000bsaes_dec_key_convert:
1001 mov 240($inp),%r10d # pass rounds
1002 mov $inp,%rcx # pass key
1003 mov $out,%rax # pass key schedule
1004 call _bsaes_key_convert
1005 pxor ($out),%xmm7 # fix up round 0 key
1006 movdqa %xmm6,(%rax) # save last round key
1007 movdqa %xmm7,($out)
1008 ret
1009.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1010
1011.globl bsaes_decrypt_128
1012.type bsaes_decrypt_128,\@function,4
1013.align 16
1014bsaes_decrypt_128:
1015.Ldec128_loop:
1016 movdqu 0x00($inp), @XMM[0] # load input
1017 movdqu 0x10($inp), @XMM[1]
1018 movdqu 0x20($inp), @XMM[2]
1019 movdqu 0x30($inp), @XMM[3]
1020 movdqu 0x40($inp), @XMM[4]
1021 movdqu 0x50($inp), @XMM[5]
1022 movdqu 0x60($inp), @XMM[6]
1023 movdqu 0x70($inp), @XMM[7]
1024 mov $key, %rax # pass the $key
1025 lea 0x80($inp), $inp
1026 mov \$10,%r10d
1027
1028 call _bsaes_decrypt8
1029
1030 movdqu @XMM[0], 0x00($out) # write output
1031 movdqu @XMM[1], 0x10($out)
1032 movdqu @XMM[6], 0x20($out)
1033 movdqu @XMM[4], 0x30($out)
1034 movdqu @XMM[2], 0x40($out)
1035 movdqu @XMM[7], 0x50($out)
1036 movdqu @XMM[3], 0x60($out)
1037 movdqu @XMM[5], 0x70($out)
1038 lea 0x80($out), $out
1039 sub \$0x80,$len
1040 ja .Ldec128_loop
1041 ret
1042.size bsaes_decrypt_128,.-bsaes_decrypt_128
4ec93a10
AP
1043___
1044}
1045{
1046######################################################################
1047#
1048# OpenSSL interface
1049#
1050my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64 ? ("%rcx","%rdx","%r8","%r9","%r10")
1051 : ("%rdi","%rsi","%rdx","%rcx","%r8");
1052my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1053
1054$code.=<<___;
1055.globl bsaes_ecb_encrypt_blocks
1056.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1057.align 16
1058bsaes_ecb_encrypt_blocks:
1059 push %rbp
1060 push %rbx
1061 push %r12
1062 push %r13
1063 push %r14
1064 push %r15
1065 lea -0x48(%rsp),%rsp
1066___
1067$code.=<<___ if ($win64);
1068 lea -0xa0(%rsp), %rsp
1069 movaps %xmm6, 0x40(%rsp)
1070 movaps %xmm7, 0x50(%rsp)
1071 movaps %xmm8, 0x60(%rsp)
1072 movaps %xmm9, 0x70(%rsp)
1073 movaps %xmm10, 0x80(%rsp)
1074 movaps %xmm11, 0x90(%rsp)
1075 movaps %xmm12, 0xa0(%rsp)
1076 movaps %xmm13, 0xb0(%rsp)
1077 movaps %xmm14, 0xc0(%rsp)
1078 movaps %xmm15, 0xd0(%rsp)
1079.Lecb_enc_body:
1080___
1081$code.=<<___;
1082 mov %rsp,%rbp # backup %rsp
1083 mov 240($arg4),%eax # rounds
1084 mov $arg1,$inp # backup arguments
1085 mov $arg2,$out
1086 mov $arg3,$len
1087 mov $arg4,$key
1088 cmp \$8,$arg3
1089 jb .Lecb_enc_short
1090
1091 mov %eax,%ebx # backup rounds
1092 shl \$7,%rax # 128 bytes per inner round key
1093 sub \$`128-32`,%rax # size of bit-sliced key schedule
1094 sub %rax,%rsp
1095 mov %rsp,%rax # pass key schedule
1096 mov $key,%rcx # pass key
1097 mov %ebx,%r10d # pass rounds
28507577
AP
1098 call _bsaes_key_convert
1099 pxor %xmm6,%xmm7 # fix up last round key
1100 movdqa %xmm7,(%rax) # save last round key
4ec93a10
AP
1101
1102 sub \$8,$len
1103.Lecb_enc_loop:
1104 movdqu 0x00($inp), @XMM[0] # load input
1105 movdqu 0x10($inp), @XMM[1]
1106 movdqu 0x20($inp), @XMM[2]
1107 movdqu 0x30($inp), @XMM[3]
1108 movdqu 0x40($inp), @XMM[4]
1109 movdqu 0x50($inp), @XMM[5]
1110 mov %rsp, %rax # pass key schedule
1111 movdqu 0x60($inp), @XMM[6]
1112 mov %ebx,%r10d # pass rounds
1113 movdqu 0x70($inp), @XMM[7]
1114 lea 0x80($inp), $inp
1115
1116 call _bsaes_encrypt8
1117
1118 movdqu @XMM[0], 0x00($out) # write output
1119 movdqu @XMM[1], 0x10($out)
1120 movdqu @XMM[4], 0x20($out)
1121 movdqu @XMM[6], 0x30($out)
1122 movdqu @XMM[3], 0x40($out)
1123 movdqu @XMM[7], 0x50($out)
1124 movdqu @XMM[2], 0x60($out)
1125 movdqu @XMM[5], 0x70($out)
1126 lea 0x80($out), $out
1127 sub \$8,$len
1128 jnc .Lecb_enc_loop
1129
1130 add \$8,$len
1131 jz .Lecb_enc_done
1132
1133 movdqu 0x00($inp), @XMM[0] # load input
1134 mov %rsp, %rax # pass key schedule
1135 mov %ebx,%r10d # pass rounds
1136 cmp \$2,$len
1137 jb .Lecb_enc_one
1138 movdqu 0x10($inp), @XMM[1]
1139 je .Lecb_enc_two
1140 movdqu 0x20($inp), @XMM[2]
1141 cmp \$4,$len
1142 jb .Lecb_enc_three
1143 movdqu 0x30($inp), @XMM[3]
1144 je .Lecb_enc_four
1145 movdqu 0x40($inp), @XMM[4]
1146 cmp \$6,$len
1147 jb .Lecb_enc_five
1148 movdqu 0x50($inp), @XMM[5]
1149 je .Lecb_enc_six
1150 movdqu 0x60($inp), @XMM[6]
1151 call _bsaes_encrypt8
1152 movdqu @XMM[0], 0x00($out) # write output
1153 movdqu @XMM[1], 0x10($out)
1154 movdqu @XMM[4], 0x20($out)
1155 movdqu @XMM[6], 0x30($out)
1156 movdqu @XMM[3], 0x40($out)
1157 movdqu @XMM[7], 0x50($out)
1158 movdqu @XMM[2], 0x60($out)
1159 jmp .Lecb_enc_done
1160.align 16
1161.Lecb_enc_six:
1162 call _bsaes_encrypt8
1163 movdqu @XMM[0], 0x00($out) # write output
1164 movdqu @XMM[1], 0x10($out)
1165 movdqu @XMM[4], 0x20($out)
1166 movdqu @XMM[6], 0x30($out)
1167 movdqu @XMM[3], 0x40($out)
1168 movdqu @XMM[7], 0x50($out)
1169 jmp .Lecb_enc_done
1170.align 16
1171.Lecb_enc_five:
1172 call _bsaes_encrypt8
1173 movdqu @XMM[0], 0x00($out) # write output
1174 movdqu @XMM[1], 0x10($out)
1175 movdqu @XMM[4], 0x20($out)
1176 movdqu @XMM[6], 0x30($out)
1177 movdqu @XMM[3], 0x40($out)
1178 jmp .Lecb_enc_done
1179.align 16
1180.Lecb_enc_four:
1181 call _bsaes_encrypt8
1182 movdqu @XMM[0], 0x00($out) # write output
1183 movdqu @XMM[1], 0x10($out)
1184 movdqu @XMM[4], 0x20($out)
1185 movdqu @XMM[6], 0x30($out)
1186 jmp .Lecb_enc_done
1187.align 16
1188.Lecb_enc_three:
1189 call _bsaes_encrypt8
1190 movdqu @XMM[0], 0x00($out) # write output
1191 movdqu @XMM[1], 0x10($out)
1192 movdqu @XMM[4], 0x20($out)
1193 jmp .Lecb_enc_done
1194.align 16
1195.Lecb_enc_two:
1196 call _bsaes_encrypt8
1197 movdqu @XMM[0], 0x00($out) # write output
1198 movdqu @XMM[1], 0x10($out)
1199 jmp .Lecb_enc_done
1200.align 16
1201.Lecb_enc_one:
1202 call _bsaes_encrypt8
1203 movdqu @XMM[0], 0x00($out) # write output
1204 jmp .Lecb_enc_done
1205.align 16
1206.Lecb_enc_short:
1207 lea ($inp), $arg1
1208 lea ($out), $arg2
1209 lea ($key), $arg3
1210 call AES_encrypt
1211 lea 16($inp), $inp
1212 lea 16($out), $out
1213 dec $len
1214 jnz .Lecb_enc_short
1215
1216.Lecb_enc_done:
1217 lea (%rsp),%rax
1218 pxor %xmm0, %xmm0
1219.Lecb_enc_bzero: # wipe key schedule [if any]
1220 movdqa %xmm0, 0x00(%rax)
1221 movdqa %xmm0, 0x10(%rax)
1222 lea 0x20(%rax), %rax
1223 cmp %rax, %rbp
1224 jb .Lecb_enc_bzero
1225
1226 lea (%rbp),%rsp # restore %rsp
1227___
1228$code.=<<___ if ($win64);
1229 movaps 0x40(%rbp), %xmm6
1230 movaps 0x50(%rbp), %xmm7
1231 movaps 0x60(%rbp), %xmm8
1232 movaps 0x70(%rbp), %xmm9
1233 movaps 0x80(%rbp), %xmm10
1234 movaps 0x90(%rbp), %xmm11
1235 movaps 0xa0(%rbp), %xmm12
1236 movaps 0xb0(%rbp), %xmm13
1237 movaps 0xc0(%rbp), %xmm14
1238 movaps 0xd0(%rbp), %xmm15
1239 lea 0xa0(%rbp), %rsp
1240___
1241$code.=<<___;
1242 mov 0x48(%rsp), %r15
1243 mov 0x50(%rsp), %r14
1244 mov 0x58(%rsp), %r13
1245 mov 0x60(%rsp), %r12
1246 mov 0x68(%rsp), %rbx
1247 mov 0x70(%rsp), %rbp
1248 lea 0x78(%rsp), %rsp
1249.Lecb_enc_epilogue:
1250 ret
1251.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1252
#
# bsaes_ctr32_encrypt_blocks -- CTR-mode keystream generation and XOR,
# in whole 16-byte blocks, with a big-endian 32-bit counter kept in the
# last four bytes of the counter block.
#
# Arguments, in the order they are copied out of the argument registers:
#   arg1  input pointer
#   arg2  output pointer
#   arg3  number of 16-byte blocks
#   arg4  key structure; the round count is read from byte offset 240
#   arg5  pointer to the 16-byte initial counter block (only read here;
#         the advanced counter is never stored back through it)
#
# Fewer than eight blocks in total are handled one at a time through
# AES_encrypt.  Otherwise the key schedule is converted on the stack,
# eight counter values are encrypted per pass, and a final partial pass
# of one to seven blocks finishes the job.  The converted key schedule
# [if any] is wiped from the stack before returning.
#
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	 pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	# The flags set by this sub are still live at the jnz that closes
	# the loop: none of the SSE moves, pxor, paddd or lea in between
	# modify them.
	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	# The sub above wrapped the block count below zero.  Restore the
	# true remainder, otherwise the unsigned compares below never stop
	# early and seven blocks are always read and written.
	add	\$8, $len		# 1..7 blocks remain
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	# One block per pass: encrypt the counter copy at 0x20 into the
	# scratch slot at 0x30, xor it into the input, then step the
	# big-endian counter word in place.
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rbp
	lea	0x78(%rsp), %rsp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1480___
1481}
# Constant pool appended to the generated assembly: byte-shuffle masks,
# bit-slice masks, counter increments and the identification string.
#
# Every labelled .quad entry is 16 bytes.  bsaes_ctr32_encrypt_blocks
# loads the address of .LADD1 and reaches its neighbours by fixed
# displacement (-0x20 .LSWPUP, -0x10 .LSWPUPM0SR, 0x00..0x70 .LADD1 to
# .LADD8), so those entries must stay adjacent, in this order, and keep
# their size.
# NOTE(review): the address of .LBS0 is also passed on as a table base;
# the routines that index from it are outside this section -- confirm
# their displacements before moving anything around .LBS0.
#
# Used as a pshufb mask, .LSWPUP reverses bytes 12..15 and leaves the
# other bytes in place; used with paddd, .LADDn adds n to the highest
# 32-bit lane only.
1482$code.=<<___;
1483.align 64
28507577
AP
1484.LM0ISR: # InvShiftRows constants
1485 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1486.LISRM0:
1487 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1488.LISR:
1489 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
5a326467
AP
1490.LBS0: # bit-slice constants
1491 .quad 0x5555555555555555, 0x5555555555555555
1492.LBS1:
1493 .quad 0x3333333333333333, 0x3333333333333333
1494.LBS2:
1495 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1496.LSR: # shiftrows constants
1497 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1498.LSRM0:
1499 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1500.LM0:
1501 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1502.LM0SR:
1503 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1504.LNOT: # magic constants
1505 .quad 0xffffffffffffffff, 0xffffffffffffffff
1506.L63:
1507 .quad 0x6363636363636363, 0x6363636363636363
1508.LSWPUP: # byte-swap upper dword
4ec93a10
AP
1509 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1510.LSWPUPM0SR:
1511 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
5a326467
AP
1512.LADD1: # counter increment constants
1513 .quad 0x0000000000000000, 0x0000000100000000
1514.LADD2:
1515 .quad 0x0000000000000000, 0x0000000200000000
1516.LADD3:
1517 .quad 0x0000000000000000, 0x0000000300000000
1518.LADD4:
1519 .quad 0x0000000000000000, 0x0000000400000000
1520.LADD5:
1521 .quad 0x0000000000000000, 0x0000000500000000
1522.LADD6:
1523 .quad 0x0000000000000000, 0x0000000600000000
1524.LADD7:
1525 .quad 0x0000000000000000, 0x0000000700000000
1526.LADD8:
1527 .quad 0x0000000000000000, 0x0000000800000000
11272648 1528.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
4ec93a10
AP
1529.align 64
1530___
1531
# Resolve every `expression` placeholder left in the generated text by
# evaluating it as Perl (used for assemble-time arithmetic such as the
# key-schedule size), then write the finished assembly to STDOUT.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Output is buffered, so a failed write (full disk, closed pipe) may
# only be reported here.  Fail loudly rather than leave a truncated
# assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";