3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in the sense that it supports both big- and
12 # little-endian cases. It also supports both 32- and 64-bit modes
13 # of operation. The latter is achieved by limiting the number of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
23 # Performance in cycles per byte processed with 128-bit key:
26 # Apple A7 2.39 1.20 1.20
27 # Cortex-A53 2.45 1.87 1.94
28 # Cortex-A57 3.64 1.34 1.32
31 open STDOUT
,">".shift;
38 #if __ARM_MAX_ARCH__>=7
41 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
42 $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
43 #^^^^^^ this is done to simplify adoption by not depending
46 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
47 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
48 # maintain both 32- and 64-bit codes within single module and
49 # transliterate common code to either flavour with regex voodoo.
52 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
53 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
54 $flavour=~/64/?
map("q$_",(0..6)) : map("q$_",(0..3,8..10));
60 .long
0x01,0x01,0x01,0x01
61 .long
0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate
-n
-splat
62 .long
0x1b,0x1b,0x1b,0x1b
64 .globl
${prefix
}_set_encrypt_key
65 .type
${prefix
}_set_encrypt_key
,%function
67 ${prefix
}_set_encrypt_key
:
70 $code.=<<___
if ($flavour =~ /64/);
71 stp x29
,x30
,[sp
,#-16]!
91 veor
$zero,$zero,$zero
92 vld1
.8
{$in0},[$inp],#16
93 mov
$bits,#8 // reuse $bits
94 vld1
.32
{$rcon,$mask},[$ptr],#32
102 vtbl
.8 $key,{$in0},$mask
103 vext
.8 $tmp,$zero,$in0,#12
104 vst1
.32
{$in0},[$out],#16
109 vext
.8 $tmp,$zero,$tmp,#12
111 vext
.8 $tmp,$zero,$tmp,#12
114 vshl
.u8
$rcon,$rcon,#1
118 vld1
.32
{$rcon},[$ptr]
120 vtbl
.8 $key,{$in0},$mask
121 vext
.8 $tmp,$zero,$in0,#12
122 vst1
.32
{$in0},[$out],#16
126 vext
.8 $tmp,$zero,$tmp,#12
128 vext
.8 $tmp,$zero,$tmp,#12
131 vshl
.u8
$rcon,$rcon,#1
134 vtbl
.8 $key,{$in0},$mask
135 vext
.8 $tmp,$zero,$in0,#12
136 vst1
.32
{$in0},[$out],#16
140 vext
.8 $tmp,$zero,$tmp,#12
142 vext
.8 $tmp,$zero,$tmp,#12
146 vst1
.32
{$in0},[$out]
154 vld1
.8
{$in1},[$inp],#8
155 vmov
.i8
$key,#8 // borrow $key
156 vst1
.32
{$in0},[$out],#16
157 vsub
.i8
$mask,$mask,$key // adjust the mask
160 vtbl
.8 $key,{$in1},$mask
161 vext
.8 $tmp,$zero,$in0,#12
162 vst1
.32
{$in1},[$out],#8
167 vext
.8 $tmp,$zero,$tmp,#12
169 vext
.8 $tmp,$zero,$tmp,#12
172 vdup
.32 $tmp,${in0
}[3]
175 vext
.8 $in1,$zero,$in1,#12
176 vshl
.u8
$rcon,$rcon,#1
180 vst1
.32
{$in0},[$out],#16
192 vst1
.32
{$in0},[$out],#16
195 vtbl
.8 $key,{$in1},$mask
196 vext
.8 $tmp,$zero,$in0,#12
197 vst1
.32
{$in1},[$out],#16
202 vext
.8 $tmp,$zero,$tmp,#12
204 vext
.8 $tmp,$zero,$tmp,#12
207 vshl
.u8
$rcon,$rcon,#1
209 vst1
.32
{$in0},[$out],#16
212 vdup
.32 $key,${in0
}[3] // just splat
213 vext
.8 $tmp,$zero,$in1,#12
217 vext
.8 $tmp,$zero,$tmp,#12
219 vext
.8 $tmp,$zero,$tmp,#12
230 mov x0
,$ptr // return value
231 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
233 .size
${prefix
}_set_encrypt_key
,.-${prefix
}_set_encrypt_key
235 .globl
${prefix
}_set_decrypt_key
236 .type
${prefix
}_set_decrypt_key
,%function
238 ${prefix
}_set_decrypt_key
:
240 $code.=<<___
if ($flavour =~ /64/);
241 stp x29
,x30
,[sp
,#-16]!
244 $code.=<<___
if ($flavour !~ /64/);
253 sub $out,$out,#240 // restore original $out
255 add
$inp,$out,x12
,lsl
#4 // end of key schedule
257 vld1
.32
{v0
.16b
},[$out]
258 vld1
.32
{v1
.16b
},[$inp]
259 vst1
.32
{v0
.16b
},[$inp],x4
260 vst1
.32
{v1
.16b
},[$out],#16
263 vld1
.32
{v0
.16b
},[$out]
264 vld1
.32
{v1
.16b
},[$inp]
267 vst1
.32
{v0
.16b
},[$inp],x4
268 vst1
.32
{v1
.16b
},[$out],#16
272 vld1
.32
{v0
.16b
},[$out]
274 vst1
.32
{v0
.16b
},[$inp]
276 eor x0
,x0
,x0
// return value
279 $code.=<<___
if ($flavour !~ /64/);
282 $code.=<<___
if ($flavour =~ /64/);
287 .size
${prefix
}_set_decrypt_key
,.-${prefix
}_set_decrypt_key
293 my ($e,$mc) = $dir eq "en" ?
("e","mc") : ("d","imc");
294 my ($inp,$out,$key)=map("x$_",(0..2));
296 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
299 .globl
${prefix
}_
${dir
}crypt
300 .type
${prefix
}_
${dir
}crypt,%function
302 ${prefix
}_
${dir
}crypt:
303 ldr
$rounds,[$key,#240]
304 vld1
.32
{$rndkey0},[$key],#16
305 vld1
.8
{$inout},[$inp]
306 sub $rounds,$rounds,#2
307 vld1
.32
{$rndkey1},[$key],#16
310 aes
$e $inout,$rndkey0
311 vld1
.32
{$rndkey0},[$key],#16
313 subs
$rounds,$rounds,#2
314 aes
$e $inout,$rndkey1
315 vld1
.32
{$rndkey1},[$key],#16
319 aes
$e $inout,$rndkey0
320 vld1
.32
{$rndkey0},[$key]
322 aes
$e $inout,$rndkey1
323 veor
$inout,$inout,$rndkey0
325 vst1
.8
{$inout},[$out]
327 .size
${prefix
}_
${dir
}crypt,.-${prefix
}_
${dir
}crypt
334 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
335 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
336 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
338 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
340 ### q8-q15 preloaded key schedule
343 .globl
${prefix
}_cbc_encrypt
344 .type
${prefix
}_cbc_encrypt
,%function
346 ${prefix
}_cbc_encrypt
:
348 $code.=<<___
if ($flavour =~ /64/);
349 stp x29
,x30
,[sp
,#-16]!
352 $code.=<<___
if ($flavour !~ /64/);
355 vstmdb sp
!,{d8
-d15
} @ ABI specification says so
356 ldmia ip
,{r4
-r5
} @ load remaining args
364 cmp $enc,#0 // en- or decrypting?
365 ldr
$rounds,[$key,#240]
367 vld1
.8
{$ivec},[$ivp]
368 vld1
.8
{$dat},[$inp],$step
370 vld1
.32
{q8
-q9
},[$key] // load key schedule
...
371 sub $rounds,$rounds,#6
372 add
$key_,$key,x5
,lsl
#4 // pointer to last 7 round keys
373 sub $rounds,$rounds,#2
374 vld1
.32
{q10
-q11
},[$key_],#32
375 vld1
.32
{q12
-q13
},[$key_],#32
376 vld1
.32
{q14
-q15
},[$key_],#32
377 vld1
.32
{$rndlast},[$key_]
385 veor
$rndzero_n_last,q8
,$rndlast
390 vld1
.32
{q8
},[$key_],#16
394 vld1
.32
{q9
},[$key_],#16
409 vld1
.8
{q8
},[$inp],$step
412 veor q8
,q8
,$rndzero_n_last
415 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
421 veor
$ivec,$dat,$rndlast
422 vst1
.8
{$ivec},[$out],#16
429 vld1
.32
{$in0-$in1},[$key_]
436 vst1
.8
{$ivec},[$out],#16
450 vld1
.8
{q8
},[$inp],$step
457 veor q8
,q8
,$rndzero_n_last
459 veor
$ivec,$dat,$rndlast
460 b
.hs
.Loop_cbc_enc128
462 vst1
.8
{$ivec},[$out],#16
466 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
470 vld1
.8
{$dat2},[$inp],#16
471 subs
$len,$len,#32 // bias
475 vorr
$in2,$dat2,$dat2
478 vorr
$dat1,$dat2,$dat2
479 vld1
.8
{$dat2},[$inp],#16
481 vorr
$in1,$dat1,$dat1
482 vorr
$in2,$dat2,$dat2
488 vld1
.32
{q8
},[$key_],#16
496 vld1
.32
{q9
},[$key_],#16
505 veor
$tmp0,$ivec,$rndlast
509 veor
$tmp1,$in0,$rndlast
513 veor
$tmp2,$in1,$rndlast
519 mov
.lo x6
,$len // x6
, $cnt, is zero at this point
523 add
$inp,$inp,x6
// $inp is adjusted
in such way that
524 // at
exit from the
loop $dat1-$dat2
525 // are loaded with
last "words"
533 vld1
.8
{$in0},[$inp],#16
537 vld1
.8
{$in1},[$inp],#16
541 vld1
.8
{$in2},[$inp],#16
545 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
551 veor
$tmp0,$tmp0,$dat0
552 veor
$tmp1,$tmp1,$dat1
553 veor
$dat2,$dat2,$tmp2
554 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
556 vst1
.8
{$tmp0},[$out],#16
558 vst1
.8
{$tmp1},[$out],#16
559 vst1
.8
{$dat2},[$out],#16
570 vld1
.32
{q8
},[$key_],#16
576 vld1
.32
{q9
},[$key_],#16
598 veor
$tmp1,$ivec,$rndlast
603 veor
$tmp2,$in1,$rndlast
607 veor
$tmp1,$tmp1,$dat1
608 veor
$tmp2,$tmp2,$dat2
610 vst1
.8
{$tmp1},[$out],#16
611 vst1
.8
{$tmp2},[$out],#16
615 veor
$tmp1,$tmp1,$dat2
617 vst1
.8
{$tmp1},[$out],#16
620 vst1
.8
{$ivec},[$ivp]
624 $code.=<<___
if ($flavour !~ /64/);
628 $code.=<<___
if ($flavour =~ /64/);
633 .size
${prefix
}_cbc_encrypt
,.-${prefix
}_cbc_encrypt
637 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
638 my ($rounds,$cnt,$key_)=("w5","w6","x7");
639 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
640 my $step="x12"; # aliases with $tctr2
642 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
643 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
645 my ($dat,$tmp)=($dat0,$tmp0);
647 ### q8-q15 preloaded key schedule
650 .globl
${prefix
}_ctr32_encrypt_blocks
651 .type
${prefix
}_ctr32_encrypt_blocks
,%function
653 ${prefix
}_ctr32_encrypt_blocks
:
655 $code.=<<___
if ($flavour =~ /64/);
656 stp x29
,x30
,[sp
,#-16]!
659 $code.=<<___
if ($flavour !~ /64/);
661 stmdb sp
!,{r4
-r10
,lr
}
662 vstmdb sp
!,{d8
-d15
} @ ABI specification says so
663 ldr r4
, [ip
] @ load remaining arg
666 ldr
$rounds,[$key,#240]
668 ldr
$ctr, [$ivp, #12]
669 vld1
.32
{$dat0},[$ivp]
671 vld1
.32
{q8
-q9
},[$key] // load key schedule
...
672 sub $rounds,$rounds,#4
675 add
$key_,$key,x5
,lsl
#4 // pointer to last 5 round keys
676 sub $rounds,$rounds,#2
677 vld1
.32
{q12
-q13
},[$key_],#32
678 vld1
.32
{q14
-q15
},[$key_],#32
679 vld1
.32
{$rndlast},[$key_]
686 vorr
$dat1,$dat0,$dat0
688 vorr
$dat2,$dat0,$dat0
690 vorr
$ivec,$dat0,$dat0
692 vmov
.32 ${dat1
}[3],$tctr1
695 sub $len,$len,#3 // bias
696 vmov
.32 ${dat2
}[3],$tctr2
704 vld1
.32
{q8
},[$key_],#16
712 vld1
.32
{q9
},[$key_],#16
723 vld1
.8
{$in0},[$inp],#16
726 vorr
$dat0,$ivec,$ivec
728 vld1
.8
{$in1},[$inp],#16
731 vorr
$dat1,$ivec,$ivec
733 vld1
.8
{$in2},[$inp],#16
736 vorr
$dat2,$ivec,$ivec
741 veor
$in0,$in0,$rndlast
746 veor
$in1,$in1,$rndlast
751 veor
$in2,$in2,$rndlast
754 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
757 vmov
.32 ${dat0
}[3], $tctr0
762 vmov
.32 ${dat1
}[3], $tctr1
767 vmov
.32 ${dat2
}[3], $tctr2
777 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
778 vst1
.8
{$in0},[$out],#16
779 vst1
.8
{$in1},[$out],#16
780 vst1
.8
{$in2},[$out],#16
792 vld1
.32
{q8
},[$key_],#16
798 vld1
.32
{q9
},[$key_],#16
811 vld1
.8
{$in0},[$inp],$step
823 veor
$in0,$in0,$rndlast
826 veor
$in1,$in1,$rndlast
833 vst1
.8
{$in0},[$out],#16
839 $code.=<<___
if ($flavour !~ /64/);
841 ldmia sp
!,{r4
-r10
,pc
}
843 $code.=<<___
if ($flavour =~ /64/);
848 .size
${prefix
}_ctr32_encrypt_blocks
,.-${prefix
}_ctr32_encrypt_blocks
854 ########################################
855 if ($flavour =~ /64/) { ######## 64-bit code
857 "aesd" => 0x4e285800, "aese" => 0x4e284800,
858 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
861 my ($mnemonic,$arg)=@_;
863 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
864 sprintf ".inst\t0x%08x\t//%s %s",
865 $opcode{$mnemonic}|$1|($2<<5),
869 foreach(split("\n",$code)) {
870 s/\`([^\`]*)\`/eval($1)/geo;
872 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
873 s/@\s/\/\
//o; # old->new style commentary
875 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
876 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
877 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
878 s/vmov\.i8/movi/o or # fix up legacy mnemonics
880 s/vrev32\.8/rev32/o or
883 s/^(\s+)v/$1/o or # strip off v prefix
886 # fix up remaining legacy suffixes
888 m/\],#8/o and s/\.16b/\.8b/go;
889 s/\.[ui]?32//o and s/\.16b/\.4s/go;
890 s/\.[ui]?64//o and s/\.16b/\.2d/go;
891 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
895 } else { ######## 32-bit code
897 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
898 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
901 my ($mnemonic,$arg)=@_;
903 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
904 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
905 |(($2&7)<<1) |(($2&8)<<2);
906 # since ARMv7 instructions are always encoded little-endian.
907 # correct solution is to use .inst directive, but older
908 # assemblers don't implement it:-(
909 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
910 $word&0xff,($word>>8)&0xff,
911 ($word>>16)&0xff,($word>>24)&0xff,
919 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
920 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
921 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
927 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
928 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
934 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
935 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
938 foreach(split("\n",$code)) {
939 s/\`([^\`]*)\`/eval($1)/geo;
941 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
942 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
943 s/\/\/\s?
/@ /o; # new->old style commentary
945 # fix up remaining new-style suffixes
946 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
949 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
950 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
951 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
952 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
953 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
955 s/^(\s+)mov\./$1mov/o or
956 s/^(\s+)ret/$1bx\tlr/o;