2 # Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. The latter is achieved by limiting the amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
# Parse perlasm-style command-line arguments:
#   $output  - last argument, if it looks like a filename (has an extension)
#   $flavour - first argument, if it does not look like a filename
# NOTE(review): the m|...| patterns below arrive split across physical
# lines in this copy; kept byte-for-byte.
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m
|\
.\w
+$| ?
pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m
|\
.| ?
shift : undef;
# Locate this script's own directory so the arm-xlate.pl transliterator
# can be found either beside the script or under ../../perlasm.
64 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
65 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
67 die "can't locate arm-xlate.pl";
# Pipe all generated code through arm-xlate.pl, which transliterates the
# unified source into the requested 32-/64-bit flavour.
69 open OUT
,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
# armasm ("win" flavours) spells the byte directive DCB; GNU as uses .byte.
75 $_byte = ($flavour =~ /win/ ?
"DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___
if ($flavour !~ /64/);
84 .arch armv7
-a
// don
't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Register assignments for ${prefix}_set_encrypt_key:
#   integer: $inp=x0 (user key), $bits=w1 (key size), $out=x2 (schedule),
#   $ptr=x3 (constants pointer), $rounds=w12 (round counter).
# NEON temporaries: q0-q6 in 64-bit mode; q0-q3/q8-q10 in 32-bit mode,
# avoiding q4-q7 (d8-d15), which the 32-bit ABI treats as callee-saved
# (hence the vstmdb {d8-d15} spills elsewhere in this file).
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 AARCH64_VALID_CALL_TARGET
124 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
125 stp x29,x30,[sp,#-16]!
145 veor $zero,$zero,$zero
146 vld1.8 {$in0},[$inp],#16
147 mov $bits,#8 // reuse $bits
148 vld1.32 {$rcon,$mask},[$ptr],#32
156 vtbl.8 $key,{$in0},$mask
157 vext.8 $tmp,$zero,$in0,#12
158 vst1.32 {$in0},[$out],#16
163 vext.8 $tmp,$zero,$tmp,#12
165 vext.8 $tmp,$zero,$tmp,#12
168 vshl.u8 $rcon,$rcon,#1
172 vld1.32 {$rcon},[$ptr]
174 vtbl.8 $key,{$in0},$mask
175 vext.8 $tmp,$zero,$in0,#12
176 vst1.32 {$in0},[$out],#16
180 vext.8 $tmp,$zero,$tmp,#12
182 vext.8 $tmp,$zero,$tmp,#12
185 vshl.u8 $rcon,$rcon,#1
188 vtbl.8 $key,{$in0},$mask
189 vext.8 $tmp,$zero,$in0,#12
190 vst1.32 {$in0},[$out],#16
194 vext.8 $tmp,$zero,$tmp,#12
196 vext.8 $tmp,$zero,$tmp,#12
200 vst1.32 {$in0},[$out]
208 vld1.8 {$in1},[$inp],#8
209 vmov.i8 $key,#8 // borrow $key
210 vst1.32 {$in0},[$out],#16
211 vsub.i8 $mask,$mask,$key // adjust the mask
214 vtbl.8 $key,{$in1},$mask
215 vext.8 $tmp,$zero,$in0,#12
217 vst1.32 {$in1},[$out],#16
220 vst1.32 {$in1},[$out],#8
226 vext.8 $tmp,$zero,$tmp,#12
228 vext.8 $tmp,$zero,$tmp,#12
231 vdup.32 $tmp,${in0}[3]
234 vext.8 $in1,$zero,$in1,#12
235 vshl.u8 $rcon,$rcon,#1
239 vst1.32 {$in0},[$out],#16
251 vst1.32 {$in0},[$out],#16
254 vtbl.8 $key,{$in1},$mask
255 vext.8 $tmp,$zero,$in0,#12
256 vst1.32 {$in1},[$out],#16
261 vext.8 $tmp,$zero,$tmp,#12
263 vext.8 $tmp,$zero,$tmp,#12
266 vshl.u8 $rcon,$rcon,#1
268 vst1.32 {$in0},[$out],#16
271 vdup.32 $key,${in0}[3] // just splat
272 vext.8 $tmp,$zero,$in1,#12
276 vext.8 $tmp,$zero,$tmp,#12
278 vext.8 $tmp,$zero,$tmp,#12
289 mov x0,$ptr // return value
290 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
292 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
294 .globl ${prefix}_set_decrypt_key
295 .type ${prefix}_set_decrypt_key,%function
297 ${prefix}_set_decrypt_key:
299 $code.=<<___ if ($flavour =~ /64/);
300 AARCH64_SIGN_LINK_REGISTER
301 stp x29,x30,[sp,#-16]!
304 $code.=<<___ if ($flavour !~ /64/);
313 sub $out,$out,#240 // restore original $out
315 add $inp,$out,x12,lsl#4 // end of key schedule
317 vld1.32 {v0.16b},[$out]
318 vld1.32 {v1.16b},[$inp]
319 vst1.32 {v0.16b},[$inp],x4
320 vst1.32 {v1.16b},[$out],#16
323 vld1.32 {v0.16b},[$out]
324 vld1.32 {v1.16b},[$inp]
327 vst1.32 {v0.16b},[$inp],x4
328 vst1.32 {v1.16b},[$out],#16
332 vld1.32 {v0.16b},[$out]
334 vst1.32 {v0.16b},[$inp]
336 eor x0,x0,x0 // return value
339 $code.=<<___ if ($flavour !~ /64/);
342 $code.=<<___ if ($flavour =~ /64/);
344 AARCH64_VALIDATE_LINK_REGISTER
348 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Single-block en/decrypt generator: $dir selects the direction ("en"/"de");
# presumably supplied by an enclosing generator loop not visible here - TODO confirm.
# $e/$mc pick the instruction suffixes: aese/aesmc vs aesd/aesimc.
354 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# x0-x2: input block pointer, output block pointer, key schedule pointer.
355 my ($inp,$out,$key)=map("x$_",(0..2));
# q0-q2: two round-key registers plus the data block being processed.
357 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
360 .globl ${prefix}_${dir}crypt
361 .type ${prefix}_${dir}crypt,%function
363 ${prefix}_${dir}crypt:
365 $code.=<<___ if ($flavour =~ /64/);
366 AARCH64_VALID_CALL_TARGET
369 ldr $rounds,[$key,#240]
370 vld1.32 {$rndkey0},[$key],#16
371 vld1.8 {$inout},[$inp]
372 sub $rounds,$rounds,#2
373 vld1.32 {$rndkey1},[$key],#16
376 aes$e $inout,$rndkey0
378 vld1.32 {$rndkey0},[$key],#16
379 subs $rounds,$rounds,#2
380 aes$e $inout,$rndkey1
382 vld1.32 {$rndkey1},[$key],#16
385 aes$e $inout,$rndkey0
387 vld1.32 {$rndkey0},[$key]
388 aes$e $inout,$rndkey1
389 veor $inout,$inout,$rndkey0
391 vst1.8 {$inout},[$out]
393 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
400 # Performance in cycles per byte.
401 # Processed with AES-ECB different key size.
402 # It shows the value before and after optimization as below:
405 # AES-128-ECB AES-192-ECB AES-256-ECB
406 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
407 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
409 # Optimization is implemented by loop unrolling and interleaving.
410 # Commonly, we choose the unrolling factor as 5; if the input
411 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
412 # we choose 3 as the unrolling factor.
413 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
414 # as one iteration, every loop the left size lsize -= 5*16.
415 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416 # every loop lsize -=3*16.
417 # If lsize < 3*16 bytes, treat them as the tail, interleave the
418 # two blocks AES instructions.
419 # There is one special case, if the original input data size dsize
420 # = 16 bytes, we will treat it separately to improve the
421 # performance: one independent code block without LR, FP load and
422 # store, just looks like what the original ECB implementation does.
# Register assignments for ${prefix}_ecb_encrypt:
#   x0-x3: input, output, length, key; w4-w6: enc/dec flag, rounds, counter;
#   x7: pointer to the trailing round keys; x8: load step.
425 my ($inp,$out,$len,$key)=map("x$_",(0..3));
426 my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
427 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
429 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
431 ### q7 last round key
432 ### q10-q15 q7 Last 7 round keys
433 ### q8-q9 preloaded round keys except last 7 keys for big size
434 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Third data lane; $tmp2 deliberately reuses q9 here.
437 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
439 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
440 my ($dat4,$in4,$tmp4);
441 if ($flavour =~ /64/) {
442 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
446 .globl ${prefix}_ecb_encrypt
447 .type ${prefix}_ecb_encrypt,%function
449 ${prefix}_ecb_encrypt:
451 $code.=<<___ if ($flavour =~ /64/);
452 AARCH64_VALID_CALL_TARGET
454 // Original input data size bigger than 16, jump to big size processing.
456 vld1.8 {$dat0},[$inp]
457 cmp $enc,#0 // en- or decrypting?
458 ldr $rounds,[$key,#240]
459 vld1.32 {q5-q6},[$key],#32 // load key schedule...
464 vld1.32 {q8-q9},[$key],#32 // load key schedule...
467 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
472 vld1.32 {q8},[$key],#16 // load key schedule...
475 vld1.32 {q9},[$key],#16 // load key schedule...
476 subs $rounds,$rounds,#2 // bias
477 b.gt .Lecb_round_loop
479 vld1.32 {q10-q11},[$key],#32 // load key schedule...
484 vld1.32 {q12-q13},[$key],#32 // load key schedule...
489 vld1.32 {q14-q15},[$key],#32 // load key schedule...
494 vld1.32 {$rndlast},[$key]
498 veor $dat0,$dat0,$rndlast
499 vst1.8 {$dat0},[$out]
504 vld1.32 {q8-q9},[$key],#32 // load key schedule...
507 subs $rounds,$rounds,#10 // bias
509 .Lecb_dec_round_loop:
512 vld1.32 {q8},[$key],#16 // load key schedule...
515 vld1.32 {q9},[$key],#16 // load key schedule...
516 subs $rounds,$rounds,#2 // bias
517 b.gt .Lecb_dec_round_loop
519 vld1.32 {q10-q11},[$key],#32 // load key schedule...
524 vld1.32 {q12-q13},[$key],#32 // load key schedule...
529 vld1.32 {q14-q15},[$key],#32 // load key schedule...
534 vld1.32 {$rndlast},[$key]
538 veor $dat0,$dat0,$rndlast
539 vst1.8 {$dat0},[$out]
543 $code.=<<___ if ($flavour =~ /64/);
544 stp x29,x30,[sp,#-16]!
547 $code.=<<___ if ($flavour !~ /64/);
550 vstmdb sp!,{d8-d15} @ ABI specification says so
551 ldmia ip,{r4-r5} @ load remaining args
559 cmp $enc,#0 // en- or decrypting?
560 ldr $rounds,[$key,#240]
562 vld1.8 {$dat},[$inp],$step
564 vld1.32 {q8-q9},[$key] // load key schedule...
565 sub $rounds,$rounds,#6
566 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
567 sub $rounds,$rounds,#2
568 vld1.32 {q10-q11},[$key_],#32
569 vld1.32 {q12-q13},[$key_],#32
570 vld1.32 {q14-q15},[$key_],#32
571 vld1.32 {$rndlast},[$key_]
577 vld1.8 {$dat1},[$inp],#16
578 subs $len,$len,#32 // bias
580 vorr $in1,$dat1,$dat1
581 vorr $dat2,$dat1,$dat1
586 vld1.8 {$dat2},[$inp],#16
588 $code.=<<___ if ($flavour =~ /64/);
592 vld1.8 {$dat3},[$inp],#16
593 vld1.8 {$dat4},[$inp],#16
594 sub $len,$len,#32 // bias
608 vld1.32 {q8},[$key_],#16
620 vld1.32 {q9},[$key_],#16
633 cmp $len,#0x40 // because .Lecb_enc_tail4x
646 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
659 add $inp,$inp,x6 // $inp is adjusted in such way that
660 // at exit from the loop $dat1-$dat4
661 // are loaded with last "words"
662 add x6,$len,#0x60 // because .Lecb_enc_tail4x
709 vld1.8 {$in0},[$inp],#16
711 vld1.8 {$in1},[$inp],#16
713 vld1.8 {$in2},[$inp],#16
715 vld1.8 {$in3},[$inp],#16
717 vld1.8 {$in4},[$inp],#16
718 cbz x6,.Lecb_enc_tail4x
719 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
720 veor $tmp0,$rndlast,$dat0
722 veor $tmp1,$rndlast,$dat1
724 veor $tmp2,$rndlast,$dat2
726 veor $tmp3,$rndlast,$dat3
728 veor $tmp4,$rndlast,$dat4
729 vst1.8 {$tmp0},[$out],#16
731 vst1.8 {$tmp1},[$out],#16
733 vst1.8 {$tmp2},[$out],#16
734 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
735 vst1.8 {$tmp3},[$out],#16
736 vst1.8 {$tmp4},[$out],#16
753 veor $tmp1,$rndlast,$dat1
754 veor $tmp2,$rndlast,$dat2
755 veor $tmp3,$rndlast,$dat3
756 veor $tmp4,$rndlast,$dat4
757 vst1.8 {$tmp1},[$out],#16
758 vst1.8 {$tmp2},[$out],#16
759 vst1.8 {$tmp3},[$out],#16
760 vst1.8 {$tmp4},[$out],#16
773 vld1.32 {q8},[$key_],#16
781 vld1.32 {q9},[$key_],#16
791 mov.lo x6,$len // x6, $cnt, is zero at this point
798 add $inp,$inp,x6 // $inp is adjusted in such way that
799 // at exit from the loop $dat1-$dat2
800 // are loaded with last "words"
808 vld1.8 {$in0},[$inp],#16
815 vld1.8 {$in1},[$inp],#16
822 vld1.8 {$in2},[$inp],#16
826 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
828 veor $tmp0,$rndlast,$dat0
829 veor $tmp1,$rndlast,$dat1
830 veor $dat2,$dat2,$rndlast
831 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
832 vst1.8 {$tmp0},[$out],#16
834 vst1.8 {$tmp1},[$out],#16
836 vst1.8 {$dat2},[$out],#16
849 vld1.32 {q8},[$key_],#16
855 vld1.32 {q9},[$key_],#16
882 veor $tmp1,$rndlast,$dat1
883 veor $tmp2,$rndlast,$dat2
884 vst1.8 {$tmp1},[$out],#16
885 vst1.8 {$tmp2},[$out],#16
889 veor $tmp1,$rndlast,$dat2
890 vst1.8 {$tmp1},[$out],#16
897 vld1.8 {$dat1},[$inp],#16
898 subs $len,$len,#32 // bias
900 vorr $in1,$dat1,$dat1
901 vorr $dat2,$dat1,$dat1
906 vld1.8 {$dat2},[$inp],#16
908 $code.=<<___ if ($flavour =~ /64/);
912 vld1.8 {$dat3},[$inp],#16
913 vld1.8 {$dat4},[$inp],#16
914 sub $len,$len,#32 // bias
928 vld1.32 {q8},[$key_],#16
940 vld1.32 {q9},[$key_],#16
953 cmp $len,#0x40 // because .Lecb_tail4x
966 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
979 add $inp,$inp,x6 // $inp is adjusted in such way that
980 // at exit from the loop $dat1-$dat4
981 // are loaded with last "words"
982 add x6,$len,#0x60 // because .Lecb_tail4x
1029 vld1.8 {$in0},[$inp],#16
1031 vld1.8 {$in1},[$inp],#16
1033 vld1.8 {$in2},[$inp],#16
1035 vld1.8 {$in3},[$inp],#16
1037 vld1.8 {$in4},[$inp],#16
1039 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1040 veor $tmp0,$rndlast,$dat0
1041 vorr $dat0,$in0,$in0
1042 veor $tmp1,$rndlast,$dat1
1043 vorr $dat1,$in1,$in1
1044 veor $tmp2,$rndlast,$dat2
1045 vorr $dat2,$in2,$in2
1046 veor $tmp3,$rndlast,$dat3
1047 vorr $dat3,$in3,$in3
1048 veor $tmp4,$rndlast,$dat4
1049 vst1.8 {$tmp0},[$out],#16
1050 vorr $dat4,$in4,$in4
1051 vst1.8 {$tmp1},[$out],#16
1053 vst1.8 {$tmp2},[$out],#16
1054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1055 vst1.8 {$tmp3},[$out],#16
1056 vst1.8 {$tmp4},[$out],#16
1057 b.hs .Loop5x_ecb_dec
1063 subs $len,$len,#0x30
1064 vorr $dat0,$in2,$in2
1065 vorr $dat1,$in3,$in3
1066 vorr $dat2,$in4,$in4
1073 veor $tmp1,$rndlast,$dat1
1074 veor $tmp2,$rndlast,$dat2
1075 veor $tmp3,$rndlast,$dat3
1076 veor $tmp4,$rndlast,$dat4
1077 vst1.8 {$tmp1},[$out],#16
1078 vst1.8 {$tmp2},[$out],#16
1079 vst1.8 {$tmp3},[$out],#16
1080 vst1.8 {$tmp4},[$out],#16
1093 vld1.32 {q8},[$key_],#16
1101 vld1.32 {q9},[$key_],#16
1102 b.gt .Loop3x_ecb_dec
1110 subs $len,$len,#0x30
1111 mov.lo x6,$len // x6, $cnt, is zero at this point
1118 add $inp,$inp,x6 // $inp is adjusted in such way that
1119 // at exit from the loop $dat1-$dat2
1120 // are loaded with last "words"
1128 vld1.8 {$in0},[$inp],#16
1135 vld1.8 {$in1},[$inp],#16
1142 vld1.8 {$in2},[$inp],#16
1146 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1148 veor $tmp0,$rndlast,$dat0
1149 veor $tmp1,$rndlast,$dat1
1150 veor $dat2,$dat2,$rndlast
1151 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1152 vst1.8 {$tmp0},[$out],#16
1153 vorr $dat0,$in0,$in0
1154 vst1.8 {$tmp1},[$out],#16
1155 vorr $dat1,$in1,$in1
1156 vst1.8 {$dat2},[$out],#16
1157 vorr $dat2,$in2,$in2
1158 b.hs .Loop3x_ecb_dec
1169 vld1.32 {q8},[$key_],#16
1175 vld1.32 {q9},[$key_],#16
1202 veor $tmp1,$rndlast,$dat1
1203 veor $tmp2,$rndlast,$dat2
1204 vst1.8 {$tmp1},[$out],#16
1205 vst1.8 {$tmp2},[$out],#16
1209 veor $tmp1,$rndlast,$dat2
1210 vst1.8 {$tmp1},[$out],#16
1215 $code.=<<___ if ($flavour !~ /64/);
1217 ldmia sp!,{r4-r8,pc}
1219 $code.=<<___ if ($flavour =~ /64/);
1222 $code.=<<___ if ($flavour =~ /64/);
1227 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register assignments for ${prefix}_cbc_encrypt:
#   x0-x4: input, output, length, key, IV pointer; $enc=w5 (direction flag).
#   NOTE: $rounds aliases $enc (both "w5") - the flag is consumed before
#   the register is reused as the round count.
1231 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1235 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Direct pointers to round keys 4-7; $key7 reuses the schedule base register.
1236 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1238 ### q8-q15 preloaded key schedule
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1244 ${prefix}_cbc_encrypt:
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1252 $code.=<<___ if ($flavour !~ /64/);
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1288 vld1.32 {$in0-$in1},[$key_]
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1302 vst1.8 {$ivec},[$out],#16
1308 vld1.32 {q8},[$key4]
1312 vld1.32 {q9},[$key5]
1317 vld1.32 {q8},[$key6]
1320 vld1.32 {q9},[$key7]
1334 vld1.8 {q8},[$inp],$step
1337 veor q8,q8,$rndzero_n_last
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1344 veor $ivec,$dat,$rndlast
1347 vst1.8 {$ivec},[$out],#16
1352 vld1.32 {$in0-$in1},[$key_]
1355 b .Lenter_cbc_enc128
1359 vst1.8 {$ivec},[$out],#16
1373 vld1.8 {q8},[$inp],$step
1380 veor q8,q8,$rndzero_n_last
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1385 vst1.8 {$ivec},[$out],#16
# Extra data lanes for the multi-block CBC-decrypt path; $tmp2 reuses q9.
1389 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1391 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1392 my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1414 $code.=<<___ if ($flavour =~ /64/);
1416 b.lo .Loop3x_cbc_dec
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1536 veor $tmp0,$ivec,$rndlast
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1610 vld1.32 {q8},[$key_],#16
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1649 vld1.8 {$in0},[$inp],#16
1656 vld1.8 {$in1},[$inp],#16
1663 vld1.8 {$in2},[$inp],#16
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1690 vld1.32 {q8},[$key_],#16
1696 vld1.32 {q9},[$key_],#16
1716 veor $tmp1,$ivec,$rndlast
1721 veor $tmp2,$in1,$rndlast
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1738 vst1.8 {$ivec},[$ivp]
1742 $code.=<<___ if ($flavour !~ /64/);
1744 ldmia sp!,{r4-r8,pc}
1746 $code.=<<___ if ($flavour =~ /64/);
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register assignments for ${prefix}_ctr32_encrypt_blocks:
#   x0-x4: input, output, length, key, IV pointer; w5/w6: rounds/counter.
1755 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1756 my ($rounds,$cnt,$key_)=("w5","w6","x7");
# 32-bit counter words; $step (x12) overlaps $tctr2 (w12) by design.
1757 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1758 my $step="x12"; # aliases with $tctr2
1760 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1761 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1763 # used only in 64-bit mode...
1764 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1766 my ($dat,$tmp)=($dat0,$tmp0);
1768 ### q8-q15 preloaded key schedule
1771 .globl ${prefix}_ctr32_encrypt_blocks
1772 .type ${prefix}_ctr32_encrypt_blocks,%function
1774 ${prefix}_ctr32_encrypt_blocks:
1776 $code.=<<___ if ($flavour =~ /64/);
1777 AARCH64_VALID_CALL_TARGET
1778 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1779 stp x29,x30,[sp,#-16]!
1782 $code.=<<___ if ($flavour !~ /64/);
1784 stmdb sp!,{r4-r10,lr}
1785 vstmdb sp!,{d8-d15} @ ABI specification says so
1786 ldr r4, [ip] @ load remaining arg
1789 ldr $rounds,[$key,#240]
1791 ldr $ctr, [$ivp, #12]
1793 vld1.8 {$dat0},[$ivp]
1795 vld1.32 {$dat0},[$ivp]
1797 vld1.32 {q8-q9},[$key] // load key schedule...
1798 sub $rounds,$rounds,#4
1801 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1802 sub $rounds,$rounds,#2
1803 vld1.32 {q12-q13},[$key_],#32
1804 vld1.32 {q14-q15},[$key_],#32
1805 vld1.32 {$rndlast},[$key_]
1812 add $tctr1, $ctr, #1
1813 vorr $ivec,$dat0,$dat0
1815 vmov.32 ${ivec}[3],$tctr1
1817 vorr $dat1,$ivec,$ivec
1820 vmov.32 ${ivec}[3],$tctr2
1821 sub $len,$len,#3 // bias
1822 vorr $dat2,$ivec,$ivec
1824 $code.=<<___ if ($flavour =~ /64/);
1830 vorr $dat3,$dat0,$dat0
1832 vorr $dat4,$dat0,$dat0
1834 vmov.32 ${dat3}[3],w13
1835 sub $len,$len,#2 // bias
1836 vmov.32 ${dat4}[3],w14
1852 vld1.32 {q8},[$key_],#16
1864 vld1.32 {q9},[$key_],#16
1878 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1890 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1926 vld1.8 {$in0},[$inp],#16
1929 vld1.8 {$in1},[$inp],#16
1932 vld1.8 {$in2},[$inp],#16
1935 vld1.8 {$in3},[$inp],#16
1938 vld1.8 {$in4},[$inp],#16
1941 veor $in0,$in0,$rndlast
1943 veor $in1,$in1,$rndlast
1945 veor $in2,$in2,$rndlast
1947 veor $in3,$in3,$rndlast
1949 veor $in4,$in4,$rndlast
1951 veor $in0,$in0,$dat0
1952 vorr $dat0,$ivec,$ivec
1953 veor $in1,$in1,$dat1
1954 vorr $dat1,$ivec,$ivec
1955 veor $in2,$in2,$dat2
1956 vorr $dat2,$ivec,$ivec
1957 veor $in3,$in3,$dat3
1958 vorr $dat3,$ivec,$ivec
1959 veor $in4,$in4,$dat4
1960 vorr $dat4,$ivec,$ivec
1962 vst1.8 {$in0},[$out],#16
1963 vmov.32 ${dat0}[3],$tctr0
1964 vst1.8 {$in1},[$out],#16
1965 vmov.32 ${dat1}[3],$tctr1
1966 vst1.8 {$in2},[$out],#16
1967 vmov.32 ${dat2}[3],$tctr2
1968 vst1.8 {$in3},[$out],#16
1969 vmov.32 ${dat3}[3],w13
1970 vst1.8 {$in4},[$out],#16
1971 vmov.32 ${dat4}[3],w14
1974 cbz $len,.Lctr32_done
1988 sub $len,$len,#3 // bias
2002 vld1.32 {q8},[$key_],#16
2010 vld1.32 {q9},[$key_],#16
2017 vld1.8 {$in0},[$inp],#16
2021 vld1.8 {$in1},[$inp],#16
2027 vld1.8 {$in2},[$inp],#16
2035 veor $in0,$in0,$rndlast
2039 veor $in1,$in1,$rndlast
2045 veor $in2,$in2,$rndlast
2046 vmov.32 ${ivec}[3], $tctr0
2049 vorr $dat0,$ivec,$ivec
2053 vmov.32 ${ivec}[3], $tctr1
2057 vorr $dat1,$ivec,$ivec
2058 vmov.32 ${ivec}[3], $tctr2
2061 vorr $dat2,$ivec,$ivec
2067 veor $in0,$in0,$tmp0
2068 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2069 vst1.8 {$in0},[$out],#16
2070 veor $in1,$in1,$tmp1
2072 vst1.8 {$in1},[$out],#16
2073 veor $in2,$in2,$tmp2
2074 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2075 vst1.8 {$in2},[$out],#16
2089 vld1.32 {q8},[$key_],#16
2095 vld1.32 {q9},[$key_],#16
2106 vld1.8 {$in0},[$inp],$step
2111 vld1.8 {$in1},[$inp]
2116 veor $in0,$in0,$rndlast
2121 veor $in1,$in1,$rndlast
2126 veor $in0,$in0,$dat0
2127 veor $in1,$in1,$dat1
2128 vst1.8 {$in0},[$out],#16
2130 vst1.8 {$in1},[$out]
2134 $code.=<<___ if ($flavour !~ /64/);
2136 ldmia sp!,{r4-r10,pc}
2138 $code.=<<___ if ($flavour =~ /64/);
2143 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2146 # Performance in cycles per byte.
2147 # Processed with AES-XTS different key size.
2148 # It shows the value before and after optimization as below:
2151 # AES-128-XTS AES-256-XTS
2152 # Cortex-A57 3.36/1.09 4.02/1.37
2153 # Cortex-A72 3.03/1.02 3.28/1.33
2155 # Optimization is implemented by loop unrolling and interleaving.
2156 # Commonly, we choose the unrolling factor as 5; if the input
2157 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
2158 # we choose 3 as the unrolling factor.
2159 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2160 # as one iteration, every loop the left size lsize -= 5*16.
2161 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2162 # will be processed specially, which be integrated into the 5*16 bytes
2163 # loop to improve the efficiency.
2164 # There is one special case, if the original input data size dsize
2165 # = 16 bytes, we will treat it separately to improve the
2166 # performance: one independent code block without LR, FP load and
2168 # Encryption will process the (length -tailcnt) bytes as mentioned
2169 # previously, then encrypt the composite block as last second
2171 # Decryption will process the (length -tailcnt -1) bytes as mentioned
2172 # previously, then decrypt the last second cipher block to get the
2173 # last plain block(tail), decrypt the composite block as last second
# Register assignments for ${prefix}_xts_encrypt (64-bit only):
#   x0-x5: input, output, length, data key (key1), tweak key (key2), IV ptr.
#   $tailcnt=x21 holds length%16 for ciphertext stealing; $constnum/w19 and
#   $midnum/w22 serve the GF(2^128) tweak-doubling arithmetic.
2177 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2178 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2179 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2180 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2181 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2182 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2183 my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2184 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2185 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2187 my ($tmpin)=("v26.16b");
2188 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2191 # q10-q15, q7 Last 7 round keys
2192 # q8-q9 preloaded round keys except last 7 keys for big size
2193 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Extra data lanes for the unrolled XTS loops; $tmp2 reuses q9.
2196 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2198 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2199 my ($dat4,$in4,$tmp4);
2200 if ($flavour =~ /64/) {
2201 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2204 $code.=<<___ if ($flavour =~ /64/);
2205 .globl ${prefix}_xts_encrypt
2206 .type ${prefix}_xts_encrypt,%function
2208 ${prefix}_xts_encrypt:
2210 $code.=<<___ if ($flavour =~ /64/);
2211 AARCH64_VALID_CALL_TARGET
2213 // Original input data size bigger than 16, jump to big size processing.
2214 b.ne .Lxts_enc_big_size
2215 // Encrypt the iv with key2, as the first XEX iv.
2216 ldr $rounds,[$key2,#240]
2217 vld1.8 {$dat},[$key2],#16
2218 vld1.8 {$iv0},[$ivp]
2219 sub $rounds,$rounds,#2
2220 vld1.8 {$dat1},[$key2],#16
2225 vld1.32 {$dat},[$key2],#16
2226 subs $rounds,$rounds,#2
2229 vld1.32 {$dat1},[$key2],#16
2230 b.gt .Loop_enc_iv_enc
2234 vld1.32 {$dat},[$key2]
2238 vld1.8 {$dat0},[$inp]
2239 veor $dat0,$iv0,$dat0
2241 ldr $rounds,[$key1,#240]
2242 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2246 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2249 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2251 .Lxts_enc_round_loop:
2254 vld1.32 {q8},[$key1],#16 // load key schedule...
2257 vld1.32 {q9},[$key1],#16 // load key schedule...
2258 subs $rounds,$rounds,#2 // bias
2259 b.gt .Lxts_enc_round_loop
2261 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2266 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2271 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2276 vld1.32 {$rndlast},[$key1]
2280 veor $dat0,$dat0,$rndlast
2281 veor $dat0,$dat0,$iv0
2282 vst1.8 {$dat0},[$out]
2283 b .Lxts_enc_final_abort
2288 $code.=<<___ if ($flavour =~ /64/);
2289 stp $constnumx,$tmpinp,[sp,#-64]!
2290 stp $tailcnt,$midnumx,[sp,#48]
2291 stp $ivd10,$ivd20,[sp,#32]
2292 stp $ivd30,$ivd40,[sp,#16]
2294 // tailcnt store the tail value of length%16.
2295 and $tailcnt,$len,#0xf
2300 csel $step,xzr,$step,eq
2302 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2303 ldr $rounds,[$key2,#240]
2304 vld1.32 {$dat},[$key2],#16
2305 vld1.8 {$iv0},[$ivp]
2306 sub $rounds,$rounds,#2
2307 vld1.32 {$dat1},[$key2],#16
2312 vld1.32 {$dat},[$key2],#16
2313 subs $rounds,$rounds,#2
2316 vld1.32 {$dat1},[$key2],#16
2321 vld1.32 {$dat},[$key2]
2325 // The iv for second block
2326 // $ivl- iv(low), $ivh - iv(high)
2327 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2331 extr $midnumx,$ivh,$ivh,#32
2332 extr $ivh,$ivh,$ivl,#63
2333 and $tmpmw,$constnum,$midnum,asr#31
2334 eor $ivl,$tmpmx,$ivl,lsl#1
2338 ldr $rounds0,[$key1,#240] // next starting point
2339 vld1.8 {$dat},[$inp],$step
2341 vld1.32 {q8-q9},[$key1] // load key schedule...
2342 sub $rounds0,$rounds0,#6
2343 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2344 sub $rounds0,$rounds0,#2
2345 vld1.32 {q10-q11},[$key_],#32
2346 vld1.32 {q12-q13},[$key_],#32
2347 vld1.32 {q14-q15},[$key_],#32
2348 vld1.32 {$rndlast},[$key_]
2351 mov $rounds,$rounds0
2355 vld1.8 {$dat2},[$inp],#16
2356 subs $len,$len,#32 // bias
2357 add $rounds,$rounds0,#2
2359 vorr $dat1,$dat,$dat
2361 vorr $in2,$dat2,$dat2
2362 vorr $in4,$dat2,$dat2
2363 b.lo .Lxts_inner_enc_tail
2364 veor $dat,$dat,$iv0 // before encryption, xor with iv
2365 veor $dat2,$dat2,$iv1
2367 // The iv for third block
2368 extr $midnumx,$ivh,$ivh,#32
2369 extr $ivh,$ivh,$ivl,#63
2370 and $tmpmw,$constnum,$midnum,asr#31
2371 eor $ivl,$tmpmx,$ivl,lsl#1
2376 vorr $dat1,$dat2,$dat2
2377 vld1.8 {$dat2},[$inp],#16
2379 vorr $in1,$dat1,$dat1
2380 veor $in2,$dat2,$iv2 // the third block
2381 veor $dat2,$dat2,$iv2
2383 b.lo .Lxts_outer_enc_tail
2385 // The iv for fourth block
2386 extr $midnumx,$ivh,$ivh,#32
2387 extr $ivh,$ivh,$ivl,#63
2388 and $tmpmw,$constnum,$midnum,asr#31
2389 eor $ivl,$tmpmx,$ivl,lsl#1
2393 vld1.8 {$dat3},[$inp],#16
2394 // The iv for fifth block
2395 extr $midnumx,$ivh,$ivh,#32
2396 extr $ivh,$ivh,$ivl,#63
2397 and $tmpmw,$constnum,$midnum,asr#31
2398 eor $ivl,$tmpmx,$ivl,lsl#1
2402 vld1.8 {$dat4},[$inp],#16
2403 veor $dat3,$dat3,$iv3 // the fourth block
2404 veor $dat4,$dat4,$iv4
2405 sub $len,$len,#32 // bias
2406 mov $rounds,$rounds0
2421 vld1.32 {q8},[$key_],#16
2422 subs $rounds,$rounds,#2
2433 vld1.32 {q9},[$key_],#16
2434 b.gt .Loop5x_xts_enc
2446 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2458 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2471 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2472 // at exit from the loop v1.16b-v26.16b
2473 // are loaded with last "words"
2474 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2520 veor $tmp0,$rndlast,$iv0
2522 // The iv for first block of one iteration
2523 extr $midnumx,$ivh,$ivh,#32
2524 extr $ivh,$ivh,$ivl,#63
2525 and $tmpmw,$constnum,$midnum,asr#31
2526 eor $ivl,$tmpmx,$ivl,lsl#1
2529 veor $tmp1,$rndlast,$iv1
2530 vld1.8 {$in0},[$inp],#16
2532 // The iv for second block
2533 extr $midnumx,$ivh,$ivh,#32
2534 extr $ivh,$ivh,$ivl,#63
2535 and $tmpmw,$constnum,$midnum,asr#31
2536 eor $ivl,$tmpmx,$ivl,lsl#1
2539 veor $tmp2,$rndlast,$iv2
2540 vld1.8 {$in1},[$inp],#16
2542 // The iv for third block
2543 extr $midnumx,$ivh,$ivh,#32
2544 extr $ivh,$ivh,$ivl,#63
2545 and $tmpmw,$constnum,$midnum,asr#31
2546 eor $ivl,$tmpmx,$ivl,lsl#1
2549 veor $tmp3,$rndlast,$iv3
2550 vld1.8 {$in2},[$inp],#16
2552 // The iv for fourth block
2553 extr $midnumx,$ivh,$ivh,#32
2554 extr $ivh,$ivh,$ivl,#63
2555 and $tmpmw,$constnum,$midnum,asr#31
2556 eor $ivl,$tmpmx,$ivl,lsl#1
2559 veor $tmp4,$rndlast,$iv4
2560 vld1.8 {$in3},[$inp],#16
2563 // The iv for fifth block
2564 extr $midnumx,$ivh,$ivh,#32
2565 extr $ivh,$ivh,$ivl,#63
2566 and $tmpmw,$constnum,$midnum,asr #31
2567 eor $ivl,$tmpmx,$ivl,lsl #1
2571 vld1.8 {$in4},[$inp],#16
2572 cbz $xoffset,.Lxts_enc_tail4x
2573 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2574 veor $tmp0,$tmp0,$dat0
2575 veor $dat0,$in0,$iv0
2576 veor $tmp1,$tmp1,$dat1
2577 veor $dat1,$in1,$iv1
2578 veor $tmp2,$tmp2,$dat2
2579 veor $dat2,$in2,$iv2
2580 veor $tmp3,$tmp3,$dat3
2581 veor $dat3,$in3,$iv3
2582 veor $tmp4,$tmp4,$dat4
2583 vst1.8 {$tmp0},[$out],#16
2584 veor $dat4,$in4,$iv4
2585 vst1.8 {$tmp1},[$out],#16
2586 mov $rounds,$rounds0
2587 vst1.8 {$tmp2},[$out],#16
2588 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2589 vst1.8 {$tmp3},[$out],#16
2590 vst1.8 {$tmp4},[$out],#16
2591 b.hs .Loop5x_xts_enc
2594 // If left 4 blocks, borrow the five block's processing
.
2596 b
.ne .Loop5x_enc_after
2603 veor
$dat0,$iv0,$in0
2604 veor
$dat1,$iv1,$in1
2605 veor
$dat2,$in2,$iv2
2606 veor
$dat3,$in3,$iv3
2607 veor
$dat4,$in4,$iv4
2608 b
.eq .Loop5x_xts_enc
2612 cbz
$len,.Lxts_enc_done
2614 add
$rounds,$rounds0,#2
2615 subs
$len,$len,#0x30
2616 b
.lo
.Lxts_inner_enc_tail
2618 veor
$dat0,$iv0,$in2
2619 veor
$dat1,$iv1,$in3
2620 veor
$dat2,$in4,$iv2
2621 b
.Lxts_outer_enc_tail
2626 veor
$tmp1,$dat1,$tmp1
2627 vst1
.8
{$tmp1},[$out],#16
2628 veor
$tmp2,$dat2,$tmp2
2629 vst1
.8
{$tmp2},[$out],#16
2630 veor
$tmp3,$dat3,$tmp3
2631 veor
$tmp4,$dat4,$tmp4
2632 vst1
.8
{$tmp3-$tmp4},[$out],#32
2636 .Lxts_outer_enc_tail
:
2643 vld1
.32
{q8
},[$key_],#16
2644 subs
$rounds,$rounds,#2
2651 vld1
.32
{q9
},[$key_],#16
2652 b
.gt .Lxts_outer_enc_tail
2660 veor
$tmp0,$iv0,$rndlast
2661 subs
$len,$len,#0x30
2662 // The iv
for first block
2665 //mov
$constnum,#0x87
2666 extr
$midnumx,$ivh,$ivh,#32
2667 extr
$ivh,$ivh,$ivl,#63
2668 and $tmpmw,$constnum,$midnum,asr
#31
2669 eor
$ivl,$tmpmx,$ivl,lsl
#1
2672 veor
$tmp1,$iv1,$rndlast
2673 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
2680 veor
$tmp2,$iv2,$rndlast
2682 add
$xoffset,$xoffset,#0x20
2683 add
$inp,$inp,$xoffset
2707 vld1
.8
{$in2},[$inp],#16
2708 add
$rounds,$rounds0,#2
2709 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
2710 veor
$tmp0,$tmp0,$dat0
2711 veor
$tmp1,$tmp1,$dat1
2712 veor
$dat2,$dat2,$tmp2
2713 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
2714 vst1
.8
{$tmp0},[$out],#16
2715 vst1
.8
{$tmp1},[$out],#16
2716 vst1
.8
{$dat2},[$out],#16
2724 .Lxts_inner_enc_tail
:
2726 veor
$dat1,$in3,$iv0
2727 veor
$dat2,$in4,$iv1
2728 b
.eq .Lxts_enc_tail_loop
2729 veor
$dat2,$in4,$iv0
2730 .Lxts_enc_tail_loop
:
2735 vld1
.32
{q8
},[$key_],#16
2736 subs
$rounds,$rounds,#2
2741 vld1
.32
{q9
},[$key_],#16
2742 b
.gt .Lxts_enc_tail_loop
2761 veor
$tmp1,$iv0,$rndlast
2766 veor
$tmp2,$iv1,$rndlast
2770 veor
$tmp1,$tmp1,$dat1
2771 vst1
.8
{$tmp1},[$out],#16
2772 veor
$tmp2,$tmp2,$dat2
2774 vst1
.8
{$tmp2},[$out],#16
2778 extr
$midnumx,$ivh,$ivh,#32
2779 extr
$ivh,$ivh,$ivl,#63
2780 and $tmpmw,$constnum,$midnum,asr
#31
2781 eor
$ivl,$tmpmx,$ivl,lsl
#1
2787 veor
$tmp1,$tmp1,$dat2
2789 vst1
.8
{$tmp1},[$out],#16
2793 extr
$midnumx,$ivh,$ivh,#32
2794 extr
$ivh,$ivh,$ivl,#63
2795 and $tmpmw,$constnum,$midnum,asr
#31
2796 eor
$ivl,$tmpmx,$ivl,lsl
#1
2802 // Process the tail block with cipher stealing
.
2809 .composite_enc_loop
:
2810 subs
$tailcnt,$tailcnt,#1
2811 ldrb
$l2outp,[$out,$tailcnt]
2812 ldrb
$loutp,[$tmpinp,$tailcnt]
2813 strb
$l2outp,[$tmpoutp,$tailcnt]
2814 strb
$loutp,[$out,$tailcnt]
2815 b
.gt .composite_enc_loop
2816 .Lxts_enc_load_done
:
2817 vld1
.8
{$tmpin},[$out]
2818 veor
$tmpin,$tmpin,$iv0
2820 // Encrypt the composite block to get the
last second encrypted text block
2821 ldr
$rounds,[$key1,#240] // load key schedule...
2822 vld1
.8
{$dat},[$key1],#16
2823 sub $rounds,$rounds,#2
2824 vld1
.8
{$dat1},[$key1],#16 // load key schedule...
2828 vld1
.32
{$dat0},[$key1],#16
2829 subs
$rounds,$rounds,#2
2832 vld1
.32
{$dat1},[$key1],#16
2833 b
.gt .Loop_final_enc
2837 vld1
.32
{$dat0},[$key1]
2839 veor
$tmpin,$tmpin,$dat0
2840 veor
$tmpin,$tmpin,$iv0
2841 vst1
.8
{$tmpin},[$out]
2844 ldp
$tailcnt,$midnumx,[sp
,#48]
2845 ldp
$ivd10,$ivd20,[sp
,#32]
2846 ldp
$ivd30,$ivd40,[sp
,#16]
2847 ldp
$constnumx,$tmpinp,[sp
],#64
2848 .Lxts_enc_final_abort
:
2850 .size
${prefix
}_xts_encrypt
,.-${prefix
}_xts_encrypt
# Register/variable mapping for the XTS decrypt path.
# NOTE(review): the stray leading integers (leaked original line numbers,
# e.g. "2855 ") made these statements invalid Perl; they are removed here.
# Argument registers per the AAPCS64 calling convention (x0..x5).
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Scalar working registers: round counters, key pointer, input step,
# and the low/high 64-bit halves of the current tweak (iv).
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON data registers: plaintext/ciphertext blocks, staging copies,
# temporaries and the last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak values kept live across the 5x-interleaved loop.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# 64-bit views (d-register / .d[1] lane) of the tweak registers, used for
# the GF(2^128) doubling done with extr/and/eor on the scalar side.
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# q10-q15, q7 Last 7 round keys
# q8-q9 preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
2877 if ($flavour =~ /64/) {
2878 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2881 $code.=<<___
if ($flavour =~ /64/);
2882 .globl
${prefix
}_xts_decrypt
2883 .type
${prefix
}_xts_decrypt
,%function
2885 ${prefix
}_xts_decrypt
:
2886 AARCH64_VALID_CALL_TARGET
2888 $code.=<<___
if ($flavour =~ /64/);
2890 // Original input data size bigger than
16, jump to big size processing
.
2891 b
.ne .Lxts_dec_big_size
2892 // Encrypt the iv with key2
, as the first XEX iv
.
2893 ldr
$rounds,[$key2,#240]
2894 vld1
.8
{$dat},[$key2],#16
2895 vld1
.8
{$iv0},[$ivp]
2896 sub $rounds,$rounds,#2
2897 vld1
.8
{$dat1},[$key2],#16
2899 .Loop_dec_small_iv_enc
:
2902 vld1
.32
{$dat},[$key2],#16
2903 subs
$rounds,$rounds,#2
2906 vld1
.32
{$dat1},[$key2],#16
2907 b
.gt .Loop_dec_small_iv_enc
2911 vld1
.32
{$dat},[$key2]
2915 vld1
.8
{$dat0},[$inp]
2916 veor
$dat0,$iv0,$dat0
2918 ldr
$rounds,[$key1,#240]
2919 vld1
.32
{q20
-q21
},[$key1],#32 // load key schedule...
2923 vld1
.32
{q8
-q9
},[$key1],#32 // load key schedule...
2926 subs
$rounds,$rounds,#10 // bias
2928 .Lxts_dec_round_loop
:
2931 vld1
.32
{q8
},[$key1],#16 // load key schedule...
2934 vld1
.32
{q9
},[$key1],#16 // load key schedule...
2935 subs
$rounds,$rounds,#2 // bias
2936 b
.gt .Lxts_dec_round_loop
2938 vld1
.32
{q10
-q11
},[$key1],#32 // load key schedule...
2943 vld1
.32
{q12
-q13
},[$key1],#32 // load key schedule...
2948 vld1
.32
{q14
-q15
},[$key1],#32 // load key schedule...
2953 vld1
.32
{$rndlast},[$key1]
2957 veor
$dat0,$dat0,$rndlast
2958 veor
$dat0,$iv0,$dat0
2959 vst1
.8
{$dat0},[$out]
2960 b
.Lxts_dec_final_abort
2963 $code.=<<___
if ($flavour =~ /64/);
2964 stp
$constnumx,$tmpinp,[sp
,#-64]!
2965 stp
$tailcnt,$midnumx,[sp
,#48]
2966 stp
$ivd10,$ivd20,[sp
,#32]
2967 stp
$ivd30,$ivd40,[sp
,#16]
2969 and $tailcnt,$len,#0xf
2973 b
.lo
.Lxts_dec_abort
2975 // Encrypt the iv with key2
, as the first XEX iv
2976 ldr
$rounds,[$key2,#240]
2977 vld1
.8
{$dat},[$key2],#16
2978 vld1
.8
{$iv0},[$ivp]
2979 sub $rounds,$rounds,#2
2980 vld1
.8
{$dat1},[$key2],#16
2985 vld1
.32
{$dat},[$key2],#16
2986 subs
$rounds,$rounds,#2
2989 vld1
.32
{$dat1},[$key2],#16
2990 b
.gt .Loop_dec_iv_enc
2994 vld1
.32
{$dat},[$key2]
2998 // The iv
for second block
2999 // $ivl- iv
(low
), $ivh - iv
(high
)
3000 // the five ivs stored into
, $iv0,$iv1,$iv2,$iv3,$iv4
3004 extr
$midnumx,$ivh,$ivh,#32
3005 extr
$ivh,$ivh,$ivl,#63
3006 and $tmpmw,$constnum,$midnum,asr
#31
3007 eor
$ivl,$tmpmx,$ivl,lsl
#1
3011 ldr
$rounds0,[$key1,#240] // load rounds number
3013 // The iv
for third block
3014 extr
$midnumx,$ivh,$ivh,#32
3015 extr
$ivh,$ivh,$ivl,#63
3016 and $tmpmw,$constnum,$midnum,asr
#31
3017 eor
$ivl,$tmpmx,$ivl,lsl
#1
3021 vld1
.32
{q8
-q9
},[$key1] // load key schedule
...
3022 sub $rounds0,$rounds0,#6
3023 add
$key_,$key1,$ivp,lsl
#4 // pointer to last 7 round keys
3024 sub $rounds0,$rounds0,#2
3025 vld1
.32
{q10
-q11
},[$key_],#32 // load key schedule...
3026 vld1
.32
{q12
-q13
},[$key_],#32
3027 vld1
.32
{q14
-q15
},[$key_],#32
3028 vld1
.32
{$rndlast},[$key_]
3030 // The iv
for fourth block
3031 extr
$midnumx,$ivh,$ivh,#32
3032 extr
$ivh,$ivh,$ivl,#63
3033 and $tmpmw,$constnum,$midnum,asr
#31
3034 eor
$ivl,$tmpmx,$ivl,lsl
#1
3039 mov
$rounds,$rounds0
3046 b
.eq .Lxts_dec_begin
3048 csel
$step,xzr
,$step,eq
3049 vld1
.8
{$dat},[$inp],#16
3053 vld1
.8
{$dat},[$inp],$step
3054 subs
$len,$len,#32 // bias
3055 add
$rounds,$rounds0,#2
3057 vorr
$dat1,$dat,$dat
3059 vld1
.8
{$dat2},[$inp],#16
3060 vorr
$in2,$dat2,$dat2
3061 vorr
$in4,$dat2,$dat2
3062 b
.lo
.Lxts_inner_dec_tail
3063 veor
$dat,$dat,$iv0 // before decryt
, xor with iv
3064 veor
$dat2,$dat2,$iv1
3066 vorr
$dat1,$dat2,$dat2
3067 vld1
.8
{$dat2},[$inp],#16
3069 vorr
$in1,$dat1,$dat1
3070 veor
$in2,$dat2,$iv2 // third block xox with third iv
3071 veor
$dat2,$dat2,$iv2
3073 b
.lo
.Lxts_outer_dec_tail
3075 vld1
.8
{$dat3},[$inp],#16
3077 // The iv
for fifth block
3078 extr
$midnumx,$ivh,$ivh,#32
3079 extr
$ivh,$ivh,$ivl,#63
3080 and $tmpmw,$constnum,$midnum,asr
#31
3081 eor
$ivl,$tmpmx,$ivl,lsl
#1
3085 vld1
.8
{$dat4},[$inp],#16
3086 veor
$dat3,$dat3,$iv3 // the fourth block
3087 veor
$dat4,$dat4,$iv4
3088 sub $len,$len,#32 // bias
3089 mov
$rounds,$rounds0
3104 vld1
.32
{q8
},[$key_],#16 // load key schedule...
3105 subs
$rounds,$rounds,#2
3116 vld1
.32
{q9
},[$key_],#16 // load key schedule...
3117 b
.gt .Loop5x_xts_dec
3129 subs
$len,$len,#0x50 // because .Lxts_dec_tail4x
3141 csel
$xoffset,xzr
,$len,gt // borrow x6
, w6
, "gt" is
not typo
3154 add
$inp,$inp,$xoffset // x0 is adjusted
in such way that
3155 // at
exit from the
loop v1
.16b
-v26
.16b
3156 // are loaded with
last "words"
3157 add
$xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3203 veor
$tmp0,$rndlast,$iv0
3205 // The iv
for first block of
next iteration
.
3206 extr
$midnumx,$ivh,$ivh,#32
3207 extr
$ivh,$ivh,$ivl,#63
3208 and $tmpmw,$constnum,$midnum,asr
#31
3209 eor
$ivl,$tmpmx,$ivl,lsl
#1
3212 veor
$tmp1,$rndlast,$iv1
3213 vld1
.8
{$in0},[$inp],#16
3215 // The iv
for second block
3216 extr
$midnumx,$ivh,$ivh,#32
3217 extr
$ivh,$ivh,$ivl,#63
3218 and $tmpmw,$constnum,$midnum,asr
#31
3219 eor
$ivl,$tmpmx,$ivl,lsl
#1
3222 veor
$tmp2,$rndlast,$iv2
3223 vld1
.8
{$in1},[$inp],#16
3225 // The iv
for third block
3226 extr
$midnumx,$ivh,$ivh,#32
3227 extr
$ivh,$ivh,$ivl,#63
3228 and $tmpmw,$constnum,$midnum,asr
#31
3229 eor
$ivl,$tmpmx,$ivl,lsl
#1
3232 veor
$tmp3,$rndlast,$iv3
3233 vld1
.8
{$in2},[$inp],#16
3235 // The iv
for fourth block
3236 extr
$midnumx,$ivh,$ivh,#32
3237 extr
$ivh,$ivh,$ivl,#63
3238 and $tmpmw,$constnum,$midnum,asr
#31
3239 eor
$ivl,$tmpmx,$ivl,lsl
#1
3242 veor
$tmp4,$rndlast,$iv4
3243 vld1
.8
{$in3},[$inp],#16
3246 // The iv
for fifth block
3247 extr
$midnumx,$ivh,$ivh,#32
3248 extr
$ivh,$ivh,$ivl,#63
3249 and $tmpmw,$constnum,$midnum,asr
#31
3250 eor
$ivl,$tmpmx,$ivl,lsl
#1
3254 vld1
.8
{$in4},[$inp],#16
3255 cbz
$xoffset,.Lxts_dec_tail4x
3256 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3257 veor
$tmp0,$tmp0,$dat0
3258 veor
$dat0,$in0,$iv0
3259 veor
$tmp1,$tmp1,$dat1
3260 veor
$dat1,$in1,$iv1
3261 veor
$tmp2,$tmp2,$dat2
3262 veor
$dat2,$in2,$iv2
3263 veor
$tmp3,$tmp3,$dat3
3264 veor
$dat3,$in3,$iv3
3265 veor
$tmp4,$tmp4,$dat4
3266 vst1
.8
{$tmp0},[$out],#16
3267 veor
$dat4,$in4,$iv4
3268 vst1
.8
{$tmp1},[$out],#16
3269 mov
$rounds,$rounds0
3270 vst1
.8
{$tmp2},[$out],#16
3271 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3272 vst1
.8
{$tmp3},[$out],#16
3273 vst1
.8
{$tmp4},[$out],#16
3274 b
.hs
.Loop5x_xts_dec
3277 b
.ne .Loop5x_dec_after
3278 // If x2
($len) equal to
-0x10, the left blocks is
4.
3279 // After specially processing
, utilize the five blocks processing again
.
3280 // It will
use the following IVs
: $iv0,$iv0,$iv1,$iv2,$iv3.
3287 veor
$dat0,$iv0,$in0
3288 veor
$dat1,$iv1,$in1
3289 veor
$dat2,$in2,$iv2
3290 veor
$dat3,$in3,$iv3
3291 veor
$dat4,$in4,$iv4
3292 b
.eq .Loop5x_xts_dec
3298 add
$rounds,$rounds0,#2
3299 subs
$len,$len,#0x30
3300 b
.lo
.Lxts_inner_dec_tail
3302 veor
$dat0,$iv0,$in2
3303 veor
$dat1,$iv1,$in3
3304 veor
$dat2,$in4,$iv2
3305 b
.Lxts_outer_dec_tail
3310 vld1
.32
{$dat0},[$inp],#16
3311 veor
$tmp1,$dat1,$tmp0
3312 vst1
.8
{$tmp1},[$out],#16
3313 veor
$tmp2,$dat2,$tmp2
3314 vst1
.8
{$tmp2},[$out],#16
3315 veor
$tmp3,$dat3,$tmp3
3316 veor
$tmp4,$dat4,$tmp4
3317 vst1
.8
{$tmp3-$tmp4},[$out],#32
3321 .Lxts_outer_dec_tail
:
3328 vld1
.32
{q8
},[$key_],#16
3329 subs
$rounds,$rounds,#2
3336 vld1
.32
{q9
},[$key_],#16
3337 b
.gt .Lxts_outer_dec_tail
3345 veor
$tmp0,$iv0,$rndlast
3346 subs
$len,$len,#0x30
3347 // The iv
for first block
3351 extr
$midnumx,$ivh,$ivh,#32
3352 extr
$ivh,$ivh,$ivl,#63
3353 and $tmpmw,$constnum,$midnum,asr
#31
3354 eor
$ivl,$tmpmx,$ivl,lsl
#1
3357 veor
$tmp1,$iv1,$rndlast
3358 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
3365 veor
$tmp2,$iv2,$rndlast
3366 // The iv
for second block
3367 extr
$midnumx,$ivh,$ivh,#32
3368 extr
$ivh,$ivh,$ivl,#63
3369 and $tmpmw,$constnum,$midnum,asr
#31
3370 eor
$ivl,$tmpmx,$ivl,lsl
#1
3374 add
$xoffset,$xoffset,#0x20
3375 add
$inp,$inp,$xoffset // $inp is adjusted to the
last data
3379 // The iv
for third block
3380 extr
$midnumx,$ivh,$ivh,#32
3381 extr
$ivh,$ivh,$ivl,#63
3382 and $tmpmw,$constnum,$midnum,asr
#31
3383 eor
$ivl,$tmpmx,$ivl,lsl
#1
3405 vld1
.8
{$in2},[$inp],#16
3409 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3410 add
$rounds,$rounds0,#2
3411 veor
$tmp0,$tmp0,$dat0
3412 veor
$tmp1,$tmp1,$dat1
3413 veor
$dat2,$dat2,$tmp2
3414 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3415 vst1
.8
{$tmp0},[$out],#16
3416 vst1
.8
{$tmp1},[$out],#16
3417 vst1
.8
{$dat2},[$out],#16
3427 .Lxts_inner_dec_tail
:
3428 // $len == -0x10 means two blocks left
.
3430 veor
$dat1,$in3,$iv0
3431 veor
$dat2,$in4,$iv1
3432 b
.eq .Lxts_dec_tail_loop
3433 veor
$dat2,$in4,$iv0
3434 .Lxts_dec_tail_loop
:
3439 vld1
.32
{q8
},[$key_],#16
3440 subs
$rounds,$rounds,#2
3445 vld1
.32
{q9
},[$key_],#16
3446 b
.gt .Lxts_dec_tail_loop
3465 veor
$tmp1,$iv0,$rndlast
3470 veor
$tmp2,$iv1,$rndlast
3474 veor
$tmp1,$tmp1,$dat1
3475 veor
$tmp2,$tmp2,$dat2
3478 vst1
.8
{$tmp1},[$out],#16
3479 vst1
.8
{$tmp2},[$out],#16
3484 veor
$tmp1,$tmp1,$dat2
3487 vst1
.8
{$tmp1},[$out],#16
3492 b
.eq .Lxts_dec_abort
3493 // Processing the
last two blocks with cipher stealing
.
3495 cbnz x2
,.Lxts_dec_1st_done
3496 vld1
.32
{$dat0},[$inp],#16
3498 // Decrypt the
last second block to get the
last plain text block
3500 eor
$tmpin,$dat0,$iv1
3501 ldr
$rounds,[$key1,#240]
3502 vld1
.32
{$dat0},[$key1],#16
3503 sub $rounds,$rounds,#2
3504 vld1
.32
{$dat1},[$key1],#16
3505 .Loop_final_2nd_dec
:
3507 aesimc
$tmpin,$tmpin
3508 vld1
.32
{$dat0},[$key1],#16 // load key schedule...
3509 subs
$rounds,$rounds,#2
3511 aesimc
$tmpin,$tmpin
3512 vld1
.32
{$dat1},[$key1],#16 // load key schedule...
3513 b
.gt .Loop_final_2nd_dec
3516 aesimc
$tmpin,$tmpin
3517 vld1
.32
{$dat0},[$key1]
3519 veor
$tmpin,$tmpin,$dat0
3520 veor
$tmpin,$tmpin,$iv1
3521 vst1
.8
{$tmpin},[$out]
3524 add
$tmpoutp,$out,#16
3526 // Composite the tailcnt
"16 byte not aligned block" into the
last second plain blocks
3527 // to get the
last encrypted block
.
3528 .composite_dec_loop
:
3529 subs
$tailcnt,$tailcnt,#1
3530 ldrb
$l2outp,[$out,$tailcnt]
3531 ldrb
$loutp,[$tmpinp,$tailcnt]
3532 strb
$l2outp,[$tmpoutp,$tailcnt]
3533 strb
$loutp,[$out,$tailcnt]
3534 b
.gt .composite_dec_loop
3535 .Lxts_dec_load_done
:
3536 vld1
.8
{$tmpin},[$out]
3537 veor
$tmpin,$tmpin,$iv0
3539 // Decrypt the composite block to get the
last second plain text block
3540 ldr
$rounds,[$key_,#240]
3541 vld1
.8
{$dat},[$key_],#16
3542 sub $rounds,$rounds,#2
3543 vld1
.8
{$dat1},[$key_],#16
3546 aesimc
$tmpin,$tmpin
3547 vld1
.32
{$dat0},[$key_],#16 // load key schedule...
3548 subs
$rounds,$rounds,#2
3550 aesimc
$tmpin,$tmpin
3551 vld1
.32
{$dat1},[$key_],#16 // load key schedule...
3552 b
.gt .Loop_final_dec
3555 aesimc
$tmpin,$tmpin
3556 vld1
.32
{$dat0},[$key_]
3558 veor
$tmpin,$tmpin,$dat0
3559 veor
$tmpin,$tmpin,$iv0
3560 vst1
.8
{$tmpin},[$out]
3563 ldp
$tailcnt,$midnumx,[sp
,#48]
3564 ldp
$ivd10,$ivd20,[sp
,#32]
3565 ldp
$ivd30,$ivd40,[sp
,#16]
3566 ldp
$constnumx,$tmpinp,[sp
],#64
3568 .Lxts_dec_final_abort
:
3570 .size
${prefix
}_xts_decrypt
,.-${prefix
}_xts_decrypt
3577 ########################################
3578 if ($flavour =~ /64/) { ######## 64-bit code
3580 "aesd" => 0x4e285800, "aese" => 0x4e284800,
3581 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3583 local *unaes
= sub {
3584 my ($mnemonic,$arg)=@_;
3586 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
3587 sprintf ".inst\t0x%08x\t//%s %s",
3588 $opcode{$mnemonic}|$1|($2<<5),
3592 foreach(split("\n",$code)) {
3593 s/\`([^\`]*)\`/eval($1)/geo;
3595 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
3596 s/@\s/\/\
//o; # old->new style commentary
3598 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3599 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
3600 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
3601 s/vmov\.i8/movi/o or # fix up legacy mnemonics
3603 s/vrev32\.8/rev32/o or
3604 s/vtst\.8/cmtst/o or
3606 s/^(\s+)v/$1/o or # strip off v prefix
3607 s/\bbx\s+lr\b/ret/o;
3609 # fix up remaining legacy suffixes
3611 m/\],#8/o and s/\.16b/\.8b/go;
3612 s/\.[ui]?32//o and s/\.16b/\.4s/go;
3613 s/\.[ui]?64//o and s/\.16b/\.2d/go;
3614 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3616 # Switch preprocessor checks to aarch64 versions.
3617 s/__ARME([BL])__/__AARCH64E$1__/go;
3621 } else { ######## 32-bit code
3623 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
3624 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3626 local *unaes
= sub {
3627 my ($mnemonic,$arg)=@_;
3629 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3630 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3631 |(($2&7)<<1) |(($2&8)<<2);
3632 # since ARMv7 instructions are always encoded little-endian.
3633 # correct solution is to use .inst directive, but older
3634 # assemblers don't implement it:-(
3635 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3636 $word&0xff,($word>>8)&0xff,
3637 ($word>>16)&0xff,($word>>24)&0xff,
3645 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3646 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
3647 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3653 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3654 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3660 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3661 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3664 foreach(split("\n",$code)) {
3665 s/\`([^\`]*)\`/eval($1)/geo;
3667 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
3668 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
3669 s/\/\/\s?
/@ /o; # new->old style commentary
3671 # fix up remaining new-style suffixes
3672 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
3675 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3676 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
3677 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
3678 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
3679 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
3680 s/^(\s+)b\./$1b/o or
3681 s/^(\s+)ret/$1bx\tlr/o;
3683 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3691 close STDOUT
or die "error closing STDOUT: $!";