2 # Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. As does it support both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
# $output is the last argument iff it looks like a file name (it has a
# dot-extension); consume it from @ARGV.  The match must stay on one
# line: splitting after the backslash would make it escape a newline
# instead of the dot.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
# $flavour is the first argument iff it does not look like a file name
# (contains no dot); consume it from @ARGV.
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Extract the directory of this script (trailing path separator kept)
# so the perlasm translator can be located relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Locate the perlasm translator: first look next to this script, then in
# the shared perlasm directory; bail out if neither candidate exists.
$xlate = "${dir}arm-xlate.pl";
if (! -f $xlate) {
    $xlate = "${dir}../../perlasm/arm-xlate.pl";
    -f $xlate or die "can't locate arm-xlate.pl";
}
# Pipe everything printed to OUT through arm-xlate.pl, which translates
# the perlasm source and writes the final assembly to $output.
my $pipe_cmd = "| \"$^X\" $xlate $flavour \"$output\"";
open OUT, $pipe_cmd
    or die "can't call $xlate: $!";
# Raw byte-emission directive: Windows armasm spells it DCB, GNU-style
# assemblers use .byte.
if ($flavour =~ /win/) {
    $_byte = "DCB";
}
else {
    $_byte = ".byte";
}
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___
if ($flavour !~ /64/);
84 .arch armv7
-a
// don
't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Register aliases for the set_encrypt_key code: scalar arguments use
# AArch64 x/w names ($rounds lives in w12), vector temporaries are q
# registers.  The 32-bit flavour binds q0-q3,q8-q10 instead of q0-q6,
# presumably to leave q4-q7 (the callee-saved d8-d15 range) untouched
# -- TODO confirm against AAPCS32.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 AARCH64_VALID_CALL_TARGET
124 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
125 stp x29,x30,[sp,#-16]!
145 veor $zero,$zero,$zero
146 vld1.8 {$in0},[$inp],#16
147 mov $bits,#8 // reuse $bits
148 vld1.32 {$rcon,$mask},[$ptr],#32
156 vtbl.8 $key,{$in0},$mask
157 vext.8 $tmp,$zero,$in0,#12
158 vst1.32 {$in0},[$out],#16
163 vext.8 $tmp,$zero,$tmp,#12
165 vext.8 $tmp,$zero,$tmp,#12
168 vshl.u8 $rcon,$rcon,#1
172 vld1.32 {$rcon},[$ptr]
174 vtbl.8 $key,{$in0},$mask
175 vext.8 $tmp,$zero,$in0,#12
176 vst1.32 {$in0},[$out],#16
180 vext.8 $tmp,$zero,$tmp,#12
182 vext.8 $tmp,$zero,$tmp,#12
185 vshl.u8 $rcon,$rcon,#1
188 vtbl.8 $key,{$in0},$mask
189 vext.8 $tmp,$zero,$in0,#12
190 vst1.32 {$in0},[$out],#16
194 vext.8 $tmp,$zero,$tmp,#12
196 vext.8 $tmp,$zero,$tmp,#12
200 vst1.32 {$in0},[$out]
208 vld1.8 {$in1},[$inp],#8
209 vmov.i8 $key,#8 // borrow $key
210 vst1.32 {$in0},[$out],#16
211 vsub.i8 $mask,$mask,$key // adjust the mask
214 vtbl.8 $key,{$in1},$mask
215 vext.8 $tmp,$zero,$in0,#12
217 vst1.32 {$in1},[$out],#16
220 vst1.32 {$in1},[$out],#8
226 vext.8 $tmp,$zero,$tmp,#12
228 vext.8 $tmp,$zero,$tmp,#12
231 vdup.32 $tmp,${in0}[3]
234 vext.8 $in1,$zero,$in1,#12
235 vshl.u8 $rcon,$rcon,#1
239 vst1.32 {$in0},[$out],#16
251 vst1.32 {$in0},[$out],#16
254 vtbl.8 $key,{$in1},$mask
255 vext.8 $tmp,$zero,$in0,#12
256 vst1.32 {$in1},[$out],#16
261 vext.8 $tmp,$zero,$tmp,#12
263 vext.8 $tmp,$zero,$tmp,#12
266 vshl.u8 $rcon,$rcon,#1
268 vst1.32 {$in0},[$out],#16
271 vdup.32 $key,${in0}[3] // just splat
272 vext.8 $tmp,$zero,$in1,#12
276 vext.8 $tmp,$zero,$tmp,#12
278 vext.8 $tmp,$zero,$tmp,#12
289 mov x0,$ptr // return value
290 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
292 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
294 .globl ${prefix}_set_decrypt_key
295 .type ${prefix}_set_decrypt_key,%function
297 ${prefix}_set_decrypt_key:
299 $code.=<<___ if ($flavour =~ /64/);
300 AARCH64_SIGN_LINK_REGISTER
301 stp x29,x30,[sp,#-16]!
304 $code.=<<___ if ($flavour !~ /64/);
313 sub $out,$out,#240 // restore original $out
315 add $inp,$out,x12,lsl#4 // end of key schedule
317 vld1.32 {v0.16b},[$out]
318 vld1.32 {v1.16b},[$inp]
319 vst1.32 {v0.16b},[$inp],x4
320 vst1.32 {v1.16b},[$out],#16
323 vld1.32 {v0.16b},[$out]
324 vld1.32 {v1.16b},[$inp]
327 vst1.32 {v0.16b},[$inp],x4
328 vst1.32 {v1.16b},[$out],#16
332 vld1.32 {v0.16b},[$out]
334 vst1.32 {v0.16b},[$inp]
336 eor x0,x0,x0 // return value
339 $code.=<<___ if ($flavour !~ /64/);
342 $code.=<<___ if ($flavour =~ /64/);
344 AARCH64_VALIDATE_LINK_REGISTER
348 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Single-block en/decrypt parameters.  $e/$mc select the mnemonic
# suffixes: "e"/"mc" (aese/aesmc) when encrypting, "d"/"imc"
# (aesd/aesimc) when decrypting; $dir is assumed to be set by an
# enclosing "en"/"de" loop that is not visible in this chunk.
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));	# arguments arrive in x0..x2
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));	# fourth value (q3) is discarded
360 .globl ${prefix}_${dir}crypt
361 .type ${prefix}_${dir}crypt,%function
363 ${prefix}_${dir}crypt:
365 $code.=<<___ if ($flavour =~ /64/);
366 AARCH64_VALID_CALL_TARGET
369 ldr $rounds,[$key,#240]
370 vld1.32 {$rndkey0},[$key],#16
371 vld1.8 {$inout},[$inp]
372 sub $rounds,$rounds,#2
373 vld1.32 {$rndkey1},[$key],#16
376 aes$e $inout,$rndkey0
378 vld1.32 {$rndkey0},[$key],#16
379 subs $rounds,$rounds,#2
380 aes$e $inout,$rndkey1
382 vld1.32 {$rndkey1},[$key],#16
385 aes$e $inout,$rndkey0
387 vld1.32 {$rndkey0},[$key]
388 aes$e $inout,$rndkey1
389 veor $inout,$inout,$rndkey0
391 vst1.8 {$inout},[$out]
393 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
400 # Performance in cycles per byte.
401 # Processed with AES-ECB different key size.
402 # It shows the value before and after optimization as below:
405 # AES-128-ECB AES-192-ECB AES-256-ECB
406 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
407 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
409 # Optimization is implemented by loop unrolling and interleaving.
410 # Commonly, we choose the unrolling factor as 5, if the input
411 # data size smaller than 5 blocks, but not smaller than 3 blocks,
412 # choose 3 as the unrolling factor.
413 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
414 # as one iteration, every loop the left size lsize -= 5*16.
415 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416 # every loop lsize -=3*16.
417 # If lsize < 3*16 bytes, treat them as the tail, interleave the
418 # two blocks AES instructions.
419 # There is one special case, if the original input data size dsize
420 # = 16 bytes, we will treat it separately to improve the
421 # performance: one independent code block without LR, FP load and
422 # store, just looks like what the original ECB implementation does.
# ECB register map: x0-x3 carry inp/out/len/key; $enc (w4) selects
# en- vs decryption at run time.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);	# short aliases for the small-size paths
### q7 last round key
### q10-q15 q7 Last 7 round keys
### q8-q9 preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));	# rebinds $tmp2 (was q6) to q9
441 if ($flavour =~ /64/) {
442 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
446 .globl ${prefix}_ecb_encrypt
447 .type ${prefix}_ecb_encrypt,%function
449 ${prefix}_ecb_encrypt:
451 $code.=<<___ if ($flavour =~ /64/);
452 AARCH64_VALID_CALL_TARGET
454 // Original input data size bigger than 16, jump to big size processing.
456 vld1.8 {$dat0},[$inp]
457 cmp $enc,#0 // en- or decrypting?
458 ldr $rounds,[$key,#240]
459 vld1.32 {q5-q6},[$key],#32 // load key schedule...
464 vld1.32 {q8-q9},[$key],#32 // load key schedule...
467 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
472 vld1.32 {q8},[$key],#16 // load key schedule...
475 vld1.32 {q9},[$key],#16 // load key schedule...
476 subs $rounds,$rounds,#2 // bias
477 b.gt .Lecb_round_loop
479 vld1.32 {q10-q11},[$key],#32 // load key schedule...
484 vld1.32 {q12-q13},[$key],#32 // load key schedule...
489 vld1.32 {q14-q15},[$key],#32 // load key schedule...
494 vld1.32 {$rndlast},[$key]
498 veor $dat0,$dat0,$rndlast
499 vst1.8 {$dat0},[$out]
504 vld1.32 {q8-q9},[$key],#32 // load key schedule...
507 subs $rounds,$rounds,#10 // bias
509 .Lecb_dec_round_loop:
512 vld1.32 {q8},[$key],#16 // load key schedule...
515 vld1.32 {q9},[$key],#16 // load key schedule...
516 subs $rounds,$rounds,#2 // bias
517 b.gt .Lecb_dec_round_loop
519 vld1.32 {q10-q11},[$key],#32 // load key schedule...
524 vld1.32 {q12-q13},[$key],#32 // load key schedule...
529 vld1.32 {q14-q15},[$key],#32 // load key schedule...
534 vld1.32 {$rndlast},[$key]
538 veor $dat0,$dat0,$rndlast
539 vst1.8 {$dat0},[$out]
543 $code.=<<___ if ($flavour =~ /64/);
544 stp x29,x30,[sp,#-16]!
547 $code.=<<___ if ($flavour !~ /64/);
550 vstmdb sp!,{d8-d15} @ ABI specification says so
551 ldmia ip,{r4-r5} @ load remaining args
559 cmp $enc,#0 // en- or decrypting?
560 ldr $rounds,[$key,#240]
562 vld1.8 {$dat},[$inp],$step
564 vld1.32 {q8-q9},[$key] // load key schedule...
565 sub $rounds,$rounds,#6
566 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
567 sub $rounds,$rounds,#2
568 vld1.32 {q10-q11},[$key_],#32
569 vld1.32 {q12-q13},[$key_],#32
570 vld1.32 {q14-q15},[$key_],#32
571 vld1.32 {$rndlast},[$key_]
577 vld1.8 {$dat1},[$inp],#16
578 subs $len,$len,#32 // bias
580 vorr $in1,$dat1,$dat1
581 vorr $dat2,$dat1,$dat1
586 vld1.8 {$dat2},[$inp],#16
588 $code.=<<___ if ($flavour =~ /64/);
592 vld1.8 {$dat3},[$inp],#16
593 vld1.8 {$dat4},[$inp],#16
594 sub $len,$len,#32 // bias
608 vld1.32 {q8},[$key_],#16
620 vld1.32 {q9},[$key_],#16
633 cmp $len,#0x40 // because .Lecb_enc_tail4x
646 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
659 add $inp,$inp,x6 // $inp is adjusted in such way that
660 // at exit from the loop $dat1-$dat4
661 // are loaded with last "words"
662 add x6,$len,#0x60 // because .Lecb_enc_tail4x
709 vld1.8 {$in0},[$inp],#16
711 vld1.8 {$in1},[$inp],#16
713 vld1.8 {$in2},[$inp],#16
715 vld1.8 {$in3},[$inp],#16
717 vld1.8 {$in4},[$inp],#16
718 cbz x6,.Lecb_enc_tail4x
719 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
720 veor $tmp0,$rndlast,$dat0
722 veor $tmp1,$rndlast,$dat1
724 veor $tmp2,$rndlast,$dat2
726 veor $tmp3,$rndlast,$dat3
728 veor $tmp4,$rndlast,$dat4
729 vst1.8 {$tmp0},[$out],#16
731 vst1.8 {$tmp1},[$out],#16
733 vst1.8 {$tmp2},[$out],#16
734 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
735 vst1.8 {$tmp3},[$out],#16
736 vst1.8 {$tmp4},[$out],#16
753 veor $tmp1,$rndlast,$dat1
754 veor $tmp2,$rndlast,$dat2
755 veor $tmp3,$rndlast,$dat3
756 veor $tmp4,$rndlast,$dat4
757 vst1.8 {$tmp1},[$out],#16
758 vst1.8 {$tmp2},[$out],#16
759 vst1.8 {$tmp3},[$out],#16
760 vst1.8 {$tmp4},[$out],#16
773 vld1.32 {q8},[$key_],#16
781 vld1.32 {q9},[$key_],#16
791 mov.lo x6,$len // x6, $cnt, is zero at this point
798 add $inp,$inp,x6 // $inp is adjusted in such way that
799 // at exit from the loop $dat1-$dat2
800 // are loaded with last "words"
808 vld1.8 {$in0},[$inp],#16
815 vld1.8 {$in1},[$inp],#16
822 vld1.8 {$in2},[$inp],#16
826 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
828 veor $tmp0,$rndlast,$dat0
829 veor $tmp1,$rndlast,$dat1
830 veor $dat2,$dat2,$rndlast
831 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
832 vst1.8 {$tmp0},[$out],#16
834 vst1.8 {$tmp1},[$out],#16
836 vst1.8 {$dat2},[$out],#16
849 vld1.32 {q8},[$key_],#16
855 vld1.32 {q9},[$key_],#16
882 veor $tmp1,$rndlast,$dat1
883 veor $tmp2,$rndlast,$dat2
884 vst1.8 {$tmp1},[$out],#16
885 vst1.8 {$tmp2},[$out],#16
889 veor $tmp1,$rndlast,$dat2
890 vst1.8 {$tmp1},[$out],#16
897 vld1.8 {$dat1},[$inp],#16
898 subs $len,$len,#32 // bias
900 vorr $in1,$dat1,$dat1
901 vorr $dat2,$dat1,$dat1
906 vld1.8 {$dat2},[$inp],#16
908 $code.=<<___ if ($flavour =~ /64/);
912 vld1.8 {$dat3},[$inp],#16
913 vld1.8 {$dat4},[$inp],#16
914 sub $len,$len,#32 // bias
928 vld1.32 {q8},[$key_],#16
940 vld1.32 {q9},[$key_],#16
953 cmp $len,#0x40 // because .Lecb_tail4x
966 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
979 add $inp,$inp,x6 // $inp is adjusted in such way that
980 // at exit from the loop $dat1-$dat4
981 // are loaded with last "words"
982 add x6,$len,#0x60 // because .Lecb_tail4x
1029 vld1.8 {$in0},[$inp],#16
1031 vld1.8 {$in1},[$inp],#16
1033 vld1.8 {$in2},[$inp],#16
1035 vld1.8 {$in3},[$inp],#16
1037 vld1.8 {$in4},[$inp],#16
1039 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1040 veor $tmp0,$rndlast,$dat0
1041 vorr $dat0,$in0,$in0
1042 veor $tmp1,$rndlast,$dat1
1043 vorr $dat1,$in1,$in1
1044 veor $tmp2,$rndlast,$dat2
1045 vorr $dat2,$in2,$in2
1046 veor $tmp3,$rndlast,$dat3
1047 vorr $dat3,$in3,$in3
1048 veor $tmp4,$rndlast,$dat4
1049 vst1.8 {$tmp0},[$out],#16
1050 vorr $dat4,$in4,$in4
1051 vst1.8 {$tmp1},[$out],#16
1053 vst1.8 {$tmp2},[$out],#16
1054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1055 vst1.8 {$tmp3},[$out],#16
1056 vst1.8 {$tmp4},[$out],#16
1057 b.hs .Loop5x_ecb_dec
1063 subs $len,$len,#0x30
1064 vorr $dat0,$in2,$in2
1065 vorr $dat1,$in3,$in3
1066 vorr $dat2,$in4,$in4
1073 veor $tmp1,$rndlast,$dat1
1074 veor $tmp2,$rndlast,$dat2
1075 veor $tmp3,$rndlast,$dat3
1076 veor $tmp4,$rndlast,$dat4
1077 vst1.8 {$tmp1},[$out],#16
1078 vst1.8 {$tmp2},[$out],#16
1079 vst1.8 {$tmp3},[$out],#16
1080 vst1.8 {$tmp4},[$out],#16
1093 vld1.32 {q8},[$key_],#16
1101 vld1.32 {q9},[$key_],#16
1102 b.gt .Loop3x_ecb_dec
1110 subs $len,$len,#0x30
1111 mov.lo x6,$len // x6, $cnt, is zero at this point
1118 add $inp,$inp,x6 // $inp is adjusted in such way that
1119 // at exit from the loop $dat1-$dat2
1120 // are loaded with last "words"
1128 vld1.8 {$in0},[$inp],#16
1135 vld1.8 {$in1},[$inp],#16
1142 vld1.8 {$in2},[$inp],#16
1146 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1148 veor $tmp0,$rndlast,$dat0
1149 veor $tmp1,$rndlast,$dat1
1150 veor $dat2,$dat2,$rndlast
1151 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1152 vst1.8 {$tmp0},[$out],#16
1153 vorr $dat0,$in0,$in0
1154 vst1.8 {$tmp1},[$out],#16
1155 vorr $dat1,$in1,$in1
1156 vst1.8 {$dat2},[$out],#16
1157 vorr $dat2,$in2,$in2
1158 b.hs .Loop3x_ecb_dec
1169 vld1.32 {q8},[$key_],#16
1175 vld1.32 {q9},[$key_],#16
1202 veor $tmp1,$rndlast,$dat1
1203 veor $tmp2,$rndlast,$dat2
1204 vst1.8 {$tmp1},[$out],#16
1205 vst1.8 {$tmp2},[$out],#16
1209 veor $tmp1,$rndlast,$dat2
1210 vst1.8 {$tmp1},[$out],#16
1215 $code.=<<___ if ($flavour !~ /64/);
1217 ldmia sp!,{r4-r8,pc}
1219 $code.=<<___ if ($flavour =~ /64/);
1222 $code.=<<___ if ($flavour =~ /64/);
1227 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# CBC register map: x0-x4 carry inp/out/len/key/ivp.  $enc (w5) selects
# the direction and the same register is reused as $rounds once the
# direction has been tested.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);	# NOTE: $key5 aliases $step1 (x12)
1238 ### q8-q15 preloaded key schedule
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1244 ${prefix}_cbc_encrypt:
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1252 $code.=<<___ if ($flavour !~ /64/);
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1288 vld1.32 {$in0-$in1},[$key_]
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1302 vst1.8 {$ivec},[$out],#16
1308 vld1.32 {q8},[$key4]
1312 vld1.32 {q9},[$key5]
1317 vld1.32 {q8},[$key6]
1320 vld1.32 {q9},[$key7]
1334 vld1.8 {q8},[$inp],$step
1337 veor q8,q8,$rndzero_n_last
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1344 veor $ivec,$dat,$rndlast
1347 vst1.8 {$ivec},[$out],#16
1352 vld1.32 {$in0-$in1},[$key_]
1355 b .Lenter_cbc_enc128
1359 vst1.8 {$ivec},[$out],#16
1373 vld1.8 {q8},[$inp],$step
1380 veor q8,q8,$rndzero_n_last
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1385 vst1.8 {$ivec},[$out],#16
# Extra vector registers for the multi-block CBC-decrypt path;
# $tmp2 is rebound from q6 to q9 here.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1414 $code.=<<___ if ($flavour =~ /64/);
1416 b.lo .Loop3x_cbc_dec
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1536 veor $tmp0,$ivec,$rndlast
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1610 vld1.32 {q8},[$key_],#16
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1649 vld1.8 {$in0},[$inp],#16
1656 vld1.8 {$in1},[$inp],#16
1663 vld1.8 {$in2},[$inp],#16
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1690 vld1.32 {q8},[$key_],#16
1696 vld1.32 {q9},[$key_],#16
1716 veor $tmp1,$ivec,$rndlast
1721 veor $tmp2,$in1,$rndlast
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1738 vst1.8 {$ivec},[$ivp]
1742 $code.=<<___ if ($flavour !~ /64/);
1744 ldmia sp!,{r4-r8,pc}
1746 $code.=<<___ if ($flavour =~ /64/);
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# CTR register map: x0-x4 carry inp/out/len/key/ivp.  The counter word
# (loaded from iv offset 12) and its per-block copies live in w
# registers so they can be bumped with plain integer adds and inserted
# into lane 3 of the vector blocks via vmov.32.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";	# aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));	# rebinds $tmp2 (was q6) to q9
# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));	# only the first four (q16-q19) are bound
my ($dat,$tmp)=($dat0,$tmp0);
1768 ### q8-q15 preloaded key schedule
1771 .globl ${prefix}_ctr32_encrypt_blocks
1772 .type ${prefix}_ctr32_encrypt_blocks,%function
1774 ${prefix}_ctr32_encrypt_blocks:
1776 $code.=<<___ if ($flavour =~ /64/);
1777 AARCH64_VALID_CALL_TARGET
1778 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1779 stp x29,x30,[sp,#-16]!
1782 $code.=<<___ if ($flavour !~ /64/);
1784 stmdb sp!,{r4-r10,lr}
1785 vstmdb sp!,{d8-d15} @ ABI specification says so
1786 ldr r4, [ip] @ load remaining arg
1789 ldr $rounds,[$key,#240]
1791 ldr $ctr, [$ivp, #12]
1793 vld1.8 {$dat0},[$ivp]
1795 vld1.32 {$dat0},[$ivp]
1797 vld1.32 {q8-q9},[$key] // load key schedule...
1798 sub $rounds,$rounds,#4
1801 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1802 sub $rounds,$rounds,#2
1803 vld1.32 {q12-q13},[$key_],#32
1804 vld1.32 {q14-q15},[$key_],#32
1805 vld1.32 {$rndlast},[$key_]
1813 $code.=<<___ if ($flavour =~ /64/);
1814 vorr $dat1,$dat0,$dat0
1815 add $tctr1, $ctr, #1
1816 vorr $dat2,$dat0,$dat0
1818 vorr $ivec,$dat0,$dat0
1820 vmov.32 ${dat1}[3],$tctr1
1823 sub $len,$len,#3 // bias
1824 vmov.32 ${dat2}[3],$tctr2
1826 $code.=<<___ if ($flavour !~ /64/);
1827 add $tctr1, $ctr, #1
1828 vorr $ivec,$dat0,$dat0
1830 vmov.32 ${ivec}[3],$tctr1
1832 vorr $dat1,$ivec,$ivec
1835 vmov.32 ${ivec}[3],$tctr2
1836 sub $len,$len,#3 // bias
1837 vorr $dat2,$ivec,$ivec
1839 $code.=<<___ if ($flavour =~ /64/);
1845 vorr $dat3,$dat0,$dat0
1847 vorr $dat4,$dat0,$dat0
1849 vmov.32 ${dat3}[3],w13
1850 sub $len,$len,#2 // bias
1851 vmov.32 ${dat4}[3],w14
1867 vld1.32 {q8},[$key_],#16
1879 vld1.32 {q9},[$key_],#16
1893 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1905 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1941 vld1.8 {$in0},[$inp],#16
1944 vld1.8 {$in1},[$inp],#16
1947 vld1.8 {$in2},[$inp],#16
1950 vld1.8 {$in3},[$inp],#16
1953 vld1.8 {$in4},[$inp],#16
1956 veor $in0,$in0,$rndlast
1958 veor $in1,$in1,$rndlast
1960 veor $in2,$in2,$rndlast
1962 veor $in3,$in3,$rndlast
1964 veor $in4,$in4,$rndlast
1966 veor $in0,$in0,$dat0
1967 vorr $dat0,$ivec,$ivec
1968 veor $in1,$in1,$dat1
1969 vorr $dat1,$ivec,$ivec
1970 veor $in2,$in2,$dat2
1971 vorr $dat2,$ivec,$ivec
1972 veor $in3,$in3,$dat3
1973 vorr $dat3,$ivec,$ivec
1974 veor $in4,$in4,$dat4
1975 vorr $dat4,$ivec,$ivec
1977 vst1.8 {$in0},[$out],#16
1978 vmov.32 ${dat0}[3],$tctr0
1979 vst1.8 {$in1},[$out],#16
1980 vmov.32 ${dat1}[3],$tctr1
1981 vst1.8 {$in2},[$out],#16
1982 vmov.32 ${dat2}[3],$tctr2
1983 vst1.8 {$in3},[$out],#16
1984 vmov.32 ${dat3}[3],w13
1985 vst1.8 {$in4},[$out],#16
1986 vmov.32 ${dat4}[3],w14
1989 cbz $len,.Lctr32_done
2003 sub $len,$len,#3 // bias
2017 vld1.32 {q8},[$key_],#16
2025 vld1.32 {q9},[$key_],#16
2032 vld1.8 {$in0},[$inp],#16
2034 $code.=<<___ if ($flavour =~ /64/);
2035 vorr $dat0,$ivec,$ivec
2037 $code.=<<___ if ($flavour !~ /64/);
2043 vld1.8 {$in1},[$inp],#16
2045 $code.=<<___ if ($flavour =~ /64/);
2046 vorr $dat1,$ivec,$ivec
2048 $code.=<<___ if ($flavour !~ /64/);
2056 vld1.8 {$in2},[$inp],#16
2061 $code.=<<___ if ($flavour =~ /64/);
2062 vorr $dat2,$ivec,$ivec
2070 veor $in0,$in0,$rndlast
2074 veor $in1,$in1,$rndlast
2080 veor $in2,$in2,$rndlast
2082 $code.=<<___ if ($flavour =~ /64/);
2086 vmov.32 ${dat0}[3], $tctr0
2088 $code.=<<___ if ($flavour !~ /64/);
2089 vmov.32 ${ivec}[3], $tctr0
2092 vorr $dat0,$ivec,$ivec
2099 $code.=<<___ if ($flavour !~ /64/);
2100 vmov.32 ${ivec}[3], $tctr1
2107 $code.=<<___ if ($flavour =~ /64/);
2108 vmov.32 ${dat1}[3], $tctr1
2112 vmov.32 ${dat2}[3], $tctr2
2114 $code.=<<___ if ($flavour !~ /64/);
2115 vorr $dat1,$ivec,$ivec
2116 vmov.32 ${ivec}[3], $tctr2
2119 vorr $dat2,$ivec,$ivec
2127 veor $in0,$in0,$tmp0
2128 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2129 vst1.8 {$in0},[$out],#16
2130 veor $in1,$in1,$tmp1
2132 vst1.8 {$in1},[$out],#16
2133 veor $in2,$in2,$tmp2
2134 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2135 vst1.8 {$in2},[$out],#16
2149 vld1.32 {q8},[$key_],#16
2155 vld1.32 {q9},[$key_],#16
2166 vld1.8 {$in0},[$inp],$step
2171 vld1.8 {$in1},[$inp]
2176 veor $in0,$in0,$rndlast
2181 veor $in1,$in1,$rndlast
2186 veor $in0,$in0,$dat0
2187 veor $in1,$in1,$dat1
2188 vst1.8 {$in0},[$out],#16
2190 vst1.8 {$in1},[$out]
2194 $code.=<<___ if ($flavour !~ /64/);
2196 ldmia sp!,{r4-r10,pc}
2198 $code.=<<___ if ($flavour =~ /64/);
2203 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2206 # Performance in cycles per byte.
2207 # Processed with AES-XTS different key size.
2208 # It shows the value before and after optimization as below:
2211 # AES-128-XTS AES-256-XTS
2212 # Cortex-A57 3.36/1.09 4.02/1.37
2213 # Cortex-A72 3.03/1.02 3.28/1.33
2215 # Optimization is implemented by loop unrolling and interleaving.
2216 # Commonly, we choose the unrolling factor as 5, if the input
2217 # data size smaller than 5 blocks, but not smaller than 3 blocks,
2218 # choose 3 as the unrolling factor.
2219 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2220 # as one iteration, every loop the left size lsize -= 5*16.
2221 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2222 # will be processed specially, which be integrated into the 5*16 bytes
2223 # loop to improve the efficiency.
2224 # There is one special case, if the original input data size dsize
2225 # = 16 bytes, we will treat it separately to improve the
2226 # performance: one independent code block without LR, FP load and
2228 # Encryption will process the (length -tailcnt) bytes as mentioned
2229 # previously, then encrypt the composite block as last second
2231 # Decryption will process the (length -tailcnt -1) bytes as mentioned
2232 # previously, then decrypt the last second cipher block to get the
2233 # last plain block(tail), decrypt the composite block as last second
# XTS register map.  $key1 holds the data-key schedule and $key2 the
# tweak-key schedule ($key2 encrypts the iv to form the first XEX
# tweak).  The running tweak is mirrored in GPR pair $ivl/$ivh so the
# per-block tweak update can be done with extr/eor integer arithmetic
# (looks like the standard XTS doubling -- confirm $constnum carries
# the 0x87 reduction constant; its initialization is not visible here).
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");	# $tailcnt = length%16
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");	# five tweaks for the 5x interleave
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");	# 64-bit halves of the tweak registers
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# q10-q15, q7 Last 7 round keys
# q8-q9 preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
2260 if ($flavour =~ /64/) {
2261 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2264 $code.=<<___ if ($flavour =~ /64/);
2265 .globl ${prefix}_xts_encrypt
2266 .type ${prefix}_xts_encrypt,%function
2268 ${prefix}_xts_encrypt:
2270 $code.=<<___ if ($flavour =~ /64/);
2271 AARCH64_VALID_CALL_TARGET
2273 // Original input data size bigger than 16, jump to big size processing.
2274 b.ne .Lxts_enc_big_size
2275 // Encrypt the iv with key2, as the first XEX iv.
2276 ldr $rounds,[$key2,#240]
2277 vld1.32 {$dat},[$key2],#16
2278 vld1.8 {$iv0},[$ivp]
2279 sub $rounds,$rounds,#2
2280 vld1.32 {$dat1},[$key2],#16
2285 vld1.32 {$dat},[$key2],#16
2286 subs $rounds,$rounds,#2
2289 vld1.32 {$dat1},[$key2],#16
2290 b.gt .Loop_enc_iv_enc
2294 vld1.32 {$dat},[$key2]
2298 vld1.8 {$dat0},[$inp]
2299 veor $dat0,$iv0,$dat0
2301 ldr $rounds,[$key1,#240]
2302 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2306 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2309 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2311 .Lxts_enc_round_loop:
2314 vld1.32 {q8},[$key1],#16 // load key schedule...
2317 vld1.32 {q9},[$key1],#16 // load key schedule...
2318 subs $rounds,$rounds,#2 // bias
2319 b.gt .Lxts_enc_round_loop
2321 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2326 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2331 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2336 vld1.32 {$rndlast},[$key1]
2340 veor $dat0,$dat0,$rndlast
2341 veor $dat0,$dat0,$iv0
2342 vst1.8 {$dat0},[$out]
2343 b .Lxts_enc_final_abort
2348 $code.=<<___ if ($flavour =~ /64/);
2349 stp $constnumx,$tmpinp,[sp,#-64]!
2350 stp $tailcnt,$midnumx,[sp,#48]
2351 stp $ivd10,$ivd20,[sp,#32]
2352 stp $ivd30,$ivd40,[sp,#16]
2354 // tailcnt store the tail value of length%16.
2355 and $tailcnt,$len,#0xf
2360 csel $step,xzr,$step,eq
2362 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2363 ldr $rounds,[$key2,#240]
2364 vld1.32 {$dat},[$key2],#16
2365 vld1.8 {$iv0},[$ivp]
2366 sub $rounds,$rounds,#2
2367 vld1.32 {$dat1},[$key2],#16
2372 vld1.32 {$dat},[$key2],#16
2373 subs $rounds,$rounds,#2
2376 vld1.32 {$dat1},[$key2],#16
2381 vld1.32 {$dat},[$key2]
2385 // The iv for second block
2386 // $ivl- iv(low), $ivh - iv(high)
2387 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2391 extr $midnumx,$ivh,$ivh,#32
2392 extr $ivh,$ivh,$ivl,#63
2393 and $tmpmw,$constnum,$midnum,asr#31
2394 eor $ivl,$tmpmx,$ivl,lsl#1
2398 ldr $rounds0,[$key1,#240] // next starting point
2399 vld1.8 {$dat},[$inp],$step
2401 vld1.32 {q8-q9},[$key1] // load key schedule...
2402 sub $rounds0,$rounds0,#6
2403 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2404 sub $rounds0,$rounds0,#2
2405 vld1.32 {q10-q11},[$key_],#32
2406 vld1.32 {q12-q13},[$key_],#32
2407 vld1.32 {q14-q15},[$key_],#32
2408 vld1.32 {$rndlast},[$key_]
2411 mov $rounds,$rounds0
2415 vld1.8 {$dat2},[$inp],#16
2416 subs $len,$len,#32 // bias
2417 add $rounds,$rounds0,#2
2419 vorr $dat1,$dat,$dat
2421 vorr $in2,$dat2,$dat2
2422 vorr $in4,$dat2,$dat2
2423 b.lo .Lxts_inner_enc_tail
2424 veor $dat,$dat,$iv0 // before encryption, xor with iv
2425 veor $dat2,$dat2,$iv1
2427 // The iv for third block
2428 extr $midnumx,$ivh,$ivh,#32
2429 extr $ivh,$ivh,$ivl,#63
2430 and $tmpmw,$constnum,$midnum,asr#31
2431 eor $ivl,$tmpmx,$ivl,lsl#1
2436 vorr $dat1,$dat2,$dat2
2437 vld1.8 {$dat2},[$inp],#16
2439 vorr $in1,$dat1,$dat1
2440 veor $in2,$dat2,$iv2 // the third block
2441 veor $dat2,$dat2,$iv2
2443 b.lo .Lxts_outer_enc_tail
2445 // The iv for fourth block
2446 extr $midnumx,$ivh,$ivh,#32
2447 extr $ivh,$ivh,$ivl,#63
2448 and $tmpmw,$constnum,$midnum,asr#31
2449 eor $ivl,$tmpmx,$ivl,lsl#1
2453 vld1.8 {$dat3},[$inp],#16
2454 // The iv for fifth block
2455 extr $midnumx,$ivh,$ivh,#32
2456 extr $ivh,$ivh,$ivl,#63
2457 and $tmpmw,$constnum,$midnum,asr#31
2458 eor $ivl,$tmpmx,$ivl,lsl#1
2462 vld1.8 {$dat4},[$inp],#16
2463 veor $dat3,$dat3,$iv3 // the fourth block
2464 veor $dat4,$dat4,$iv4
2465 sub $len,$len,#32 // bias
2466 mov $rounds,$rounds0
2481 vld1.32 {q8},[$key_],#16
2482 subs $rounds,$rounds,#2
2493 vld1.32 {q9},[$key_],#16
2494 b.gt .Loop5x_xts_enc
2506 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2518 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2531 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2532 // at exit from the loop v1.16b-v26.16b
2533 // are loaded with last "words"
2534 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2580 veor $tmp0,$rndlast,$iv0
2582 // The iv for first block of one iteration
2583 extr $midnumx,$ivh,$ivh,#32
2584 extr $ivh,$ivh,$ivl,#63
2585 and $tmpmw,$constnum,$midnum,asr#31
2586 eor $ivl,$tmpmx,$ivl,lsl#1
2589 veor $tmp1,$rndlast,$iv1
2590 vld1.8 {$in0},[$inp],#16
2592 // The iv for second block
2593 extr $midnumx,$ivh,$ivh,#32
2594 extr $ivh,$ivh,$ivl,#63
2595 and $tmpmw,$constnum,$midnum,asr#31
2596 eor $ivl,$tmpmx,$ivl,lsl#1
2599 veor $tmp2,$rndlast,$iv2
2600 vld1.8 {$in1},[$inp],#16
2602 // The iv for third block
2603 extr $midnumx,$ivh,$ivh,#32
2604 extr $ivh,$ivh,$ivl,#63
2605 and $tmpmw,$constnum,$midnum,asr#31
2606 eor $ivl,$tmpmx,$ivl,lsl#1
2609 veor $tmp3,$rndlast,$iv3
2610 vld1.8 {$in2},[$inp],#16
2612 // The iv for fourth block
2613 extr $midnumx,$ivh,$ivh,#32
2614 extr $ivh,$ivh,$ivl,#63
2615 and $tmpmw,$constnum,$midnum,asr#31
2616 eor $ivl,$tmpmx,$ivl,lsl#1
2619 veor $tmp4,$rndlast,$iv4
2620 vld1.8 {$in3},[$inp],#16
2623 // The iv for fifth block
2624 extr $midnumx,$ivh,$ivh,#32
2625 extr $ivh,$ivh,$ivl,#63
2626 and $tmpmw,$constnum,$midnum,asr #31
2627 eor $ivl,$tmpmx,$ivl,lsl #1
2631 vld1.8 {$in4},[$inp],#16
2632 cbz $xoffset,.Lxts_enc_tail4x
2633 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2634 veor $tmp0,$tmp0,$dat0
2635 veor $dat0,$in0,$iv0
2636 veor $tmp1,$tmp1,$dat1
2637 veor $dat1,$in1,$iv1
2638 veor $tmp2,$tmp2,$dat2
2639 veor $dat2,$in2,$iv2
2640 veor $tmp3,$tmp3,$dat3
2641 veor $dat3,$in3,$iv3
2642 veor $tmp4,$tmp4,$dat4
2643 vst1.8 {$tmp0},[$out],#16
2644 veor $dat4,$in4,$iv4
2645 vst1.8 {$tmp1},[$out],#16
2646 mov $rounds,$rounds0
2647 vst1.8 {$tmp2},[$out],#16
2648 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2649 vst1.8 {$tmp3},[$out],#16
2650 vst1.8 {$tmp4},[$out],#16
2651 b.hs .Loop5x_xts_enc
2654 // If left 4 blocks, borrow the five block's processing
.
2656 b
.ne .Loop5x_enc_after
2663 veor
$dat0,$iv0,$in0
2664 veor
$dat1,$iv1,$in1
2665 veor
$dat2,$in2,$iv2
2666 veor
$dat3,$in3,$iv3
2667 veor
$dat4,$in4,$iv4
2668 b
.eq .Loop5x_xts_enc
2672 cbz
$len,.Lxts_enc_done
2674 add
$rounds,$rounds0,#2
2675 subs
$len,$len,#0x30
2676 b
.lo
.Lxts_inner_enc_tail
2678 veor
$dat0,$iv0,$in2
2679 veor
$dat1,$iv1,$in3
2680 veor
$dat2,$in4,$iv2
2681 b
.Lxts_outer_enc_tail
2686 veor
$tmp1,$dat1,$tmp1
2687 vst1
.8
{$tmp1},[$out],#16
2688 veor
$tmp2,$dat2,$tmp2
2689 vst1
.8
{$tmp2},[$out],#16
2690 veor
$tmp3,$dat3,$tmp3
2691 veor
$tmp4,$dat4,$tmp4
2692 vst1
.8
{$tmp3-$tmp4},[$out],#32
2696 .Lxts_outer_enc_tail
:
2703 vld1
.32
{q8
},[$key_],#16
2704 subs
$rounds,$rounds,#2
2711 vld1
.32
{q9
},[$key_],#16
2712 b
.gt .Lxts_outer_enc_tail
2720 veor
$tmp0,$iv0,$rndlast
2721 subs
$len,$len,#0x30
2722 // The iv
for first block
2725 //mov
$constnum,#0x87
2726 extr
$midnumx,$ivh,$ivh,#32
2727 extr
$ivh,$ivh,$ivl,#63
2728 and $tmpmw,$constnum,$midnum,asr
#31
2729 eor
$ivl,$tmpmx,$ivl,lsl
#1
2732 veor
$tmp1,$iv1,$rndlast
2733 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
2740 veor
$tmp2,$iv2,$rndlast
2742 add
$xoffset,$xoffset,#0x20
2743 add
$inp,$inp,$xoffset
2767 vld1
.8
{$in2},[$inp],#16
2768 add
$rounds,$rounds0,#2
2769 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
2770 veor
$tmp0,$tmp0,$dat0
2771 veor
$tmp1,$tmp1,$dat1
2772 veor
$dat2,$dat2,$tmp2
2773 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
2774 vst1
.8
{$tmp0},[$out],#16
2775 vst1
.8
{$tmp1},[$out],#16
2776 vst1
.8
{$dat2},[$out],#16
2784 .Lxts_inner_enc_tail
:
2786 veor
$dat1,$in3,$iv0
2787 veor
$dat2,$in4,$iv1
2788 b
.eq .Lxts_enc_tail_loop
2789 veor
$dat2,$in4,$iv0
2790 .Lxts_enc_tail_loop
:
2795 vld1
.32
{q8
},[$key_],#16
2796 subs
$rounds,$rounds,#2
2801 vld1
.32
{q9
},[$key_],#16
2802 b
.gt .Lxts_enc_tail_loop
2821 veor
$tmp1,$iv0,$rndlast
2826 veor
$tmp2,$iv1,$rndlast
2830 veor
$tmp1,$tmp1,$dat1
2831 vst1
.8
{$tmp1},[$out],#16
2832 veor
$tmp2,$tmp2,$dat2
2834 vst1
.8
{$tmp2},[$out],#16
2838 extr
$midnumx,$ivh,$ivh,#32
2839 extr
$ivh,$ivh,$ivl,#63
2840 and $tmpmw,$constnum,$midnum,asr
#31
2841 eor
$ivl,$tmpmx,$ivl,lsl
#1
2847 veor
$tmp1,$tmp1,$dat2
2849 vst1
.8
{$tmp1},[$out],#16
2853 extr
$midnumx,$ivh,$ivh,#32
2854 extr
$ivh,$ivh,$ivl,#63
2855 and $tmpmw,$constnum,$midnum,asr
#31
2856 eor
$ivl,$tmpmx,$ivl,lsl
#1
2862 // Process the tail block with cipher stealing
.
2869 .composite_enc_loop
:
2870 subs
$tailcnt,$tailcnt,#1
2871 ldrb
$l2outp,[$out,$tailcnt]
2872 ldrb
$loutp,[$tmpinp,$tailcnt]
2873 strb
$l2outp,[$tmpoutp,$tailcnt]
2874 strb
$loutp,[$out,$tailcnt]
2875 b
.gt .composite_enc_loop
2876 .Lxts_enc_load_done
:
2877 vld1
.8
{$tmpin},[$out]
2878 veor
$tmpin,$tmpin,$iv0
2880 // Encrypt the composite block to get the
last second encrypted text block
2881 ldr
$rounds,[$key1,#240] // load key schedule...
2882 vld1
.32
{$dat},[$key1],#16
2883 sub $rounds,$rounds,#2
2884 vld1
.32
{$dat1},[$key1],#16 // load key schedule...
2888 vld1
.32
{$dat0},[$key1],#16
2889 subs
$rounds,$rounds,#2
2892 vld1
.32
{$dat1},[$key1],#16
2893 b
.gt .Loop_final_enc
2897 vld1
.32
{$dat0},[$key1]
2899 veor
$tmpin,$tmpin,$dat0
2900 veor
$tmpin,$tmpin,$iv0
2901 vst1
.8
{$tmpin},[$out]
2904 ldp
$tailcnt,$midnumx,[sp
,#48]
2905 ldp
$ivd10,$ivd20,[sp
,#32]
2906 ldp
$ivd30,$ivd40,[sp
,#16]
2907 ldp
$constnumx,$tmpinp,[sp
],#64
2908 .Lxts_enc_final_abort
:
2910 .size
${prefix
}_xts_encrypt
,.-${prefix
}_xts_encrypt
# Register map for the ${prefix}_xts_decrypt code generated below.
# Function arguments arrive in x0-x5 (AArch64 argument registers):
# $inp/$out data pointers, $len byte count, $key1 data key, $key2 tweak
# key, $ivp initial tweak.
2915 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Round counters, key-schedule walking pointer, input step, and the
# 64-bit halves ($ivl low / $ivh high) of the current XTS tweak used by
# the extr/and/eor tweak-update sequence.
2916 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Byte-level pointers/temporaries for the ciphertext-stealing tail copy
# (ldrb/strb loop over the last partial block).
2917 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt = $len & 0xf (length of the partial tail block); $constnum
# holds the GF(2^128) reduction constant (0x87) for tweak multiplication.
# x19-x22 are callee-saved, hence the stp/ldp spills around the routine.
2918 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
# $xoffset: input-pointer adjustment for the tail paths (borrows x6).
2919 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# Primary data blocks, their saved copies, whitening temporaries, and the
# final round key, in q0-q7.
2920 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# The five tweaks used by the 5x-interleaved loop, plus $tmpin for the
# ciphertext-stealing composite block.
2921 my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# d-register (64-bit) views of the tweak vectors, used to spill/restore
# them with stp/ldp pairs on the stack.
2922 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2923 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
# Convenience aliases for the single-block (<= 16 byte) path.
2925 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2928 # q10-q15, q7 Last 7 round keys
2929 # q8-q9 preloaded round keys except last 7 keys for big size
2930 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Third data block and its scratch registers (32-bit-compatible default
# assignment; overridden with q16-q23 below in 64-bit mode).
2933 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# Fourth/fifth block registers — only named (and only usable) in the
# 64-bit build, see the $flavour check that follows.
2935 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2936 my ($dat4,$in4,$tmp4);
2937 if ($flavour =~ /64/) {
2938 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2941 $code.=<<___
if ($flavour =~ /64/);
2942 .globl
${prefix
}_xts_decrypt
2943 .type
${prefix
}_xts_decrypt
,%function
2945 ${prefix
}_xts_decrypt
:
2946 AARCH64_VALID_CALL_TARGET
2948 $code.=<<___
if ($flavour =~ /64/);
2950 // Original input data size bigger than
16, jump to big size processing
.
2951 b
.ne .Lxts_dec_big_size
2952 // Encrypt the iv with key2
, as the first XEX iv
.
2953 ldr
$rounds,[$key2,#240]
2954 vld1
.32
{$dat},[$key2],#16
2955 vld1
.8
{$iv0},[$ivp]
2956 sub $rounds,$rounds,#2
2957 vld1
.32
{$dat1},[$key2],#16
2959 .Loop_dec_small_iv_enc
:
2962 vld1
.32
{$dat},[$key2],#16
2963 subs
$rounds,$rounds,#2
2966 vld1
.32
{$dat1},[$key2],#16
2967 b
.gt .Loop_dec_small_iv_enc
2971 vld1
.32
{$dat},[$key2]
2975 vld1
.8
{$dat0},[$inp]
2976 veor
$dat0,$iv0,$dat0
2978 ldr
$rounds,[$key1,#240]
2979 vld1
.32
{q20
-q21
},[$key1],#32 // load key schedule...
2983 vld1
.32
{q8
-q9
},[$key1],#32 // load key schedule...
2986 subs
$rounds,$rounds,#10 // bias
2988 .Lxts_dec_round_loop
:
2991 vld1
.32
{q8
},[$key1],#16 // load key schedule...
2994 vld1
.32
{q9
},[$key1],#16 // load key schedule...
2995 subs
$rounds,$rounds,#2 // bias
2996 b
.gt .Lxts_dec_round_loop
2998 vld1
.32
{q10
-q11
},[$key1],#32 // load key schedule...
3003 vld1
.32
{q12
-q13
},[$key1],#32 // load key schedule...
3008 vld1
.32
{q14
-q15
},[$key1],#32 // load key schedule...
3013 vld1
.32
{$rndlast},[$key1]
3017 veor
$dat0,$dat0,$rndlast
3018 veor
$dat0,$iv0,$dat0
3019 vst1
.8
{$dat0},[$out]
3020 b
.Lxts_dec_final_abort
3023 $code.=<<___
if ($flavour =~ /64/);
3024 stp
$constnumx,$tmpinp,[sp
,#-64]!
3025 stp
$tailcnt,$midnumx,[sp
,#48]
3026 stp
$ivd10,$ivd20,[sp
,#32]
3027 stp
$ivd30,$ivd40,[sp
,#16]
3029 and $tailcnt,$len,#0xf
3033 b
.lo
.Lxts_dec_abort
3035 // Encrypt the iv with key2
, as the first XEX iv
3036 ldr
$rounds,[$key2,#240]
3037 vld1
.32
{$dat},[$key2],#16
3038 vld1
.8
{$iv0},[$ivp]
3039 sub $rounds,$rounds,#2
3040 vld1
.32
{$dat1},[$key2],#16
3045 vld1
.32
{$dat},[$key2],#16
3046 subs
$rounds,$rounds,#2
3049 vld1
.32
{$dat1},[$key2],#16
3050 b
.gt .Loop_dec_iv_enc
3054 vld1
.32
{$dat},[$key2]
3058 // The iv
for second block
3059 // $ivl- iv
(low
), $ivh - iv
(high
)
3060 // the five ivs stored into
, $iv0,$iv1,$iv2,$iv3,$iv4
3064 extr
$midnumx,$ivh,$ivh,#32
3065 extr
$ivh,$ivh,$ivl,#63
3066 and $tmpmw,$constnum,$midnum,asr
#31
3067 eor
$ivl,$tmpmx,$ivl,lsl
#1
3071 ldr
$rounds0,[$key1,#240] // load rounds number
3073 // The iv
for third block
3074 extr
$midnumx,$ivh,$ivh,#32
3075 extr
$ivh,$ivh,$ivl,#63
3076 and $tmpmw,$constnum,$midnum,asr
#31
3077 eor
$ivl,$tmpmx,$ivl,lsl
#1
3081 vld1
.32
{q8
-q9
},[$key1] // load key schedule
...
3082 sub $rounds0,$rounds0,#6
3083 add
$key_,$key1,$ivp,lsl
#4 // pointer to last 7 round keys
3084 sub $rounds0,$rounds0,#2
3085 vld1
.32
{q10
-q11
},[$key_],#32 // load key schedule...
3086 vld1
.32
{q12
-q13
},[$key_],#32
3087 vld1
.32
{q14
-q15
},[$key_],#32
3088 vld1
.32
{$rndlast},[$key_]
3090 // The iv
for fourth block
3091 extr
$midnumx,$ivh,$ivh,#32
3092 extr
$ivh,$ivh,$ivl,#63
3093 and $tmpmw,$constnum,$midnum,asr
#31
3094 eor
$ivl,$tmpmx,$ivl,lsl
#1
3099 mov
$rounds,$rounds0
3106 b
.eq .Lxts_dec_begin
3108 csel
$step,xzr
,$step,eq
3109 vld1
.8
{$dat},[$inp],#16
3113 vld1
.8
{$dat},[$inp],$step
3114 subs
$len,$len,#32 // bias
3115 add
$rounds,$rounds0,#2
3117 vorr
$dat1,$dat,$dat
3119 vld1
.8
{$dat2},[$inp],#16
3120 vorr
$in2,$dat2,$dat2
3121 vorr
$in4,$dat2,$dat2
3122 b
.lo
.Lxts_inner_dec_tail
3123 veor
$dat,$dat,$iv0 // before decryt
, xor with iv
3124 veor
$dat2,$dat2,$iv1
3126 vorr
$dat1,$dat2,$dat2
3127 vld1
.8
{$dat2},[$inp],#16
3129 vorr
$in1,$dat1,$dat1
3130 veor
$in2,$dat2,$iv2 // third block xox with third iv
3131 veor
$dat2,$dat2,$iv2
3133 b
.lo
.Lxts_outer_dec_tail
3135 vld1
.8
{$dat3},[$inp],#16
3137 // The iv
for fifth block
3138 extr
$midnumx,$ivh,$ivh,#32
3139 extr
$ivh,$ivh,$ivl,#63
3140 and $tmpmw,$constnum,$midnum,asr
#31
3141 eor
$ivl,$tmpmx,$ivl,lsl
#1
3145 vld1
.8
{$dat4},[$inp],#16
3146 veor
$dat3,$dat3,$iv3 // the fourth block
3147 veor
$dat4,$dat4,$iv4
3148 sub $len,$len,#32 // bias
3149 mov
$rounds,$rounds0
3164 vld1
.32
{q8
},[$key_],#16 // load key schedule...
3165 subs
$rounds,$rounds,#2
3176 vld1
.32
{q9
},[$key_],#16 // load key schedule...
3177 b
.gt .Loop5x_xts_dec
3189 subs
$len,$len,#0x50 // because .Lxts_dec_tail4x
3201 csel
$xoffset,xzr
,$len,gt // borrow x6
, w6
, "gt" is
not typo
3214 add
$inp,$inp,$xoffset // x0 is adjusted
in such way that
3215 // at
exit from the
loop v1
.16b
-v26
.16b
3216 // are loaded with
last "words"
3217 add
$xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3263 veor
$tmp0,$rndlast,$iv0
3265 // The iv
for first block of
next iteration
.
3266 extr
$midnumx,$ivh,$ivh,#32
3267 extr
$ivh,$ivh,$ivl,#63
3268 and $tmpmw,$constnum,$midnum,asr
#31
3269 eor
$ivl,$tmpmx,$ivl,lsl
#1
3272 veor
$tmp1,$rndlast,$iv1
3273 vld1
.8
{$in0},[$inp],#16
3275 // The iv
for second block
3276 extr
$midnumx,$ivh,$ivh,#32
3277 extr
$ivh,$ivh,$ivl,#63
3278 and $tmpmw,$constnum,$midnum,asr
#31
3279 eor
$ivl,$tmpmx,$ivl,lsl
#1
3282 veor
$tmp2,$rndlast,$iv2
3283 vld1
.8
{$in1},[$inp],#16
3285 // The iv
for third block
3286 extr
$midnumx,$ivh,$ivh,#32
3287 extr
$ivh,$ivh,$ivl,#63
3288 and $tmpmw,$constnum,$midnum,asr
#31
3289 eor
$ivl,$tmpmx,$ivl,lsl
#1
3292 veor
$tmp3,$rndlast,$iv3
3293 vld1
.8
{$in2},[$inp],#16
3295 // The iv
for fourth block
3296 extr
$midnumx,$ivh,$ivh,#32
3297 extr
$ivh,$ivh,$ivl,#63
3298 and $tmpmw,$constnum,$midnum,asr
#31
3299 eor
$ivl,$tmpmx,$ivl,lsl
#1
3302 veor
$tmp4,$rndlast,$iv4
3303 vld1
.8
{$in3},[$inp],#16
3306 // The iv
for fifth block
3307 extr
$midnumx,$ivh,$ivh,#32
3308 extr
$ivh,$ivh,$ivl,#63
3309 and $tmpmw,$constnum,$midnum,asr
#31
3310 eor
$ivl,$tmpmx,$ivl,lsl
#1
3314 vld1
.8
{$in4},[$inp],#16
3315 cbz
$xoffset,.Lxts_dec_tail4x
3316 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3317 veor
$tmp0,$tmp0,$dat0
3318 veor
$dat0,$in0,$iv0
3319 veor
$tmp1,$tmp1,$dat1
3320 veor
$dat1,$in1,$iv1
3321 veor
$tmp2,$tmp2,$dat2
3322 veor
$dat2,$in2,$iv2
3323 veor
$tmp3,$tmp3,$dat3
3324 veor
$dat3,$in3,$iv3
3325 veor
$tmp4,$tmp4,$dat4
3326 vst1
.8
{$tmp0},[$out],#16
3327 veor
$dat4,$in4,$iv4
3328 vst1
.8
{$tmp1},[$out],#16
3329 mov
$rounds,$rounds0
3330 vst1
.8
{$tmp2},[$out],#16
3331 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3332 vst1
.8
{$tmp3},[$out],#16
3333 vst1
.8
{$tmp4},[$out],#16
3334 b
.hs
.Loop5x_xts_dec
3337 b
.ne .Loop5x_dec_after
3338 // If x2
($len) equal to
-0x10, the left blocks is
4.
3339 // After specially processing
, utilize the five blocks processing again
.
3340 // It will
use the following IVs
: $iv0,$iv0,$iv1,$iv2,$iv3.
3347 veor
$dat0,$iv0,$in0
3348 veor
$dat1,$iv1,$in1
3349 veor
$dat2,$in2,$iv2
3350 veor
$dat3,$in3,$iv3
3351 veor
$dat4,$in4,$iv4
3352 b
.eq .Loop5x_xts_dec
3358 add
$rounds,$rounds0,#2
3359 subs
$len,$len,#0x30
3360 b
.lo
.Lxts_inner_dec_tail
3362 veor
$dat0,$iv0,$in2
3363 veor
$dat1,$iv1,$in3
3364 veor
$dat2,$in4,$iv2
3365 b
.Lxts_outer_dec_tail
3371 veor
$tmp1,$dat1,$tmp0
3372 vst1
.8
{$tmp1},[$out],#16
3373 veor
$tmp2,$dat2,$tmp2
3374 vst1
.8
{$tmp2},[$out],#16
3375 veor
$tmp3,$dat3,$tmp3
3376 veor
$tmp4,$dat4,$tmp4
3377 vst1
.8
{$tmp3-$tmp4},[$out],#32
3379 b
.eq .Lxts_dec_abort
3380 vld1
.8
{$dat0},[$inp],#16
3383 .Lxts_outer_dec_tail
:
3390 vld1
.32
{q8
},[$key_],#16
3391 subs
$rounds,$rounds,#2
3398 vld1
.32
{q9
},[$key_],#16
3399 b
.gt .Lxts_outer_dec_tail
3407 veor
$tmp0,$iv0,$rndlast
3408 subs
$len,$len,#0x30
3409 // The iv
for first block
3413 extr
$midnumx,$ivh,$ivh,#32
3414 extr
$ivh,$ivh,$ivl,#63
3415 and $tmpmw,$constnum,$midnum,asr
#31
3416 eor
$ivl,$tmpmx,$ivl,lsl
#1
3419 veor
$tmp1,$iv1,$rndlast
3420 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
3427 veor
$tmp2,$iv2,$rndlast
3428 // The iv
for second block
3429 extr
$midnumx,$ivh,$ivh,#32
3430 extr
$ivh,$ivh,$ivl,#63
3431 and $tmpmw,$constnum,$midnum,asr
#31
3432 eor
$ivl,$tmpmx,$ivl,lsl
#1
3436 add
$xoffset,$xoffset,#0x20
3437 add
$inp,$inp,$xoffset // $inp is adjusted to the
last data
3441 // The iv
for third block
3442 extr
$midnumx,$ivh,$ivh,#32
3443 extr
$ivh,$ivh,$ivl,#63
3444 and $tmpmw,$constnum,$midnum,asr
#31
3445 eor
$ivl,$tmpmx,$ivl,lsl
#1
3467 vld1
.8
{$in2},[$inp],#16
3471 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3472 add
$rounds,$rounds0,#2
3473 veor
$tmp0,$tmp0,$dat0
3474 veor
$tmp1,$tmp1,$dat1
3475 veor
$dat2,$dat2,$tmp2
3476 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3477 vst1
.8
{$tmp0},[$out],#16
3478 vst1
.8
{$tmp1},[$out],#16
3479 vst1
.8
{$dat2},[$out],#16
3489 .Lxts_inner_dec_tail
:
3490 // $len == -0x10 means two blocks left
.
3492 veor
$dat1,$in3,$iv0
3493 veor
$dat2,$in4,$iv1
3494 b
.eq .Lxts_dec_tail_loop
3495 veor
$dat2,$in4,$iv0
3496 .Lxts_dec_tail_loop
:
3501 vld1
.32
{q8
},[$key_],#16
3502 subs
$rounds,$rounds,#2
3507 vld1
.32
{q9
},[$key_],#16
3508 b
.gt .Lxts_dec_tail_loop
3527 veor
$tmp1,$iv0,$rndlast
3532 veor
$tmp2,$iv1,$rndlast
3536 veor
$tmp1,$tmp1,$dat1
3537 veor
$tmp2,$tmp2,$dat2
3540 vst1
.8
{$tmp1},[$out],#16
3541 vst1
.8
{$tmp2},[$out],#16
3546 veor
$tmp1,$tmp1,$dat2
3549 vst1
.8
{$tmp1},[$out],#16
3554 b
.eq .Lxts_dec_abort
3555 // Processing the
last two blocks with cipher stealing
.
3557 cbnz x2
,.Lxts_dec_1st_done
3558 vld1
.8
{$dat0},[$inp],#16
3560 // Decrypt the
last second block to get the
last plain text block
3562 eor
$tmpin,$dat0,$iv1
3563 ldr
$rounds,[$key1,#240]
3564 vld1
.32
{$dat0},[$key1],#16
3565 sub $rounds,$rounds,#2
3566 vld1
.32
{$dat1},[$key1],#16
3567 .Loop_final_2nd_dec
:
3569 aesimc
$tmpin,$tmpin
3570 vld1
.32
{$dat0},[$key1],#16 // load key schedule...
3571 subs
$rounds,$rounds,#2
3573 aesimc
$tmpin,$tmpin
3574 vld1
.32
{$dat1},[$key1],#16 // load key schedule...
3575 b
.gt .Loop_final_2nd_dec
3578 aesimc
$tmpin,$tmpin
3579 vld1
.32
{$dat0},[$key1]
3581 veor
$tmpin,$tmpin,$dat0
3582 veor
$tmpin,$tmpin,$iv1
3583 vst1
.8
{$tmpin},[$out]
3586 add
$tmpoutp,$out,#16
3588 // Composite the tailcnt
"16 byte not aligned block" into the
last second plain blocks
3589 // to get the
last encrypted block
.
3590 .composite_dec_loop
:
3591 subs
$tailcnt,$tailcnt,#1
3592 ldrb
$l2outp,[$out,$tailcnt]
3593 ldrb
$loutp,[$tmpinp,$tailcnt]
3594 strb
$l2outp,[$tmpoutp,$tailcnt]
3595 strb
$loutp,[$out,$tailcnt]
3596 b
.gt .composite_dec_loop
3597 .Lxts_dec_load_done
:
3598 vld1
.8
{$tmpin},[$out]
3599 veor
$tmpin,$tmpin,$iv0
3601 // Decrypt the composite block to get the
last second plain text block
3602 ldr
$rounds,[$key_,#240]
3603 vld1
.32
{$dat},[$key_],#16
3604 sub $rounds,$rounds,#2
3605 vld1
.32
{$dat1},[$key_],#16
3608 aesimc
$tmpin,$tmpin
3609 vld1
.32
{$dat0},[$key_],#16 // load key schedule...
3610 subs
$rounds,$rounds,#2
3612 aesimc
$tmpin,$tmpin
3613 vld1
.32
{$dat1},[$key_],#16 // load key schedule...
3614 b
.gt .Loop_final_dec
3617 aesimc
$tmpin,$tmpin
3618 vld1
.32
{$dat0},[$key_]
3620 veor
$tmpin,$tmpin,$dat0
3621 veor
$tmpin,$tmpin,$iv0
3622 vst1
.8
{$tmpin},[$out]
3625 ldp
$tailcnt,$midnumx,[sp
,#48]
3626 ldp
$ivd10,$ivd20,[sp
,#32]
3627 ldp
$ivd30,$ivd40,[sp
,#16]
3628 ldp
$constnumx,$tmpinp,[sp
],#64
3630 .Lxts_dec_final_abort
:
3632 .size
${prefix
}_xts_decrypt
,.-${prefix
}_xts_decrypt
3639 ########################################
3640 if ($flavour =~ /64/) { ######## 64-bit code
3642 "aesd" => 0x4e285800, "aese" => 0x4e284800,
3643 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3645 local *unaes
= sub {
3646 my ($mnemonic,$arg)=@_;
3648 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
3649 sprintf ".inst\t0x%08x\t//%s %s",
3650 $opcode{$mnemonic}|$1|($2<<5),
3654 foreach(split("\n",$code)) {
3655 s/\`([^\`]*)\`/eval($1)/geo;
3657 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
3658 s/@\s/\/\
//o; # old->new style commentary
3660 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3661 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
3662 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
3663 s/vmov\.i8/movi/o or # fix up legacy mnemonics
3665 s/vrev32\.8/rev32/o or
3666 s/vtst\.8/cmtst/o or
3668 s/^(\s+)v/$1/o or # strip off v prefix
3669 s/\bbx\s+lr\b/ret/o;
3671 # fix up remaining legacy suffixes
3673 m/\],#8/o and s/\.16b/\.8b/go;
3674 s/\.[ui]?32//o and s/\.16b/\.4s/go;
3675 s/\.[ui]?64//o and s/\.16b/\.2d/go;
3676 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3678 # Switch preprocessor checks to aarch64 versions.
3679 s/__ARME([BL])__/__AARCH64E$1__/go;
3683 } else { ######## 32-bit code
3685 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
3686 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3688 local *unaes
= sub {
3689 my ($mnemonic,$arg)=@_;
3691 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3692 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3693 |(($2&7)<<1) |(($2&8)<<2);
3694 # since ARMv7 instructions are always encoded little-endian.
3695 # correct solution is to use .inst directive, but older
3696 # assemblers don't implement it:-(
3697 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3698 $word&0xff,($word>>8)&0xff,
3699 ($word>>16)&0xff,($word>>24)&0xff,
3707 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3708 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
3709 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3715 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3716 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3722 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3723 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3726 foreach(split("\n",$code)) {
3727 s/\`([^\`]*)\`/eval($1)/geo;
3729 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
3730 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
3731 s/\/\/\s?
/@ /o; # new->old style commentary
3733 # fix up remaining new-style suffixes
3734 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
3737 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3738 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
3739 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
3740 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
3741 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
3742 s/^(\s+)b\./$1b/o or
3743 s/^(\s+)ret/$1bx\tlr/o;
3745 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3753 close STDOUT
or die "error closing STDOUT: $!";