# (gitweb scrape artifact removed: page header for
#  thirdparty/openssl.git blob crypto/aes/asm/aesv8-armx.pl
#  @ 642d779b99acb37ab4526e1bc71c27c02cc5c031 — not part of the source file)
1 #! /usr/bin/env perl
2 # Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. As does it support both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
29 #
30 # April 2019
31 #
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
40 # to be 5x...
41 #
42 # Performance in cycles per byte processed with 128-bit key:
43 #
44 # CBC enc CBC dec CTR
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
53 #
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
57 # interleaved;
58
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl transliterator, first next to this script and
# then in the shared perlasm directory.  It rewrites the mixed 32-/64-bit
# mnemonics used below into the flavour requested on the command line.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything printed to STDOUT through the transliterator
# ($^X is the perl interpreter currently executing this script).
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";	# prefix for all exported symbols

# Windows ARMASM spells the byte-emitting directive "DCB".
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit: enable the Armv8 crypto extension directly.  32-bit: stay on
# armv7-a and hand-encode the AES instructions via the INST() macro,
# since the encodings differ between ARM and Thumb-2 modes.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch	armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___
97
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
102 #
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key / _set_decrypt_key.
#
# Register map: on 64-bit the seven vector temporaries are q0-q6; on
# 32-bit, q4-q7 are callee-saved, so q8-q10 stand in for the last three.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table: a 0x01 splat, a tbl-index vector performing
# "rotate word and splat" on the last key word, and the 0x1b value used
# once the shifted rcon overflows a byte.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument checks: NULL in/out pointers return -1; key sizes outside
# 128..256 bits, or not a multiple of 64, return -2.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decrypt schedule = encrypt schedule reversed in 16-byte units, with
# InvMixColumns (aesimc) applied to all but the first and last round
# keys.  The swap walks $out forward and $inp backward until they meet.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block primitives: ${prefix}_encrypt and ${prefix}_decrypt.
#
# gen_block($dir) emits one function; $dir is "en" or "de" and only
# changes the mnemonic suffixes (aese/aesmc vs aesd/aesimc) and the
# symbol name.  The round loop consumes two round keys per iteration
# ($rounds is pre-biased by 2) and the final round's AddRoundKey is
# done with a plain veor against the last schedule entry.
#
# Fix: the original declared this as "sub gen_block ()" — an empty
# prototype on a sub that takes an argument — which only compiled
# because the &gen_block(...) call syntax bypasses prototype checks.
# The prototype is dropped and the calls use plain syntax.
sub gen_block {
my $dir = shift;			# "en" or "de"
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
399
400 # Performance in cycles per byte.
401 # Processed with AES-ECB different key size.
402 # It shows the value before and after optimization as below:
403 # (before/after):
404 #
405 # AES-128-ECB AES-192-ECB AES-256-ECB
406 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
407 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
408
409 # Optimization is implemented by loop unrolling and interleaving.
# Commonly we choose 5 as the unrolling factor; if the input
# data size is smaller than 5 blocks, but not smaller than 3 blocks,
# we choose 3 as the unrolling factor.
413 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
414 # as one iteration, every loop the left size lsize -= 5*16.
415 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416 # every loop lsize -=3*16.
417 # If lsize < 3*16 bytes, treat them as the tail, interleave the
418 # two blocks AES instructions.
419 # There is one special case, if the original input data size dsize
420 # = 16 bytes, we will treat it separately to improve the
421 # performance: one independent code block without LR, FP load and
422 # store, just looks like what the original ECB implementation does.
423
{{{
# ${prefix}_ecb_encrypt(inp, out, len, key, enc): ECB for both
# directions, selected at run time by $enc.  Big inputs use 5x
# (64-bit only) and 3x interleaved round loops; a dedicated
# stackless path handles the exact-one-block case.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# 64-bit single-block fast path: no frame, no callee-saved spills.
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16	// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16	// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16	// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16	// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
# Big-size prologue: 64-bit just sets up a frame...
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# ...while 32-bit must save r4-r8/lr and d8-d15 and fetch the stacked
# arguments ($enc lives on the stack in the AAPCS32 call).
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}			@ ABI specification says so
	ldmia	ip,{r4-r5}			@ load remaining args
	subs	$len,$len,#16
___
# Common setup: preload the last 7 round keys into q10-q15/$rndlast,
# leave $key_ pointing past the first two (q8-q9) for the loops.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x interleaved encrypt loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x interleaved encrypt loop plus 1-/2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decrypt side: mirrors the structure above with aesd/aesimc.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x interleaved decrypt loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x interleaved decrypt loop plus 1-/2-block tail.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
# Epilogue: restore callee-saved state per flavour and return.
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1230 {{{
1231 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1234
1235 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1236 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1237
1238 ### q8-q15 preloaded key schedule
1239
1240 $code.=<<___;
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1243 .align 5
1244 ${prefix}_cbc_encrypt:
1245 ___
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1250 add x29,sp,#0
1251 ___
1252 $code.=<<___ if ($flavour !~ /64/);
1253 mov ip,sp
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1257 ___
1258 $code.=<<___;
1259 subs $len,$len,#16
1260 mov $step,#16
1261 b.lo .Lcbc_abort
1262 cclr $step,eq
1263
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1266 and $len,$len,#-16
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1269
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1278
1279 add $key_,$key,#32
1280 mov $cnt,$rounds
1281 b.eq .Lcbc_dec
1282
1283 cmp $rounds,#2
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1286 b.eq .Lcbc_enc128
1287
1288 vld1.32 {$in0-$in1},[$key_]
1289 add $key_,$key,#16
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1292 aese $dat,q8
1293 aesmc $dat,$dat
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1296 b .Lenter_cbc_enc
1297
1298 .align 4
1299 .Loop_cbc_enc:
1300 aese $dat,q8
1301 aesmc $dat,$dat
1302 vst1.8 {$ivec},[$out],#16
1303 .Lenter_cbc_enc:
1304 aese $dat,q9
1305 aesmc $dat,$dat
1306 aese $dat,$in0
1307 aesmc $dat,$dat
1308 vld1.32 {q8},[$key4]
1309 cmp $rounds,#4
1310 aese $dat,$in1
1311 aesmc $dat,$dat
1312 vld1.32 {q9},[$key5]
1313 b.eq .Lcbc_enc192
1314
1315 aese $dat,q8
1316 aesmc $dat,$dat
1317 vld1.32 {q8},[$key6]
1318 aese $dat,q9
1319 aesmc $dat,$dat
1320 vld1.32 {q9},[$key7]
1321 nop
1322
1323 .Lcbc_enc192:
1324 aese $dat,q8
1325 aesmc $dat,$dat
1326 subs $len,$len,#16
1327 aese $dat,q9
1328 aesmc $dat,$dat
1329 cclr $step,eq
1330 aese $dat,q10
1331 aesmc $dat,$dat
1332 aese $dat,q11
1333 aesmc $dat,$dat
1334 vld1.8 {q8},[$inp],$step
1335 aese $dat,q12
1336 aesmc $dat,$dat
1337 veor q8,q8,$rndzero_n_last
1338 aese $dat,q13
1339 aesmc $dat,$dat
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1341 aese $dat,q14
1342 aesmc $dat,$dat
1343 aese $dat,q15
1344 veor $ivec,$dat,$rndlast
1345 b.hs .Loop_cbc_enc
1346
1347 vst1.8 {$ivec},[$out],#16
1348 b .Lcbc_done
1349
1350 .align 5
1351 .Lcbc_enc128:
1352 vld1.32 {$in0-$in1},[$key_]
1353 aese $dat,q8
1354 aesmc $dat,$dat
1355 b .Lenter_cbc_enc128
1356 .Loop_cbc_enc128:
1357 aese $dat,q8
1358 aesmc $dat,$dat
1359 vst1.8 {$ivec},[$out],#16
1360 .Lenter_cbc_enc128:
1361 aese $dat,q9
1362 aesmc $dat,$dat
1363 subs $len,$len,#16
1364 aese $dat,$in0
1365 aesmc $dat,$dat
1366 cclr $step,eq
1367 aese $dat,$in1
1368 aesmc $dat,$dat
1369 aese $dat,q10
1370 aesmc $dat,$dat
1371 aese $dat,q11
1372 aesmc $dat,$dat
1373 vld1.8 {q8},[$inp],$step
1374 aese $dat,q12
1375 aesmc $dat,$dat
1376 aese $dat,q13
1377 aesmc $dat,$dat
1378 aese $dat,q14
1379 aesmc $dat,$dat
1380 veor q8,q8,$rndzero_n_last
1381 aese $dat,q15
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1384
1385 vst1.8 {$ivec},[$out],#16
1386 b .Lcbc_done
1387 ___
1388 {
1389 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1390
1391 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1392 my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1395 }
1396
1397 $code.=<<___;
1398 .align 5
1399 .Lcbc_dec:
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1402 add $cnt,$rounds,#2
1403 vorr $in1,$dat,$dat
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1406 b.lo .Lcbc_dec_tail
1407
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1410 vorr $in0,$dat,$dat
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1413 ___
1414 $code.=<<___ if ($flavour =~ /64/);
1415 cmp $len,#32
1416 b.lo .Loop3x_cbc_dec
1417
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1421 mov $cnt,$rounds
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1424
1425 .Loop5x_cbc_dec:
1426 aesd $dat0,q8
1427 aesimc $dat0,$dat0
1428 aesd $dat1,q8
1429 aesimc $dat1,$dat1
1430 aesd $dat2,q8
1431 aesimc $dat2,$dat2
1432 aesd $dat3,q8
1433 aesimc $dat3,$dat3
1434 aesd $dat4,q8
1435 aesimc $dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1437 subs $cnt,$cnt,#2
1438 aesd $dat0,q9
1439 aesimc $dat0,$dat0
1440 aesd $dat1,q9
1441 aesimc $dat1,$dat1
1442 aesd $dat2,q9
1443 aesimc $dat2,$dat2
1444 aesd $dat3,q9
1445 aesimc $dat3,$dat3
1446 aesd $dat4,q9
1447 aesimc $dat4,$dat4
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1450
1451 aesd $dat0,q8
1452 aesimc $dat0,$dat0
1453 aesd $dat1,q8
1454 aesimc $dat1,$dat1
1455 aesd $dat2,q8
1456 aesimc $dat2,$dat2
1457 aesd $dat3,q8
1458 aesimc $dat3,$dat3
1459 aesd $dat4,q8
1460 aesimc $dat4,$dat4
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1462 sub $len,$len,#0x50
1463
1464 aesd $dat0,q9
1465 aesimc $dat0,$dat0
1466 aesd $dat1,q9
1467 aesimc $dat1,$dat1
1468 aesd $dat2,q9
1469 aesimc $dat2,$dat2
1470 aesd $dat3,q9
1471 aesimc $dat3,$dat3
1472 aesd $dat4,q9
1473 aesimc $dat4,$dat4
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1475 mov $key_,$key
1476
1477 aesd $dat0,q10
1478 aesimc $dat0,$dat0
1479 aesd $dat1,q10
1480 aesimc $dat1,$dat1
1481 aesd $dat2,q10
1482 aesimc $dat2,$dat2
1483 aesd $dat3,q10
1484 aesimc $dat3,$dat3
1485 aesd $dat4,q10
1486 aesimc $dat4,$dat4
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1491
1492 aesd $dat0,q11
1493 aesimc $dat0,$dat0
1494 aesd $dat1,q11
1495 aesimc $dat1,$dat1
1496 aesd $dat2,q11
1497 aesimc $dat2,$dat2
1498 aesd $dat3,q11
1499 aesimc $dat3,$dat3
1500 aesd $dat4,q11
1501 aesimc $dat4,$dat4
1502
1503 aesd $dat0,q12
1504 aesimc $dat0,$dat0
1505 aesd $dat1,q12
1506 aesimc $dat1,$dat1
1507 aesd $dat2,q12
1508 aesimc $dat2,$dat2
1509 aesd $dat3,q12
1510 aesimc $dat3,$dat3
1511 aesd $dat4,q12
1512 aesimc $dat4,$dat4
1513
1514 aesd $dat0,q13
1515 aesimc $dat0,$dat0
1516 aesd $dat1,q13
1517 aesimc $dat1,$dat1
1518 aesd $dat2,q13
1519 aesimc $dat2,$dat2
1520 aesd $dat3,q13
1521 aesimc $dat3,$dat3
1522 aesd $dat4,q13
1523 aesimc $dat4,$dat4
1524
1525 aesd $dat0,q14
1526 aesimc $dat0,$dat0
1527 aesd $dat1,q14
1528 aesimc $dat1,$dat1
1529 aesd $dat2,q14
1530 aesimc $dat2,$dat2
1531 aesd $dat3,q14
1532 aesimc $dat3,$dat3
1533 aesd $dat4,q14
1534 aesimc $dat4,$dat4
1535
1536 veor $tmp0,$ivec,$rndlast
1537 aesd $dat0,q15
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1540 aesd $dat1,q15
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1543 aesd $dat2,q15
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1546 aesd $dat3,q15
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1549 aesd $dat4,q15
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1552 cbz x6,.Lcbc_tail4x
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1566 mov $cnt,$rounds
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1572
1573 add $len,$len,#0x50
1574 cbz $len,.Lcbc_done
1575
1576 add $cnt,$rounds,#2
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1579 vorr $in0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1581 vorr $in1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1583 vorr $in2,$in4,$in4
1584 b.lo .Lcbc_dec_tail
1585
1586 b .Loop3x_cbc_dec
1587
1588 .align 4
1589 .Lcbc_tail4x:
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1598
1599 b .Lcbc_done
1600 .align 4
1601 ___
1602 $code.=<<___;
1603 .Loop3x_cbc_dec:
1604 aesd $dat0,q8
1605 aesimc $dat0,$dat0
1606 aesd $dat1,q8
1607 aesimc $dat1,$dat1
1608 aesd $dat2,q8
1609 aesimc $dat2,$dat2
1610 vld1.32 {q8},[$key_],#16
1611 subs $cnt,$cnt,#2
1612 aesd $dat0,q9
1613 aesimc $dat0,$dat0
1614 aesd $dat1,q9
1615 aesimc $dat1,$dat1
1616 aesd $dat2,q9
1617 aesimc $dat2,$dat2
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1620
1621 aesd $dat0,q8
1622 aesimc $dat0,$dat0
1623 aesd $dat1,q8
1624 aesimc $dat1,$dat1
1625 aesd $dat2,q8
1626 aesimc $dat2,$dat2
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1631 aesd $dat0,q9
1632 aesimc $dat0,$dat0
1633 aesd $dat1,q9
1634 aesimc $dat1,$dat1
1635 aesd $dat2,q9
1636 aesimc $dat2,$dat2
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1642 mov $key_,$key
1643 aesd $dat0,q12
1644 aesimc $dat0,$dat0
1645 aesd $dat1,q12
1646 aesimc $dat1,$dat1
1647 aesd $dat2,q12
1648 aesimc $dat2,$dat2
1649 vld1.8 {$in0},[$inp],#16
1650 aesd $dat0,q13
1651 aesimc $dat0,$dat0
1652 aesd $dat1,q13
1653 aesimc $dat1,$dat1
1654 aesd $dat2,q13
1655 aesimc $dat2,$dat2
1656 vld1.8 {$in1},[$inp],#16
1657 aesd $dat0,q14
1658 aesimc $dat0,$dat0
1659 aesd $dat1,q14
1660 aesimc $dat1,$dat1
1661 aesd $dat2,q14
1662 aesimc $dat2,$dat2
1663 vld1.8 {$in2},[$inp],#16
1664 aesd $dat0,q15
1665 aesd $dat1,q15
1666 aesd $dat2,q15
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1668 add $cnt,$rounds,#2
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1680
1681 cmn $len,#0x30
1682 b.eq .Lcbc_done
1683 nop
1684
1685 .Lcbc_dec_tail:
1686 aesd $dat1,q8
1687 aesimc $dat1,$dat1
1688 aesd $dat2,q8
1689 aesimc $dat2,$dat2
1690 vld1.32 {q8},[$key_],#16
1691 subs $cnt,$cnt,#2
1692 aesd $dat1,q9
1693 aesimc $dat1,$dat1
1694 aesd $dat2,q9
1695 aesimc $dat2,$dat2
1696 vld1.32 {q9},[$key_],#16
1697 b.gt .Lcbc_dec_tail
1698
1699 aesd $dat1,q8
1700 aesimc $dat1,$dat1
1701 aesd $dat2,q8
1702 aesimc $dat2,$dat2
1703 aesd $dat1,q9
1704 aesimc $dat1,$dat1
1705 aesd $dat2,q9
1706 aesimc $dat2,$dat2
1707 aesd $dat1,q12
1708 aesimc $dat1,$dat1
1709 aesd $dat2,q12
1710 aesimc $dat2,$dat2
1711 cmn $len,#0x20
1712 aesd $dat1,q13
1713 aesimc $dat1,$dat1
1714 aesd $dat2,q13
1715 aesimc $dat2,$dat2
1716 veor $tmp1,$ivec,$rndlast
1717 aesd $dat1,q14
1718 aesimc $dat1,$dat1
1719 aesd $dat2,q14
1720 aesimc $dat2,$dat2
1721 veor $tmp2,$in1,$rndlast
1722 aesd $dat1,q15
1723 aesd $dat2,q15
1724 b.eq .Lcbc_dec_one
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1730 b .Lcbc_done
1731
1732 .Lcbc_dec_one:
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1736
1737 .Lcbc_done:
1738 vst1.8 {$ivec},[$ivp]
1739 .Lcbc_abort:
1740 ___
1741 }
1742 $code.=<<___ if ($flavour !~ /64/);
1743 vldmia sp!,{d8-d15}
1744 ldmia sp!,{r4-r8,pc}
1745 ___
1746 $code.=<<___ if ($flavour =~ /64/);
1747 ldr x29,[sp],#16
1748 ret
1749 ___
1750 $code.=<<___;
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1752 ___
1753 }}}
1754
1755 {{{
1756 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1757 my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
1758 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1759 my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
1760 my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));
1761
1762 # q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
1763 my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
1764 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
1765 my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
1766 my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
1767 my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
1768 my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));
1769
1770 #q_X => qX, for ldp & stp
1771 my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
1772 my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));
1773
1774 my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
1775
1776 $code.=<<___ if ($flavour =~ /64/);
1777 .globl ${prefix}_ctr32_encrypt_blocks_unroll12_eor3
1778 .type ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
1779 .align 5
1780 ${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
1781 AARCH64_VALID_CALL_TARGET
1782 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1783 stp x29,x30,[sp,#-80]!
1784 stp d8,d9,[sp, #16]
1785 stp d10,d11,[sp, #32]
1786 stp d12,d13,[sp, #48]
1787 stp d14,d15,[sp, #64]
1788 add x29,sp,#0
1789
1790 ldr $rounds,[$key,#240]
1791
1792 ldr $ctr, [$ivp, #12]
1793 #ifdef __AARCH64EB__
1794 vld1.8 {$dat0},[$ivp]
1795 #else
1796 vld1.32 {$dat0},[$ivp]
1797 #endif
1798 vld1.32 {$rndping-$rndpang},[$key] // load key schedule...
1799 sub $rounds,$rounds,#4
1800 cmp $len,#2
1801 add $key_,$key,$roundsx,lsl#4 // pointer to last round key
1802 sub $rounds,$rounds,#2
1803 add $key_, $key_, #64
1804 vld1.32 {$rndlast},[$key_]
1805 add $key_,$key,#32
1806 mov $cnt,$rounds
1807 #ifndef __AARCH64EB__
1808 rev $ctr, $ctr
1809 #endif
1810
1811 vorr $dat1,$dat0,$dat0
1812 add $tctr1, $ctr, #1
1813 vorr $dat2,$dat0,$dat0
1814 add $ctr, $ctr, #2
1815 vorr $ivec,$dat0,$dat0
1816 rev $tctr1, $tctr1
1817 vmov.32 ${dat1}[3],$tctr1
1818 b.ls .Lctr32_tail_unroll
1819 cmp $len,#6
1820 rev $tctr2, $ctr
1821 sub $len,$len,#3 // bias
1822 vmov.32 ${dat2}[3],$tctr2
1823 b.lo .Loop3x_ctr32_unroll
1824 cmp $len,#9
1825 vorr $dat3,$dat0,$dat0
1826 add $tctr3, $ctr, #1
1827 vorr $dat4,$dat0,$dat0
1828 add $tctr4, $ctr, #2
1829 rev $tctr3, $tctr3
1830 vorr $dat5,$dat0,$dat0
1831 add $ctr, $ctr, #3
1832 rev $tctr4, $tctr4
1833 vmov.32 ${dat3}[3],$tctr3
1834 rev $tctr5, $ctr
1835 vmov.32 ${dat4}[3],$tctr4
1836 vmov.32 ${dat5}[3],$tctr5
1837 sub $len,$len,#3
1838 b.lo .Loop6x_ctr32_unroll
1839
1840 // push regs to stack when 12 data chunks are interleaved
1841 stp x19,x20,[sp,#-16]!
1842 stp x21,x22,[sp,#-16]!
1843 stp x23,x24,[sp,#-16]!
1844 stp $dat8d,$dat9d,[sp,#-32]!
1845 stp $dat10d,$dat11d,[sp,#-32]!
1846
1847 add $tctr6,$ctr,#1
1848 add $tctr7,$ctr,#2
1849 add $tctr8,$ctr,#3
1850 add $tctr9,$ctr,#4
1851 add $tctr10,$ctr,#5
1852 add $ctr,$ctr,#6
1853 vorr $dat6,$dat0,$dat0
1854 rev $tctr6,$tctr6
1855 vorr $dat7,$dat0,$dat0
1856 rev $tctr7,$tctr7
1857 vorr $dat8,$dat0,$dat0
1858 rev $tctr8,$tctr8
1859 vorr $dat9,$dat0,$dat0
1860 rev $tctr9,$tctr9
1861 vorr $dat10,$dat0,$dat0
1862 rev $tctr10,$tctr10
1863 vorr $dat11,$dat0,$dat0
1864 rev $tctr11,$ctr
1865
1866 sub $len,$len,#6 // bias
1867 vmov.32 ${dat6}[3],$tctr6
1868 vmov.32 ${dat7}[3],$tctr7
1869 vmov.32 ${dat8}[3],$tctr8
1870 vmov.32 ${dat9}[3],$tctr9
1871 vmov.32 ${dat10}[3],$tctr10
1872 vmov.32 ${dat11}[3],$tctr11
1873 b .Loop12x_ctr32_unroll
1874
1875 .align 4
1876 .Loop12x_ctr32_unroll:
1877 aese $dat0,$rndping
1878 aesmc $dat0,$dat0
1879 aese $dat1,$rndping
1880 aesmc $dat1,$dat1
1881 aese $dat2,$rndping
1882 aesmc $dat2,$dat2
1883 aese $dat3,$rndping
1884 aesmc $dat3,$dat3
1885 aese $dat4,$rndping
1886 aesmc $dat4,$dat4
1887 aese $dat5,$rndping
1888 aesmc $dat5,$dat5
1889 aese $dat6,$rndping
1890 aesmc $dat6,$dat6
1891 aese $dat7,$rndping
1892 aesmc $dat7,$dat7
1893 aese $dat8,$rndping
1894 aesmc $dat8,$dat8
1895 aese $dat9,$rndping
1896 aesmc $dat9,$dat9
1897 aese $dat10,$rndping
1898 aesmc $dat10,$dat10
1899 aese $dat11,$rndping
1900 aesmc $dat11,$dat11
1901 vld1.32 {$rndping},[$key_],#16
1902 subs $cnt,$cnt,#2
1903 aese $dat0,$rndpang
1904 aesmc $dat0,$dat0
1905 aese $dat1,$rndpang
1906 aesmc $dat1,$dat1
1907 aese $dat2,$rndpang
1908 aesmc $dat2,$dat2
1909 aese $dat3,$rndpang
1910 aesmc $dat3,$dat3
1911 aese $dat4,$rndpang
1912 aesmc $dat4,$dat4
1913 aese $dat5,$rndpang
1914 aesmc $dat5,$dat5
1915 aese $dat6,$rndpang
1916 aesmc $dat6,$dat6
1917 aese $dat7,$rndpang
1918 aesmc $dat7,$dat7
1919 aese $dat8,$rndpang
1920 aesmc $dat8,$dat8
1921 aese $dat9,$rndpang
1922 aesmc $dat9,$dat9
1923 aese $dat10,$rndpang
1924 aesmc $dat10,$dat10
1925 aese $dat11,$rndpang
1926 aesmc $dat11,$dat11
1927 vld1.32 {$rndpang},[$key_],#16
1928 b.gt .Loop12x_ctr32_unroll
1929
1930 aese $dat0,$rndping
1931 aesmc $dat0,$dat0
1932 aese $dat1,$rndping
1933 aesmc $dat1,$dat1
1934 aese $dat2,$rndping
1935 aesmc $dat2,$dat2
1936 aese $dat3,$rndping
1937 aesmc $dat3,$dat3
1938 aese $dat4,$rndping
1939 aesmc $dat4,$dat4
1940 aese $dat5,$rndping
1941 aesmc $dat5,$dat5
1942 aese $dat6,$rndping
1943 aesmc $dat6,$dat6
1944 aese $dat7,$rndping
1945 aesmc $dat7,$dat7
1946 aese $dat8,$rndping
1947 aesmc $dat8,$dat8
1948 aese $dat9,$rndping
1949 aesmc $dat9,$dat9
1950 aese $dat10,$rndping
1951 aesmc $dat10,$dat10
1952 aese $dat11,$rndping
1953 aesmc $dat11,$dat11
1954 vld1.32 {$rndping},[$key_],#16
1955
1956 aese $dat0,$rndpang
1957 aesmc $dat0,$dat0
1958 aese $dat1,$rndpang
1959 aesmc $dat1,$dat1
1960 aese $dat2,$rndpang
1961 aesmc $dat2,$dat2
1962 aese $dat3,$rndpang
1963 aesmc $dat3,$dat3
1964 aese $dat4,$rndpang
1965 aesmc $dat4,$dat4
1966 aese $dat5,$rndpang
1967 aesmc $dat5,$dat5
1968 aese $dat6,$rndpang
1969 aesmc $dat6,$dat6
1970 aese $dat7,$rndpang
1971 aesmc $dat7,$dat7
1972 aese $dat8,$rndpang
1973 aesmc $dat8,$dat8
1974 aese $dat9,$rndpang
1975 aesmc $dat9,$dat9
1976 aese $dat10,$rndpang
1977 aesmc $dat10,$dat10
1978 aese $dat11,$rndpang
1979 aesmc $dat11,$dat11
1980 vld1.32 {$rndpang},[$key_],#16
1981
1982 aese $dat0,$rndping
1983 aesmc $dat0,$dat0
1984 add $tctr0,$ctr,#1
1985 add $tctr1,$ctr,#2
1986 aese $dat1,$rndping
1987 aesmc $dat1,$dat1
1988 add $tctr2,$ctr,#3
1989 add $tctr3,$ctr,#4
1990 aese $dat2,$rndping
1991 aesmc $dat2,$dat2
1992 add $tctr4,$ctr,#5
1993 add $tctr5,$ctr,#6
1994 rev $tctr0,$tctr0
1995 aese $dat3,$rndping
1996 aesmc $dat3,$dat3
1997 add $tctr6,$ctr,#7
1998 add $tctr7,$ctr,#8
1999 rev $tctr1,$tctr1
2000 rev $tctr2,$tctr2
2001 aese $dat4,$rndping
2002 aesmc $dat4,$dat4
2003 add $tctr8,$ctr,#9
2004 add $tctr9,$ctr,#10
2005 rev $tctr3,$tctr3
2006 rev $tctr4,$tctr4
2007 aese $dat5,$rndping
2008 aesmc $dat5,$dat5
2009 add $tctr10,$ctr,#11
2010 add $tctr11,$ctr,#12
2011 rev $tctr5,$tctr5
2012 rev $tctr6,$tctr6
2013 aese $dat6,$rndping
2014 aesmc $dat6,$dat6
2015 rev $tctr7,$tctr7
2016 rev $tctr8,$tctr8
2017 aese $dat7,$rndping
2018 aesmc $dat7,$dat7
2019 rev $tctr9,$tctr9
2020 rev $tctr10,$tctr10
2021 aese $dat8,$rndping
2022 aesmc $dat8,$dat8
2023 rev $tctr11,$tctr11
2024 aese $dat9,$rndping
2025 aesmc $dat9,$dat9
2026 aese $dat10,$rndping
2027 aesmc $dat10,$dat10
2028 aese $dat11,$rndping
2029 aesmc $dat11,$dat11
2030 vld1.32 {$rndping},[$key_],#16
2031
2032 aese $dat0,$rndpang
2033 aesmc $dat0,$dat0
2034 aese $dat1,$rndpang
2035 aesmc $dat1,$dat1
2036 aese $dat2,$rndpang
2037 aesmc $dat2,$dat2
2038 aese $dat3,$rndpang
2039 aesmc $dat3,$dat3
2040 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64
2041 aese $dat4,$rndpang
2042 aesmc $dat4,$dat4
2043 aese $dat5,$rndpang
2044 aesmc $dat5,$dat5
2045 aese $dat6,$rndpang
2046 aesmc $dat6,$dat6
2047 aese $dat7,$rndpang
2048 aesmc $dat7,$dat7
2049 vld1.8 {$in4,$in5,$in6,$in7},[$inp],#64
2050 aese $dat8,$rndpang
2051 aesmc $dat8,$dat8
2052 aese $dat9,$rndpang
2053 aesmc $dat9,$dat9
2054 aese $dat10,$rndpang
2055 aesmc $dat10,$dat10
2056 aese $dat11,$rndpang
2057 aesmc $dat11,$dat11
2058 vld1.8 {$in8,$in9,$in10,$in11},[$inp],#64
2059 vld1.32 {$rndpang},[$key_],#16
2060
2061 mov $key_, $key
2062 aese $dat0,$rndping
2063 aesmc $dat0,$dat0
2064 aese $dat1,$rndping
2065 aesmc $dat1,$dat1
2066 aese $dat2,$rndping
2067 aesmc $dat2,$dat2
2068 aese $dat3,$rndping
2069 aesmc $dat3,$dat3
2070 aese $dat4,$rndping
2071 aesmc $dat4,$dat4
2072 aese $dat5,$rndping
2073 aesmc $dat5,$dat5
2074 aese $dat6,$rndping
2075 aesmc $dat6,$dat6
2076 aese $dat7,$rndping
2077 aesmc $dat7,$dat7
2078 aese $dat8,$rndping
2079 aesmc $dat8,$dat8
2080 aese $dat9,$rndping
2081 aesmc $dat9,$dat9
2082 aese $dat10,$rndping
2083 aesmc $dat10,$dat10
2084 aese $dat11,$rndping
2085 aesmc $dat11,$dat11
2086 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2087
2088 aese $dat0,$rndpang
2089 eor3 $in0,$in0,$rndlast,$dat0
2090 vorr $dat0,$ivec,$ivec
2091 aese $dat1,$rndpang
2092 eor3 $in1,$in1,$rndlast,$dat1
2093 vorr $dat1,$ivec,$ivec
2094 aese $dat2,$rndpang
2095 eor3 $in2,$in2,$rndlast,$dat2
2096 vorr $dat2,$ivec,$ivec
2097 aese $dat3,$rndpang
2098 eor3 $in3,$in3,$rndlast,$dat3
2099 vorr $dat3,$ivec,$ivec
2100 aese $dat4,$rndpang
2101 eor3 $in4,$in4,$rndlast,$dat4
2102 vorr $dat4,$ivec,$ivec
2103 aese $dat5,$rndpang
2104 eor3 $in5,$in5,$rndlast,$dat5
2105 vorr $dat5,$ivec,$ivec
2106 aese $dat6,$rndpang
2107 eor3 $in6,$in6,$rndlast,$dat6
2108 vorr $dat6,$ivec,$ivec
2109 aese $dat7,$rndpang
2110 eor3 $in7,$in7,$rndlast,$dat7
2111 vorr $dat7,$ivec,$ivec
2112 aese $dat8,$rndpang
2113 eor3 $in8,$in8,$rndlast,$dat8
2114 vorr $dat8,$ivec,$ivec
2115 aese $dat9,$rndpang
2116 eor3 $in9,$in9,$rndlast,$dat9
2117 vorr $dat9,$ivec,$ivec
2118 aese $dat10,$rndpang
2119 eor3 $in10,$in10,$rndlast,$dat10
2120 vorr $dat10,$ivec,$ivec
2121 aese $dat11,$rndpang
2122 eor3 $in11,$in11,$rndlast,$dat11
2123 vorr $dat11,$ivec,$ivec
2124 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2125
2126 vmov.32 ${dat0}[3],$tctr0
2127 vmov.32 ${dat1}[3],$tctr1
2128 vmov.32 ${dat2}[3],$tctr2
2129 vmov.32 ${dat3}[3],$tctr3
2130 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64
2131 vmov.32 ${dat4}[3],$tctr4
2132 vmov.32 ${dat5}[3],$tctr5
2133 vmov.32 ${dat6}[3],$tctr6
2134 vmov.32 ${dat7}[3],$tctr7
2135 vst1.8 {$in4,$in5,$in6,$in7},[$out],#64
2136 vmov.32 ${dat8}[3],$tctr8
2137 vmov.32 ${dat9}[3],$tctr9
2138 vmov.32 ${dat10}[3],$tctr10
2139 vmov.32 ${dat11}[3],$tctr11
2140 vst1.8 {$in8,$in9,$in10,$in11},[$out],#64
2141
2142 mov $cnt,$rounds
2143
2144 add $ctr,$ctr,#12
2145 subs $len,$len,#12
2146 b.hs .Loop12x_ctr32_unroll
2147
2148 // pop regs from stack when 12 data chunks are interleaved
2149 ldp $dat10d,$dat11d,[sp],#32
2150 ldp $dat8d,$dat9d,[sp],#32
2151 ldp x23,x24,[sp],#16
2152 ldp x21,x22,[sp],#16
2153 ldp x19,x20,[sp],#16
2154
2155 add $len,$len,#12
2156 cbz $len,.Lctr32_done_unroll
2157 sub $ctr,$ctr,#12
2158
2159 cmp $len,#2
2160 b.ls .Lctr32_tail_unroll
2161
2162 cmp $len,#6
2163 sub $len,$len,#3 // bias
2164 add $ctr,$ctr,#3
2165 b.lo .Loop3x_ctr32_unroll
2166
2167 sub $len,$len,#3
2168 add $ctr,$ctr,#3
2169 b.lo .Loop6x_ctr32_unroll
2170
2171 .align 4
2172 .Loop6x_ctr32_unroll:
2173 aese $dat0,$rndping
2174 aesmc $dat0,$dat0
2175 aese $dat1,$rndping
2176 aesmc $dat1,$dat1
2177 aese $dat2,$rndping
2178 aesmc $dat2,$dat2
2179 aese $dat3,$rndping
2180 aesmc $dat3,$dat3
2181 aese $dat4,$rndping
2182 aesmc $dat4,$dat4
2183 aese $dat5,$rndping
2184 aesmc $dat5,$dat5
2185 vld1.32 {$rndping},[$key_],#16
2186 subs $cnt,$cnt,#2
2187 aese $dat0,$rndpang
2188 aesmc $dat0,$dat0
2189 aese $dat1,$rndpang
2190 aesmc $dat1,$dat1
2191 aese $dat2,$rndpang
2192 aesmc $dat2,$dat2
2193 aese $dat3,$rndpang
2194 aesmc $dat3,$dat3
2195 aese $dat4,$rndpang
2196 aesmc $dat4,$dat4
2197 aese $dat5,$rndpang
2198 aesmc $dat5,$dat5
2199 vld1.32 {$rndpang},[$key_],#16
2200 b.gt .Loop6x_ctr32_unroll
2201
2202 aese $dat0,$rndping
2203 aesmc $dat0,$dat0
2204 aese $dat1,$rndping
2205 aesmc $dat1,$dat1
2206 aese $dat2,$rndping
2207 aesmc $dat2,$dat2
2208 aese $dat3,$rndping
2209 aesmc $dat3,$dat3
2210 aese $dat4,$rndping
2211 aesmc $dat4,$dat4
2212 aese $dat5,$rndping
2213 aesmc $dat5,$dat5
2214 vld1.32 {$rndping},[$key_],#16
2215
2216 aese $dat0,$rndpang
2217 aesmc $dat0,$dat0
2218 aese $dat1,$rndpang
2219 aesmc $dat1,$dat1
2220 aese $dat2,$rndpang
2221 aesmc $dat2,$dat2
2222 aese $dat3,$rndpang
2223 aesmc $dat3,$dat3
2224 aese $dat4,$rndpang
2225 aesmc $dat4,$dat4
2226 aese $dat5,$rndpang
2227 aesmc $dat5,$dat5
2228 vld1.32 {$rndpang},[$key_],#16
2229
2230 aese $dat0,$rndping
2231 aesmc $dat0,$dat0
2232 add $tctr0,$ctr,#1
2233 add $tctr1,$ctr,#2
2234 aese $dat1,$rndping
2235 aesmc $dat1,$dat1
2236 add $tctr2,$ctr,#3
2237 add $tctr3,$ctr,#4
2238 aese $dat2,$rndping
2239 aesmc $dat2,$dat2
2240 add $tctr4,$ctr,#5
2241 add $tctr5,$ctr,#6
2242 rev $tctr0,$tctr0
2243 aese $dat3,$rndping
2244 aesmc $dat3,$dat3
2245 rev $tctr1,$tctr1
2246 rev $tctr2,$tctr2
2247 aese $dat4,$rndping
2248 aesmc $dat4,$dat4
2249 rev $tctr3,$tctr3
2250 rev $tctr4,$tctr4
2251 aese $dat5,$rndping
2252 aesmc $dat5,$dat5
2253 rev $tctr5,$tctr5
2254 vld1.32 {$rndping},[$key_],#16
2255
2256 aese $dat0,$rndpang
2257 aesmc $dat0,$dat0
2258 aese $dat1,$rndpang
2259 aesmc $dat1,$dat1
2260 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64
2261 aese $dat2,$rndpang
2262 aesmc $dat2,$dat2
2263 aese $dat3,$rndpang
2264 aesmc $dat3,$dat3
2265 vld1.8 {$in4,$in5},[$inp],#32
2266 aese $dat4,$rndpang
2267 aesmc $dat4,$dat4
2268 aese $dat5,$rndpang
2269 aesmc $dat5,$dat5
2270 vld1.32 {$rndpang},[$key_],#16
2271
2272 mov $key_, $key
2273 aese $dat0,$rndping
2274 aesmc $dat0,$dat0
2275 aese $dat1,$rndping
2276 aesmc $dat1,$dat1
2277 aese $dat2,$rndping
2278 aesmc $dat2,$dat2
2279 aese $dat3,$rndping
2280 aesmc $dat3,$dat3
2281 aese $dat4,$rndping
2282 aesmc $dat4,$dat4
2283 aese $dat5,$rndping
2284 aesmc $dat5,$dat5
2285 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2286
2287 aese $dat0,$rndpang
2288 eor3 $in0,$in0,$rndlast,$dat0
2289 aese $dat1,$rndpang
2290 eor3 $in1,$in1,$rndlast,$dat1
2291 aese $dat2,$rndpang
2292 eor3 $in2,$in2,$rndlast,$dat2
2293 aese $dat3,$rndpang
2294 eor3 $in3,$in3,$rndlast,$dat3
2295 aese $dat4,$rndpang
2296 eor3 $in4,$in4,$rndlast,$dat4
2297 aese $dat5,$rndpang
2298 eor3 $in5,$in5,$rndlast,$dat5
2299 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2300
2301 vorr $dat0,$ivec,$ivec
2302 vorr $dat1,$ivec,$ivec
2303 vorr $dat2,$ivec,$ivec
2304 vorr $dat3,$ivec,$ivec
2305 vorr $dat4,$ivec,$ivec
2306 vorr $dat5,$ivec,$ivec
2307
2308 vmov.32 ${dat0}[3],$tctr0
2309 vmov.32 ${dat1}[3],$tctr1
2310 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64
2311 vmov.32 ${dat2}[3],$tctr2
2312 vmov.32 ${dat3}[3],$tctr3
2313 vst1.8 {$in4,$in5},[$out],#32
2314 vmov.32 ${dat4}[3],$tctr4
2315 vmov.32 ${dat5}[3],$tctr5
2316
2317 cbz $len,.Lctr32_done_unroll
2318 mov $cnt,$rounds
2319
2320 cmp $len,#2
2321 b.ls .Lctr32_tail_unroll
2322
2323 sub $len,$len,#3 // bias
2324 add $ctr,$ctr,#3
2325 b .Loop3x_ctr32_unroll
2326
2327 .align 4
2328 .Loop3x_ctr32_unroll:
2329 aese $dat0,$rndping
2330 aesmc $dat0,$dat0
2331 aese $dat1,$rndping
2332 aesmc $dat1,$dat1
2333 aese $dat2,$rndping
2334 aesmc $dat2,$dat2
2335 vld1.32 {$rndping},[$key_],#16
2336 subs $cnt,$cnt,#2
2337 aese $dat0,$rndpang
2338 aesmc $dat0,$dat0
2339 aese $dat1,$rndpang
2340 aesmc $dat1,$dat1
2341 aese $dat2,$rndpang
2342 aesmc $dat2,$dat2
2343 vld1.32 {$rndpang},[$key_],#16
2344 b.gt .Loop3x_ctr32_unroll
2345
2346 aese $dat0,$rndping
2347 aesmc $tmp0,$dat0
2348 aese $dat1,$rndping
2349 aesmc $tmp1,$dat1
2350 vld1.8 {$in0,$in1,$in2},[$inp],#48
2351 vorr $dat0,$ivec,$ivec
2352 aese $dat2,$rndping
2353 aesmc $dat2,$dat2
2354 vld1.32 {$rndping},[$key_],#16
2355 vorr $dat1,$ivec,$ivec
2356 aese $tmp0,$rndpang
2357 aesmc $tmp0,$tmp0
2358 aese $tmp1,$rndpang
2359 aesmc $tmp1,$tmp1
2360 aese $dat2,$rndpang
2361 aesmc $tmp2,$dat2
2362 vld1.32 {$rndpang},[$key_],#16
2363 vorr $dat2,$ivec,$ivec
2364 add $tctr0,$ctr,#1
2365 aese $tmp0,$rndping
2366 aesmc $tmp0,$tmp0
2367 aese $tmp1,$rndping
2368 aesmc $tmp1,$tmp1
2369 add $tctr1,$ctr,#2
2370 aese $tmp2,$rndping
2371 aesmc $tmp2,$tmp2
2372 vld1.32 {$rndping},[$key_],#16
2373 add $ctr,$ctr,#3
2374 aese $tmp0,$rndpang
2375 aesmc $tmp0,$tmp0
2376 aese $tmp1,$rndpang
2377 aesmc $tmp1,$tmp1
2378
2379 rev $tctr0,$tctr0
2380 aese $tmp2,$rndpang
2381 aesmc $tmp2,$tmp2
2382 vld1.32 {$rndpang},[$key_],#16
2383 vmov.32 ${dat0}[3], $tctr0
2384 mov $key_,$key
2385 rev $tctr1,$tctr1
2386 aese $tmp0,$rndping
2387 aesmc $tmp0,$tmp0
2388
2389 aese $tmp1,$rndping
2390 aesmc $tmp1,$tmp1
2391 vmov.32 ${dat1}[3], $tctr1
2392 rev $tctr2,$ctr
2393 aese $tmp2,$rndping
2394 aesmc $tmp2,$tmp2
2395 vmov.32 ${dat2}[3], $tctr2
2396
2397 aese $tmp0,$rndpang
2398 aese $tmp1,$rndpang
2399 aese $tmp2,$rndpang
2400
2401 eor3 $in0,$in0,$rndlast,$tmp0
2402 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2403 eor3 $in1,$in1,$rndlast,$tmp1
2404 mov $cnt,$rounds
2405 eor3 $in2,$in2,$rndlast,$tmp2
2406 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2407 vst1.8 {$in0,$in1,$in2},[$out],#48
2408
2409 cbz $len,.Lctr32_done_unroll
2410
2411 .Lctr32_tail_unroll:
2412 cmp $len,#1
2413 b.eq .Lctr32_tail_1_unroll
2414
2415 .Lctr32_tail_2_unroll:
2416 aese $dat0,$rndping
2417 aesmc $dat0,$dat0
2418 aese $dat1,$rndping
2419 aesmc $dat1,$dat1
2420 vld1.32 {$rndping},[$key_],#16
2421 subs $cnt,$cnt,#2
2422 aese $dat0,$rndpang
2423 aesmc $dat0,$dat0
2424 aese $dat1,$rndpang
2425 aesmc $dat1,$dat1
2426 vld1.32 {$rndpang},[$key_],#16
2427 b.gt .Lctr32_tail_2_unroll
2428
2429 aese $dat0,$rndping
2430 aesmc $dat0,$dat0
2431 aese $dat1,$rndping
2432 aesmc $dat1,$dat1
2433 vld1.32 {$rndping},[$key_],#16
2434 aese $dat0,$rndpang
2435 aesmc $dat0,$dat0
2436 aese $dat1,$rndpang
2437 aesmc $dat1,$dat1
2438 vld1.32 {$rndpang},[$key_],#16
2439 vld1.8 {$in0,$in1},[$inp],#32
2440 aese $dat0,$rndping
2441 aesmc $dat0,$dat0
2442 aese $dat1,$rndping
2443 aesmc $dat1,$dat1
2444 vld1.32 {$rndping},[$key_],#16
2445 aese $dat0,$rndpang
2446 aesmc $dat0,$dat0
2447 aese $dat1,$rndpang
2448 aesmc $dat1,$dat1
2449 vld1.32 {$rndpang},[$key_],#16
2450 aese $dat0,$rndping
2451 aesmc $dat0,$dat0
2452 aese $dat1,$rndping
2453 aesmc $dat1,$dat1
2454 aese $dat0,$rndpang
2455 aese $dat1,$rndpang
2456
2457 eor3 $in0,$in0,$rndlast,$dat0
2458 eor3 $in1,$in1,$rndlast,$dat1
2459 vst1.8 {$in0,$in1},[$out],#32
2460 b .Lctr32_done_unroll
2461
2462 .Lctr32_tail_1_unroll:
2463 aese $dat0,$rndping
2464 aesmc $dat0,$dat0
2465 vld1.32 {$rndping},[$key_],#16
2466 subs $cnt,$cnt,#2
2467 aese $dat0,$rndpang
2468 aesmc $dat0,$dat0
2469 vld1.32 {$rndpang},[$key_],#16
2470 b.gt .Lctr32_tail_1_unroll
2471
2472 aese $dat0,$rndping
2473 aesmc $dat0,$dat0
2474 vld1.32 {$rndping},[$key_],#16
2475 aese $dat0,$rndpang
2476 aesmc $dat0,$dat0
2477 vld1.32 {$rndpang},[$key_],#16
2478 vld1.8 {$in0},[$inp]
2479 aese $dat0,$rndping
2480 aesmc $dat0,$dat0
2481 vld1.32 {$rndping},[$key_],#16
2482 aese $dat0,$rndpang
2483 aesmc $dat0,$dat0
2484 vld1.32 {$rndpang},[$key_],#16
2485 aese $dat0,$rndping
2486 aesmc $dat0,$dat0
2487 aese $dat0,$rndpang
2488
2489 eor3 $in0,$in0,$rndlast,$dat0
2490 vst1.8 {$in0},[$out],#16
2491
2492 .Lctr32_done_unroll:
2493 ldp d8,d9,[sp, #16]
2494 ldp d10,d11,[sp, #32]
2495 ldp d12,d13,[sp, #48]
2496 ldp d15,d16,[sp, #64]
2497 ldr x29,[sp],#80
2498 ret
2499 .size ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
2500 ___
2501 }}}
2502
2503 {{{
2504 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
2505 my ($rounds,$cnt,$key_)=("w5","w6","x7");
2506 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
2507 my $step="x12"; # aliases with $tctr2
2508
2509 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
2510 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2511
2512 # used only in 64-bit mode...
2513 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
2514
2515 my ($dat,$tmp)=($dat0,$tmp0);
2516
2517 ### q8-q15 preloaded key schedule
2518
2519 $code.=<<___;
2520 .globl ${prefix}_ctr32_encrypt_blocks
2521 .type ${prefix}_ctr32_encrypt_blocks,%function
2522 .align 5
2523 ${prefix}_ctr32_encrypt_blocks:
2524 ___
2525 $code.=<<___ if ($flavour =~ /64/);
2526 AARCH64_VALID_CALL_TARGET
2527 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
2528 stp x29,x30,[sp,#-16]!
2529 add x29,sp,#0
2530 ___
2531 $code.=<<___ if ($flavour !~ /64/);
2532 mov ip,sp
2533 stmdb sp!,{r4-r10,lr}
2534 vstmdb sp!,{d8-d15} @ ABI specification says so
2535 ldr r4, [ip] @ load remaining arg
2536 ___
2537 $code.=<<___;
2538 ldr $rounds,[$key,#240]
2539
2540 ldr $ctr, [$ivp, #12]
2541 #ifdef __ARMEB__
2542 vld1.8 {$dat0},[$ivp]
2543 #else
2544 vld1.32 {$dat0},[$ivp]
2545 #endif
2546 vld1.32 {q8-q9},[$key] // load key schedule...
2547 sub $rounds,$rounds,#4
2548 mov $step,#16
2549 cmp $len,#2
2550 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
2551 sub $rounds,$rounds,#2
2552 vld1.32 {q12-q13},[$key_],#32
2553 vld1.32 {q14-q15},[$key_],#32
2554 vld1.32 {$rndlast},[$key_]
2555 add $key_,$key,#32
2556 mov $cnt,$rounds
2557 cclr $step,lo
2558 #ifndef __ARMEB__
2559 rev $ctr, $ctr
2560 #endif
2561 ___
2562 $code.=<<___ if ($flavour =~ /64/);
2563 vorr $dat1,$dat0,$dat0
2564 add $tctr1, $ctr, #1
2565 vorr $dat2,$dat0,$dat0
2566 add $ctr, $ctr, #2
2567 vorr $ivec,$dat0,$dat0
2568 rev $tctr1, $tctr1
2569 vmov.32 ${dat1}[3],$tctr1
2570 b.ls .Lctr32_tail
2571 rev $tctr2, $ctr
2572 sub $len,$len,#3 // bias
2573 vmov.32 ${dat2}[3],$tctr2
2574 ___
2575 $code.=<<___ if ($flavour !~ /64/);
2576 add $tctr1, $ctr, #1
2577 vorr $ivec,$dat0,$dat0
2578 rev $tctr1, $tctr1
2579 vmov.32 ${ivec}[3],$tctr1
2580 add $ctr, $ctr, #2
2581 vorr $dat1,$ivec,$ivec
2582 b.ls .Lctr32_tail
2583 rev $tctr2, $ctr
2584 vmov.32 ${ivec}[3],$tctr2
2585 sub $len,$len,#3 // bias
2586 vorr $dat2,$ivec,$ivec
2587 ___
2588 $code.=<<___ if ($flavour =~ /64/);
2589 cmp $len,#32
2590 b.lo .Loop3x_ctr32
2591
2592 add w13,$ctr,#1
2593 add w14,$ctr,#2
2594 vorr $dat3,$dat0,$dat0
2595 rev w13,w13
2596 vorr $dat4,$dat0,$dat0
2597 rev w14,w14
2598 vmov.32 ${dat3}[3],w13
2599 sub $len,$len,#2 // bias
2600 vmov.32 ${dat4}[3],w14
2601 add $ctr,$ctr,#2
2602 b .Loop5x_ctr32
2603
2604 .align 4
2605 .Loop5x_ctr32:
2606 aese $dat0,q8
2607 aesmc $dat0,$dat0
2608 aese $dat1,q8
2609 aesmc $dat1,$dat1
2610 aese $dat2,q8
2611 aesmc $dat2,$dat2
2612 aese $dat3,q8
2613 aesmc $dat3,$dat3
2614 aese $dat4,q8
2615 aesmc $dat4,$dat4
2616 vld1.32 {q8},[$key_],#16
2617 subs $cnt,$cnt,#2
2618 aese $dat0,q9
2619 aesmc $dat0,$dat0
2620 aese $dat1,q9
2621 aesmc $dat1,$dat1
2622 aese $dat2,q9
2623 aesmc $dat2,$dat2
2624 aese $dat3,q9
2625 aesmc $dat3,$dat3
2626 aese $dat4,q9
2627 aesmc $dat4,$dat4
2628 vld1.32 {q9},[$key_],#16
2629 b.gt .Loop5x_ctr32
2630
2631 mov $key_,$key
2632 aese $dat0,q8
2633 aesmc $dat0,$dat0
2634 aese $dat1,q8
2635 aesmc $dat1,$dat1
2636 aese $dat2,q8
2637 aesmc $dat2,$dat2
2638 aese $dat3,q8
2639 aesmc $dat3,$dat3
2640 aese $dat4,q8
2641 aesmc $dat4,$dat4
2642 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2643
2644 aese $dat0,q9
2645 aesmc $dat0,$dat0
2646 aese $dat1,q9
2647 aesmc $dat1,$dat1
2648 aese $dat2,q9
2649 aesmc $dat2,$dat2
2650 aese $dat3,q9
2651 aesmc $dat3,$dat3
2652 aese $dat4,q9
2653 aesmc $dat4,$dat4
2654 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2655
2656 aese $dat0,q12
2657 aesmc $dat0,$dat0
2658 add $tctr0,$ctr,#1
2659 add $tctr1,$ctr,#2
2660 aese $dat1,q12
2661 aesmc $dat1,$dat1
2662 add $tctr2,$ctr,#3
2663 add w13,$ctr,#4
2664 aese $dat2,q12
2665 aesmc $dat2,$dat2
2666 add w14,$ctr,#5
2667 rev $tctr0,$tctr0
2668 aese $dat3,q12
2669 aesmc $dat3,$dat3
2670 rev $tctr1,$tctr1
2671 rev $tctr2,$tctr2
2672 aese $dat4,q12
2673 aesmc $dat4,$dat4
2674 rev w13,w13
2675 rev w14,w14
2676
2677 aese $dat0,q13
2678 aesmc $dat0,$dat0
2679 aese $dat1,q13
2680 aesmc $dat1,$dat1
2681 aese $dat2,q13
2682 aesmc $dat2,$dat2
2683 aese $dat3,q13
2684 aesmc $dat3,$dat3
2685 aese $dat4,q13
2686 aesmc $dat4,$dat4
2687
2688 aese $dat0,q14
2689 aesmc $dat0,$dat0
2690 vld1.8 {$in0},[$inp],#16
2691 aese $dat1,q14
2692 aesmc $dat1,$dat1
2693 vld1.8 {$in1},[$inp],#16
2694 aese $dat2,q14
2695 aesmc $dat2,$dat2
2696 vld1.8 {$in2},[$inp],#16
2697 aese $dat3,q14
2698 aesmc $dat3,$dat3
2699 vld1.8 {$in3},[$inp],#16
2700 aese $dat4,q14
2701 aesmc $dat4,$dat4
2702 vld1.8 {$in4},[$inp],#16
2703
2704 aese $dat0,q15
2705 veor $in0,$in0,$rndlast
2706 aese $dat1,q15
2707 veor $in1,$in1,$rndlast
2708 aese $dat2,q15
2709 veor $in2,$in2,$rndlast
2710 aese $dat3,q15
2711 veor $in3,$in3,$rndlast
2712 aese $dat4,q15
2713 veor $in4,$in4,$rndlast
2714
2715 veor $in0,$in0,$dat0
2716 vorr $dat0,$ivec,$ivec
2717 veor $in1,$in1,$dat1
2718 vorr $dat1,$ivec,$ivec
2719 veor $in2,$in2,$dat2
2720 vorr $dat2,$ivec,$ivec
2721 veor $in3,$in3,$dat3
2722 vorr $dat3,$ivec,$ivec
2723 veor $in4,$in4,$dat4
2724 vorr $dat4,$ivec,$ivec
2725
2726 vst1.8 {$in0},[$out],#16
2727 vmov.32 ${dat0}[3],$tctr0
2728 vst1.8 {$in1},[$out],#16
2729 vmov.32 ${dat1}[3],$tctr1
2730 vst1.8 {$in2},[$out],#16
2731 vmov.32 ${dat2}[3],$tctr2
2732 vst1.8 {$in3},[$out],#16
2733 vmov.32 ${dat3}[3],w13
2734 vst1.8 {$in4},[$out],#16
2735 vmov.32 ${dat4}[3],w14
2736
2737 mov $cnt,$rounds
2738 cbz $len,.Lctr32_done
2739
2740 add $ctr,$ctr,#5
2741 subs $len,$len,#5
2742 b.hs .Loop5x_ctr32
2743
2744 add $len,$len,#5
2745 sub $ctr,$ctr,#5
2746
2747 cmp $len,#2
2748 mov $step,#16
2749 cclr $step,lo
2750 b.ls .Lctr32_tail
2751
2752 sub $len,$len,#3 // bias
2753 add $ctr,$ctr,#3
2754 ___
2755 $code.=<<___;
2756 b .Loop3x_ctr32
2757
2758 .align 4
2759 .Loop3x_ctr32:
2760 aese $dat0,q8
2761 aesmc $dat0,$dat0
2762 aese $dat1,q8
2763 aesmc $dat1,$dat1
2764 aese $dat2,q8
2765 aesmc $dat2,$dat2
2766 vld1.32 {q8},[$key_],#16
2767 subs $cnt,$cnt,#2
2768 aese $dat0,q9
2769 aesmc $dat0,$dat0
2770 aese $dat1,q9
2771 aesmc $dat1,$dat1
2772 aese $dat2,q9
2773 aesmc $dat2,$dat2
2774 vld1.32 {q9},[$key_],#16
2775 b.gt .Loop3x_ctr32
2776
2777 aese $dat0,q8
2778 aesmc $tmp0,$dat0
2779 aese $dat1,q8
2780 aesmc $tmp1,$dat1
2781 vld1.8 {$in0},[$inp],#16
2782 ___
2783 $code.=<<___ if ($flavour =~ /64/);
2784 vorr $dat0,$ivec,$ivec
2785 ___
2786 $code.=<<___ if ($flavour !~ /64/);
2787 add $tctr0,$ctr,#1
2788 ___
2789 $code.=<<___;
2790 aese $dat2,q8
2791 aesmc $dat2,$dat2
2792 vld1.8 {$in1},[$inp],#16
2793 ___
2794 $code.=<<___ if ($flavour =~ /64/);
2795 vorr $dat1,$ivec,$ivec
2796 ___
2797 $code.=<<___ if ($flavour !~ /64/);
2798 rev $tctr0,$tctr0
2799 ___
2800 $code.=<<___;
2801 aese $tmp0,q9
2802 aesmc $tmp0,$tmp0
2803 aese $tmp1,q9
2804 aesmc $tmp1,$tmp1
2805 vld1.8 {$in2},[$inp],#16
2806 mov $key_,$key
2807 aese $dat2,q9
2808 aesmc $tmp2,$dat2
2809 ___
2810 $code.=<<___ if ($flavour =~ /64/);
2811 vorr $dat2,$ivec,$ivec
2812 add $tctr0,$ctr,#1
2813 ___
2814 $code.=<<___;
2815 aese $tmp0,q12
2816 aesmc $tmp0,$tmp0
2817 aese $tmp1,q12
2818 aesmc $tmp1,$tmp1
2819 veor $in0,$in0,$rndlast
2820 add $tctr1,$ctr,#2
2821 aese $tmp2,q12
2822 aesmc $tmp2,$tmp2
2823 veor $in1,$in1,$rndlast
2824 add $ctr,$ctr,#3
2825 aese $tmp0,q13
2826 aesmc $tmp0,$tmp0
2827 aese $tmp1,q13
2828 aesmc $tmp1,$tmp1
2829 veor $in2,$in2,$rndlast
2830 ___
2831 $code.=<<___ if ($flavour =~ /64/);
2832 rev $tctr0,$tctr0
2833 aese $tmp2,q13
2834 aesmc $tmp2,$tmp2
2835 vmov.32 ${dat0}[3], $tctr0
2836 ___
2837 $code.=<<___ if ($flavour !~ /64/);
2838 vmov.32 ${ivec}[3], $tctr0
2839 aese $tmp2,q13
2840 aesmc $tmp2,$tmp2
2841 vorr $dat0,$ivec,$ivec
2842 ___
2843 $code.=<<___;
2844 rev $tctr1,$tctr1
2845 aese $tmp0,q14
2846 aesmc $tmp0,$tmp0
2847 ___
2848 $code.=<<___ if ($flavour !~ /64/);
2849 vmov.32 ${ivec}[3], $tctr1
2850 rev $tctr2,$ctr
2851 ___
2852 $code.=<<___;
2853 aese $tmp1,q14
2854 aesmc $tmp1,$tmp1
2855 ___
2856 $code.=<<___ if ($flavour =~ /64/);
2857 vmov.32 ${dat1}[3], $tctr1
2858 rev $tctr2,$ctr
2859 aese $tmp2,q14
2860 aesmc $tmp2,$tmp2
2861 vmov.32 ${dat2}[3], $tctr2
2862 ___
2863 $code.=<<___ if ($flavour !~ /64/);
2864 vorr $dat1,$ivec,$ivec
2865 vmov.32 ${ivec}[3], $tctr2
2866 aese $tmp2,q14
2867 aesmc $tmp2,$tmp2
2868 vorr $dat2,$ivec,$ivec
2869 ___
2870 $code.=<<___;
2871 subs $len,$len,#3
2872 aese $tmp0,q15
2873 aese $tmp1,q15
2874 aese $tmp2,q15
2875
2876 veor $in0,$in0,$tmp0
2877 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2878 vst1.8 {$in0},[$out],#16
2879 veor $in1,$in1,$tmp1
2880 mov $cnt,$rounds
2881 vst1.8 {$in1},[$out],#16
2882 veor $in2,$in2,$tmp2
2883 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2884 vst1.8 {$in2},[$out],#16
2885 b.hs .Loop3x_ctr32
2886
2887 adds $len,$len,#3
2888 b.eq .Lctr32_done
2889 cmp $len,#1
2890 mov $step,#16
2891 cclr $step,eq
2892
2893 .Lctr32_tail:
2894 aese $dat0,q8
2895 aesmc $dat0,$dat0
2896 aese $dat1,q8
2897 aesmc $dat1,$dat1
2898 vld1.32 {q8},[$key_],#16
2899 subs $cnt,$cnt,#2
2900 aese $dat0,q9
2901 aesmc $dat0,$dat0
2902 aese $dat1,q9
2903 aesmc $dat1,$dat1
2904 vld1.32 {q9},[$key_],#16
2905 b.gt .Lctr32_tail
2906
2907 aese $dat0,q8
2908 aesmc $dat0,$dat0
2909 aese $dat1,q8
2910 aesmc $dat1,$dat1
2911 aese $dat0,q9
2912 aesmc $dat0,$dat0
2913 aese $dat1,q9
2914 aesmc $dat1,$dat1
2915 vld1.8 {$in0},[$inp],$step
2916 aese $dat0,q12
2917 aesmc $dat0,$dat0
2918 aese $dat1,q12
2919 aesmc $dat1,$dat1
2920 vld1.8 {$in1},[$inp]
2921 aese $dat0,q13
2922 aesmc $dat0,$dat0
2923 aese $dat1,q13
2924 aesmc $dat1,$dat1
2925 veor $in0,$in0,$rndlast
2926 aese $dat0,q14
2927 aesmc $dat0,$dat0
2928 aese $dat1,q14
2929 aesmc $dat1,$dat1
2930 veor $in1,$in1,$rndlast
2931 aese $dat0,q15
2932 aese $dat1,q15
2933
2934 cmp $len,#1
2935 veor $in0,$in0,$dat0
2936 veor $in1,$in1,$dat1
2937 vst1.8 {$in0},[$out],#16
2938 b.eq .Lctr32_done
2939 vst1.8 {$in1},[$out]
2940
2941 .Lctr32_done:
2942 ___
2943 $code.=<<___ if ($flavour !~ /64/);
2944 vldmia sp!,{d8-d15}
2945 ldmia sp!,{r4-r10,pc}
2946 ___
2947 $code.=<<___ if ($flavour =~ /64/);
2948 ldr x29,[sp],#16
2949 ret
2950 ___
2951 $code.=<<___;
2952 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2953 ___
2954 }}}
2955 # Performance in cycles per byte.
2956 # Processed with AES-XTS different key size.
2957 # It shows the value before and after optimization as below:
2958 # (before/after):
2959 #
2960 # AES-128-XTS AES-256-XTS
2961 # Cortex-A57 3.36/1.09 4.02/1.37
2962 # Cortex-A72 3.03/1.02 3.28/1.33
2963
2964 # Optimization is implemented by loop unrolling and interleaving.
2965 # Commonly, we choose the unrolling factor as 5; if the input
2966 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
2967 # we choose 3 as the unrolling factor.
2968 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2969 # as one iteration, every loop the left size lsize -= 5*16.
2970 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2971 # will be processed specially, which be integrated into the 5*16 bytes
2972 # loop to improve the efficiency.
2973 # There is one special case, if the original input data size dsize
2974 # = 16 bytes, we will treat it separately to improve the
2975 # performance: one independent code block without LR, FP load and
2976 # store.
2977 # Encryption will process the (length - tailcnt) bytes as described
2978 # previously, then encrypt the composite block as the second-to-last
2979 # cipher block.
2980 # Decryption will process the (length - tailcnt - 1) bytes as described
2981 # previously, then decrypt the second-to-last cipher block to get the
2982 # last plain block (the tail), and decrypt the composite block as the
2983 # second-to-last plain-text block.
2984
2985 {{{
# Register/alias map for ${prefix}_xts_encrypt (XTS mode).
# Integer arguments: in/out pointers, byte length, data key, tweak key, IV.
2986 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# $rounds0 - round count from the key schedule; $rounds - working copy;
# $key_ - roving key pointer; $ivl/$ivh - low/high 64 bits of the tweak.
2987 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Byte-level pointers/temporaries for the cipher-stealing tail copy.
2988 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt - length%16; $midnum/$constnum - tweak-update scratch and the
# GF(2^128) feedback constant 0x87.
2989 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2990 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2991 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Up to five tweak values kept live across the 5x-unrolled loop.
2992 my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
# 64-bit views of the tweak registers used for the doubling arithmetic.
2993 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2994 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2995
2996 my ($tmpin)=("v26.16b");
2997 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2998
2999 # q7 last round key
3000 # q10-q15, q7 Last 7 round keys
3001 # q8-q9 preloaded round keys except last 7 keys for big size
3002 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
3003
3004
# NOTE(review): this re-declares $tmp2 (first declared above), shadowing
# the q6 alias with q9 for the remainder of the section — appears
# intentional; confirm against upstream before changing.
3005 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3006
3007 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
3008 my ($dat4,$in4,$tmp4);
3009 if ($flavour =~ /64/) {
3010 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3011 }
3012
3013 $code.=<<___ if ($flavour =~ /64/);
3014 .globl ${prefix}_xts_encrypt
3015 .type ${prefix}_xts_encrypt,%function
3016 .align 5
3017 ${prefix}_xts_encrypt:
3018 ___
3019 $code.=<<___ if ($flavour =~ /64/);
3020 AARCH64_VALID_CALL_TARGET
3021 cmp $len,#16
3022 // Original input data size bigger than 16, jump to big size processing.
3023 b.ne .Lxts_enc_big_size
3024 // Encrypt the iv with key2, as the first XEX iv.
3025 ldr $rounds,[$key2,#240]
3026 vld1.32 {$dat},[$key2],#16
3027 vld1.8 {$iv0},[$ivp]
3028 sub $rounds,$rounds,#2
3029 vld1.32 {$dat1},[$key2],#16
3030
3031 .Loop_enc_iv_enc:
3032 aese $iv0,$dat
3033 aesmc $iv0,$iv0
3034 vld1.32 {$dat},[$key2],#16
3035 subs $rounds,$rounds,#2
3036 aese $iv0,$dat1
3037 aesmc $iv0,$iv0
3038 vld1.32 {$dat1},[$key2],#16
3039 b.gt .Loop_enc_iv_enc
3040
3041 aese $iv0,$dat
3042 aesmc $iv0,$iv0
3043 vld1.32 {$dat},[$key2]
3044 aese $iv0,$dat1
3045 veor $iv0,$iv0,$dat
3046
3047 vld1.8 {$dat0},[$inp]
3048 veor $dat0,$iv0,$dat0
3049
3050 ldr $rounds,[$key1,#240]
3051 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
3052
3053 aese $dat0,q20
3054 aesmc $dat0,$dat0
3055 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
3056 aese $dat0,q21
3057 aesmc $dat0,$dat0
3058 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
3059 b.eq .Lxts_128_enc
3060 .Lxts_enc_round_loop:
3061 aese $dat0,q8
3062 aesmc $dat0,$dat0
3063 vld1.32 {q8},[$key1],#16 // load key schedule...
3064 aese $dat0,q9
3065 aesmc $dat0,$dat0
3066 vld1.32 {q9},[$key1],#16 // load key schedule...
3067 subs $rounds,$rounds,#2 // bias
3068 b.gt .Lxts_enc_round_loop
3069 .Lxts_128_enc:
3070 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
3071 aese $dat0,q8
3072 aesmc $dat0,$dat0
3073 aese $dat0,q9
3074 aesmc $dat0,$dat0
3075 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
3076 aese $dat0,q10
3077 aesmc $dat0,$dat0
3078 aese $dat0,q11
3079 aesmc $dat0,$dat0
3080 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
3081 aese $dat0,q12
3082 aesmc $dat0,$dat0
3083 aese $dat0,q13
3084 aesmc $dat0,$dat0
3085 vld1.32 {$rndlast},[$key1]
3086 aese $dat0,q14
3087 aesmc $dat0,$dat0
3088 aese $dat0,q15
3089 veor $dat0,$dat0,$rndlast
3090 veor $dat0,$dat0,$iv0
3091 vst1.8 {$dat0},[$out]
3092 b .Lxts_enc_final_abort
3093
3094 .align 4
3095 .Lxts_enc_big_size:
3096 ___
3097 $code.=<<___ if ($flavour =~ /64/);
3098 stp $constnumx,$tmpinp,[sp,#-64]!
3099 stp $tailcnt,$midnumx,[sp,#48]
3100 stp $ivd10,$ivd20,[sp,#32]
3101 stp $ivd30,$ivd40,[sp,#16]
3102
3103 // tailcnt store the tail value of length%16.
3104 and $tailcnt,$len,#0xf
3105 and $len,$len,#-16
3106 subs $len,$len,#16
3107 mov $step,#16
3108 b.lo .Lxts_abort
3109 csel $step,xzr,$step,eq
3110
3111 // Firstly, encrypt the iv with key2, as the first iv of XEX.
3112 ldr $rounds,[$key2,#240]
3113 vld1.32 {$dat},[$key2],#16
3114 vld1.8 {$iv0},[$ivp]
3115 sub $rounds,$rounds,#2
3116 vld1.32 {$dat1},[$key2],#16
3117
3118 .Loop_iv_enc:
3119 aese $iv0,$dat
3120 aesmc $iv0,$iv0
3121 vld1.32 {$dat},[$key2],#16
3122 subs $rounds,$rounds,#2
3123 aese $iv0,$dat1
3124 aesmc $iv0,$iv0
3125 vld1.32 {$dat1},[$key2],#16
3126 b.gt .Loop_iv_enc
3127
3128 aese $iv0,$dat
3129 aesmc $iv0,$iv0
3130 vld1.32 {$dat},[$key2]
3131 aese $iv0,$dat1
3132 veor $iv0,$iv0,$dat
3133
3134 // The iv for second block
3135 // $ivl- iv(low), $ivh - iv(high)
3136 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3137 fmov $ivl,$ivd00
3138 fmov $ivh,$ivd01
3139 mov $constnum,#0x87
3140 extr $midnumx,$ivh,$ivh,#32
3141 extr $ivh,$ivh,$ivl,#63
3142 and $tmpmw,$constnum,$midnum,asr#31
3143 eor $ivl,$tmpmx,$ivl,lsl#1
3144 fmov $ivd10,$ivl
3145 fmov $ivd11,$ivh
3146
3147 ldr $rounds0,[$key1,#240] // next starting point
3148 vld1.8 {$dat},[$inp],$step
3149
3150 vld1.32 {q8-q9},[$key1] // load key schedule...
3151 sub $rounds0,$rounds0,#6
3152 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3153 sub $rounds0,$rounds0,#2
3154 vld1.32 {q10-q11},[$key_],#32
3155 vld1.32 {q12-q13},[$key_],#32
3156 vld1.32 {q14-q15},[$key_],#32
3157 vld1.32 {$rndlast},[$key_]
3158
3159 add $key_,$key1,#32
3160 mov $rounds,$rounds0
3161
3162 // Encryption
3163 .Lxts_enc:
3164 vld1.8 {$dat2},[$inp],#16
3165 subs $len,$len,#32 // bias
3166 add $rounds,$rounds0,#2
3167 vorr $in1,$dat,$dat
3168 vorr $dat1,$dat,$dat
3169 vorr $in3,$dat,$dat
3170 vorr $in2,$dat2,$dat2
3171 vorr $in4,$dat2,$dat2
3172 b.lo .Lxts_inner_enc_tail
3173 veor $dat,$dat,$iv0 // before encryption, xor with iv
3174 veor $dat2,$dat2,$iv1
3175
3176 // The iv for third block
3177 extr $midnumx,$ivh,$ivh,#32
3178 extr $ivh,$ivh,$ivl,#63
3179 and $tmpmw,$constnum,$midnum,asr#31
3180 eor $ivl,$tmpmx,$ivl,lsl#1
3181 fmov $ivd20,$ivl
3182 fmov $ivd21,$ivh
3183
3184
3185 vorr $dat1,$dat2,$dat2
3186 vld1.8 {$dat2},[$inp],#16
3187 vorr $in0,$dat,$dat
3188 vorr $in1,$dat1,$dat1
3189 veor $in2,$dat2,$iv2 // the third block
3190 veor $dat2,$dat2,$iv2
3191 cmp $len,#32
3192 b.lo .Lxts_outer_enc_tail
3193
3194 // The iv for fourth block
3195 extr $midnumx,$ivh,$ivh,#32
3196 extr $ivh,$ivh,$ivl,#63
3197 and $tmpmw,$constnum,$midnum,asr#31
3198 eor $ivl,$tmpmx,$ivl,lsl#1
3199 fmov $ivd30,$ivl
3200 fmov $ivd31,$ivh
3201
3202 vld1.8 {$dat3},[$inp],#16
3203 // The iv for fifth block
3204 extr $midnumx,$ivh,$ivh,#32
3205 extr $ivh,$ivh,$ivl,#63
3206 and $tmpmw,$constnum,$midnum,asr#31
3207 eor $ivl,$tmpmx,$ivl,lsl#1
3208 fmov $ivd40,$ivl
3209 fmov $ivd41,$ivh
3210
3211 vld1.8 {$dat4},[$inp],#16
3212 veor $dat3,$dat3,$iv3 // the fourth block
3213 veor $dat4,$dat4,$iv4
3214 sub $len,$len,#32 // bias
3215 mov $rounds,$rounds0
3216 b .Loop5x_xts_enc
3217
3218 .align 4
3219 .Loop5x_xts_enc:
3220 aese $dat0,q8
3221 aesmc $dat0,$dat0
3222 aese $dat1,q8
3223 aesmc $dat1,$dat1
3224 aese $dat2,q8
3225 aesmc $dat2,$dat2
3226 aese $dat3,q8
3227 aesmc $dat3,$dat3
3228 aese $dat4,q8
3229 aesmc $dat4,$dat4
3230 vld1.32 {q8},[$key_],#16
3231 subs $rounds,$rounds,#2
3232 aese $dat0,q9
3233 aesmc $dat0,$dat0
3234 aese $dat1,q9
3235 aesmc $dat1,$dat1
3236 aese $dat2,q9
3237 aesmc $dat2,$dat2
3238 aese $dat3,q9
3239 aesmc $dat3,$dat3
3240 aese $dat4,q9
3241 aesmc $dat4,$dat4
3242 vld1.32 {q9},[$key_],#16
3243 b.gt .Loop5x_xts_enc
3244
3245 aese $dat0,q8
3246 aesmc $dat0,$dat0
3247 aese $dat1,q8
3248 aesmc $dat1,$dat1
3249 aese $dat2,q8
3250 aesmc $dat2,$dat2
3251 aese $dat3,q8
3252 aesmc $dat3,$dat3
3253 aese $dat4,q8
3254 aesmc $dat4,$dat4
3255 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
3256
3257 aese $dat0,q9
3258 aesmc $dat0,$dat0
3259 aese $dat1,q9
3260 aesmc $dat1,$dat1
3261 aese $dat2,q9
3262 aesmc $dat2,$dat2
3263 aese $dat3,q9
3264 aesmc $dat3,$dat3
3265 aese $dat4,q9
3266 aesmc $dat4,$dat4
3267 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3268 mov $key_,$key1
3269
3270 aese $dat0,q10
3271 aesmc $dat0,$dat0
3272 aese $dat1,q10
3273 aesmc $dat1,$dat1
3274 aese $dat2,q10
3275 aesmc $dat2,$dat2
3276 aese $dat3,q10
3277 aesmc $dat3,$dat3
3278 aese $dat4,q10
3279 aesmc $dat4,$dat4
3280 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3281 // at exit from the loop v1.16b-v26.16b
3282 // are loaded with last "words"
3283 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
3284
3285 aese $dat0,q11
3286 aesmc $dat0,$dat0
3287 aese $dat1,q11
3288 aesmc $dat1,$dat1
3289 aese $dat2,q11
3290 aesmc $dat2,$dat2
3291 aese $dat3,q11
3292 aesmc $dat3,$dat3
3293 aese $dat4,q11
3294 aesmc $dat4,$dat4
3295
3296 aese $dat0,q12
3297 aesmc $dat0,$dat0
3298 aese $dat1,q12
3299 aesmc $dat1,$dat1
3300 aese $dat2,q12
3301 aesmc $dat2,$dat2
3302 aese $dat3,q12
3303 aesmc $dat3,$dat3
3304 aese $dat4,q12
3305 aesmc $dat4,$dat4
3306
3307 aese $dat0,q13
3308 aesmc $dat0,$dat0
3309 aese $dat1,q13
3310 aesmc $dat1,$dat1
3311 aese $dat2,q13
3312 aesmc $dat2,$dat2
3313 aese $dat3,q13
3314 aesmc $dat3,$dat3
3315 aese $dat4,q13
3316 aesmc $dat4,$dat4
3317
3318 aese $dat0,q14
3319 aesmc $dat0,$dat0
3320 aese $dat1,q14
3321 aesmc $dat1,$dat1
3322 aese $dat2,q14
3323 aesmc $dat2,$dat2
3324 aese $dat3,q14
3325 aesmc $dat3,$dat3
3326 aese $dat4,q14
3327 aesmc $dat4,$dat4
3328
3329 veor $tmp0,$rndlast,$iv0
3330 aese $dat0,q15
3331 // The iv for first block of one iteration
3332 extr $midnumx,$ivh,$ivh,#32
3333 extr $ivh,$ivh,$ivl,#63
3334 and $tmpmw,$constnum,$midnum,asr#31
3335 eor $ivl,$tmpmx,$ivl,lsl#1
3336 fmov $ivd00,$ivl
3337 fmov $ivd01,$ivh
3338 veor $tmp1,$rndlast,$iv1
3339 vld1.8 {$in0},[$inp],#16
3340 aese $dat1,q15
3341 // The iv for second block
3342 extr $midnumx,$ivh,$ivh,#32
3343 extr $ivh,$ivh,$ivl,#63
3344 and $tmpmw,$constnum,$midnum,asr#31
3345 eor $ivl,$tmpmx,$ivl,lsl#1
3346 fmov $ivd10,$ivl
3347 fmov $ivd11,$ivh
3348 veor $tmp2,$rndlast,$iv2
3349 vld1.8 {$in1},[$inp],#16
3350 aese $dat2,q15
3351 // The iv for third block
3352 extr $midnumx,$ivh,$ivh,#32
3353 extr $ivh,$ivh,$ivl,#63
3354 and $tmpmw,$constnum,$midnum,asr#31
3355 eor $ivl,$tmpmx,$ivl,lsl#1
3356 fmov $ivd20,$ivl
3357 fmov $ivd21,$ivh
3358 veor $tmp3,$rndlast,$iv3
3359 vld1.8 {$in2},[$inp],#16
3360 aese $dat3,q15
3361 // The iv for fourth block
3362 extr $midnumx,$ivh,$ivh,#32
3363 extr $ivh,$ivh,$ivl,#63
3364 and $tmpmw,$constnum,$midnum,asr#31
3365 eor $ivl,$tmpmx,$ivl,lsl#1
3366 fmov $ivd30,$ivl
3367 fmov $ivd31,$ivh
3368 veor $tmp4,$rndlast,$iv4
3369 vld1.8 {$in3},[$inp],#16
3370 aese $dat4,q15
3371
3372 // The iv for fifth block
3373 extr $midnumx,$ivh,$ivh,#32
3374 extr $ivh,$ivh,$ivl,#63
3375 and $tmpmw,$constnum,$midnum,asr #31
3376 eor $ivl,$tmpmx,$ivl,lsl #1
3377 fmov $ivd40,$ivl
3378 fmov $ivd41,$ivh
3379
3380 vld1.8 {$in4},[$inp],#16
3381 cbz $xoffset,.Lxts_enc_tail4x
3382 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3383 veor $tmp0,$tmp0,$dat0
3384 veor $dat0,$in0,$iv0
3385 veor $tmp1,$tmp1,$dat1
3386 veor $dat1,$in1,$iv1
3387 veor $tmp2,$tmp2,$dat2
3388 veor $dat2,$in2,$iv2
3389 veor $tmp3,$tmp3,$dat3
3390 veor $dat3,$in3,$iv3
3391 veor $tmp4,$tmp4,$dat4
3392 vst1.8 {$tmp0},[$out],#16
3393 veor $dat4,$in4,$iv4
3394 vst1.8 {$tmp1},[$out],#16
3395 mov $rounds,$rounds0
3396 vst1.8 {$tmp2},[$out],#16
3397 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3398 vst1.8 {$tmp3},[$out],#16
3399 vst1.8 {$tmp4},[$out],#16
3400 b.hs .Loop5x_xts_enc
3401
3402
3403 // If left 4 blocks, borrow the five block's processing.
3404 cmn $len,#0x10
3405 b.ne .Loop5x_enc_after
3406 vorr $iv4,$iv3,$iv3
3407 vorr $iv3,$iv2,$iv2
3408 vorr $iv2,$iv1,$iv1
3409 vorr $iv1,$iv0,$iv0
3410 fmov $ivl,$ivd40
3411 fmov $ivh,$ivd41
3412 veor $dat0,$iv0,$in0
3413 veor $dat1,$iv1,$in1
3414 veor $dat2,$in2,$iv2
3415 veor $dat3,$in3,$iv3
3416 veor $dat4,$in4,$iv4
3417 b.eq .Loop5x_xts_enc
3418
3419 .Loop5x_enc_after:
3420 add $len,$len,#0x50
3421 cbz $len,.Lxts_enc_done
3422
3423 add $rounds,$rounds0,#2
3424 subs $len,$len,#0x30
3425 b.lo .Lxts_inner_enc_tail
3426
3427 veor $dat0,$iv0,$in2
3428 veor $dat1,$iv1,$in3
3429 veor $dat2,$in4,$iv2
3430 b .Lxts_outer_enc_tail
3431
3432 .align 4
3433 .Lxts_enc_tail4x:
3434 add $inp,$inp,#16
3435 veor $tmp1,$dat1,$tmp1
3436 vst1.8 {$tmp1},[$out],#16
3437 veor $tmp2,$dat2,$tmp2
3438 vst1.8 {$tmp2},[$out],#16
3439 veor $tmp3,$dat3,$tmp3
3440 veor $tmp4,$dat4,$tmp4
3441 vst1.8 {$tmp3-$tmp4},[$out],#32
3442
3443 b .Lxts_enc_done
3444 .align 4
3445 .Lxts_outer_enc_tail:
3446 aese $dat0,q8
3447 aesmc $dat0,$dat0
3448 aese $dat1,q8
3449 aesmc $dat1,$dat1
3450 aese $dat2,q8
3451 aesmc $dat2,$dat2
3452 vld1.32 {q8},[$key_],#16
3453 subs $rounds,$rounds,#2
3454 aese $dat0,q9
3455 aesmc $dat0,$dat0
3456 aese $dat1,q9
3457 aesmc $dat1,$dat1
3458 aese $dat2,q9
3459 aesmc $dat2,$dat2
3460 vld1.32 {q9},[$key_],#16
3461 b.gt .Lxts_outer_enc_tail
3462
3463 aese $dat0,q8
3464 aesmc $dat0,$dat0
3465 aese $dat1,q8
3466 aesmc $dat1,$dat1
3467 aese $dat2,q8
3468 aesmc $dat2,$dat2
3469 veor $tmp0,$iv0,$rndlast
3470 subs $len,$len,#0x30
3471 // The iv for first block
3472 fmov $ivl,$ivd20
3473 fmov $ivh,$ivd21
3474 //mov $constnum,#0x87
3475 extr $midnumx,$ivh,$ivh,#32
3476 extr $ivh,$ivh,$ivl,#63
3477 and $tmpmw,$constnum,$midnum,asr#31
3478 eor $ivl,$tmpmx,$ivl,lsl#1
3479 fmov $ivd00,$ivl
3480 fmov $ivd01,$ivh
3481 veor $tmp1,$iv1,$rndlast
3482 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3483 aese $dat0,q9
3484 aesmc $dat0,$dat0
3485 aese $dat1,q9
3486 aesmc $dat1,$dat1
3487 aese $dat2,q9
3488 aesmc $dat2,$dat2
3489 veor $tmp2,$iv2,$rndlast
3490
3491 add $xoffset,$xoffset,#0x20
3492 add $inp,$inp,$xoffset
3493 mov $key_,$key1
3494
3495 aese $dat0,q12
3496 aesmc $dat0,$dat0
3497 aese $dat1,q12
3498 aesmc $dat1,$dat1
3499 aese $dat2,q12
3500 aesmc $dat2,$dat2
3501 aese $dat0,q13
3502 aesmc $dat0,$dat0
3503 aese $dat1,q13
3504 aesmc $dat1,$dat1
3505 aese $dat2,q13
3506 aesmc $dat2,$dat2
3507 aese $dat0,q14
3508 aesmc $dat0,$dat0
3509 aese $dat1,q14
3510 aesmc $dat1,$dat1
3511 aese $dat2,q14
3512 aesmc $dat2,$dat2
3513 aese $dat0,q15
3514 aese $dat1,q15
3515 aese $dat2,q15
3516 vld1.8 {$in2},[$inp],#16
3517 add $rounds,$rounds0,#2
3518 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3519 veor $tmp0,$tmp0,$dat0
3520 veor $tmp1,$tmp1,$dat1
3521 veor $dat2,$dat2,$tmp2
3522 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3523 vst1.8 {$tmp0},[$out],#16
3524 vst1.8 {$tmp1},[$out],#16
3525 vst1.8 {$dat2},[$out],#16
3526 cmn $len,#0x30
3527 b.eq .Lxts_enc_done
3528 .Lxts_encxor_one:
3529 vorr $in3,$in1,$in1
3530 vorr $in4,$in2,$in2
3531 nop
3532
3533 .Lxts_inner_enc_tail:
3534 cmn $len,#0x10
3535 veor $dat1,$in3,$iv0
3536 veor $dat2,$in4,$iv1
3537 b.eq .Lxts_enc_tail_loop
3538 veor $dat2,$in4,$iv0
3539 .Lxts_enc_tail_loop:
3540 aese $dat1,q8
3541 aesmc $dat1,$dat1
3542 aese $dat2,q8
3543 aesmc $dat2,$dat2
3544 vld1.32 {q8},[$key_],#16
3545 subs $rounds,$rounds,#2
3546 aese $dat1,q9
3547 aesmc $dat1,$dat1
3548 aese $dat2,q9
3549 aesmc $dat2,$dat2
3550 vld1.32 {q9},[$key_],#16
3551 b.gt .Lxts_enc_tail_loop
3552
3553 aese $dat1,q8
3554 aesmc $dat1,$dat1
3555 aese $dat2,q8
3556 aesmc $dat2,$dat2
3557 aese $dat1,q9
3558 aesmc $dat1,$dat1
3559 aese $dat2,q9
3560 aesmc $dat2,$dat2
3561 aese $dat1,q12
3562 aesmc $dat1,$dat1
3563 aese $dat2,q12
3564 aesmc $dat2,$dat2
3565 cmn $len,#0x20
3566 aese $dat1,q13
3567 aesmc $dat1,$dat1
3568 aese $dat2,q13
3569 aesmc $dat2,$dat2
3570 veor $tmp1,$iv0,$rndlast
3571 aese $dat1,q14
3572 aesmc $dat1,$dat1
3573 aese $dat2,q14
3574 aesmc $dat2,$dat2
3575 veor $tmp2,$iv1,$rndlast
3576 aese $dat1,q15
3577 aese $dat2,q15
3578 b.eq .Lxts_enc_one
3579 veor $tmp1,$tmp1,$dat1
3580 vst1.8 {$tmp1},[$out],#16
3581 veor $tmp2,$tmp2,$dat2
3582 vorr $iv0,$iv1,$iv1
3583 vst1.8 {$tmp2},[$out],#16
3584 fmov $ivl,$ivd10
3585 fmov $ivh,$ivd11
3586 mov $constnum,#0x87
3587 extr $midnumx,$ivh,$ivh,#32
3588 extr $ivh,$ivh,$ivl,#63
3589 and $tmpmw,$constnum,$midnum,asr #31
3590 eor $ivl,$tmpmx,$ivl,lsl #1
3591 fmov $ivd00,$ivl
3592 fmov $ivd01,$ivh
3593 b .Lxts_enc_done
3594
3595 .Lxts_enc_one:
3596 veor $tmp1,$tmp1,$dat2
3597 vorr $iv0,$iv0,$iv0
3598 vst1.8 {$tmp1},[$out],#16
3599 fmov $ivl,$ivd00
3600 fmov $ivh,$ivd01
3601 mov $constnum,#0x87
3602 extr $midnumx,$ivh,$ivh,#32
3603 extr $ivh,$ivh,$ivl,#63
3604 and $tmpmw,$constnum,$midnum,asr #31
3605 eor $ivl,$tmpmx,$ivl,lsl #1
3606 fmov $ivd00,$ivl
3607 fmov $ivd01,$ivh
3608 b .Lxts_enc_done
3609 .align 5
3610 .Lxts_enc_done:
3611 // Process the tail block with cipher stealing.
3612 tst $tailcnt,#0xf
3613 b.eq .Lxts_abort
3614
3615 mov $tmpinp,$inp
3616 mov $tmpoutp,$out
3617 sub $out,$out,#16
3618 .composite_enc_loop:
3619 subs $tailcnt,$tailcnt,#1
3620 ldrb $l2outp,[$out,$tailcnt]
3621 ldrb $loutp,[$tmpinp,$tailcnt]
3622 strb $l2outp,[$tmpoutp,$tailcnt]
3623 strb $loutp,[$out,$tailcnt]
3624 b.gt .composite_enc_loop
3625 .Lxts_enc_load_done:
3626 vld1.8 {$tmpin},[$out]
3627 veor $tmpin,$tmpin,$iv0
3628
3629 // Encrypt the composite block to get the last second encrypted text block
3630 ldr $rounds,[$key1,#240] // load key schedule...
3631 vld1.32 {$dat},[$key1],#16
3632 sub $rounds,$rounds,#2
3633 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3634 .Loop_final_enc:
3635 aese $tmpin,$dat0
3636 aesmc $tmpin,$tmpin
3637 vld1.32 {$dat0},[$key1],#16
3638 subs $rounds,$rounds,#2
3639 aese $tmpin,$dat1
3640 aesmc $tmpin,$tmpin
3641 vld1.32 {$dat1},[$key1],#16
3642 b.gt .Loop_final_enc
3643
3644 aese $tmpin,$dat0
3645 aesmc $tmpin,$tmpin
3646 vld1.32 {$dat0},[$key1]
3647 aese $tmpin,$dat1
3648 veor $tmpin,$tmpin,$dat0
3649 veor $tmpin,$tmpin,$iv0
3650 vst1.8 {$tmpin},[$out]
3651
3652 .Lxts_abort:
3653 ldp $tailcnt,$midnumx,[sp,#48]
3654 ldp $ivd10,$ivd20,[sp,#32]
3655 ldp $ivd30,$ivd40,[sp,#16]
3656 ldp $constnumx,$tmpinp,[sp],#64
3657 .Lxts_enc_final_abort:
3658 ret
3659 .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
3660 ___
3661
3662 }}}
3663 {{{
3664 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
3665 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
3666 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
3667 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
3668 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
3669 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
3670 my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
3671 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
3672 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
3673
3674 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
3675
3676 # q7 last round key
3677 # q10-q15, q7 Last 7 round keys
3678 # q8-q9 preloaded round keys except last 7 keys for big size
3679 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
3680
3681 {
3682 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3683
3684 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
3685 my ($dat4,$in4,$tmp4);
3686 if ($flavour =~ /64/) {
3687 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3688 }
3689
3690 $code.=<<___ if ($flavour =~ /64/);
3691 .globl ${prefix}_xts_decrypt
3692 .type ${prefix}_xts_decrypt,%function
3693 .align 5
3694 ${prefix}_xts_decrypt:
3695 AARCH64_VALID_CALL_TARGET
3696 ___
# Fast path for exactly one 16-byte block: encrypt the IV with key2 to get
# the XEX tweak, then run a single decryption with key1.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0		// un-tweak the decrypted block
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
# Multi-block path: save callee-saved state, compute the first tweak by
# encrypting the IV with key2, then derive tweaks 2..4 by GF(2^128)
# doubling (lsl #1 with 0x87 feedback) and enter the 5x pipeline.
$code.=<<___ if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87			// XTS feedback polynomial
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decrypt, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xor with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
// Main loop: two AES rounds per iteration on five blocks in parallel,
// streaming the key schedule through q8/q9.
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

3940 aesd $dat0,q9
3941 aesimc $dat0,$dat
3942 aesd $dat1,q9
3943 aesimc $dat1,$dat1
3944 aesd $dat2,q9
3945 aesimc $dat2,$dat2
3946 aesd $dat3,q9
3947 aesimc $dat3,$dat3
3948 aesd $dat4,q9
3949 aesimc $dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	// Last aesd round interleaved with next-iteration tweak doubling
	// and input loads, to hide latency.
	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	// Finish the five blocks (xor last-round key + tweak) and
	// immediately re-tweak the freshly loaded inputs.
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2($len) equal to -0x10, the left blocks is 4.
	// After specially processing, utilize the five blocks processing again.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done
.align	4
// Three-block tail: same round structure as the 5x loop, on $dat0-$dat2.
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

// One- or two-block tail.
.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0			// single block: re-tweak with iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2			// shift tweaks down for stealing path
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Processing the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
	// to get the last encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the last second plain text block
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	// Restore callee-saved state pushed at .Lxts_dec_big_size.
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
}	# end of XTS register-allocation scope
}}}
# Close the __ARM_MAX_ARCH__>=7 guard opened earlier in the file.
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    # Base encodings for AES/SHA3 instructions, for assemblers that do not
    # know the mnemonics; register numbers are OR-ed in below.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800,
	"eor3"	=>	0xce000000,	);

    # Emit an AES instruction as a raw .inst word (Rd in bits 0-4, Rn in 5-9).
    # NOTE(review): the substitution that calls unaes is commented out in the
    # foreach loop below, so this closure appears unused on the 64-bit path.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    # Emit a SHA3 (eor3) instruction as a raw .inst word; up to four operands
    # (Rd, Rn, Rm, Ra/immediate in bits 10-15).
    # NOTE(review): for forms without a fourth operand $4 is undef, so
    # eval($4)<<10 contributes 0 here — confirm no warnings are enabled.
    sub unsha3 {
		 my ($mnemonic,$arg)=@_;

		 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
		 &&
		 sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
			$mnemonic,$arg;
    }
    # Post-process $code line by line into AArch64 syntax.  The substitution
    # order is significant: register renaming runs before mnemonic fix-ups,
    # and the "or" chain applies at most one mnemonic rewrite per line.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;	# old->new registers
	s/\bq_([0-9]+)\b/"q".$1/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
	s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # ARMv7 NEON AES instruction base encodings.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as four raw bytes via the INST() macro,
    # splitting each q-register number into its 3+1 encoding fields.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

# Expand a q-register vtbl into the two d-register vtbl.8 instructions the
# 32-bit assembler expects: q<n> is the pair d<2n> (low) / d<2n+1> (high).
# Returns "" when the argument does not look like "qD,{qT},qI".
sub unvtbl {
    my $arg = shift;

    $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o or return "";
    my ($dst, $tbl, $idx) = ($1, $2, $3);
    return sprintf("vtbl.8 d%d,{q%d},d%d\n\t".
		   "vtbl.8 d%d,{q%d},d%d",
		   2*$dst,   $tbl, 2*$idx,
		   2*$dst+1, $tbl, 2*$idx+1);
}
4473
# Rewrite "vdup.32 qD,qS[L]" into d-register form: lane L of qS lives in
# d<2*S + (L>>1)>, element L&1.  Returns "" when the argument does not match.
sub unvdup32 {
    my $arg = shift;

    $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o or return "";
    my ($dst, $src, $lane) = ($1, $2, $3);
    return sprintf("vdup.32 q%d,d%d[%d]", $dst, 2*$src + ($lane >> 1), $lane & 1);
}
4480
# Rewrite "vmov.32 qD[L],src" into d-register form: lane L of qD is element
# L&1 of d<2*D + (L>>1)>.  Returns "" when the argument does not match.
sub unvmov32 {
    my $arg = shift;

    $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o or return "";
    my ($qreg, $lane, $src) = ($1, $2, $3);
    return sprintf("vmov.32 d%d[%d],%s", 2*$qreg + ($lane >> 1), $lane & 1, $src);
}
4487
    # Post-process $code into 32-bit (ARMv7 NEON) syntax.  As with the 64-bit
    # loop, substitution order matters and the "or" chain applies at most one
    # mnemonic rewrite per line.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo	or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo	or
	s/^(\s+)b\./$1b/o	or		# "b.cc" -> "bcc" (32-bit condition syntax)
	s/^(\s+)ret/$1bx\tlr/o;

	# Thumb-2 requires an IT block before a conditional mov.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";