# (gitweb page residue removed: thirdparty/openssl.git — crypto/aes/asm/aesv8-armx.pl)
#! /usr/bin/env perl
# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# April 2019
#
# Key to performance of parallelize-able modes is round instruction
# interleaving. But which factor to use? There is optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing interleave factor doesn't pay off. While on cons
# side we have code size increase and resource waste on platforms for
# which interleave factor is too high. In other words you want it to
# be just right. So far interleave factor of 3x was serving well all
# platforms. But for ThunderX2 optimal interleave factor was measured
# to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl transliterator either next to this script or in
# the shared perlasm directory; it converts the mixed 32/64-bit mnemonics
# emitted below into the flavour-specific assembler syntax.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe our STDOUT through the transliterator so that the final print of
# $code is post-processed on its way to $output.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";	# symbol prefix, e.g. aes_v8_set_encrypt_key

# Windows ARMASM spells the byte directive "DCB"; GNU as uses ".byte".
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
# Begin accumulating the generated assembly in $code; everything is only
# emitted when this module's code paths are compiled in (__ARM_MAX_ARCH__>=7).
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit: simply enable the crypto extension. 32-bit: stay on a baseline
# .arch so old binutils don't choke, and hand-encode the AES instructions
# via the INST macro (separate Thumb-2 and ARM byte orders).
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___
97
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key / ${prefix}_set_decrypt_key.
#
# Register/vector assignment; in 32-bit mode q4-q7 are avoided so that the
# module stays within its self-imposed budget of 16 vector registers.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the AES round constant (1), a byte-permutation used to
# rotate-and-splat the last word of the previous round key, and 0x1b for
# the round constants past 0x80.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument checks: NULL in/out pointers return -1; a bit length other than
# 128/192/256 returns -2. $ptr doubles as the return-value register.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decryption schedule = encryption schedule reversed in place, with
# AESIMC applied to every round key except the first and the last.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block encrypt or decrypt routine. $dir is "en" or "de";
# it selects both the symbol name (${prefix}_encrypt/${prefix}_decrypt)
# and the aese/aesmc vs. aesd/aesimc instruction pair interpolated below.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
# Rounds are processed two at a time ($rounds pre-biased by 2), with the
# next two round keys loaded while the current pair executes.
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
399
# Performance in cycles per byte.
# Processed with AES-ECB different key size.
# It shows the value before and after optimization as below:
# (before/after):
#
#		AES-128-ECB	AES-192-ECB	AES-256-ECB
# Cortex-A57	1.85/0.82	2.16/0.96	2.47/1.10
# Cortex-A72	1.64/0.85	1.82/0.99	2.13/1.14

# Optimization is implemented by loop unrolling and interleaving.
# Commonly, we choose the unrolling factor as 5; if the input
# data size is smaller than 5 blocks, but not smaller than 3 blocks,
# choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks
# as one iteration, every loop the left size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
# every loop lsize -= 3*16.
# If lsize < 3*16 bytes, treat them as the tail, interleave the
# two blocks AES instructions.
# There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
# performance: one independent code block without LR, FP load and
# store, just looks like what the original ECB implementation does.
{{{
# ECB en-/decryption: 5x-interleaved main loop (64-bit only), 3x loop,
# and a 2x tail; plus a frameless single-block fast path on 64-bit.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7		last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# 64-bit fast path: exactly one 16-byte block, handled without building a
# stack frame at all.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
# Common setup for the multi-block path: preload the first two and the
# last seven round keys, then branch on en-/decrypt.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt loop.
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# 3x-interleaved encrypt loop (both flavours) plus the 2x/1x tail.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decryption side: identical structure using aesd/aesimc.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved decrypt loop.
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# 3x-interleaved decrypt loop plus the 2x/1x tail.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
# Common epilogue for the big-size path; the 64-bit single-block fast path
# exits via .Lecb_Final_abort without a frame.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1230 {{{
# AES-CBC en/decryption: ${prefix}_cbc_encrypt(inp=x0, out=x1, len=x2,
# key=x3, ivp=x4, enc=w5).  Encryption is inherently serial and runs one
# block at a time (with a dedicated fast path for the 10-round/AES-128
# key schedule at .Lcbc_enc128); decryption is parallelized 3x, and
# additionally 5x on the 64-bit flavour.  q8-q15 hold preloaded round keys.
1231 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1234
1235 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1236 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1237
1238 ### q8-q15 preloaded key schedule
1239
# Prologue: reject len<16 (b.lo .Lcbc_abort), preload the whole key
# schedule and the IV, then branch to .Lcbc_dec when w5==0 (decrypting).
1240 $code.=<<___;
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1243 .align 5
1244 ${prefix}_cbc_encrypt:
1245 ___
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1250 add x29,sp,#0
1251 ___
1252 $code.=<<___ if ($flavour !~ /64/);
1253 mov ip,sp
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1257 ___
1258 $code.=<<___;
1259 subs $len,$len,#16
1260 mov $step,#16
1261 b.lo .Lcbc_abort
1262 cclr $step,eq
1263
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1266 and $len,$len,#-16
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1269
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1278
1279 add $key_,$key,#32
1280 mov $cnt,$rounds
1281 b.eq .Lcbc_dec
1282
1283 cmp $rounds,#2
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1286 b.eq .Lcbc_enc128
1287
1288 vld1.32 {$in0-$in1},[$key_]
1289 add $key_,$key,#16
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1292 aese $dat,q8
1293 aesmc $dat,$dat
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1296 b .Lenter_cbc_enc
1297
1298 .align 4
1299 .Loop_cbc_enc:
1300 aese $dat,q8
1301 aesmc $dat,$dat
1302 vst1.8 {$ivec},[$out],#16
1303 .Lenter_cbc_enc:
1304 aese $dat,q9
1305 aesmc $dat,$dat
1306 aese $dat,$in0
1307 aesmc $dat,$dat
1308 vld1.32 {q8},[$key4]
1309 cmp $rounds,#4
1310 aese $dat,$in1
1311 aesmc $dat,$dat
1312 vld1.32 {q9},[$key5]
1313 b.eq .Lcbc_enc192
1314
1315 aese $dat,q8
1316 aesmc $dat,$dat
1317 vld1.32 {q8},[$key6]
1318 aese $dat,q9
1319 aesmc $dat,$dat
1320 vld1.32 {q9},[$key7]
1321 nop
1322
1323 .Lcbc_enc192:
1324 aese $dat,q8
1325 aesmc $dat,$dat
1326 subs $len,$len,#16
1327 aese $dat,q9
1328 aesmc $dat,$dat
1329 cclr $step,eq
1330 aese $dat,q10
1331 aesmc $dat,$dat
1332 aese $dat,q11
1333 aesmc $dat,$dat
1334 vld1.8 {q8},[$inp],$step
1335 aese $dat,q12
1336 aesmc $dat,$dat
1337 veor q8,q8,$rndzero_n_last
1338 aese $dat,q13
1339 aesmc $dat,$dat
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1341 aese $dat,q14
1342 aesmc $dat,$dat
1343 aese $dat,q15
1344 veor $ivec,$dat,$rndlast
1345 b.hs .Loop_cbc_enc
1346
1347 vst1.8 {$ivec},[$out],#16
1348 b .Lcbc_done
1349
1350 .align 5
1351 .Lcbc_enc128:
1352 vld1.32 {$in0-$in1},[$key_]
1353 aese $dat,q8
1354 aesmc $dat,$dat
1355 b .Lenter_cbc_enc128
1356 .Loop_cbc_enc128:
1357 aese $dat,q8
1358 aesmc $dat,$dat
1359 vst1.8 {$ivec},[$out],#16
1360 .Lenter_cbc_enc128:
1361 aese $dat,q9
1362 aesmc $dat,$dat
1363 subs $len,$len,#16
1364 aese $dat,$in0
1365 aesmc $dat,$dat
1366 cclr $step,eq
1367 aese $dat,$in1
1368 aesmc $dat,$dat
1369 aese $dat,q10
1370 aesmc $dat,$dat
1371 aese $dat,q11
1372 aesmc $dat,$dat
1373 vld1.8 {q8},[$inp],$step
1374 aese $dat,q12
1375 aesmc $dat,$dat
1376 aese $dat,q13
1377 aesmc $dat,$dat
1378 aese $dat,q14
1379 aesmc $dat,$dat
1380 veor q8,q8,$rndzero_n_last
1381 aese $dat,q15
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1384
1385 vst1.8 {$ivec},[$out],#16
1386 b .Lcbc_done
1387 ___
# Inner scope: extra NEON registers for the parallel decrypt path.
# On the 64-bit flavour two more lanes ($dat3/$dat4) enable 5x unrolling.
1388 {
1389 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1390
1391 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1392 my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1395 }
1396
# Decrypt entry: stage up to three blocks, keeping the original
# ciphertext in $in* for the CBC xor after decryption.
1397 $code.=<<___;
1398 .align 5
1399 .Lcbc_dec:
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1402 add $cnt,$rounds,#2
1403 vorr $in1,$dat,$dat
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1406 b.lo .Lcbc_dec_tail
1407
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1410 vorr $in0,$dat,$dat
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1413 ___
# 64-bit only: 5x-interleaved decrypt main loop (.Loop5x_cbc_dec),
# falling back to the shared 3x loop for short remainders.
1414 $code.=<<___ if ($flavour =~ /64/);
1415 cmp $len,#32
1416 b.lo .Loop3x_cbc_dec
1417
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1421 mov $cnt,$rounds
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1424
1425 .Loop5x_cbc_dec:
1426 aesd $dat0,q8
1427 aesimc $dat0,$dat0
1428 aesd $dat1,q8
1429 aesimc $dat1,$dat1
1430 aesd $dat2,q8
1431 aesimc $dat2,$dat2
1432 aesd $dat3,q8
1433 aesimc $dat3,$dat3
1434 aesd $dat4,q8
1435 aesimc $dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1437 subs $cnt,$cnt,#2
1438 aesd $dat0,q9
1439 aesimc $dat0,$dat0
1440 aesd $dat1,q9
1441 aesimc $dat1,$dat1
1442 aesd $dat2,q9
1443 aesimc $dat2,$dat2
1444 aesd $dat3,q9
1445 aesimc $dat3,$dat3
1446 aesd $dat4,q9
1447 aesimc $dat4,$dat4
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1450
1451 aesd $dat0,q8
1452 aesimc $dat0,$dat0
1453 aesd $dat1,q8
1454 aesimc $dat1,$dat1
1455 aesd $dat2,q8
1456 aesimc $dat2,$dat2
1457 aesd $dat3,q8
1458 aesimc $dat3,$dat3
1459 aesd $dat4,q8
1460 aesimc $dat4,$dat4
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1462 sub $len,$len,#0x50
1463
1464 aesd $dat0,q9
1465 aesimc $dat0,$dat0
1466 aesd $dat1,q9
1467 aesimc $dat1,$dat1
1468 aesd $dat2,q9
1469 aesimc $dat2,$dat2
1470 aesd $dat3,q9
1471 aesimc $dat3,$dat3
1472 aesd $dat4,q9
1473 aesimc $dat4,$dat4
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1475 mov $key_,$key
1476
1477 aesd $dat0,q10
1478 aesimc $dat0,$dat0
1479 aesd $dat1,q10
1480 aesimc $dat1,$dat1
1481 aesd $dat2,q10
1482 aesimc $dat2,$dat2
1483 aesd $dat3,q10
1484 aesimc $dat3,$dat3
1485 aesd $dat4,q10
1486 aesimc $dat4,$dat4
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1491
1492 aesd $dat0,q11
1493 aesimc $dat0,$dat0
1494 aesd $dat1,q11
1495 aesimc $dat1,$dat1
1496 aesd $dat2,q11
1497 aesimc $dat2,$dat2
1498 aesd $dat3,q11
1499 aesimc $dat3,$dat3
1500 aesd $dat4,q11
1501 aesimc $dat4,$dat4
1502
1503 aesd $dat0,q12
1504 aesimc $dat0,$dat0
1505 aesd $dat1,q12
1506 aesimc $dat1,$dat1
1507 aesd $dat2,q12
1508 aesimc $dat2,$dat2
1509 aesd $dat3,q12
1510 aesimc $dat3,$dat3
1511 aesd $dat4,q12
1512 aesimc $dat4,$dat4
1513
1514 aesd $dat0,q13
1515 aesimc $dat0,$dat0
1516 aesd $dat1,q13
1517 aesimc $dat1,$dat1
1518 aesd $dat2,q13
1519 aesimc $dat2,$dat2
1520 aesd $dat3,q13
1521 aesimc $dat3,$dat3
1522 aesd $dat4,q13
1523 aesimc $dat4,$dat4
1524
1525 aesd $dat0,q14
1526 aesimc $dat0,$dat0
1527 aesd $dat1,q14
1528 aesimc $dat1,$dat1
1529 aesd $dat2,q14
1530 aesimc $dat2,$dat2
1531 aesd $dat3,q14
1532 aesimc $dat3,$dat3
1533 aesd $dat4,q14
1534 aesimc $dat4,$dat4
1535
1536 veor $tmp0,$ivec,$rndlast
1537 aesd $dat0,q15
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1540 aesd $dat1,q15
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1543 aesd $dat2,q15
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1546 aesd $dat3,q15
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1549 aesd $dat4,q15
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1552 cbz x6,.Lcbc_tail4x
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1566 mov $cnt,$rounds
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1572
1573 add $len,$len,#0x50
1574 cbz $len,.Lcbc_done
1575
1576 add $cnt,$rounds,#2
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1579 vorr $in0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1581 vorr $in1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1583 vorr $in2,$in4,$in4
1584 b.lo .Lcbc_dec_tail
1585
1586 b .Loop3x_cbc_dec
1587
1588 .align 4
1589 .Lcbc_tail4x:
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1598
1599 b .Lcbc_done
1600 .align 4
1601 ___
# Shared 3x-interleaved decrypt loop and 1/2-block tail, both flavours.
1602 $code.=<<___;
1603 .Loop3x_cbc_dec:
1604 aesd $dat0,q8
1605 aesimc $dat0,$dat0
1606 aesd $dat1,q8
1607 aesimc $dat1,$dat1
1608 aesd $dat2,q8
1609 aesimc $dat2,$dat2
1610 vld1.32 {q8},[$key_],#16
1611 subs $cnt,$cnt,#2
1612 aesd $dat0,q9
1613 aesimc $dat0,$dat0
1614 aesd $dat1,q9
1615 aesimc $dat1,$dat1
1616 aesd $dat2,q9
1617 aesimc $dat2,$dat2
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1620
1621 aesd $dat0,q8
1622 aesimc $dat0,$dat0
1623 aesd $dat1,q8
1624 aesimc $dat1,$dat1
1625 aesd $dat2,q8
1626 aesimc $dat2,$dat2
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1631 aesd $dat0,q9
1632 aesimc $dat0,$dat0
1633 aesd $dat1,q9
1634 aesimc $dat1,$dat1
1635 aesd $dat2,q9
1636 aesimc $dat2,$dat2
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1642 mov $key_,$key
1643 aesd $dat0,q12
1644 aesimc $dat0,$dat0
1645 aesd $dat1,q12
1646 aesimc $dat1,$dat1
1647 aesd $dat2,q12
1648 aesimc $dat2,$dat2
1649 vld1.8 {$in0},[$inp],#16
1650 aesd $dat0,q13
1651 aesimc $dat0,$dat0
1652 aesd $dat1,q13
1653 aesimc $dat1,$dat1
1654 aesd $dat2,q13
1655 aesimc $dat2,$dat2
1656 vld1.8 {$in1},[$inp],#16
1657 aesd $dat0,q14
1658 aesimc $dat0,$dat0
1659 aesd $dat1,q14
1660 aesimc $dat1,$dat1
1661 aesd $dat2,q14
1662 aesimc $dat2,$dat2
1663 vld1.8 {$in2},[$inp],#16
1664 aesd $dat0,q15
1665 aesd $dat1,q15
1666 aesd $dat2,q15
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1668 add $cnt,$rounds,#2
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1680
1681 cmn $len,#0x30
1682 b.eq .Lcbc_done
1683 nop
1684
1685 .Lcbc_dec_tail:
1686 aesd $dat1,q8
1687 aesimc $dat1,$dat1
1688 aesd $dat2,q8
1689 aesimc $dat2,$dat2
1690 vld1.32 {q8},[$key_],#16
1691 subs $cnt,$cnt,#2
1692 aesd $dat1,q9
1693 aesimc $dat1,$dat1
1694 aesd $dat2,q9
1695 aesimc $dat2,$dat2
1696 vld1.32 {q9},[$key_],#16
1697 b.gt .Lcbc_dec_tail
1698
1699 aesd $dat1,q8
1700 aesimc $dat1,$dat1
1701 aesd $dat2,q8
1702 aesimc $dat2,$dat2
1703 aesd $dat1,q9
1704 aesimc $dat1,$dat1
1705 aesd $dat2,q9
1706 aesimc $dat2,$dat2
1707 aesd $dat1,q12
1708 aesimc $dat1,$dat1
1709 aesd $dat2,q12
1710 aesimc $dat2,$dat2
1711 cmn $len,#0x20
1712 aesd $dat1,q13
1713 aesimc $dat1,$dat1
1714 aesd $dat2,q13
1715 aesimc $dat2,$dat2
1716 veor $tmp1,$ivec,$rndlast
1717 aesd $dat1,q14
1718 aesimc $dat1,$dat1
1719 aesd $dat2,q14
1720 aesimc $dat2,$dat2
1721 veor $tmp2,$in1,$rndlast
1722 aesd $dat1,q15
1723 aesd $dat2,q15
1724 b.eq .Lcbc_dec_one
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1730 b .Lcbc_done
1731
1732 .Lcbc_dec_one:
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1736
1737 .Lcbc_done:
1738 vst1.8 {$ivec},[$ivp]
1739 .Lcbc_abort:
1740 ___
1741 }
# Epilogue: restore callee-saved state; .Lcbc_done has already written
# the updated IV back through $ivp.
1742 $code.=<<___ if ($flavour !~ /64/);
1743 vldmia sp!,{d8-d15}
1744 ldmia sp!,{r4-r8,pc}
1745 ___
1746 $code.=<<___ if ($flavour =~ /64/);
1747 ldr x29,[sp],#16
1748 ret
1749 ___
1750 $code.=<<___;
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1752 ___
1753 }}}
1754 {{{
# AES-CTR with a 32-bit big-endian counter: ${prefix}_ctr32_encrypt_blocks
# (inp=x0, out=x1, len=x2 in blocks, key=x3, ivp=x4).  The counter is the
# last 32-bit word of the IV (loaded from [$ivp,#12], byte-reversed on
# little-endian) and is incremented per block in $dat*[3].  Parallelized
# 3x on both flavours, plus a 5x path on 64-bit.
1755 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1756 my ($rounds,$cnt,$key_)=("w5","w6","x7");
1757 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1758 my $step="x12"; # aliases with $tctr2
1759
1760 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1761 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1762
1763 # used only in 64-bit mode...
1764 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1765
1766 my ($dat,$tmp)=($dat0,$tmp0);
1767
1768 ### q8-q15 preloaded key schedule
1769
# Prologue shared by both flavours: preload key schedule, read the
# counter word, and zero $step when only one block remains.
1770 $code.=<<___;
1771 .globl ${prefix}_ctr32_encrypt_blocks
1772 .type ${prefix}_ctr32_encrypt_blocks,%function
1773 .align 5
1774 ${prefix}_ctr32_encrypt_blocks:
1775 ___
1776 $code.=<<___ if ($flavour =~ /64/);
1777 AARCH64_VALID_CALL_TARGET
1778 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1779 stp x29,x30,[sp,#-16]!
1780 add x29,sp,#0
1781 ___
1782 $code.=<<___ if ($flavour !~ /64/);
1783 mov ip,sp
1784 stmdb sp!,{r4-r10,lr}
1785 vstmdb sp!,{d8-d15} @ ABI specification says so
1786 ldr r4, [ip] @ load remaining arg
1787 ___
1788 $code.=<<___;
1789 ldr $rounds,[$key,#240]
1790
1791 ldr $ctr, [$ivp, #12]
1792 #ifdef __ARMEB__
1793 vld1.8 {$dat0},[$ivp]
1794 #else
1795 vld1.32 {$dat0},[$ivp]
1796 #endif
1797 vld1.32 {q8-q9},[$key] // load key schedule...
1798 sub $rounds,$rounds,#4
1799 mov $step,#16
1800 cmp $len,#2
1801 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1802 sub $rounds,$rounds,#2
1803 vld1.32 {q12-q13},[$key_],#32
1804 vld1.32 {q14-q15},[$key_],#32
1805 vld1.32 {$rndlast},[$key_]
1806 add $key_,$key,#32
1807 mov $cnt,$rounds
1808 cclr $step,lo
1809 #ifndef __ARMEB__
1810 rev $ctr, $ctr
1811 #endif
1812 ___
# Flavour-specific counter staging for blocks 2 and 3; both paths fall
# through to .Lctr32_tail for len <= 2.
1813 $code.=<<___ if ($flavour =~ /64/);
1814 vorr $dat1,$dat0,$dat0
1815 add $tctr1, $ctr, #1
1816 vorr $dat2,$dat0,$dat0
1817 add $ctr, $ctr, #2
1818 vorr $ivec,$dat0,$dat0
1819 rev $tctr1, $tctr1
1820 vmov.32 ${dat1}[3],$tctr1
1821 b.ls .Lctr32_tail
1822 rev $tctr2, $ctr
1823 sub $len,$len,#3 // bias
1824 vmov.32 ${dat2}[3],$tctr2
1825 ___
1826 $code.=<<___ if ($flavour !~ /64/);
1827 add $tctr1, $ctr, #1
1828 vorr $ivec,$dat0,$dat0
1829 rev $tctr1, $tctr1
1830 vmov.32 ${ivec}[3],$tctr1
1831 add $ctr, $ctr, #2
1832 vorr $dat1,$ivec,$ivec
1833 b.ls .Lctr32_tail
1834 rev $tctr2, $ctr
1835 vmov.32 ${ivec}[3],$tctr2
1836 sub $len,$len,#3 // bias
1837 vorr $dat2,$ivec,$ivec
1838 ___
# 64-bit only: 5x-interleaved main loop (.Loop5x_ctr32); drops back to
# the shared 3x loop when fewer than 5 blocks remain.
1839 $code.=<<___ if ($flavour =~ /64/);
1840 cmp $len,#32
1841 b.lo .Loop3x_ctr32
1842
1843 add w13,$ctr,#1
1844 add w14,$ctr,#2
1845 vorr $dat3,$dat0,$dat0
1846 rev w13,w13
1847 vorr $dat4,$dat0,$dat0
1848 rev w14,w14
1849 vmov.32 ${dat3}[3],w13
1850 sub $len,$len,#2 // bias
1851 vmov.32 ${dat4}[3],w14
1852 add $ctr,$ctr,#2
1853 b .Loop5x_ctr32
1854
1855 .align 4
1856 .Loop5x_ctr32:
1857 aese $dat0,q8
1858 aesmc $dat0,$dat0
1859 aese $dat1,q8
1860 aesmc $dat1,$dat1
1861 aese $dat2,q8
1862 aesmc $dat2,$dat2
1863 aese $dat3,q8
1864 aesmc $dat3,$dat3
1865 aese $dat4,q8
1866 aesmc $dat4,$dat4
1867 vld1.32 {q8},[$key_],#16
1868 subs $cnt,$cnt,#2
1869 aese $dat0,q9
1870 aesmc $dat0,$dat0
1871 aese $dat1,q9
1872 aesmc $dat1,$dat1
1873 aese $dat2,q9
1874 aesmc $dat2,$dat2
1875 aese $dat3,q9
1876 aesmc $dat3,$dat3
1877 aese $dat4,q9
1878 aesmc $dat4,$dat4
1879 vld1.32 {q9},[$key_],#16
1880 b.gt .Loop5x_ctr32
1881
1882 mov $key_,$key
1883 aese $dat0,q8
1884 aesmc $dat0,$dat0
1885 aese $dat1,q8
1886 aesmc $dat1,$dat1
1887 aese $dat2,q8
1888 aesmc $dat2,$dat2
1889 aese $dat3,q8
1890 aesmc $dat3,$dat3
1891 aese $dat4,q8
1892 aesmc $dat4,$dat4
1893 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1894
1895 aese $dat0,q9
1896 aesmc $dat0,$dat0
1897 aese $dat1,q9
1898 aesmc $dat1,$dat1
1899 aese $dat2,q9
1900 aesmc $dat2,$dat2
1901 aese $dat3,q9
1902 aesmc $dat3,$dat3
1903 aese $dat4,q9
1904 aesmc $dat4,$dat4
1905 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1906
1907 aese $dat0,q12
1908 aesmc $dat0,$dat0
1909 add $tctr0,$ctr,#1
1910 add $tctr1,$ctr,#2
1911 aese $dat1,q12
1912 aesmc $dat1,$dat1
1913 add $tctr2,$ctr,#3
1914 add w13,$ctr,#4
1915 aese $dat2,q12
1916 aesmc $dat2,$dat2
1917 add w14,$ctr,#5
1918 rev $tctr0,$tctr0
1919 aese $dat3,q12
1920 aesmc $dat3,$dat3
1921 rev $tctr1,$tctr1
1922 rev $tctr2,$tctr2
1923 aese $dat4,q12
1924 aesmc $dat4,$dat4
1925 rev w13,w13
1926 rev w14,w14
1927
1928 aese $dat0,q13
1929 aesmc $dat0,$dat0
1930 aese $dat1,q13
1931 aesmc $dat1,$dat1
1932 aese $dat2,q13
1933 aesmc $dat2,$dat2
1934 aese $dat3,q13
1935 aesmc $dat3,$dat3
1936 aese $dat4,q13
1937 aesmc $dat4,$dat4
1938
1939 aese $dat0,q14
1940 aesmc $dat0,$dat0
1941 vld1.8 {$in0},[$inp],#16
1942 aese $dat1,q14
1943 aesmc $dat1,$dat1
1944 vld1.8 {$in1},[$inp],#16
1945 aese $dat2,q14
1946 aesmc $dat2,$dat2
1947 vld1.8 {$in2},[$inp],#16
1948 aese $dat3,q14
1949 aesmc $dat3,$dat3
1950 vld1.8 {$in3},[$inp],#16
1951 aese $dat4,q14
1952 aesmc $dat4,$dat4
1953 vld1.8 {$in4},[$inp],#16
1954
1955 aese $dat0,q15
1956 veor $in0,$in0,$rndlast
1957 aese $dat1,q15
1958 veor $in1,$in1,$rndlast
1959 aese $dat2,q15
1960 veor $in2,$in2,$rndlast
1961 aese $dat3,q15
1962 veor $in3,$in3,$rndlast
1963 aese $dat4,q15
1964 veor $in4,$in4,$rndlast
1965
1966 veor $in0,$in0,$dat0
1967 vorr $dat0,$ivec,$ivec
1968 veor $in1,$in1,$dat1
1969 vorr $dat1,$ivec,$ivec
1970 veor $in2,$in2,$dat2
1971 vorr $dat2,$ivec,$ivec
1972 veor $in3,$in3,$dat3
1973 vorr $dat3,$ivec,$ivec
1974 veor $in4,$in4,$dat4
1975 vorr $dat4,$ivec,$ivec
1976
1977 vst1.8 {$in0},[$out],#16
1978 vmov.32 ${dat0}[3],$tctr0
1979 vst1.8 {$in1},[$out],#16
1980 vmov.32 ${dat1}[3],$tctr1
1981 vst1.8 {$in2},[$out],#16
1982 vmov.32 ${dat2}[3],$tctr2
1983 vst1.8 {$in3},[$out],#16
1984 vmov.32 ${dat3}[3],w13
1985 vst1.8 {$in4},[$out],#16
1986 vmov.32 ${dat4}[3],w14
1987
1988 mov $cnt,$rounds
1989 cbz $len,.Lctr32_done
1990
1991 add $ctr,$ctr,#5
1992 subs $len,$len,#5
1993 b.hs .Loop5x_ctr32
1994
1995 add $len,$len,#5
1996 sub $ctr,$ctr,#5
1997
1998 cmp $len,#2
1999 mov $step,#16
2000 cclr $step,lo
2001 b.ls .Lctr32_tail
2002
2003 sub $len,$len,#3 // bias
2004 add $ctr,$ctr,#3
2005 ___
# Shared 3x-interleaved main loop and 1/2-block tail, both flavours.
2006 $code.=<<___;
2007 b .Loop3x_ctr32
2008
2009 .align 4
2010 .Loop3x_ctr32:
2011 aese $dat0,q8
2012 aesmc $dat0,$dat0
2013 aese $dat1,q8
2014 aesmc $dat1,$dat1
2015 aese $dat2,q8
2016 aesmc $dat2,$dat2
2017 vld1.32 {q8},[$key_],#16
2018 subs $cnt,$cnt,#2
2019 aese $dat0,q9
2020 aesmc $dat0,$dat0
2021 aese $dat1,q9
2022 aesmc $dat1,$dat1
2023 aese $dat2,q9
2024 aesmc $dat2,$dat2
2025 vld1.32 {q9},[$key_],#16
2026 b.gt .Loop3x_ctr32
2027
2028 aese $dat0,q8
2029 aesmc $tmp0,$dat0
2030 aese $dat1,q8
2031 aesmc $tmp1,$dat1
2032 vld1.8 {$in0},[$inp],#16
2033 ___
2034 $code.=<<___ if ($flavour =~ /64/);
2035 vorr $dat0,$ivec,$ivec
2036 ___
2037 $code.=<<___ if ($flavour !~ /64/);
2038 add $tctr0,$ctr,#1
2039 ___
2040 $code.=<<___;
2041 aese $dat2,q8
2042 aesmc $dat2,$dat2
2043 vld1.8 {$in1},[$inp],#16
2044 ___
2045 $code.=<<___ if ($flavour =~ /64/);
2046 vorr $dat1,$ivec,$ivec
2047 ___
2048 $code.=<<___ if ($flavour !~ /64/);
2049 rev $tctr0,$tctr0
2050 ___
2051 $code.=<<___;
2052 aese $tmp0,q9
2053 aesmc $tmp0,$tmp0
2054 aese $tmp1,q9
2055 aesmc $tmp1,$tmp1
2056 vld1.8 {$in2},[$inp],#16
2057 mov $key_,$key
2058 aese $dat2,q9
2059 aesmc $tmp2,$dat2
2060 ___
2061 $code.=<<___ if ($flavour =~ /64/);
2062 vorr $dat2,$ivec,$ivec
2063 add $tctr0,$ctr,#1
2064 ___
2065 $code.=<<___;
2066 aese $tmp0,q12
2067 aesmc $tmp0,$tmp0
2068 aese $tmp1,q12
2069 aesmc $tmp1,$tmp1
2070 veor $in0,$in0,$rndlast
2071 add $tctr1,$ctr,#2
2072 aese $tmp2,q12
2073 aesmc $tmp2,$tmp2
2074 veor $in1,$in1,$rndlast
2075 add $ctr,$ctr,#3
2076 aese $tmp0,q13
2077 aesmc $tmp0,$tmp0
2078 aese $tmp1,q13
2079 aesmc $tmp1,$tmp1
2080 veor $in2,$in2,$rndlast
2081 ___
2082 $code.=<<___ if ($flavour =~ /64/);
2083 rev $tctr0,$tctr0
2084 aese $tmp2,q13
2085 aesmc $tmp2,$tmp2
2086 vmov.32 ${dat0}[3], $tctr0
2087 ___
2088 $code.=<<___ if ($flavour !~ /64/);
2089 vmov.32 ${ivec}[3], $tctr0
2090 aese $tmp2,q13
2091 aesmc $tmp2,$tmp2
2092 vorr $dat0,$ivec,$ivec
2093 ___
2094 $code.=<<___;
2095 rev $tctr1,$tctr1
2096 aese $tmp0,q14
2097 aesmc $tmp0,$tmp0
2098 ___
2099 $code.=<<___ if ($flavour !~ /64/);
2100 vmov.32 ${ivec}[3], $tctr1
2101 rev $tctr2,$ctr
2102 ___
2103 $code.=<<___;
2104 aese $tmp1,q14
2105 aesmc $tmp1,$tmp1
2106 ___
2107 $code.=<<___ if ($flavour =~ /64/);
2108 vmov.32 ${dat1}[3], $tctr1
2109 rev $tctr2,$ctr
2110 aese $tmp2,q14
2111 aesmc $tmp2,$tmp2
2112 vmov.32 ${dat2}[3], $tctr2
2113 ___
2114 $code.=<<___ if ($flavour !~ /64/);
2115 vorr $dat1,$ivec,$ivec
2116 vmov.32 ${ivec}[3], $tctr2
2117 aese $tmp2,q14
2118 aesmc $tmp2,$tmp2
2119 vorr $dat2,$ivec,$ivec
2120 ___
2121 $code.=<<___;
2122 subs $len,$len,#3
2123 aese $tmp0,q15
2124 aese $tmp1,q15
2125 aese $tmp2,q15
2126
2127 veor $in0,$in0,$tmp0
2128 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2129 vst1.8 {$in0},[$out],#16
2130 veor $in1,$in1,$tmp1
2131 mov $cnt,$rounds
2132 vst1.8 {$in1},[$out],#16
2133 veor $in2,$in2,$tmp2
2134 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2135 vst1.8 {$in2},[$out],#16
2136 b.hs .Loop3x_ctr32
2137
2138 adds $len,$len,#3
2139 b.eq .Lctr32_done
2140 cmp $len,#1
2141 mov $step,#16
2142 cclr $step,eq
2143
2144 .Lctr32_tail:
2145 aese $dat0,q8
2146 aesmc $dat0,$dat0
2147 aese $dat1,q8
2148 aesmc $dat1,$dat1
2149 vld1.32 {q8},[$key_],#16
2150 subs $cnt,$cnt,#2
2151 aese $dat0,q9
2152 aesmc $dat0,$dat0
2153 aese $dat1,q9
2154 aesmc $dat1,$dat1
2155 vld1.32 {q9},[$key_],#16
2156 b.gt .Lctr32_tail
2157
2158 aese $dat0,q8
2159 aesmc $dat0,$dat0
2160 aese $dat1,q8
2161 aesmc $dat1,$dat1
2162 aese $dat0,q9
2163 aesmc $dat0,$dat0
2164 aese $dat1,q9
2165 aesmc $dat1,$dat1
2166 vld1.8 {$in0},[$inp],$step
2167 aese $dat0,q12
2168 aesmc $dat0,$dat0
2169 aese $dat1,q12
2170 aesmc $dat1,$dat1
2171 vld1.8 {$in1},[$inp]
2172 aese $dat0,q13
2173 aesmc $dat0,$dat0
2174 aese $dat1,q13
2175 aesmc $dat1,$dat1
2176 veor $in0,$in0,$rndlast
2177 aese $dat0,q14
2178 aesmc $dat0,$dat0
2179 aese $dat1,q14
2180 aesmc $dat1,$dat1
2181 veor $in1,$in1,$rndlast
2182 aese $dat0,q15
2183 aese $dat1,q15
2184
2185 cmp $len,#1
2186 veor $in0,$in0,$dat0
2187 veor $in1,$in1,$dat1
2188 vst1.8 {$in0},[$out],#16
2189 b.eq .Lctr32_done
2190 vst1.8 {$in1},[$out]
2191
2192 .Lctr32_done:
2193 ___
# Epilogue: restore callee-saved state per flavour.
2194 $code.=<<___ if ($flavour !~ /64/);
2195 vldmia sp!,{d8-d15}
2196 ldmia sp!,{r4-r10,pc}
2197 ___
2198 $code.=<<___ if ($flavour =~ /64/);
2199 ldr x29,[sp],#16
2200 ret
2201 ___
2202 $code.=<<___;
2203 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2204 ___
2205 }}}
2206 # Performance in cycles per byte.
2207 # Measured for AES-XTS with different key sizes.
2208 # The table shows the value before and after the optimization
2209 # (before/after):
2210 #
2211 # AES-128-XTS AES-256-XTS
2212 # Cortex-A57 3.36/1.09 4.02/1.37
2213 # Cortex-A72 3.03/1.02 3.28/1.33
2214
2215 # The optimization is implemented by loop unrolling and interleaving.
2216 # Normally the unrolling factor is 5; if the input
2217 # data size is smaller than 5 blocks but not smaller than 3 blocks,
2218 # the unrolling factor 3 is chosen instead.
2219 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2220 # as one iteration, every loop the left size lsize -= 5*16.
2221 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2222 # will be processed specially, which be integrated into the 5*16 bytes
2223 # loop to improve the efficiency.
2224 # There is one special case, if the original input data size dsize
2225 # = 16 bytes, we will treat it separately to improve the
2226 # performance: one independent code block without LR, FP load and
2227 # store.
2228 # Encryption processes the first (length - tailcnt) bytes as described
2229 # above, then encrypts the composite block as the second-to-last
2230 # cipher block.
2231 # Decryption processes the first (length - tailcnt - 1) bytes as described
2232 # above, then decrypts the second-to-last cipher block to obtain the
2233 # last plaintext block (the tail), and finally decrypts the composite
2234 # block as the second-to-last plaintext block.
2235
2236 {{{
2237 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2238 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2239 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2240 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2241 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2242 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2243 my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2244 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2245 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2246
2247 my ($tmpin)=("v26.16b");
2248 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2249
2250 # q7 last round key
2251 # q10-q15, q7 Last 7 round keys
2252 # q8-q9 preloaded round keys except last 7 keys for big size
2253 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2254
2255
2256 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2257
2258 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2259 my ($dat4,$in4,$tmp4);
2260 if ($flavour =~ /64/) {
2261 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2262 }
2263
2264 $code.=<<___ if ($flavour =~ /64/);
2265 .globl ${prefix}_xts_encrypt
2266 .type ${prefix}_xts_encrypt,%function
2267 .align 5
2268 ${prefix}_xts_encrypt:
2269 ___
2270 $code.=<<___ if ($flavour =~ /64/);
2271 AARCH64_VALID_CALL_TARGET
2272 cmp $len,#16
2273 // Original input data size bigger than 16, jump to big size processing.
2274 b.ne .Lxts_enc_big_size
2275 // Encrypt the iv with key2, as the first XEX iv.
2276 ldr $rounds,[$key2,#240]
2277 vld1.32 {$dat},[$key2],#16
2278 vld1.8 {$iv0},[$ivp]
2279 sub $rounds,$rounds,#2
2280 vld1.32 {$dat1},[$key2],#16
2281
2282 .Loop_enc_iv_enc:
2283 aese $iv0,$dat
2284 aesmc $iv0,$iv0
2285 vld1.32 {$dat},[$key2],#16
2286 subs $rounds,$rounds,#2
2287 aese $iv0,$dat1
2288 aesmc $iv0,$iv0
2289 vld1.32 {$dat1},[$key2],#16
2290 b.gt .Loop_enc_iv_enc
2291
2292 aese $iv0,$dat
2293 aesmc $iv0,$iv0
2294 vld1.32 {$dat},[$key2]
2295 aese $iv0,$dat1
2296 veor $iv0,$iv0,$dat
2297
2298 vld1.8 {$dat0},[$inp]
2299 veor $dat0,$iv0,$dat0
2300
2301 ldr $rounds,[$key1,#240]
2302 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2303
2304 aese $dat0,q20
2305 aesmc $dat0,$dat0
2306 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2307 aese $dat0,q21
2308 aesmc $dat0,$dat0
2309 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2310 b.eq .Lxts_128_enc
2311 .Lxts_enc_round_loop:
2312 aese $dat0,q8
2313 aesmc $dat0,$dat0
2314 vld1.32 {q8},[$key1],#16 // load key schedule...
2315 aese $dat0,q9
2316 aesmc $dat0,$dat0
2317 vld1.32 {q9},[$key1],#16 // load key schedule...
2318 subs $rounds,$rounds,#2 // bias
2319 b.gt .Lxts_enc_round_loop
2320 .Lxts_128_enc:
2321 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2322 aese $dat0,q8
2323 aesmc $dat0,$dat0
2324 aese $dat0,q9
2325 aesmc $dat0,$dat0
2326 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2327 aese $dat0,q10
2328 aesmc $dat0,$dat0
2329 aese $dat0,q11
2330 aesmc $dat0,$dat0
2331 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2332 aese $dat0,q12
2333 aesmc $dat0,$dat0
2334 aese $dat0,q13
2335 aesmc $dat0,$dat0
2336 vld1.32 {$rndlast},[$key1]
2337 aese $dat0,q14
2338 aesmc $dat0,$dat0
2339 aese $dat0,q15
2340 veor $dat0,$dat0,$rndlast
2341 veor $dat0,$dat0,$iv0
2342 vst1.8 {$dat0},[$out]
2343 b .Lxts_enc_final_abort
2344
2345 .align 4
2346 .Lxts_enc_big_size:
2347 ___
2348 $code.=<<___ if ($flavour =~ /64/);
2349 stp $constnumx,$tmpinp,[sp,#-64]!
2350 stp $tailcnt,$midnumx,[sp,#48]
2351 stp $ivd10,$ivd20,[sp,#32]
2352 stp $ivd30,$ivd40,[sp,#16]
2353
2354 // tailcnt store the tail value of length%16.
2355 and $tailcnt,$len,#0xf
2356 and $len,$len,#-16
2357 subs $len,$len,#16
2358 mov $step,#16
2359 b.lo .Lxts_abort
2360 csel $step,xzr,$step,eq
2361
2362 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2363 ldr $rounds,[$key2,#240]
2364 vld1.32 {$dat},[$key2],#16
2365 vld1.8 {$iv0},[$ivp]
2366 sub $rounds,$rounds,#2
2367 vld1.32 {$dat1},[$key2],#16
2368
2369 .Loop_iv_enc:
2370 aese $iv0,$dat
2371 aesmc $iv0,$iv0
2372 vld1.32 {$dat},[$key2],#16
2373 subs $rounds,$rounds,#2
2374 aese $iv0,$dat1
2375 aesmc $iv0,$iv0
2376 vld1.32 {$dat1},[$key2],#16
2377 b.gt .Loop_iv_enc
2378
2379 aese $iv0,$dat
2380 aesmc $iv0,$iv0
2381 vld1.32 {$dat},[$key2]
2382 aese $iv0,$dat1
2383 veor $iv0,$iv0,$dat
2384
2385 // The iv for second block
2386 // $ivl- iv(low), $ivh - iv(high)
2387 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2388 fmov $ivl,$ivd00
2389 fmov $ivh,$ivd01
2390 mov $constnum,#0x87
2391 extr $midnumx,$ivh,$ivh,#32
2392 extr $ivh,$ivh,$ivl,#63
2393 and $tmpmw,$constnum,$midnum,asr#31
2394 eor $ivl,$tmpmx,$ivl,lsl#1
2395 fmov $ivd10,$ivl
2396 fmov $ivd11,$ivh
2397
2398 ldr $rounds0,[$key1,#240] // next starting point
2399 vld1.8 {$dat},[$inp],$step
2400
2401 vld1.32 {q8-q9},[$key1] // load key schedule...
2402 sub $rounds0,$rounds0,#6
2403 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2404 sub $rounds0,$rounds0,#2
2405 vld1.32 {q10-q11},[$key_],#32
2406 vld1.32 {q12-q13},[$key_],#32
2407 vld1.32 {q14-q15},[$key_],#32
2408 vld1.32 {$rndlast},[$key_]
2409
2410 add $key_,$key1,#32
2411 mov $rounds,$rounds0
2412
2413 // Encryption
2414 .Lxts_enc:
2415 vld1.8 {$dat2},[$inp],#16
2416 subs $len,$len,#32 // bias
2417 add $rounds,$rounds0,#2
2418 vorr $in1,$dat,$dat
2419 vorr $dat1,$dat,$dat
2420 vorr $in3,$dat,$dat
2421 vorr $in2,$dat2,$dat2
2422 vorr $in4,$dat2,$dat2
2423 b.lo .Lxts_inner_enc_tail
2424 veor $dat,$dat,$iv0 // before encryption, xor with iv
2425 veor $dat2,$dat2,$iv1
2426
2427 // The iv for third block
2428 extr $midnumx,$ivh,$ivh,#32
2429 extr $ivh,$ivh,$ivl,#63
2430 and $tmpmw,$constnum,$midnum,asr#31
2431 eor $ivl,$tmpmx,$ivl,lsl#1
2432 fmov $ivd20,$ivl
2433 fmov $ivd21,$ivh
2434
2435
2436 vorr $dat1,$dat2,$dat2
2437 vld1.8 {$dat2},[$inp],#16
2438 vorr $in0,$dat,$dat
2439 vorr $in1,$dat1,$dat1
2440 veor $in2,$dat2,$iv2 // the third block
2441 veor $dat2,$dat2,$iv2
2442 cmp $len,#32
2443 b.lo .Lxts_outer_enc_tail
2444
2445 // The iv for fourth block
2446 extr $midnumx,$ivh,$ivh,#32
2447 extr $ivh,$ivh,$ivl,#63
2448 and $tmpmw,$constnum,$midnum,asr#31
2449 eor $ivl,$tmpmx,$ivl,lsl#1
2450 fmov $ivd30,$ivl
2451 fmov $ivd31,$ivh
2452
2453 vld1.8 {$dat3},[$inp],#16
2454 // The iv for fifth block
2455 extr $midnumx,$ivh,$ivh,#32
2456 extr $ivh,$ivh,$ivl,#63
2457 and $tmpmw,$constnum,$midnum,asr#31
2458 eor $ivl,$tmpmx,$ivl,lsl#1
2459 fmov $ivd40,$ivl
2460 fmov $ivd41,$ivh
2461
2462 vld1.8 {$dat4},[$inp],#16
2463 veor $dat3,$dat3,$iv3 // the fourth block
2464 veor $dat4,$dat4,$iv4
2465 sub $len,$len,#32 // bias
2466 mov $rounds,$rounds0
2467 b .Loop5x_xts_enc
2468
2469 .align 4
2470 .Loop5x_xts_enc:
2471 aese $dat0,q8
2472 aesmc $dat0,$dat0
2473 aese $dat1,q8
2474 aesmc $dat1,$dat1
2475 aese $dat2,q8
2476 aesmc $dat2,$dat2
2477 aese $dat3,q8
2478 aesmc $dat3,$dat3
2479 aese $dat4,q8
2480 aesmc $dat4,$dat4
2481 vld1.32 {q8},[$key_],#16
2482 subs $rounds,$rounds,#2
2483 aese $dat0,q9
2484 aesmc $dat0,$dat0
2485 aese $dat1,q9
2486 aesmc $dat1,$dat1
2487 aese $dat2,q9
2488 aesmc $dat2,$dat2
2489 aese $dat3,q9
2490 aesmc $dat3,$dat3
2491 aese $dat4,q9
2492 aesmc $dat4,$dat4
2493 vld1.32 {q9},[$key_],#16
2494 b.gt .Loop5x_xts_enc
2495
2496 aese $dat0,q8
2497 aesmc $dat0,$dat0
2498 aese $dat1,q8
2499 aesmc $dat1,$dat1
2500 aese $dat2,q8
2501 aesmc $dat2,$dat2
2502 aese $dat3,q8
2503 aesmc $dat3,$dat3
2504 aese $dat4,q8
2505 aesmc $dat4,$dat4
2506 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2507
2508 aese $dat0,q9
2509 aesmc $dat0,$dat0
2510 aese $dat1,q9
2511 aesmc $dat1,$dat1
2512 aese $dat2,q9
2513 aesmc $dat2,$dat2
2514 aese $dat3,q9
2515 aesmc $dat3,$dat3
2516 aese $dat4,q9
2517 aesmc $dat4,$dat4
2518 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2519 mov $key_,$key1
2520
2521 aese $dat0,q10
2522 aesmc $dat0,$dat0
2523 aese $dat1,q10
2524 aesmc $dat1,$dat1
2525 aese $dat2,q10
2526 aesmc $dat2,$dat2
2527 aese $dat3,q10
2528 aesmc $dat3,$dat3
2529 aese $dat4,q10
2530 aesmc $dat4,$dat4
2531 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2532 // at exit from the loop v1.16b-v26.16b
2533 // are loaded with last "words"
2534 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2535
2536 aese $dat0,q11
2537 aesmc $dat0,$dat0
2538 aese $dat1,q11
2539 aesmc $dat1,$dat1
2540 aese $dat2,q11
2541 aesmc $dat2,$dat2
2542 aese $dat3,q11
2543 aesmc $dat3,$dat3
2544 aese $dat4,q11
2545 aesmc $dat4,$dat4
2546
2547 aese $dat0,q12
2548 aesmc $dat0,$dat0
2549 aese $dat1,q12
2550 aesmc $dat1,$dat1
2551 aese $dat2,q12
2552 aesmc $dat2,$dat2
2553 aese $dat3,q12
2554 aesmc $dat3,$dat3
2555 aese $dat4,q12
2556 aesmc $dat4,$dat4
2557
2558 aese $dat0,q13
2559 aesmc $dat0,$dat0
2560 aese $dat1,q13
2561 aesmc $dat1,$dat1
2562 aese $dat2,q13
2563 aesmc $dat2,$dat2
2564 aese $dat3,q13
2565 aesmc $dat3,$dat3
2566 aese $dat4,q13
2567 aesmc $dat4,$dat4
2568
2569 aese $dat0,q14
2570 aesmc $dat0,$dat0
2571 aese $dat1,q14
2572 aesmc $dat1,$dat1
2573 aese $dat2,q14
2574 aesmc $dat2,$dat2
2575 aese $dat3,q14
2576 aesmc $dat3,$dat3
2577 aese $dat4,q14
2578 aesmc $dat4,$dat4
2579
2580 veor $tmp0,$rndlast,$iv0
2581 aese $dat0,q15
2582 // The iv for first block of one iteration
2583 extr $midnumx,$ivh,$ivh,#32
2584 extr $ivh,$ivh,$ivl,#63
2585 and $tmpmw,$constnum,$midnum,asr#31
2586 eor $ivl,$tmpmx,$ivl,lsl#1
2587 fmov $ivd00,$ivl
2588 fmov $ivd01,$ivh
2589 veor $tmp1,$rndlast,$iv1
2590 vld1.8 {$in0},[$inp],#16
2591 aese $dat1,q15
2592 // The iv for second block
2593 extr $midnumx,$ivh,$ivh,#32
2594 extr $ivh,$ivh,$ivl,#63
2595 and $tmpmw,$constnum,$midnum,asr#31
2596 eor $ivl,$tmpmx,$ivl,lsl#1
2597 fmov $ivd10,$ivl
2598 fmov $ivd11,$ivh
2599 veor $tmp2,$rndlast,$iv2
2600 vld1.8 {$in1},[$inp],#16
2601 aese $dat2,q15
2602 // The iv for third block
2603 extr $midnumx,$ivh,$ivh,#32
2604 extr $ivh,$ivh,$ivl,#63
2605 and $tmpmw,$constnum,$midnum,asr#31
2606 eor $ivl,$tmpmx,$ivl,lsl#1
2607 fmov $ivd20,$ivl
2608 fmov $ivd21,$ivh
2609 veor $tmp3,$rndlast,$iv3
2610 vld1.8 {$in2},[$inp],#16
2611 aese $dat3,q15
2612 // The iv for fourth block
2613 extr $midnumx,$ivh,$ivh,#32
2614 extr $ivh,$ivh,$ivl,#63
2615 and $tmpmw,$constnum,$midnum,asr#31
2616 eor $ivl,$tmpmx,$ivl,lsl#1
2617 fmov $ivd30,$ivl
2618 fmov $ivd31,$ivh
2619 veor $tmp4,$rndlast,$iv4
2620 vld1.8 {$in3},[$inp],#16
2621 aese $dat4,q15
2622
2623 // The iv for fifth block
2624 extr $midnumx,$ivh,$ivh,#32
2625 extr $ivh,$ivh,$ivl,#63
2626 and $tmpmw,$constnum,$midnum,asr #31
2627 eor $ivl,$tmpmx,$ivl,lsl #1
2628 fmov $ivd40,$ivl
2629 fmov $ivd41,$ivh
2630
2631 vld1.8 {$in4},[$inp],#16
2632 cbz $xoffset,.Lxts_enc_tail4x
2633 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2634 veor $tmp0,$tmp0,$dat0
2635 veor $dat0,$in0,$iv0
2636 veor $tmp1,$tmp1,$dat1
2637 veor $dat1,$in1,$iv1
2638 veor $tmp2,$tmp2,$dat2
2639 veor $dat2,$in2,$iv2
2640 veor $tmp3,$tmp3,$dat3
2641 veor $dat3,$in3,$iv3
2642 veor $tmp4,$tmp4,$dat4
2643 vst1.8 {$tmp0},[$out],#16
2644 veor $dat4,$in4,$iv4
2645 vst1.8 {$tmp1},[$out],#16
2646 mov $rounds,$rounds0
2647 vst1.8 {$tmp2},[$out],#16
2648 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2649 vst1.8 {$tmp3},[$out],#16
2650 vst1.8 {$tmp4},[$out],#16
2651 b.hs .Loop5x_xts_enc
2652
2653
2654 // If left 4 blocks, borrow the five block's processing.
2655 cmn $len,#0x10
2656 b.ne .Loop5x_enc_after
2657 vorr $iv4,$iv3,$iv3
2658 vorr $iv3,$iv2,$iv2
2659 vorr $iv2,$iv1,$iv1
2660 vorr $iv1,$iv0,$iv0
2661 fmov $ivl,$ivd40
2662 fmov $ivh,$ivd41
2663 veor $dat0,$iv0,$in0
2664 veor $dat1,$iv1,$in1
2665 veor $dat2,$in2,$iv2
2666 veor $dat3,$in3,$iv3
2667 veor $dat4,$in4,$iv4
2668 b.eq .Loop5x_xts_enc
2669
2670 .Loop5x_enc_after:
2671 add $len,$len,#0x50
2672 cbz $len,.Lxts_enc_done
2673
2674 add $rounds,$rounds0,#2
2675 subs $len,$len,#0x30
2676 b.lo .Lxts_inner_enc_tail
2677
2678 veor $dat0,$iv0,$in2
2679 veor $dat1,$iv1,$in3
2680 veor $dat2,$in4,$iv2
2681 b .Lxts_outer_enc_tail
2682
2683 .align 4
2684 .Lxts_enc_tail4x:
2685 add $inp,$inp,#16
2686 veor $tmp1,$dat1,$tmp1
2687 vst1.8 {$tmp1},[$out],#16
2688 veor $tmp2,$dat2,$tmp2
2689 vst1.8 {$tmp2},[$out],#16
2690 veor $tmp3,$dat3,$tmp3
2691 veor $tmp4,$dat4,$tmp4
2692 vst1.8 {$tmp3-$tmp4},[$out],#32
2693
2694 b .Lxts_enc_done
2695 .align 4
2696 .Lxts_outer_enc_tail:
2697 aese $dat0,q8
2698 aesmc $dat0,$dat0
2699 aese $dat1,q8
2700 aesmc $dat1,$dat1
2701 aese $dat2,q8
2702 aesmc $dat2,$dat2
2703 vld1.32 {q8},[$key_],#16
2704 subs $rounds,$rounds,#2
2705 aese $dat0,q9
2706 aesmc $dat0,$dat0
2707 aese $dat1,q9
2708 aesmc $dat1,$dat1
2709 aese $dat2,q9
2710 aesmc $dat2,$dat2
2711 vld1.32 {q9},[$key_],#16
2712 b.gt .Lxts_outer_enc_tail
2713
2714 aese $dat0,q8
2715 aesmc $dat0,$dat0
2716 aese $dat1,q8
2717 aesmc $dat1,$dat1
2718 aese $dat2,q8
2719 aesmc $dat2,$dat2
2720 veor $tmp0,$iv0,$rndlast
2721 subs $len,$len,#0x30
2722 // The iv for first block
2723 fmov $ivl,$ivd20
2724 fmov $ivh,$ivd21
2725 //mov $constnum,#0x87
2726 extr $midnumx,$ivh,$ivh,#32
2727 extr $ivh,$ivh,$ivl,#63
2728 and $tmpmw,$constnum,$midnum,asr#31
2729 eor $ivl,$tmpmx,$ivl,lsl#1
2730 fmov $ivd00,$ivl
2731 fmov $ivd01,$ivh
2732 veor $tmp1,$iv1,$rndlast
2733 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2734 aese $dat0,q9
2735 aesmc $dat0,$dat0
2736 aese $dat1,q9
2737 aesmc $dat1,$dat1
2738 aese $dat2,q9
2739 aesmc $dat2,$dat2
2740 veor $tmp2,$iv2,$rndlast
2741
2742 add $xoffset,$xoffset,#0x20
2743 add $inp,$inp,$xoffset
2744 mov $key_,$key1
2745
2746 aese $dat0,q12
2747 aesmc $dat0,$dat0
2748 aese $dat1,q12
2749 aesmc $dat1,$dat1
2750 aese $dat2,q12
2751 aesmc $dat2,$dat2
2752 aese $dat0,q13
2753 aesmc $dat0,$dat0
2754 aese $dat1,q13
2755 aesmc $dat1,$dat1
2756 aese $dat2,q13
2757 aesmc $dat2,$dat2
2758 aese $dat0,q14
2759 aesmc $dat0,$dat0
2760 aese $dat1,q14
2761 aesmc $dat1,$dat1
2762 aese $dat2,q14
2763 aesmc $dat2,$dat2
2764 aese $dat0,q15
2765 aese $dat1,q15
2766 aese $dat2,q15
2767 vld1.8 {$in2},[$inp],#16
2768 add $rounds,$rounds0,#2
2769 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2770 veor $tmp0,$tmp0,$dat0
2771 veor $tmp1,$tmp1,$dat1
2772 veor $dat2,$dat2,$tmp2
2773 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2774 vst1.8 {$tmp0},[$out],#16
2775 vst1.8 {$tmp1},[$out],#16
2776 vst1.8 {$dat2},[$out],#16
2777 cmn $len,#0x30
2778 b.eq .Lxts_enc_done
2779 .Lxts_encxor_one:
2780 vorr $in3,$in1,$in1
2781 vorr $in4,$in2,$in2
2782 nop
2783
2784 .Lxts_inner_enc_tail:
2785 cmn $len,#0x10
2786 veor $dat1,$in3,$iv0
2787 veor $dat2,$in4,$iv1
2788 b.eq .Lxts_enc_tail_loop
2789 veor $dat2,$in4,$iv0
2790 .Lxts_enc_tail_loop:
2791 aese $dat1,q8
2792 aesmc $dat1,$dat1
2793 aese $dat2,q8
2794 aesmc $dat2,$dat2
2795 vld1.32 {q8},[$key_],#16
2796 subs $rounds,$rounds,#2
2797 aese $dat1,q9
2798 aesmc $dat1,$dat1
2799 aese $dat2,q9
2800 aesmc $dat2,$dat2
2801 vld1.32 {q9},[$key_],#16
2802 b.gt .Lxts_enc_tail_loop
2803
2804 aese $dat1,q8
2805 aesmc $dat1,$dat1
2806 aese $dat2,q8
2807 aesmc $dat2,$dat2
2808 aese $dat1,q9
2809 aesmc $dat1,$dat1
2810 aese $dat2,q9
2811 aesmc $dat2,$dat2
2812 aese $dat1,q12
2813 aesmc $dat1,$dat1
2814 aese $dat2,q12
2815 aesmc $dat2,$dat2
2816 cmn $len,#0x20
2817 aese $dat1,q13
2818 aesmc $dat1,$dat1
2819 aese $dat2,q13
2820 aesmc $dat2,$dat2
2821 veor $tmp1,$iv0,$rndlast
2822 aese $dat1,q14
2823 aesmc $dat1,$dat1
2824 aese $dat2,q14
2825 aesmc $dat2,$dat2
2826 veor $tmp2,$iv1,$rndlast
2827 aese $dat1,q15
2828 aese $dat2,q15
2829 b.eq .Lxts_enc_one
2830 veor $tmp1,$tmp1,$dat1
2831 vst1.8 {$tmp1},[$out],#16
2832 veor $tmp2,$tmp2,$dat2
2833 vorr $iv0,$iv1,$iv1
2834 vst1.8 {$tmp2},[$out],#16
2835 fmov $ivl,$ivd10
2836 fmov $ivh,$ivd11
2837 mov $constnum,#0x87
2838 extr $midnumx,$ivh,$ivh,#32
2839 extr $ivh,$ivh,$ivl,#63
2840 and $tmpmw,$constnum,$midnum,asr #31
2841 eor $ivl,$tmpmx,$ivl,lsl #1
2842 fmov $ivd00,$ivl
2843 fmov $ivd01,$ivh
2844 b .Lxts_enc_done
2845
2846 .Lxts_enc_one:
2847 veor $tmp1,$tmp1,$dat2
2848 vorr $iv0,$iv0,$iv0
2849 vst1.8 {$tmp1},[$out],#16
2850 fmov $ivl,$ivd00
2851 fmov $ivh,$ivd01
2852 mov $constnum,#0x87
2853 extr $midnumx,$ivh,$ivh,#32
2854 extr $ivh,$ivh,$ivl,#63
2855 and $tmpmw,$constnum,$midnum,asr #31
2856 eor $ivl,$tmpmx,$ivl,lsl #1
2857 fmov $ivd00,$ivl
2858 fmov $ivd01,$ivh
2859 b .Lxts_enc_done
2860 .align 5
2861 .Lxts_enc_done:
2862 // Process the tail block with cipher stealing.
2863 tst $tailcnt,#0xf
2864 b.eq .Lxts_abort
2865
2866 mov $tmpinp,$inp
2867 mov $tmpoutp,$out
2868 sub $out,$out,#16
2869 .composite_enc_loop:
2870 subs $tailcnt,$tailcnt,#1
2871 ldrb $l2outp,[$out,$tailcnt]
2872 ldrb $loutp,[$tmpinp,$tailcnt]
2873 strb $l2outp,[$tmpoutp,$tailcnt]
2874 strb $loutp,[$out,$tailcnt]
2875 b.gt .composite_enc_loop
2876 .Lxts_enc_load_done:
2877 vld1.8 {$tmpin},[$out]
2878 veor $tmpin,$tmpin,$iv0
2879
2880 // Encrypt the composite block to get the last second encrypted text block
2881 ldr $rounds,[$key1,#240] // load key schedule...
2882 vld1.32 {$dat},[$key1],#16
2883 sub $rounds,$rounds,#2
2884 vld1.32 {$dat1},[$key1],#16 // load key schedule...
2885 .Loop_final_enc:
2886 aese $tmpin,$dat0
2887 aesmc $tmpin,$tmpin
2888 vld1.32 {$dat0},[$key1],#16
2889 subs $rounds,$rounds,#2
2890 aese $tmpin,$dat1
2891 aesmc $tmpin,$tmpin
2892 vld1.32 {$dat1},[$key1],#16
2893 b.gt .Loop_final_enc
2894
2895 aese $tmpin,$dat0
2896 aesmc $tmpin,$tmpin
2897 vld1.32 {$dat0},[$key1]
2898 aese $tmpin,$dat1
2899 veor $tmpin,$tmpin,$dat0
2900 veor $tmpin,$tmpin,$iv0
2901 vst1.8 {$tmpin},[$out]
2902
2903 .Lxts_abort:
2904 ldp $tailcnt,$midnumx,[sp,#48]
2905 ldp $ivd10,$ivd20,[sp,#32]
2906 ldp $ivd30,$ivd40,[sp,#16]
2907 ldp $constnumx,$tmpinp,[sp],#64
2908 .Lxts_enc_final_abort:
2909 ret
2910 .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2911 ___
2912
2913 }}}
2914 {{{
# Register allocation for the XTS decrypt code paths.
# Scalar arguments arrive in the AAPCS argument registers x0-x5:
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Round counters, key-schedule walker, input step, and the 128-bit
# tweak split across two GPRs ($ivl = low half, $ivh = high half):
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Byte-loop scratch registers for the cipher-stealing tail:
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# Tail byte count, tweak-carry scratch, and the XTS polynomial
# constant 0x87 (w/x views of the same physical registers):
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
# Input-pointer adjustment offset and carry-mask scratch:
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON working set: data blocks, temporaries and the final round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# The five per-block tweaks plus the cipher-stealing scratch block.
# NOTE: literal "q8"-style names inside the heredocs are remapped to
# v16+ by the 64-bit post-processing loop, so these v8-v11 tweak
# registers do not collide with the preloaded round keys.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# d-register views of each tweak's low/high 64-bit halves, used to
# shuttle the tweak between GPRs and NEON via fmov:
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

# Aliases used by the single-block (16-byte) fast path:
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7 last round key
# q10-q15, q7 Last 7 round keys
# q8-q9 preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2931
{
# Extra NEON registers for the interleaved multi-block paths.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    # The 64-bit path processes five blocks per iteration, so widen
    # the working set to q16-q23, clear of the q8-q15 round keys.
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2940
# Emit the ${prefix}_xts_decrypt entry point (64-bit flavours only).
$code.=<<___ if ($flavour =~ /64/);
2942 .globl ${prefix}_xts_decrypt
2943 .type ${prefix}_xts_decrypt,%function
2944 .align 5
2945 ${prefix}_xts_decrypt:
2946 AARCH64_VALID_CALL_TARGET
2947 ___
# Fast path for exactly one block ($len == 16): encrypt the IV with
# key2 to form the tweak, then decrypt the single block with key1
# inline, with no stack frame or cipher stealing.
$code.=<<___ if ($flavour =~ /64/);
2949 cmp $len,#16
2950 // Original input data size bigger than 16, jump to big size processing.
2951 b.ne .Lxts_dec_big_size
2952 // Encrypt the iv with key2, as the first XEX iv.
2953 ldr $rounds,[$key2,#240]
2954 vld1.32 {$dat},[$key2],#16
2955 vld1.8 {$iv0},[$ivp]
2956 sub $rounds,$rounds,#2
2957 vld1.32 {$dat1},[$key2],#16
2958
2959 .Loop_dec_small_iv_enc:
2960 aese $iv0,$dat
2961 aesmc $iv0,$iv0
2962 vld1.32 {$dat},[$key2],#16
2963 subs $rounds,$rounds,#2
2964 aese $iv0,$dat1
2965 aesmc $iv0,$iv0
2966 vld1.32 {$dat1},[$key2],#16
2967 b.gt .Loop_dec_small_iv_enc
2968
2969 aese $iv0,$dat
2970 aesmc $iv0,$iv0
2971 vld1.32 {$dat},[$key2]
2972 aese $iv0,$dat1
2973 veor $iv0,$iv0,$dat
2974
2975 vld1.8 {$dat0},[$inp]
2976 veor $dat0,$iv0,$dat0
2977
2978 ldr $rounds,[$key1,#240]
2979 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2980
2981 aesd $dat0,q20
2982 aesimc $dat0,$dat0
2983 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2984 aesd $dat0,q21
2985 aesimc $dat0,$dat0
2986 subs $rounds,$rounds,#10 // bias
2987 b.eq .Lxts_128_dec
2988 .Lxts_dec_round_loop:
2989 aesd $dat0,q8
2990 aesimc $dat0,$dat0
2991 vld1.32 {q8},[$key1],#16 // load key schedule...
2992 aesd $dat0,q9
2993 aesimc $dat0,$dat0
2994 vld1.32 {q9},[$key1],#16 // load key schedule...
2995 subs $rounds,$rounds,#2 // bias
2996 b.gt .Lxts_dec_round_loop
2997 .Lxts_128_dec:
2998 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2999 aesd $dat0,q8
3000 aesimc $dat0,$dat0
3001 aesd $dat0,q9
3002 aesimc $dat0,$dat0
3003 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
3004 aesd $dat0,q10
3005 aesimc $dat0,$dat0
3006 aesd $dat0,q11
3007 aesimc $dat0,$dat0
3008 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
3009 aesd $dat0,q12
3010 aesimc $dat0,$dat0
3011 aesd $dat0,q13
3012 aesimc $dat0,$dat0
3013 vld1.32 {$rndlast},[$key1]
3014 aesd $dat0,q14
3015 aesimc $dat0,$dat0
3016 aesd $dat0,q15
3017 veor $dat0,$dat0,$rndlast
3018 veor $dat0,$iv0,$dat0
3019 vst1.8 {$dat0},[$out]
3020 b .Lxts_dec_final_abort
3021 .Lxts_dec_big_size:
3022 ___
# Bulk path: save callee-saved state on the stack, derive the chain
# of per-block tweaks, and run the interleaved 5x/3x/2x decrypt
# loops, finishing with cipher stealing for any ragged tail.
$code.=<<___ if ($flavour =~ /64/);
3024 stp $constnumx,$tmpinp,[sp,#-64]!
3025 stp $tailcnt,$midnumx,[sp,#48]
3026 stp $ivd10,$ivd20,[sp,#32]
3027 stp $ivd30,$ivd40,[sp,#16]
3028
3029 and $tailcnt,$len,#0xf
3030 and $len,$len,#-16
3031 subs $len,$len,#16
3032 mov $step,#16
3033 b.lo .Lxts_dec_abort
3034
3035 // Encrypt the iv with key2, as the first XEX iv
3036 ldr $rounds,[$key2,#240]
3037 vld1.32 {$dat},[$key2],#16
3038 vld1.8 {$iv0},[$ivp]
3039 sub $rounds,$rounds,#2
3040 vld1.32 {$dat1},[$key2],#16
3041
3042 .Loop_dec_iv_enc:
3043 aese $iv0,$dat
3044 aesmc $iv0,$iv0
3045 vld1.32 {$dat},[$key2],#16
3046 subs $rounds,$rounds,#2
3047 aese $iv0,$dat1
3048 aesmc $iv0,$iv0
3049 vld1.32 {$dat1},[$key2],#16
3050 b.gt .Loop_dec_iv_enc
3051
3052 aese $iv0,$dat
3053 aesmc $iv0,$iv0
3054 vld1.32 {$dat},[$key2]
3055 aese $iv0,$dat1
3056 veor $iv0,$iv0,$dat
3057
3058 // The iv for second block
3059 // $ivl- iv(low), $ivh - iv(high)
3060 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3061 fmov $ivl,$ivd00
3062 fmov $ivh,$ivd01
3063 mov $constnum,#0x87
3064 extr $midnumx,$ivh,$ivh,#32
3065 extr $ivh,$ivh,$ivl,#63
3066 and $tmpmw,$constnum,$midnum,asr #31
3067 eor $ivl,$tmpmx,$ivl,lsl #1
3068 fmov $ivd10,$ivl
3069 fmov $ivd11,$ivh
3070
3071 ldr $rounds0,[$key1,#240] // load rounds number
3072
3073 // The iv for third block
3074 extr $midnumx,$ivh,$ivh,#32
3075 extr $ivh,$ivh,$ivl,#63
3076 and $tmpmw,$constnum,$midnum,asr #31
3077 eor $ivl,$tmpmx,$ivl,lsl #1
3078 fmov $ivd20,$ivl
3079 fmov $ivd21,$ivh
3080
3081 vld1.32 {q8-q9},[$key1] // load key schedule...
3082 sub $rounds0,$rounds0,#6
3083 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3084 sub $rounds0,$rounds0,#2
3085 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3086 vld1.32 {q12-q13},[$key_],#32
3087 vld1.32 {q14-q15},[$key_],#32
3088 vld1.32 {$rndlast},[$key_]
3089
3090 // The iv for fourth block
3091 extr $midnumx,$ivh,$ivh,#32
3092 extr $ivh,$ivh,$ivl,#63
3093 and $tmpmw,$constnum,$midnum,asr #31
3094 eor $ivl,$tmpmx,$ivl,lsl #1
3095 fmov $ivd30,$ivl
3096 fmov $ivd31,$ivh
3097
3098 add $key_,$key1,#32
3099 mov $rounds,$rounds0
3100 b .Lxts_dec
3101
3102 // Decryption
3103 .align 5
3104 .Lxts_dec:
3105 tst $tailcnt,#0xf
3106 b.eq .Lxts_dec_begin
3107 subs $len,$len,#16
3108 csel $step,xzr,$step,eq
3109 vld1.8 {$dat},[$inp],#16
3110 b.lo .Lxts_done
3111 sub $inp,$inp,#16
3112 .Lxts_dec_begin:
3113 vld1.8 {$dat},[$inp],$step
3114 subs $len,$len,#32 // bias
3115 add $rounds,$rounds0,#2
3116 vorr $in1,$dat,$dat
3117 vorr $dat1,$dat,$dat
3118 vorr $in3,$dat,$dat
3119 vld1.8 {$dat2},[$inp],#16
3120 vorr $in2,$dat2,$dat2
3121 vorr $in4,$dat2,$dat2
3122 b.lo .Lxts_inner_dec_tail
3123 veor $dat,$dat,$iv0 // before decryt, xor with iv
3124 veor $dat2,$dat2,$iv1
3125
3126 vorr $dat1,$dat2,$dat2
3127 vld1.8 {$dat2},[$inp],#16
3128 vorr $in0,$dat,$dat
3129 vorr $in1,$dat1,$dat1
3130 veor $in2,$dat2,$iv2 // third block xox with third iv
3131 veor $dat2,$dat2,$iv2
3132 cmp $len,#32
3133 b.lo .Lxts_outer_dec_tail
3134
3135 vld1.8 {$dat3},[$inp],#16
3136
3137 // The iv for fifth block
3138 extr $midnumx,$ivh,$ivh,#32
3139 extr $ivh,$ivh,$ivl,#63
3140 and $tmpmw,$constnum,$midnum,asr #31
3141 eor $ivl,$tmpmx,$ivl,lsl #1
3142 fmov $ivd40,$ivl
3143 fmov $ivd41,$ivh
3144
3145 vld1.8 {$dat4},[$inp],#16
3146 veor $dat3,$dat3,$iv3 // the fourth block
3147 veor $dat4,$dat4,$iv4
3148 sub $len,$len,#32 // bias
3149 mov $rounds,$rounds0
3150 b .Loop5x_xts_dec
3151
3152 .align 4
3153 .Loop5x_xts_dec:
3154 aesd $dat0,q8
3155 aesimc $dat0,$dat0
3156 aesd $dat1,q8
3157 aesimc $dat1,$dat1
3158 aesd $dat2,q8
3159 aesimc $dat2,$dat2
3160 aesd $dat3,q8
3161 aesimc $dat3,$dat3
3162 aesd $dat4,q8
3163 aesimc $dat4,$dat4
3164 vld1.32 {q8},[$key_],#16 // load key schedule...
3165 subs $rounds,$rounds,#2
3166 aesd $dat0,q9
3167 aesimc $dat0,$dat0
3168 aesd $dat1,q9
3169 aesimc $dat1,$dat1
3170 aesd $dat2,q9
3171 aesimc $dat2,$dat2
3172 aesd $dat3,q9
3173 aesimc $dat3,$dat3
3174 aesd $dat4,q9
3175 aesimc $dat4,$dat4
3176 vld1.32 {q9},[$key_],#16 // load key schedule...
3177 b.gt .Loop5x_xts_dec
3178
3179 aesd $dat0,q8
3180 aesimc $dat0,$dat0
3181 aesd $dat1,q8
3182 aesimc $dat1,$dat1
3183 aesd $dat2,q8
3184 aesimc $dat2,$dat2
3185 aesd $dat3,q8
3186 aesimc $dat3,$dat3
3187 aesd $dat4,q8
3188 aesimc $dat4,$dat4
3189 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3190
3191 aesd $dat0,q9
3192 aesimc $dat0,$dat
3193 aesd $dat1,q9
3194 aesimc $dat1,$dat1
3195 aesd $dat2,q9
3196 aesimc $dat2,$dat2
3197 aesd $dat3,q9
3198 aesimc $dat3,$dat3
3199 aesd $dat4,q9
3200 aesimc $dat4,$dat4
3201 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3202 mov $key_,$key1
3203
3204 aesd $dat0,q10
3205 aesimc $dat0,$dat0
3206 aesd $dat1,q10
3207 aesimc $dat1,$dat1
3208 aesd $dat2,q10
3209 aesimc $dat2,$dat2
3210 aesd $dat3,q10
3211 aesimc $dat3,$dat3
3212 aesd $dat4,q10
3213 aesimc $dat4,$dat4
3214 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3215 // at exit from the loop v1.16b-v26.16b
3216 // are loaded with last "words"
3217 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3218
3219 aesd $dat0,q11
3220 aesimc $dat0,$dat0
3221 aesd $dat1,q11
3222 aesimc $dat1,$dat1
3223 aesd $dat2,q11
3224 aesimc $dat2,$dat2
3225 aesd $dat3,q11
3226 aesimc $dat3,$dat3
3227 aesd $dat4,q11
3228 aesimc $dat4,$dat4
3229
3230 aesd $dat0,q12
3231 aesimc $dat0,$dat0
3232 aesd $dat1,q12
3233 aesimc $dat1,$dat1
3234 aesd $dat2,q12
3235 aesimc $dat2,$dat2
3236 aesd $dat3,q12
3237 aesimc $dat3,$dat3
3238 aesd $dat4,q12
3239 aesimc $dat4,$dat4
3240
3241 aesd $dat0,q13
3242 aesimc $dat0,$dat0
3243 aesd $dat1,q13
3244 aesimc $dat1,$dat1
3245 aesd $dat2,q13
3246 aesimc $dat2,$dat2
3247 aesd $dat3,q13
3248 aesimc $dat3,$dat3
3249 aesd $dat4,q13
3250 aesimc $dat4,$dat4
3251
3252 aesd $dat0,q14
3253 aesimc $dat0,$dat0
3254 aesd $dat1,q14
3255 aesimc $dat1,$dat1
3256 aesd $dat2,q14
3257 aesimc $dat2,$dat2
3258 aesd $dat3,q14
3259 aesimc $dat3,$dat3
3260 aesd $dat4,q14
3261 aesimc $dat4,$dat4
3262
3263 veor $tmp0,$rndlast,$iv0
3264 aesd $dat0,q15
3265 // The iv for first block of next iteration.
3266 extr $midnumx,$ivh,$ivh,#32
3267 extr $ivh,$ivh,$ivl,#63
3268 and $tmpmw,$constnum,$midnum,asr #31
3269 eor $ivl,$tmpmx,$ivl,lsl #1
3270 fmov $ivd00,$ivl
3271 fmov $ivd01,$ivh
3272 veor $tmp1,$rndlast,$iv1
3273 vld1.8 {$in0},[$inp],#16
3274 aesd $dat1,q15
3275 // The iv for second block
3276 extr $midnumx,$ivh,$ivh,#32
3277 extr $ivh,$ivh,$ivl,#63
3278 and $tmpmw,$constnum,$midnum,asr #31
3279 eor $ivl,$tmpmx,$ivl,lsl #1
3280 fmov $ivd10,$ivl
3281 fmov $ivd11,$ivh
3282 veor $tmp2,$rndlast,$iv2
3283 vld1.8 {$in1},[$inp],#16
3284 aesd $dat2,q15
3285 // The iv for third block
3286 extr $midnumx,$ivh,$ivh,#32
3287 extr $ivh,$ivh,$ivl,#63
3288 and $tmpmw,$constnum,$midnum,asr #31
3289 eor $ivl,$tmpmx,$ivl,lsl #1
3290 fmov $ivd20,$ivl
3291 fmov $ivd21,$ivh
3292 veor $tmp3,$rndlast,$iv3
3293 vld1.8 {$in2},[$inp],#16
3294 aesd $dat3,q15
3295 // The iv for fourth block
3296 extr $midnumx,$ivh,$ivh,#32
3297 extr $ivh,$ivh,$ivl,#63
3298 and $tmpmw,$constnum,$midnum,asr #31
3299 eor $ivl,$tmpmx,$ivl,lsl #1
3300 fmov $ivd30,$ivl
3301 fmov $ivd31,$ivh
3302 veor $tmp4,$rndlast,$iv4
3303 vld1.8 {$in3},[$inp],#16
3304 aesd $dat4,q15
3305
3306 // The iv for fifth block
3307 extr $midnumx,$ivh,$ivh,#32
3308 extr $ivh,$ivh,$ivl,#63
3309 and $tmpmw,$constnum,$midnum,asr #31
3310 eor $ivl,$tmpmx,$ivl,lsl #1
3311 fmov $ivd40,$ivl
3312 fmov $ivd41,$ivh
3313
3314 vld1.8 {$in4},[$inp],#16
3315 cbz $xoffset,.Lxts_dec_tail4x
3316 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3317 veor $tmp0,$tmp0,$dat0
3318 veor $dat0,$in0,$iv0
3319 veor $tmp1,$tmp1,$dat1
3320 veor $dat1,$in1,$iv1
3321 veor $tmp2,$tmp2,$dat2
3322 veor $dat2,$in2,$iv2
3323 veor $tmp3,$tmp3,$dat3
3324 veor $dat3,$in3,$iv3
3325 veor $tmp4,$tmp4,$dat4
3326 vst1.8 {$tmp0},[$out],#16
3327 veor $dat4,$in4,$iv4
3328 vst1.8 {$tmp1},[$out],#16
3329 mov $rounds,$rounds0
3330 vst1.8 {$tmp2},[$out],#16
3331 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3332 vst1.8 {$tmp3},[$out],#16
3333 vst1.8 {$tmp4},[$out],#16
3334 b.hs .Loop5x_xts_dec
3335
3336 cmn $len,#0x10
3337 b.ne .Loop5x_dec_after
3338 // If x2($len) equal to -0x10, the left blocks is 4.
3339 // After specially processing, utilize the five blocks processing again.
3340 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3341 vorr $iv4,$iv3,$iv3
3342 vorr $iv3,$iv2,$iv2
3343 vorr $iv2,$iv1,$iv1
3344 vorr $iv1,$iv0,$iv0
3345 fmov $ivl,$ivd40
3346 fmov $ivh,$ivd41
3347 veor $dat0,$iv0,$in0
3348 veor $dat1,$iv1,$in1
3349 veor $dat2,$in2,$iv2
3350 veor $dat3,$in3,$iv3
3351 veor $dat4,$in4,$iv4
3352 b.eq .Loop5x_xts_dec
3353
3354 .Loop5x_dec_after:
3355 add $len,$len,#0x50
3356 cbz $len,.Lxts_done
3357
3358 add $rounds,$rounds0,#2
3359 subs $len,$len,#0x30
3360 b.lo .Lxts_inner_dec_tail
3361
3362 veor $dat0,$iv0,$in2
3363 veor $dat1,$iv1,$in3
3364 veor $dat2,$in4,$iv2
3365 b .Lxts_outer_dec_tail
3366
3367 .align 4
3368 .Lxts_dec_tail4x:
3369 add $inp,$inp,#16
3370 tst $tailcnt,#0xf
3371 veor $tmp1,$dat1,$tmp0
3372 vst1.8 {$tmp1},[$out],#16
3373 veor $tmp2,$dat2,$tmp2
3374 vst1.8 {$tmp2},[$out],#16
3375 veor $tmp3,$dat3,$tmp3
3376 veor $tmp4,$dat4,$tmp4
3377 vst1.8 {$tmp3-$tmp4},[$out],#32
3378
3379 b.eq .Lxts_dec_abort
3380 vld1.8 {$dat0},[$inp],#16
3381 b .Lxts_done
3382 .align 4
3383 .Lxts_outer_dec_tail:
3384 aesd $dat0,q8
3385 aesimc $dat0,$dat0
3386 aesd $dat1,q8
3387 aesimc $dat1,$dat1
3388 aesd $dat2,q8
3389 aesimc $dat2,$dat2
3390 vld1.32 {q8},[$key_],#16
3391 subs $rounds,$rounds,#2
3392 aesd $dat0,q9
3393 aesimc $dat0,$dat0
3394 aesd $dat1,q9
3395 aesimc $dat1,$dat1
3396 aesd $dat2,q9
3397 aesimc $dat2,$dat2
3398 vld1.32 {q9},[$key_],#16
3399 b.gt .Lxts_outer_dec_tail
3400
3401 aesd $dat0,q8
3402 aesimc $dat0,$dat0
3403 aesd $dat1,q8
3404 aesimc $dat1,$dat1
3405 aesd $dat2,q8
3406 aesimc $dat2,$dat2
3407 veor $tmp0,$iv0,$rndlast
3408 subs $len,$len,#0x30
3409 // The iv for first block
3410 fmov $ivl,$ivd20
3411 fmov $ivh,$ivd21
3412 mov $constnum,#0x87
3413 extr $midnumx,$ivh,$ivh,#32
3414 extr $ivh,$ivh,$ivl,#63
3415 and $tmpmw,$constnum,$midnum,asr #31
3416 eor $ivl,$tmpmx,$ivl,lsl #1
3417 fmov $ivd00,$ivl
3418 fmov $ivd01,$ivh
3419 veor $tmp1,$iv1,$rndlast
3420 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3421 aesd $dat0,q9
3422 aesimc $dat0,$dat0
3423 aesd $dat1,q9
3424 aesimc $dat1,$dat1
3425 aesd $dat2,q9
3426 aesimc $dat2,$dat2
3427 veor $tmp2,$iv2,$rndlast
3428 // The iv for second block
3429 extr $midnumx,$ivh,$ivh,#32
3430 extr $ivh,$ivh,$ivl,#63
3431 and $tmpmw,$constnum,$midnum,asr #31
3432 eor $ivl,$tmpmx,$ivl,lsl #1
3433 fmov $ivd10,$ivl
3434 fmov $ivd11,$ivh
3435
3436 add $xoffset,$xoffset,#0x20
3437 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3438
3439 mov $key_,$key1
3440
3441 // The iv for third block
3442 extr $midnumx,$ivh,$ivh,#32
3443 extr $ivh,$ivh,$ivl,#63
3444 and $tmpmw,$constnum,$midnum,asr #31
3445 eor $ivl,$tmpmx,$ivl,lsl #1
3446 fmov $ivd20,$ivl
3447 fmov $ivd21,$ivh
3448
3449 aesd $dat0,q12
3450 aesimc $dat0,$dat0
3451 aesd $dat1,q12
3452 aesimc $dat1,$dat1
3453 aesd $dat2,q12
3454 aesimc $dat2,$dat2
3455 aesd $dat0,q13
3456 aesimc $dat0,$dat0
3457 aesd $dat1,q13
3458 aesimc $dat1,$dat1
3459 aesd $dat2,q13
3460 aesimc $dat2,$dat2
3461 aesd $dat0,q14
3462 aesimc $dat0,$dat0
3463 aesd $dat1,q14
3464 aesimc $dat1,$dat1
3465 aesd $dat2,q14
3466 aesimc $dat2,$dat2
3467 vld1.8 {$in2},[$inp],#16
3468 aesd $dat0,q15
3469 aesd $dat1,q15
3470 aesd $dat2,q15
3471 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3472 add $rounds,$rounds0,#2
3473 veor $tmp0,$tmp0,$dat0
3474 veor $tmp1,$tmp1,$dat1
3475 veor $dat2,$dat2,$tmp2
3476 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3477 vst1.8 {$tmp0},[$out],#16
3478 vst1.8 {$tmp1},[$out],#16
3479 vst1.8 {$dat2},[$out],#16
3480
3481 cmn $len,#0x30
3482 add $len,$len,#0x30
3483 b.eq .Lxts_done
3484 sub $len,$len,#0x30
3485 vorr $in3,$in1,$in1
3486 vorr $in4,$in2,$in2
3487 nop
3488
3489 .Lxts_inner_dec_tail:
3490 // $len == -0x10 means two blocks left.
3491 cmn $len,#0x10
3492 veor $dat1,$in3,$iv0
3493 veor $dat2,$in4,$iv1
3494 b.eq .Lxts_dec_tail_loop
3495 veor $dat2,$in4,$iv0
3496 .Lxts_dec_tail_loop:
3497 aesd $dat1,q8
3498 aesimc $dat1,$dat1
3499 aesd $dat2,q8
3500 aesimc $dat2,$dat2
3501 vld1.32 {q8},[$key_],#16
3502 subs $rounds,$rounds,#2
3503 aesd $dat1,q9
3504 aesimc $dat1,$dat1
3505 aesd $dat2,q9
3506 aesimc $dat2,$dat2
3507 vld1.32 {q9},[$key_],#16
3508 b.gt .Lxts_dec_tail_loop
3509
3510 aesd $dat1,q8
3511 aesimc $dat1,$dat1
3512 aesd $dat2,q8
3513 aesimc $dat2,$dat2
3514 aesd $dat1,q9
3515 aesimc $dat1,$dat1
3516 aesd $dat2,q9
3517 aesimc $dat2,$dat2
3518 aesd $dat1,q12
3519 aesimc $dat1,$dat1
3520 aesd $dat2,q12
3521 aesimc $dat2,$dat2
3522 cmn $len,#0x20
3523 aesd $dat1,q13
3524 aesimc $dat1,$dat1
3525 aesd $dat2,q13
3526 aesimc $dat2,$dat2
3527 veor $tmp1,$iv0,$rndlast
3528 aesd $dat1,q14
3529 aesimc $dat1,$dat1
3530 aesd $dat2,q14
3531 aesimc $dat2,$dat2
3532 veor $tmp2,$iv1,$rndlast
3533 aesd $dat1,q15
3534 aesd $dat2,q15
3535 b.eq .Lxts_dec_one
3536 veor $tmp1,$tmp1,$dat1
3537 veor $tmp2,$tmp2,$dat2
3538 vorr $iv0,$iv2,$iv2
3539 vorr $iv1,$iv3,$iv3
3540 vst1.8 {$tmp1},[$out],#16
3541 vst1.8 {$tmp2},[$out],#16
3542 add $len,$len,#16
3543 b .Lxts_done
3544
3545 .Lxts_dec_one:
3546 veor $tmp1,$tmp1,$dat2
3547 vorr $iv0,$iv1,$iv1
3548 vorr $iv1,$iv2,$iv2
3549 vst1.8 {$tmp1},[$out],#16
3550 add $len,$len,#32
3551
3552 .Lxts_done:
3553 tst $tailcnt,#0xf
3554 b.eq .Lxts_dec_abort
3555 // Processing the last two blocks with cipher stealing.
3556 mov x7,x3
3557 cbnz x2,.Lxts_dec_1st_done
3558 vld1.8 {$dat0},[$inp],#16
3559
3560 // Decrypt the last second block to get the last plain text block
3561 .Lxts_dec_1st_done:
3562 eor $tmpin,$dat0,$iv1
3563 ldr $rounds,[$key1,#240]
3564 vld1.32 {$dat0},[$key1],#16
3565 sub $rounds,$rounds,#2
3566 vld1.32 {$dat1},[$key1],#16
3567 .Loop_final_2nd_dec:
3568 aesd $tmpin,$dat0
3569 aesimc $tmpin,$tmpin
3570 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3571 subs $rounds,$rounds,#2
3572 aesd $tmpin,$dat1
3573 aesimc $tmpin,$tmpin
3574 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3575 b.gt .Loop_final_2nd_dec
3576
3577 aesd $tmpin,$dat0
3578 aesimc $tmpin,$tmpin
3579 vld1.32 {$dat0},[$key1]
3580 aesd $tmpin,$dat1
3581 veor $tmpin,$tmpin,$dat0
3582 veor $tmpin,$tmpin,$iv1
3583 vst1.8 {$tmpin},[$out]
3584
3585 mov $tmpinp,$inp
3586 add $tmpoutp,$out,#16
3587
3588 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3589 // to get the last encrypted block.
3590 .composite_dec_loop:
3591 subs $tailcnt,$tailcnt,#1
3592 ldrb $l2outp,[$out,$tailcnt]
3593 ldrb $loutp,[$tmpinp,$tailcnt]
3594 strb $l2outp,[$tmpoutp,$tailcnt]
3595 strb $loutp,[$out,$tailcnt]
3596 b.gt .composite_dec_loop
3597 .Lxts_dec_load_done:
3598 vld1.8 {$tmpin},[$out]
3599 veor $tmpin,$tmpin,$iv0
3600
3601 // Decrypt the composite block to get the last second plain text block
3602 ldr $rounds,[$key_,#240]
3603 vld1.32 {$dat},[$key_],#16
3604 sub $rounds,$rounds,#2
3605 vld1.32 {$dat1},[$key_],#16
3606 .Loop_final_dec:
3607 aesd $tmpin,$dat0
3608 aesimc $tmpin,$tmpin
3609 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3610 subs $rounds,$rounds,#2
3611 aesd $tmpin,$dat1
3612 aesimc $tmpin,$tmpin
3613 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3614 b.gt .Loop_final_dec
3615
3616 aesd $tmpin,$dat0
3617 aesimc $tmpin,$tmpin
3618 vld1.32 {$dat0},[$key_]
3619 aesd $tmpin,$dat1
3620 veor $tmpin,$tmpin,$dat0
3621 veor $tmpin,$tmpin,$iv0
3622 vst1.8 {$tmpin},[$out]
3623
3624 .Lxts_dec_abort:
3625 ldp $tailcnt,$midnumx,[sp,#48]
3626 ldp $ivd10,$ivd20,[sp,#32]
3627 ldp $ivd30,$ivd40,[sp,#16]
3628 ldp $constnumx,$tmpinp,[sp],#64
3629
3630 .Lxts_dec_final_abort:
3631 ret
3632 .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3633 ___
3634 }
3635 }}}
# Terminate the preprocessor conditional in the generated assembly.
$code.=<<___;
#endif
___
3639 ########################################
3640 if ($flavour =~ /64/) { ######## 64-bit code
# Base AArch64 encodings of the Crypto-extension AES instructions;
# unaes() below ORs the register fields into these words when
# hand-assembling for toolchains without AES support.
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3644
local *unaes = sub {
    my ($mnemonic, $arg) = @_;

    # Hand-assemble an AES instruction as a raw .inst word for
    # assemblers lacking the Crypto extension: Rd occupies bits 0-4
    # and Rn bits 5-9 of the base opcode from %opcode.
    if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
        my ($rd, $rn) = ($1, $2);
        return sprintf ".inst\t0x%08x\t//%s %s",
                       $opcode{$mnemonic} | $rd | ($rn << 5),
                       $mnemonic, $arg;
    }
    return "";
};
3653
# Post-process $code into AArch64 assembly, one line at a time.
# The substitutions are order-dependent: expression expansion first,
# then register renaming, then mnemonic and suffix fixups.
foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/geo;    # expand `...` compile-time expressions

    # q0-q7 become v0-v7, while q8 and above are shifted up to v16+
    # (NOTE(review): presumably to keep clear of v8-v15, whose low
    # halves are callee-saved in AAPCS64 — confirm).
    s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
    s/@\s/\/\//o;   # old->new style commentary

    # Mnemonic fixups; the "or" chain stops at the first rewrite
    # that matches.  The unaes() path is disabled here because the
    # 64-bit output uses the native AES mnemonics directly.
    #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
    s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
    s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
    s/vmov\.i8/movi/o or    # fix up legacy mnemonics
    s/vext\.8/ext/o or
    s/vrev32\.8/rev32/o or
    s/vtst\.8/cmtst/o or
    s/vshr/ushr/o or
    s/^(\s+)v/$1/o or   # strip off v prefix
    s/\bbx\s+lr\b/ret/o;

    # fix up remaining legacy suffixes
    s/\.[ui]?8//o;
    m/\],#8/o and s/\.16b/\.8b/go;  # 8-byte loads/stores use .8b lanes
    s/\.[ui]?32//o and s/\.16b/\.4s/go;
    s/\.[ui]?64//o and s/\.16b/\.2d/go;
    s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

    # Switch preprocessor checks to aarch64 versions.
    s/__ARME([BL])__/__AARCH64E$1__/go;

    print $_,"\n";
}
3683 } else { ######## 32-bit code
# Base ARMv7 NEON encodings of the AES instructions; unaes() below
# ORs the split register fields into these words and emits them as
# raw bytes for assemblers without Crypto-extension support.
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3687
local *unaes = sub {
    my ($mnemonic, $arg) = @_;

    if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
        my ($d, $m) = ($1, $2);
        # The T32 NEON encoding splits each register number into a
        # 3-bit field plus a separate high bit.
        my $word = $opcode{$mnemonic}
                 | (($d & 7) << 13) | (($d & 8) << 19)
                 | (($m & 7) << 1)  | (($m & 8) << 2);
        # since ARMv7 instructions are always encoded little-endian.
        # correct solution is to use .inst directive, but older
        # assemblers don't implement it:-(
        sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                $word & 0xff,          ($word >> 8) & 0xff,
                ($word >> 16) & 0xff,  ($word >> 24) & 0xff,
                $mnemonic, $arg;
    }
};
3703
sub unvtbl {
    my $arg = shift;

    # A q-register table lookup does not exist on ARMv7; expand it
    # into two vtbl.8 operations covering the low and high d halves
    # of the destination and index registers.
    if ($arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o) {
        my ($qd, $qtab, $qidx) = ($1, $2, $3);
        return sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                       "vtbl.8 d%d,{q%d},d%d",
                       2*$qd,     $qtab, 2*$qidx,
                       2*$qd + 1, $qtab, 2*$qidx + 1;
    }
    return "";
}
3711
sub unvdup32 {
    my $arg = shift;

    # Map "vdup.32 qD,qN[lane]" onto the d-register form: lane L of
    # qN lives in d(2N + (L >> 1)) at scalar index (L & 1).
    if ($arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o) {
        my ($qd, $qn, $lane) = ($1, $2, $3);
        return sprintf "vdup.32 q%d,d%d[%d]",
                       $qd, 2*$qn + ($lane >> 1), $lane & 1;
    }
    return "";
}
3718
sub unvmov32 {
    my $arg = shift;

    # Rewrite "vmov.32 qD[lane],src" as a move into the matching
    # d-register scalar slot.
    if ($arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o) {
        my ($qd, $lane, $src) = ($1, $2, $3);
        return sprintf "vmov.32 d%d[%d],%s",
                       2*$qd + ($lane >> 1), $lane & 1, $src;
    }
    return "";
}
3725
# Post-process $code into ARMv7 NEON assembly: map the 64-bit
# register names and new-style mnemonics back onto their 32-bit
# equivalents.  Substitution order matters here too.
foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/geo;    # expand `...` compile-time expressions

    s/\b[wx]([0-9]+)\b/r$1/go;  # new->old registers
    s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
    s/\/\/\s?/@ /o; # new->old style commentary

    # fix up remaining new-style suffixes
    s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
    s/\],#[0-9]+/]!/o;

    # Mnemonic rewrites; the "or" chain stops at the first match.
    # AES and q-register lane operations are expanded by the helper
    # subs defined above.
    s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
    s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
    s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
    s/vdup\.32\s+(.*)/unvdup32($1)/geo or
    s/vmov\.32\s+(.*)/unvmov32($1)/geo or
    s/^(\s+)b\./$1b/o or
    s/^(\s+)ret/$1bx\tlr/o;

    # A conditional mov needs a preceding Thumb-2 "it" block.
    if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
    print " it $2\n";
    }

    print $_,"\n";
}
3751 }
3752
# Close STDOUT explicitly so buffered write errors are reported.
close STDOUT or die "error closing STDOUT: $!";