#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number
# of utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57, parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# April 2019
#
# The key to performance of parallelizable modes is round instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off. On the con
# side we have code size increase and resource waste on platforms for
# which the interleave factor is too high. In other words you want it
# to be just right. So far an interleave factor of 3x was serving all
# platforms well, but for ThunderX2 the optimal interleave factor was
# measured to be 5x... (A Perl sketch of the interleaving pattern
# follows this header comment.)
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

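# A hedged sketch (an unused helper, not part of this module's
# machinery) of how an interleave factor turns into emitted code: one
# aese/aesmc pair per in-flight block for every round-key register.
# The 3x and 5x loops below follow exactly this pattern, written out
# by hand.
sub interleave_round_sketch {
    my ($rndkey, @dat) = @_;		# e.g. ("q8", "q0", "q1", "q2")
    my $asm = "";
    foreach my $d (@dat) {
	$asm .= "\taese\t$d,$rndkey\n";	# one round for this block...
	$asm .= "\taesmc\t$d,$d\n";	# ...keeps the pipeline busy
    }
    return $asm;
}
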
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both the 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo
# (illustrated by the sketch below).
#
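# A hedged illustration of that transliteration idea (this is not
# arm-xlate.pl's actual code, just an unused sketch of the approach):
# a regex pass can map the shared NEON-style spelling onto its 64-bit
# counterpart.
sub xlate64_sketch {
    my ($line) = @_;
    # e.g. "vld1.32 {q8},[x7],#16" -> "ld1 {v8.4s},[x7],#16"
    $line =~ s/\bvld1\.32\s+\{q(\d+)\}/ld1\t{v$1.4s}/;
    return $line;
}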
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

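	// Each .Loop128 iteration derives one round key: vtbl with the
	// rotate-n-splat mask broadcasts RotWord of the previous key's
	// last word, and aese with the all-zero key reduces to SubBytes
	// (ShiftRows is a no-op on a splatted vector), i.e. SubWord;
	// the xor with rcon completes the key-schedule core.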
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

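	// Convert the schedule for decryption: reverse the order of the
	// round keys and apply InvMixColumns (aesimc) to all of them
	// except the first and last, which were swapped above.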
.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

# Performance in cycles per byte.
# Processed with AES-ECB, for different key sizes.
# Values before and after the optimization (before/after):
#
#		AES-128-ECB	AES-192-ECB	AES-256-ECB
# Cortex-A57	1.85/0.82	2.16/0.96	2.47/1.10
# Cortex-A72	1.64/0.85	1.82/0.99	2.13/1.14

# The optimization is implemented by loop unrolling and interleaving.
# Commonly we choose 5 as the unrolling factor; if the input data
# size is smaller than 5 blocks, but not smaller than 3 blocks, we
# choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks as
# one iteration; on every loop the remaining size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration; on
# every loop lsize -= 3*16.
# If lsize < 3*16 bytes, treat them as the tail and interleave the
# AES instructions for the (at most) two remaining blocks. (A Perl
# sketch of this dispatch follows this comment.)
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve performance, with one
# independent code block without LR/FP load and store, just like what
# the original ECB implementation does.

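# A hedged Perl sketch (an unused helper) of the unrolling dispatch
# just described: how many blocks each iteration of the generated
# code consumes for a given input size.
sub ecb_unroll_plan_sketch {
    my ($dsize) = @_;			# input size in bytes, 16-aligned
    my @plan;
    while ($dsize >= 5*16) { push @plan, 5; $dsize -= 5*16; }
    if ($dsize >= 3*16)    { push @plan, 3; $dsize -= 3*16; }
    push @plan, $dsize/16 if ($dsize);	# 1- or 2-block interleaved tail
    return @plan;			# e.g. 112 bytes -> (5, 2)
}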
{{{
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15, q7	last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for the 16-byte-only case

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// If the input size is not 16, jump to big-size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lcbc_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	veor	$tmp1,$in0,$rndlast
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	veor	$tmp2,$in1,$rndlast
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	veor	$tmp3,$in2,$rndlast
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	veor	$tmp4,$in3,$rndlast
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vorr	$ivec,$in4,$in4
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
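	// The 32-bit counter lives in lane 3 of the counter block. On
	// little-endian hosts it is byte-reversed into host order for
	// the additions below, and reversed back before being spliced
	// into each block's lane 3.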
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ctr32

	add	w13,$ctr,#1
	add	w14,$ctr,#2
	vorr	$dat3,$dat0,$dat0
	rev	w13,w13
	vorr	$dat4,$dat0,$dat0
	rev	w14,w14
	vmov.32	${dat3}[3],w13
	sub	$len,$len,#2		// bias
	vmov.32	${dat4}[3],w14
	add	$ctr,$ctr,#2
	b	.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ctr32

	mov	$key_,$key
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	w13,$ctr,#4
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	add	w14,$ctr,#5
	rev	$tctr0,$tctr0
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,q12
	aesmc	$dat4,$dat4
	rev	w13,w13
	rev	w14,w14

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q14
	aesmc	$dat4,$dat4
	vld1.8	{$in4},[$inp],#16

	aese	$dat0,q15
	veor	$in0,$in0,$rndlast
	aese	$dat1,q15
	veor	$in1,$in1,$rndlast
	aese	$dat2,q15
	veor	$in2,$in2,$rndlast
	aese	$dat3,q15
	veor	$in3,$in3,$rndlast
	aese	$dat4,q15
	veor	$in4,$in4,$rndlast

	veor	$in0,$in0,$dat0
	vorr	$dat0,$ivec,$ivec
	veor	$in1,$in1,$dat1
	vorr	$dat1,$ivec,$ivec
	veor	$in2,$in2,$dat2
	vorr	$dat2,$ivec,$ivec
	veor	$in3,$in3,$dat3
	vorr	$dat3,$ivec,$ivec
	veor	$in4,$in4,$dat4
	vorr	$dat4,$ivec,$ivec

	vst1.8	{$in0},[$out],#16
	vmov.32	${dat0}[3],$tctr0
	vst1.8	{$in1},[$out],#16
	vmov.32	${dat1}[3],$tctr1
	vst1.8	{$in2},[$out],#16
	vmov.32	${dat2}[3],$tctr2
	vst1.8	{$in3},[$out],#16
	vmov.32	${dat3}[3],w13
	vst1.8	{$in4},[$out],#16
	vmov.32	${dat4}[3],w14

	mov	$cnt,$rounds
	cbz	$len,.Lctr32_done

	add	$ctr,$ctr,#5
	subs	$len,$len,#5
	b.hs	.Loop5x_ctr32

	add	$len,$len,#5
	sub	$ctr,$ctr,#5

	cmp	$len,#2
	mov	$step,#16
	cclr	$step,lo
	b.ls	.Lctr32_tail

	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
___
$code.=<<___;
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	add	$tctr0,$ctr,#1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	rev	$tctr0,$tctr0
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte.
# Processed with AES-XTS, for different key sizes.
# Values before and after the optimization (before/after):
#
#		AES-128-XTS	AES-256-XTS
# Cortex-A57	3.36/1.09	4.02/1.37
# Cortex-A72	3.03/1.02	3.28/1.33

# The optimization is implemented by loop unrolling and interleaving.
# Commonly we choose 5 as the unrolling factor; if the input data
# size is smaller than 5 blocks, but not smaller than 3 blocks, we
# choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks as
# one iteration; on every loop the remaining size lsize -= 5*16.
# If lsize < 5*16 bytes, treat it as the tail. Note: a remaining
# 4*16 bytes is handled specially, integrated into the 5*16-byte
# loop to improve efficiency.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve performance, with one
# independent code block without LR/FP load and store.
# Encryption processes the (length - tailcnt) bytes as described
# above, then encrypts the composite block as the second-to-last
# ciphertext block.
# Decryption processes the (length - tailcnt - 1) bytes as described
# above, then decrypts the second-to-last ciphertext block to get the
# last plaintext block (the tail), and decrypts the composite block
# as the second-to-last plaintext block. (A Perl sketch of the tweak
# update used throughout follows this comment.)

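# A hedged Perl sketch (an unused helper, assuming 64-bit Perl
# integers) of the GF(2^128) tweak doubling that the extr/and/eor
# sequences below implement on the 64-bit halves $ivl/$ivh: shift the
# 128-bit tweak left by one bit and fold the carry back in as 0x87.
sub xts_tweak_double_sketch {
    my ($lo, $hi) = @_;			# tweak as two 64-bit halves
    my $carry = $hi >> 63;		# bit that falls off the top
    $hi = ($hi << 1) | ($lo >> 63);	# shifts wrap modulo 2^64
    $lo = ($lo << 1) ^ ($carry * 0x87);
    return ($lo, $hi);
}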
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7	last round key
# q10-q15, q7	last 7 round keys
# q8-q9	preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for the 16-byte-only case


my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp	$len,#16
	// If the input size is not 16, jump to big-size processing.
2214 b.ne .Lxts_enc_big_size
2215 // Encrypt the iv with key2, as the first XEX iv.
2216 ldr $rounds,[$key2,#240]
2217 vld1.8 {$dat},[$key2],#16
2218 vld1.8 {$iv0},[$ivp]
2219 sub $rounds,$rounds,#2
2220 vld1.8 {$dat1},[$key2],#16
2221
2222 .Loop_enc_iv_enc:
2223 aese $iv0,$dat
2224 aesmc $iv0,$iv0
2225 vld1.32 {$dat},[$key2],#16
2226 subs $rounds,$rounds,#2
2227 aese $iv0,$dat1
2228 aesmc $iv0,$iv0
2229 vld1.32 {$dat1},[$key2],#16
2230 b.gt .Loop_enc_iv_enc
2231
2232 aese $iv0,$dat
2233 aesmc $iv0,$iv0
2234 vld1.32 {$dat},[$key2]
2235 aese $iv0,$dat1
2236 veor $iv0,$iv0,$dat
2237
2238 vld1.8 {$dat0},[$inp]
2239 veor $dat0,$iv0,$dat0
2240
2241 ldr $rounds,[$key1,#240]
2242 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2243
2244 aese $dat0,q20
2245 aesmc $dat0,$dat0
2246 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2247 aese $dat0,q21
2248 aesmc $dat0,$dat0
2249 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2250 b.eq .Lxts_128_enc
2251 .Lxts_enc_round_loop:
2252 aese $dat0,q8
2253 aesmc $dat0,$dat0
2254 vld1.32 {q8},[$key1],#16 // load key schedule...
2255 aese $dat0,q9
2256 aesmc $dat0,$dat0
2257 vld1.32 {q9},[$key1],#16 // load key schedule...
2258 subs $rounds,$rounds,#2 // bias
2259 b.gt .Lxts_enc_round_loop
2260 .Lxts_128_enc:
2261 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2262 aese $dat0,q8
2263 aesmc $dat0,$dat0
2264 aese $dat0,q9
2265 aesmc $dat0,$dat0
2266 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2267 aese $dat0,q10
2268 aesmc $dat0,$dat0
2269 aese $dat0,q11
2270 aesmc $dat0,$dat0
2271 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2272 aese $dat0,q12
2273 aesmc $dat0,$dat0
2274 aese $dat0,q13
2275 aesmc $dat0,$dat0
2276 vld1.32 {$rndlast},[$key1]
2277 aese $dat0,q14
2278 aesmc $dat0,$dat0
2279 aese $dat0,q15
2280 veor $dat0,$dat0,$rndlast
2281 veor $dat0,$dat0,$iv0
2282 vst1.8 {$dat0},[$out]
2283 b .Lxts_enc_final_abort
2284
2285 .align 4
2286 .Lxts_enc_big_size:
2287 ___
2288 $code.=<<___ if ($flavour =~ /64/);
2289 stp $constnumx,$tmpinp,[sp,#-64]!
2290 stp $tailcnt,$midnumx,[sp,#48]
2291 stp $ivd10,$ivd20,[sp,#32]
2292 stp $ivd30,$ivd40,[sp,#16]
2293
2294 // $tailcnt stores the tail length, i.e. length%16.
2295 and $tailcnt,$len,#0xf
2296 and $len,$len,#-16
2297 subs $len,$len,#16
2298 mov $step,#16
2299 b.lo .Lxts_abort
2300 csel $step,xzr,$step,eq
2301
2302 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2303 ldr $rounds,[$key2,#240]
2304 vld1.32 {$dat},[$key2],#16
2305 vld1.8 {$iv0},[$ivp]
2306 sub $rounds,$rounds,#2
2307 vld1.32 {$dat1},[$key2],#16
2308
2309 .Loop_iv_enc:
2310 aese $iv0,$dat
2311 aesmc $iv0,$iv0
2312 vld1.32 {$dat},[$key2],#16
2313 subs $rounds,$rounds,#2
2314 aese $iv0,$dat1
2315 aesmc $iv0,$iv0
2316 vld1.32 {$dat1},[$key2],#16
2317 b.gt .Loop_iv_enc
2318
2319 aese $iv0,$dat
2320 aesmc $iv0,$iv0
2321 vld1.32 {$dat},[$key2]
2322 aese $iv0,$dat1
2323 veor $iv0,$iv0,$dat
2324
2325 // The iv for second block
2326 // $ivl - iv(low), $ivh - iv(high)
2327 // the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
2328 fmov $ivl,$ivd00
2329 fmov $ivh,$ivd01
2330 mov $constnum,#0x87
2331 extr $midnumx,$ivh,$ivh,#32
2332 extr $ivh,$ivh,$ivl,#63
2333 and $tmpmw,$constnum,$midnum,asr#31
2334 eor $ivl,$tmpmx,$ivl,lsl#1
2335 fmov $ivd10,$ivl
2336 fmov $ivd11,$ivh
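	// The extr/and/eor sequence above multiplies the 128-bit tweak by x
	// in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, i.e. the constant
	// 0x87. A C sketch of one doubling step (illustrative only):
	//   carry = (int64_t)hi >> 63;          // replicate the carry bit
	//   hi    = (hi << 1) | (lo >> 63);     // the extr with #63
	//   lo    = (lo << 1) ^ (carry & 0x87); // eor with masked constant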
2337
2338 ldr $rounds0,[$key1,#240] // next starting point
2339 vld1.8 {$dat},[$inp],$step
2340
2341 vld1.32 {q8-q9},[$key1] // load key schedule...
2342 sub $rounds0,$rounds0,#6
2343 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2344 sub $rounds0,$rounds0,#2
2345 vld1.32 {q10-q11},[$key_],#32
2346 vld1.32 {q12-q13},[$key_],#32
2347 vld1.32 {q14-q15},[$key_],#32
2348 vld1.32 {$rndlast},[$key_]
2349
2350 add $key_,$key1,#32
2351 mov $rounds,$rounds0
2352
2353 // Encryption
2354 .Lxts_enc:
2355 vld1.8 {$dat2},[$inp],#16
2356 subs $len,$len,#32 // bias
2357 add $rounds,$rounds0,#2
2358 vorr $in1,$dat,$dat
2359 vorr $dat1,$dat,$dat
2360 vorr $in3,$dat,$dat
2361 vorr $in2,$dat2,$dat2
2362 vorr $in4,$dat2,$dat2
2363 b.lo .Lxts_inner_enc_tail
2364 veor $dat,$dat,$iv0 // before encryption, xor with iv
2365 veor $dat2,$dat2,$iv1
2366
2367 // The iv for third block
2368 extr $midnumx,$ivh,$ivh,#32
2369 extr $ivh,$ivh,$ivl,#63
2370 and $tmpmw,$constnum,$midnum,asr#31
2371 eor $ivl,$tmpmx,$ivl,lsl#1
2372 fmov $ivd20,$ivl
2373 fmov $ivd21,$ivh
2374
2375
2376 vorr $dat1,$dat2,$dat2
2377 vld1.8 {$dat2},[$inp],#16
2378 vorr $in0,$dat,$dat
2379 vorr $in1,$dat1,$dat1
2380 veor $in2,$dat2,$iv2 // the third block
2381 veor $dat2,$dat2,$iv2
2382 cmp $len,#32
2383 b.lo .Lxts_outer_enc_tail
2384
2385 // The iv for fourth block
2386 extr $midnumx,$ivh,$ivh,#32
2387 extr $ivh,$ivh,$ivl,#63
2388 and $tmpmw,$constnum,$midnum,asr#31
2389 eor $ivl,$tmpmx,$ivl,lsl#1
2390 fmov $ivd30,$ivl
2391 fmov $ivd31,$ivh
2392
2393 vld1.8 {$dat3},[$inp],#16
2394 // The iv for fifth block
2395 extr $midnumx,$ivh,$ivh,#32
2396 extr $ivh,$ivh,$ivl,#63
2397 and $tmpmw,$constnum,$midnum,asr#31
2398 eor $ivl,$tmpmx,$ivl,lsl#1
2399 fmov $ivd40,$ivl
2400 fmov $ivd41,$ivh
2401
2402 vld1.8 {$dat4},[$inp],#16
2403 veor $dat3,$dat3,$iv3 // the fourth block
2404 veor $dat4,$dat4,$iv4
2405 sub $len,$len,#32 // bias
2406 mov $rounds,$rounds0
2407 b .Loop5x_xts_enc
2408
2409 .align 4
2410 .Loop5x_xts_enc:
2411 aese $dat0,q8
2412 aesmc $dat0,$dat0
2413 aese $dat1,q8
2414 aesmc $dat1,$dat1
2415 aese $dat2,q8
2416 aesmc $dat2,$dat2
2417 aese $dat3,q8
2418 aesmc $dat3,$dat3
2419 aese $dat4,q8
2420 aesmc $dat4,$dat4
2421 vld1.32 {q8},[$key_],#16
2422 subs $rounds,$rounds,#2
2423 aese $dat0,q9
2424 aesmc $dat0,$dat0
2425 aese $dat1,q9
2426 aesmc $dat1,$dat1
2427 aese $dat2,q9
2428 aesmc $dat2,$dat2
2429 aese $dat3,q9
2430 aesmc $dat3,$dat3
2431 aese $dat4,q9
2432 aesmc $dat4,$dat4
2433 vld1.32 {q9},[$key_],#16
2434 b.gt .Loop5x_xts_enc
2435
2436 aese $dat0,q8
2437 aesmc $dat0,$dat0
2438 aese $dat1,q8
2439 aesmc $dat1,$dat1
2440 aese $dat2,q8
2441 aesmc $dat2,$dat2
2442 aese $dat3,q8
2443 aesmc $dat3,$dat3
2444 aese $dat4,q8
2445 aesmc $dat4,$dat4
2446 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2447
2448 aese $dat0,q9
2449 aesmc $dat0,$dat0
2450 aese $dat1,q9
2451 aesmc $dat1,$dat1
2452 aese $dat2,q9
2453 aesmc $dat2,$dat2
2454 aese $dat3,q9
2455 aesmc $dat3,$dat3
2456 aese $dat4,q9
2457 aesmc $dat4,$dat4
2458 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2459 mov $key_,$key1
2460
2461 aese $dat0,q10
2462 aesmc $dat0,$dat0
2463 aese $dat1,q10
2464 aesmc $dat1,$dat1
2465 aese $dat2,q10
2466 aesmc $dat2,$dat2
2467 aese $dat3,q10
2468 aesmc $dat3,$dat3
2469 aese $dat4,q10
2470 aesmc $dat4,$dat4
2471 add $inp,$inp,$xoffset // x0 is adjusted in such a way that
2472 // at exit from the loop v1.16b-v26.16b
2473 // are loaded with the last "words"
2474 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
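	// In C terms (illustrative only): off = (len > 0) ? 0 : len;
	// inp += off; i.e. on the final pass the input pointer is pulled
	// back so the five 16-byte loads below fetch the last five blocks
	// without reading past the end of the input.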
2475
2476 aese $dat0,q11
2477 aesmc $dat0,$dat0
2478 aese $dat1,q11
2479 aesmc $dat1,$dat1
2480 aese $dat2,q11
2481 aesmc $dat2,$dat2
2482 aese $dat3,q11
2483 aesmc $dat3,$dat3
2484 aese $dat4,q11
2485 aesmc $dat4,$dat4
2486
2487 aese $dat0,q12
2488 aesmc $dat0,$dat0
2489 aese $dat1,q12
2490 aesmc $dat1,$dat1
2491 aese $dat2,q12
2492 aesmc $dat2,$dat2
2493 aese $dat3,q12
2494 aesmc $dat3,$dat3
2495 aese $dat4,q12
2496 aesmc $dat4,$dat4
2497
2498 aese $dat0,q13
2499 aesmc $dat0,$dat0
2500 aese $dat1,q13
2501 aesmc $dat1,$dat1
2502 aese $dat2,q13
2503 aesmc $dat2,$dat2
2504 aese $dat3,q13
2505 aesmc $dat3,$dat3
2506 aese $dat4,q13
2507 aesmc $dat4,$dat4
2508
2509 aese $dat0,q14
2510 aesmc $dat0,$dat0
2511 aese $dat1,q14
2512 aesmc $dat1,$dat1
2513 aese $dat2,q14
2514 aesmc $dat2,$dat2
2515 aese $dat3,q14
2516 aesmc $dat3,$dat3
2517 aese $dat4,q14
2518 aesmc $dat4,$dat4
2519
2520 veor $tmp0,$rndlast,$iv0
2521 aese $dat0,q15
2522 // The iv for first block of next iteration
2523 extr $midnumx,$ivh,$ivh,#32
2524 extr $ivh,$ivh,$ivl,#63
2525 and $tmpmw,$constnum,$midnum,asr#31
2526 eor $ivl,$tmpmx,$ivl,lsl#1
2527 fmov $ivd00,$ivl
2528 fmov $ivd01,$ivh
2529 veor $tmp1,$rndlast,$iv1
2530 vld1.8 {$in0},[$inp],#16
2531 aese $dat1,q15
2532 // The iv for second block
2533 extr $midnumx,$ivh,$ivh,#32
2534 extr $ivh,$ivh,$ivl,#63
2535 and $tmpmw,$constnum,$midnum,asr#31
2536 eor $ivl,$tmpmx,$ivl,lsl#1
2537 fmov $ivd10,$ivl
2538 fmov $ivd11,$ivh
2539 veor $tmp2,$rndlast,$iv2
2540 vld1.8 {$in1},[$inp],#16
2541 aese $dat2,q15
2542 // The iv for third block
2543 extr $midnumx,$ivh,$ivh,#32
2544 extr $ivh,$ivh,$ivl,#63
2545 and $tmpmw,$constnum,$midnum,asr#31
2546 eor $ivl,$tmpmx,$ivl,lsl#1
2547 fmov $ivd20,$ivl
2548 fmov $ivd21,$ivh
2549 veor $tmp3,$rndlast,$iv3
2550 vld1.8 {$in2},[$inp],#16
2551 aese $dat3,q15
2552 // The iv for fourth block
2553 extr $midnumx,$ivh,$ivh,#32
2554 extr $ivh,$ivh,$ivl,#63
2555 and $tmpmw,$constnum,$midnum,asr#31
2556 eor $ivl,$tmpmx,$ivl,lsl#1
2557 fmov $ivd30,$ivl
2558 fmov $ivd31,$ivh
2559 veor $tmp4,$rndlast,$iv4
2560 vld1.8 {$in3},[$inp],#16
2561 aese $dat4,q15
2562
2563 // The iv for fifth block
2564 extr $midnumx,$ivh,$ivh,#32
2565 extr $ivh,$ivh,$ivl,#63
2566 and $tmpmw,$constnum,$midnum,asr #31
2567 eor $ivl,$tmpmx,$ivl,lsl #1
2568 fmov $ivd40,$ivl
2569 fmov $ivd41,$ivh
2570
2571 vld1.8 {$in4},[$inp],#16
2572 cbz $xoffset,.Lxts_enc_tail4x
2573 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2574 veor $tmp0,$tmp0,$dat0
2575 veor $dat0,$in0,$iv0
2576 veor $tmp1,$tmp1,$dat1
2577 veor $dat1,$in1,$iv1
2578 veor $tmp2,$tmp2,$dat2
2579 veor $dat2,$in2,$iv2
2580 veor $tmp3,$tmp3,$dat3
2581 veor $dat3,$in3,$iv3
2582 veor $tmp4,$tmp4,$dat4
2583 vst1.8 {$tmp0},[$out],#16
2584 veor $dat4,$in4,$iv4
2585 vst1.8 {$tmp1},[$out],#16
2586 mov $rounds,$rounds0
2587 vst1.8 {$tmp2},[$out],#16
2588 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2589 vst1.8 {$tmp3},[$out],#16
2590 vst1.8 {$tmp4},[$out],#16
2591 b.hs .Loop5x_xts_enc
2592
2593
2594 // If 4 blocks are left, reuse the five-block processing.
2595 cmn $len,#0x10
2596 b.ne .Loop5x_enc_after
2597 vorr $iv4,$iv3,$iv3
2598 vorr $iv3,$iv2,$iv2
2599 vorr $iv2,$iv1,$iv1
2600 vorr $iv1,$iv0,$iv0
2601 fmov $ivl,$ivd40
2602 fmov $ivh,$ivd41
2603 veor $dat0,$iv0,$in0
2604 veor $dat1,$iv1,$in1
2605 veor $dat2,$in2,$iv2
2606 veor $dat3,$in3,$iv3
2607 veor $dat4,$in4,$iv4
2608 b.eq .Loop5x_xts_enc
2609
2610 .Loop5x_enc_after:
2611 add $len,$len,#0x50
2612 cbz $len,.Lxts_enc_done
2613
2614 add $rounds,$rounds0,#2
2615 subs $len,$len,#0x30
2616 b.lo .Lxts_inner_enc_tail
2617
2618 veor $dat0,$iv0,$in2
2619 veor $dat1,$iv1,$in3
2620 veor $dat2,$in4,$iv2
2621 b .Lxts_outer_enc_tail
2622
2623 .align 4
2624 .Lxts_enc_tail4x:
2625 add $inp,$inp,#16
2626 veor $tmp1,$dat1,$tmp1
2627 vst1.8 {$tmp1},[$out],#16
2628 veor $tmp2,$dat2,$tmp2
2629 vst1.8 {$tmp2},[$out],#16
2630 veor $tmp3,$dat3,$tmp3
2631 veor $tmp4,$dat4,$tmp4
2632 vst1.8 {$tmp3-$tmp4},[$out],#32
2633
2634 b .Lxts_enc_done
2635 .align 4
2636 .Lxts_outer_enc_tail:
2637 aese $dat0,q8
2638 aesmc $dat0,$dat0
2639 aese $dat1,q8
2640 aesmc $dat1,$dat1
2641 aese $dat2,q8
2642 aesmc $dat2,$dat2
2643 vld1.32 {q8},[$key_],#16
2644 subs $rounds,$rounds,#2
2645 aese $dat0,q9
2646 aesmc $dat0,$dat0
2647 aese $dat1,q9
2648 aesmc $dat1,$dat1
2649 aese $dat2,q9
2650 aesmc $dat2,$dat2
2651 vld1.32 {q9},[$key_],#16
2652 b.gt .Lxts_outer_enc_tail
2653
2654 aese $dat0,q8
2655 aesmc $dat0,$dat0
2656 aese $dat1,q8
2657 aesmc $dat1,$dat1
2658 aese $dat2,q8
2659 aesmc $dat2,$dat2
2660 veor $tmp0,$iv0,$rndlast
2661 subs $len,$len,#0x30
2662 // The iv for first block
2663 fmov $ivl,$ivd20
2664 fmov $ivh,$ivd21
2665 //mov $constnum,#0x87
2666 extr $midnumx,$ivh,$ivh,#32
2667 extr $ivh,$ivh,$ivl,#63
2668 and $tmpmw,$constnum,$midnum,asr#31
2669 eor $ivl,$tmpmx,$ivl,lsl#1
2670 fmov $ivd00,$ivl
2671 fmov $ivd01,$ivh
2672 veor $tmp1,$iv1,$rndlast
2673 csel $xoffset,$len,$xoffset,lo // x6/w6 is zero at this point
2674 aese $dat0,q9
2675 aesmc $dat0,$dat0
2676 aese $dat1,q9
2677 aesmc $dat1,$dat1
2678 aese $dat2,q9
2679 aesmc $dat2,$dat2
2680 veor $tmp2,$iv2,$rndlast
2681
2682 add $xoffset,$xoffset,#0x20
2683 add $inp,$inp,$xoffset
2684 mov $key_,$key1
2685
2686 aese $dat0,q12
2687 aesmc $dat0,$dat0
2688 aese $dat1,q12
2689 aesmc $dat1,$dat1
2690 aese $dat2,q12
2691 aesmc $dat2,$dat2
2692 aese $dat0,q13
2693 aesmc $dat0,$dat0
2694 aese $dat1,q13
2695 aesmc $dat1,$dat1
2696 aese $dat2,q13
2697 aesmc $dat2,$dat2
2698 aese $dat0,q14
2699 aesmc $dat0,$dat0
2700 aese $dat1,q14
2701 aesmc $dat1,$dat1
2702 aese $dat2,q14
2703 aesmc $dat2,$dat2
2704 aese $dat0,q15
2705 aese $dat1,q15
2706 aese $dat2,q15
2707 vld1.8 {$in2},[$inp],#16
2708 add $rounds,$rounds0,#2
2709 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2710 veor $tmp0,$tmp0,$dat0
2711 veor $tmp1,$tmp1,$dat1
2712 veor $dat2,$dat2,$tmp2
2713 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2714 vst1.8 {$tmp0},[$out],#16
2715 vst1.8 {$tmp1},[$out],#16
2716 vst1.8 {$dat2},[$out],#16
2717 cmn $len,#0x30
2718 b.eq .Lxts_enc_done
2719 .Lxts_encxor_one:
2720 vorr $in3,$in1,$in1
2721 vorr $in4,$in2,$in2
2722 nop
2723
2724 .Lxts_inner_enc_tail:
2725 cmn $len,#0x10
2726 veor $dat1,$in3,$iv0
2727 veor $dat2,$in4,$iv1
2728 b.eq .Lxts_enc_tail_loop
2729 veor $dat2,$in4,$iv0
2730 .Lxts_enc_tail_loop:
2731 aese $dat1,q8
2732 aesmc $dat1,$dat1
2733 aese $dat2,q8
2734 aesmc $dat2,$dat2
2735 vld1.32 {q8},[$key_],#16
2736 subs $rounds,$rounds,#2
2737 aese $dat1,q9
2738 aesmc $dat1,$dat1
2739 aese $dat2,q9
2740 aesmc $dat2,$dat2
2741 vld1.32 {q9},[$key_],#16
2742 b.gt .Lxts_enc_tail_loop
2743
2744 aese $dat1,q8
2745 aesmc $dat1,$dat1
2746 aese $dat2,q8
2747 aesmc $dat2,$dat2
2748 aese $dat1,q9
2749 aesmc $dat1,$dat1
2750 aese $dat2,q9
2751 aesmc $dat2,$dat2
2752 aese $dat1,q12
2753 aesmc $dat1,$dat1
2754 aese $dat2,q12
2755 aesmc $dat2,$dat2
2756 cmn $len,#0x20
2757 aese $dat1,q13
2758 aesmc $dat1,$dat1
2759 aese $dat2,q13
2760 aesmc $dat2,$dat2
2761 veor $tmp1,$iv0,$rndlast
2762 aese $dat1,q14
2763 aesmc $dat1,$dat1
2764 aese $dat2,q14
2765 aesmc $dat2,$dat2
2766 veor $tmp2,$iv1,$rndlast
2767 aese $dat1,q15
2768 aese $dat2,q15
2769 b.eq .Lxts_enc_one
2770 veor $tmp1,$tmp1,$dat1
2771 vst1.8 {$tmp1},[$out],#16
2772 veor $tmp2,$tmp2,$dat2
2773 vorr $iv0,$iv1,$iv1
2774 vst1.8 {$tmp2},[$out],#16
2775 fmov $ivl,$ivd10
2776 fmov $ivh,$ivd11
2777 mov $constnum,#0x87
2778 extr $midnumx,$ivh,$ivh,#32
2779 extr $ivh,$ivh,$ivl,#63
2780 and $tmpmw,$constnum,$midnum,asr #31
2781 eor $ivl,$tmpmx,$ivl,lsl #1
2782 fmov $ivd00,$ivl
2783 fmov $ivd01,$ivh
2784 b .Lxts_enc_done
2785
2786 .Lxts_enc_one:
2787 veor $tmp1,$tmp1,$dat2
2788 vorr $iv0,$iv0,$iv0
2789 vst1.8 {$tmp1},[$out],#16
2790 fmov $ivl,$ivd00
2791 fmov $ivh,$ivd01
2792 mov $constnum,#0x87
2793 extr $midnumx,$ivh,$ivh,#32
2794 extr $ivh,$ivh,$ivl,#63
2795 and $tmpmw,$constnum,$midnum,asr #31
2796 eor $ivl,$tmpmx,$ivl,lsl #1
2797 fmov $ivd00,$ivl
2798 fmov $ivd01,$ivh
2799 b .Lxts_enc_done
2800 .align 5
2801 .Lxts_enc_done:
2802 // Process the partial tail block with ciphertext stealing.
2803 tst $tailcnt,#0xf
2804 b.eq .Lxts_abort
2805
2806 mov $tmpinp,$inp
2807 mov $tmpoutp,$out
2808 sub $out,$out,#16
2809 .composite_enc_loop:
2810 subs $tailcnt,$tailcnt,#1
2811 ldrb $l2outp,[$out,$tailcnt]
2812 ldrb $loutp,[$tmpinp,$tailcnt]
2813 strb $l2outp,[$tmpoutp,$tailcnt]
2814 strb $loutp,[$out,$tailcnt]
2815 b.gt .composite_enc_loop
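	// The loop above swapped bytes in place: out[16+i] = out[i] (the
	// stolen ciphertext tail) and out[i] = in[i] (the remaining
	// plaintext bytes), leaving the composite block in place to be
	// re-encrypted below.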
2816 .Lxts_enc_load_done:
2817 vld1.8 {$tmpin},[$out]
2818 veor $tmpin,$tmpin,$iv0
2819
2820 // Encrypt the composite block to get the second-to-last ciphertext block
2821 ldr $rounds,[$key1,#240] // load key schedule...
2822 vld1.8 {$dat},[$key1],#16
2823 sub $rounds,$rounds,#2
2824 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2825 .Loop_final_enc:
2826 aese $tmpin,$dat0
2827 aesmc $tmpin,$tmpin
2828 vld1.32 {$dat0},[$key1],#16
2829 subs $rounds,$rounds,#2
2830 aese $tmpin,$dat1
2831 aesmc $tmpin,$tmpin
2832 vld1.32 {$dat1},[$key1],#16
2833 b.gt .Loop_final_enc
2834
2835 aese $tmpin,$dat0
2836 aesmc $tmpin,$tmpin
2837 vld1.32 {$dat0},[$key1]
2838 aese $tmpin,$dat1
2839 veor $tmpin,$tmpin,$dat0
2840 veor $tmpin,$tmpin,$iv0
2841 vst1.8 {$tmpin},[$out]
2842
2843 .Lxts_abort:
2844 ldp $tailcnt,$midnumx,[sp,#48]
2845 ldp $ivd10,$ivd20,[sp,#32]
2846 ldp $ivd30,$ivd40,[sp,#16]
2847 ldp $constnumx,$tmpinp,[sp],#64
2848 .Lxts_enc_final_abort:
2849 ret
2850 .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2851 ___
2852
2853 }}}
2854 {{{
2855 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2856 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2857 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2858 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2859 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2860 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2861 my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
2862 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2863 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2864
2865 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2866
2867 # q7 last round key
2868 # q10-q15, q7 Last 7 round keys
2869 # q8-q9 preloaded round keys except last 7 keys for big size
2870 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2871
2872 {
2873 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2874
2875 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2876 my ($dat4,$in4,$tmp4);
2877 if ($flavour =~ /64/) {
2878 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2879 }
2880
2881 $code.=<<___ if ($flavour =~ /64/);
2882 .globl ${prefix}_xts_decrypt
2883 .type ${prefix}_xts_decrypt,%function
2884 .align 5
2885 ${prefix}_xts_decrypt:
2886 AARCH64_VALID_CALL_TARGET
2887 ___
2888 $code.=<<___ if ($flavour =~ /64/);
2889 cmp $len,#16
2890 // Anything other than exactly 16 bytes of input takes the big-size path.
2891 b.ne .Lxts_dec_big_size
2892 // Encrypt the iv with key2, as the first XEX iv.
2893 ldr $rounds,[$key2,#240]
2894 vld1.8 {$dat},[$key2],#16
2895 vld1.8 {$iv0},[$ivp]
2896 sub $rounds,$rounds,#2
2897 vld1.8 {$dat1},[$key2],#16
2898
2899 .Loop_dec_small_iv_enc:
2900 aese $iv0,$dat
2901 aesmc $iv0,$iv0
2902 vld1.32 {$dat},[$key2],#16
2903 subs $rounds,$rounds,#2
2904 aese $iv0,$dat1
2905 aesmc $iv0,$iv0
2906 vld1.32 {$dat1},[$key2],#16
2907 b.gt .Loop_dec_small_iv_enc
2908
2909 aese $iv0,$dat
2910 aesmc $iv0,$iv0
2911 vld1.32 {$dat},[$key2]
2912 aese $iv0,$dat1
2913 veor $iv0,$iv0,$dat
2914
2915 vld1.8 {$dat0},[$inp]
2916 veor $dat0,$iv0,$dat0
2917
2918 ldr $rounds,[$key1,#240]
2919 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2920
2921 aesd $dat0,q20
2922 aesimc $dat0,$dat0
2923 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2924 aesd $dat0,q21
2925 aesimc $dat0,$dat0
2926 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2927 b.eq .Lxts_128_dec
2928 .Lxts_dec_round_loop:
2929 aesd $dat0,q8
2930 aesimc $dat0,$dat0
2931 vld1.32 {q8},[$key1],#16 // load key schedule...
2932 aesd $dat0,q9
2933 aesimc $dat0,$dat0
2934 vld1.32 {q9},[$key1],#16 // load key schedule...
2935 subs $rounds,$rounds,#2 // bias
2936 b.gt .Lxts_dec_round_loop
2937 .Lxts_128_dec:
2938 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2939 aesd $dat0,q8
2940 aesimc $dat0,$dat0
2941 aesd $dat0,q9
2942 aesimc $dat0,$dat0
2943 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2944 aesd $dat0,q10
2945 aesimc $dat0,$dat0
2946 aesd $dat0,q11
2947 aesimc $dat0,$dat0
2948 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2949 aesd $dat0,q12
2950 aesimc $dat0,$dat0
2951 aesd $dat0,q13
2952 aesimc $dat0,$dat0
2953 vld1.32 {$rndlast},[$key1]
2954 aesd $dat0,q14
2955 aesimc $dat0,$dat0
2956 aesd $dat0,q15
2957 veor $dat0,$dat0,$rndlast
2958 veor $dat0,$iv0,$dat0
2959 vst1.8 {$dat0},[$out]
2960 b .Lxts_dec_final_abort
2961 .Lxts_dec_big_size:
2962 ___
2963 $code.=<<___ if ($flavour =~ /64/);
2964 stp $constnumx,$tmpinp,[sp,#-64]!
2965 stp $tailcnt,$midnumx,[sp,#48]
2966 stp $ivd10,$ivd20,[sp,#32]
2967 stp $ivd30,$ivd40,[sp,#16]
2968
2969 and $tailcnt,$len,#0xf
2970 and $len,$len,#-16
2971 subs $len,$len,#16
2972 mov $step,#16
2973 b.lo .Lxts_dec_abort
2974
2975 // Encrypt the iv with key2, as the first XEX iv
2976 ldr $rounds,[$key2,#240]
2977 vld1.8 {$dat},[$key2],#16
2978 vld1.8 {$iv0},[$ivp]
2979 sub $rounds,$rounds,#2
2980 vld1.8 {$dat1},[$key2],#16
2981
2982 .Loop_dec_iv_enc:
2983 aese $iv0,$dat
2984 aesmc $iv0,$iv0
2985 vld1.32 {$dat},[$key2],#16
2986 subs $rounds,$rounds,#2
2987 aese $iv0,$dat1
2988 aesmc $iv0,$iv0
2989 vld1.32 {$dat1},[$key2],#16
2990 b.gt .Loop_dec_iv_enc
2991
2992 aese $iv0,$dat
2993 aesmc $iv0,$iv0
2994 vld1.32 {$dat},[$key2]
2995 aese $iv0,$dat1
2996 veor $iv0,$iv0,$dat
2997
2998 // The iv for second block
2999 // $ivl - iv(low), $ivh - iv(high)
3000 // the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
3001 fmov $ivl,$ivd00
3002 fmov $ivh,$ivd01
3003 mov $constnum,#0x87
3004 extr $midnumx,$ivh,$ivh,#32
3005 extr $ivh,$ivh,$ivl,#63
3006 and $tmpmw,$constnum,$midnum,asr #31
3007 eor $ivl,$tmpmx,$ivl,lsl #1
3008 fmov $ivd10,$ivl
3009 fmov $ivd11,$ivh
3010
3011 ldr $rounds0,[$key1,#240] // load rounds number
3012
3013 // The iv for third block
3014 extr $midnumx,$ivh,$ivh,#32
3015 extr $ivh,$ivh,$ivl,#63
3016 and $tmpmw,$constnum,$midnum,asr #31
3017 eor $ivl,$tmpmx,$ivl,lsl #1
3018 fmov $ivd20,$ivl
3019 fmov $ivd21,$ivh
3020
3021 vld1.32 {q8-q9},[$key1] // load key schedule...
3022 sub $rounds0,$rounds0,#6
3023 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3024 sub $rounds0,$rounds0,#2
3025 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3026 vld1.32 {q12-q13},[$key_],#32
3027 vld1.32 {q14-q15},[$key_],#32
3028 vld1.32 {$rndlast},[$key_]
3029
3030 // The iv for fourth block
3031 extr $midnumx,$ivh,$ivh,#32
3032 extr $ivh,$ivh,$ivl,#63
3033 and $tmpmw,$constnum,$midnum,asr #31
3034 eor $ivl,$tmpmx,$ivl,lsl #1
3035 fmov $ivd30,$ivl
3036 fmov $ivd31,$ivh
3037
3038 add $key_,$key1,#32
3039 mov $rounds,$rounds0
3040 b .Lxts_dec
3041
3042 // Decryption
3043 .align 5
3044 .Lxts_dec:
3045 tst $tailcnt,#0xf
3046 b.eq .Lxts_dec_begin
3047 subs $len,$len,#16
3048 csel $step,xzr,$step,eq
3049 vld1.8 {$dat},[$inp],#16
3050 b.lo .Lxts_done
3051 sub $inp,$inp,#16
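	// A partial tail exists, so hold one full block back from the
	// main loop; it is needed for the ciphertext-stealing fix-up
	// after .Lxts_done. If it was the only full block, keep it
	// loaded and go straight to .Lxts_done.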
3052 .Lxts_dec_begin:
3053 vld1.8 {$dat},[$inp],$step
3054 subs $len,$len,#32 // bias
3055 add $rounds,$rounds0,#2
3056 vorr $in1,$dat,$dat
3057 vorr $dat1,$dat,$dat
3058 vorr $in3,$dat,$dat
3059 vld1.8 {$dat2},[$inp],#16
3060 vorr $in2,$dat2,$dat2
3061 vorr $in4,$dat2,$dat2
3062 b.lo .Lxts_inner_dec_tail
3063 veor $dat,$dat,$iv0 // before decryption, xor with iv
3064 veor $dat2,$dat2,$iv1
3065
3066 vorr $dat1,$dat2,$dat2
3067 vld1.8 {$dat2},[$inp],#16
3068 vorr $in0,$dat,$dat
3069 vorr $in1,$dat1,$dat1
3070 veor $in2,$dat2,$iv2 // the third block xor'd with the third iv
3071 veor $dat2,$dat2,$iv2
3072 cmp $len,#32
3073 b.lo .Lxts_outer_dec_tail
3074
3075 vld1.8 {$dat3},[$inp],#16
3076
3077 // The iv for fifth block
3078 extr $midnumx,$ivh,$ivh,#32
3079 extr $ivh,$ivh,$ivl,#63
3080 and $tmpmw,$constnum,$midnum,asr #31
3081 eor $ivl,$tmpmx,$ivl,lsl #1
3082 fmov $ivd40,$ivl
3083 fmov $ivd41,$ivh
3084
3085 vld1.8 {$dat4},[$inp],#16
3086 veor $dat3,$dat3,$iv3 // the fourth block
3087 veor $dat4,$dat4,$iv4
3088 sub $len,$len,#32 // bias
3089 mov $rounds,$rounds0
3090 b .Loop5x_xts_dec
3091
3092 .align 4
3093 .Loop5x_xts_dec:
3094 aesd $dat0,q8
3095 aesimc $dat0,$dat0
3096 aesd $dat1,q8
3097 aesimc $dat1,$dat1
3098 aesd $dat2,q8
3099 aesimc $dat2,$dat2
3100 aesd $dat3,q8
3101 aesimc $dat3,$dat3
3102 aesd $dat4,q8
3103 aesimc $dat4,$dat4
3104 vld1.32 {q8},[$key_],#16 // load key schedule...
3105 subs $rounds,$rounds,#2
3106 aesd $dat0,q9
3107 aesimc $dat0,$dat0
3108 aesd $dat1,q9
3109 aesimc $dat1,$dat1
3110 aesd $dat2,q9
3111 aesimc $dat2,$dat2
3112 aesd $dat3,q9
3113 aesimc $dat3,$dat3
3114 aesd $dat4,q9
3115 aesimc $dat4,$dat4
3116 vld1.32 {q9},[$key_],#16 // load key schedule...
3117 b.gt .Loop5x_xts_dec
3118
3119 aesd $dat0,q8
3120 aesimc $dat0,$dat0
3121 aesd $dat1,q8
3122 aesimc $dat1,$dat1
3123 aesd $dat2,q8
3124 aesimc $dat2,$dat2
3125 aesd $dat3,q8
3126 aesimc $dat3,$dat3
3127 aesd $dat4,q8
3128 aesimc $dat4,$dat4
3129 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3130
3131 aesd $dat0,q9
3132 aesimc $dat0,$dat0
3133 aesd $dat1,q9
3134 aesimc $dat1,$dat1
3135 aesd $dat2,q9
3136 aesimc $dat2,$dat2
3137 aesd $dat3,q9
3138 aesimc $dat3,$dat3
3139 aesd $dat4,q9
3140 aesimc $dat4,$dat4
3141 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3142 mov $key_,$key1
3143
3144 aesd $dat0,q10
3145 aesimc $dat0,$dat0
3146 aesd $dat1,q10
3147 aesimc $dat1,$dat1
3148 aesd $dat2,q10
3149 aesimc $dat2,$dat2
3150 aesd $dat3,q10
3151 aesimc $dat3,$dat3
3152 aesd $dat4,q10
3153 aesimc $dat4,$dat4
3154 add $inp,$inp,$xoffset // x0 is adjusted in such a way that
3155 // at exit from the loop v1.16b-v26.16b
3156 // are loaded with the last "words"
3157 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3158
3159 aesd $dat0,q11
3160 aesimc $dat0,$dat0
3161 aesd $dat1,q11
3162 aesimc $dat1,$dat1
3163 aesd $dat2,q11
3164 aesimc $dat2,$dat2
3165 aesd $dat3,q11
3166 aesimc $dat3,$dat3
3167 aesd $dat4,q11
3168 aesimc $dat4,$dat4
3169
3170 aesd $dat0,q12
3171 aesimc $dat0,$dat0
3172 aesd $dat1,q12
3173 aesimc $dat1,$dat1
3174 aesd $dat2,q12
3175 aesimc $dat2,$dat2
3176 aesd $dat3,q12
3177 aesimc $dat3,$dat3
3178 aesd $dat4,q12
3179 aesimc $dat4,$dat4
3180
3181 aesd $dat0,q13
3182 aesimc $dat0,$dat0
3183 aesd $dat1,q13
3184 aesimc $dat1,$dat1
3185 aesd $dat2,q13
3186 aesimc $dat2,$dat2
3187 aesd $dat3,q13
3188 aesimc $dat3,$dat3
3189 aesd $dat4,q13
3190 aesimc $dat4,$dat4
3191
3192 aesd $dat0,q14
3193 aesimc $dat0,$dat0
3194 aesd $dat1,q14
3195 aesimc $dat1,$dat1
3196 aesd $dat2,q14
3197 aesimc $dat2,$dat2
3198 aesd $dat3,q14
3199 aesimc $dat3,$dat3
3200 aesd $dat4,q14
3201 aesimc $dat4,$dat4
3202
3203 veor $tmp0,$rndlast,$iv0
3204 aesd $dat0,q15
3205 // The iv for first block of next iteration.
3206 extr $midnumx,$ivh,$ivh,#32
3207 extr $ivh,$ivh,$ivl,#63
3208 and $tmpmw,$constnum,$midnum,asr #31
3209 eor $ivl,$tmpmx,$ivl,lsl #1
3210 fmov $ivd00,$ivl
3211 fmov $ivd01,$ivh
3212 veor $tmp1,$rndlast,$iv1
3213 vld1.8 {$in0},[$inp],#16
3214 aesd $dat1,q15
3215 // The iv for second block
3216 extr $midnumx,$ivh,$ivh,#32
3217 extr $ivh,$ivh,$ivl,#63
3218 and $tmpmw,$constnum,$midnum,asr #31
3219 eor $ivl,$tmpmx,$ivl,lsl #1
3220 fmov $ivd10,$ivl
3221 fmov $ivd11,$ivh
3222 veor $tmp2,$rndlast,$iv2
3223 vld1.8 {$in1},[$inp],#16
3224 aesd $dat2,q15
3225 // The iv for third block
3226 extr $midnumx,$ivh,$ivh,#32
3227 extr $ivh,$ivh,$ivl,#63
3228 and $tmpmw,$constnum,$midnum,asr #31
3229 eor $ivl,$tmpmx,$ivl,lsl #1
3230 fmov $ivd20,$ivl
3231 fmov $ivd21,$ivh
3232 veor $tmp3,$rndlast,$iv3
3233 vld1.8 {$in2},[$inp],#16
3234 aesd $dat3,q15
3235 // The iv for fourth block
3236 extr $midnumx,$ivh,$ivh,#32
3237 extr $ivh,$ivh,$ivl,#63
3238 and $tmpmw,$constnum,$midnum,asr #31
3239 eor $ivl,$tmpmx,$ivl,lsl #1
3240 fmov $ivd30,$ivl
3241 fmov $ivd31,$ivh
3242 veor $tmp4,$rndlast,$iv4
3243 vld1.8 {$in3},[$inp],#16
3244 aesd $dat4,q15
3245
3246 // The iv for fifth block
3247 extr $midnumx,$ivh,$ivh,#32
3248 extr $ivh,$ivh,$ivl,#63
3249 and $tmpmw,$constnum,$midnum,asr #31
3250 eor $ivl,$tmpmx,$ivl,lsl #1
3251 fmov $ivd40,$ivl
3252 fmov $ivd41,$ivh
3253
3254 vld1.8 {$in4},[$inp],#16
3255 cbz $xoffset,.Lxts_dec_tail4x
3256 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3257 veor $tmp0,$tmp0,$dat0
3258 veor $dat0,$in0,$iv0
3259 veor $tmp1,$tmp1,$dat1
3260 veor $dat1,$in1,$iv1
3261 veor $tmp2,$tmp2,$dat2
3262 veor $dat2,$in2,$iv2
3263 veor $tmp3,$tmp3,$dat3
3264 veor $dat3,$in3,$iv3
3265 veor $tmp4,$tmp4,$dat4
3266 vst1.8 {$tmp0},[$out],#16
3267 veor $dat4,$in4,$iv4
3268 vst1.8 {$tmp1},[$out],#16
3269 mov $rounds,$rounds0
3270 vst1.8 {$tmp2},[$out],#16
3271 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3272 vst1.8 {$tmp3},[$out],#16
3273 vst1.8 {$tmp4},[$out],#16
3274 b.hs .Loop5x_xts_dec
3275
3276 cmn $len,#0x10
3277 b.ne .Loop5x_dec_after
3278 // If x2 ($len) equals -0x10, 4 blocks are left.
3279 // After special processing, the five-block path is reused.
3280 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3281 vorr $iv4,$iv3,$iv3
3282 vorr $iv3,$iv2,$iv2
3283 vorr $iv2,$iv1,$iv1
3284 vorr $iv1,$iv0,$iv0
3285 fmov $ivl,$ivd40
3286 fmov $ivh,$ivd41
3287 veor $dat0,$iv0,$in0
3288 veor $dat1,$iv1,$in1
3289 veor $dat2,$in2,$iv2
3290 veor $dat3,$in3,$iv3
3291 veor $dat4,$in4,$iv4
3292 b.eq .Loop5x_xts_dec
3293
3294 .Loop5x_dec_after:
3295 add $len,$len,#0x50
3296 cbz $len,.Lxts_done
3297
3298 add $rounds,$rounds0,#2
3299 subs $len,$len,#0x30
3300 b.lo .Lxts_inner_dec_tail
3301
3302 veor $dat0,$iv0,$in2
3303 veor $dat1,$iv1,$in3
3304 veor $dat2,$in4,$iv2
3305 b .Lxts_outer_dec_tail
3306
3307 .align 4
3308 .Lxts_dec_tail4x:
3309 add $inp,$inp,#16
3310 vld1.32 {$dat0},[$inp],#16
3311 veor $tmp1,$dat1,$tmp0
3312 vst1.8 {$tmp1},[$out],#16
3313 veor $tmp2,$dat2,$tmp2
3314 vst1.8 {$tmp2},[$out],#16
3315 veor $tmp3,$dat3,$tmp3
3316 veor $tmp4,$dat4,$tmp4
3317 vst1.8 {$tmp3-$tmp4},[$out],#32
3318
3319 b .Lxts_done
3320 .align 4
3321 .Lxts_outer_dec_tail:
3322 aesd $dat0,q8
3323 aesimc $dat0,$dat0
3324 aesd $dat1,q8
3325 aesimc $dat1,$dat1
3326 aesd $dat2,q8
3327 aesimc $dat2,$dat2
3328 vld1.32 {q8},[$key_],#16
3329 subs $rounds,$rounds,#2
3330 aesd $dat0,q9
3331 aesimc $dat0,$dat0
3332 aesd $dat1,q9
3333 aesimc $dat1,$dat1
3334 aesd $dat2,q9
3335 aesimc $dat2,$dat2
3336 vld1.32 {q9},[$key_],#16
3337 b.gt .Lxts_outer_dec_tail
3338
3339 aesd $dat0,q8
3340 aesimc $dat0,$dat0
3341 aesd $dat1,q8
3342 aesimc $dat1,$dat1
3343 aesd $dat2,q8
3344 aesimc $dat2,$dat2
3345 veor $tmp0,$iv0,$rndlast
3346 subs $len,$len,#0x30
3347 // The iv for first block
3348 fmov $ivl,$ivd20
3349 fmov $ivh,$ivd21
3350 mov $constnum,#0x87
3351 extr $midnumx,$ivh,$ivh,#32
3352 extr $ivh,$ivh,$ivl,#63
3353 and $tmpmw,$constnum,$midnum,asr #31
3354 eor $ivl,$tmpmx,$ivl,lsl #1
3355 fmov $ivd00,$ivl
3356 fmov $ivd01,$ivh
3357 veor $tmp1,$iv1,$rndlast
3358 csel $xoffset,$len,$xoffset,lo // x6/w6 is zero at this point
3359 aesd $dat0,q9
3360 aesimc $dat0,$dat0
3361 aesd $dat1,q9
3362 aesimc $dat1,$dat1
3363 aesd $dat2,q9
3364 aesimc $dat2,$dat2
3365 veor $tmp2,$iv2,$rndlast
3366 // The iv for second block
3367 extr $midnumx,$ivh,$ivh,#32
3368 extr $ivh,$ivh,$ivl,#63
3369 and $tmpmw,$constnum,$midnum,asr #31
3370 eor $ivl,$tmpmx,$ivl,lsl #1
3371 fmov $ivd10,$ivl
3372 fmov $ivd11,$ivh
3373
3374 add $xoffset,$xoffset,#0x20
3375 add $inp,$inp,$xoffset // $inp is adjusted to point at the last data
3376
3377 mov $key_,$key1
3378
3379 // The iv for third block
3380 extr $midnumx,$ivh,$ivh,#32
3381 extr $ivh,$ivh,$ivl,#63
3382 and $tmpmw,$constnum,$midnum,asr #31
3383 eor $ivl,$tmpmx,$ivl,lsl #1
3384 fmov $ivd20,$ivl
3385 fmov $ivd21,$ivh
3386
3387 aesd $dat0,q12
3388 aesimc $dat0,$dat0
3389 aesd $dat1,q12
3390 aesimc $dat1,$dat1
3391 aesd $dat2,q12
3392 aesimc $dat2,$dat2
3393 aesd $dat0,q13
3394 aesimc $dat0,$dat0
3395 aesd $dat1,q13
3396 aesimc $dat1,$dat1
3397 aesd $dat2,q13
3398 aesimc $dat2,$dat2
3399 aesd $dat0,q14
3400 aesimc $dat0,$dat0
3401 aesd $dat1,q14
3402 aesimc $dat1,$dat1
3403 aesd $dat2,q14
3404 aesimc $dat2,$dat2
3405 vld1.8 {$in2},[$inp],#16
3406 aesd $dat0,q15
3407 aesd $dat1,q15
3408 aesd $dat2,q15
3409 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3410 add $rounds,$rounds0,#2
3411 veor $tmp0,$tmp0,$dat0
3412 veor $tmp1,$tmp1,$dat1
3413 veor $dat2,$dat2,$tmp2
3414 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3415 vst1.8 {$tmp0},[$out],#16
3416 vst1.8 {$tmp1},[$out],#16
3417 vst1.8 {$dat2},[$out],#16
3418
3419 cmn $len,#0x30
3420 add $len,$len,#0x30
3421 b.eq .Lxts_done
3422 sub $len,$len,#0x30
3423 vorr $in3,$in1,$in1
3424 vorr $in4,$in2,$in2
3425 nop
3426
3427 .Lxts_inner_dec_tail:
3428 // $len == -0x10 means two blocks left.
3429 cmn $len,#0x10
3430 veor $dat1,$in3,$iv0
3431 veor $dat2,$in4,$iv1
3432 b.eq .Lxts_dec_tail_loop
3433 veor $dat2,$in4,$iv0
3434 .Lxts_dec_tail_loop:
3435 aesd $dat1,q8
3436 aesimc $dat1,$dat1
3437 aesd $dat2,q8
3438 aesimc $dat2,$dat2
3439 vld1.32 {q8},[$key_],#16
3440 subs $rounds,$rounds,#2
3441 aesd $dat1,q9
3442 aesimc $dat1,$dat1
3443 aesd $dat2,q9
3444 aesimc $dat2,$dat2
3445 vld1.32 {q9},[$key_],#16
3446 b.gt .Lxts_dec_tail_loop
3447
3448 aesd $dat1,q8
3449 aesimc $dat1,$dat1
3450 aesd $dat2,q8
3451 aesimc $dat2,$dat2
3452 aesd $dat1,q9
3453 aesimc $dat1,$dat1
3454 aesd $dat2,q9
3455 aesimc $dat2,$dat2
3456 aesd $dat1,q12
3457 aesimc $dat1,$dat1
3458 aesd $dat2,q12
3459 aesimc $dat2,$dat2
3460 cmn $len,#0x20
3461 aesd $dat1,q13
3462 aesimc $dat1,$dat1
3463 aesd $dat2,q13
3464 aesimc $dat2,$dat2
3465 veor $tmp1,$iv0,$rndlast
3466 aesd $dat1,q14
3467 aesimc $dat1,$dat1
3468 aesd $dat2,q14
3469 aesimc $dat2,$dat2
3470 veor $tmp2,$iv1,$rndlast
3471 aesd $dat1,q15
3472 aesd $dat2,q15
3473 b.eq .Lxts_dec_one
3474 veor $tmp1,$tmp1,$dat1
3475 veor $tmp2,$tmp2,$dat2
3476 vorr $iv0,$iv2,$iv2
3477 vorr $iv1,$iv3,$iv3
3478 vst1.8 {$tmp1},[$out],#16
3479 vst1.8 {$tmp2},[$out],#16
3480 add $len,$len,#16
3481 b .Lxts_done
3482
3483 .Lxts_dec_one:
3484 veor $tmp1,$tmp1,$dat2
3485 vorr $iv0,$iv1,$iv1
3486 vorr $iv1,$iv2,$iv2
3487 vst1.8 {$tmp1},[$out],#16
3488 add $len,$len,#32
3489
3490 .Lxts_done:
3491 tst $tailcnt,#0xf
3492 b.eq .Lxts_dec_abort
3493 // Process the last two blocks with ciphertext stealing.
3494 mov x7,x3 // borrow x7 as the key schedule pointer (x3 is key1)
3495 cbnz x2,.Lxts_dec_1st_done
3496 vld1.32 {$dat0},[$inp],#16
3497
3498 // Decrypt the second-to-last ciphertext block to get the last plaintext block
3499 .Lxts_dec_1st_done:
3500 eor $tmpin,$dat0,$iv1
3501 ldr $rounds,[$key1,#240]
3502 vld1.32 {$dat0},[$key1],#16
3503 sub $rounds,$rounds,#2
3504 vld1.32 {$dat1},[$key1],#16
3505 .Loop_final_2nd_dec:
3506 aesd $tmpin,$dat0
3507 aesimc $tmpin,$tmpin
3508 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3509 subs $rounds,$rounds,#2
3510 aesd $tmpin,$dat1
3511 aesimc $tmpin,$tmpin
3512 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3513 b.gt .Loop_final_2nd_dec
3514
3515 aesd $tmpin,$dat0
3516 aesimc $tmpin,$tmpin
3517 vld1.32 {$dat0},[$key1]
3518 aesd $tmpin,$dat1
3519 veor $tmpin,$tmpin,$dat0
3520 veor $tmpin,$tmpin,$iv1
3521 vst1.8 {$tmpin},[$out]
3522
3523 mov $tmpinp,$inp
3524 add $tmpoutp,$out,#16
3525
3526 // Splice the tailcnt unaligned tail bytes into the block just decrypted
3527 // to form the composite block, whose decryption gives the second-to-last plaintext block.
3528 .composite_dec_loop:
3529 subs $tailcnt,$tailcnt,#1
3530 ldrb $l2outp,[$out,$tailcnt]
3531 ldrb $loutp,[$tmpinp,$tailcnt]
3532 strb $l2outp,[$tmpoutp,$tailcnt]
3533 strb $loutp,[$out,$tailcnt]
3534 b.gt .composite_dec_loop
3535 .Lxts_dec_load_done:
3536 vld1.8 {$tmpin},[$out]
3537 veor $tmpin,$tmpin,$iv0
3538
3539 // Decrypt the composite block to get the second-to-last plaintext block
3540 ldr $rounds,[$key_,#240]
3541 vld1.8 {$dat},[$key_],#16
3542 sub $rounds,$rounds,#2
3543 vld1.8 {$dat1},[$key_],#16
3544 .Loop_final_dec:
3545 aesd $tmpin,$dat0
3546 aesimc $tmpin,$tmpin
3547 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3548 subs $rounds,$rounds,#2
3549 aesd $tmpin,$dat1
3550 aesimc $tmpin,$tmpin
3551 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3552 b.gt .Loop_final_dec
3553
3554 aesd $tmpin,$dat0
3555 aesimc $tmpin,$tmpin
3556 vld1.32 {$dat0},[$key_]
3557 aesd $tmpin,$dat1
3558 veor $tmpin,$tmpin,$dat0
3559 veor $tmpin,$tmpin,$iv0
3560 vst1.8 {$tmpin},[$out]
3561
3562 .Lxts_dec_abort:
3563 ldp $tailcnt,$midnumx,[sp,#48]
3564 ldp $ivd10,$ivd20,[sp,#32]
3565 ldp $ivd30,$ivd40,[sp,#16]
3566 ldp $constnumx,$tmpinp,[sp],#64
3567
3568 .Lxts_dec_final_abort:
3569 ret
3570 .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3571 ___
3572 }
3573 }}}
3574 $code.=<<___;
3575 #endif
3576 ___
3577 ########################################
3578 if ($flavour =~ /64/) { ######## 64-bit code
3579 my %opcode = (
3580 "aesd" => 0x4e285800, "aese" => 0x4e284800,
3581 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3582
3583 local *unaes = sub {
3584 my ($mnemonic,$arg)=@_;
3585
3586 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
3587 sprintf ".inst\t0x%08x\t//%s %s",
3588 $opcode{$mnemonic}|$1|($2<<5),
3589 $mnemonic,$arg;
3590 };
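# The closure above packs the destination register into bits [4:0] and
# the source register into bits [9:5] of the fixed opcode, so e.g.
# "aese v0.16b,v1.16b" would emit (hand-worked, illustrative):
#   0x4e284800 | 0 | (1<<5)  ->  .inst 0x4e284820 //aese v0.16b,v1.16b
# It is only needed for assemblers that lack the AES mnemonics; the
# corresponding substitution in the loop below is commented out.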
3591
3592 foreach(split("\n",$code)) {
3593 s/\`([^\`]*)\`/eval($1)/geo;
3594
3595 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
3596 s/@\s/\/\//o; # old->new style commentary
3597
3598 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3599 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
3600 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
3601 s/vmov\.i8/movi/o or # fix up legacy mnemonics
3602 s/vext\.8/ext/o or
3603 s/vrev32\.8/rev32/o or
3604 s/vtst\.8/cmtst/o or
3605 s/vshr/ushr/o or
3606 s/^(\s+)v/$1/o or # strip off v prefix
3607 s/\bbx\s+lr\b/ret/o;
3608
3609 # fix up remaining legacy suffixes
3610 s/\.[ui]?8//o;
3611 m/\],#8/o and s/\.16b/\.8b/go;
3612 s/\.[ui]?32//o and s/\.16b/\.4s/go;
3613 s/\.[ui]?64//o and s/\.16b/\.2d/go;
3614 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
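# At this point a legacy line such as "vld1.32 {q8},[x7],#16" has
# become "ld1 {v16.4s},[x7],#16" (q8 maps to v16 under the register
# renumbering above); illustrative, not an added rule.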
3615
3616 # Switch preprocessor checks to aarch64 versions.
3617 s/__ARME([BL])__/__AARCH64E$1__/go;
3618
3619 print $_,"\n";
3620 }
3621 } else { ######## 32-bit code
3622 my %opcode = (
3623 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
3624 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3625
3626 local *unaes = sub {
3627 my ($mnemonic,$arg)=@_;
3628
3629 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3630 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3631 |(($2&7)<<1) |(($2&8)<<2);
3632 # since ARMv7 instructions are always encoded little-endian.
3633 # correct solution is to use .inst directive, but older
3634 # assemblers don't implement it:-(
3635 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3636 $word&0xff,($word>>8)&0xff,
3637 ($word>>16)&0xff,($word>>24)&0xff,
3638 $mnemonic,$arg;
3639 }
3640 };
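# Worked example (illustrative): "aese q0,q1" encodes to the word
# 0xf3b00302 and is emitted as INST(0x02,0x03,0xb0,0xf3), i.e. in
# little-endian byte order regardless of how the file is assembled.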
3641
3642 sub unvtbl {
3643 my $arg=shift;
3644
3645 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3646 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
3647 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3648 }
3649
3650 sub unvdup32 {
3651 my $arg=shift;
3652
3653 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3654 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3655 }
3656
3657 sub unvmov32 {
3658 my $arg=shift;
3659
3660 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3661 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3662 }
3663
3664 foreach(split("\n",$code)) {
3665 s/\`([^\`]*)\`/eval($1)/geo;
3666
3667 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
3668 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
3669 s/\/\/\s?/@ /o; # new->old style commentary
3670
3671 # fix up remaining new-style suffixes
3672 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
3673 s/\],#[0-9]+/]!/o;
3674
3675 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3676 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
3677 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
3678 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
3679 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
3680 s/^(\s+)b\./$1b/o or
3681 s/^(\s+)ret/$1bx\tlr/o;
3682
3683 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3684 print " it $2\n";
3685 }
3686
3687 print $_,"\n";
3688 }
3689 }
3690
3691 close STDOUT or die "error closing STDOUT: $!";