#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes of
# operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on the mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc	CBC dec	CTR
# Apple A7	2.39	1.20	1.20
# Cortex-A53	2.45	1.87	1.94
# Cortex-A57	3.64	1.34	1.32

$flavour = shift;
open STDOUT,">".shift;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is to
# maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo.
#
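# As an illustration of that transliteration (a minimal, hypothetical
# example, not a line taken from this module), the 64-bit post-processing
# loop at the bottom of this file rewrites a "common" line roughly as
# follows:
#
#	my $line = "\tvld1.32\t{q8},[x7],#16";
#	$line =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;	# q8 -> v16.16b
#	$line =~ s/^(\s+)v/$1/;					# drop legacy 'v' prefix
#	$line =~ s/\.[ui]?32// and $line =~ s/\.16b/.4s/g;	# fix element size
#	# $line is now "\tld1\t{v16.4s},[x7],#16"
#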
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

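// Note on the expansion loops below: the rotate-n-splat vtbl mask
// replicates the rotated last word of the previous key material into
// every lane, and aese against an all-zero round key reduces to
// SubBytes plus ShiftRows; with all four lanes equal, ShiftRows is a
// no-op, so each lane ends up holding SubWord(RotWord(w)), ready to be
// folded in together with rcon.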
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

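	// Convert the freshly generated encryption schedule into a
	// decryption schedule in place: round keys are swapped end-for-end
	// and all but the outermost pair are run through aesimc
	// (InvMixColumns), as expected by the aesd-based decryption code
	// elsewhere in this module.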
	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
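# Decryption below is interleaved three blocks at a time so that the
# result latency of each aesd/aesimc pair is hidden behind work on the
# other two blocks; unlike CBC encryption, there is no inter-block
# dependency that would prevent this.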
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	veor	$tmp0,$ivec,$rndlast
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	veor	$tmp2,$in1,$rndlast
	subs	$len,$len,#0x30
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	vorr	$ivec,$in2,$in2
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesd	$dat2,q12
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	mov	$key_,$key
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesd	$dat2,q13
	vld1.8	{$in0},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesd	$dat2,q14
	vld1.8	{$in2},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15

	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesd	$dat2,q8
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesd	$dat2,q9
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesd	$dat2,q12
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesd	$dat2,q13
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesd	$dat2,q14
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

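# As in CBC decrypt, three blocks are processed per iteration. The 32-bit
# counter word is kept in general-purpose registers (byte-reversed on
# little-endian so it can be incremented as an integer) and written into
# lane 3 of each input block.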
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aese	$dat1,q8
	aese	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aesmc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aese	$dat1,q9
	aese	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aesmc	$dat2,$dat2
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aese	$dat1,q8
	aese	$dat2,q8
	mov	$key_,$key
	aesmc	$tmp0,$dat0
	vld1.8	{$in0},[$inp],#16
	aesmc	$tmp1,$dat1
	aesmc	$dat2,$dat2
	vorr	$dat0,$ivec,$ivec
	aese	$tmp0,q9
	vld1.8	{$in1},[$inp],#16
	aese	$tmp1,q9
	aese	$dat2,q9
	vorr	$dat1,$ivec,$ivec
	aesmc	$tmp0,$tmp0
	vld1.8	{$in2},[$inp],#16
	aesmc	$tmp1,$tmp1
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aese	$tmp1,q12
	aese	$tmp2,q12
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aese	$tmp1,q13
	aese	$tmp2,q13
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aesmc	$tmp0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aesmc	$tmp1,$tmp1
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aese	$tmp1,q14
	aese	$tmp2,q14
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	mov	$cnt,$rounds
	veor	$in0,$in0,$tmp0
	veor	$in1,$in1,$tmp1
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in0},[$out],#16
	vst1.8	{$in1},[$out],#16
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aese	$dat1,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aese	$dat1,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aese	$dat1,q8
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aese	$dat1,q9
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aese	$dat1,q12
	vld1.8	{$in1},[$inp]
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q13
	aese	$dat1,q13
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q14
	aese	$dat1,q14
	veor	$in0,$in0,$rndlast
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian.
	    # The correct solution is to use the .inst directive, but
	    # older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
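    # For illustration (hypothetical operands, shown only to document the
    # sub above): unaes("aese","q0,q15") would return
    #	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
    # i.e. the little-endian byte image of the instruction word it computes.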

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
		"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;