1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# June 2015
18#
19# ChaCha20 for ARMv8.
20#
21# Performance in cycles per byte out of large buffer.
22#
23# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
24#
25# Apple A7 5.50/+49% 3.33 1.70
26# Cortex-A53 8.40/+80% 4.72 4.72(*)
27# Cortex-A57 8.06/+43% 4.90 4.43(**)
28# Denver 4.50/+82% 2.63 2.67(*)
29# X-Gene 9.50/+46% 8.82 8.89(*)
30# Mongoose 8.00/+44% 3.64 3.25
31# Kryo 8.17/+50% 4.83 4.65
32# ThunderX2 7.26/+48% 7.91 4.30
33#
34# (*) it's expected that doubling the interleave factor doesn't help
35# all processors, only those with higher NEON latency and
36# higher instruction issue rate;
37# (**) the expected improvement was actually higher;
38
39$flavour=shift;
40$output=shift;
41
42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
44( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
45die "can't locate arm-xlate.pl";
46
47open OUT,"| \"$^X\" $xlate $flavour $output";
48*STDOUT=*OUT;
49
50sub AUTOLOAD() # thunk [simplified] x86-style perlasm
51{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
52 my $arg = pop;
53 $arg = "#$arg" if ($arg*1 eq $arg);
54 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
55}
56
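# The AUTOLOAD thunk above is what lets the code generators below "call"
# instructions as if they were Perl subs: any undefined sub name has its
# underscores turned into dots and is appended to $code as one line of
# assembly, with a purely numeric last argument getting the '#' immediate
# prefix.  An illustrative expansion (register names are just examples,
# not the ones allocated below):
#
#	&add_32	("x5","x5","x9");	# appends "\tadd.32\tx5,x5,x9\n"
#	&ror_32	("x16","x16",16);	# appends "\tror.32\tx16,x16,#16\n"
#
# The ".32" pseudo-suffix is not real AArch64 syntax; the post-processing
# loop at the bottom of this file drops it and rewrites the x registers on
# such lines to their 32-bit w aliases.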
57my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
58
59my @x=map("x$_",(5..17,19..21));
60my @d=map("x$_",(22..28,30));
61
62sub ROUND {
63my ($a0,$b0,$c0,$d0)=@_;
64my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
65my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
66my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
67
68 (
69 "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
70 "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
71 "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
72 "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
73 "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
74 "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
75 "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
76 "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
77 "&ror_32 (@x[$d0],@x[$d0],16)",
78 "&ror_32 (@x[$d1],@x[$d1],16)",
79 "&ror_32 (@x[$d2],@x[$d2],16)",
80 "&ror_32 (@x[$d3],@x[$d3],16)",
81
82 "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
83 "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
84 "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
85 "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
86 "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
87 "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
88 "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
89 "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
90 "&ror_32 (@x[$b0],@x[$b0],20)",
91 "&ror_32 (@x[$b1],@x[$b1],20)",
92 "&ror_32 (@x[$b2],@x[$b2],20)",
93 "&ror_32 (@x[$b3],@x[$b3],20)",
94
95 "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
96 "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
97 "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
98 "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
99 "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
100 "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
101 "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
102 "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
103 "&ror_32 (@x[$d0],@x[$d0],24)",
104 "&ror_32 (@x[$d1],@x[$d1],24)",
105 "&ror_32 (@x[$d2],@x[$d2],24)",
106 "&ror_32 (@x[$d3],@x[$d3],24)",
107
108 "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
109 "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
110 "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
111 "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
112 "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
113 "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
114 "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
115 "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
116 "&ror_32 (@x[$b0],@x[$b0],25)",
117 "&ror_32 (@x[$b1],@x[$b1],25)",
118 "&ror_32 (@x[$b2],@x[$b2],25)",
119 "&ror_32 (@x[$b3],@x[$b3],25)"
120 );
121}
122
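# For reference, each group of instructions emitted by ROUND() above is the
# standard ChaCha20 quarter-round; the left rotations by 16/12/8/7 appear
# there as right rotations by 16/20/24/25.  A commented Perl sketch of one
# quarter-round on plain 32-bit words (illustrative only, not used by the
# generator):
#
#	sub chacha_quarter_round {
#	    my ($a,$b,$c,$d) = @_;
#	    my $rotl = sub { (($_[0]<<$_[1]) | ($_[0]>>(32-$_[1]))) & 0xffffffff };
#	    $a = ($a+$b) & 0xffffffff;	$d ^= $a;	$d = $rotl->($d,16);
#	    $c = ($c+$d) & 0xffffffff;	$b ^= $c;	$b = $rotl->($b,12);
#	    $a = ($a+$b) & 0xffffffff;	$d ^= $a;	$d = $rotl->($d, 8);
#	    $c = ($c+$d) & 0xffffffff;	$b ^= $c;	$b = $rotl->($b, 7);
#	    return ($a,$b,$c,$d);
#	}
#
# ROUND() emits four such quarter-rounds at once, interleaved across the
# sixteen state registers in @x to hide instruction latency; the two calls
# in the main loop cover the column (0,4,8,12) and diagonal (0,5,10,15)
# halves of a double round, while @d keeps the original key block packed
# two 32-bit words per 64-bit register.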
123$code.=<<___;
124#include "arm_arch.h"
125
126.text
127
128.extern OPENSSL_armcap_P
129
130.align 5
131.Lsigma:
132.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
133.Lone:
134.long 1,0,0,0
135.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
136
137.globl ChaCha20_ctr32
138.type ChaCha20_ctr32,%function
139.align 5
140ChaCha20_ctr32:
141 cbz $len,.Labort
142 cmp $len,#192
143 b.lo .Lshort
144
145 adrp x17,OPENSSL_armcap_P
146 ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
147 tst w17,#ARMV7_NEON
148 b.ne .LChaCha20_neon
149
150.Lshort:
151 .inst 0xd503233f // paciasp
152 stp x29,x30,[sp,#-96]!
153 add x29,sp,#0
154
155 adr @x[0],.Lsigma
156 stp x19,x20,[sp,#16]
157 stp x21,x22,[sp,#32]
158 stp x23,x24,[sp,#48]
159 stp x25,x26,[sp,#64]
160 stp x27,x28,[sp,#80]
161 sub sp,sp,#64
162
163 ldp @d[0],@d[1],[@x[0]] // load sigma
164 ldp @d[2],@d[3],[$key] // load key
165 ldp @d[4],@d[5],[$key,#16]
166 ldp @d[6],@d[7],[$ctr] // load counter
167#ifdef __ARMEB__
168 ror @d[2],@d[2],#32
169 ror @d[3],@d[3],#32
170 ror @d[4],@d[4],#32
171 ror @d[5],@d[5],#32
172 ror @d[6],@d[6],#32
173 ror @d[7],@d[7],#32
174#endif
175
176.Loop_outer:
177 mov.32 @x[0],@d[0] // unpack key block
178 lsr @x[1],@d[0],#32
179 mov.32 @x[2],@d[1]
180 lsr @x[3],@d[1],#32
181 mov.32 @x[4],@d[2]
182 lsr @x[5],@d[2],#32
183 mov.32 @x[6],@d[3]
184 lsr @x[7],@d[3],#32
185 mov.32 @x[8],@d[4]
186 lsr @x[9],@d[4],#32
187 mov.32 @x[10],@d[5]
188 lsr @x[11],@d[5],#32
189 mov.32 @x[12],@d[6]
190 lsr @x[13],@d[6],#32
191 mov.32 @x[14],@d[7]
192 lsr @x[15],@d[7],#32
193
194 mov $ctr,#10
195 subs $len,$len,#64
196.Loop:
197 sub $ctr,$ctr,#1
198___
199 foreach (&ROUND(0, 4, 8,12)) { eval; }
200 foreach (&ROUND(0, 5,10,15)) { eval; }
201$code.=<<___;
202 cbnz $ctr,.Loop
203
204 add.32 @x[0],@x[0],@d[0] // accumulate key block
205 add @x[1],@x[1],@d[0],lsr#32
206 add.32 @x[2],@x[2],@d[1]
207 add @x[3],@x[3],@d[1],lsr#32
208 add.32 @x[4],@x[4],@d[2]
209 add @x[5],@x[5],@d[2],lsr#32
210 add.32 @x[6],@x[6],@d[3]
211 add @x[7],@x[7],@d[3],lsr#32
212 add.32 @x[8],@x[8],@d[4]
213 add @x[9],@x[9],@d[4],lsr#32
214 add.32 @x[10],@x[10],@d[5]
215 add @x[11],@x[11],@d[5],lsr#32
216 add.32 @x[12],@x[12],@d[6]
217 add @x[13],@x[13],@d[6],lsr#32
218 add.32 @x[14],@x[14],@d[7]
219 add @x[15],@x[15],@d[7],lsr#32
220
221 b.lo .Ltail
222
223 add @x[0],@x[0],@x[1],lsl#32 // pack
224 add @x[2],@x[2],@x[3],lsl#32
225 ldp @x[1],@x[3],[$inp,#0] // load input
226 add @x[4],@x[4],@x[5],lsl#32
227 add @x[6],@x[6],@x[7],lsl#32
228 ldp @x[5],@x[7],[$inp,#16]
229 add @x[8],@x[8],@x[9],lsl#32
230 add @x[10],@x[10],@x[11],lsl#32
231 ldp @x[9],@x[11],[$inp,#32]
232 add @x[12],@x[12],@x[13],lsl#32
233 add @x[14],@x[14],@x[15],lsl#32
234 ldp @x[13],@x[15],[$inp,#48]
235 add $inp,$inp,#64
236#ifdef __ARMEB__
237 rev @x[0],@x[0]
238 rev @x[2],@x[2]
239 rev @x[4],@x[4]
240 rev @x[6],@x[6]
241 rev @x[8],@x[8]
242 rev @x[10],@x[10]
243 rev @x[12],@x[12]
244 rev @x[14],@x[14]
245#endif
246 eor @x[0],@x[0],@x[1]
247 eor @x[2],@x[2],@x[3]
248 eor @x[4],@x[4],@x[5]
249 eor @x[6],@x[6],@x[7]
250 eor @x[8],@x[8],@x[9]
251 eor @x[10],@x[10],@x[11]
252 eor @x[12],@x[12],@x[13]
253 eor @x[14],@x[14],@x[15]
254
255 stp @x[0],@x[2],[$out,#0] // store output
256 add @d[6],@d[6],#1 // increment counter
257 stp @x[4],@x[6],[$out,#16]
258 stp @x[8],@x[10],[$out,#32]
259 stp @x[12],@x[14],[$out,#48]
260 add $out,$out,#64
261
262 b.hi .Loop_outer
263
264 ldp x19,x20,[x29,#16]
265 add sp,sp,#64
266 ldp x21,x22,[x29,#32]
267 ldp x23,x24,[x29,#48]
268 ldp x25,x26,[x29,#64]
269 ldp x27,x28,[x29,#80]
270 ldp x29,x30,[sp],#96
271 .inst 0xd50323bf // autiasp
272.Labort:
273 ret
274
275.align 4
276.Ltail:
277 add $len,$len,#64
278.Less_than_64:
279 sub $out,$out,#1
280 add $inp,$inp,$len
281 add $out,$out,$len
282 add $ctr,sp,$len
283 neg $len,$len
284
285 add @x[0],@x[0],@x[1],lsl#32 // pack
286 add @x[2],@x[2],@x[3],lsl#32
287 add @x[4],@x[4],@x[5],lsl#32
288 add @x[6],@x[6],@x[7],lsl#32
289 add @x[8],@x[8],@x[9],lsl#32
290 add @x[10],@x[10],@x[11],lsl#32
291 add @x[12],@x[12],@x[13],lsl#32
292 add @x[14],@x[14],@x[15],lsl#32
293#ifdef __ARMEB__
294 rev @x[0],@x[0]
295 rev @x[2],@x[2]
296 rev @x[4],@x[4]
297 rev @x[6],@x[6]
298 rev @x[8],@x[8]
299 rev @x[10],@x[10]
300 rev @x[12],@x[12]
301 rev @x[14],@x[14]
302#endif
303 stp @x[0],@x[2],[sp,#0]
304 stp @x[4],@x[6],[sp,#16]
305 stp @x[8],@x[10],[sp,#32]
306 stp @x[12],@x[14],[sp,#48]
307
308.Loop_tail:
309 ldrb w10,[$inp,$len]
310 ldrb w11,[$ctr,$len]
311 add $len,$len,#1
312 eor w10,w10,w11
313 strb w10,[$out,$len]
314 cbnz $len,.Loop_tail
315
316 stp xzr,xzr,[sp,#0]
317 stp xzr,xzr,[sp,#16]
318 stp xzr,xzr,[sp,#32]
319 stp xzr,xzr,[sp,#48]
320
321 ldp x19,x20,[x29,#16]
322 add sp,sp,#64
323 ldp x21,x22,[x29,#32]
324 ldp x23,x24,[x29,#48]
325 ldp x25,x26,[x29,#64]
326 ldp x27,x28,[x29,#80]
327 ldp x29,x30,[sp],#96
328 .inst 0xd50323bf // autiasp
329 ret
330.size ChaCha20_ctr32,.-ChaCha20_ctr32
331___
332
333{{{
334my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
335 map("v$_.4s",(0..7,16..23));
336my (@K)=map("v$_.4s",(24..30));
337my $ONE="v31.4s";
338
339sub NEONROUND {
340my $odd = pop;
341my ($a,$b,$c,$d,$t)=@_;
342
343 (
344 "&add ('$a','$a','$b')",
345 "&eor ('$d','$d','$a')",
346 "&rev32_16 ('$d','$d')", # vrot ($d,16)
347
348 "&add ('$c','$c','$d')",
349 "&eor ('$t','$b','$c')",
350 "&ushr ('$b','$t',20)",
351 "&sli ('$b','$t',12)",
352
353 "&add ('$a','$a','$b')",
354 "&eor ('$t','$d','$a')",
355 "&ushr ('$d','$t',24)",
356 "&sli ('$d','$t',8)",
357
358 "&add ('$c','$c','$d')",
359 "&eor ('$t','$b','$c')",
360 "&ushr ('$b','$t',25)",
361 "&sli ('$b','$t',7)",
362
363 "&ext ('$c','$c','$c',8)",
364 "&ext ('$d','$d','$d',$odd?4:12)",
365 "&ext ('$b','$b','$b',$odd?12:4)"
366 );
367}
368
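# NEONROUND() is the same quarter-round on four 32-bit lanes, one 64-byte
# block per set of $a/$b/$c/$d vectors.  AArch64 SIMD has no vector rotate,
# so the rotate by 16 is done with rev32 on 16-bit lanes and the remaining
# rotates with an unsigned shift right into a temporary followed by a
# shift-left-and-insert.  Illustrative expansion of the "rotate left by 12"
# step for the first NEON block ($b=$B0=v1, $c=$C0=v2, $t=$T0=v20), as it
# looks after the final syntax translation pass:
#
#	eor	v20.16b,v1.16b,v2.16b	// $t  = $b ^ $c
#	ushr	v1.4s,v20.4s,#20	// $b  = $t >> 20
#	sli	v1.4s,v20.4s,#12	// $b |= $t << 12, i.e. rotl($t,12)
#
# The three trailing ext instructions rotate the lanes of $c, $d and $b so
# that the same code serves both column and diagonal rounds; $odd selects
# the forward or the inverse lane permutation.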
369$code.=<<___;
370
371.type ChaCha20_neon,%function
372.align 5
373ChaCha20_neon:
374.LChaCha20_neon:
375 .inst 0xd503233f // paciasp
376 stp x29,x30,[sp,#-96]!
377 add x29,sp,#0
378
379 adr @x[0],.Lsigma
380 stp x19,x20,[sp,#16]
381 stp x21,x22,[sp,#32]
382 stp x23,x24,[sp,#48]
383 stp x25,x26,[sp,#64]
384 stp x27,x28,[sp,#80]
385 cmp $len,#512
386 b.hs .L512_or_more_neon
387
388 sub sp,sp,#64
389
390 ldp @d[0],@d[1],[@x[0]] // load sigma
391 ld1 {@K[0]},[@x[0]],#16
392 ldp @d[2],@d[3],[$key] // load key
393 ldp @d[4],@d[5],[$key,#16]
394 ld1 {@K[1],@K[2]},[$key]
395 ldp @d[6],@d[7],[$ctr] // load counter
396 ld1 {@K[3]},[$ctr]
397 ld1 {$ONE},[@x[0]]
398#ifdef __ARMEB__
399 rev64 @K[0],@K[0]
400 ror @d[2],@d[2],#32
401 ror @d[3],@d[3],#32
402 ror @d[4],@d[4],#32
403 ror @d[5],@d[5],#32
404 ror @d[6],@d[6],#32
405 ror @d[7],@d[7],#32
406#endif
407 add @K[3],@K[3],$ONE // += 1
408 add @K[4],@K[3],$ONE
409 add @K[5],@K[4],$ONE
410 shl $ONE,$ONE,#2 // 1 -> 4
411
412.Loop_outer_neon:
413 mov.32 @x[0],@d[0] // unpack key block
414 lsr @x[1],@d[0],#32
415 mov $A0,@K[0]
416 mov.32 @x[2],@d[1]
417 lsr @x[3],@d[1],#32
418 mov $A1,@K[0]
419 mov.32 @x[4],@d[2]
420 lsr @x[5],@d[2],#32
421 mov $A2,@K[0]
422 mov.32 @x[6],@d[3]
423 mov $B0,@K[1]
424 lsr @x[7],@d[3],#32
425 mov $B1,@K[1]
426 mov.32 @x[8],@d[4]
427 mov $B2,@K[1]
428 lsr @x[9],@d[4],#32
429 mov $D0,@K[3]
430 mov.32 @x[10],@d[5]
431 mov $D1,@K[4]
432 lsr @x[11],@d[5],#32
433 mov $D2,@K[5]
434 mov.32 @x[12],@d[6]
435 mov $C0,@K[2]
436 lsr @x[13],@d[6],#32
437 mov $C1,@K[2]
438 mov.32 @x[14],@d[7]
439 mov $C2,@K[2]
440 lsr @x[15],@d[7],#32
441
442 mov $ctr,#10
443 subs $len,$len,#256
444.Loop_neon:
445 sub $ctr,$ctr,#1
446___
447 my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
448 my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
449 my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
450 my @thread3=&ROUND(0,4,8,12);
451
452 foreach (@thread0) {
453 eval; eval(shift(@thread3));
454 eval(shift(@thread1)); eval(shift(@thread3));
455 eval(shift(@thread2)); eval(shift(@thread3));
456 }
457
458 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
459 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
460 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
461 @thread3=&ROUND(0,5,10,15);
462
463 foreach (@thread0) {
464 eval; eval(shift(@thread3));
465 eval(shift(@thread1)); eval(shift(@thread3));
466 eval(shift(@thread2)); eval(shift(@thread3));
467 }
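# The two eval loops above stitch three NEON quarter-round streams together
# with the scalar ROUND() stream one generated instruction at a time, so
# vector and integer operations issue side by side; one .Loop_outer_neon
# iteration thus produces four 64-byte blocks (256 bytes): three in NEON
# registers and a fourth in general-purpose registers.  The start of the
# emitted column round looks roughly like this (after the final syntax
# translation pass; an illustrative excerpt, not literal output):
#
#	add	v0.4s,v0.4s,v1.4s	// NEON block 0
#	add	w5,w5,w9		// scalar block
#	add	v4.4s,v4.4s,v5.4s	// NEON block 1
#	add	w6,w6,w10		// scalar block
#	add	v16.4s,v16.4s,v17.4s	// NEON block 2
#	add	w7,w7,w11		// scalar block
#	...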
468$code.=<<___;
469 cbnz $ctr,.Loop_neon
470
471 add.32 @x[0],@x[0],@d[0] // accumulate key block
472 add $A0,$A0,@K[0]
473 add @x[1],@x[1],@d[0],lsr#32
474 add $A1,$A1,@K[0]
475 add.32 @x[2],@x[2],@d[1]
476 add $A2,$A2,@K[0]
477 add @x[3],@x[3],@d[1],lsr#32
478 add $C0,$C0,@K[2]
479 add.32 @x[4],@x[4],@d[2]
480 add $C1,$C1,@K[2]
481 add @x[5],@x[5],@d[2],lsr#32
482 add $C2,$C2,@K[2]
483 add.32 @x[6],@x[6],@d[3]
484 add $D0,$D0,@K[3]
485 add @x[7],@x[7],@d[3],lsr#32
486 add.32 @x[8],@x[8],@d[4]
487 add $D1,$D1,@K[4]
488 add @x[9],@x[9],@d[4],lsr#32
489 add.32 @x[10],@x[10],@d[5]
490 add $D2,$D2,@K[5]
491 add @x[11],@x[11],@d[5],lsr#32
492 add.32 @x[12],@x[12],@d[6]
493 add $B0,$B0,@K[1]
494 add @x[13],@x[13],@d[6],lsr#32
495 add.32 @x[14],@x[14],@d[7]
496 add $B1,$B1,@K[1]
497 add @x[15],@x[15],@d[7],lsr#32
498 add $B2,$B2,@K[1]
499
500 b.lo .Ltail_neon
501
502 add @x[0],@x[0],@x[1],lsl#32 // pack
503 add @x[2],@x[2],@x[3],lsl#32
504 ldp @x[1],@x[3],[$inp,#0] // load input
505 add @x[4],@x[4],@x[5],lsl#32
506 add @x[6],@x[6],@x[7],lsl#32
507 ldp @x[5],@x[7],[$inp,#16]
508 add @x[8],@x[8],@x[9],lsl#32
509 add @x[10],@x[10],@x[11],lsl#32
510 ldp @x[9],@x[11],[$inp,#32]
511 add @x[12],@x[12],@x[13],lsl#32
512 add @x[14],@x[14],@x[15],lsl#32
513 ldp @x[13],@x[15],[$inp,#48]
514 add $inp,$inp,#64
515#ifdef __ARMEB__
516 rev @x[0],@x[0]
517 rev @x[2],@x[2]
518 rev @x[4],@x[4]
519 rev @x[6],@x[6]
520 rev @x[8],@x[8]
521 rev @x[10],@x[10]
522 rev @x[12],@x[12]
523 rev @x[14],@x[14]
524#endif
525 ld1.8 {$T0-$T3},[$inp],#64
526 eor @x[0],@x[0],@x[1]
527 eor @x[2],@x[2],@x[3]
528 eor @x[4],@x[4],@x[5]
529 eor @x[6],@x[6],@x[7]
530 eor @x[8],@x[8],@x[9]
531 eor $A0,$A0,$T0
532 eor @x[10],@x[10],@x[11]
533 eor $B0,$B0,$T1
534 eor @x[12],@x[12],@x[13]
535 eor $C0,$C0,$T2
536 eor @x[14],@x[14],@x[15]
537 eor $D0,$D0,$T3
538 ld1.8 {$T0-$T3},[$inp],#64
539
540 stp @x[0],@x[2],[$out,#0] // store output
541 add @d[6],@d[6],#4 // increment counter
542 stp @x[4],@x[6],[$out,#16]
543 add @K[3],@K[3],$ONE // += 4
544 stp @x[8],@x[10],[$out,#32]
545 add @K[4],@K[4],$ONE
546 stp @x[12],@x[14],[$out,#48]
547 add @K[5],@K[5],$ONE
548 add $out,$out,#64
549
550 st1.8 {$A0-$D0},[$out],#64
551 ld1.8 {$A0-$D0},[$inp],#64
552
553 eor $A1,$A1,$T0
554 eor $B1,$B1,$T1
555 eor $C1,$C1,$T2
556 eor $D1,$D1,$T3
557 st1.8 {$A1-$D1},[$out],#64
558
559 eor $A2,$A2,$A0
560 eor $B2,$B2,$B0
561 eor $C2,$C2,$C0
562 eor $D2,$D2,$D0
563 st1.8 {$A2-$D2},[$out],#64
564
565 b.hi .Loop_outer_neon
566
567 ldp x19,x20,[x29,#16]
568 add sp,sp,#64
569 ldp x21,x22,[x29,#32]
570 ldp x23,x24,[x29,#48]
571 ldp x25,x26,[x29,#64]
572 ldp x27,x28,[x29,#80]
573 ldp x29,x30,[sp],#96
574 .inst 0xd50323bf // autiasp
575 ret
576
577.Ltail_neon:
578 add $len,$len,#256
579 cmp $len,#64
580 b.lo .Less_than_64
581
582 add @x[0],@x[0],@x[1],lsl#32 // pack
583 add @x[2],@x[2],@x[3],lsl#32
584 ldp @x[1],@x[3],[$inp,#0] // load input
585 add @x[4],@x[4],@x[5],lsl#32
586 add @x[6],@x[6],@x[7],lsl#32
587 ldp @x[5],@x[7],[$inp,#16]
588 add @x[8],@x[8],@x[9],lsl#32
589 add @x[10],@x[10],@x[11],lsl#32
590 ldp @x[9],@x[11],[$inp,#32]
591 add @x[12],@x[12],@x[13],lsl#32
592 add @x[14],@x[14],@x[15],lsl#32
593 ldp @x[13],@x[15],[$inp,#48]
594 add $inp,$inp,#64
595#ifdef __ARMEB__
596 rev @x[0],@x[0]
597 rev @x[2],@x[2]
598 rev @x[4],@x[4]
599 rev @x[6],@x[6]
600 rev @x[8],@x[8]
601 rev @x[10],@x[10]
602 rev @x[12],@x[12]
603 rev @x[14],@x[14]
604#endif
605 eor @x[0],@x[0],@x[1]
606 eor @x[2],@x[2],@x[3]
607 eor @x[4],@x[4],@x[5]
608 eor @x[6],@x[6],@x[7]
609 eor @x[8],@x[8],@x[9]
610 eor @x[10],@x[10],@x[11]
611 eor @x[12],@x[12],@x[13]
612 eor @x[14],@x[14],@x[15]
613
614 stp @x[0],@x[2],[$out,#0] // store output
615 add @d[6],@d[6],#4 // increment counter
616 stp @x[4],@x[6],[$out,#16]
617 stp @x[8],@x[10],[$out,#32]
618 stp @x[12],@x[14],[$out,#48]
619 add $out,$out,#64
620 b.eq .Ldone_neon
621 sub $len,$len,#64
622 cmp $len,#64
623 b.lo .Less_than_128
624
625 ld1.8 {$T0-$T3},[$inp],#64
626 eor $A0,$A0,$T0
627 eor $B0,$B0,$T1
628 eor $C0,$C0,$T2
629 eor $D0,$D0,$T3
630 st1.8 {$A0-$D0},[$out],#64
631 b.eq .Ldone_neon
632 sub $len,$len,#64
633 cmp $len,#64
634 b.lo .Less_than_192
635
636 ld1.8 {$T0-$T3},[$inp],#64
637 eor $A1,$A1,$T0
638 eor $B1,$B1,$T1
639 eor $C1,$C1,$T2
640 eor $D1,$D1,$T3
641 st1.8 {$A1-$D1},[$out],#64
642 b.eq .Ldone_neon
643 sub $len,$len,#64
644
645 st1.8 {$A2-$D2},[sp]
646 b .Last_neon
647
648.Less_than_128:
649 st1.8 {$A0-$D0},[sp]
650 b .Last_neon
651.Less_than_192:
652 st1.8 {$A1-$D1},[sp]
653 b .Last_neon
654
655.align 4
656.Last_neon:
657 sub $out,$out,#1
658 add $inp,$inp,$len
659 add $out,$out,$len
660 add $ctr,sp,$len
661 neg $len,$len
662
663.Loop_tail_neon:
664 ldrb w10,[$inp,$len]
665 ldrb w11,[$ctr,$len]
666 add $len,$len,#1
667 eor w10,w10,w11
668 strb w10,[$out,$len]
669 cbnz $len,.Loop_tail_neon
670
671 stp xzr,xzr,[sp,#0]
672 stp xzr,xzr,[sp,#16]
673 stp xzr,xzr,[sp,#32]
674 stp xzr,xzr,[sp,#48]
675
676.Ldone_neon:
677 ldp x19,x20,[x29,#16]
678 add sp,sp,#64
679 ldp x21,x22,[x29,#32]
680 ldp x23,x24,[x29,#48]
681 ldp x25,x26,[x29,#64]
682 ldp x27,x28,[x29,#80]
683 ldp x29,x30,[sp],#96
684 .inst 0xd50323bf // autiasp
685 ret
686.size ChaCha20_neon,.-ChaCha20_neon
687___
688{
689my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
690my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
691 $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
692
693$code.=<<___;
694.type ChaCha20_512_neon,%function
695.align 5
696ChaCha20_512_neon:
697 .inst 0xd503233f // paciasp
698 stp x29,x30,[sp,#-96]!
699 add x29,sp,#0
700
701 adr @x[0],.Lsigma
702 stp x19,x20,[sp,#16]
703 stp x21,x22,[sp,#32]
704 stp x23,x24,[sp,#48]
705 stp x25,x26,[sp,#64]
706 stp x27,x28,[sp,#80]
707
708.L512_or_more_neon:
709 sub sp,sp,#128+64
710
711 ldp @d[0],@d[1],[@x[0]] // load sigma
712 ld1 {@K[0]},[@x[0]],#16
713 ldp @d[2],@d[3],[$key] // load key
714 ldp @d[4],@d[5],[$key,#16]
715 ld1 {@K[1],@K[2]},[$key]
716 ldp @d[6],@d[7],[$ctr] // load counter
717 ld1 {@K[3]},[$ctr]
718 ld1 {$ONE},[@x[0]]
719#ifdef __ARMEB__
720 rev64 @K[0],@K[0]
721 ror @d[2],@d[2],#32
722 ror @d[3],@d[3],#32
723 ror @d[4],@d[4],#32
724 ror @d[5],@d[5],#32
725 ror @d[6],@d[6],#32
726 ror @d[7],@d[7],#32
727#endif
728 add @K[3],@K[3],$ONE // += 1
729 stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
730 add @K[3],@K[3],$ONE // not typo
731 str @K[2],[sp,#32]
732 add @K[4],@K[3],$ONE
733 add @K[5],@K[4],$ONE
734 add @K[6],@K[5],$ONE
735 shl $ONE,$ONE,#2 // 1 -> 4
736
737 stp d8,d9,[sp,#128+0] // meet ABI requirements
738 stp d10,d11,[sp,#128+16]
739 stp d12,d13,[sp,#128+32]
740 stp d14,d15,[sp,#128+48]
741
742 sub $len,$len,#512 // not typo
743
744.Loop_outer_512_neon:
745 mov $A0,@K[0]
746 mov $A1,@K[0]
747 mov $A2,@K[0]
748 mov $A3,@K[0]
749 mov $A4,@K[0]
750 mov $A5,@K[0]
751 mov $B0,@K[1]
752 mov.32 @x[0],@d[0] // unpack key block
753 mov $B1,@K[1]
754 lsr @x[1],@d[0],#32
755 mov $B2,@K[1]
756 mov.32 @x[2],@d[1]
757 mov $B3,@K[1]
758 lsr @x[3],@d[1],#32
759 mov $B4,@K[1]
760 mov.32 @x[4],@d[2]
761 mov $B5,@K[1]
762 lsr @x[5],@d[2],#32
763 mov $D0,@K[3]
764 mov.32 @x[6],@d[3]
765 mov $D1,@K[4]
766 lsr @x[7],@d[3],#32
767 mov $D2,@K[5]
768 mov.32 @x[8],@d[4]
769 mov $D3,@K[6]
770 lsr @x[9],@d[4],#32
771 mov $C0,@K[2]
772 mov.32 @x[10],@d[5]
773 mov $C1,@K[2]
774 lsr @x[11],@d[5],#32
775 add $D4,$D0,$ONE // +4
776 mov.32 @x[12],@d[6]
777 add $D5,$D1,$ONE // +4
778 lsr @x[13],@d[6],#32
779 mov $C2,@K[2]
780 mov.32 @x[14],@d[7]
781 mov $C3,@K[2]
782 lsr @x[15],@d[7],#32
783 mov $C4,@K[2]
784 stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
785 mov $C5,@K[2]
786 str @K[5],[sp,#80]
787
788 mov $ctr,#5
789 subs $len,$len,#512
790.Loop_upper_neon:
791 sub $ctr,$ctr,#1
792___
793 my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
794 my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
795 my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
796 my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
797 my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
798 my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
799 my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
800 my $diff = ($#thread0+1)*6 - $#thread67 - 1;
801 my $i = 0;
802
803 foreach (@thread0) {
804 eval; eval(shift(@thread67));
805 eval(shift(@thread1)); eval(shift(@thread67));
806 eval(shift(@thread2)); eval(shift(@thread67));
807 eval(shift(@thread3)); eval(shift(@thread67));
808 eval(shift(@thread4)); eval(shift(@thread67));
809 eval(shift(@thread5)); eval(shift(@thread67));
810 }
811
812 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
813 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
814 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
815 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
816 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
817 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
818 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
819
820 foreach (@thread0) {
821 eval; eval(shift(@thread67));
822 eval(shift(@thread1)); eval(shift(@thread67));
823 eval(shift(@thread2)); eval(shift(@thread67));
824 eval(shift(@thread3)); eval(shift(@thread67));
825 eval(shift(@thread4)); eval(shift(@thread67));
826 eval(shift(@thread5)); eval(shift(@thread67));
827 }
828$code.=<<___;
829 cbnz $ctr,.Loop_upper_neon
830
831 add.32 @x[0],@x[0],@d[0] // accumulate key block
832 add @x[1],@x[1],@d[0],lsr#32
833 add.32 @x[2],@x[2],@d[1]
834 add @x[3],@x[3],@d[1],lsr#32
835 add.32 @x[4],@x[4],@d[2]
836 add @x[5],@x[5],@d[2],lsr#32
837 add.32 @x[6],@x[6],@d[3]
838 add @x[7],@x[7],@d[3],lsr#32
839 add.32 @x[8],@x[8],@d[4]
840 add @x[9],@x[9],@d[4],lsr#32
841 add.32 @x[10],@x[10],@d[5]
842 add @x[11],@x[11],@d[5],lsr#32
843 add.32 @x[12],@x[12],@d[6]
844 add @x[13],@x[13],@d[6],lsr#32
845 add.32 @x[14],@x[14],@d[7]
846 add @x[15],@x[15],@d[7],lsr#32
847
848 add @x[0],@x[0],@x[1],lsl#32 // pack
849 add @x[2],@x[2],@x[3],lsl#32
850 ldp @x[1],@x[3],[$inp,#0] // load input
851 add @x[4],@x[4],@x[5],lsl#32
852 add @x[6],@x[6],@x[7],lsl#32
853 ldp @x[5],@x[7],[$inp,#16]
854 add @x[8],@x[8],@x[9],lsl#32
855 add @x[10],@x[10],@x[11],lsl#32
856 ldp @x[9],@x[11],[$inp,#32]
857 add @x[12],@x[12],@x[13],lsl#32
858 add @x[14],@x[14],@x[15],lsl#32
859 ldp @x[13],@x[15],[$inp,#48]
860 add $inp,$inp,#64
861#ifdef __ARMEB__
862 rev @x[0],@x[0]
863 rev @x[2],@x[2]
864 rev @x[4],@x[4]
865 rev @x[6],@x[6]
866 rev @x[8],@x[8]
867 rev @x[10],@x[10]
868 rev @x[12],@x[12]
869 rev @x[14],@x[14]
870#endif
871 eor @x[0],@x[0],@x[1]
872 eor @x[2],@x[2],@x[3]
873 eor @x[4],@x[4],@x[5]
874 eor @x[6],@x[6],@x[7]
875 eor @x[8],@x[8],@x[9]
876 eor @x[10],@x[10],@x[11]
877 eor @x[12],@x[12],@x[13]
878 eor @x[14],@x[14],@x[15]
879
880 stp @x[0],@x[2],[$out,#0] // store output
881 add @d[6],@d[6],#1 // increment counter
882 mov.32 @x[0],@d[0] // unpack key block
883 lsr @x[1],@d[0],#32
884 stp @x[4],@x[6],[$out,#16]
885 mov.32 @x[2],@d[1]
886 lsr @x[3],@d[1],#32
887 stp @x[8],@x[10],[$out,#32]
888 mov.32 @x[4],@d[2]
889 lsr @x[5],@d[2],#32
890 stp @x[12],@x[14],[$out,#48]
891 add $out,$out,#64
892 mov.32 @x[6],@d[3]
893 lsr @x[7],@d[3],#32
894 mov.32 @x[8],@d[4]
895 lsr @x[9],@d[4],#32
896 mov.32 @x[10],@d[5]
897 lsr @x[11],@d[5],#32
898 mov.32 @x[12],@d[6]
899 lsr @x[13],@d[6],#32
900 mov.32 @x[14],@d[7]
901 lsr @x[15],@d[7],#32
902
903 mov $ctr,#5
904.Loop_lower_neon:
905 sub $ctr,$ctr,#1
906___
907 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
908 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
909 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
910 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
911 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
912 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
913 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
914
915 foreach (@thread0) {
916 eval; eval(shift(@thread67));
917 eval(shift(@thread1)); eval(shift(@thread67));
918 eval(shift(@thread2)); eval(shift(@thread67));
919 eval(shift(@thread3)); eval(shift(@thread67));
920 eval(shift(@thread4)); eval(shift(@thread67));
921 eval(shift(@thread5)); eval(shift(@thread67));
922 }
923
924 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
925 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
926 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
927 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
928 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
929 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
930 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
931
932 foreach (@thread0) {
933 eval; eval(shift(@thread67));
934 eval(shift(@thread1)); eval(shift(@thread67));
935 eval(shift(@thread2)); eval(shift(@thread67));
936 eval(shift(@thread3)); eval(shift(@thread67));
937 eval(shift(@thread4)); eval(shift(@thread67));
938 eval(shift(@thread5)); eval(shift(@thread67));
939 }
940$code.=<<___;
941 cbnz $ctr,.Loop_lower_neon
942
943 add.32 @x[0],@x[0],@d[0] // accumulate key block
944 ldp @K[0],@K[1],[sp,#0]
945 add @x[1],@x[1],@d[0],lsr#32
946 ldp @K[2],@K[3],[sp,#32]
947 add.32 @x[2],@x[2],@d[1]
948 ldp @K[4],@K[5],[sp,#64]
949 add @x[3],@x[3],@d[1],lsr#32
950 add $A0,$A0,@K[0]
951 add.32 @x[4],@x[4],@d[2]
952 add $A1,$A1,@K[0]
953 add @x[5],@x[5],@d[2],lsr#32
954 add $A2,$A2,@K[0]
955 add.32 @x[6],@x[6],@d[3]
956 add $A3,$A3,@K[0]
957 add @x[7],@x[7],@d[3],lsr#32
958 add $A4,$A4,@K[0]
959 add.32 @x[8],@x[8],@d[4]
960 add $A5,$A5,@K[0]
961 add @x[9],@x[9],@d[4],lsr#32
962 add $C0,$C0,@K[2]
963 add.32 @x[10],@x[10],@d[5]
964 add $C1,$C1,@K[2]
965 add @x[11],@x[11],@d[5],lsr#32
966 add $C2,$C2,@K[2]
967 add.32 @x[12],@x[12],@d[6]
968 add $C3,$C3,@K[2]
969 add @x[13],@x[13],@d[6],lsr#32
970 add $C4,$C4,@K[2]
971 add.32 @x[14],@x[14],@d[7]
972 add $C5,$C5,@K[2]
973 add @x[15],@x[15],@d[7],lsr#32
974 add $D4,$D4,$ONE // +4
975 add @x[0],@x[0],@x[1],lsl#32 // pack
976 add $D5,$D5,$ONE // +4
977 add @x[2],@x[2],@x[3],lsl#32
978 add $D0,$D0,@K[3]
979 ldp @x[1],@x[3],[$inp,#0] // load input
980 add $D1,$D1,@K[4]
981 add @x[4],@x[4],@x[5],lsl#32
982 add $D2,$D2,@K[5]
983 add @x[6],@x[6],@x[7],lsl#32
984 add $D3,$D3,@K[6]
985 ldp @x[5],@x[7],[$inp,#16]
986 add $D4,$D4,@K[3]
987 add @x[8],@x[8],@x[9],lsl#32
988 add $D5,$D5,@K[4]
989 add @x[10],@x[10],@x[11],lsl#32
990 add $B0,$B0,@K[1]
991 ldp @x[9],@x[11],[$inp,#32]
992 add $B1,$B1,@K[1]
993 add @x[12],@x[12],@x[13],lsl#32
994 add $B2,$B2,@K[1]
995 add @x[14],@x[14],@x[15],lsl#32
996 add $B3,$B3,@K[1]
997 ldp @x[13],@x[15],[$inp,#48]
998 add $B4,$B4,@K[1]
999 add $inp,$inp,#64
1000 add $B5,$B5,@K[1]
1001
1002#ifdef __ARMEB__
1003 rev @x[0],@x[0]
1004 rev @x[2],@x[2]
1005 rev @x[4],@x[4]
1006 rev @x[6],@x[6]
1007 rev @x[8],@x[8]
1008 rev @x[10],@x[10]
1009 rev @x[12],@x[12]
1010 rev @x[14],@x[14]
1011#endif
1012 ld1.8 {$T0-$T3},[$inp],#64
1013 eor @x[0],@x[0],@x[1]
1014 eor @x[2],@x[2],@x[3]
1015 eor @x[4],@x[4],@x[5]
1016 eor @x[6],@x[6],@x[7]
1017 eor @x[8],@x[8],@x[9]
1018 eor $A0,$A0,$T0
1019 eor @x[10],@x[10],@x[11]
1020 eor $B0,$B0,$T1
1021 eor @x[12],@x[12],@x[13]
1022 eor $C0,$C0,$T2
1023 eor @x[14],@x[14],@x[15]
1024 eor $D0,$D0,$T3
1025 ld1.8 {$T0-$T3},[$inp],#64
1026
1027 stp @x[0],@x[2],[$out,#0] // store output
1028 add @d[6],@d[6],#7 // increment counter
1029 stp @x[4],@x[6],[$out,#16]
1030 stp @x[8],@x[10],[$out,#32]
1031 stp @x[12],@x[14],[$out,#48]
1032 add $out,$out,#64
1033 st1.8 {$A0-$D0},[$out],#64
1034
1035 ld1.8 {$A0-$D0},[$inp],#64
1036 eor $A1,$A1,$T0
1037 eor $B1,$B1,$T1
1038 eor $C1,$C1,$T2
1039 eor $D1,$D1,$T3
1040 st1.8 {$A1-$D1},[$out],#64
1041
1042 ld1.8 {$A1-$D1},[$inp],#64
1043 eor $A2,$A2,$A0
1044 ldp @K[0],@K[1],[sp,#0]
1045 eor $B2,$B2,$B0
1046 ldp @K[2],@K[3],[sp,#32]
1047 eor $C2,$C2,$C0
1048 eor $D2,$D2,$D0
1049 st1.8 {$A2-$D2},[$out],#64
1050
1051 ld1.8 {$A2-$D2},[$inp],#64
1052 eor $A3,$A3,$A1
1053 eor $B3,$B3,$B1
1054 eor $C3,$C3,$C1
1055 eor $D3,$D3,$D1
1056 st1.8 {$A3-$D3},[$out],#64
1057
1058 ld1.8 {$A3-$D3},[$inp],#64
1059 eor $A4,$A4,$A2
1060 eor $B4,$B4,$B2
1061 eor $C4,$C4,$C2
1062 eor $D4,$D4,$D2
1063 st1.8 {$A4-$D4},[$out],#64
1064
1065 shl $A0,$ONE,#1 // 4 -> 8
1066 eor $A5,$A5,$A3
1067 eor $B5,$B5,$B3
1068 eor $C5,$C5,$C3
1069 eor $D5,$D5,$D3
1070 st1.8 {$A5-$D5},[$out],#64
1071
1072 add @K[3],@K[3],$A0 // += 8
1073 add @K[4],@K[4],$A0
1074 add @K[5],@K[5],$A0
1075 add @K[6],@K[6],$A0
1076
1077 b.hs .Loop_outer_512_neon
1078
1079 adds $len,$len,#512
1080 ushr $A0,$ONE,#2 // 4 -> 1
1081
1082 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1083 ldp d10,d11,[sp,#128+16]
1084 ldp d12,d13,[sp,#128+32]
1085 ldp d14,d15,[sp,#128+48]
1086
1087 stp @K[0],$ONE,[sp,#0] // wipe off-load area
1088 stp @K[0],$ONE,[sp,#32]
1089 stp @K[0],$ONE,[sp,#64]
1090
1091 b.eq .Ldone_512_neon
1092
1093 cmp $len,#192
1094 sub @K[3],@K[3],$A0 // -= 1
1095 sub @K[4],@K[4],$A0
1096 sub @K[5],@K[5],$A0
1097 add sp,sp,#128
1098 b.hs .Loop_outer_neon
1099
1100 eor @K[1],@K[1],@K[1]
1101 eor @K[2],@K[2],@K[2]
1102 eor @K[3],@K[3],@K[3]
1103 eor @K[4],@K[4],@K[4]
1104 eor @K[5],@K[5],@K[5]
1105 eor @K[6],@K[6],@K[6]
1106 b .Loop_outer
1107
1108.Ldone_512_neon:
1109 ldp x19,x20,[x29,#16]
1110 add sp,sp,#128+64
1111 ldp x21,x22,[x29,#32]
1112 ldp x23,x24,[x29,#48]
1113 ldp x25,x26,[x29,#64]
1114 ldp x27,x28,[x29,#80]
1115 ldp x29,x30,[sp],#96
1116 .inst 0xd50323bf // autiasp
1117 ret
1118.size ChaCha20_512_neon,.-ChaCha20_512_neon
1119___
1120}
1121}}}
1122
1123foreach (split("\n",$code)) {
1124 s/\`([^\`]*)\`/eval $1/geo;
1125
1126 (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
1127 (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
1128 (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
1129 (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
1130 (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
1131
1132 #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1133
1134 print $_,"\n";
1135}
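# The substitution chain above turns the pseudo-syntax used throughout this
# module into plain AArch64 assembly before it is printed.  Representative
# translations (illustrative):
#
#	mov.32	x5,x22			->	mov	w5,w22
#	eor	v0.4s,v0.4s,v24.4s	->	eor	v0.16b,v0.16b,v24.16b
#	ld1.8	{v20.4s},[x1],#64	->	ld1	{v20.16b},[x1],#64
#	stp	v24.4s,v25.4s,[sp,#0]	->	stp	q24,q25,[sp,#0]
#	rev32.16 v3.4s,v3.4s		->	rev32	v3.8h,v3.8h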
1136close STDOUT; # flush