]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
03d770d9 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
1fdcef75 AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # June 2015 | |
609b0852 | 18 | # |
1fdcef75 AP |
19 | # ChaCha20 for ARMv8. |
20 | # | |
21 | # Performance in cycles per byte out of large buffer. | |
22 | # | |
23 | # IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU | |
24 | # | |
25 | # Apple A7 5.50/+49% 3.33 1.70 | |
26 | # Cortex-A53 8.40/+80% 4.72 4.72(*) | |
27 | # Cortex-A57 8.06/+43% 4.90 4.43(**) | |
28 | # Denver 4.50/+82% 2.63 2.67(*) | |
29 | # X-Gene 9.50/+46% 8.82 8.89(*) | |
05ef4d19 | 30 | # Mongoose 8.00/+44% 3.64 3.25 |
75331623 | 31 | # Kryo 8.17/+50% 4.83 4.65 |
6465321e | 32 | # ThunderX2 7.26/+48% 7.91 4.30 |
1fdcef75 AP |
33 | # |
34 | # (*) it's expected that doubling interleave factor doesn't help | |
35 | # all processors, only those with higher NEON latency and | |
36 | # higher instruction issue rate; | |
37 | # (**) expected improvement was actually higher; | |
38 | ||
# Command line: flavour (e.g. linux64, ios64) then output file name.
$flavour=shift;
$output=shift;

# Locate the arm-xlate.pl translator: first next to this script, then in
# the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  Check the open: an
# unchecked pipe-open would silently emit no assembly if $^X or $xlate
# cannot be executed.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
49 | ||
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
# Catch-all for the "instruction" subs invoked from eval'ed code strings
# (e.g. &add_32, &ror_32, &ext).  The called name, minus its package
# qualifier, becomes the mnemonic, with '_' mapped to '.' so add_32
# emits "add.32" (arm-xlate.pl syntax).  The last argument is prefixed
# with '#' when it is numeric, i.e. an immediate operand.  The finished
# tab-separated assembly line is appended to the global $code.
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);	# numeric => immediate
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
56 | ||
# Scalar register map: the five function arguments arrive in x0-x4.
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

# 16-word ChaCha state in x5-x17,x19-x21 (x18 is skipped — it is the
# platform register on some ABIs); @d holds the input key block as
# eight 64-bit word pairs in x22-x28,x30.
my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
61 | ||
sub ROUND {
# Emit one scalar round for all four interleaved lanes, returned as a
# list of 48 code strings that the caller evals (dispatching through
# AUTOLOAD into $code).  Lane 0's state indices are the arguments; each
# subsequent lane rotates the indices within their 4-element quad, so
# the same helper serves both column and diagonal rounds.
my @q = ([ @_ ]);			# lane 0: [$a,$b,$c,$d]
foreach my $lane (1..3) {
	push @q, [ map { ($_ & ~3) + (($_ + 1) & 3) } @{$q[-1]} ];
}

my @insns;
# Four passes of the quarter-round: x[i] += x[j]; x[k] ^= x[i];
# x[k] = ror(x[k], r) — with (i,j,k) cycling a/b/d then c/d/b and the
# ChaCha rotation amounts 16, 20, 24, 25 (ror == 32 - rol 16/12/8/7).
foreach my $pass ([0,1,3,16], [2,3,1,20], [0,1,3,24], [2,3,1,25]) {
	my ($ai,$bi,$di,$rot) = @$pass;
	my @va = map { $x[ $q[$_][$ai] ] } (0..3);
	my @vb = map { $x[ $q[$_][$bi] ] } (0..3);
	my @vd = map { $x[ $q[$_][$di] ] } (0..3);

	push @insns, map { "&add_32\t($va[$_],$va[$_],$vb[$_])" } (0..3);
	push @insns, map { "&eor_32\t($vd[$_],$vd[$_],$va[$_])" } (0..3);
	push @insns, map { "&ror_32\t($vd[$_],$vd[$_],$rot)" } (0..3);
}

return @insns;
}
122 | ||
# Scalar (integer-ALU) code path: constants, runtime NEON dispatch, and
# the ChaCha20_ctr32 entry point.  The heredocs below are emitted
# verbatim into $code and post-processed by arm-xlate.pl; @x/@d/$len
# etc. interpolate to the register names chosen above.
$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
# Ten double rounds: even (column 0,4,8,12) plus odd (diagonal
# 0,5,10,15) quarter-rounds per iteration of .Loop above.
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
332 | ||
{{{
# NEON register map for the 3x-interleaved path: three parallel state
# copies as rows A..D (v0-v7,v16-v23) plus temporaries $T0-$T3; @K
# caches the key block and per-copy counters (v24-v30), and $ONE is the
# counter-increment vector (v31).
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
338 | ||
sub NEONROUND {
# Emit one NEON quarter-round over a full four-word row set, returned
# as a list of 18 code strings for later eval (through AUTOLOAD).
# $odd selects the diagonal (vs column) lane rotation performed by the
# trailing ext instructions.
my $odd = pop;
my ($a,$b,$c,$d,$t) = @_;

# Format one instruction string; operands are single-quoted so that the
# eval'ed call passes them as string arguments.
my $mk = sub {
	my $op = shift;
	return "&$op\t(" . join(',', map { "'$_'" } @_) . ")";
};

return (
	$mk->('add',$a,$a,$b),
	$mk->('eor',$d,$d,$a),
	$mk->('rev32_16',$d,$d),		# d = rot(d,16)

	$mk->('add',$c,$c,$d),
	$mk->('eor',$t,$b,$c),
	$mk->('ushr',$b,$t,20),			# b = rot(b^c,12)
	$mk->('sli',$b,$t,12),			#   via shift + shift-insert

	$mk->('add',$a,$a,$b),
	$mk->('eor',$t,$d,$a),
	$mk->('ushr',$d,$t,24),			# d = rot(d^a,8)
	$mk->('sli',$d,$t,8),

	$mk->('add',$c,$c,$d),
	$mk->('eor',$t,$b,$c),
	$mk->('ushr',$b,$t,25),			# b = rot(b^c,7)
	$mk->('sli',$b,$t,7),

	$mk->('ext',$c,$c,$c,8),		# re-align lanes for the
	$mk->('ext',$d,$d,$d,$odd?4:12),	# next (diagonal or column)
	$mk->('ext',$b,$b,$b,$odd?12:4)		# round
);
}
368 | ||
# NEON path: three 64-byte blocks in vector registers interleaved with
# one block in scalar registers, i.e. 256 bytes per outer iteration.
# Heredoc content is emitted verbatim into $code; the Perl sections in
# between interleave vector and scalar round instruction streams.
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.LChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
# Even (column) half of the double round: three NEON state copies plus
# the scalar copy, instruction streams interleaved 1:1 for dual issue.
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

# Odd (diagonal) half of the double round.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
688 | { | |
689 | my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; | |
690 | my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, | |
691 | $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); | |
692 | ||
693 | $code.=<<___; | |
694 | .type ChaCha20_512_neon,%function | |
695 | .align 5 | |
696 | ChaCha20_512_neon: | |
9a18aae5 | 697 | .inst 0xd503233f // paciasp |
1fdcef75 AP |
698 | stp x29,x30,[sp,#-96]! |
699 | add x29,sp,#0 | |
700 | ||
701 | adr @x[0],.Lsigma | |
702 | stp x19,x20,[sp,#16] | |
703 | stp x21,x22,[sp,#32] | |
704 | stp x23,x24,[sp,#48] | |
705 | stp x25,x26,[sp,#64] | |
706 | stp x27,x28,[sp,#80] | |
707 | ||
708 | .L512_or_more_neon: | |
709 | sub sp,sp,#128+64 | |
710 | ||
711 | ldp @d[0],@d[1],[@x[0]] // load sigma | |
712 | ld1 {@K[0]},[@x[0]],#16 | |
713 | ldp @d[2],@d[3],[$key] // load key | |
714 | ldp @d[4],@d[5],[$key,#16] | |
715 | ld1 {@K[1],@K[2]},[$key] | |
716 | ldp @d[6],@d[7],[$ctr] // load counter | |
717 | ld1 {@K[3]},[$ctr] | |
718 | ld1 {$ONE},[@x[0]] | |
719 | #ifdef __ARMEB__ | |
720 | rev64 @K[0],@K[0] | |
721 | ror @d[2],@d[2],#32 | |
722 | ror @d[3],@d[3],#32 | |
723 | ror @d[4],@d[4],#32 | |
724 | ror @d[5],@d[5],#32 | |
725 | ror @d[6],@d[6],#32 | |
726 | ror @d[7],@d[7],#32 | |
727 | #endif | |
728 | add @K[3],@K[3],$ONE // += 1 | |
729 | stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part | |
730 | add @K[3],@K[3],$ONE // not typo | |
731 | str @K[2],[sp,#32] | |
732 | add @K[4],@K[3],$ONE | |
733 | add @K[5],@K[4],$ONE | |
734 | add @K[6],@K[5],$ONE | |
735 | shl $ONE,$ONE,#2 // 1 -> 4 | |
736 | ||
737 | stp d8,d9,[sp,#128+0] // meet ABI requirements | |
738 | stp d10,d11,[sp,#128+16] | |
739 | stp d12,d13,[sp,#128+32] | |
740 | stp d14,d15,[sp,#128+48] | |
741 | ||
742 | sub $len,$len,#512 // not typo | |
743 | ||
744 | .Loop_outer_512_neon: | |
745 | mov $A0,@K[0] | |
746 | mov $A1,@K[0] | |
747 | mov $A2,@K[0] | |
748 | mov $A3,@K[0] | |
749 | mov $A4,@K[0] | |
750 | mov $A5,@K[0] | |
751 | mov $B0,@K[1] | |
752 | mov.32 @x[0],@d[0] // unpack key block | |
753 | mov $B1,@K[1] | |
754 | lsr @x[1],@d[0],#32 | |
755 | mov $B2,@K[1] | |
756 | mov.32 @x[2],@d[1] | |
757 | mov $B3,@K[1] | |
758 | lsr @x[3],@d[1],#32 | |
759 | mov $B4,@K[1] | |
760 | mov.32 @x[4],@d[2] | |
761 | mov $B5,@K[1] | |
762 | lsr @x[5],@d[2],#32 | |
763 | mov $D0,@K[3] | |
764 | mov.32 @x[6],@d[3] | |
765 | mov $D1,@K[4] | |
766 | lsr @x[7],@d[3],#32 | |
767 | mov $D2,@K[5] | |
768 | mov.32 @x[8],@d[4] | |
769 | mov $D3,@K[6] | |
770 | lsr @x[9],@d[4],#32 | |
771 | mov $C0,@K[2] | |
772 | mov.32 @x[10],@d[5] | |
773 | mov $C1,@K[2] | |
774 | lsr @x[11],@d[5],#32 | |
775 | add $D4,$D0,$ONE // +4 | |
776 | mov.32 @x[12],@d[6] | |
777 | add $D5,$D1,$ONE // +4 | |
778 | lsr @x[13],@d[6],#32 | |
779 | mov $C2,@K[2] | |
780 | mov.32 @x[14],@d[7] | |
781 | mov $C3,@K[2] | |
782 | lsr @x[15],@d[7],#32 | |
783 | mov $C4,@K[2] | |
784 | stp @K[3],@K[4],[sp,#48] // off-load key block, variable part | |
785 | mov $C5,@K[2] | |
786 | str @K[5],[sp,#80] | |
787 | ||
788 | mov $ctr,#5 | |
789 | subs $len,$len,#512 | |
790 | .Loop_upper_neon: | |
791 | sub $ctr,$ctr,#1 | |
792 | ___ | |
793 | my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); | |
794 | my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); | |
795 | my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); | |
796 | my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); | |
797 | my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); | |
798 | my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); | |
799 | my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |
800 | my $diff = ($#thread0+1)*6 - $#thread67 - 1; | |
801 | my $i = 0; | |
802 | ||
803 | foreach (@thread0) { | |
804 | eval; eval(shift(@thread67)); | |
805 | eval(shift(@thread1)); eval(shift(@thread67)); | |
806 | eval(shift(@thread2)); eval(shift(@thread67)); | |
807 | eval(shift(@thread3)); eval(shift(@thread67)); | |
808 | eval(shift(@thread4)); eval(shift(@thread67)); | |
809 | eval(shift(@thread5)); eval(shift(@thread67)); | |
810 | } | |
811 | ||
812 | @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); | |
813 | @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); | |
814 | @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); | |
815 | @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); | |
816 | @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); | |
817 | @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); | |
818 | @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |
819 | ||
820 | foreach (@thread0) { | |
821 | eval; eval(shift(@thread67)); | |
822 | eval(shift(@thread1)); eval(shift(@thread67)); | |
823 | eval(shift(@thread2)); eval(shift(@thread67)); | |
824 | eval(shift(@thread3)); eval(shift(@thread67)); | |
825 | eval(shift(@thread4)); eval(shift(@thread67)); | |
826 | eval(shift(@thread5)); eval(shift(@thread67)); | |
827 | } | |
828 | $code.=<<___; | |
829 | cbnz $ctr,.Loop_upper_neon | |
830 | ||
831 | add.32 @x[0],@x[0],@d[0] // accumulate key block | |
832 | add @x[1],@x[1],@d[0],lsr#32 | |
833 | add.32 @x[2],@x[2],@d[1] | |
834 | add @x[3],@x[3],@d[1],lsr#32 | |
835 | add.32 @x[4],@x[4],@d[2] | |
836 | add @x[5],@x[5],@d[2],lsr#32 | |
837 | add.32 @x[6],@x[6],@d[3] | |
838 | add @x[7],@x[7],@d[3],lsr#32 | |
839 | add.32 @x[8],@x[8],@d[4] | |
840 | add @x[9],@x[9],@d[4],lsr#32 | |
841 | add.32 @x[10],@x[10],@d[5] | |
842 | add @x[11],@x[11],@d[5],lsr#32 | |
843 | add.32 @x[12],@x[12],@d[6] | |
844 | add @x[13],@x[13],@d[6],lsr#32 | |
845 | add.32 @x[14],@x[14],@d[7] | |
846 | add @x[15],@x[15],@d[7],lsr#32 | |
847 | ||
848 | add @x[0],@x[0],@x[1],lsl#32 // pack | |
849 | add @x[2],@x[2],@x[3],lsl#32 | |
850 | ldp @x[1],@x[3],[$inp,#0] // load input | |
851 | add @x[4],@x[4],@x[5],lsl#32 | |
852 | add @x[6],@x[6],@x[7],lsl#32 | |
853 | ldp @x[5],@x[7],[$inp,#16] | |
854 | add @x[8],@x[8],@x[9],lsl#32 | |
855 | add @x[10],@x[10],@x[11],lsl#32 | |
856 | ldp @x[9],@x[11],[$inp,#32] | |
857 | add @x[12],@x[12],@x[13],lsl#32 | |
858 | add @x[14],@x[14],@x[15],lsl#32 | |
859 | ldp @x[13],@x[15],[$inp,#48] | |
860 | add $inp,$inp,#64 | |
861 | #ifdef __ARMEB__ | |
862 | rev @x[0],@x[0] | |
863 | rev @x[2],@x[2] | |
864 | rev @x[4],@x[4] | |
865 | rev @x[6],@x[6] | |
866 | rev @x[8],@x[8] | |
867 | rev @x[10],@x[10] | |
868 | rev @x[12],@x[12] | |
869 | rev @x[14],@x[14] | |
870 | #endif | |
871 | eor @x[0],@x[0],@x[1] | |
872 | eor @x[2],@x[2],@x[3] | |
873 | eor @x[4],@x[4],@x[5] | |
874 | eor @x[6],@x[6],@x[7] | |
875 | eor @x[8],@x[8],@x[9] | |
876 | eor @x[10],@x[10],@x[11] | |
877 | eor @x[12],@x[12],@x[13] | |
878 | eor @x[14],@x[14],@x[15] | |
879 | ||
880 | stp @x[0],@x[2],[$out,#0] // store output | |
881 | add @d[6],@d[6],#1 // increment counter | |
882 | mov.32 @x[0],@d[0] // unpack key block | |
883 | lsr @x[1],@d[0],#32 | |
884 | stp @x[4],@x[6],[$out,#16] | |
885 | mov.32 @x[2],@d[1] | |
886 | lsr @x[3],@d[1],#32 | |
887 | stp @x[8],@x[10],[$out,#32] | |
888 | mov.32 @x[4],@d[2] | |
889 | lsr @x[5],@d[2],#32 | |
890 | stp @x[12],@x[14],[$out,#48] | |
891 | add $out,$out,#64 | |
892 | mov.32 @x[6],@d[3] | |
893 | lsr @x[7],@d[3],#32 | |
894 | mov.32 @x[8],@d[4] | |
895 | lsr @x[9],@d[4],#32 | |
896 | mov.32 @x[10],@d[5] | |
897 | lsr @x[11],@d[5],#32 | |
898 | mov.32 @x[12],@d[6] | |
899 | lsr @x[13],@d[6],#32 | |
900 | mov.32 @x[14],@d[7] | |
901 | lsr @x[15],@d[7],#32 | |
902 | ||
903 | mov $ctr,#5 | |
904 | .Loop_lower_neon: | |
905 | sub $ctr,$ctr,#1 | |
906 | ___ | |
907 | @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); | |
908 | @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); | |
909 | @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); | |
910 | @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); | |
911 | @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); | |
912 | @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); | |
913 | @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |
914 | ||
915 | foreach (@thread0) { | |
916 | eval; eval(shift(@thread67)); | |
917 | eval(shift(@thread1)); eval(shift(@thread67)); | |
918 | eval(shift(@thread2)); eval(shift(@thread67)); | |
919 | eval(shift(@thread3)); eval(shift(@thread67)); | |
920 | eval(shift(@thread4)); eval(shift(@thread67)); | |
921 | eval(shift(@thread5)); eval(shift(@thread67)); | |
922 | } | |
923 | ||
924 | @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); | |
925 | @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); | |
926 | @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); | |
927 | @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); | |
928 | @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); | |
929 | @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); | |
930 | @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |
931 | ||
932 | foreach (@thread0) { | |
933 | eval; eval(shift(@thread67)); | |
934 | eval(shift(@thread1)); eval(shift(@thread67)); | |
935 | eval(shift(@thread2)); eval(shift(@thread67)); | |
936 | eval(shift(@thread3)); eval(shift(@thread67)); | |
937 | eval(shift(@thread4)); eval(shift(@thread67)); | |
938 | eval(shift(@thread5)); eval(shift(@thread67)); | |
939 | } | |
940 | $code.=<<___; | |
941 | cbnz $ctr,.Loop_lower_neon | |
942 | ||
943 | add.32 @x[0],@x[0],@d[0] // accumulate key block | |
944 | ldp @K[0],@K[1],[sp,#0] | |
945 | add @x[1],@x[1],@d[0],lsr#32 | |
946 | ldp @K[2],@K[3],[sp,#32] | |
947 | add.32 @x[2],@x[2],@d[1] | |
948 | ldp @K[4],@K[5],[sp,#64] | |
949 | add @x[3],@x[3],@d[1],lsr#32 | |
950 | add $A0,$A0,@K[0] | |
951 | add.32 @x[4],@x[4],@d[2] | |
952 | add $A1,$A1,@K[0] | |
953 | add @x[5],@x[5],@d[2],lsr#32 | |
954 | add $A2,$A2,@K[0] | |
955 | add.32 @x[6],@x[6],@d[3] | |
956 | add $A3,$A3,@K[0] | |
957 | add @x[7],@x[7],@d[3],lsr#32 | |
958 | add $A4,$A4,@K[0] | |
959 | add.32 @x[8],@x[8],@d[4] | |
960 | add $A5,$A5,@K[0] | |
961 | add @x[9],@x[9],@d[4],lsr#32 | |
962 | add $C0,$C0,@K[2] | |
963 | add.32 @x[10],@x[10],@d[5] | |
964 | add $C1,$C1,@K[2] | |
965 | add @x[11],@x[11],@d[5],lsr#32 | |
966 | add $C2,$C2,@K[2] | |
967 | add.32 @x[12],@x[12],@d[6] | |
968 | add $C3,$C3,@K[2] | |
969 | add @x[13],@x[13],@d[6],lsr#32 | |
970 | add $C4,$C4,@K[2] | |
971 | add.32 @x[14],@x[14],@d[7] | |
972 | add $C5,$C5,@K[2] | |
973 | add @x[15],@x[15],@d[7],lsr#32 | |
974 | add $D4,$D4,$ONE // +4 | |
975 | add @x[0],@x[0],@x[1],lsl#32 // pack | |
976 | add $D5,$D5,$ONE // +4 | |
977 | add @x[2],@x[2],@x[3],lsl#32 | |
978 | add $D0,$D0,@K[3] | |
979 | ldp @x[1],@x[3],[$inp,#0] // load input | |
980 | add $D1,$D1,@K[4] | |
981 | add @x[4],@x[4],@x[5],lsl#32 | |
982 | add $D2,$D2,@K[5] | |
983 | add @x[6],@x[6],@x[7],lsl#32 | |
984 | add $D3,$D3,@K[6] | |
985 | ldp @x[5],@x[7],[$inp,#16] | |
986 | add $D4,$D4,@K[3] | |
987 | add @x[8],@x[8],@x[9],lsl#32 | |
988 | add $D5,$D5,@K[4] | |
989 | add @x[10],@x[10],@x[11],lsl#32 | |
990 | add $B0,$B0,@K[1] | |
991 | ldp @x[9],@x[11],[$inp,#32] | |
992 | add $B1,$B1,@K[1] | |
993 | add @x[12],@x[12],@x[13],lsl#32 | |
994 | add $B2,$B2,@K[1] | |
995 | add @x[14],@x[14],@x[15],lsl#32 | |
996 | add $B3,$B3,@K[1] | |
997 | ldp @x[13],@x[15],[$inp,#48] | |
998 | add $B4,$B4,@K[1] | |
999 | add $inp,$inp,#64 | |
1000 | add $B5,$B5,@K[1] | |
1001 | ||
1002 | #ifdef __ARMEB__ | |
1003 | rev @x[0],@x[0] | |
1004 | rev @x[2],@x[2] | |
1005 | rev @x[4],@x[4] | |
1006 | rev @x[6],@x[6] | |
1007 | rev @x[8],@x[8] | |
1008 | rev @x[10],@x[10] | |
1009 | rev @x[12],@x[12] | |
1010 | rev @x[14],@x[14] | |
1011 | #endif | |
1012 | ld1.8 {$T0-$T3},[$inp],#64 | |
1013 | eor @x[0],@x[0],@x[1] | |
1014 | eor @x[2],@x[2],@x[3] | |
1015 | eor @x[4],@x[4],@x[5] | |
1016 | eor @x[6],@x[6],@x[7] | |
1017 | eor @x[8],@x[8],@x[9] | |
1018 | eor $A0,$A0,$T0 | |
1019 | eor @x[10],@x[10],@x[11] | |
1020 | eor $B0,$B0,$T1 | |
1021 | eor @x[12],@x[12],@x[13] | |
1022 | eor $C0,$C0,$T2 | |
1023 | eor @x[14],@x[14],@x[15] | |
1024 | eor $D0,$D0,$T3 | |
1025 | ld1.8 {$T0-$T3},[$inp],#64 | |
1026 | ||
1027 | stp @x[0],@x[2],[$out,#0] // store output | |
1028 | add @d[6],@d[6],#7 // increment counter | |
1029 | stp @x[4],@x[6],[$out,#16] | |
1030 | stp @x[8],@x[10],[$out,#32] | |
1031 | stp @x[12],@x[14],[$out,#48] | |
1032 | add $out,$out,#64 | |
1033 | st1.8 {$A0-$D0},[$out],#64 | |
1034 | ||
1035 | ld1.8 {$A0-$D0},[$inp],#64 | |
1036 | eor $A1,$A1,$T0 | |
1037 | eor $B1,$B1,$T1 | |
1038 | eor $C1,$C1,$T2 | |
1039 | eor $D1,$D1,$T3 | |
1040 | st1.8 {$A1-$D1},[$out],#64 | |
1041 | ||
1042 | ld1.8 {$A1-$D1},[$inp],#64 | |
1043 | eor $A2,$A2,$A0 | |
1044 | ldp @K[0],@K[1],[sp,#0] | |
1045 | eor $B2,$B2,$B0 | |
1046 | ldp @K[2],@K[3],[sp,#32] | |
1047 | eor $C2,$C2,$C0 | |
1048 | eor $D2,$D2,$D0 | |
1049 | st1.8 {$A2-$D2},[$out],#64 | |
1050 | ||
1051 | ld1.8 {$A2-$D2},[$inp],#64 | |
1052 | eor $A3,$A3,$A1 | |
1053 | eor $B3,$B3,$B1 | |
1054 | eor $C3,$C3,$C1 | |
1055 | eor $D3,$D3,$D1 | |
1056 | st1.8 {$A3-$D3},[$out],#64 | |
1057 | ||
1058 | ld1.8 {$A3-$D3},[$inp],#64 | |
1059 | eor $A4,$A4,$A2 | |
1060 | eor $B4,$B4,$B2 | |
1061 | eor $C4,$C4,$C2 | |
1062 | eor $D4,$D4,$D2 | |
1063 | st1.8 {$A4-$D4},[$out],#64 | |
1064 | ||
1065 | shl $A0,$ONE,#1 // 4 -> 8 | |
1066 | eor $A5,$A5,$A3 | |
1067 | eor $B5,$B5,$B3 | |
1068 | eor $C5,$C5,$C3 | |
1069 | eor $D5,$D5,$D3 | |
1070 | st1.8 {$A5-$D5},[$out],#64 | |
1071 | ||
1072 | add @K[3],@K[3],$A0 // += 8 | |
1073 | add @K[4],@K[4],$A0 | |
1074 | add @K[5],@K[5],$A0 | |
1075 | add @K[6],@K[6],$A0 | |
1076 | ||
1077 | b.hs .Loop_outer_512_neon | |
1078 | ||
1079 | adds $len,$len,#512 | |
1080 | ushr $A0,$ONE,#2 // 4 -> 1 | |
1081 | ||
1082 | ldp d8,d9,[sp,#128+0] // meet ABI requirements | |
1083 | ldp d10,d11,[sp,#128+16] | |
1084 | ldp d12,d13,[sp,#128+32] | |
1085 | ldp d14,d15,[sp,#128+48] | |
1086 | ||
1087 | stp @K[0],$ONE,[sp,#0] // wipe off-load area | |
1088 | stp @K[0],$ONE,[sp,#32] | |
1089 | stp @K[0],$ONE,[sp,#64] | |
1090 | ||
1091 | b.eq .Ldone_512_neon | |
1092 | ||
1093 | cmp $len,#192 | |
1094 | sub @K[3],@K[3],$A0 // -= 1 | |
1095 | sub @K[4],@K[4],$A0 | |
1096 | sub @K[5],@K[5],$A0 | |
1097 | add sp,sp,#128 | |
1098 | b.hs .Loop_outer_neon | |
1099 | ||
1100 | eor @K[1],@K[1],@K[1] | |
1101 | eor @K[2],@K[2],@K[2] | |
1102 | eor @K[3],@K[3],@K[3] | |
1103 | eor @K[4],@K[4],@K[4] | |
1104 | eor @K[5],@K[5],@K[5] | |
1105 | eor @K[6],@K[6],@K[6] | |
1106 | b .Loop_outer | |
1107 | ||
1108 | .Ldone_512_neon: | |
1109 | ldp x19,x20,[x29,#16] | |
1110 | add sp,sp,#128+64 | |
1111 | ldp x21,x22,[x29,#32] | |
1112 | ldp x23,x24,[x29,#48] | |
1113 | ldp x25,x26,[x29,#64] | |
1114 | ldp x27,x28,[x29,#80] | |
1115 | ldp x29,x30,[sp],#96 | |
9a18aae5 | 1116 | .inst 0xd50323bf // autiasp |
1fdcef75 AP |
1117 | ret |
1118 | .size ChaCha20_512_neon,.-ChaCha20_512_neon | |
1119 | ___ | |
1120 | } | |
1121 | }}} | |
1122 | ||
1123 | foreach (split("\n",$code)) { | |
1124 | s/\`([^\`]*)\`/eval $1/geo; | |
1125 | ||
1126 | (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or | |
1127 | (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or | |
1128 | (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or | |
1129 | (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or | |
1130 | (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); | |
1131 | ||
1132 | #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; | |
1133 | ||
1134 | print $_,"\n"; | |
1135 | } | |
b9077d85 | 1136 | close STDOUT; # flush |