2 # Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv8.
20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
21 # sense to attempt a SIMD/NEON implementation for the following reason.
22 # 64-bit lanes of vector registers can't be addressed as easily as in
23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
25 # the same processor. Even though it takes more scalar xor's and andn's,
26 # this is compensated by the availability of rotate. Not to forget that
27 # most processors achieve a higher issue rate with scalar instructions.
31 # Add a hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
32 # variant with a register permutation/rotation twist that eliminates
33 # copies to temporary registers. If you look closely you'll
34 # notice that it uses only one lane of vector registers. The new
35 # instructions effectively facilitate parallel hashing, which we don't
36 # support [yet?]. But the lowest-level core procedure is prepared for it.
37 # The inner round is 67 [vector] instructions, so it's not actually
38 # obvious that it will provide a performance improvement [in serial
39 # hash] as long as the vector instruction issue rate is limited to 1 per
42 ######################################################################
43 # Numbers are cycles per processed byte.
56 # (*) Corresponds to SHA3-256. No improvement coefficients are listed
57 # because they vary too much from compiler to compiler. A newer
58 # compiler does much better, and the improvement varies from 5% on
59 # Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
60 # compiler this code is at least 2x faster...
62 # $output is the last argument if it looks like a file (it has an extension)
63 # $flavour is the first argument if it doesn't look like a file
# Command-line handling: $output is popped from the end of @ARGV when the last
# argument ends in an extension, $flavour is shifted from the front when the
# first argument contains no dot; either stays undef otherwise.
64 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m
|\
.\w
+$| ?
pop : undef;
65 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m
|\
.| ?
shift : undef;
# Derive the directory part of this script's path ($0) and look for
# arm-xlate.pl first in that directory, then in ../../perlasm/ relative to it;
# die if neither candidate is a plain file.
67 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
68 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
69 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
70 die "can't locate arm-xlate.pl";
# Open OUT as a pipe into arm-xlate.pl, run with the current perl interpreter
# ($^X) and given the flavour and the quoted output file name as arguments.
72 open OUT
,"| \"$^X\" $xlate $flavour \"$output\""
73 or die "can't call $xlate: $!";
# Per-lane rotation amounts, indexed with the same two subscripts as the state
# array @A. The Rho+Pi code emits "ror ...,#64-$rhotates[i][j]" (scalar path)
# and "xar ...,#64-$rhotates[i][j]" (vector path), i.e. a 64-bit rotate-left by
# $rhotates[i][j] expressed as a rotate-right by the complementary amount.
76 my @rhotates = ([ 0, 1, 62, 28, 27 ],
77 [ 36, 44, 6, 55, 20 ],
78 [ 3, 10, 43, 25, 39 ],
79 [ 41, 45, 15, 21, 8 ],
80 [ 18, 2, 61, 56, 14 ]);
87 .align
8 // strategic alignment
and padding that allows to
use
88 // address value as
loop termination condition
...
92 .quad
0x0000000000000001
93 .quad
0x0000000000008082
94 .quad
0x800000000000808a
95 .quad
0x8000000080008000
96 .quad
0x000000000000808b
97 .quad
0x0000000080000001
98 .quad
0x8000000080008081
99 .quad
0x8000000000008009
100 .quad
0x000000000000008a
101 .quad
0x0000000000000088
102 .quad
0x0000000080008009
103 .quad
0x000000008000000a
104 .quad
0x000000008000808b
105 .quad
0x800000000000008b
106 .quad
0x8000000000008089
107 .quad
0x8000000000008003
108 .quad
0x8000000000008002
109 .quad
0x8000000000000080
110 .quad
0x000000000000800a
111 .quad
0x800000008000000a
112 .quad
0x8000000080008081
113 .quad
0x8000000000008080
114 .quad
0x0000000080000001
115 .quad
0x8000000080008008
119 my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
# Pin lane A[3][3] to x25 instead of the register name the map above gave it.
121 $A[3][3] = "x25"; # x18 is reserved
# Register names used as temporaries by the scalar round code that follows.
# NOTE(review): the Theta code also references $C[4] and $C[5], which are not
# among the four names listed here; presumably they are assigned elsewhere in
# the file (not visible in this view) - confirm.
123 my @C = map("x$_", (26,27,28,30));
126 .type KeccakF1600_int
,%function
129 AARCH64_SIGN_LINK_REGISTER
131 stp
$C[2],x30
,[sp
,#16] // 32 bytes on top are mine
135 ////////////////////////////////////////// Theta
136 eor
$C[0],$A[0][0],$A[1][0]
137 stp
$A[0][4],$A[1][4],[sp
,#0] // offload pair...
138 eor
$C[1],$A[0][1],$A[1][1]
139 eor
$C[2],$A[0][2],$A[1][2]
140 eor
$C[3],$A[0][3],$A[1][3]
145 eor
$C[4],$A[0][4],$A[1][4]
146 eor
$C[0],$C[0],$A[2][0]
147 eor
$C[1],$C[1],$A[2][1]
148 eor
$C[2],$C[2],$A[2][2]
149 eor
$C[3],$C[3],$A[2][3]
150 eor
$C[4],$C[4],$A[2][4]
151 eor
$C[0],$C[0],$A[3][0]
152 eor
$C[1],$C[1],$A[3][1]
153 eor
$C[2],$C[2],$A[3][2]
154 eor
$C[3],$C[3],$A[3][3]
155 eor
$C[4],$C[4],$A[3][4]
156 eor
$C[0],$C[0],$A[4][0]
157 eor
$C[2],$C[2],$A[4][2]
158 eor
$C[1],$C[1],$A[4][1]
159 eor
$C[3],$C[3],$A[4][3]
160 eor
$C[4],$C[4],$A[4][4]
162 eor
$C[5],$C[0],$C[2],ror
#63
164 eor
$A[0][1],$A[0][1],$C[5]
165 eor
$A[1][1],$A[1][1],$C[5]
166 eor
$A[2][1],$A[2][1],$C[5]
167 eor
$A[3][1],$A[3][1],$C[5]
168 eor
$A[4][1],$A[4][1],$C[5]
170 eor
$C[5],$C[1],$C[3],ror
#63
171 eor
$C[2],$C[2],$C[4],ror
#63
172 eor
$C[3],$C[3],$C[0],ror
#63
173 eor
$C[4],$C[4],$C[1],ror
#63
175 eor
$C[1], $A[0][2],$C[5] // mov
$C[1],$A[0][2]
176 eor
$A[1][2],$A[1][2],$C[5]
177 eor
$A[2][2],$A[2][2],$C[5]
178 eor
$A[3][2],$A[3][2],$C[5]
179 eor
$A[4][2],$A[4][2],$C[5]
181 eor
$A[0][0],$A[0][0],$C[4]
182 eor
$A[1][0],$A[1][0],$C[4]
183 eor
$A[2][0],$A[2][0],$C[4]
184 eor
$A[3][0],$A[3][0],$C[4]
185 eor
$A[4][0],$A[4][0],$C[4]
190 ldp
$A[0][4],$A[1][4],[sp
,#0] // re-load offloaded data
191 eor
$C[0], $A[0][3],$C[2] // mov
$C[0],$A[0][3]
192 eor
$A[1][3],$A[1][3],$C[2]
193 eor
$A[2][3],$A[2][3],$C[2]
194 eor
$A[3][3],$A[3][3],$C[2]
195 eor
$A[4][3],$A[4][3],$C[2]
197 eor
$C[2], $A[0][4],$C[3] // mov
$C[2],$A[0][4]
198 eor
$A[1][4],$A[1][4],$C[3]
199 eor
$A[2][4],$A[2][4],$C[3]
200 eor
$A[3][4],$A[3][4],$C[3]
201 eor
$A[4][4],$A[4][4],$C[3]
203 ////////////////////////////////////////// Rho
+Pi
205 ror
$A[0][1],$A[1][1],#64-$rhotates[1][1]
207 ror
$A[0][2],$A[2][2],#64-$rhotates[2][2]
209 ror
$A[0][3],$A[3][3],#64-$rhotates[3][3]
211 ror
$A[0][4],$A[4][4],#64-$rhotates[4][4]
213 ror
$A[1][1],$A[1][4],#64-$rhotates[1][4]
214 ror
$A[2][2],$A[2][3],#64-$rhotates[2][3]
215 ror
$A[3][3],$A[3][2],#64-$rhotates[3][2]
216 ror
$A[4][4],$A[4][1],#64-$rhotates[4][1]
218 ror
$A[1][4],$A[4][2],#64-$rhotates[4][2]
219 ror
$A[2][3],$A[3][4],#64-$rhotates[3][4]
220 ror
$A[3][2],$A[2][1],#64-$rhotates[2][1]
221 ror
$A[4][1],$A[1][3],#64-$rhotates[1][3]
223 ror
$A[4][2],$A[2][4],#64-$rhotates[2][4]
224 ror
$A[3][4],$A[4][3],#64-$rhotates[4][3]
225 ror
$A[2][1],$A[1][2],#64-$rhotates[1][2]
226 ror
$A[1][3],$A[3][1],#64-$rhotates[3][1]
228 ror
$A[2][4],$A[4][0],#64-$rhotates[4][0]
229 ror
$A[4][3],$A[3][0],#64-$rhotates[3][0]
230 ror
$A[1][2],$A[2][0],#64-$rhotates[2][0]
231 ror
$A[3][1],$A[1][0],#64-$rhotates[1][0]
233 ror
$A[1][0],$C[0],#64-$rhotates[0][3]
234 ror
$A[2][0],$C[3],#64-$rhotates[0][1]
235 ror
$A[3][0],$C[2],#64-$rhotates[0][4]
236 ror
$A[4][0],$C[1],#64-$rhotates[0][2]
238 ////////////////////////////////////////// Chi
+Iota
239 bic
$C[0],$A[0][2],$A[0][1]
240 bic
$C[1],$A[0][3],$A[0][2]
241 bic
$C[2],$A[0][0],$A[0][4]
242 bic
$C[3],$A[0][1],$A[0][0]
243 eor
$A[0][0],$A[0][0],$C[0]
244 bic
$C[0],$A[0][4],$A[0][3]
245 eor
$A[0][1],$A[0][1],$C[1]
247 eor
$A[0][3],$A[0][3],$C[2]
248 eor
$A[0][4],$A[0][4],$C[3]
249 eor
$A[0][2],$A[0][2],$C[0]
250 ldr
$C[3],[$C[1]],#8 // Iota[i++]
252 bic
$C[0],$A[1][2],$A[1][1]
253 tst
$C[1],#255 // are we done?
255 bic
$C[1],$A[1][3],$A[1][2]
256 bic
$C[2],$A[1][0],$A[1][4]
257 eor
$A[0][0],$A[0][0],$C[3] // A
[0][0] ^= Iota
258 bic
$C[3],$A[1][1],$A[1][0]
259 eor
$A[1][0],$A[1][0],$C[0]
260 bic
$C[0],$A[1][4],$A[1][3]
261 eor
$A[1][1],$A[1][1],$C[1]
262 eor
$A[1][3],$A[1][3],$C[2]
263 eor
$A[1][4],$A[1][4],$C[3]
264 eor
$A[1][2],$A[1][2],$C[0]
266 bic
$C[0],$A[2][2],$A[2][1]
267 bic
$C[1],$A[2][3],$A[2][2]
268 bic
$C[2],$A[2][0],$A[2][4]
269 bic
$C[3],$A[2][1],$A[2][0]
270 eor
$A[2][0],$A[2][0],$C[0]
271 bic
$C[0],$A[2][4],$A[2][3]
272 eor
$A[2][1],$A[2][1],$C[1]
273 eor
$A[2][3],$A[2][3],$C[2]
274 eor
$A[2][4],$A[2][4],$C[3]
275 eor
$A[2][2],$A[2][2],$C[0]
277 bic
$C[0],$A[3][2],$A[3][1]
278 bic
$C[1],$A[3][3],$A[3][2]
279 bic
$C[2],$A[3][0],$A[3][4]
280 bic
$C[3],$A[3][1],$A[3][0]
281 eor
$A[3][0],$A[3][0],$C[0]
282 bic
$C[0],$A[3][4],$A[3][3]
283 eor
$A[3][1],$A[3][1],$C[1]
284 eor
$A[3][3],$A[3][3],$C[2]
285 eor
$A[3][4],$A[3][4],$C[3]
286 eor
$A[3][2],$A[3][2],$C[0]
288 bic
$C[0],$A[4][2],$A[4][1]
289 bic
$C[1],$A[4][3],$A[4][2]
290 bic
$C[2],$A[4][0],$A[4][4]
291 bic
$C[3],$A[4][1],$A[4][0]
292 eor
$A[4][0],$A[4][0],$C[0]
293 bic
$C[0],$A[4][4],$A[4][3]
294 eor
$A[4][1],$A[4][1],$C[1]
295 eor
$A[4][3],$A[4][3],$C[2]
296 eor
$A[4][4],$A[4][4],$C[3]
297 eor
$A[4][2],$A[4][2],$C[0]
302 AARCH64_VALIDATE_LINK_REGISTER
304 .size KeccakF1600_int
,.-KeccakF1600_int
306 .type KeccakF1600
,%function
309 AARCH64_SIGN_LINK_REGISTER
310 stp x29
,x30
,[sp
,#-128]!
319 str x0
,[sp
,#32] // offload argument
321 ldp
$A[0][0],$A[0][1],[x0
,#16*0]
322 ldp
$A[0][2],$A[0][3],[$C[0],#16*1]
323 ldp
$A[0][4],$A[1][0],[$C[0],#16*2]
324 ldp
$A[1][1],$A[1][2],[$C[0],#16*3]
325 ldp
$A[1][3],$A[1][4],[$C[0],#16*4]
326 ldp
$A[2][0],$A[2][1],[$C[0],#16*5]
327 ldp
$A[2][2],$A[2][3],[$C[0],#16*6]
328 ldp
$A[2][4],$A[3][0],[$C[0],#16*7]
329 ldp
$A[3][1],$A[3][2],[$C[0],#16*8]
330 ldp
$A[3][3],$A[3][4],[$C[0],#16*9]
331 ldp
$A[4][0],$A[4][1],[$C[0],#16*10]
332 ldp
$A[4][2],$A[4][3],[$C[0],#16*11]
333 ldr
$A[4][4],[$C[0],#16*12]
338 stp
$A[0][0],$A[0][1],[$C[0],#16*0]
339 stp
$A[0][2],$A[0][3],[$C[0],#16*1]
340 stp
$A[0][4],$A[1][0],[$C[0],#16*2]
341 stp
$A[1][1],$A[1][2],[$C[0],#16*3]
342 stp
$A[1][3],$A[1][4],[$C[0],#16*4]
343 stp
$A[2][0],$A[2][1],[$C[0],#16*5]
344 stp
$A[2][2],$A[2][3],[$C[0],#16*6]
345 stp
$A[2][4],$A[3][0],[$C[0],#16*7]
346 stp
$A[3][1],$A[3][2],[$C[0],#16*8]
347 stp
$A[3][3],$A[3][4],[$C[0],#16*9]
348 stp
$A[4][0],$A[4][1],[$C[0],#16*10]
349 stp
$A[4][2],$A[4][3],[$C[0],#16*11]
350 str
$A[4][4],[$C[0],#16*12]
352 ldp x19
,x20
,[x29
,#16]
354 ldp x21
,x22
,[x29
,#32]
355 ldp x23
,x24
,[x29
,#48]
356 ldp x25
,x26
,[x29
,#64]
357 ldp x27
,x28
,[x29
,#80]
358 ldp x29
,x30
,[sp
],#128
359 AARCH64_VALIDATE_LINK_REGISTER
361 .size KeccakF1600
,.-KeccakF1600
364 .type SHA3_absorb
,%function
367 AARCH64_SIGN_LINK_REGISTER
368 stp x29
,x30
,[sp
,#-128]!
377 stp x0
,x1
,[sp
,#32] // offload arguments
380 mov
$C[0],x0
// uint64_t A
[5][5]
381 mov
$C[1],x1
// const void
*inp
382 mov
$C[2],x2
// size_t len
383 mov
$C[3],x3
// size_t bsz
384 ldp
$A[0][0],$A[0][1],[$C[0],#16*0]
385 ldp
$A[0][2],$A[0][3],[$C[0],#16*1]
386 ldp
$A[0][4],$A[1][0],[$C[0],#16*2]
387 ldp
$A[1][1],$A[1][2],[$C[0],#16*3]
388 ldp
$A[1][3],$A[1][4],[$C[0],#16*4]
389 ldp
$A[2][0],$A[2][1],[$C[0],#16*5]
390 ldp
$A[2][2],$A[2][3],[$C[0],#16*6]
391 ldp
$A[2][4],$A[3][0],[$C[0],#16*7]
392 ldp
$A[3][1],$A[3][2],[$C[0],#16*8]
393 ldp
$A[3][3],$A[3][4],[$C[0],#16*9]
394 ldp
$A[4][0],$A[4][1],[$C[0],#16*10]
395 ldp
$A[4][2],$A[4][3],[$C[0],#16*11]
396 ldr
$A[4][4],[$C[0],#16*12]
401 subs
$C[0],$C[2],$C[3] // len
- bsz
404 str
$C[0],[sp
,#48] // save len - bsz
406 for (my $i=0; $i<24; $i+=2) {
409 ldr
$C[0],[$C[1]],#8 // *inp++
413 eor
$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
416 ldr
$C[0],[$C[1]],#8 // *inp++
420 eor
$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
425 ldr
$C[0],[$C[1]],#8 // *inp++
429 eor
$A[4][4],$A[4][4],$C[0]
432 str
$C[1],[sp
,#40] // save inp
436 ldr
$C[1],[sp
,#40] // restore arguments
437 ldp
$C[2],$C[3],[sp
,#48]
443 stp
$A[0][0],$A[0][1],[$C[1],#16*0]
444 stp
$A[0][2],$A[0][3],[$C[1],#16*1]
445 stp
$A[0][4],$A[1][0],[$C[1],#16*2]
446 stp
$A[1][1],$A[1][2],[$C[1],#16*3]
447 stp
$A[1][3],$A[1][4],[$C[1],#16*4]
448 stp
$A[2][0],$A[2][1],[$C[1],#16*5]
449 stp
$A[2][2],$A[2][3],[$C[1],#16*6]
450 stp
$A[2][4],$A[3][0],[$C[1],#16*7]
451 stp
$A[3][1],$A[3][2],[$C[1],#16*8]
452 stp
$A[3][3],$A[3][4],[$C[1],#16*9]
453 stp
$A[4][0],$A[4][1],[$C[1],#16*10]
454 stp
$A[4][2],$A[4][3],[$C[1],#16*11]
455 str
$A[4][4],[$C[1],#16*12]
457 mov x0
,$C[2] // return value
458 ldp x19
,x20
,[x29
,#16]
460 ldp x21
,x22
,[x29
,#32]
461 ldp x23
,x24
,[x29
,#48]
462 ldp x25
,x26
,[x29
,#64]
463 ldp x27
,x28
,[x29
,#80]
464 ldp x29
,x30
,[sp
],#128
465 AARCH64_VALIDATE_LINK_REGISTER
467 .size SHA3_absorb
,.-SHA3_absorb
# Symbolic names for x19..x22; SHA3_squeeze copies its incoming arguments into
# these registers (see "mov $A_flat,x0 // put aside arguments" below).
470 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
473 .type SHA3_squeeze
,%function
476 AARCH64_SIGN_LINK_REGISTER
477 stp x29
,x30
,[sp
,#-48]!
482 mov
$A_flat,x0
// put aside arguments
539 AARCH64_VALIDATE_LINK_REGISTER
541 .size SHA3_squeeze
,.-SHA3_squeeze
545 my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
546 "v".($_+3).".16b", "v".($_+4).".16b" ],
# Vector register names v25..v31 (as .16b operands) used as working registers
# by the KeccakF1600_ce code below.
549 my @C = map("v$_.16b", (25..31));
# @D re-labels five of the @C registers: D[0]=C[4], D[1]=C[5], D[2]=C[6],
# D[3]=C[2], D[4]=C[3]. This matches the "// D[n]" notes on the rax1
# instructions in the Theta step, whose results are then read back as $D[n].
550 my @D = @C[4,5,6,2,3];
553 .type KeccakF1600_ce
,%function
561 ////////////////////////////////////////////////// Theta
562 eor3
$C[0],$A[4][0],$A[3][0],$A[2][0]
563 eor3
$C[1],$A[4][1],$A[3][1],$A[2][1]
564 eor3
$C[2],$A[4][2],$A[3][2],$A[2][2]
565 eor3
$C[3],$A[4][3],$A[3][3],$A[2][3]
566 eor3
$C[4],$A[4][4],$A[3][4],$A[2][4]
567 eor3
$C[0],$C[0], $A[1][0],$A[0][0]
568 eor3
$C[1],$C[1], $A[1][1],$A[0][1]
569 eor3
$C[2],$C[2], $A[1][2],$A[0][2]
570 eor3
$C[3],$C[3], $A[1][3],$A[0][3]
571 eor3
$C[4],$C[4], $A[1][4],$A[0][4]
573 rax1
$C[5],$C[0],$C[2] // D
[1]
574 rax1
$C[6],$C[1],$C[3] // D
[2]
575 rax1
$C[2],$C[2],$C[4] // D
[3]
576 rax1
$C[3],$C[3],$C[0] // D
[4]
577 rax1
$C[4],$C[4],$C[1] // D
[0]
579 ////////////////////////////////////////////////// Theta
+Rho
+Pi
580 xar
$C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
582 xar
$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
583 xar
$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
584 xar
$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
585 xar
$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
586 xar
$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
588 xar
$C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
590 xar
$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
591 xar
$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
592 xar
$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
593 xar
$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
594 xar
$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
596 xar
$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
598 xar
$D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
599 xar
$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
600 xar
$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
601 xar
$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
602 xar
$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
604 xar
$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
606 eor
$A[0][0],$A[0][0],$D[0]
608 xar
$D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
609 xar
$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
610 xar
$D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
611 xar
$D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
612 xar
$D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
614 ////////////////////////////////////////////////// Chi
+Iota
615 bcax
$A[4][0],$C[1], $A[4][2],$A[1][3] // A
[1][3]=A
[4][1]
616 bcax
$A[4][1],$A[1][3],$A[4][3],$A[4][2] // A
[1][3]=A
[4][1]
617 bcax
$A[4][2],$A[4][2],$A[4][4],$A[4][3]
618 bcax
$A[4][3],$A[4][3],$C[1], $A[4][4]
619 bcax
$A[4][4],$A[4][4],$A[1][3],$C[1] // A
[1][3]=A
[4][1]
621 ld1r
{$C[1]},[x10
],#8
623 bcax
$A[3][2],$D[1], $A[3][4],$A[0][3] // A
[0][3]=A
[3][3]
624 bcax
$A[3][3],$A[0][3],$A[3][0],$A[3][4] // A
[0][3]=A
[3][3]
625 bcax
$A[3][4],$A[3][4],$A[3][1],$A[3][0]
626 bcax
$A[3][0],$A[3][0],$D[1], $A[3][1]
627 bcax
$A[3][1],$A[3][1],$A[0][3],$D[1] // A
[0][3]=A
[3][3]
629 bcax
$A[2][0],$C[0], $A[2][2],$D[2]
630 bcax
$A[2][1],$D[2], $A[2][3],$A[2][2]
631 bcax
$A[2][2],$A[2][2],$A[2][4],$A[2][3]
632 bcax
$A[2][3],$A[2][3],$C[0], $A[2][4]
633 bcax
$A[2][4],$A[2][4],$D[2], $C[0]
635 bcax
$A[1][2],$D[0], $A[1][4],$A[0][4] // A
[0][4]=A
[1][3]
636 bcax
$A[1][3],$A[0][4],$A[1][0],$A[1][4] // A
[0][4]=A
[1][3]
637 bcax
$A[1][4],$A[1][4],$A[1][1],$A[1][0]
638 bcax
$A[1][0],$A[1][0],$D[0], $A[1][1]
639 bcax
$A[1][1],$A[1][1],$A[0][4],$D[0] // A
[0][4]=A
[1][3]
641 bcax
$A[0][3],$D[3], $A[0][0],$D[4]
642 bcax
$A[0][4],$D[4], $A[0][1],$A[0][0]
643 bcax
$A[0][0],$A[0][0],$A[0][2],$A[0][1]
644 bcax
$A[0][1],$A[0][1],$D[3], $A[0][2]
645 bcax
$A[0][2],$A[0][2],$D[4], $D[3]
647 eor
$A[0][0],$A[0][0],$C[1]
653 .size KeccakF1600_ce
,.-KeccakF1600_ce
655 .type KeccakF1600_cext
,%function
658 AARCH64_SIGN_LINK_REGISTER
659 stp x29
,x30
,[sp
,#-80]!
661 stp d8
,d9
,[sp
,#16] // per ABI requirement
666 for($i=0; $i<24; $i+=2) { # load A[5][5]
669 ldp d
$i,d
$j,[x0
,#8*$i]
677 for($i=0; $i<24; $i+=2) { # store A[5][5]
680 stp d
$i,d
$j,[x0
,#8*$i]
691 AARCH64_VALIDATE_LINK_REGISTER
693 .size KeccakF1600_cext
,.-KeccakF1600_cext
# Symbolic names for x0..x3, used directly as operands in the SHA3_absorb_cext
# code below (e.g. "subs $len,$len,$bsz" and "ldr d31,[$inp],#8").
697 my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
700 .globl SHA3_absorb_cext
701 .type SHA3_absorb_cext
,%function
704 AARCH64_SIGN_LINK_REGISTER
705 stp x29
,x30
,[sp
,#-80]!
707 stp d8
,d9
,[sp
,#16] // per ABI requirement
712 for($i=0; $i<24; $i+=2) { # load A[5][5]
715 ldp d
$i,d
$j,[x0
,#8*$i]
724 subs
$len,$len,$bsz // len
- bsz
727 for (my $i=0; $i<24; $i+=2) {
730 ldr d31
,[$inp],#8 // *inp++
732 rev64 v31
.16b
,v31
.16b
734 eor
$A[$i/5][$i%5],$A[$i/5][$i%5],v31
.16b
736 blo
.Lprocess_block_ce
737 ldr d31
,[$inp],#8 // *inp++
739 rev64 v31
.16b
,v31
.16b
741 eor
$A[$j/5][$j%5],$A[$j/5][$j%5],v31
.16b
742 beq
.Lprocess_block_ce
746 ldr d31
,[$inp],#8 // *inp++
748 rev64 v31
.16b
,v31
.16b
750 eor
$A[4][4],$A[4][4],v31
.16b
761 for($i=0; $i<24; $i+=2) { # store A[5][5]
764 stp d
$i,d
$j,[x0
,#8*$i]
769 add x0
,$len,$bsz // return value
776 AARCH64_VALIDATE_LINK_REGISTER
778 .size SHA3_absorb_cext
,.-SHA3_absorb_cext
# Symbolic names for x0..x3.
# NOTE(review): presumably the operands of the SHA3_squeeze_cext code that
# follows; the instructions using them are not visible in this view - confirm.
782 my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
784 .globl SHA3_squeeze_cext
785 .type SHA3_squeeze_cext
,%function
788 AARCH64_SIGN_LINK_REGISTER
789 stp x29
,x30
,[sp
,#-16]!
797 blo
.Lsqueeze_tail_ce
802 beq
.Lsqueeze_done_ce
819 beq
.Lsqueeze_done_ce
823 beq
.Lsqueeze_done_ce
827 beq
.Lsqueeze_done_ce
831 beq
.Lsqueeze_done_ce
835 beq
.Lsqueeze_done_ce
839 beq
.Lsqueeze_done_ce
844 AARCH64_VALIDATE_LINK_REGISTER
846 .size SHA3_squeeze_cext
,.-SHA3_squeeze_cext
850 .asciz
"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
854 "rax1" => 0xce608c00, "eor3" => 0xce000000,
855 "bcax" => 0xce200000, "xar" => 0xce800000 );
858 my ($mnemonic,$arg)=@_;
860 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
862 sprintf ".inst\t0x%08x\t//%s %s",
863 $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
868 foreach(split("\n",$code)) {
870 s/\`([^\`]*)\`/eval($1)/ge;
872 m/\bld1r\b/ and s/\.16b/.2d/g or
873 s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
878 close STDOUT
or die "error closing STDOUT: $!";