]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-armv8.pl
sha/asm/keccak1600-armv8.pl: add hardware-assisted ARMv8.2 subroutines.
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-armv8.pl
1 #!/usr/bin/env perl
2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for ARMv8.
17 #
18 # June 2017.
19 #
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason:
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# the same processor. Even though it takes more scalar xor's and andn's,
# this is compensated by the availability of rotate. Not to forget that
# most processors achieve a higher issue rate with scalar instructions.
28 #
29 # February 2018.
30 #
# Add a hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that makes it
# possible to eliminate copies to temporary registers. If you look
# closely you'll notice that it uses only one lane of the vector
# registers. The new instructions effectively facilitate parallel
# hashing, which we don't support [yet?]. But the lowest-level core
# procedure is prepared for it. The inner round is 67 [vector]
# instructions, so it's not actually obvious that it will provide a
# performance improvement [in serial hash] as long as the vector
# instruction issue rate is limited to 1 per cycle...
41 #
42 ######################################################################
43 # Numbers are cycles per processed byte.
44 #
45 # r=1088(*)
46 #
47 # Cortex-A53 13
48 # Cortex-A57 12
49 # X-Gene 14
50 # Mongoose 10
51 # Kryo 12
52 # Denver 7.8
53 # Apple A7 7.2
54 #
55 # (*) Corresponds to SHA3-256. No improvement coefficients are listed
56 # because they vary too much from compiler to compiler. Newer
57 # compiler does much better and improvement varies from 5% on
58 # Cortex-A57 to 25% on Cortex-A53. While in comparison to older
59 # compiler this code is at least 2x faster...
60
# Usage: keccak1600-armv8.pl <flavour> <output>
#
# Both arguments are passed on unchanged to arm-xlate.pl, which receives
# the generated assembly on its standard input.
$flavour = shift;
$output  = shift;

# arm-xlate.pl is searched for next to this script first, then in
# ../../perlasm/ relative to it.  If the script was started without a
# directory component, the search is relative to the current directory
# (an explicit empty prefix instead of whatever a stale $1 happens to hold).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything printed to STDOUT goes to the translator.
# The translator path is quoted so that directories containing blanks
# work, and a failure to start it is fatal instead of being ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
71
# Rho rotation amounts.  The round code looks them up with the same two
# indices as the state lane they are applied to, e.g. the lane held in
# A[1][1] is rotated using $rhotates[1][1].
my @rhotates = (
    [  0,  1, 62, 28, 27 ],
    [ 36, 44,  6, 55, 20 ],
    [  3, 10, 43, 25, 39 ],
    [ 41, 45, 15, 21,  8 ],
    [ 18,  2, 61, 56, 14 ],
);

# Iota constants, one per round, in the order the rounds consume them.
my @iotas = qw(
    0x0000000000000001 0x0000000000008082
    0x800000000000808a 0x8000000080008000
    0x000000000000808b 0x0000000080000001
    0x8000000080008081 0x8000000000008009
    0x000000000000008a 0x0000000000000088
    0x0000000080008009 0x000000008000000a
    0x000000008000808b 0x800000000000008b
    0x8000000000008089 0x8000000000008003
    0x8000000000008002 0x8000000000000080
    0x000000000000800a 0x800000008000000a
    0x8000000080008081 0x8000000000008080
    0x0000000080000001 0x8000000080008008
);
my $iota_quads = join("\n", map { "\t.quad\t$_" } @iotas);

# The table is emitted behind eight zero quads of padding.  As the
# assembly comment says, alignment plus padding let the scalar round loop
# use the table pointer itself as the termination condition (it tests the
# low eight bits of the pointer after each post-incremented load).
$code.=<<___;
.text

.align	8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
$iota_quads
.size	iotas,.-iotas
___
{{{
# Scalar implementation.  The 25 state lanes live in general-purpose
# registers, five consecutive registers per row starting at x0, x5, x10,
# x15 and x20; A[3][3] is moved out of x18 into x25.
my @A = map { my $first = $_; [ map { "x" . ($first + $_) } 0 .. 4 ] }
            (0, 5, 10, 15, 20);
$A[3][3] = "x25";		# x18 is reserved

# Scratch registers.  x30 is one of them, which is why KeccakF1600_int
# keeps the link register on the stack while the rounds are running.
my @C = map { "x$_" } (26, 27, 28, 30);

# Returns the 13 instructions that move all 25 lanes between the A
# registers and memory in row-major order: twelve $pair_op (ldp/stp)
# on consecutive lane pairs followed by one $single_op (ldr/str) for
# A[4][4].  $first_base is the address register used by the very first
# instruction only, $base the one used by all the others.
my $move_state = sub {
    my ($pair_op, $single_op, $base, $first_base) = @_;
    my @lanes = map { @$_ } @A;
    my $asm = "";
    foreach my $k (0 .. 11) {
	$asm .= sprintf("\t%s\t%s,%s,[%s,#16*%d]\n", $pair_op,
			$lanes[2*$k], $lanes[2*$k+1],
			$k ? $base : $first_base, $k);
    }
    return $asm . sprintf("\t%s\t%s,[%s,#16*12]\n",
			  $single_op, $lanes[24], $base);
};

######################################################################
# KeccakF1600_int: the permutation proper, operating on the state held
# in the A registers.  It uses the 32 bytes at [sp] as scratch: a spill
# slot for A[0][4]/A[1][4] at offset 0, the iotas pointer at offset 16
# and the caller's x30 at offset 24.
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
# With A[0][4] and A[1][4] spilled, their registers serve as two extra
# temporaries for the remainder of Theta.
@C[4,5] = ($A[0][4], $A[1][4]);
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1], $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
# The borrowed registers go back to being A[0][4]/A[1][4]; the next
# instruction reloads them from the spill slot.
@C[4,5] = (undef, undef);
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0], $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2], $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	tst	$C[1],#255			// are we done?
	str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

___
# Chi for rows 2..4 is the same ten-instruction pattern for every row
# (rows 0 and 1 above have the Iota handling interleaved with it).
foreach my $row (@A[2 .. 4]) {
    my ($r0, $r1, $r2, $r3, $r4) = @$row;
    $code.=<<___;
	bic	$C[0],$r2,$r1
	bic	$C[1],$r3,$r2
	bic	$C[2],$r0,$r4
	bic	$C[3],$r1,$r0
	eor	$r0,$r0,$C[0]
	bic	$C[0],$r4,$r3
	eor	$r1,$r1,$C[1]
	eor	$r3,$r3,$C[2]
	eor	$r4,$r4,$C[3]
	eor	$r2,$r2,$C[0]

___
}
$code.=<<___;
	bne	.Loop

	ldr	x30,[sp,#24]
	ret
.size	KeccakF1600_int,.-KeccakF1600_int
___

######################################################################
# KeccakF1600: takes the state pointer in x0, loads the state into the
# A registers, runs KeccakF1600_int and writes the state back.  x19-x28
# and the frame record are saved around it; the pointer itself is parked
# at [sp,#32], above KeccakF1600_int's scratch area.
$code.=<<___;

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
___
$code .= $move_state->("ldp", "ldr", $C[0], "x0");
$code.=<<___;

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
___
$code .= $move_state->("stp", "str", $C[0], $C[0]);
$code.=<<___;

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	KeccakF1600,.-KeccakF1600
___

######################################################################
# SHA3_absorb, first part: prologue, loading of the state and the head
# of the per-block loop.  The four arguments (state, inp, len, bsz) are
# copied to the scratch registers and also kept at [sp,#32..#63]; the
# loop leaves through .Labsorbed as soon as len < bsz, otherwise it
# stores len - bsz in the len slot and falls into the lane-absorbing
# code that follows.
$code.=<<___;

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
___
$code .= $move_state->("ldp", "ldr", $C[0], $C[0]);
$code.=<<___;
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# SHA3_absorb, second part.  One input word is loaded, byte-swapped on
# big-endian builds and XORed into each state lane in row-major order.
# After every even lane the block size is compared with the number of
# bytes two lanes further on: "lower" means the block ends with the lane
# just absorbed, "equal" (tested after the following odd lane, flags
# still intact) means it ends with that one.  Lane 24 needs no test.
foreach my $lane (0 .. 24) {
    my $reg = $A[int($lane / 5)][$lane % 5];
    $code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$reg,$reg,$C[0]
___
    last if $lane == 24;
    if ($lane % 2 == 0) {
	$code.=<<___;
	cmp	$C[3],#8*($lane+2)
	blo	.Lprocess_block
___
    } else {
	$code.=<<___;
	beq	.Lprocess_block
___
    }
}

# Run the permutation on the freshly absorbed block, then pick up inp,
# the remaining length and the block size from their stack slots again.
$code.=<<___;

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
___

# Write the state back through the saved state pointer: twelve pairs of
# consecutive lanes and finally A[4][4] on its own.
{
    my @absorbed = map { @$_ } @A;
    foreach my $k (0 .. 11) {
	$code .= sprintf("\tstp\t%s,%s,[%s,#16*%d]\n",
			 $absorbed[2*$k], $absorbed[2*$k+1], $C[1], $k);
    }
    $code .= sprintf("\tstr\t%s,[%s,#16*12]\n", $absorbed[24], $C[1]);
}

# Return the number of input bytes left over and unwind the frame.
$code.=<<___;

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze: copies output bytes out of the state and calls KeccakF1600
# whenever a whole block has been consumed and more output is wanted.
# The four arguments are set aside in callee-saved registers because
# KeccakF1600 is called from inside the loop.
my ($A_flat,$out,$len,$bsz) = map { "x$_" } (19 .. 22);

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
___

# Tail of fewer than eight bytes: up to seven single-byte stores, least
# significant byte first, leaving as soon as the length reaches zero.
my $emit_byte = "\tstrb\tw4,[$out],#1\n";
my $advance   = "\tlsr\tx4,x4,#8\n"
	      . "\tsubs\t$len,$len,#1\n"
	      . "\tbeq\t.Lsqueeze_done\n";
$code .= join($advance, ($emit_byte) x 7);

$code.=<<___;

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
} }}}
{{{
# Implementation on top of the ARMv8.2 SHA3 instructions.  The 25 lanes
# are kept in v0-v24 and v25-v31 serve as temporaries.  Operands are
# written with a .16b arrangement here; the final output pass rewrites
# the SHA3 mnemonics into .inst words and switches "dup" lines to .2d.
my @A = map { my $first = $_;
	      [ map { "v" . ($first + $_) . ".16b" } 0 .. 4 ] }
	    (0, 5, 10, 15, 20);

my @C = map { "v$_.16b" } (25 .. 31);

# Returns the instructions that transfer the state between d0-d24 and
# the memory x0 points at: twelve $pair_op (ldp/stp) on consecutive
# registers and one $single_op (ldr/str) for d24.
my $move_state = sub {
    my ($pair_op, $single_op) = @_;
    my $asm = "";
    for (my $n = 0; $n < 24; $n += 2) {
	$asm .= sprintf("\t%s\td%d,d%d,[x0,#8*%d]\n",
			$pair_op, $n, $n + 1, $n);
    }
    return $asm . sprintf("\t%s\td24,[x0,#8*24]\n", $single_op);
};

######################################################################
# KeccakF1600_ce: the permutation on the state held in v0-v24.  The
# loop body emitted below contains four rounds and is executed six
# times; x9 is the pass counter, x10 walks the iotas table and x11
# carries the current constant.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#6
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
foreach my $round (1 .. 4) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0], $A[3][0],$A[4][0]
	eor3	$C[1],$C[1], $A[3][1],$A[4][1]
	eor3	$C[2],$C[2], $A[3][2],$A[4][2]
	eor3	$C[3],$C[3], $A[3][3],$A[4][3]
	eor3	$C[4],$C[4], $A[3][4],$A[4][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8

	xar	$C[1] ,$A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // *

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$C[2],#64-$rhotates[0][3] // *

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3], $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0], $C[1], $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1], $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0], $A[0][0]

	bcax	$C[0], $A[1][0],$A[1][2],$A[1][1]	// *
	bcax	$C[1], $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$A[1][0],$A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[2], $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[3], $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]

	bcax	$A[2][0],$A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$A[2][1],$A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]

	bcax	$A[3][0],$A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$A[3][1],$A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
    # Chi left lanes 0 and 1 of rows 1..4 in registers other than the
    # ones named for them (the lines marked "*").  Instead of moving
    # data, rename on the Perl side: rows 1 and C[0..1] trade places,
    # and rows 2, 3, 4 and C[2..3] rotate by one position.  The first
    # renaming undoes itself every two rounds, the second every four,
    # so at the bottom of the loop body the original naming is back.
    ($A[1][0], $A[1][1],
     $C[0],    $C[1])
	= ($C[0],    $C[1],
	   $A[1][0], $A[1][1]);

    ($A[2][0], $A[2][1],
     $A[3][0], $A[3][1],
     $A[4][0], $A[4][1],
     $C[2],    $C[3])
	= ($C[2],    $C[3],
	   $A[2][0], $A[2][1],
	   $A[3][0], $A[3][1],
	   $A[4][0], $A[4][1]);
}
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce
___

######################################################################
# KeccakF1600_cext: wrapper around KeccakF1600_ce that takes the state
# pointer in x0.  d8-d15 are preserved around the call, and x30 is
# reloaded from the frame record after the nested call.
$code.=<<___;

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
$code .= $move_state->("ldp", "ldr");		# load A[5][5]
$code.=<<___;
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
$code .= $move_state->("stp", "str");		# store A[5][5]
$code.=<<___;

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___
687
{
# SHA3_absorb_cext(ctx, inp, len, bsz): absorbs as many whole bsz-byte
# blocks from inp as len allows into the state at ctx, using the SHA3
# instruction path, and returns the number of bytes left over.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for (my $i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[$ctx,#8*$i]
___
}
# The offset of the 25th lane is spelled out rather than taken from a
# loop counter that happens to be left at 24.
$code.=<<___;
	ldr	d24,[$ctx,#8*24]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___

# XOR one input word into each lane in row-major order.  After every
# even lane bsz is compared with the byte count two lanes further on:
# "lower" ends the block at the lane just absorbed, "equal" (tested
# after the next, odd lane) ends it there.  Lane 24 needs no test.
#
# On big-endian builds each word has to be byte-swapped first.  v31 is a
# vector register, so this must be rev64; plain "rev" only exists for
# general-purpose registers and does not assemble with vector operands.
foreach my $lane (0 .. 24) {
    my $reg = $A[int($lane / 5)][$lane % 5];
    $code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$reg,$reg,v31.16b
___
    last if $lane == 24;
    if ($lane % 2 == 0) {
	$code.=<<___;
	cmp	$bsz,#8*($lane+2)
	blo	.Lprocess_block_ce
___
    } else {
	$code.=<<___;
	beq	.Lprocess_block_ce
___
    }
}
$code.=<<___;

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for (my $i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[$ctx,#8*$i]
___
}
# len went negative in the final subs; adding bsz back gives the number
# of unprocessed bytes.  x30 was clobbered by the bl above and is
# restored together with x29.
$code.=<<___;
	str	d24,[$ctx,#8*24]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
# SHA3_squeeze_cext: copies output bytes out of the state at ctx and
# calls KeccakF1600_cext each time a whole block has been consumed and
# more output is still wanted.  x9 walks the state and x10 counts down
# the block; both are re-initialised after every permutation call.
my ($ctx,$out,$len,$bsz) = map { "x$_" } (0 .. 3);

$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
___

# Tail of fewer than eight bytes: up to seven single-byte stores, least
# significant byte first, leaving as soon as the length reaches zero.
my $emit_byte = "\tstrb\tw4,[$out],#1\n";
my $advance   = "\tlsr\tx4,x4,#8\n"
	      . "\tsubs\t$len,$len,#1\n"
	      . "\tbeq\t.Lsqueeze_done_ce\n";
$code .= join($advance, ($emit_byte) x 7);

$code.=<<___;

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
} }}}
# Identification string embedded in the object file.
$code .= qq{.asciz\t"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"\n};
840
{   # Base opcodes of the four SHA3-extension instructions used above.
    my %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $operands)
    #
    # Encodes one SHA3-extension instruction by hand so that assemblers
    # without support for it can still build the module.  $operands is
    # the text following the mnemonic: two to four comma-separated
    # operands, the first three being q/v registers and the fourth either
    # a register or a "#"-prefixed immediate written with digits and
    # minus signs (e.g. "#64-44").  Anything after the last recognised
    # operand, such as a trailing comment, is ignored by the parser but
    # repeated in the output.
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <operands>".  The first
    # operand lands in bits 0-4, the second in bits 5-9, the third in
    # bits 16-20 and the fourth (after arithmetic evaluation) in the
    # field starting at bit 10.
    #
    # Dies on an unknown mnemonic, on operands that cannot be parsed and
    # on a fourth operand outside 0..63.  Returning an empty string in
    # these cases would make the caller drop the instruction from the
    # output without any diagnostic.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	defined $opcode{$mnemonic}
	    or die "unsha3: unsupported mnemonic '$mnemonic'\n";

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or die "unsha3: can't parse operands in '$mnemonic $arg'\n";

	# Copy the captures right away, before anything can reset them.
	my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4);

	$rm = 0 unless defined $rm;

	# The pattern restricts the text to digits and '-', so the string
	# eval can only ever see a small arithmetic expression.
	$last = defined $last ? eval($last) : 0;
	(defined $last and $last >= 0 and $last <= 63)
	    or die "unsha3: bad last operand in '$mnemonic $arg'\n";

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($last<<10),
			$mnemonic,$arg;
    }
}
855
# Final pass over the generated text, one line at a time:
#   - `...` spans are replaced by the result of evaluating them as Perl;
#   - on "dup" lines the .16b arrangement is rewritten to .2d;
#   - on all other lines (and on a "dup" line on which nothing was
#     rewritten) eor3/rax1/xar/bcax with vector operands are replaced by
#     the hand-encoded .inst word from unsha3().
foreach my $line (split("\n",$code)) {

	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	unless ($line =~ m/\bdup\b/ and $line =~ s/\.16b/.2d/g) {
	    $line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
	}

	print $line,"\n";
}

# STDOUT is the pipe into arm-xlate.pl.  Buffered write errors and a
# non-zero exit status of the translator only become visible at close
# time, so a failure here has to fail the whole generation step.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
864 }
865
866 close STDOUT;