git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-armv4.pl
Update copyright year
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-armv4.pl
1 #!/usr/bin/env perl
2 # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for ARMv4.
17 #
18 # June 2017.
19 #
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
27 #
28 # August 2017.
29 #
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
34 # eliminate corresponding instructions. As for latter. When examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
37 # going on. Just in case, why not all rotates are eliminated. Trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
45 # time being]...
46 #
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51 # usual...
52 #
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
56 #
57 # r=1088(*) Thumb-2(**) NEON
58 #
59 # ARM11xx 82/+150%
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
66 #
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
72
# Process command-line arguments: the first argument is either the
# assembler "flavour" (e.g. linux32, ios32) or, if it looks like a file
# name, the output file itself.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the arm-xlate.pl translator next to this script or in the
    # shared perlasm directory.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe all generated code through the translator; fail loudly if the
    # pipe cannot be opened instead of silently emitting nothing.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write raw output directly.
    open STDOUT,">$output" or die "can't open $output: $!";
}
87
# Scratch-register assignment: ten "C" registers hold interleaved lane
# halves being combined, four "E" registers serve as temporaries
# (r13=sp is the state base, r14 doubles as a temporary).
my @C = map { "r$_" } (0 .. 9);
my @E = map { "r$_" } (10 .. 12, 14);

########################################################################
# Stack frame layout (byte offsets from sp):
#   ----->+-----------------------+
#         | uint64_t A[5][5]      |
#         | ...                   |
#   +200->+-----------------------+
#         | uint64_t D[5]         |
#         | ...                   |
#   +240->+-----------------------+
#         | uint64_t T[5][5]      |
#         | ...                   |
#   +440->+-----------------------+
#         | saved lr              |
#   +444->+-----------------------+
#         | loop counter          |
#   +448->+-----------------------+
#         | ...

# sp-relative byte offsets of the 64-bit lanes of A[5][5], D[5] and
# T[5][5]; rows start at lane indices 0,5,10,15,20 (A) and 30..50 (T).
my @A = map { my $row = $_; [ map { 8*($row + $_) } 0 .. 4 ] } (0, 5, 10, 15, 20);
my @D = map { 8 * $_ } (25 .. 29);
my @T = map { my $row = $_; [ map { 8*($row + $_) } 0 .. 4 ] } (30, 35, 40, 45, 50);
112
# ---------------------------------------------------------------------
# Module preamble: arch header, Thumb-2/ARM mode selection, the iota
# round-constant table, and the entry of the internal KeccakF1600_int
# routine (state expected on the stack per the layout above; lr is
# saved at sp+440 and the round counter lives at sp+444).
# NOTE(review): iotas32 holds the 24 round constants split into
# bit-interleaved 32-bit halves (even bits / odd bits, cf. the file
# header's "bit interleaving" note); the ordering is tied to the
# two-rounds-per-iteration schedule emitted below — confirm against
# FIPS 202 before modifying any entry.
# ---------------------------------------------------------------------
113 $code.=<<___;
114 #include "arm_arch.h"
115 
116 .text
117 
118 #if defined(__thumb2__)
119 .syntax unified
120 .thumb
121 #else
122 .code 32
123 #endif
124 
125 .type iotas32, %object
126 .align 5
127 iotas32:
128 .long 0x00000001, 0x00000000
129 .long 0x00000000, 0x00000089
130 .long 0x00000000, 0x8000008b
131 .long 0x00000000, 0x80008080
132 .long 0x00000001, 0x0000008b
133 .long 0x00000001, 0x00008000
134 .long 0x00000001, 0x80008088
135 .long 0x00000001, 0x80000082
136 .long 0x00000000, 0x0000000b
137 .long 0x00000000, 0x0000000a
138 .long 0x00000001, 0x00008082
139 .long 0x00000000, 0x00008003
140 .long 0x00000001, 0x0000808b
141 .long 0x00000001, 0x8000000b
142 .long 0x00000001, 0x8000008a
143 .long 0x00000001, 0x80000081
144 .long 0x00000000, 0x80000081
145 .long 0x00000000, 0x80000008
146 .long 0x00000000, 0x00000083
147 .long 0x00000000, 0x80008003
148 .long 0x00000001, 0x80008088
149 .long 0x00000000, 0x80000088
150 .long 0x00000001, 0x00008000
151 .long 0x00000000, 0x80008082
152 .size iotas32,.-iotas32
153 
154 .type KeccakF1600_int, %function
155 .align 5
156 KeccakF1600_int:
157 add @C[9],sp,#$A[4][2]
158 add @E[2],sp,#$A[0][0]
159 add @E[0],sp,#$A[1][0]
160 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
161 KeccakF1600_enter:
162 str lr,[sp,#440]
163 eor @E[1],@E[1],@E[1]
164 str @E[1],[sp,#444]
165 b .Lround2x
166 
167 .align 4
168 .Lround2x:
169 ___
# ---------------------------------------------------------------------
# Round(@src_offsets, @dst_offsets) — emit assembly for one Keccak
# round, reading the 25 lanes at the sp-relative @src offsets and
# writing the theta/rho/pi/chi/iota result to the @dst offsets
# (KECCAK_2X style: the caller ping-pongs between the A and T copies).
# Each 64-bit lane is stored as two bit-interleaved 32-bit halves, so a
# 64-bit rotation by n becomes 32-bit rotations by floor(n/2) and
# ceil(n/2) (e.g. the #32-21 / #32-22 pairs below).  Commented-out
# "@ ror" lines mark rotations merged into a later instruction's
# shifted operand — trace the destination register to find where (see
# the August 2017 note in the file header).
# ---------------------------------------------------------------------
170 sub Round {
171 my (@A,@R); (@A[0..4],@R) = @_;
172 
    # Theta, step 1: XOR all five rows together, accumulating the column
    # parities across @C[0..9]; Thumb-2 uses ldrd pairs, classic ARM
    # uses ldmia bursts for the same loads.
173 $code.=<<___;
174 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
175 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
176 #ifdef __thumb2__
177 eor @C[0],@C[0],@E[0]
178 eor @C[1],@C[1],@E[1]
179 eor @C[2],@C[2],@E[2]
180 ldrd @E[0],@E[1],[sp,#$A[1][2]]
181 eor @C[3],@C[3],@E[3]
182 ldrd @E[2],@E[3],[sp,#$A[1][3]]
183 eor @C[4],@C[4],@E[0]
184 eor @C[5],@C[5],@E[1]
185 eor @C[6],@C[6],@E[2]
186 ldrd @E[0],@E[1],[sp,#$A[1][4]]
187 eor @C[7],@C[7],@E[3]
188 ldrd @E[2],@E[3],[sp,#$A[2][0]]
189 eor @C[8],@C[8],@E[0]
190 eor @C[9],@C[9],@E[1]
191 eor @C[0],@C[0],@E[2]
192 ldrd @E[0],@E[1],[sp,#$A[2][1]]
193 eor @C[1],@C[1],@E[3]
194 ldrd @E[2],@E[3],[sp,#$A[2][2]]
195 eor @C[2],@C[2],@E[0]
196 eor @C[3],@C[3],@E[1]
197 eor @C[4],@C[4],@E[2]
198 ldrd @E[0],@E[1],[sp,#$A[2][3]]
199 eor @C[5],@C[5],@E[3]
200 ldrd @E[2],@E[3],[sp,#$A[2][4]]
201 eor @C[6],@C[6],@E[0]
202 eor @C[7],@C[7],@E[1]
203 eor @C[8],@C[8],@E[2]
204 ldrd @E[0],@E[1],[sp,#$A[3][0]]
205 eor @C[9],@C[9],@E[3]
206 ldrd @E[2],@E[3],[sp,#$A[3][1]]
207 eor @C[0],@C[0],@E[0]
208 eor @C[1],@C[1],@E[1]
209 eor @C[2],@C[2],@E[2]
210 ldrd @E[0],@E[1],[sp,#$A[3][2]]
211 eor @C[3],@C[3],@E[3]
212 ldrd @E[2],@E[3],[sp,#$A[3][3]]
213 eor @C[4],@C[4],@E[0]
214 eor @C[5],@C[5],@E[1]
215 eor @C[6],@C[6],@E[2]
216 ldrd @E[0],@E[1],[sp,#$A[3][4]]
217 eor @C[7],@C[7],@E[3]
218 ldrd @E[2],@E[3],[sp,#$A[4][0]]
219 eor @C[8],@C[8],@E[0]
220 eor @C[9],@C[9],@E[1]
221 eor @C[0],@C[0],@E[2]
222 ldrd @E[0],@E[1],[sp,#$A[4][1]]
223 eor @C[1],@C[1],@E[3]
224 ldrd @E[2],@E[3],[sp,#$A[0][2]]
225 eor @C[2],@C[2],@E[0]
226 eor @C[3],@C[3],@E[1]
227 eor @C[4],@C[4],@E[2]
228 ldrd @E[0],@E[1],[sp,#$A[0][3]]
229 eor @C[5],@C[5],@E[3]
230 ldrd @E[2],@E[3],[sp,#$A[0][4]]
231 #else
232 eor @C[0],@C[0],@E[0]
233 add @E[0],sp,#$A[1][2]
234 eor @C[1],@C[1],@E[1]
235 eor @C[2],@C[2],@E[2]
236 eor @C[3],@C[3],@E[3]
237 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
238 eor @C[4],@C[4],@E[0]
239 add @E[0],sp,#$A[1][4]
240 eor @C[5],@C[5],@E[1]
241 eor @C[6],@C[6],@E[2]
242 eor @C[7],@C[7],@E[3]
243 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
244 eor @C[8],@C[8],@E[0]
245 add @E[0],sp,#$A[2][1]
246 eor @C[9],@C[9],@E[1]
247 eor @C[0],@C[0],@E[2]
248 eor @C[1],@C[1],@E[3]
249 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
250 eor @C[2],@C[2],@E[0]
251 add @E[0],sp,#$A[2][3]
252 eor @C[3],@C[3],@E[1]
253 eor @C[4],@C[4],@E[2]
254 eor @C[5],@C[5],@E[3]
255 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
256 eor @C[6],@C[6],@E[0]
257 add @E[0],sp,#$A[3][0]
258 eor @C[7],@C[7],@E[1]
259 eor @C[8],@C[8],@E[2]
260 eor @C[9],@C[9],@E[3]
261 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
262 eor @C[0],@C[0],@E[0]
263 add @E[0],sp,#$A[3][2]
264 eor @C[1],@C[1],@E[1]
265 eor @C[2],@C[2],@E[2]
266 eor @C[3],@C[3],@E[3]
267 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
268 eor @C[4],@C[4],@E[0]
269 add @E[0],sp,#$A[3][4]
270 eor @C[5],@C[5],@E[1]
271 eor @C[6],@C[6],@E[2]
272 eor @C[7],@C[7],@E[3]
273 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
274 eor @C[8],@C[8],@E[0]
275 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
276 eor @C[9],@C[9],@E[1]
277 ldr @E[1],[sp,#$A[4][1]+4]
278 eor @C[0],@C[0],@E[2]
279 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
280 eor @C[1],@C[1],@E[3]
281 ldr @E[3],[sp,#$A[0][2]+4]
282 eor @C[2],@C[2],@E[0]
283 add @E[0],sp,#$A[0][3]
284 eor @C[3],@C[3],@E[1]
285 eor @C[4],@C[4],@E[2]
286 eor @C[5],@C[5],@E[3]
287 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
288 #endif
289 eor @C[6],@C[6],@E[0]
290 eor @C[7],@C[7],@E[1]
291 eor @C[8],@C[8],@E[2]
292 eor @C[9],@C[9],@E[3]
293 
294 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
295 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
296 eor @E[1],@C[1],@C[4]
297 str.h @E[1],[sp,#$D[1]+4]
298 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
299 eor @E[3],@C[7],@C[0]
300 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
301 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
302 str.h @E[3],[sp,#$D[4]+4]
303 eor @C[1],@C[9],@C[2]
304 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
305 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
306 ldr.l @C[7],[sp,#$A[3][3]]
307 eor @C[3],@C[3],@C[6]
308 str.h @C[1],[sp,#$D[0]+4]
309 ldr.h @C[6],[sp,#$A[3][3]+4]
310 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
311 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
312 str.h @C[3],[sp,#$D[2]+4]
313 eor @C[5],@C[5],@C[8]
314 
315 ldr.l @C[8],[sp,#$A[4][4]]
316 ldr.h @C[9],[sp,#$A[4][4]+4]
317 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
318 eor @C[7],@C[7],@C[4]
319 str.h @C[5],[sp,#$D[3]+4]
320 eor @C[6],@C[6],@C[5]
321 ldr.l @C[4],[sp,#$A[0][0]]
322 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
323 @ ror @C[6],@C[6],#32-11
324 ldr.h @C[5],[sp,#$A[0][0]+4]
325 eor @C[8],@C[8],@E[2]
326 eor @C[9],@C[9],@E[3]
327 ldr.l @E[2],[sp,#$A[2][2]]
328 eor @C[0],@C[0],@C[4]
329 ldr.h @E[3],[sp,#$A[2][2]+4]
330 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
331 @ ror @C[9],@C[9],#32-7
332 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
333 eor @E[2],@E[2],@C[2]
334 ldr.l @C[2],[sp,#$A[1][1]]
335 eor @E[3],@E[3],@C[3]
336 ldr.h @C[3],[sp,#$A[1][1]+4]
337 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
338 ldr @E[2],[sp,#444] @ load counter
339 eor @C[2],@C[2],@E[0]
340 adr @E[0],iotas32
341 ror @C[4],@E[3],#32-22
342 add @E[3],@E[0],@E[2]
343 eor @C[3],@C[3],@E[1]
344 ___
# Iota: in the first of the two unrolled rounds (destination offsets
# differ from the source's, i.e. Round(@A,@T)) the current constant
# pair is loaded directly...
345 $code.=<<___ if ($A[0][0] != $T[0][0]);
346 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
347 ___
# ...while the second round (T->A) loads the *next* pair (+8) and
# advances/tests the loop counter: steps of 16 bytes (two 8-byte
# constants) up to 192, i.e. 12 iterations x 2 rounds = 24 rounds.
348 $code.=<<___ if ($A[0][0] == $T[0][0]);
349 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
350 add @E[2],@E[2],#16
351 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
352 cmp @E[2],#192
353 str @E[2],[sp,#444] @ store counter
354 ___
# Chi (with the remaining rho/pi rotations folded into bic/eor shifted
# operands), emitted row by row: R[i][j] = C[j] ^ (~C[j+1] & C[j+2]).
355 $code.=<<___;
356 bic @E[2],@C[4],@C[2],ror#32-22
357 bic @E[3],@C[5],@C[3],ror#32-22
358 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
359 ror @C[3],@C[3],#32-22
360 eor @E[2],@E[2],@C[0]
361 eor @E[3],@E[3],@C[1]
362 eor @E[0],@E[0],@E[2]
363 eor @E[1],@E[1],@E[3]
364 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
365 bic @E[2],@C[6],@C[4],ror#11
366 str.h @E[1],[sp,#$R[0][0]+4]
367 bic @E[3],@C[7],@C[5],ror#10
368 bic @E[0],@C[8],@C[6],ror#32-(11-7)
369 bic @E[1],@C[9],@C[7],ror#32-(10-7)
370 eor @E[2],@C[2],@E[2],ror#32-11
371 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
372 eor @E[3],@C[3],@E[3],ror#32-10
373 str.h @E[3],[sp,#$R[0][1]+4]
374 eor @E[0],@C[4],@E[0],ror#32-7
375 eor @E[1],@C[5],@E[1],ror#32-7
376 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
377 bic @E[2],@C[0],@C[8],ror#32-7
378 str.h @E[1],[sp,#$R[0][2]+4]
379 bic @E[3],@C[1],@C[9],ror#32-7
380 eor @E[2],@E[2],@C[6],ror#32-11
381 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
382 eor @E[3],@E[3],@C[7],ror#32-10
383 str.h @E[3],[sp,#$R[0][3]+4]
384 bic @E[0],@C[2],@C[0]
385 add @E[3],sp,#$D[3]
386 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
387 bic @E[1],@C[3],@C[1]
388 ldr.h @C[1],[sp,#$A[0][3]+4]
389 eor @E[0],@E[0],@C[8],ror#32-7
390 eor @E[1],@E[1],@C[9],ror#32-7
391 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
392 add @C[9],sp,#$D[0]
393 str.h @E[1],[sp,#$R[0][4]+4]
394 
395 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
396 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
397 
398 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
399 eor @C[0],@C[0],@E[0]
400 ldr.h @C[3],[sp,#$A[1][4]+4]
401 eor @C[1],@C[1],@E[1]
402 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
403 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
404 @ ror @C[1],@C[1],#32-14
405 ldr.h @E[1],[sp,#$A[3][1]+4]
406 
407 eor @C[2],@C[2],@E[2]
408 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
409 eor @C[3],@C[3],@E[3]
410 ldr.h @C[5],[sp,#$A[2][0]+4]
411 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
412 @ ror @C[3],@C[3],#32-10
413 
414 eor @C[6],@C[6],@C[4]
415 ldr.l @E[2],[sp,#$D[2]] @ D[2]
416 eor @C[7],@C[7],@C[5]
417 ldr.h @E[3],[sp,#$D[2]+4]
418 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
419 ror @C[4],@C[7],#32-2
420 
421 eor @E[0],@E[0],@C[8]
422 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
423 eor @E[1],@E[1],@C[9]
424 ldr.h @C[9],[sp,#$A[4][2]+4]
425 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
426 ror @C[6],@E[1],#32-23
427 
428 bic @E[0],@C[4],@C[2],ror#32-10
429 bic @E[1],@C[5],@C[3],ror#32-10
430 eor @E[2],@E[2],@C[8]
431 eor @E[3],@E[3],@C[9]
432 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
433 ror @C[8],@E[3],#32-31
434 eor @E[0],@E[0],@C[0],ror#32-14
435 eor @E[1],@E[1],@C[1],ror#32-14
436 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
437 bic @E[2],@C[6],@C[4]
438 str.h @E[1],[sp,#$R[1][0]+4]
439 bic @E[3],@C[7],@C[5]
440 eor @E[2],@E[2],@C[2],ror#32-10
441 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
442 eor @E[3],@E[3],@C[3],ror#32-10
443 str.h @E[3],[sp,#$R[1][1]+4]
444 bic @E[0],@C[8],@C[6]
445 bic @E[1],@C[9],@C[7]
446 bic @E[2],@C[0],@C[8],ror#14
447 bic @E[3],@C[1],@C[9],ror#14
448 eor @E[0],@E[0],@C[4]
449 eor @E[1],@E[1],@C[5]
450 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
451 bic @C[2],@C[2],@C[0],ror#32-(14-10)
452 str.h @E[1],[sp,#$R[1][2]+4]
453 eor @E[2],@C[6],@E[2],ror#32-14
454 bic @E[1],@C[3],@C[1],ror#32-(14-10)
455 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
456 eor @E[3],@C[7],@E[3],ror#32-14
457 str.h @E[3],[sp,#$R[1][3]+4]
458 add @E[2],sp,#$D[1]
459 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
460 eor @E[0],@C[8],@C[2],ror#32-10
461 ldr.h @C[0],[sp,#$A[0][1]+4]
462 eor @E[1],@C[9],@E[1],ror#32-10
463 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
464 str.h @E[1],[sp,#$R[1][4]+4]
465 
466 add @C[9],sp,#$D[3]
467 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
468 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
469 ldr.h @C[3],[sp,#$A[1][2]+4]
470 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
471 
472 eor @C[1],@C[1],@E[0]
473 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
474 eor @C[0],@C[0],@E[1]
475 ldr.h @C[5],[sp,#$A[2][3]+4]
476 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
477 
478 eor @C[2],@C[2],@E[2]
479 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
480 eor @C[3],@C[3],@E[3]
481 ldr.h @E[1],[sp,#$A[3][4]+4]
482 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
483 ldr.l @E[2],[sp,#$D[0]] @ D[0]
484 @ ror @C[3],@C[3],#32-3
485 ldr.h @E[3],[sp,#$D[0]+4]
486 
487 eor @C[4],@C[4],@C[6]
488 eor @C[5],@C[5],@C[7]
489 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
490 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
491 
492 eor @E[0],@E[0],@C[8]
493 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
494 eor @E[1],@E[1],@C[9]
495 ldr.h @C[9],[sp,#$A[4][0]+4]
496 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
497 ror @C[7],@E[1],#32-4
498 
499 eor @E[2],@E[2],@C[8]
500 eor @E[3],@E[3],@C[9]
501 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
502 ror @C[9],@E[3],#32-9
503 
504 bic @E[0],@C[5],@C[2],ror#13-3
505 bic @E[1],@C[4],@C[3],ror#12-3
506 bic @E[2],@C[6],@C[5],ror#32-13
507 bic @E[3],@C[7],@C[4],ror#32-12
508 eor @E[0],@C[0],@E[0],ror#32-13
509 eor @E[1],@C[1],@E[1],ror#32-12
510 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
511 eor @E[2],@E[2],@C[2],ror#32-3
512 str.h @E[1],[sp,#$R[2][0]+4]
513 eor @E[3],@E[3],@C[3],ror#32-3
514 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
515 bic @E[0],@C[8],@C[6]
516 bic @E[1],@C[9],@C[7]
517 str.h @E[3],[sp,#$R[2][1]+4]
518 eor @E[0],@E[0],@C[5],ror#32-13
519 eor @E[1],@E[1],@C[4],ror#32-12
520 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
521 bic @E[2],@C[0],@C[8]
522 str.h @E[1],[sp,#$R[2][2]+4]
523 bic @E[3],@C[1],@C[9]
524 eor @E[2],@E[2],@C[6]
525 eor @E[3],@E[3],@C[7]
526 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
527 bic @E[0],@C[2],@C[0],ror#3
528 str.h @E[3],[sp,#$R[2][3]+4]
529 bic @E[1],@C[3],@C[1],ror#3
530 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
531 eor @E[0],@C[8],@E[0],ror#32-3
532 ldr.h @C[0],[sp,#$A[0][4]+4]
533 eor @E[1],@C[9],@E[1],ror#32-3
534 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
535 add @C[9],sp,#$D[1]
536 str.h @E[1],[sp,#$R[2][4]+4]
537 
538 ldr.l @E[0],[sp,#$D[4]] @ D[4]
539 ldr.h @E[1],[sp,#$D[4]+4]
540 ldr.l @E[2],[sp,#$D[0]] @ D[0]
541 ldr.h @E[3],[sp,#$D[0]+4]
542 
543 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
544 
545 eor @C[1],@C[1],@E[0]
546 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
547 eor @C[0],@C[0],@E[1]
548 ldr.h @C[3],[sp,#$A[1][0]+4]
549 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
550 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
551 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
552 ldr.h @C[5],[sp,#$A[2][1]+4]
553 
554 eor @C[2],@C[2],@E[2]
555 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
556 eor @C[3],@C[3],@E[3]
557 ldr.h @E[1],[sp,#$A[3][2]+4]
558 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
559 ldr.l @E[2],[sp,#$D[3]] @ D[3]
560 @ ror @C[3],@C[3],#32-18
561 ldr.h @E[3],[sp,#$D[3]+4]
562 
563 eor @C[6],@C[6],@C[4]
564 eor @C[7],@C[7],@C[5]
565 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
566 ror @C[5],@C[7],#32-5
567 
568 eor @E[0],@E[0],@C[8]
569 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
570 eor @E[1],@E[1],@C[9]
571 ldr.h @C[9],[sp,#$A[4][3]+4]
572 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
573 ror @C[6],@E[1],#32-8
574 
575 eor @E[2],@E[2],@C[8]
576 eor @E[3],@E[3],@C[9]
577 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
578 ror @C[9],@E[3],#32-28
579 
580 bic @E[0],@C[4],@C[2],ror#32-18
581 bic @E[1],@C[5],@C[3],ror#32-18
582 eor @E[0],@E[0],@C[0],ror#32-14
583 eor @E[1],@E[1],@C[1],ror#32-13
584 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
585 bic @E[2],@C[6],@C[4]
586 str.h @E[1],[sp,#$R[3][0]+4]
587 bic @E[3],@C[7],@C[5]
588 eor @E[2],@E[2],@C[2],ror#32-18
589 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
590 eor @E[3],@E[3],@C[3],ror#32-18
591 str.h @E[3],[sp,#$R[3][1]+4]
592 bic @E[0],@C[8],@C[6]
593 bic @E[1],@C[9],@C[7]
594 bic @E[2],@C[0],@C[8],ror#14
595 bic @E[3],@C[1],@C[9],ror#13
596 eor @E[0],@E[0],@C[4]
597 eor @E[1],@E[1],@C[5]
598 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
599 bic @C[2],@C[2],@C[0],ror#18-14
600 str.h @E[1],[sp,#$R[3][2]+4]
601 eor @E[2],@C[6],@E[2],ror#32-14
602 bic @E[1],@C[3],@C[1],ror#18-13
603 eor @E[3],@C[7],@E[3],ror#32-13
604 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
605 str.h @E[3],[sp,#$R[3][3]+4]
606 add @E[3],sp,#$D[2]
607 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
608 eor @E[0],@C[8],@C[2],ror#32-18
609 ldr.h @C[1],[sp,#$A[0][2]+4]
610 eor @E[1],@C[9],@E[1],ror#32-18
611 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
612 str.h @E[1],[sp,#$R[3][4]+4]
613 
614 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
615 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
616 ldr.h @C[3],[sp,#$A[1][3]+4]
617 ldr.l @C[6],[sp,#$D[4]] @ D[4]
618 ldr.h @C[7],[sp,#$D[4]+4]
619 
620 eor @C[0],@C[0],@E[0]
621 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
622 eor @C[1],@C[1],@E[1]
623 ldr.h @C[5],[sp,#$A[2][4]+4]
624 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
625 ldr.l @C[8],[sp,#$D[0]] @ D[0]
626 @ ror @C[1],@C[1],#32-31
627 ldr.h @C[9],[sp,#$D[0]+4]
628 
629 eor @E[2],@E[2],@C[2]
630 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
631 eor @E[3],@E[3],@C[3]
632 ldr.h @E[1],[sp,#$A[3][0]+4]
633 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
634 ldr.l @E[2],[sp,#$D[1]] @ D[1]
635 ror @C[2],@E[3],#32-28
636 ldr.h @E[3],[sp,#$D[1]+4]
637 
638 eor @C[6],@C[6],@C[4]
639 eor @C[7],@C[7],@C[5]
640 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
641 ror @C[4],@C[7],#32-20
642 
643 eor @E[0],@E[0],@C[8]
644 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
645 eor @E[1],@E[1],@C[9]
646 ldr.h @C[9],[sp,#$A[4][1]+4]
647 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
648 ror @C[6],@E[1],#32-21
649 
650 eor @C[8],@C[8],@E[2]
651 eor @C[9],@C[9],@E[3]
652 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
653 @ ror @C[9],@C[3],#32-1
654 
655 bic @E[0],@C[4],@C[2]
656 bic @E[1],@C[5],@C[3]
657 eor @E[0],@E[0],@C[0],ror#32-31
658 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
659 eor @E[1],@E[1],@C[1],ror#32-31
660 str.h @E[1],[sp,#$R[4][0]+4]
661 bic @E[2],@C[6],@C[4]
662 bic @E[3],@C[7],@C[5]
663 eor @E[2],@E[2],@C[2]
664 eor @E[3],@E[3],@C[3]
665 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
666 bic @E[0],@C[8],@C[6],ror#1
667 str.h @E[3],[sp,#$R[4][1]+4]
668 bic @E[1],@C[9],@C[7],ror#1
669 bic @E[2],@C[0],@C[8],ror#31-1
670 bic @E[3],@C[1],@C[9],ror#31-1
671 eor @C[4],@C[4],@E[0],ror#32-1
672 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
673 eor @C[5],@C[5],@E[1],ror#32-1
674 str.h @C[5],[sp,#$R[4][2]+4]
675 eor @C[6],@C[6],@E[2],ror#32-31
676 eor @C[7],@C[7],@E[3],ror#32-31
677 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
678 bic @E[0],@C[2],@C[0],ror#32-31
679 str.h @C[7],[sp,#$R[4][3]+4]
680 bic @E[1],@C[3],@C[1],ror#32-31
681 add @E[2],sp,#$R[0][0]
682 eor @C[8],@E[0],@C[8],ror#32-1
683 add @E[0],sp,#$R[1][0]
684 eor @C[9],@E[1],@C[9],ror#32-1
685 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
686 str.h @C[9],[sp,#$R[4][4]+4]
687 ___
688 }
# Emit the two unrolled rounds per loop iteration: A -> T, then T -> A
# (the second Round emits the counter update and the flags for blo).
689 Round(@A,@T);
690 Round(@T,@A);
# Tail of KeccakF1600_int (returns via lr saved at sp+440), followed by
# the public KeccakF1600 wrapper: it copies the caller's 200-byte state
# to the stack frame, runs the permutation, and copies the result back.
691 $code.=<<___;
692 blo .Lround2x
693 
694 ldr pc,[sp,#440]
695 .size KeccakF1600_int,.-KeccakF1600_int
696 
697 .type KeccakF1600, %function
698 .align 5
699 KeccakF1600:
700 stmdb sp!,{r0,r4-r11,lr}
701 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
702 
703 add @E[0],r0,#$A[1][0]
704 add @E[1],sp,#$A[1][0]
705 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
706 stmia sp, {@C[0]-@C[9]}
707 ldmia @E[0]!,{@C[0]-@C[9]}
708 stmia @E[1]!,{@C[0]-@C[9]}
709 ldmia @E[0]!,{@C[0]-@C[9]}
710 stmia @E[1]!,{@C[0]-@C[9]}
711 ldmia @E[0]!,{@C[0]-@C[9]}
712 stmia @E[1]!,{@C[0]-@C[9]}
713 ldmia @E[0], {@C[0]-@C[9]}
714 add @E[2],sp,#$A[0][0]
715 add @E[0],sp,#$A[1][0]
716 stmia @E[1], {@C[0]-@C[9]}
717 
718 bl KeccakF1600_enter
719 
720 ldr @E[1], [sp,#440+16] @ restore pointer to A
721 ldmia sp, {@C[0]-@C[9]}
722 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
723 ldmia @E[0]!,{@C[0]-@C[9]}
724 stmia @E[1]!,{@C[0]-@C[9]}
725 ldmia @E[0]!,{@C[0]-@C[9]}
726 stmia @E[1]!,{@C[0]-@C[9]}
727 ldmia @E[0]!,{@C[0]-@C[9]}
728 stmia @E[1]!,{@C[0]-@C[9]}
729 ldmia @E[0], {@C[0]-@C[9]}
730 stmia @E[1], {@C[0]-@C[9]}
731 
732 add sp,sp,#440+20
733 ldmia sp!,{r4-r11,pc}
734 .size KeccakF1600,.-KeccakF1600
735 ___
# ---------------------------------------------------------------------
# SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
#             size_t bsz): absorb full bsz-byte blocks from inp into the
# state, bit-interleaving each 8-byte lane on input and running the
# permutation after every block; returns the count of leftover bytes
# (< bsz).  The bit-reversal masks 0x55..., 0x33..., 0x0f0f...,
# 0x00ff... are parked on the stack so they survive KeccakF1600_int.
# ---------------------------------------------------------------------
736 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
737 
738 ########################################################################
739 # Stack layout
740 # ----->+-----------------------+
741 # | uint64_t A[5][5] |
742 # | ... |
743 # | ... |
744 # +456->+-----------------------+
745 # | 0x55555555 |
746 # +460->+-----------------------+
747 # | 0x33333333 |
748 # +464->+-----------------------+
749 # | 0x0f0f0f0f |
750 # +468->+-----------------------+
751 # | 0x00ff00ff |
752 # +472->+-----------------------+
753 # | uint64_t *A |
754 # +476->+-----------------------+
755 # | const void *inp |
756 # +480->+-----------------------+
757 # | size_t len |
758 # +484->+-----------------------+
759 # | size_t bs |
760 # +488->+-----------------------+
761 # | ....
762 
763 $code.=<<___;
764 .global SHA3_absorb
765 .type SHA3_absorb,%function
766 .align 5
767 SHA3_absorb:
768 stmdb sp!,{r0-r12,lr}
769 sub sp,sp,#456+16
770 
771 add $A_flat,r0,#$A[1][0]
772 @ mov $inp,r1
773 mov $len,r2
774 mov $bsz,r3
775 cmp r2,r3
776 blo .Labsorb_abort
777 
778 add $inp,sp,#0
779 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
780 stmia $inp!, {@C[0]-@C[9]}
781 ldmia $A_flat!,{@C[0]-@C[9]}
782 stmia $inp!, {@C[0]-@C[9]}
783 ldmia $A_flat!,{@C[0]-@C[9]}
784 stmia $inp!, {@C[0]-@C[9]}
785 ldmia $A_flat!,{@C[0]-@C[9]}
786 stmia $inp!, {@C[0]-@C[9]}
787 ldmia $A_flat!,{@C[0]-@C[9]}
788 stmia $inp, {@C[0]-@C[9]}
789 
790 ldr $inp,[sp,#476] @ restore $inp
791 #ifdef __thumb2__
792 mov r9,#0x00ff00ff
793 mov r8,#0x0f0f0f0f
794 mov r7,#0x33333333
795 mov r6,#0x55555555
796 #else
797 mov r6,#0x11 @ compose constants
798 mov r8,#0x0f
799 mov r9,#0xff
800 orr r6,r6,r6,lsl#8
801 orr r8,r8,r8,lsl#8
802 orr r6,r6,r6,lsl#16 @ 0x11111111
803 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
804 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
805 orr r7,r6,r6,lsl#1 @ 0x33333333
806 orr r6,r6,r6,lsl#2 @ 0x55555555
807 #endif
808 str r9,[sp,#468]
809 str r8,[sp,#464]
810 str r7,[sp,#460]
811 str r6,[sp,#456]
812 b .Loop_absorb
813 
814 .align 4
815 .Loop_absorb:
816 subs r0,$len,$bsz
817 blo .Labsorbed
818 add $A_flat,sp,#0
819 str r0,[sp,#480] @ save len - bsz
820 
821 .align 4
822 .Loop_block:
823 ldrb r0,[$inp],#1
824 ldrb r1,[$inp],#1
825 ldrb r2,[$inp],#1
826 ldrb r3,[$inp],#1
827 ldrb r4,[$inp],#1
828 orr r0,r0,r1,lsl#8
829 ldrb r1,[$inp],#1
830 orr r0,r0,r2,lsl#16
831 ldrb r2,[$inp],#1
832 orr r0,r0,r3,lsl#24 @ lo
833 ldrb r3,[$inp],#1
834 orr r1,r4,r1,lsl#8
835 orr r1,r1,r2,lsl#16
836 orr r1,r1,r3,lsl#24 @ hi
837 
838 and r2,r0,r6 @ &=0x55555555
839 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
840 and r3,r1,r6 @ &=0x55555555
841 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
842 orr r2,r2,r2,lsr#1
843 orr r0,r0,r0,lsl#1
844 orr r3,r3,r3,lsr#1
845 orr r1,r1,r1,lsl#1
846 and r2,r2,r7 @ &=0x33333333
847 and r0,r0,r7,lsl#2 @ &=0xcccccccc
848 and r3,r3,r7 @ &=0x33333333
849 and r1,r1,r7,lsl#2 @ &=0xcccccccc
850 orr r2,r2,r2,lsr#2
851 orr r0,r0,r0,lsl#2
852 orr r3,r3,r3,lsr#2
853 orr r1,r1,r1,lsl#2
854 and r2,r2,r8 @ &=0x0f0f0f0f
855 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
856 and r3,r3,r8 @ &=0x0f0f0f0f
857 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
858 ldmia $A_flat,{r4-r5} @ A_flat[i]
859 orr r2,r2,r2,lsr#4
860 orr r0,r0,r0,lsl#4
861 orr r3,r3,r3,lsr#4
862 orr r1,r1,r1,lsl#4
863 and r2,r2,r9 @ &=0x00ff00ff
864 and r0,r0,r9,lsl#8 @ &=0xff00ff00
865 and r3,r3,r9 @ &=0x00ff00ff
866 and r1,r1,r9,lsl#8 @ &=0xff00ff00
867 orr r2,r2,r2,lsr#8
868 orr r0,r0,r0,lsl#8
869 orr r3,r3,r3,lsr#8
870 orr r1,r1,r1,lsl#8
871 
872 lsl r2,r2,#16
873 lsr r1,r1,#16
874 eor r4,r4,r3,lsl#16
875 eor r5,r5,r0,lsr#16
876 eor r4,r4,r2,lsr#16
877 eor r5,r5,r1,lsl#16
878 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
879 
880 subs $bsz,$bsz,#8
881 bhi .Loop_block
882 
883 str $inp,[sp,#476]
884 
885 bl KeccakF1600_int
886 
887 add r14,sp,#456
888 ldmia r14,{r6-r12,r14} @ restore constants and variables
889 b .Loop_absorb
890 
891 .align 4
892 .Labsorbed:
893 add $inp,sp,#$A[1][0]
894 ldmia sp, {@C[0]-@C[9]}
895 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
896 ldmia $inp!, {@C[0]-@C[9]}
897 stmia $A_flat!,{@C[0]-@C[9]}
898 ldmia $inp!, {@C[0]-@C[9]}
899 stmia $A_flat!,{@C[0]-@C[9]}
900 ldmia $inp!, {@C[0]-@C[9]}
901 stmia $A_flat!,{@C[0]-@C[9]}
902 ldmia $inp, {@C[0]-@C[9]}
903 stmia $A_flat, {@C[0]-@C[9]}
904 
905 .Labsorb_abort:
906 add sp,sp,#456+32
907 mov r0,$len @ return value
908 ldmia sp!,{r4-r12,pc}
909 .size SHA3_absorb,.-SHA3_absorb
910 ___
911 }
# ---------------------------------------------------------------------
# SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len,
#              size_t bsz): squeeze len output bytes, de-interleaving
# each 64-bit lane back to byte order and re-running KeccakF1600
# whenever bsz bytes of the current state have been consumed.  The
# de-interleave masks are pushed on the stack; .Lsqueeze_tail emits the
# final partial 8-byte lane one byte at a time.
# ---------------------------------------------------------------------
912 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
913 
914 $code.=<<___;
915 .global SHA3_squeeze
916 .type SHA3_squeeze,%function
917 .align 5
918 SHA3_squeeze:
919 stmdb sp!,{r0,r3-r10,lr}
920 
921 mov $A_flat,r0
922 mov $out,r1
923 mov $len,r2
924 mov $bsz,r3
925 
926 #ifdef __thumb2__
927 mov r9,#0x00ff00ff
928 mov r8,#0x0f0f0f0f
929 mov r7,#0x33333333
930 mov r6,#0x55555555
931 #else
932 mov r6,#0x11 @ compose constants
933 mov r8,#0x0f
934 mov r9,#0xff
935 orr r6,r6,r6,lsl#8
936 orr r8,r8,r8,lsl#8
937 orr r6,r6,r6,lsl#16 @ 0x11111111
938 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
939 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
940 orr r7,r6,r6,lsl#1 @ 0x33333333
941 orr r6,r6,r6,lsl#2 @ 0x55555555
942 #endif
943 stmdb sp!,{r6-r9}
944 
945 mov r14,$A_flat
946 b .Loop_squeeze
947 
948 .align 4
949 .Loop_squeeze:
950 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
951 
952 lsl r2,r0,#16
953 lsl r3,r1,#16 @ r3 = r1 << 16
954 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
955 lsr r1,r1,#16
956 lsr r0,r0,#16 @ r0 = r0 >> 16
957 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
958 
959 orr r2,r2,r2,lsl#8
960 orr r3,r3,r3,lsr#8
961 orr r0,r0,r0,lsl#8
962 orr r1,r1,r1,lsr#8
963 and r2,r2,r9 @ &=0x00ff00ff
964 and r3,r3,r9,lsl#8 @ &=0xff00ff00
965 and r0,r0,r9 @ &=0x00ff00ff
966 and r1,r1,r9,lsl#8 @ &=0xff00ff00
967 orr r2,r2,r2,lsl#4
968 orr r3,r3,r3,lsr#4
969 orr r0,r0,r0,lsl#4
970 orr r1,r1,r1,lsr#4
971 and r2,r2,r8 @ &=0x0f0f0f0f
972 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
973 and r0,r0,r8 @ &=0x0f0f0f0f
974 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
975 orr r2,r2,r2,lsl#2
976 orr r3,r3,r3,lsr#2
977 orr r0,r0,r0,lsl#2
978 orr r1,r1,r1,lsr#2
979 and r2,r2,r7 @ &=0x33333333
980 and r3,r3,r7,lsl#2 @ &=0xcccccccc
981 and r0,r0,r7 @ &=0x33333333
982 and r1,r1,r7,lsl#2 @ &=0xcccccccc
983 orr r2,r2,r2,lsl#1
984 orr r3,r3,r3,lsr#1
985 orr r0,r0,r0,lsl#1
986 orr r1,r1,r1,lsr#1
987 and r2,r2,r6 @ &=0x55555555
988 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
989 and r0,r0,r6 @ &=0x55555555
990 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
991 
992 orr r2,r2,r3
993 orr r0,r0,r1
994 
995 cmp $len,#8
996 blo .Lsqueeze_tail
997 lsr r1,r2,#8
998 strb r2,[$out],#1
999 lsr r3,r2,#16
1000 strb r1,[$out],#1
1001 lsr r2,r2,#24
1002 strb r3,[$out],#1
1003 strb r2,[$out],#1
1004 
1005 lsr r1,r0,#8
1006 strb r0,[$out],#1
1007 lsr r3,r0,#16
1008 strb r1,[$out],#1
1009 lsr r0,r0,#24
1010 strb r3,[$out],#1
1011 strb r0,[$out],#1
1012 subs $len,$len,#8
1013 beq .Lsqueeze_done
1014 
1015 subs $bsz,$bsz,#8 @ bsz -= 8
1016 bhi .Loop_squeeze
1017 
1018 mov r0,r14 @ original $A_flat
1019 
1020 bl KeccakF1600
1021 
1022 ldmia sp,{r6-r10,r12} @ restore constants and variables
1023 mov r14,$A_flat
1024 b .Loop_squeeze
1025 
1026 .align 4
1027 .Lsqueeze_tail:
1028 strb r2,[$out],#1
1029 lsr r2,r2,#8
1030 subs $len,$len,#1
1031 beq .Lsqueeze_done
1032 strb r2,[$out],#1
1033 lsr r2,r2,#8
1034 subs $len,$len,#1
1035 beq .Lsqueeze_done
1036 strb r2,[$out],#1
1037 lsr r2,r2,#8
1038 subs $len,$len,#1
1039 beq .Lsqueeze_done
1040 strb r2,[$out],#1
1041 subs $len,$len,#1
1042 beq .Lsqueeze_done
1043 
1044 strb r0,[$out],#1
1045 lsr r0,r0,#8
1046 subs $len,$len,#1
1047 beq .Lsqueeze_done
1048 strb r0,[$out],#1
1049 lsr r0,r0,#8
1050 subs $len,$len,#1
1051 beq .Lsqueeze_done
1052 strb r0,[$out]
1053 b .Lsqueeze_done
1054 
1055 .align 4
1056 .Lsqueeze_done:
1057 add sp,sp,#24
1058 ldmia sp!,{r4-r10,pc}
1059 .size SHA3_squeeze,.-SHA3_squeeze
1060 ___
1061 }
1062
1063 $code.=<<___;
1064 #if __ARM_MAX_ARCH__>=7
1065 .fpu neon
1066
@ iotas64 - the 24 Keccak-f[1600] round constants (FIPS 202), one
@ 64-bit constant per round.  KeccakF1600_neon walks this table with a
@ post-incremented pointer, XORing one entry into A[0][0] per round
@ (the iota step).
.type iotas64, %object
.align 5
iotas64:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas64,.-iotas64
1095
@ KeccakF1600_neon - full 24-round Keccak-f[1600] permutation on the
@ NEON register file (KECCAK_1X_ALT layout, per the header comment of
@ this file).
@
@ On entry the state lives in registers: rows A[0..1] are paired in
@ q0-q4 (lane j of row 0/1 in d(2j)/d(2j+1)), rows A[2..3] in q5-q9,
@ and row A[4] in d20-d24.  r0 must point at the (flat) state buffer,
@ which is reused here only as two 16-byte spill slots ([r0] and
@ [r1] = r0+16) since the live state is in registers.  Clobbers
@ q13-q15/d25 as scratch and r1-r3; returns with the permuted state in
@ the same registers.
.type KeccakF1600_neon, %function
.align 5
KeccakF1600_neon:
add r1, r0, #16
adr r2, iotas64
mov r3, #24 @ loop counter
b .Loop_neon

.align 4
.Loop_neon:
@ Theta
@ Column parities C[0..4] are accumulated in d25-d27/d30-d31; q4 and
@ d18 are spilled first so their q-registers can be reused as scratch.
vst1.64 {q4}, [r0,:64] @ offload A[0..1][4]
veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
vst1.64 {d18}, [r1,:64] @ offload A[2][4]
veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
veor d25, d25, d24 @ C[4]^=A[4][4]

@ ROL64(x,1) is synthesized as (x+x) | (x>>63) via vadd+vsri.
vadd.u64 q4, q13, q13 @ C[0..1]<<1
vadd.u64 q15, q14, q14 @ C[2..3]<<1
vadd.u64 d18, d25, d25 @ C[4]<<1
vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)

veor d0, d0, d25 @ A[0][0] ^= C[4]
veor d1, d1, d25 @ A[1][0] ^= C[4]
veor d10, d10, d25 @ A[2][0] ^= C[4]
veor d11, d11, d25 @ A[3][0] ^= C[4]
veor d20, d20, d25 @ A[4][0] ^= C[4]

veor d2, d2, d26 @ A[0][1] ^= D[1]
veor d3, d3, d26 @ A[1][1] ^= D[1]
veor d12, d12, d26 @ A[2][1] ^= D[1]
veor d13, d13, d26 @ A[3][1] ^= D[1]
veor d21, d21, d26 @ A[4][1] ^= D[1]
vmov d26, d27

veor d6, d6, d28 @ A[0][3] ^= C[2]
veor d7, d7, d28 @ A[1][3] ^= C[2]
veor d16, d16, d28 @ A[2][3] ^= C[2]
veor d17, d17, d28 @ A[3][3] ^= C[2]
veor d23, d23, d28 @ A[4][3] ^= C[2]
vld1.64 {q4}, [r0,:64] @ restore A[0..1][4]
vmov d28, d29

vld1.64 {d18}, [r1,:64] @ restore A[2][4]
veor q2, q2, q13 @ A[0..1][2] ^= D[2]
veor q7, q7, q13 @ A[2..3][2] ^= D[2]
veor d22, d22, d27 @ A[4][2] ^= D[2]

veor q4, q4, q14 @ A[0..1][4] ^= C[3]
veor q9, q9, q14 @ A[2..3][4] ^= C[3]
veor d24, d24, d29 @ A[4][4] ^= C[3]

@ Rho + Pi
@ Rotations use the vshl+vsri pair: left-shift by n, then shift-right-
@ insert by 64-n.  Rotations by a multiple of 8 (the two commented-out
@ vshl lines below) are done with a single byte-wise vext instead.
vmov d26, d2 @ C[1] = A[0][1]
vshl.u64 d2, d3, #44
vmov d27, d4 @ C[2] = A[0][2]
vshl.u64 d4, d14, #43
vmov d28, d6 @ C[3] = A[0][3]
vshl.u64 d6, d17, #21
vmov d29, d8 @ C[4] = A[0][4]
vshl.u64 d8, d24, #14
vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])

vshl.u64 d3, d9, #20
vshl.u64 d14, d16, #25
vshl.u64 d17, d15, #15
vshl.u64 d24, d21, #2
vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])

vshl.u64 d9, d22, #61
@ vshl.u64 d16, d19, #8
vshl.u64 d15, d12, #10
vshl.u64 d21, d7, #55
vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])

vshl.u64 d22, d18, #39
@ vshl.u64 d19, d23, #56
vshl.u64 d12, d5, #6
vshl.u64 d7, d13, #45
vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])

vshl.u64 d18, d20, #18
vshl.u64 d23, d11, #41
vshl.u64 d5, d10, #3
vshl.u64 d13, d1, #36
vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])

vshl.u64 d1, d28, #28
vshl.u64 d10, d26, #1
vshl.u64 d11, d29, #27
vshl.u64 d20, d27, #62
vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])

@ Chi + Iota
@ Chi is A[i][j] ^= ~A[i][j+1] & A[i][j+2], computed two rows at a
@ time with vbic (and-not); the round constant is folded into A[0][0]
@ at the end of the round.
vbic q13, q2, q1
vbic q14, q3, q2
vbic q15, q4, q3
veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
vst1.64 {q13}, [r0,:64] @ offload A[0..1][0]
vbic q13, q0, q4
vbic q15, q1, q0
vmov q1, q14 @ A[0..1][1]
veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

vbic q13, q7, q6
vmov q0, q5 @ A[2..3][0]
vbic q14, q8, q7
vmov q15, q6 @ A[2..3][1]
veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
vbic q13, q9, q8
veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
vbic q14, q0, q9
veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
vbic q13, q15, q0
veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
vmov q14, q10 @ A[4][0..1]
veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

vld1.64 d25, [r2,:64]! @ Iota[i++]
vbic d26, d22, d21
vbic d27, d23, d22
vld1.64 {q0}, [r0,:64] @ restore A[0..1][0]
veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
vbic d26, d24, d23
veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
vbic d27, d28, d24
veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
vbic d26, d29, d28
veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
veor d0, d0, d25 @ A[0][0] ^= Iota[i]
veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])

subs r3, r3, #1
bne .Loop_neon

bx lr
.size KeccakF1600_neon,.-KeccakF1600_neon
1270
@ SHA3_absorb_neon(A_flat, inp, len, bsz)
@
@ r0 = A_flat (25 x 64-bit Keccak state), r1 = inp, r2 = len,
@ r3 = bsz (sponge block size in bytes; the cmp ladder below handles
@ bsz up to 8*24 bytes).  Absorbs as many full bsz-byte blocks of inp
@ into the state as len allows, running KeccakF1600_neon after each,
@ then writes the state back and returns (in r0) the count of
@ unprocessed trailing bytes (always < bsz).
@
@ The loads below establish the register layout KeccakF1600_neon
@ expects: rows 0/1 interleaved in d0-d9, rows 2/3 in d10-d19, row 4
@ in d20-d24.  Input is read with vld1.8, which is endian-neutral.
.global SHA3_absorb_neon
.type SHA3_absorb_neon, %function
.align 5
SHA3_absorb_neon:
stmdb sp!, {r4-r6,lr}
vstmdb sp!, {d8-d15}

mov r4, r1 @ inp
mov r5, r2 @ len
mov r6, r3 @ bsz

vld1.32 {d0}, [r0,:64]! @ A[0][0]
vld1.32 {d2}, [r0,:64]! @ A[0][1]
vld1.32 {d4}, [r0,:64]! @ A[0][2]
vld1.32 {d6}, [r0,:64]! @ A[0][3]
vld1.32 {d8}, [r0,:64]! @ A[0][4]

vld1.32 {d1}, [r0,:64]! @ A[1][0]
vld1.32 {d3}, [r0,:64]! @ A[1][1]
vld1.32 {d5}, [r0,:64]! @ A[1][2]
vld1.32 {d7}, [r0,:64]! @ A[1][3]
vld1.32 {d9}, [r0,:64]! @ A[1][4]

vld1.32 {d10}, [r0,:64]! @ A[2][0]
vld1.32 {d12}, [r0,:64]! @ A[2][1]
vld1.32 {d14}, [r0,:64]! @ A[2][2]
vld1.32 {d16}, [r0,:64]! @ A[2][3]
vld1.32 {d18}, [r0,:64]! @ A[2][4]

vld1.32 {d11}, [r0,:64]! @ A[3][0]
vld1.32 {d13}, [r0,:64]! @ A[3][1]
vld1.32 {d15}, [r0,:64]! @ A[3][2]
vld1.32 {d17}, [r0,:64]! @ A[3][3]
vld1.32 {d19}, [r0,:64]! @ A[3][4]

vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..3]
vld1.32 {d24}, [r0,:64] @ A[4][4]
sub r0, r0, #24*8 @ rewind
b .Loop_absorb_neon

.align 4
.Loop_absorb_neon:
subs r12, r5, r6 @ len - bsz
blo .Labsorbed_neon @ not enough input for a full block
mov r5, r12

@ XOR one block into the state, one 8-byte lane at a time.  Every
@ other lane compares the running offset against bsz: "blo" bails out
@ when bsz is the odd multiple just passed, "beq" when it is exactly
@ the even multiple.
vld1.8 {d31}, [r4]! @ endian-neutral loads...
cmp r6, #8*2
veor d0, d0, d31 @ A[0][0] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d2, d2, d31 @ A[0][1] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*4
veor d4, d4, d31 @ A[0][2] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d6, d6, d31 @ A[0][3] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31},[r4]!
cmp r6, #8*6
veor d8, d8, d31 @ A[0][4] ^= *inp++
blo .Lprocess_neon

vld1.8 {d31}, [r4]!
veor d1, d1, d31 @ A[1][0] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*8
veor d3, d3, d31 @ A[1][1] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d5, d5, d31 @ A[1][2] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*10
veor d7, d7, d31 @ A[1][3] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d9, d9, d31 @ A[1][4] ^= *inp++
beq .Lprocess_neon

vld1.8 {d31}, [r4]!
cmp r6, #8*12
veor d10, d10, d31 @ A[2][0] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d12, d12, d31 @ A[2][1] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*14
veor d14, d14, d31 @ A[2][2] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d16, d16, d31 @ A[2][3] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*16
veor d18, d18, d31 @ A[2][4] ^= *inp++
blo .Lprocess_neon

vld1.8 {d31}, [r4]!
veor d11, d11, d31 @ A[3][0] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*18
veor d13, d13, d31 @ A[3][1] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d15, d15, d31 @ A[3][2] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*20
veor d17, d17, d31 @ A[3][3] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d19, d19, d31 @ A[3][4] ^= *inp++
beq .Lprocess_neon

vld1.8 {d31}, [r4]!
cmp r6, #8*22
veor d20, d20, d31 @ A[4][0] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d21, d21, d31 @ A[4][1] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
cmp r6, #8*24
veor d22, d22, d31 @ A[4][2] ^= *inp++
blo .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d23, d23, d31 @ A[4][3] ^= *inp++
beq .Lprocess_neon
vld1.8 {d31}, [r4]!
veor d24, d24, d31 @ A[4][4] ^= *inp++

.Lprocess_neon:
bl KeccakF1600_neon
b .Loop_absorb_neon

.align 4
.Labsorbed_neon:
@ Store the register-resident state back in the same interleaved
@ order it was loaded.
vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
vst1.32 {d2}, [r0,:64]!
vst1.32 {d4}, [r0,:64]!
vst1.32 {d6}, [r0,:64]!
vst1.32 {d8}, [r0,:64]!

vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
vst1.32 {d3}, [r0,:64]!
vst1.32 {d5}, [r0,:64]!
vst1.32 {d7}, [r0,:64]!
vst1.32 {d9}, [r0,:64]!

vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
vst1.32 {d12}, [r0,:64]!
vst1.32 {d14}, [r0,:64]!
vst1.32 {d16}, [r0,:64]!
vst1.32 {d18}, [r0,:64]!

vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
vst1.32 {d13}, [r0,:64]!
vst1.32 {d15}, [r0,:64]!
vst1.32 {d17}, [r0,:64]!
vst1.32 {d19}, [r0,:64]!

vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
vst1.32 {d24}, [r0,:64]

mov r0, r5 @ return value
vldmia sp!, {d8-d15}
ldmia sp!, {r4-r6,pc}
.size SHA3_absorb_neon,.-SHA3_absorb_neon
1445
@ SHA3_squeeze_neon(A_flat, out, len, bsz)
@
@ r0 = A_flat (25 x 64-bit Keccak state), r1 = out, r2 = len (bytes to
@ emit), r3 = bsz (sponge block size in bytes).  Copies the state to
@ out 8 bytes at a time; whenever a full bsz-byte block has been
@ emitted and more output is needed, the state is pulled into
@ registers, permuted with KeccakF1600_neon, and written back.  A
@ trailing len of 1..7 bytes is emitted bytewise (endian-neutral).
.global SHA3_squeeze_neon
.type SHA3_squeeze_neon, %function
.align 5
SHA3_squeeze_neon:
stmdb sp!, {r4-r6,lr}

mov r4, r1 @ out
mov r5, r2 @ len
mov r6, r3 @ bsz
mov r12, r0 @ A_flat
mov r14, r3 @ bsz
b .Loop_squeeze_neon

.align 4
.Loop_squeeze_neon:
cmp r5, #8
blo .Lsqueeze_neon_tail
vld1.32 {d0}, [r12]!
vst1.8 {d0}, [r4]! @ endian-neutral store

subs r5, r5, #8 @ len -= 8
beq .Lsqueeze_neon_done

subs r14, r14, #8 @ bsz -= 8
bhi .Loop_squeeze_neon

@ Block exhausted but more output wanted: load the whole state into
@ the interleaved register layout, permute, store it back, and reset
@ the per-block counter.  d8-d15 are callee-saved, hence the vstmdb.
vstmdb sp!, {d8-d15}

vld1.32 {d0}, [r0,:64]! @ A[0][0..4]
vld1.32 {d2}, [r0,:64]!
vld1.32 {d4}, [r0,:64]!
vld1.32 {d6}, [r0,:64]!
vld1.32 {d8}, [r0,:64]!

vld1.32 {d1}, [r0,:64]! @ A[1][0..4]
vld1.32 {d3}, [r0,:64]!
vld1.32 {d5}, [r0,:64]!
vld1.32 {d7}, [r0,:64]!
vld1.32 {d9}, [r0,:64]!

vld1.32 {d10}, [r0,:64]! @ A[2][0..4]
vld1.32 {d12}, [r0,:64]!
vld1.32 {d14}, [r0,:64]!
vld1.32 {d16}, [r0,:64]!
vld1.32 {d18}, [r0,:64]!

vld1.32 {d11}, [r0,:64]! @ A[3][0..4]
vld1.32 {d13}, [r0,:64]!
vld1.32 {d15}, [r0,:64]!
vld1.32 {d17}, [r0,:64]!
vld1.32 {d19}, [r0,:64]!

vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
vld1.32 {d24}, [r0,:64]
sub r0, r0, #24*8 @ rewind

bl KeccakF1600_neon

mov r12, r0 @ A_flat
vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
vst1.32 {d2}, [r0,:64]!
vst1.32 {d4}, [r0,:64]!
vst1.32 {d6}, [r0,:64]!
vst1.32 {d8}, [r0,:64]!

vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
vst1.32 {d3}, [r0,:64]!
vst1.32 {d5}, [r0,:64]!
vst1.32 {d7}, [r0,:64]!
vst1.32 {d9}, [r0,:64]!

vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
vst1.32 {d12}, [r0,:64]!
vst1.32 {d14}, [r0,:64]!
vst1.32 {d16}, [r0,:64]!
vst1.32 {d18}, [r0,:64]!

vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
vst1.32 {d13}, [r0,:64]!
vst1.32 {d15}, [r0,:64]!
vst1.32 {d17}, [r0,:64]!
vst1.32 {d19}, [r0,:64]!

vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
mov r14, r6 @ bsz
vst1.32 {d24}, [r0,:64]
mov r0, r12 @ rewind

vldmia sp!, {d8-d15}
b .Loop_squeeze_neon

.align 4
.Lsqueeze_neon_tail:
@ 1..7 remaining bytes: r2 holds the low word of the current lane,
@ r3 the high word; shift each right by 8 between byte stores.
ldmia r12, {r2,r3}
cmp r5, #2
strb r2, [r4],#1 @ endian-neutral store
lsr r2, r2, #8
blo .Lsqueeze_neon_done
strb r2, [r4], #1
lsr r2, r2, #8
beq .Lsqueeze_neon_done
strb r2, [r4], #1
lsr r2, r2, #8
cmp r5, #4
blo .Lsqueeze_neon_done
strb r2, [r4], #1
beq .Lsqueeze_neon_done

strb r3, [r4], #1
lsr r3, r3, #8
cmp r5, #6
blo .Lsqueeze_neon_done
strb r3, [r4], #1
lsr r3, r3, #8
beq .Lsqueeze_neon_done
strb r3, [r4], #1

.Lsqueeze_neon_done:
ldmia sp!, {r4-r6,pc}
.size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1566 #endif
1567 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1568 .align 2
1569 ___
1570
{
# %ldr/%str remember the register and effective address of the most
# recent "l"-half ldr/str so that the matching "h"-half can emit a
# fused ldrd/strd for Thumb-2 builds.  Declare BOTH hashes as
# lexicals: the original "my %ldr, %str;" only made %ldr lexical --
# "my" does not distribute over a comma-separated list without
# parentheses -- silently leaving %str as a package global.
my (%ldr, %str);

# ldrd($mnemonic, $half, $reg, $ea) - expand one half of a pseudo
# "ldr.l/ldr.h" (or "str.l/str.h") pair, as matched by the regex in
# the output loop below.  The "l" half records its operands and emits
# a plain ldr/str guarded for non-Thumb-2; the "h" half emits either
# the second plain ldr/str (ARM mode) or a single ldrd/strd covering
# both halves (Thumb-2).  Returns the replacement assembly text.
sub ldrd {
	my ($mnemonic,$half,$reg,$ea) = @_;
	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;	# per-mnemonic state

	if ($half eq "l") {
		$$op{reg} = $reg;		# stash for the matching "h" half
		$$op{ea} = $ea;
		sprintf "#ifndef __thumb2__\n" .
		" %s\t%s,%s\n" .
		"#endif", $mnemonic,$reg,$ea;
	} else {
		sprintf "#ifndef __thumb2__\n" .
		" %s\t%s,%s\n" .
		"#else\n" .
		" %sd\t%s,%s,%s\n" .
		"#endif", $mnemonic,$reg,$ea,
		$mnemonic,$$op{reg},$reg,$$op{ea};
	}
}
}
1594
# Post-process the generated code line by line: expand `...` escapes,
# then apply ISA-portability rewrites before printing.
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/ge;	# expand `perl-expression` escapes

# At most ONE of the following rewrites fires per line -- the "or"
# chain stops at the first substitution that matches:
#   1. "ldr.l/ldr.h" (and str.*) pseudo-ops -> ldrd() helper above;
#   2. bare-shift "ror/lsr/lsl rX,...#n" -> "mov rX,...,ror/lsr/lsl#n";
#   3. "ret" -> "bx lr";
#   4. literal "bx lr" -> its raw ARM opcode, so even a pre-ARMv4T
#      assembler accepts it.
s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush