#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# The non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c)
# with bit interleaving. How does it compare to the Keccak Code
# Package? It's as fast, but several times smaller, and is endian- and
# ISA-neutral. ISA neutrality means that the minimum ISA requirement
# is ARMv4, yet the code can be assembled even as Thumb-2. The NEON
# code path is KECCAK_1X_ALT with the register layout taken from the
# Keccak Code Package. It's also as fast, in fact 10-15% faster on
# some processors, and endian-neutral.
#
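# As an illustration of the bit interleaving (a sketch for the reader,
# not something this module emits): each 64-bit lane w is kept as two
# 32-bit halves, the even-numbered bits of w in one and the
# odd-numbered bits in the other, so that a 64-bit rotation costs two
# 32-bit rotations:
#
#	lo = bits 0,2,4,...,62 of w;  hi = bits 1,3,5,...,63 of w;
#	ROL64(w,2k)   == ( ROL32(lo,k),   ROL32(hi,k) )
#	ROL64(w,2k+1) == ( ROL32(hi,k+1), ROL32(lo,k) )
#
# This is why the rotation amounts in the code below appear halved,
# and why odd amounts swap the roles of the two halves.
#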
# August 2017.
#
# Switch to the KECCAK_2X variant for the non-NEON code and merge
# almost half of the rotate instructions with logical ones. This
# resulted in ~10% improvement on most processors. The switch to
# KECCAK_2X effectively minimizes re-loads from temporary storage,
# while the merged rotates simply eliminate the corresponding
# instructions. As for the latter: when examining the code you'll
# notice commented-out ror instructions. These are the eliminated
# ones, and you should trace the destination register below to see
# what's going on. In case you wonder why not all rotates are
# eliminated: the trouble is that some operations require both inputs
# to be rotated, e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved
# by using 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in the next
# operation that takes 'a' as input. The catch is that this next
# operation can be in the next round. It's entirely possible to
# "carry" rotate "factors" to the next round, but it makes the code
# more complex. And "complex" is the keyword here, i.e. "almost half"
# is a kind of complexity cap [for the time being]...
#
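# A worked example of such a merge (illustration only, amounts taken
# mod 32): to compute a = (b>>>x) ^ (c>>>y), one can issue
# 'eor a,b,c,ror#(y-x)' and remember that 'a' still owes a rotation
# by x; the instruction that consumes 'a' then uses the operand
# 'a,ror#x', because ((b ^ (c>>>(y-x))) >>> x) == (b>>>x) ^ (c>>>y).
#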
# Reduce the per-round instruction count in the Thumb-2 case by 16%.
# This is achieved by folding ldr/str pairs to their double-word
# counterparts. Theoretically this should have improved performance
# on single-issue cores, such as Cortex-A5/A7, by 19%. Reality is a
# bit different, as usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results even account
# for the cost of input bit interleaving.
#
#		r=1088(*)   Thumb-2(**)	NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,   86,		36
# Cortex-A7	78/+160%,   68,		34
# Cortex-A8	51/+230%,   57,		30
# Cortex-A9	53/+210%,   51,		26
# Cortex-A15	42/+160%,   38,		18
# Snapdragon S4	43/+210%,   38,		24
#
# (*)	Corresponds to SHA3-256. The percentage after the slash is the
#	improvement over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
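
# A typical invocation (an assumption about the surrounding build
# system, not something this script checks) pipes the result through
# the arm-xlate.pl translator located above, e.g.:
#
#	perl keccak1600-armv4.pl linux32 keccak1600-armv4.S
#
# where "linux32" is the $flavour and the .S file is the $output.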

my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
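# For instance, $A[1][2] expands to 8*(5*1+2) = 56, $D[0] to 8*25 = 200
# and $T[0][0] to 8*30 = 240, i.e. the byte offsets of those lanes
# within the stack frame sketched above.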

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

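@ The 24 round constants below are stored bit-interleaved: the first
@ word of each pair holds the even-numbered bits of the 64-bit iota
@ constant, the second word the odd-numbered bits. E.g. the second
@ constant, 0x0000000000008082, has only odd bits set (1, 7 and 15),
@ hence the (0x00000000, 0x00000089) pair below.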
.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
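# Round() emits the code for one Keccak-f[1600] round. It is
# instantiated twice below, once mapping the state A[][] into the
# scratch copy T[][] and once mapping T[][] back into A[][], so each
# pass over .Lround2x processes two rounds (this is the KECCAK_2X
# "ping-pong" mentioned at the top of the file).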
sub Round {
my (@A,@R); (@A[0..4],@R) = @_;

$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

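	@ Here C[0..9] hold the five column parities as interleaved
	@ 32-bit pairs. Next, D[i] = C[i-1] ^ ROL64(C[i+1],1) is formed;
	@ a rotate by 1 is odd, so it swaps the even and odd halves,
	@ which is why each ror#32-1 below is applied to the half that
	@ came from the other word of its pair.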
	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	add	@E[3],sp,#$D[3]
	ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	add	@E[2],sp,#$D[1]
	ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@  [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	add	@E[3],sp,#$D[2]
	ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
Round(@A,@T);
Round(@T,@A);
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16		@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0, {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	sp, {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]	@ restore pointer to A
	ldmia	sp, {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

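	@ The shift-and-mask ladder below performs the bit interleaving:
	@ it separates the even-numbered from the odd-numbered bits of
	@ the little-endian 64-bit word just assembled in r0:r1, so that
	@ the 16-bit combine further down leaves the even bits in r4 and
	@ the odd bits in r5 (cf. the BitInterleave comment at the stmia).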
	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

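	@ The inverse of the absorb-side ladder: starting from the widest
	@ masks and working back down to 0x55555555, it re-interleaves
	@ the even-bit half (r0/r2) with the odd-bit half (r1/r3) before
	@ the bytes are stored out.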
	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	iotas64, %object
.align	5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
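	@ Register layout, as established by the loads in
	@ SHA3_absorb_neon below: rows 0 and 1 of the state live
	@ pairwise in q0-q4 (d0 is lane [0][0], d1 is [1][0], d2 is
	@ [0][1], and so on), rows 2 and 3 in q5-q9, and row 4 in
	@ q10, q11 and d24.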
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64 q4,  q13, q13		@ C[0..1]<<1
	vadd.u64 q15, q14, q14		@ C[2..3]<<1
	vadd.u64 d18, d25, d25		@ C[4]<<1
	vsri.u64 q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64 q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64 d18, d25, #63		@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64 d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64 d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64 d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64 d8,  d24, #14
	vsri.u64 d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64 d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64 d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64 d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64 d3,  d9,  #20
	vshl.u64 d14, d16, #25
	vshl.u64 d17, d15, #15
	vshl.u64 d24, d21, #2
	vsri.u64 d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64 d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64 d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64 d24, d21, #64-2	@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64 d9,  d22, #61
	@ vshl.u64 d16, d19, #8
	vshl.u64 d15, d12, #10
	vshl.u64 d21, d7,  #55
	vsri.u64 d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64 d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64 d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64 d22, d18, #39
	@ vshl.u64 d19, d23, #56
	vshl.u64 d12, d5,  #6
	vshl.u64 d7,  d13, #45
	vsri.u64 d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64 d12, d5,  #64-6	@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64 d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64 d18, d20, #18
	vshl.u64 d23, d11, #41
	vshl.u64 d5,  d10, #3
	vshl.u64 d13, d1,  #36
	vsri.u64 d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64 d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64 d5,  d10, #64-3	@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64 d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64 d1,  d28, #28
	vshl.u64 d10, d26, #1
	vshl.u64 d11, d29, #27
	vshl.u64 d20, d27, #62
	vsri.u64 d1,  d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64 d10, d26, #64-1	@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64 d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64 d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	ret
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!	@ A[2][0]
	vld1.32	{d12}, [r0,:64]!	@ A[2][1]
	vld1.32	{d14}, [r0,:64]!	@ A[2][2]
	vld1.32	{d16}, [r0,:64]!	@ A[2][3]
	vld1.32	{d18}, [r0,:64]!	@ A[2][4]

	vld1.32	{d11}, [r0,:64]!	@ A[3][0]
	vld1.32	{d13}, [r0,:64]!	@ A[3][1]
	vld1.32	{d15}, [r0,:64]!	@ A[3][2]
	vld1.32	{d17}, [r0,:64]!	@ A[3][3]
	vld1.32	{d19}, [r0,:64]!	@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0, r12			@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif
.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

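# The pass below pairs each ldr.l/str.l ("low half") instruction
# emitted above with its ldr.h/str.h ("high half") mate: in Thumb-2
# builds the pair is folded into a single ldrd/strd, otherwise the
# suffix is dropped and two plain ldr/str remain.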
{
my (%ldr, %str);

sub ldrd {
    my ($mnemonic,$half,$reg,$ea) = @_;
    my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

    if ($half eq "l") {
	$$op{reg} = $reg;
	$$op{ea}  = $ea;
	sprintf "#ifndef	__thumb2__\n" .
		"	%s\t%s,%s\n" .
		"#endif",	$mnemonic,$reg,$ea;
    } else {
	sprintf "#ifndef	__thumb2__\n" .
		"	%s\t%s,%s\n" .
		"#else\n" .
		"	%sd\t%s,%s,%s\n" .
		"#endif",	$mnemonic,$reg,$ea,
				$mnemonic,$$op{reg},$reg,$$op{ea};
    }
}
}

foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
	s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g or
	s/\bret\b/bx	lr/g or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush