]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-armv4.pl
a9ad3cbc3f8e0c2c766225f9ce3e1ae96a8a31e7
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-armv4.pl
1 #!/usr/bin/env perl
2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for ARMv4.
17 #
18 # June 2017.
19 #
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
27 #
28 # August 2017.
29 #
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
34 # eliminate corresponding instructions. As for latter. When examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
37 # going on. Just in case, why not all rotates are eliminated. Trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
45 # time being]...
46 #
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51 # usual...
52 #
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
56 #
57 # r=1088(*) Thumb-2(**) NEON
58 #
59 # ARM11xx 82/+150%
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
66 #
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
72
# Register allocation: ten lane registers and four scratch registers.
# A 64-bit lane lives in a lo/hi pair of 32-bit registers (bit-interleaved).
my @C = map { "r$_" } 0 .. 9;            # C[0..9]: five lanes as lo/hi pairs
my @E = map { "r$_" } 10 .. 12, 14;      # E[0..3]: scratch (r13 is sp)

########################################################################
# Stack frame (byte offsets):
#     0..199   uint64_t A[5][5]
#   200..239   uint64_t D[5]
#   240..439   uint64_t T[5][5]
#   440        saved lr
#   444        loop counter
#   448...     ...
#
# @A[i][j] / @T[i][j] are the byte offsets of lane (i,j); @D[i] likewise.
my @A = map { my $row = $_; [ map { 8 * ($row + $_) } 0 .. 4 ] } (0, 5, 10, 15, 20);
my @D = map { 8 * $_ } 25 .. 29;
my @T = map { my $row = $_; [ map { 8 * ($row + $_) } 0 .. 4 ] } (30, 35, 40, 45, 50);
97
# Static preamble emitted once: assembler mode directives, the round-constant
# table, and the entry points of the scalar Keccak-f[1600] permutation.
#
# iotas32 holds the 24 round constants in the split even/odd bit-interleaved
# form matching the lane representation used by this code path (one .long
# pair per round).  NOTE(review): values taken on trust from upstream; not
# re-derivable from this chunk alone.
#
# KeccakF1600_int expects the state A[5][5] already on the stack; it preloads
# A[4][2..4] into C[4..9] and pointers to A[0][0]/A[1][0] before falling into
# KeccakF1600_enter, which saves lr at sp+440 and zeroes the iota/loop
# counter at sp+444 (see stack layout above).
$code.=<<___;
.text

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
# Round(@src, @dst) - emit the assembly for one Keccak-f[1600] round in
# KECCAK_2X style: the state is read from the stack at the @src offsets and
# the fully processed round result is written to the @dst offsets, so two
# instantiations (A->T then T->A) ping-pong the state between the two stack
# areas without extra copying.
#
# Arguments are ten array refs of byte offsets: the first five are the
# source rows @A[0..4], the remainder the destination rows @R[0..4].
# Register conventions on entry match the tail of the previous round /
# KeccakF1600_int: C[4..9] hold src A[4][2..4], E[2] points at src A[0][0],
# E[0] at src A[1][0].
#
# Commented-out "@ ror" instructions mark rotates that have been merged into
# a later instruction's shifted operand (see the header notes on merged
# rotates): the named register is carried around un-rotated and the rotate
# amount is folded into each subsequent use.  Do not "clean these up".
sub Round {
# Unpack: first five refs are source-row offsets, the rest destination rows.
my (@A,@R); (@A[0..4],@R) = @_;

# Theta: accumulate the five column parities C[0..4] (as lo/hi register
# pairs C[2k]/C[2k+1]), derive D[] with the merged ROL64-by-1 (a 64-bit
# rotate by 1 is a cross-swap of halves with a 1-bit rotate on one half),
# then proceed straight into rho/pi/chi for row 0 of the output.
$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10	@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7	@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]	@ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21	@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	ldr	@E[2],[sp,#444]		@ load counter
	eor	@C[2],@C[2],@E[0]
	adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
# Even instantiation (src offsets != @T): just fetch iotas[i] for this round.
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}	@ iotas[i]
___
# Odd instantiation (src offsets == @T): fetch the *next* iota pair (+8/+12),
# advance the counter by 16 (two 8-byte iotas per double-round) and compare
# against 192 (24 iotas * 8 bytes); the resulting flags survive to the
# "blo .Lround2x" emitted after both instantiations.
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]	@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]	@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]		@ store counter
___
# Chi (+iota for row 0): R[i][j] = C[j] ^ (~C[j+1] & C[j+2]), interleaved
# with the loads/rotates for the next output row's pi-permuted inputs.
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	ror	@C[2],@C[2],#32-22	@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]	@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]	@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]	@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]	@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	add	@E[3],sp,#$D[3]
	ldr.l	@C[0],[sp,#$A[0][3]]	@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]	@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]	@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14	@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]	@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]	@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10	@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]	@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1	@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]	@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22	@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[9],@E[2],#32-30	@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]	@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]	@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]	@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]	@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	add	@E[2],sp,#$D[1]
	ldr.l	@C[1],[sp,#$A[0][1]]	@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]	@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]	@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]	@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1	@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]	@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3	@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]	@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12	@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13	@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]	@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4	@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9	@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]	@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]	@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]	@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]	@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	ldr.l	@C[1],[sp,#$A[0][4]]	@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]	@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]	@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]	@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}	@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]	@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13	@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]	@ A[2][1]
	@ ror	@C[0],@E[1],#32-14	@  [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]	@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18	@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]	@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5	@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]	@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7	@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28	@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]	@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]	@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]	@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]	@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	add	@E[3],sp,#$D[2]
	ldr.l	@C[0],[sp,#$A[0][2]]	@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]	@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]	@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]	@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]	@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31	@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]	@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]	@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27	@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]	@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19	@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]	@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20	@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1	@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]	@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]	@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]	@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]	@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]	@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
# Instantiate the round body twice so the state ping-pongs between the two
# stack areas (KECCAK_2X): even rounds read A[][] and write T[][], odd
# rounds read T[][] back into A[][].  The second instantiation also advances
# the iota counter and performs the cmp whose flags the "blo .Lround2x"
# below consumes (loop runs until the counter reaches 192, i.e. 24 rounds).
Round(@A,@T);
Round(@T,@A);
# KeccakF1600: standalone ABI wrapper - copies the caller's A[5][5] (r0)
# onto the freshly carved stack frame, runs the permutation via
# KeccakF1600_enter, then copies the state back out to the caller's buffer.
# The saved r0 slot sits at sp+440+16 and is skipped on exit (#440+20).
$code.=<<___;
	blo	.Lround2x

	ldr	pc,[sp,#440]
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16		@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]	@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
	ldmia	sp!,{r4-r11,pc}
.size	KeccakF1600,.-KeccakF1600
___
# SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
#             size_t bsz) - absorb full bsz-byte blocks into the state.
# Copies A to the stack, then for each block XORs 8 input bytes at a time
# into the state after bit-interleaving them (the mask/shift cascade below
# separates even and odd bits into the two 32-bit lane halves), runs
# KeccakF1600_int, and finally copies the state back.  Returns the number
# of unprocessed trailing bytes (len % bsz, or len if len < bsz).
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
	ldmia	sp!,{r4-r12,pc}
.size	SHA3_absorb,.-SHA3_absorb
___
}
# SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t bsz)
# - extract len output bytes from the state, bsz bytes per permutation.
# Each 64-bit lane is de-interleaved (the inverse of the absorb cascade:
# the even/odd bit halves are merged back into byte order) and stored to
# *out little-endian; after bsz bytes the state is re-permuted with
# KeccakF1600.  .Lsqueeze_tail handles a final partial (<8 byte) lane.
{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
1045
$code.=<<___;
.fpu	neon

@ Keccak-f[1600] round constants (Iota step), one 64-bit constant
@ per round, 24 rounds total.  Consumed sequentially by
@ KeccakF1600_neon through post-incremented r2.
.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64
@ Keccak-f[1600] permutation, NEON code path (KECCAK_1X_ALT with the
@ Keccak Code Package register layout: rows are paired in q-registers,
@ e.g. d0/d1 = A[0][0]/A[1][0]).  State is held in d0-d24 by the
@ caller; r0/r1 point at two 8/16-byte spill slots used to free up
@ registers within a round.  Clobbers r1-r3 and q0-q15.
.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0:64]		@ offload A[0..1][4]
	veor	q13, q0, q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1:64]		@ offload A[2][4]
	veor	q14, q1, q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2, q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3, q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4, q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	@ 64-bit rotate by 1 as shift-left-1 (vadd to self) merged with
	@ shift-right-insert of the remaining 63 bits.
	vadd.u64 q4,  q13, q13		@ C[0..1]<<1
	vadd.u64 q15, q14, q14		@ C[2..3]<<1
	vadd.u64 d18, d25, d25		@ C[4]<<1
	vsri.u64 q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64 q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64 d18, d25, #63		@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4}, [r0:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1:64]		@ restore A[2][4]
	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	@ Each lane is rotated by its rho offset while being moved to
	@ its pi destination; ROL64 is vshl merged with vsri.
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64 d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64 d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64 d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64 d8,  d24, #14
	vsri.u64 d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64 d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64 d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64 d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64 d3,  d9,  #20
	vshl.u64 d14, d16, #25
	vshl.u64 d17, d15, #15
	vshl.u64 d24, d21, #2
	vsri.u64 d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64 d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64 d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64 d24, d21, #64-2	@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64 d9,  d22, #61
	@ vshl.u64 d16, d19, #8		@ rotate by 8 done as byte permute (vext) below
	vshl.u64 d15, d12, #10
	vshl.u64 d21, d7,  #55
	vsri.u64 d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64 d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64 d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64 d22, d18, #39
	@ vshl.u64 d19, d23, #56	@ rotate by 56 done as byte permute (vext) below
	vshl.u64 d12, d5,  #6
	vshl.u64 d7,  d13, #45
	vsri.u64 d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64 d12, d5,  #64-6	@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64 d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64 d18, d20, #18
	vshl.u64 d23, d11, #41
	vshl.u64 d5,  d10, #3
	vshl.u64 d13, d1,  #36
	vsri.u64 d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64 d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64 d5,  d10, #64-3	@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64 d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64 d1,  d28, #28
	vshl.u64 d10, d26, #1
	vshl.u64 d11, d29, #27
	vshl.u64 d20, d27, #62
	vsri.u64 d1,  d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64 d10, d26, #64-1	@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64 d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64 d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon
1252
@ size_t SHA3_absorb_neon(uint64_t A[5][5], const uint8_t *inp,
@                         size_t len, size_t bsz);
@ Absorbs full bsz-byte blocks of inp into the state, permuting after
@ each block, and returns the number of leftover bytes (< bsz) that
@ were not processed.
.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	@ Load the 25-lane state into d0-d24; rows are interleaved so
	@ that q-registers pair rows 0/1 and 2/3 (d0/d1 = A[0][0]/A[1][0]).
	vld1.32	{d0}, [r0:64]!		@ A[0][0]
	vld1.32	{d2}, [r0:64]!		@ A[0][1]
	vld1.32	{d4}, [r0:64]!		@ A[0][2]
	vld1.32	{d6}, [r0:64]!		@ A[0][3]
	vld1.32	{d8}, [r0:64]!		@ A[0][4]

	vld1.32	{d1}, [r0:64]!		@ A[1][0]
	vld1.32	{d3}, [r0:64]!		@ A[1][1]
	vld1.32	{d5}, [r0:64]!		@ A[1][2]
	vld1.32	{d7}, [r0:64]!		@ A[1][3]
	vld1.32	{d9}, [r0:64]!		@ A[1][4]

	vld1.32	{d10}, [r0:64]!		@ A[2][0]
	vld1.32	{d12}, [r0:64]!		@ A[2][1]
	vld1.32	{d14}, [r0:64]!		@ A[2][2]
	vld1.32	{d16}, [r0:64]!		@ A[2][3]
	vld1.32	{d18}, [r0:64]!		@ A[2][4]

	vld1.32	{d11}, [r0:64]!		@ A[3][0]
	vld1.32	{d13}, [r0:64]!		@ A[3][1]
	vld1.32	{d15}, [r0:64]!		@ A[3][2]
	vld1.32	{d17}, [r0:64]!		@ A[3][3]
	vld1.32	{d19}, [r0:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	@ XOR one bsz-byte block into the state lane by lane, in the
	@ absorption order A[0][*], A[1][*], ... A[4][*], branching to
	@ the permutation as soon as bsz bytes are consumed.
	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	@ Store the state back to memory in the same interleaved order
	@ it was loaded in.
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0:64]

	mov	r0, r5			@ return value: leftover bytes (< bsz)
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon
1427
@ void SHA3_squeeze_neon(uint64_t A[5][5], uint8_t *out, size_t len,
@                        size_t bsz);
@ Emits len output bytes from the state, permuting with
@ KeccakF1600_neon whenever a bsz-byte block is exhausted.
.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz, running per-block countdown
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	@ Block exhausted: load the state into d0-d24, permute it,
	@ store it back and start a fresh block.
	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0:64]!
	vld1.32	{d4}, [r0:64]!
	vld1.32	{d6}, [r0:64]!
	vld1.32	{d8}, [r0:64]!

	vld1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0:64]!
	vld1.32	{d5}, [r0:64]!
	vld1.32	{d7}, [r0:64]!
	vld1.32	{d9}, [r0:64]!

	vld1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0:64]!
	vld1.32	{d14}, [r0:64]!
	vld1.32	{d16}, [r0:64]!
	vld1.32	{d18}, [r0:64]!

	vld1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0:64]!
	vld1.32	{d15}, [r0:64]!
	vld1.32	{d17}, [r0:64]!
	vld1.32	{d19}, [r0:64]!

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	mov	r14, r6			@ reset per-block countdown to bsz
	vst1.32	{d24}, [r0:64]
	mov	r0, r12			@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	@ 1..7 bytes left: r2 holds the low word of the current lane,
	@ r3 the high word; emit them byte by byte.
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done	@ len == 1
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done	@ len == 2
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done	@ len == 3
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done	@ len == 4

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done	@ len == 5
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done	@ len == 6
	strb	r3, [r4], #1		@ len == 7

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
1548 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1549 .align 2
1550 ___
1551
{
# Fix: the original "my %ldr, %str;" declared only %ldr lexically,
# leaving %str as an undeclared package global (a compile error under
# "use strict").  Declare both lexically; the closure below captures them.
my (%ldr, %str);

# ldrd($mnemonic, $half, $reg, $ea)
#
# Translate a pseudo "ldr.l/ldr.h" (or "str.l/str.h") pair: in ARM
# mode each half is emitted as a plain ldr/str, while in Thumb-2 mode
# the "h" half is emitted as a single ldrd/strd combining both halves
# (not every ldr/str addressing mode used here exists in Thumb-2).
# The "l" half records its register and effective address in %ldr/%str
# so the matching "h" half can reference them.  Returns the assembly
# text for the post-processing loop to print.
sub ldrd {
    my ($mnemonic, $half, $reg, $ea) = @_;
    my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

    if ($half eq "l") {
	$$op{reg} = $reg;		# remember low-half register...
	$$op{ea}  = $ea;		# ...and address for the "h" call
	sprintf	"#ifndef __thumb2__\n" .
		"	%s\t%s,%s\n" .
		"#endif",	$mnemonic, $reg, $ea;
    } else {
	sprintf	"#ifndef __thumb2__\n" .
		"	%s\t%s,%s\n" .
		"#else\n" .
		"	%sd\t%s,%s,%s\n" .
		"#endif",	$mnemonic, $reg, $ea,
				$mnemonic, $$op{reg}, $reg, $$op{ea};
    }
}
}
1575
# Last command-line argument is the output file.  Use checked,
# three-argument open instead of the original unchecked two-argument
# form (which fails silently and is susceptible to mode injection via
# the filename).
my $output = pop;
open STDOUT, ">", $output or die "can't open $output: $!";

# Post-process $code line by line: expand `...` as Perl expressions,
# then apply exactly one of the instruction rewrites per line.
foreach (split($/, $code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
	s/\bret\b/bx	lr/g or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

	print $_, "\n";
}

# Check close so buffered write errors (e.g. full disk) are reported.
close STDOUT or die "error closing $output: $!"; # enforce flush