]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/keccak1600-armv4.pl
Update copyright year
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-armv4.pl
CommitLineData
56676f87 1#!/usr/bin/env perl
b0edda11 2# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
56676f87
AP
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv4.
17#
18# June 2017.
19#
367c5527
AP
20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21# interleaving. How does it compare to Keccak Code Package? It's as
22# fast, but several times smaller, and is endian- and ISA-neutral. ISA
23# neutrality means that minimum ISA requirement is ARMv4, yet it can
24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25# register layout taken from Keccak Code Package. It's also as fast,
26# in fact faster by 10-15% on some processors, and endian-neutral.
56676f87 27#
d9ca12cb
AP
28# August 2017.
29#
30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31# of rotate instructions with logical ones. This resulted in ~10%
32# improvement on most processors. Switch to KECCAK_2X effectively
33# minimizes re-loads from temporary storage, and merged rotates just
34# eliminate corresponding instructions. As for the latter: when examining
35# the code you'll notice commented-out ror instructions. These are the
36# eliminated ones; trace the destination register below to see what's
37# going on. As for why not all rotates are eliminated: the trouble
38# is that you have operations that require both inputs to be rotated,
39# e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41# that takes 'a' as input. And thing is that this next operation can
42# be in next round. It's totally possible to "carry" rotate "factors"
43# to the next round, but it makes code more complex. And the last word
44# is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
45# time being]...
46#
e0584e96
AP
47# Reduce per-round instruction count in Thumb-2 case by 16%. This is
48# achieved by folding ldr/str pairs to their double-word counterparts.
49# Theoretically this should have improved performance on single-issue
50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51# usual...
52#
56676f87 53########################################################################
367c5527 54# Numbers are cycles per processed byte. Non-NEON results account even
6dad1efe 55# for input bit interleaving.
56676f87 56#
e0584e96 57# r=1088(*) Thumb-2(**) NEON
56676f87 58#
d9ca12cb 59# ARM11xx 82/+150%
e0584e96
AP
60# Cortex-A5 88/+160%, 86, 36
61# Cortex-A7 78/+160%, 68, 34
62# Cortex-A8 51/+230%, 57, 30
63# Cortex-A9 53/+210%, 51, 26
64# Cortex-A15 42/+160%, 38, 18
65# Snapdragon S4 43/+210%, 38, 24
56676f87 66#
d9ca12cb
AP
67# (*) Corresponds to SHA3-256. Percentage after slash is improvement
68# over compiler-generated KECCAK_2X reference code.
e0584e96
AP
69# (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70# Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71# processors are presented mostly for reference purposes.
56676f87 72
aabfd329
AP
# Register names interpolated into the assembly templates below:
# @C is r0-r9 and @E is r10, r11, r12 and r14.  r14 is also lr, which is
# why KeccakF1600_enter stores lr on the stack before @E[3] gets used.
73my @C = map("r$_",(0..9));
74my @E = map("r$_",(10..12,14));
75
56676f87
AP
76########################################################################
77# Stack layout
78# ----->+-----------------------+
79# | uint64_t A[5][5] |
80# | ... |
81# +200->+-----------------------+
82# | uint64_t D[5] |
83# | ... |
84# +240->+-----------------------+
d9ca12cb 85# | uint64_t T[5][5] |
56676f87 86# | ... |
d9ca12cb 87# +440->+-----------------------+
56676f87 88# | saved lr |
d9ca12cb 89# +444->+-----------------------+
56676f87 90# | loop counter |
d9ca12cb 91# +448->+-----------------------+
56676f87
AP
92# | ...
93
aabfd329
AP
# Byte offsets (used as [sp,#offset] in the templates) of the 8-byte lanes
# in the frame pictured above.  @A and @T are 5x5 tables of row references,
# @D is a flat list of five offsets:
#   $A[i][j] = 8*(5*i+j)        ->   0..192
#   $D[i]    = 8*(25+i)         -> 200..232
#   $T[i][j] = 8*(30+5*i+j)     -> 240..432
94my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
95my @D = map(8*$_, (25..29));
d9ca12cb 96my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
aabfd329
AP
97
# Emit the start of the module: the Thumb-2/ARM mode selection, the
# iotas32 table (24 entries of two 32-bit words each, i.e. 192 bytes) and
# the entry sequence of the round function.
#
# KeccakF1600_int points @E[2] at A[0][0] and @E[0] at A[1][0] on the
# stack and preloads @C[4..9] with A[4][2..4].  KeccakF1600_enter is a
# second entry point that skips this set-up (KeccakF1600 below prepares
# the same registers itself).  Both paths then save lr at sp+440, zero
# the word at sp+444 that serves as the loop counter, and branch to
# .Lround2x, whose body is appended by the Round() calls further down.
98$code.=<<___;
99.text
100
56676f87
AP
101#if defined(__thumb2__)
102.syntax unified
103.thumb
104#else
105.code 32
106#endif
107
367c5527 108.type iotas32, %object
aabfd329 109.align 5
367c5527 110iotas32:
aabfd329
AP
111 .long 0x00000001, 0x00000000
112 .long 0x00000000, 0x00000089
113 .long 0x00000000, 0x8000008b
114 .long 0x00000000, 0x80008080
115 .long 0x00000001, 0x0000008b
116 .long 0x00000001, 0x00008000
117 .long 0x00000001, 0x80008088
118 .long 0x00000001, 0x80000082
119 .long 0x00000000, 0x0000000b
120 .long 0x00000000, 0x0000000a
121 .long 0x00000001, 0x00008082
122 .long 0x00000000, 0x00008003
123 .long 0x00000001, 0x0000808b
124 .long 0x00000001, 0x8000000b
125 .long 0x00000001, 0x8000008a
126 .long 0x00000001, 0x80000081
127 .long 0x00000000, 0x80000081
128 .long 0x00000000, 0x80000008
129 .long 0x00000000, 0x00000083
130 .long 0x00000000, 0x80008003
131 .long 0x00000001, 0x80008088
132 .long 0x00000000, 0x80000088
133 .long 0x00000001, 0x00008000
134 .long 0x00000000, 0x80008082
367c5527 135.size iotas32,.-iotas32
aabfd329 136
56676f87 137.type KeccakF1600_int, %function
aabfd329 138.align 5
56676f87 139KeccakF1600_int:
d9ca12cb
AP
140 add @C[9],sp,#$A[4][2]
141 add @E[2],sp,#$A[0][0]
aabfd329 142 add @E[0],sp,#$A[1][0]
d9ca12cb 143 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
56676f87 144KeccakF1600_enter:
d9ca12cb 145 str lr,[sp,#440]
56676f87 146 eor @E[1],@E[1],@E[1]
d9ca12cb 147 str @E[1],[sp,#444]
e0584e96 148 b .Lround2x
aabfd329
AP
149
150.align 4
e0584e96 151.Lround2x:
d9ca12cb
AP
152___
# Round(@src, @dst) appends the assembly for one round to $code.
#
# Arguments: ten array references.  The first five are the rows of the
# offset table the state is read from; the remaining five (collected in
# @R) are the rows of the table the result is written to.  The caller
# invokes it as Round(@A,@T) followed by Round(@T,@A), so one pass through
# .Lround2x performs two rounds, moving the state from the A area to the
# T area and back.
#
# The lexical @A declared here shadows the file-level @A, whereas @T and
# @D still refer to the file-level tables.  Comparing $A[0][0] with
# $T[0][0] therefore tells the two instantiations apart:
#   - source is A (first round of a pair): both words of iotas[i] are
#     loaded from iotas32 + counter;
#   - source is T (second round of a pair): the following table entry is
#     loaded from offsets +8/+12, the counter kept at sp+444 is advanced
#     by 16, compared with 192 (the size of iotas32) and stored back.
#     None of the instructions emitted after that cmp carries an 's'
#     suffix, so the flags are still those of the cmp when the caller's
#     "blo .Lround2x" is reached.
#
# Register contract of the emitted code: on entry @E[2] and @E[0] point at
# the source lanes [0][0] and [1][0] and @C[4..9] hold the source lanes
# [4][2..4].  On exit @E[2] and @E[0] point at $R[0][0] and $R[1][0] and
# @C[4..9] hold the freshly computed R[4][2..4], which is exactly the
# entry state for a following round that reads from @R.
#
# The #ifdef __thumb2__ arm accumulates the column sums with ldrd loads;
# the #else arm does the same with ldmia/ldr.
153sub Round {
154my (@A,@R); (@A[0..4],@R) = @_;
155
156$code.=<<___;
157 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
aabfd329 158 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
e0584e96
AP
159#ifdef __thumb2__
160 eor @C[0],@C[0],@E[0]
161 eor @C[1],@C[1],@E[1]
162 eor @C[2],@C[2],@E[2]
163 ldrd @E[0],@E[1],[sp,#$A[1][2]]
164 eor @C[3],@C[3],@E[3]
165 ldrd @E[2],@E[3],[sp,#$A[1][3]]
166 eor @C[4],@C[4],@E[0]
167 eor @C[5],@C[5],@E[1]
168 eor @C[6],@C[6],@E[2]
169 ldrd @E[0],@E[1],[sp,#$A[1][4]]
170 eor @C[7],@C[7],@E[3]
171 ldrd @E[2],@E[3],[sp,#$A[2][0]]
172 eor @C[8],@C[8],@E[0]
173 eor @C[9],@C[9],@E[1]
174 eor @C[0],@C[0],@E[2]
175 ldrd @E[0],@E[1],[sp,#$A[2][1]]
176 eor @C[1],@C[1],@E[3]
177 ldrd @E[2],@E[3],[sp,#$A[2][2]]
178 eor @C[2],@C[2],@E[0]
179 eor @C[3],@C[3],@E[1]
180 eor @C[4],@C[4],@E[2]
181 ldrd @E[0],@E[1],[sp,#$A[2][3]]
182 eor @C[5],@C[5],@E[3]
183 ldrd @E[2],@E[3],[sp,#$A[2][4]]
184 eor @C[6],@C[6],@E[0]
185 eor @C[7],@C[7],@E[1]
186 eor @C[8],@C[8],@E[2]
187 ldrd @E[0],@E[1],[sp,#$A[3][0]]
188 eor @C[9],@C[9],@E[3]
189 ldrd @E[2],@E[3],[sp,#$A[3][1]]
190 eor @C[0],@C[0],@E[0]
191 eor @C[1],@C[1],@E[1]
192 eor @C[2],@C[2],@E[2]
193 ldrd @E[0],@E[1],[sp,#$A[3][2]]
194 eor @C[3],@C[3],@E[3]
195 ldrd @E[2],@E[3],[sp,#$A[3][3]]
196 eor @C[4],@C[4],@E[0]
197 eor @C[5],@C[5],@E[1]
198 eor @C[6],@C[6],@E[2]
199 ldrd @E[0],@E[1],[sp,#$A[3][4]]
200 eor @C[7],@C[7],@E[3]
201 ldrd @E[2],@E[3],[sp,#$A[4][0]]
202 eor @C[8],@C[8],@E[0]
203 eor @C[9],@C[9],@E[1]
204 eor @C[0],@C[0],@E[2]
205 ldrd @E[0],@E[1],[sp,#$A[4][1]]
206 eor @C[1],@C[1],@E[3]
207 ldrd @E[2],@E[3],[sp,#$A[0][2]]
208 eor @C[2],@C[2],@E[0]
209 eor @C[3],@C[3],@E[1]
210 eor @C[4],@C[4],@E[2]
211 ldrd @E[0],@E[1],[sp,#$A[0][3]]
212 eor @C[5],@C[5],@E[3]
213 ldrd @E[2],@E[3],[sp,#$A[0][4]]
214#else
aabfd329
AP
215 eor @C[0],@C[0],@E[0]
216 add @E[0],sp,#$A[1][2]
217 eor @C[1],@C[1],@E[1]
218 eor @C[2],@C[2],@E[2]
219 eor @C[3],@C[3],@E[3]
220 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
221 eor @C[4],@C[4],@E[0]
222 add @E[0],sp,#$A[1][4]
223 eor @C[5],@C[5],@E[1]
224 eor @C[6],@C[6],@E[2]
225 eor @C[7],@C[7],@E[3]
226 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
227 eor @C[8],@C[8],@E[0]
228 add @E[0],sp,#$A[2][1]
229 eor @C[9],@C[9],@E[1]
230 eor @C[0],@C[0],@E[2]
231 eor @C[1],@C[1],@E[3]
232 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
233 eor @C[2],@C[2],@E[0]
234 add @E[0],sp,#$A[2][3]
235 eor @C[3],@C[3],@E[1]
236 eor @C[4],@C[4],@E[2]
237 eor @C[5],@C[5],@E[3]
238 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
239 eor @C[6],@C[6],@E[0]
240 add @E[0],sp,#$A[3][0]
241 eor @C[7],@C[7],@E[1]
242 eor @C[8],@C[8],@E[2]
243 eor @C[9],@C[9],@E[3]
244 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
245 eor @C[0],@C[0],@E[0]
246 add @E[0],sp,#$A[3][2]
247 eor @C[1],@C[1],@E[1]
248 eor @C[2],@C[2],@E[2]
249 eor @C[3],@C[3],@E[3]
250 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
251 eor @C[4],@C[4],@E[0]
252 add @E[0],sp,#$A[3][4]
253 eor @C[5],@C[5],@E[1]
254 eor @C[6],@C[6],@E[2]
255 eor @C[7],@C[7],@E[3]
256 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
257 eor @C[8],@C[8],@E[0]
d9ca12cb 258 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
aabfd329 259 eor @C[9],@C[9],@E[1]
d9ca12cb 260 ldr @E[1],[sp,#$A[4][1]+4]
aabfd329 261 eor @C[0],@C[0],@E[2]
d9ca12cb 262 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
aabfd329 263 eor @C[1],@C[1],@E[3]
d9ca12cb 264 ldr @E[3],[sp,#$A[0][2]+4]
aabfd329 265 eor @C[2],@C[2],@E[0]
d9ca12cb 266 add @E[0],sp,#$A[0][3]
aabfd329
AP
267 eor @C[3],@C[3],@E[1]
268 eor @C[4],@C[4],@E[2]
269 eor @C[5],@C[5],@E[3]
d9ca12cb 270 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
e0584e96 271#endif
aabfd329
AP
272 eor @C[6],@C[6],@E[0]
273 eor @C[7],@C[7],@E[1]
274 eor @C[8],@C[8],@E[2]
275 eor @C[9],@C[9],@E[3]
276
277 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
e0584e96 278 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
aabfd329 279 eor @E[1],@C[1],@C[4]
e0584e96 280 str.h @E[1],[sp,#$D[1]+4]
aabfd329 281 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
aabfd329 282 eor @E[3],@C[7],@C[0]
e0584e96 283 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
aabfd329 284 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
e0584e96 285 str.h @E[3],[sp,#$D[4]+4]
aabfd329 286 eor @C[1],@C[9],@C[2]
e0584e96 287 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
aabfd329 288 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
e0584e96 289 ldr.l @C[7],[sp,#$A[3][3]]
aabfd329 290 eor @C[3],@C[3],@C[6]
e0584e96
AP
291 str.h @C[1],[sp,#$D[0]+4]
292 ldr.h @C[6],[sp,#$A[3][3]+4]
293 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
aabfd329 294 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
e0584e96 295 str.h @C[3],[sp,#$D[2]+4]
aabfd329 296 eor @C[5],@C[5],@C[8]
aabfd329 297
e0584e96
AP
298 ldr.l @C[8],[sp,#$A[4][4]]
299 ldr.h @C[9],[sp,#$A[4][4]+4]
300 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
d9ca12cb 301 eor @C[7],@C[7],@C[4]
e0584e96 302 str.h @C[5],[sp,#$D[3]+4]
d9ca12cb 303 eor @C[6],@C[6],@C[5]
e0584e96 304 ldr.l @C[4],[sp,#$A[0][0]]
d9ca12cb
AP
305 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
306 @ ror @C[6],@C[6],#32-11
e0584e96 307 ldr.h @C[5],[sp,#$A[0][0]+4]
aabfd329
AP
308 eor @C[8],@C[8],@E[2]
309 eor @C[9],@C[9],@E[3]
e0584e96 310 ldr.l @E[2],[sp,#$A[2][2]]
aabfd329 311 eor @C[0],@C[0],@C[4]
e0584e96 312 ldr.h @E[3],[sp,#$A[2][2]+4]
d9ca12cb
AP
313 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
314 @ ror @C[9],@C[9],#32-7
aabfd329 315 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
aabfd329 316 eor @E[2],@E[2],@C[2]
e0584e96 317 ldr.l @C[2],[sp,#$A[1][1]]
53718107 318 eor @E[3],@E[3],@C[3]
e0584e96 319 ldr.h @C[3],[sp,#$A[1][1]+4]
53718107 320 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
d9ca12cb 321 ldr @E[2],[sp,#444] @ load counter
aabfd329 322 eor @C[2],@C[2],@E[0]
d9ca12cb 323 adr @E[0],iotas32
53718107 324 ror @C[4],@E[3],#32-22
d9ca12cb 325 add @E[3],@E[0],@E[2]
aabfd329 326 eor @C[3],@C[3],@E[1]
d9ca12cb
AP
327___
328$code.=<<___ if ($A[0][0] != $T[0][0]);
329 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
330___
331$code.=<<___ if ($A[0][0] == $T[0][0]);
e0584e96 332 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
d9ca12cb 333 add @E[2],@E[2],#16
e0584e96 334 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
56676f87 335 cmp @E[2],#192
d9ca12cb
AP
336 str @E[2],[sp,#444] @ store counter
337___
338$code.=<<___;
339 bic @E[2],@C[4],@C[2],ror#32-22
340 bic @E[3],@C[5],@C[3],ror#32-22
341 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
342 ror @C[3],@C[3],#32-22
56676f87
AP
343 eor @E[2],@E[2],@C[0]
344 eor @E[3],@E[3],@C[1]
aabfd329
AP
345 eor @E[0],@E[0],@E[2]
346 eor @E[1],@E[1],@E[3]
e0584e96 347 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
d9ca12cb 348 bic @E[2],@C[6],@C[4],ror#11
e0584e96 349 str.h @E[1],[sp,#$R[0][0]+4]
d9ca12cb
AP
350 bic @E[3],@C[7],@C[5],ror#10
351 bic @E[0],@C[8],@C[6],ror#32-(11-7)
352 bic @E[1],@C[9],@C[7],ror#32-(10-7)
353 eor @E[2],@C[2],@E[2],ror#32-11
e0584e96 354 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 355 eor @E[3],@C[3],@E[3],ror#32-10
e0584e96 356 str.h @E[3],[sp,#$R[0][1]+4]
d9ca12cb 357 eor @E[0],@C[4],@E[0],ror#32-7
d9ca12cb 358 eor @E[1],@C[5],@E[1],ror#32-7
e0584e96 359 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
d9ca12cb 360 bic @E[2],@C[0],@C[8],ror#32-7
e0584e96 361 str.h @E[1],[sp,#$R[0][2]+4]
d9ca12cb
AP
362 bic @E[3],@C[1],@C[9],ror#32-7
363 eor @E[2],@E[2],@C[6],ror#32-11
e0584e96 364 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 365 eor @E[3],@E[3],@C[7],ror#32-10
e0584e96 366 str.h @E[3],[sp,#$R[0][3]+4]
aabfd329 367 bic @E[0],@C[2],@C[0]
d9ca12cb 368 add @E[3],sp,#$D[3]
e0584e96 369 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
aabfd329 370 bic @E[1],@C[3],@C[1]
e0584e96 371 ldr.h @C[1],[sp,#$A[0][3]+4]
d9ca12cb 372 eor @E[0],@E[0],@C[8],ror#32-7
d9ca12cb 373 eor @E[1],@E[1],@C[9],ror#32-7
e0584e96 374 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
d9ca12cb 375 add @C[9],sp,#$D[0]
e0584e96 376 str.h @E[1],[sp,#$R[0][4]+4]
d9ca12cb
AP
377
378 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
379 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
380
e0584e96 381 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
aabfd329 382 eor @C[0],@C[0],@E[0]
e0584e96 383 ldr.h @C[3],[sp,#$A[1][4]+4]
aabfd329 384 eor @C[1],@C[1],@E[1]
d9ca12cb 385 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
e0584e96 386 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
d9ca12cb 387 @ ror @C[1],@C[1],#32-14
e0584e96 388 ldr.h @E[1],[sp,#$A[3][1]+4]
d9ca12cb 389
aabfd329 390 eor @C[2],@C[2],@E[2]
e0584e96 391 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
aabfd329 392 eor @C[3],@C[3],@E[3]
e0584e96 393 ldr.h @C[5],[sp,#$A[2][0]+4]
d9ca12cb
AP
394 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
395 @ ror @C[3],@C[3],#32-10
396
aabfd329 397 eor @C[6],@C[6],@C[4]
e0584e96 398 ldr.l @E[2],[sp,#$D[2]] @ D[2]
aabfd329 399 eor @C[7],@C[7],@C[5]
e0584e96 400 ldr.h @E[3],[sp,#$D[2]+4]
d9ca12cb 401 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
53718107 402 ror @C[4],@C[7],#32-2
d9ca12cb 403
aabfd329 404 eor @E[0],@E[0],@C[8]
e0584e96 405 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
aabfd329 406 eor @E[1],@E[1],@C[9]
e0584e96 407 ldr.h @C[9],[sp,#$A[4][2]+4]
d9ca12cb
AP
408 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
409 ror @C[6],@E[1],#32-23
410
411 bic @E[0],@C[4],@C[2],ror#32-10
412 bic @E[1],@C[5],@C[3],ror#32-10
413 eor @E[2],@E[2],@C[8]
414 eor @E[3],@E[3],@C[9]
415 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
416 ror @C[8],@E[3],#32-31
417 eor @E[0],@E[0],@C[0],ror#32-14
418 eor @E[1],@E[1],@C[1],ror#32-14
e0584e96 419 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
aabfd329 420 bic @E[2],@C[6],@C[4]
e0584e96 421 str.h @E[1],[sp,#$R[1][0]+4]
aabfd329 422 bic @E[3],@C[7],@C[5]
d9ca12cb 423 eor @E[2],@E[2],@C[2],ror#32-10
e0584e96 424 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 425 eor @E[3],@E[3],@C[3],ror#32-10
e0584e96 426 str.h @E[3],[sp,#$R[1][1]+4]
aabfd329 427 bic @E[0],@C[8],@C[6]
aabfd329 428 bic @E[1],@C[9],@C[7]
d9ca12cb
AP
429 bic @E[2],@C[0],@C[8],ror#14
430 bic @E[3],@C[1],@C[9],ror#14
aabfd329
AP
431 eor @E[0],@E[0],@C[4]
432 eor @E[1],@E[1],@C[5]
e0584e96
AP
433 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
434 bic @C[2],@C[2],@C[0],ror#32-(14-10)
435 str.h @E[1],[sp,#$R[1][2]+4]
d9ca12cb 436 eor @E[2],@C[6],@E[2],ror#32-14
d9ca12cb 437 bic @E[1],@C[3],@C[1],ror#32-(14-10)
e0584e96 438 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 439 eor @E[3],@C[7],@E[3],ror#32-14
e0584e96 440 str.h @E[3],[sp,#$R[1][3]+4]
d9ca12cb 441 add @E[2],sp,#$D[1]
e0584e96
AP
442 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
443 eor @E[0],@C[8],@C[2],ror#32-10
444 ldr.h @C[0],[sp,#$A[0][1]+4]
d9ca12cb 445 eor @E[1],@C[9],@E[1],ror#32-10
e0584e96
AP
446 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
447 str.h @E[1],[sp,#$R[1][4]+4]
d9ca12cb 448
e0584e96 449 add @C[9],sp,#$D[3]
d9ca12cb 450 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
e0584e96
AP
451 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
452 ldr.h @C[3],[sp,#$A[1][2]+4]
d9ca12cb
AP
453 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
454
455 eor @C[1],@C[1],@E[0]
e0584e96 456 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
d9ca12cb 457 eor @C[0],@C[0],@E[1]
e0584e96 458 ldr.h @C[5],[sp,#$A[2][3]+4]
d9ca12cb
AP
459 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
460
461 eor @C[2],@C[2],@E[2]
e0584e96 462 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
d9ca12cb 463 eor @C[3],@C[3],@E[3]
e0584e96 464 ldr.h @E[1],[sp,#$A[3][4]+4]
d9ca12cb 465 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
e0584e96 466 ldr.l @E[2],[sp,#$D[0]] @ D[0]
d9ca12cb 467 @ ror @C[3],@C[3],#32-3
e0584e96 468 ldr.h @E[3],[sp,#$D[0]+4]
d9ca12cb
AP
469
470 eor @C[4],@C[4],@C[6]
471 eor @C[5],@C[5],@C[7]
472 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
473 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
474
aabfd329 475 eor @E[0],@E[0],@C[8]
e0584e96 476 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
aabfd329 477 eor @E[1],@E[1],@C[9]
e0584e96 478 ldr.h @C[9],[sp,#$A[4][0]+4]
d9ca12cb
AP
479 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
480 ror @C[7],@E[1],#32-4
aabfd329 481
d9ca12cb
AP
482 eor @E[2],@E[2],@C[8]
483 eor @E[3],@E[3],@C[9]
484 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
485 ror @C[9],@E[3],#32-9
486
487 bic @E[0],@C[5],@C[2],ror#13-3
488 bic @E[1],@C[4],@C[3],ror#12-3
489 bic @E[2],@C[6],@C[5],ror#32-13
490 bic @E[3],@C[7],@C[4],ror#32-12
491 eor @E[0],@C[0],@E[0],ror#32-13
492 eor @E[1],@C[1],@E[1],ror#32-12
e0584e96 493 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
d9ca12cb 494 eor @E[2],@E[2],@C[2],ror#32-3
e0584e96 495 str.h @E[1],[sp,#$R[2][0]+4]
d9ca12cb 496 eor @E[3],@E[3],@C[3],ror#32-3
e0584e96 497 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
aabfd329 498 bic @E[0],@C[8],@C[6]
aabfd329 499 bic @E[1],@C[9],@C[7]
e0584e96 500 str.h @E[3],[sp,#$R[2][1]+4]
d9ca12cb
AP
501 eor @E[0],@E[0],@C[5],ror#32-13
502 eor @E[1],@E[1],@C[4],ror#32-12
e0584e96 503 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
aabfd329 504 bic @E[2],@C[0],@C[8]
e0584e96 505 str.h @E[1],[sp,#$R[2][2]+4]
aabfd329
AP
506 bic @E[3],@C[1],@C[9]
507 eor @E[2],@E[2],@C[6]
508 eor @E[3],@E[3],@C[7]
e0584e96 509 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 510 bic @E[0],@C[2],@C[0],ror#3
e0584e96 511 str.h @E[3],[sp,#$R[2][3]+4]
d9ca12cb 512 bic @E[1],@C[3],@C[1],ror#3
e0584e96 513 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
d9ca12cb 514 eor @E[0],@C[8],@E[0],ror#32-3
e0584e96 515 ldr.h @C[0],[sp,#$A[0][4]+4]
d9ca12cb 516 eor @E[1],@C[9],@E[1],ror#32-3
e0584e96 517 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
d9ca12cb 518 add @C[9],sp,#$D[1]
e0584e96 519 str.h @E[1],[sp,#$R[2][4]+4]
d9ca12cb 520
e0584e96
AP
521 ldr.l @E[0],[sp,#$D[4]] @ D[4]
522 ldr.h @E[1],[sp,#$D[4]+4]
523 ldr.l @E[2],[sp,#$D[0]] @ D[0]
524 ldr.h @E[3],[sp,#$D[0]+4]
d9ca12cb
AP
525
526 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
527
528 eor @C[1],@C[1],@E[0]
e0584e96 529 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
d9ca12cb 530 eor @C[0],@C[0],@E[1]
e0584e96 531 ldr.h @C[3],[sp,#$A[1][0]+4]
d9ca12cb 532 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
e0584e96 533 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
d9ca12cb 534 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
e0584e96 535 ldr.h @C[5],[sp,#$A[2][1]+4]
d9ca12cb
AP
536
537 eor @C[2],@C[2],@E[2]
e0584e96 538 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
d9ca12cb 539 eor @C[3],@C[3],@E[3]
e0584e96 540 ldr.h @E[1],[sp,#$A[3][2]+4]
d9ca12cb 541 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
e0584e96 542 ldr.l @E[2],[sp,#$D[3]] @ D[3]
d9ca12cb 543 @ ror @C[3],@C[3],#32-18
e0584e96 544 ldr.h @E[3],[sp,#$D[3]+4]
d9ca12cb
AP
545
546 eor @C[6],@C[6],@C[4]
547 eor @C[7],@C[7],@C[5]
548 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
549 ror @C[5],@C[7],#32-5
550
aabfd329 551 eor @E[0],@E[0],@C[8]
e0584e96 552 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
aabfd329 553 eor @E[1],@E[1],@C[9]
e0584e96 554 ldr.h @C[9],[sp,#$A[4][3]+4]
aabfd329 555 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
53718107 556 ror @C[6],@E[1],#32-8
aabfd329 557
d9ca12cb
AP
558 eor @E[2],@E[2],@C[8]
559 eor @E[3],@E[3],@C[9]
560 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
561 ror @C[9],@E[3],#32-28
562
563 bic @E[0],@C[4],@C[2],ror#32-18
564 bic @E[1],@C[5],@C[3],ror#32-18
565 eor @E[0],@E[0],@C[0],ror#32-14
566 eor @E[1],@E[1],@C[1],ror#32-13
e0584e96 567 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
aabfd329 568 bic @E[2],@C[6],@C[4]
e0584e96 569 str.h @E[1],[sp,#$R[3][0]+4]
aabfd329 570 bic @E[3],@C[7],@C[5]
d9ca12cb 571 eor @E[2],@E[2],@C[2],ror#32-18
e0584e96 572 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 573 eor @E[3],@E[3],@C[3],ror#32-18
e0584e96 574 str.h @E[3],[sp,#$R[3][1]+4]
aabfd329 575 bic @E[0],@C[8],@C[6]
aabfd329 576 bic @E[1],@C[9],@C[7]
d9ca12cb
AP
577 bic @E[2],@C[0],@C[8],ror#14
578 bic @E[3],@C[1],@C[9],ror#13
aabfd329
AP
579 eor @E[0],@E[0],@C[4]
580 eor @E[1],@E[1],@C[5]
e0584e96
AP
581 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
582 bic @C[2],@C[2],@C[0],ror#18-14
583 str.h @E[1],[sp,#$R[3][2]+4]
d9ca12cb 584 eor @E[2],@C[6],@E[2],ror#32-14
d9ca12cb
AP
585 bic @E[1],@C[3],@C[1],ror#18-13
586 eor @E[3],@C[7],@E[3],ror#32-13
e0584e96
AP
587 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
588 str.h @E[3],[sp,#$R[3][3]+4]
d9ca12cb 589 add @E[3],sp,#$D[2]
e0584e96
AP
590 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
591 eor @E[0],@C[8],@C[2],ror#32-18
592 ldr.h @C[1],[sp,#$A[0][2]+4]
d9ca12cb 593 eor @E[1],@C[9],@E[1],ror#32-18
e0584e96
AP
594 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
595 str.h @E[1],[sp,#$R[3][4]+4]
d9ca12cb
AP
596
597 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
e0584e96
AP
598 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
599 ldr.h @C[3],[sp,#$A[1][3]+4]
600 ldr.l @C[6],[sp,#$D[4]] @ D[4]
601 ldr.h @C[7],[sp,#$D[4]+4]
d9ca12cb
AP
602
603 eor @C[0],@C[0],@E[0]
e0584e96 604 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
d9ca12cb 605 eor @C[1],@C[1],@E[1]
e0584e96 606 ldr.h @C[5],[sp,#$A[2][4]+4]
d9ca12cb 607 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
e0584e96 608 ldr.l @C[8],[sp,#$D[0]] @ D[0]
d9ca12cb 609 @ ror @C[1],@C[1],#32-31
e0584e96 610 ldr.h @C[9],[sp,#$D[0]+4]
d9ca12cb
AP
611
612 eor @E[2],@E[2],@C[2]
e0584e96 613 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
d9ca12cb 614 eor @E[3],@E[3],@C[3]
e0584e96 615 ldr.h @E[1],[sp,#$A[3][0]+4]
d9ca12cb 616 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
e0584e96 617 ldr.l @E[2],[sp,#$D[1]] @ D[1]
d9ca12cb 618 ror @C[2],@E[3],#32-28
e0584e96 619 ldr.h @E[3],[sp,#$D[1]+4]
d9ca12cb
AP
620
621 eor @C[6],@C[6],@C[4]
622 eor @C[7],@C[7],@C[5]
623 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
624 ror @C[4],@C[7],#32-20
625
aabfd329 626 eor @E[0],@E[0],@C[8]
e0584e96 627 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
aabfd329 628 eor @E[1],@E[1],@C[9]
e0584e96 629 ldr.h @C[9],[sp,#$A[4][1]+4]
d9ca12cb
AP
630 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
631 ror @C[6],@E[1],#32-21
632
633 eor @C[8],@C[8],@E[2]
634 eor @C[9],@C[9],@E[3]
635 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
636 @ ror @C[9],@C[3],#32-1
aabfd329
AP
637
638 bic @E[0],@C[4],@C[2]
639 bic @E[1],@C[5],@C[3]
d9ca12cb 640 eor @E[0],@E[0],@C[0],ror#32-31
e0584e96 641 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
d9ca12cb 642 eor @E[1],@E[1],@C[1],ror#32-31
e0584e96 643 str.h @E[1],[sp,#$R[4][0]+4]
aabfd329 644 bic @E[2],@C[6],@C[4]
aabfd329
AP
645 bic @E[3],@C[7],@C[5]
646 eor @E[2],@E[2],@C[2]
647 eor @E[3],@E[3],@C[3]
e0584e96 648 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 649 bic @E[0],@C[8],@C[6],ror#1
e0584e96 650 str.h @E[3],[sp,#$R[4][1]+4]
d9ca12cb
AP
651 bic @E[1],@C[9],@C[7],ror#1
652 bic @E[2],@C[0],@C[8],ror#31-1
653 bic @E[3],@C[1],@C[9],ror#31-1
654 eor @C[4],@C[4],@E[0],ror#32-1
e0584e96 655 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
d9ca12cb 656 eor @C[5],@C[5],@E[1],ror#32-1
e0584e96 657 str.h @C[5],[sp,#$R[4][2]+4]
d9ca12cb 658 eor @C[6],@C[6],@E[2],ror#32-31
d9ca12cb 659 eor @C[7],@C[7],@E[3],ror#32-31
e0584e96 660 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
d9ca12cb 661 bic @E[0],@C[2],@C[0],ror#32-31
e0584e96 662 str.h @C[7],[sp,#$R[4][3]+4]
d9ca12cb
AP
663 bic @E[1],@C[3],@C[1],ror#32-31
664 add @E[2],sp,#$R[0][0]
665 eor @C[8],@E[0],@C[8],ror#32-1
666 add @E[0],sp,#$R[1][0]
667 eor @C[9],@E[1],@C[9],ror#32-1
e0584e96
AP
668 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
669 str.h @C[9],[sp,#$R[4][4]+4]
d9ca12cb
AP
670___
671}
# Instantiate the body of .Lround2x as two rounds: the first reads the
# state from the A area and writes it to T, the second reads T and writes
# back to A.  The heredoc that follows closes the loop ("blo" consumes the
# flags of the counter comparison emitted by the second round), returns
# through the lr value saved at sp+440, and then emits KeccakF1600.
#
# KeccakF1600 pushes r0, r4-r11 and lr, reserves 456 bytes, copies the
# 200-byte state addressed by r0 onto the stack, calls KeccakF1600_enter
# and copies the state back through the saved r0 (reloaded from sp+456).
# The copy-back relies on @E[0] still pointing at A[1][0] on the stack,
# which is where the second round leaves it.  The final "add sp" releases
# the frame plus the saved r0 slot before r4-r11 and pc are popped.
672 Round(@A,@T);
673 Round(@T,@A);
674$code.=<<___;
e0584e96 675 blo .Lround2x
aabfd329 676
d9ca12cb 677 ldr pc,[sp,#440]
56676f87
AP
678.size KeccakF1600_int,.-KeccakF1600_int
679
680.type KeccakF1600, %function
681.align 5
682KeccakF1600:
683 stmdb sp!,{r0,r4-r11,lr}
d9ca12cb 684 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
56676f87
AP
685
686 add @E[0],r0,#$A[1][0]
687 add @E[1],sp,#$A[1][0]
d9ca12cb
AP
688 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
689 stmia sp, {@C[0]-@C[9]}
690 ldmia @E[0]!,{@C[0]-@C[9]}
56676f87
AP
691 stmia @E[1]!,{@C[0]-@C[9]}
692 ldmia @E[0]!,{@C[0]-@C[9]}
693 stmia @E[1]!,{@C[0]-@C[9]}
694 ldmia @E[0]!,{@C[0]-@C[9]}
695 stmia @E[1]!,{@C[0]-@C[9]}
696 ldmia @E[0], {@C[0]-@C[9]}
d9ca12cb 697 add @E[2],sp,#$A[0][0]
56676f87 698 add @E[0],sp,#$A[1][0]
d9ca12cb 699 stmia @E[1], {@C[0]-@C[9]}
56676f87
AP
700
701 bl KeccakF1600_enter
702
d9ca12cb 703 ldr @E[1], [sp,#440+16] @ restore pointer to A
56676f87
AP
704 ldmia sp, {@C[0]-@C[9]}
705 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
aabfd329
AP
706 ldmia @E[0]!,{@C[0]-@C[9]}
707 stmia @E[1]!,{@C[0]-@C[9]}
708 ldmia @E[0]!,{@C[0]-@C[9]}
709 stmia @E[1]!,{@C[0]-@C[9]}
710 ldmia @E[0]!,{@C[0]-@C[9]}
711 stmia @E[1]!,{@C[0]-@C[9]}
56676f87
AP
712 ldmia @E[0], {@C[0]-@C[9]}
713 stmia @E[1], {@C[0]-@C[9]}
aabfd329 714
d9ca12cb 715 add sp,sp,#440+20
56676f87 716 ldmia sp!,{r4-r11,pc}
aabfd329
AP
717.size KeccakF1600,.-KeccakF1600
718___
# Generator for SHA3_absorb.  Within this lexical scope $A_flat, $inp,
# $len and $bsz name r10, r11, r12 and r14.
#
# The emitted routine takes the state pointer in r0, the input pointer in
# r1, the input length in r2 and the block size in r3, and returns the
# remaining length in r0:
#   - all of r0-r12 and lr are pushed, so the four arguments end up at
#     sp+472..sp+484 of the frame pictured below, right above the four
#     mask constants stored at sp+456..sp+468;
#   - if len < bsz on entry it leaves through .Labsorb_abort without
#     touching the state;
#   - otherwise the state is copied onto the stack and, while len >= bsz,
#     each group of 8 input bytes is assembled into lo/hi words, passed
#     through the mask-and-shift sequence (BitInterleave in the assembly
#     comments) and XORed into the next lane of the stack copy; after bsz
#     bytes KeccakF1600_int is called and the constants and variables are
#     reloaded from sp+456 upward, len having been replaced by len - bsz;
#   - once fewer than bsz bytes remain, the stack copy is written back
#     through the saved state pointer.
# The exit sequence skips the saved r0-r3 ("456+32" covers the frame plus
# the constants and those four slots) and pops r4-r12 and pc.
6dad1efe 719{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
56676f87
AP
720
721########################################################################
722# Stack layout
723# ----->+-----------------------+
724# | uint64_t A[5][5] |
725# | ... |
726# | ... |
d9ca12cb 727# +456->+-----------------------+
6dad1efe 728# | 0x55555555 |
d9ca12cb 729# +460->+-----------------------+
6dad1efe 730# | 0x33333333 |
d9ca12cb 731# +464->+-----------------------+
6dad1efe 732# | 0x0f0f0f0f |
d9ca12cb 733# +468->+-----------------------+
6dad1efe 734# | 0x00ff00ff |
d9ca12cb 735# +472->+-----------------------+
6dad1efe 736# | uint64_t *A |
d9ca12cb 737# +476->+-----------------------+
6dad1efe 738# | const void *inp |
d9ca12cb 739# +480->+-----------------------+
6dad1efe 740# | size_t len |
d9ca12cb 741# +484->+-----------------------+
6dad1efe 742# | size_t bs |
d9ca12cb 743# +488->+-----------------------+
56676f87
AP
744# | ....
745
746$code.=<<___;
747.global SHA3_absorb
748.type SHA3_absorb,%function
749.align 5
750SHA3_absorb:
751 stmdb sp!,{r0-r12,lr}
d9ca12cb 752 sub sp,sp,#456+16
56676f87 753
6dad1efe
AP
754 add $A_flat,r0,#$A[1][0]
755 @ mov $inp,r1
56676f87
AP
756 mov $len,r2
757 mov $bsz,r3
6dad1efe
AP
758 cmp r2,r3
759 blo .Labsorb_abort
760
761 add $inp,sp,#0
762 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
763 stmia $inp!, {@C[0]-@C[9]}
764 ldmia $A_flat!,{@C[0]-@C[9]}
765 stmia $inp!, {@C[0]-@C[9]}
766 ldmia $A_flat!,{@C[0]-@C[9]}
767 stmia $inp!, {@C[0]-@C[9]}
768 ldmia $A_flat!,{@C[0]-@C[9]}
769 stmia $inp!, {@C[0]-@C[9]}
770 ldmia $A_flat!,{@C[0]-@C[9]}
771 stmia $inp, {@C[0]-@C[9]}
772
d9ca12cb 773 ldr $inp,[sp,#476] @ restore $inp
6dad1efe
AP
774#ifdef __thumb2__
775 mov r9,#0x00ff00ff
776 mov r8,#0x0f0f0f0f
777 mov r7,#0x33333333
778 mov r6,#0x55555555
779#else
780 mov r6,#0x11 @ compose constants
781 mov r8,#0x0f
782 mov r9,#0xff
783 orr r6,r6,r6,lsl#8
784 orr r8,r8,r8,lsl#8
785 orr r6,r6,r6,lsl#16 @ 0x11111111
786 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
787 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
788 orr r7,r6,r6,lsl#1 @ 0x33333333
789 orr r6,r6,r6,lsl#2 @ 0x55555555
790#endif
d9ca12cb
AP
791 str r9,[sp,#468]
792 str r8,[sp,#464]
793 str r7,[sp,#460]
794 str r6,[sp,#456]
6dad1efe 795 b .Loop_absorb
56676f87 796
6dad1efe 797.align 4
56676f87
AP
798.Loop_absorb:
799 subs r0,$len,$bsz
800 blo .Labsorbed
801 add $A_flat,sp,#0
d9ca12cb 802 str r0,[sp,#480] @ save len - bsz
56676f87 803
6dad1efe 804.align 4
56676f87 805.Loop_block:
6dad1efe
AP
806 ldrb r0,[$inp],#1
807 ldrb r1,[$inp],#1
808 ldrb r2,[$inp],#1
809 ldrb r3,[$inp],#1
810 ldrb r4,[$inp],#1
811 orr r0,r0,r1,lsl#8
812 ldrb r1,[$inp],#1
813 orr r0,r0,r2,lsl#16
814 ldrb r2,[$inp],#1
815 orr r0,r0,r3,lsl#24 @ lo
816 ldrb r3,[$inp],#1
817 orr r1,r4,r1,lsl#8
818 orr r1,r1,r2,lsl#16
819 orr r1,r1,r3,lsl#24 @ hi
820
821 and r2,r0,r6 @ &=0x55555555
822 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
823 and r3,r1,r6 @ &=0x55555555
824 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
825 orr r2,r2,r2,lsr#1
826 orr r0,r0,r0,lsl#1
827 orr r3,r3,r3,lsr#1
828 orr r1,r1,r1,lsl#1
829 and r2,r2,r7 @ &=0x33333333
830 and r0,r0,r7,lsl#2 @ &=0xcccccccc
831 and r3,r3,r7 @ &=0x33333333
832 and r1,r1,r7,lsl#2 @ &=0xcccccccc
833 orr r2,r2,r2,lsr#2
834 orr r0,r0,r0,lsl#2
835 orr r3,r3,r3,lsr#2
836 orr r1,r1,r1,lsl#2
837 and r2,r2,r8 @ &=0x0f0f0f0f
838 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
839 and r3,r3,r8 @ &=0x0f0f0f0f
840 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
841 ldmia $A_flat,{r4-r5} @ A_flat[i]
842 orr r2,r2,r2,lsr#4
843 orr r0,r0,r0,lsl#4
844 orr r3,r3,r3,lsr#4
845 orr r1,r1,r1,lsl#4
846 and r2,r2,r9 @ &=0x00ff00ff
847 and r0,r0,r9,lsl#8 @ &=0xff00ff00
848 and r3,r3,r9 @ &=0x00ff00ff
849 and r1,r1,r9,lsl#8 @ &=0xff00ff00
850 orr r2,r2,r2,lsr#8
851 orr r0,r0,r0,lsl#8
852 orr r3,r3,r3,lsr#8
853 orr r1,r1,r1,lsl#8
854
855 lsl r2,r2,#16
856 lsr r1,r1,#16
857 eor r4,r4,r3,lsl#16
858 eor r5,r5,r0,lsr#16
859 eor r4,r4,r2,lsr#16
860 eor r5,r5,r1,lsl#16
861 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
862
56676f87
AP
863 subs $bsz,$bsz,#8
864 bhi .Loop_block
865
d9ca12cb 866 str $inp,[sp,#476]
56676f87
AP
867
868 bl KeccakF1600_int
869
d9ca12cb 870 add r14,sp,#456
6dad1efe 871 ldmia r14,{r6-r12,r14} @ restore constants and variables
56676f87
AP
872 b .Loop_absorb
873
874.align 4
875.Labsorbed:
6dad1efe
AP
876 add $inp,sp,#$A[1][0]
877 ldmia sp, {@C[0]-@C[9]}
878 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
879 ldmia $inp!, {@C[0]-@C[9]}
880 stmia $A_flat!,{@C[0]-@C[9]}
881 ldmia $inp!, {@C[0]-@C[9]}
882 stmia $A_flat!,{@C[0]-@C[9]}
883 ldmia $inp!, {@C[0]-@C[9]}
884 stmia $A_flat!,{@C[0]-@C[9]}
885 ldmia $inp, {@C[0]-@C[9]}
886 stmia $A_flat, {@C[0]-@C[9]}
887
888.Labsorb_abort:
d9ca12cb 889 add sp,sp,#456+32
56676f87
AP
890 mov r0,$len @ return value
891 ldmia sp!,{r4-r12,pc}
892.size SHA3_absorb,.-SHA3_absorb
893___
894}
# SHA3_squeeze: integer-only (non-NEON) squeeze routine appended to $code.
#
# Arguments arrive in r0-r3 and are copied on entry:
#   r0 -> $A_flat (r10), r1 -> $out (r4), r2 -> $len (r5), r3 -> $bsz (r12).
#
# Stack frame, lowest address first, as built by the two stmdb instructions:
#   [sp,#0..#12]  mask constants r6-r9 (0x55555555, 0x33333333,
#                 0x0f0f0f0f, 0x00ff00ff)
#   [sp,#16]      caller's r0 (state pointer)
#   [sp,#20]      caller's r3 (block size)
#   [sp,#24...]   saved r4-r10,lr
# After each call to KeccakF1600 a single "ldmia sp,{r6-r10,r12}" reloads the
# masks together with $A_flat and $bsz from that frame, and the epilogue skips
# the first 24 bytes before popping r4-r10,pc.
#
# Each .Loop_squeeze iteration loads one lane (two 32-bit words) from the
# state, applies the shift-and-mask sequence that mirrors the BitInterleave
# step used by SHA3_absorb, and stores the resulting 8 bytes least
# significant byte first.  r14 keeps the original state pointer, which is
# handed to KeccakF1600 in r0 once $bsz bytes have been emitted.
# .Lsqueeze_tail serves a final request shorter than 8 bytes one byte at a
# time.
#
# NOTE(review): with $len == 0 on entry the code appears to reach
# .Lsqueeze_tail and store a byte before $len is tested -- confirm callers
# never request zero bytes.
6dad1efe 895{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
56676f87
AP
896
897$code.=<<___;
898.global SHA3_squeeze
899.type SHA3_squeeze,%function
900.align 5
901SHA3_squeeze:
6dad1efe
AP
902 stmdb sp!,{r0,r3-r10,lr}
903
56676f87
AP
904 mov $A_flat,r0
905 mov $out,r1
906 mov $len,r2
907 mov $bsz,r3
6dad1efe
AP
908
909#ifdef __thumb2__
910 mov r9,#0x00ff00ff
911 mov r8,#0x0f0f0f0f
912 mov r7,#0x33333333
913 mov r6,#0x55555555
914#else
915 mov r6,#0x11 @ compose constants
916 mov r8,#0x0f
917 mov r9,#0xff
918 orr r6,r6,r6,lsl#8
919 orr r8,r8,r8,lsl#8
920 orr r6,r6,r6,lsl#16 @ 0x11111111
921 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
922 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
923 orr r7,r6,r6,lsl#1 @ 0x33333333
924 orr r6,r6,r6,lsl#2 @ 0x55555555
925#endif
926 stmdb sp!,{r6-r9}
927
928 mov r14,$A_flat
56676f87
AP
929 b .Loop_squeeze
930
931.align 4
932.Loop_squeeze:
6dad1efe
AP
933 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
934
935 lsl r2,r0,#16
936 lsl r3,r1,#16 @ r3 = r1 << 16
937 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
938 lsr r1,r1,#16
939 lsr r0,r0,#16 @ r0 = r0 >> 16
940 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
941
942 orr r2,r2,r2,lsl#8
943 orr r3,r3,r3,lsr#8
944 orr r0,r0,r0,lsl#8
945 orr r1,r1,r1,lsr#8
946 and r2,r2,r9 @ &=0x00ff00ff
947 and r3,r3,r9,lsl#8 @ &=0xff00ff00
948 and r0,r0,r9 @ &=0x00ff00ff
949 and r1,r1,r9,lsl#8 @ &=0xff00ff00
950 orr r2,r2,r2,lsl#4
951 orr r3,r3,r3,lsr#4
952 orr r0,r0,r0,lsl#4
953 orr r1,r1,r1,lsr#4
954 and r2,r2,r8 @ &=0x0f0f0f0f
955 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
956 and r0,r0,r8 @ &=0x0f0f0f0f
957 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
958 orr r2,r2,r2,lsl#2
959 orr r3,r3,r3,lsr#2
960 orr r0,r0,r0,lsl#2
961 orr r1,r1,r1,lsr#2
962 and r2,r2,r7 @ &=0x33333333
963 and r3,r3,r7,lsl#2 @ &=0xcccccccc
964 and r0,r0,r7 @ &=0x33333333
965 and r1,r1,r7,lsl#2 @ &=0xcccccccc
966 orr r2,r2,r2,lsl#1
967 orr r3,r3,r3,lsr#1
968 orr r0,r0,r0,lsl#1
969 orr r1,r1,r1,lsr#1
970 and r2,r2,r6 @ &=0x55555555
971 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
972 and r0,r0,r6 @ &=0x55555555
973 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
974
975 orr r2,r2,r3
976 orr r0,r0,r1
977
978 cmp $len,#8
979 blo .Lsqueeze_tail
980 lsr r1,r2,#8
981 strb r2,[$out],#1
982 lsr r3,r2,#16
983 strb r1,[$out],#1
984 lsr r2,r2,#24
985 strb r3,[$out],#1
986 strb r2,[$out],#1
987
988 lsr r1,r0,#8
989 strb r0,[$out],#1
990 lsr r3,r0,#16
991 strb r1,[$out],#1
992 lsr r0,r0,#24
993 strb r3,[$out],#1
994 strb r0,[$out],#1
995 subs $len,$len,#8
56676f87 996 beq .Lsqueeze_done
56676f87 997
6dad1efe 998 subs $bsz,$bsz,#8 @ bsz -= 8
56676f87
AP
999 bhi .Loop_squeeze
1000
6dad1efe 1001 mov r0,r14 @ original $A_flat
56676f87
AP
1002
1003 bl KeccakF1600
1004
6dad1efe
AP
1005 ldmia sp,{r6-r10,r12} @ restore constants and variables
1006 mov r14,$A_flat
56676f87
AP
1007 b .Loop_squeeze
1008
6dad1efe
AP
1009.align 4
1010.Lsqueeze_tail:
1011 strb r2,[$out],#1
1012 lsr r2,r2,#8
1013 subs $len,$len,#1
1014 beq .Lsqueeze_done
1015 strb r2,[$out],#1
1016 lsr r2,r2,#8
1017 subs $len,$len,#1
1018 beq .Lsqueeze_done
1019 strb r2,[$out],#1
1020 lsr r2,r2,#8
1021 subs $len,$len,#1
1022 beq .Lsqueeze_done
1023 strb r2,[$out],#1
1024 subs $len,$len,#1
1025 beq .Lsqueeze_done
1026
1027 strb r0,[$out],#1
1028 lsr r0,r0,#8
1029 subs $len,$len,#1
1030 beq .Lsqueeze_done
1031 strb r0,[$out],#1
1032 lsr r0,r0,#8
1033 subs $len,$len,#1
1034 beq .Lsqueeze_done
1035 strb r0,[$out]
1036 b .Lsqueeze_done
1037
1038.align 4
56676f87 1039.Lsqueeze_done:
6dad1efe 1040 add sp,sp,#24
56676f87
AP
1041 ldmia sp!,{r4-r10,pc}
1042.size SHA3_squeeze,.-SHA3_squeeze
56676f87
AP
1043___
1044}
1045
367c5527
AP
# NEON code path.  The heredoc below emits, in order:
#
#   iotas64            24 64-bit round constants, consumed one per round by
#                      KeccakF1600_neon through the post-incremented r2.
#
#   KeccakF1600_neon   24 iterations (counter in r3) of Theta, Rho+Pi and
#                      Chi+Iota on a state held entirely in NEON registers:
#                        A[0][0..4] = d0,  d2,  d4,  d6,  d8
#                        A[1][0..4] = d1,  d3,  d5,  d7,  d9
#                        A[2][0..4] = d10, d12, d14, d16, d18
#                        A[3][0..4] = d11, d13, d15, d17, d19
#                        A[4][0..4] = d20, d21, d22, d23, d24
#                      d25-d31 serve as temporaries.  The routine spills to
#                      the first 24 bytes at [r0] (16 bytes via r0, 8 bytes
#                      via r1 = r0+16), so memory at r0 does not hold a valid
#                      copy of the state until the caller stores the
#                      registers back.
#
#   SHA3_absorb_neon   r0 = A_flat, r1 = inp, r2 = len, r3 = bsz.  Saves
#                      r4-r6,lr and d8-d15, loads the state into the layout
#                      above, then while len >= bsz XORs bsz bytes of input
#                      into the state 8 bytes at a time (byte-wise vld1.8
#                      loads keep this endian-neutral) and runs the
#                      permutation.  The state is written back and the
#                      unprocessed length is returned in r0.
#
#   SHA3_squeeze_neon  r0 = A_flat, r1 = out, r2 = len, r3 = bsz.  Copies
#                      8 bytes per iteration straight from the in-memory
#                      state; when bsz bytes have been emitted it loads the
#                      state into registers, permutes, and stores it back,
#                      saving d8-d15 only around that step.  A final request
#                      shorter than 8 bytes is stored byte by byte from
#                      r2/r3.
#
# NOTE(review): in SHA3_squeeze_neon the tail path stores its first byte
# before the "blo .Lsqueeze_neon_done" that follows "cmp r5,#2", so len == 0
# looks like it still writes one byte -- confirm callers never request zero
# bytes.
1046$code.=<<___;
1047.fpu neon
1048
1049.type iotas64, %object
1050.align 5
1051iotas64:
1052 .quad 0x0000000000000001
1053 .quad 0x0000000000008082
1054 .quad 0x800000000000808a
1055 .quad 0x8000000080008000
1056 .quad 0x000000000000808b
1057 .quad 0x0000000080000001
1058 .quad 0x8000000080008081
1059 .quad 0x8000000000008009
1060 .quad 0x000000000000008a
1061 .quad 0x0000000000000088
1062 .quad 0x0000000080008009
1063 .quad 0x000000008000000a
1064 .quad 0x000000008000808b
1065 .quad 0x800000000000008b
1066 .quad 0x8000000000008089
1067 .quad 0x8000000000008003
1068 .quad 0x8000000000008002
1069 .quad 0x8000000000000080
1070 .quad 0x000000000000800a
1071 .quad 0x800000008000000a
1072 .quad 0x8000000080008081
1073 .quad 0x8000000000008080
1074 .quad 0x0000000080000001
1075 .quad 0x8000000080008008
1076.size iotas64,.-iotas64
1077
1078.type KeccakF1600_neon, %function
1079.align 5
1080KeccakF1600_neon:
1081 add r1, r0, #16
1082 adr r2, iotas64
1083 mov r3, #24 @ loop counter
1084 b .Loop_neon
1085
1086.align 4
1087.Loop_neon:
1088 @ Theta
1089 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
1090 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1091 vst1.64 {d18}, [r1:64] @ offload A[2][4]
1092 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1093 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1094 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1095 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1096 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1097 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1098 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1099 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1100 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1101 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1102 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1103 veor d25, d25, d24 @ C[4]^=A[4][4]
1104
1105 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1106 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1107 vadd.u64 d18, d25, d25 @ C[4]<<1
1108 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1109 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1110 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1111 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1112 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1113 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1114 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1115
1116 veor d0, d0, d25 @ A[0][0] ^= C[4]
1117 veor d1, d1, d25 @ A[1][0] ^= C[4]
1118 veor d10, d10, d25 @ A[2][0] ^= C[4]
1119 veor d11, d11, d25 @ A[3][0] ^= C[4]
1120 veor d20, d20, d25 @ A[4][0] ^= C[4]
1121
1122 veor d2, d2, d26 @ A[0][1] ^= D[1]
1123 veor d3, d3, d26 @ A[1][1] ^= D[1]
1124 veor d12, d12, d26 @ A[2][1] ^= D[1]
1125 veor d13, d13, d26 @ A[3][1] ^= D[1]
1126 veor d21, d21, d26 @ A[4][1] ^= D[1]
1127 vmov d26, d27
1128
1129 veor d6, d6, d28 @ A[0][3] ^= C[2]
1130 veor d7, d7, d28 @ A[1][3] ^= C[2]
1131 veor d16, d16, d28 @ A[2][3] ^= C[2]
1132 veor d17, d17, d28 @ A[3][3] ^= C[2]
1133 veor d23, d23, d28 @ A[4][3] ^= C[2]
1134 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1135 vmov d28, d29
1136
1137 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1138 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1139 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1140 veor d22, d22, d27 @ A[4][2] ^= D[2]
1141
1142 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1143 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1144 veor d24, d24, d29 @ A[4][4] ^= C[3]
1145
1146 @ Rho + Pi
1147 vmov d26, d2 @ C[1] = A[0][1]
1148 vshl.u64 d2, d3, #44
1149 vmov d27, d4 @ C[2] = A[0][2]
1150 vshl.u64 d4, d14, #43
1151 vmov d28, d6 @ C[3] = A[0][3]
1152 vshl.u64 d6, d17, #21
1153 vmov d29, d8 @ C[4] = A[0][4]
1154 vshl.u64 d8, d24, #14
1155 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1156 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1157 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1158 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1159
1160 vshl.u64 d3, d9, #20
1161 vshl.u64 d14, d16, #25
1162 vshl.u64 d17, d15, #15
1163 vshl.u64 d24, d21, #2
1164 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1165 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1166 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1167 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1168
1169 vshl.u64 d9, d22, #61
1170 @ vshl.u64 d16, d19, #8
1171 vshl.u64 d15, d12, #10
1172 vshl.u64 d21, d7, #55
1173 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1174 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1175 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1176 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1177
1178 vshl.u64 d22, d18, #39
1179 @ vshl.u64 d19, d23, #56
1180 vshl.u64 d12, d5, #6
1181 vshl.u64 d7, d13, #45
1182 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1183 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1184 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1185 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1186
1187 vshl.u64 d18, d20, #18
1188 vshl.u64 d23, d11, #41
1189 vshl.u64 d5, d10, #3
1190 vshl.u64 d13, d1, #36
1191 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1192 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1193 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1194 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1195
1196 vshl.u64 d1, d28, #28
1197 vshl.u64 d10, d26, #1
1198 vshl.u64 d11, d29, #27
1199 vshl.u64 d20, d27, #62
1200 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1201 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1202 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1203 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
1204
1205 @ Chi + Iota
1206 vbic q13, q2, q1
1207 vbic q14, q3, q2
1208 vbic q15, q4, q3
1209 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1210 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1211 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1212 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1213 vbic q13, q0, q4
1214 vbic q15, q1, q0
1215 vmov q1, q14 @ A[0..1][1]
1216 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1217 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1218
1219 vbic q13, q7, q6
1220 vmov q0, q5 @ A[2..3][0]
1221 vbic q14, q8, q7
1222 vmov q15, q6 @ A[2..3][1]
1223 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1224 vbic q13, q9, q8
1225 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1226 vbic q14, q0, q9
1227 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1228 vbic q13, q15, q0
1229 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1230 vmov q14, q10 @ A[4][0..1]
1231 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1232
1233 vld1.64 d25, [r2:64]! @ Iota[i++]
1234 vbic d26, d22, d21
1235 vbic d27, d23, d22
1236 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1237 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1238 vbic d26, d24, d23
1239 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1240 vbic d27, d28, d24
1241 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1242 vbic d26, d29, d28
1243 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1244 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1245 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1246
1247 subs r3, r3, #1
1248 bne .Loop_neon
1249
1250 bx lr
1251.size KeccakF1600_neon,.-KeccakF1600_neon
1252
1253.global SHA3_absorb_neon
1254.type SHA3_absorb_neon, %function
1255.align 5
1256SHA3_absorb_neon:
1257 stmdb sp!, {r4-r6,lr}
1258 vstmdb sp!, {d8-d15}
1259
1260 mov r4, r1 @ inp
1261 mov r5, r2 @ len
1262 mov r6, r3 @ bsz
1263
1264 vld1.32 {d0}, [r0:64]! @ A[0][0]
1265 vld1.32 {d2}, [r0:64]! @ A[0][1]
1266 vld1.32 {d4}, [r0:64]! @ A[0][2]
1267 vld1.32 {d6}, [r0:64]! @ A[0][3]
1268 vld1.32 {d8}, [r0:64]! @ A[0][4]
1269
1270 vld1.32 {d1}, [r0:64]! @ A[1][0]
1271 vld1.32 {d3}, [r0:64]! @ A[1][1]
1272 vld1.32 {d5}, [r0:64]! @ A[1][2]
1273 vld1.32 {d7}, [r0:64]! @ A[1][3]
1274 vld1.32 {d9}, [r0:64]! @ A[1][4]
1275
1276 vld1.32 {d10}, [r0:64]! @ A[2][0]
1277 vld1.32 {d12}, [r0:64]! @ A[2][1]
1278 vld1.32 {d14}, [r0:64]! @ A[2][2]
1279 vld1.32 {d16}, [r0:64]! @ A[2][3]
1280 vld1.32 {d18}, [r0:64]! @ A[2][4]
1281
1282 vld1.32 {d11}, [r0:64]! @ A[3][0]
1283 vld1.32 {d13}, [r0:64]! @ A[3][1]
1284 vld1.32 {d15}, [r0:64]! @ A[3][2]
1285 vld1.32 {d17}, [r0:64]! @ A[3][3]
1286 vld1.32 {d19}, [r0:64]! @ A[3][4]
1287
1288 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1289 vld1.32 {d24}, [r0:64] @ A[4][4]
1290 sub r0, r0, #24*8 @ rewind
1291 b .Loop_absorb_neon
1292
1293.align 4
1294.Loop_absorb_neon:
1295 subs r12, r5, r6 @ len - bsz
1296 blo .Labsorbed_neon
1297 mov r5, r12
1298
1299 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1300 cmp r6, #8*2
1301 veor d0, d0, d31 @ A[0][0] ^= *inp++
1302 blo .Lprocess_neon
1303 vld1.8 {d31}, [r4]!
1304 veor d2, d2, d31 @ A[0][1] ^= *inp++
1305 beq .Lprocess_neon
1306 vld1.8 {d31}, [r4]!
1307 cmp r6, #8*4
1308 veor d4, d4, d31 @ A[0][2] ^= *inp++
1309 blo .Lprocess_neon
1310 vld1.8 {d31}, [r4]!
1311 veor d6, d6, d31 @ A[0][3] ^= *inp++
1312 beq .Lprocess_neon
1313 vld1.8 {d31},[r4]!
1314 cmp r6, #8*6
1315 veor d8, d8, d31 @ A[0][4] ^= *inp++
1316 blo .Lprocess_neon
1317
1318 vld1.8 {d31}, [r4]!
1319 veor d1, d1, d31 @ A[1][0] ^= *inp++
1320 beq .Lprocess_neon
1321 vld1.8 {d31}, [r4]!
1322 cmp r6, #8*8
1323 veor d3, d3, d31 @ A[1][1] ^= *inp++
1324 blo .Lprocess_neon
1325 vld1.8 {d31}, [r4]!
1326 veor d5, d5, d31 @ A[1][2] ^= *inp++
1327 beq .Lprocess_neon
1328 vld1.8 {d31}, [r4]!
1329 cmp r6, #8*10
1330 veor d7, d7, d31 @ A[1][3] ^= *inp++
1331 blo .Lprocess_neon
1332 vld1.8 {d31}, [r4]!
1333 veor d9, d9, d31 @ A[1][4] ^= *inp++
1334 beq .Lprocess_neon
1335
1336 vld1.8 {d31}, [r4]!
1337 cmp r6, #8*12
1338 veor d10, d10, d31 @ A[2][0] ^= *inp++
1339 blo .Lprocess_neon
1340 vld1.8 {d31}, [r4]!
1341 veor d12, d12, d31 @ A[2][1] ^= *inp++
1342 beq .Lprocess_neon
1343 vld1.8 {d31}, [r4]!
1344 cmp r6, #8*14
1345 veor d14, d14, d31 @ A[2][2] ^= *inp++
1346 blo .Lprocess_neon
1347 vld1.8 {d31}, [r4]!
1348 veor d16, d16, d31 @ A[2][3] ^= *inp++
1349 beq .Lprocess_neon
1350 vld1.8 {d31}, [r4]!
1351 cmp r6, #8*16
1352 veor d18, d18, d31 @ A[2][4] ^= *inp++
1353 blo .Lprocess_neon
1354
1355 vld1.8 {d31}, [r4]!
1356 veor d11, d11, d31 @ A[3][0] ^= *inp++
1357 beq .Lprocess_neon
1358 vld1.8 {d31}, [r4]!
1359 cmp r6, #8*18
1360 veor d13, d13, d31 @ A[3][1] ^= *inp++
1361 blo .Lprocess_neon
1362 vld1.8 {d31}, [r4]!
1363 veor d15, d15, d31 @ A[3][2] ^= *inp++
1364 beq .Lprocess_neon
1365 vld1.8 {d31}, [r4]!
1366 cmp r6, #8*20
1367 veor d17, d17, d31 @ A[3][3] ^= *inp++
1368 blo .Lprocess_neon
1369 vld1.8 {d31}, [r4]!
1370 veor d19, d19, d31 @ A[3][4] ^= *inp++
1371 beq .Lprocess_neon
1372
1373 vld1.8 {d31}, [r4]!
1374 cmp r6, #8*22
1375 veor d20, d20, d31 @ A[4][0] ^= *inp++
1376 blo .Lprocess_neon
1377 vld1.8 {d31}, [r4]!
1378 veor d21, d21, d31 @ A[4][1] ^= *inp++
1379 beq .Lprocess_neon
1380 vld1.8 {d31}, [r4]!
1381 cmp r6, #8*24
1382 veor d22, d22, d31 @ A[4][2] ^= *inp++
1383 blo .Lprocess_neon
1384 vld1.8 {d31}, [r4]!
1385 veor d23, d23, d31 @ A[4][3] ^= *inp++
1386 beq .Lprocess_neon
1387 vld1.8 {d31}, [r4]!
1388 veor d24, d24, d31 @ A[4][4] ^= *inp++
1389
1390.Lprocess_neon:
1391 bl KeccakF1600_neon
1392 b .Loop_absorb_neon
1393
1394.align 4
1395.Labsorbed_neon:
1396 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1397 vst1.32 {d2}, [r0:64]!
1398 vst1.32 {d4}, [r0:64]!
1399 vst1.32 {d6}, [r0:64]!
1400 vst1.32 {d8}, [r0:64]!
1401
1402 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1403 vst1.32 {d3}, [r0:64]!
1404 vst1.32 {d5}, [r0:64]!
1405 vst1.32 {d7}, [r0:64]!
1406 vst1.32 {d9}, [r0:64]!
1407
1408 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1409 vst1.32 {d12}, [r0:64]!
1410 vst1.32 {d14}, [r0:64]!
1411 vst1.32 {d16}, [r0:64]!
1412 vst1.32 {d18}, [r0:64]!
1413
1414 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1415 vst1.32 {d13}, [r0:64]!
1416 vst1.32 {d15}, [r0:64]!
1417 vst1.32 {d17}, [r0:64]!
1418 vst1.32 {d19}, [r0:64]!
1419
1420 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1421 vst1.32 {d24}, [r0:64]
1422
1423 mov r0, r5 @ return value
1424 vldmia sp!, {d8-d15}
1425 ldmia sp!, {r4-r6,pc}
1426.size SHA3_absorb_neon,.-SHA3_absorb_neon
1427
1428.global SHA3_squeeze_neon
1429.type SHA3_squeeze_neon, %function
1430.align 5
1431SHA3_squeeze_neon:
1432 stmdb sp!, {r4-r6,lr}
1433
1434 mov r4, r1 @ out
1435 mov r5, r2 @ len
1436 mov r6, r3 @ bsz
1437 mov r12, r0 @ A_flat
1438 mov r14, r3 @ bsz
1439 b .Loop_squeeze_neon
1440
1441.align 4
1442.Loop_squeeze_neon:
1443 cmp r5, #8
1444 blo .Lsqueeze_neon_tail
1445 vld1.32 {d0}, [r12]!
1446 vst1.8 {d0}, [r4]! @ endian-neutral store
1447
1448 subs r5, r5, #8 @ len -= 8
1449 beq .Lsqueeze_neon_done
1450
1451 subs r14, r14, #8 @ bsz -= 8
1452 bhi .Loop_squeeze_neon
1453
1454 vstmdb sp!, {d8-d15}
1455
1456 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1457 vld1.32 {d2}, [r0:64]!
1458 vld1.32 {d4}, [r0:64]!
1459 vld1.32 {d6}, [r0:64]!
1460 vld1.32 {d8}, [r0:64]!
1461
1462 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1463 vld1.32 {d3}, [r0:64]!
1464 vld1.32 {d5}, [r0:64]!
1465 vld1.32 {d7}, [r0:64]!
1466 vld1.32 {d9}, [r0:64]!
1467
1468 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1469 vld1.32 {d12}, [r0:64]!
1470 vld1.32 {d14}, [r0:64]!
1471 vld1.32 {d16}, [r0:64]!
1472 vld1.32 {d18}, [r0:64]!
1473
1474 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1475 vld1.32 {d13}, [r0:64]!
1476 vld1.32 {d15}, [r0:64]!
1477 vld1.32 {d17}, [r0:64]!
1478 vld1.32 {d19}, [r0:64]!
1479
1480 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1481 vld1.32 {d24}, [r0:64]
1482 sub r0, r0, #24*8 @ rewind
1483
1484 bl KeccakF1600_neon
1485
1486 mov r12, r0 @ A_flat
1487 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1488 vst1.32 {d2}, [r0:64]!
1489 vst1.32 {d4}, [r0:64]!
1490 vst1.32 {d6}, [r0:64]!
1491 vst1.32 {d8}, [r0:64]!
1492
1493 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1494 vst1.32 {d3}, [r0:64]!
1495 vst1.32 {d5}, [r0:64]!
1496 vst1.32 {d7}, [r0:64]!
1497 vst1.32 {d9}, [r0:64]!
1498
1499 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1500 vst1.32 {d12}, [r0:64]!
1501 vst1.32 {d14}, [r0:64]!
1502 vst1.32 {d16}, [r0:64]!
1503 vst1.32 {d18}, [r0:64]!
1504
1505 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1506 vst1.32 {d13}, [r0:64]!
1507 vst1.32 {d15}, [r0:64]!
1508 vst1.32 {d17}, [r0:64]!
1509 vst1.32 {d19}, [r0:64]!
1510
1511 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1512 mov r14, r6 @ bsz
1513 vst1.32 {d24}, [r0:64]
1514 mov r0, r12 @ rewind
1515
1516 vldmia sp!, {d8-d15}
1517 b .Loop_squeeze_neon
1518
1519.align 4
1520.Lsqueeze_neon_tail:
1521 ldmia r12, {r2,r3}
1522 cmp r5, #2
1523 strb r2, [r4],#1 @ endian-neutral store
1524 lsr r2, r2, #8
1525 blo .Lsqueeze_neon_done
1526 strb r2, [r4], #1
1527 lsr r2, r2, #8
1528 beq .Lsqueeze_neon_done
1529 strb r2, [r4], #1
1530 lsr r2, r2, #8
1531 cmp r5, #4
1532 blo .Lsqueeze_neon_done
1533 strb r2, [r4], #1
1534 beq .Lsqueeze_neon_done
1535
1536 strb r3, [r4], #1
1537 lsr r3, r3, #8
1538 cmp r5, #6
1539 blo .Lsqueeze_neon_done
1540 strb r3, [r4], #1
1541 lsr r3, r3, #8
1542 beq .Lsqueeze_neon_done
1543 strb r3, [r4], #1
1544
1545.Lsqueeze_neon_done:
1546 ldmia sp!, {r4-r6,pc}
1547.size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1548.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1549.align 2
1550___
1551
e0584e96
AP
{
    # Most recent "low half" operand seen for each mnemonic.  Both hashes
    # must be lexicals private to ldrd(); "my %ldr, %str;" would declare
    # only %ldr and leave %str as an accidental package global.
    my (%ldr, %str);

    # ldrd($mnemonic, $half, $reg, $ea)
    #
    # Expands one "ldr.l"/"ldr.h" (or "str.l"/"str.h") pseudo-instruction
    # into preprocessor-guarded assembly text.
    #
    #   $mnemonic  "ldr" or "str" (anything other than "ldr" is tracked as
    #              a store)
    #   $half      "l" for the low word, any other value is treated as the
    #              high word
    #   $reg       register operand of this half
    #   $ea        effective-address operand of this half
    #
    # For the low half the operand is remembered and a plain ldr/str is
    # emitted for non-Thumb-2 builds only.  For the high half the
    # non-Thumb-2 branch again gets a plain ldr/str, while the Thumb-2
    # branch gets a single ldrd/strd that pairs the remembered low
    # register and address with this register.  A high half therefore
    # expects a preceding low half with the same mnemonic.
    #
    # Returns the generated text without a trailing newline.
    sub ldrd {
        my ($mnemonic, $half, $reg, $ea) = @_;
        my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

        if ($half eq "l") {
            $$op{reg} = $reg;
            $$op{ea}  = $ea;
            return sprintf "#ifndef __thumb2__\n" .
                           " %s\t%s,%s\n" .
                           "#endif", $mnemonic, $reg, $ea;
        }

        return sprintf "#ifndef __thumb2__\n" .
                       " %s\t%s,%s\n" .
                       "#else\n" .
                       " %sd\t%s,%s,%s\n" .
                       "#endif", $mnemonic, $reg, $ea,
                                 $mnemonic, $$op{reg}, $reg, $$op{ea};
    }
}
1575
2bd3b626
RL
# The last command-line argument names the output file.  With no argument,
# or with "-", the assembly goes to the STDOUT we were started with.
$output = pop;
if (defined($output) && $output ne "" && $output ne "-") {
    # Three-argument open keeps characters in the name from being read as
    # a mode, and a failure stops the build instead of leaving the target
    # file missing.
    open STDOUT, ">", $output or die "can't open $output: $!";
}

# Post-process the accumulated assembly one line at a time.
foreach (split($/, $code)) {
    # Replace every `...` span with the result of evaluating it as Perl.
    s/\`([^\`]*)\`/eval $1/ge;

    # At most one of the following rewrites is applied to a line: expand
    # the ldr.l/ldr.h/str.l/str.h pseudo-instructions through ldrd(),
    # otherwise spell "ret" as "bx lr", otherwise emit "bx lr" as a raw
    # opcode word.
    s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
    s/\bret\b/bx lr/g or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/g;  # make it possible to compile with -march=armv4

    print $_, "\n";
}

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush