]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/keccak1600-armv4.pl
Following the license change, modify the boilerplates in crypto/sha/
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-armv4.pl
CommitLineData
56676f87 1#!/usr/bin/env perl
b0edda11 2# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
56676f87 3#
a598ed0d 4# Licensed under the Apache License 2.0 (the "License"). You may not use
56676f87
AP
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv4.
17#
18# June 2017.
19#
367c5527
AP
20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21# interleaving. How does it compare to Keccak Code Package? It's as
22# fast, but several times smaller, and is endian- and ISA-neutral. ISA
23# neutrality means that minimum ISA requirement is ARMv4, yet it can
24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25# register layout taken from Keccak Code Package. It's also as fast,
26# in fact faster by 10-15% on some processors, and endian-neutral.
56676f87 27#
d9ca12cb
AP
28# August 2017.
29#
30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31# of rotate instructions with logical ones. This resulted in ~10%
32# improvement on most processors. Switch to KECCAK_2X effectively
33# minimizes re-loads from temporary storage, and merged rotates just
34# eliminate corresponding instructions. As for the latter: when examining
35# the code you'll notice commented-out ror instructions. These are the
36# eliminated ones, and you should trace the destination register below to
37# see what's going on. As for why not all rotates are eliminated: the trouble
38# is that you have operations that require both inputs to be rotated,
39# e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41# that takes 'a' as input. The catch is that this next operation can
42# be in the next round. It's entirely possible to "carry" rotate "factors"
43# to the next round, but it makes the code more complex. And "complex"
44# is the key word here, i.e. "almost 1/2" is a kind of complexity cap [for
45# the time being]...
46#
e0584e96
AP
47# Reduce per-round instruction count in Thumb-2 case by 16%. This is
48# achieved by folding ldr/str pairs to their double-word counterparts.
49# Theoretically this should have improved performance on single-issue
50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51# usual...
52#
56676f87 53########################################################################
367c5527 54# Numbers are cycles per processed byte. Non-NEON results account even
6dad1efe 55# for input bit interleaving.
56676f87 56#
e0584e96 57# r=1088(*) Thumb-2(**) NEON
56676f87 58#
d9ca12cb 59# ARM11xx 82/+150%
e0584e96
AP
60# Cortex-A5 88/+160%, 86, 36
61# Cortex-A7 78/+160%, 68, 34
62# Cortex-A8 51/+230%, 57, 30
63# Cortex-A9 53/+210%, 51, 26
64# Cortex-A15 42/+160%, 38, 18
65# Snapdragon S4 43/+210%, 38, 24
56676f87 66#
d9ca12cb
AP
67# (*) Corresponds to SHA3-256. Percentage after slash is improvement
68# over compiler-generated KECCAK_2X reference code.
e0584e96
AP
69# (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70# Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71# processors are presented mostly for reference purposes.
56676f87 72
e9afe7a1
AP
# Command line handling: the first argument is taken as the "flavour".
# If it already looks like a file name (ends in name.ext) it is used as
# the output file and the flavour is cleared; otherwise further arguments
# are shifted off until one that looks like a file name is found.
73$flavour = shift;
74if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
75else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
76
# With a flavour other than "void", STDOUT is piped through arm-xlate.pl,
# which is looked up first in this script's own directory and then in
# ../../perlasm/ relative to it; the script dies if neither file exists.
# Without a flavour (or with "void"), STDOUT is redirected straight into
# the output file.
# NOTE(review): neither open is checked for failure, and $output is
# interpolated unquoted into the pipe command - confirm this is acceptable
# for the build environments that call this script.
77if ($flavour && $flavour ne "void") {
78 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
79 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
80 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
81 die "can't locate arm-xlate.pl";
82
83 open STDOUT,"| \"$^X\" $xlate $flavour $output";
84} else {
85 open STDOUT,">$output";
86}
87
aabfd329
AP
# Register name tables interpolated into the assembly templates below:
# @C is r0..r9 (ten 32-bit working registers), @E is r10, r11, r12 and
# r14 (r13 is skipped; the generated code addresses the stack through sp).
88my @C = map("r$_",(0..9));
89my @E = map("r$_",(10..12,14));
90
56676f87
AP
91########################################################################
92# Stack layout
93# ----->+-----------------------+
94# | uint64_t A[5][5] |
95# | ... |
96# +200->+-----------------------+
97# | uint64_t D[5] |
98# | ... |
99# +240->+-----------------------+
d9ca12cb 100# | uint64_t T[5][5] |
56676f87 101# | ... |
d9ca12cb 102# +440->+-----------------------+
56676f87 103# | saved lr |
d9ca12cb 104# +444->+-----------------------+
56676f87 105# | loop counter |
d9ca12cb 106# +448->+-----------------------+
56676f87
AP
107# | ...
108
aabfd329
AP
# Byte offsets from sp, matching the stack layout diagram above; every
# element is 8 bytes wide.
#   $A[i][j] = 8*(5*i+j)       -> 0..192   (A[5][5])
#   $D[i]    = 8*(25+i)        -> 200..232 (D[5])
#   $T[i][j] = 8*(30+5*i+j)    -> 240..432 (T[5][5])
109my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
110my @D = map(8*$_, (25..29));
d9ca12cb 111my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
aabfd329
AP
112
113$code.=<<___;
e9afe7a1
AP
114#include "arm_arch.h"
115
aabfd329
AP
116.text
117
56676f87
AP
118#if defined(__thumb2__)
119.syntax unified
120.thumb
121#else
122.code 32
123#endif
124
367c5527 125.type iotas32, %object
aabfd329 126.align 5
367c5527 127iotas32:
aabfd329
AP
128 .long 0x00000001, 0x00000000
129 .long 0x00000000, 0x00000089
130 .long 0x00000000, 0x8000008b
131 .long 0x00000000, 0x80008080
132 .long 0x00000001, 0x0000008b
133 .long 0x00000001, 0x00008000
134 .long 0x00000001, 0x80008088
135 .long 0x00000001, 0x80000082
136 .long 0x00000000, 0x0000000b
137 .long 0x00000000, 0x0000000a
138 .long 0x00000001, 0x00008082
139 .long 0x00000000, 0x00008003
140 .long 0x00000001, 0x0000808b
141 .long 0x00000001, 0x8000000b
142 .long 0x00000001, 0x8000008a
143 .long 0x00000001, 0x80000081
144 .long 0x00000000, 0x80000081
145 .long 0x00000000, 0x80000008
146 .long 0x00000000, 0x00000083
147 .long 0x00000000, 0x80008003
148 .long 0x00000001, 0x80008088
149 .long 0x00000000, 0x80000088
150 .long 0x00000001, 0x00008000
151 .long 0x00000000, 0x80008082
367c5527 152.size iotas32,.-iotas32
aabfd329 153
56676f87 154.type KeccakF1600_int, %function
aabfd329 155.align 5
56676f87 156KeccakF1600_int:
d9ca12cb
AP
157 add @C[9],sp,#$A[4][2]
158 add @E[2],sp,#$A[0][0]
aabfd329 159 add @E[0],sp,#$A[1][0]
d9ca12cb 160 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
56676f87 161KeccakF1600_enter:
d9ca12cb 162 str lr,[sp,#440]
56676f87 163 eor @E[1],@E[1],@E[1]
d9ca12cb 164 str @E[1],[sp,#444]
e0584e96 165 b .Lround2x
aabfd329
AP
166
167.align 4
e0584e96 168.Lround2x:
d9ca12cb
AP
169___
170sub Round {
171my (@A,@R); (@A[0..4],@R) = @_;
172
173$code.=<<___;
174 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
aabfd329 175 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
e0584e96
AP
176#ifdef __thumb2__
177 eor @C[0],@C[0],@E[0]
178 eor @C[1],@C[1],@E[1]
179 eor @C[2],@C[2],@E[2]
180 ldrd @E[0],@E[1],[sp,#$A[1][2]]
181 eor @C[3],@C[3],@E[3]
182 ldrd @E[2],@E[3],[sp,#$A[1][3]]
183 eor @C[4],@C[4],@E[0]
184 eor @C[5],@C[5],@E[1]
185 eor @C[6],@C[6],@E[2]
186 ldrd @E[0],@E[1],[sp,#$A[1][4]]
187 eor @C[7],@C[7],@E[3]
188 ldrd @E[2],@E[3],[sp,#$A[2][0]]
189 eor @C[8],@C[8],@E[0]
190 eor @C[9],@C[9],@E[1]
191 eor @C[0],@C[0],@E[2]
192 ldrd @E[0],@E[1],[sp,#$A[2][1]]
193 eor @C[1],@C[1],@E[3]
194 ldrd @E[2],@E[3],[sp,#$A[2][2]]
195 eor @C[2],@C[2],@E[0]
196 eor @C[3],@C[3],@E[1]
197 eor @C[4],@C[4],@E[2]
198 ldrd @E[0],@E[1],[sp,#$A[2][3]]
199 eor @C[5],@C[5],@E[3]
200 ldrd @E[2],@E[3],[sp,#$A[2][4]]
201 eor @C[6],@C[6],@E[0]
202 eor @C[7],@C[7],@E[1]
203 eor @C[8],@C[8],@E[2]
204 ldrd @E[0],@E[1],[sp,#$A[3][0]]
205 eor @C[9],@C[9],@E[3]
206 ldrd @E[2],@E[3],[sp,#$A[3][1]]
207 eor @C[0],@C[0],@E[0]
208 eor @C[1],@C[1],@E[1]
209 eor @C[2],@C[2],@E[2]
210 ldrd @E[0],@E[1],[sp,#$A[3][2]]
211 eor @C[3],@C[3],@E[3]
212 ldrd @E[2],@E[3],[sp,#$A[3][3]]
213 eor @C[4],@C[4],@E[0]
214 eor @C[5],@C[5],@E[1]
215 eor @C[6],@C[6],@E[2]
216 ldrd @E[0],@E[1],[sp,#$A[3][4]]
217 eor @C[7],@C[7],@E[3]
218 ldrd @E[2],@E[3],[sp,#$A[4][0]]
219 eor @C[8],@C[8],@E[0]
220 eor @C[9],@C[9],@E[1]
221 eor @C[0],@C[0],@E[2]
222 ldrd @E[0],@E[1],[sp,#$A[4][1]]
223 eor @C[1],@C[1],@E[3]
224 ldrd @E[2],@E[3],[sp,#$A[0][2]]
225 eor @C[2],@C[2],@E[0]
226 eor @C[3],@C[3],@E[1]
227 eor @C[4],@C[4],@E[2]
228 ldrd @E[0],@E[1],[sp,#$A[0][3]]
229 eor @C[5],@C[5],@E[3]
230 ldrd @E[2],@E[3],[sp,#$A[0][4]]
231#else
aabfd329
AP
232 eor @C[0],@C[0],@E[0]
233 add @E[0],sp,#$A[1][2]
234 eor @C[1],@C[1],@E[1]
235 eor @C[2],@C[2],@E[2]
236 eor @C[3],@C[3],@E[3]
237 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
238 eor @C[4],@C[4],@E[0]
239 add @E[0],sp,#$A[1][4]
240 eor @C[5],@C[5],@E[1]
241 eor @C[6],@C[6],@E[2]
242 eor @C[7],@C[7],@E[3]
243 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
244 eor @C[8],@C[8],@E[0]
245 add @E[0],sp,#$A[2][1]
246 eor @C[9],@C[9],@E[1]
247 eor @C[0],@C[0],@E[2]
248 eor @C[1],@C[1],@E[3]
249 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
250 eor @C[2],@C[2],@E[0]
251 add @E[0],sp,#$A[2][3]
252 eor @C[3],@C[3],@E[1]
253 eor @C[4],@C[4],@E[2]
254 eor @C[5],@C[5],@E[3]
255 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
256 eor @C[6],@C[6],@E[0]
257 add @E[0],sp,#$A[3][0]
258 eor @C[7],@C[7],@E[1]
259 eor @C[8],@C[8],@E[2]
260 eor @C[9],@C[9],@E[3]
261 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
262 eor @C[0],@C[0],@E[0]
263 add @E[0],sp,#$A[3][2]
264 eor @C[1],@C[1],@E[1]
265 eor @C[2],@C[2],@E[2]
266 eor @C[3],@C[3],@E[3]
267 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
268 eor @C[4],@C[4],@E[0]
269 add @E[0],sp,#$A[3][4]
270 eor @C[5],@C[5],@E[1]
271 eor @C[6],@C[6],@E[2]
272 eor @C[7],@C[7],@E[3]
273 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
274 eor @C[8],@C[8],@E[0]
d9ca12cb 275 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
aabfd329 276 eor @C[9],@C[9],@E[1]
d9ca12cb 277 ldr @E[1],[sp,#$A[4][1]+4]
aabfd329 278 eor @C[0],@C[0],@E[2]
d9ca12cb 279 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
aabfd329 280 eor @C[1],@C[1],@E[3]
d9ca12cb 281 ldr @E[3],[sp,#$A[0][2]+4]
aabfd329 282 eor @C[2],@C[2],@E[0]
d9ca12cb 283 add @E[0],sp,#$A[0][3]
aabfd329
AP
284 eor @C[3],@C[3],@E[1]
285 eor @C[4],@C[4],@E[2]
286 eor @C[5],@C[5],@E[3]
d9ca12cb 287 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
e0584e96 288#endif
aabfd329
AP
289 eor @C[6],@C[6],@E[0]
290 eor @C[7],@C[7],@E[1]
291 eor @C[8],@C[8],@E[2]
292 eor @C[9],@C[9],@E[3]
293
294 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
e0584e96 295 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
aabfd329 296 eor @E[1],@C[1],@C[4]
e0584e96 297 str.h @E[1],[sp,#$D[1]+4]
aabfd329 298 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
aabfd329 299 eor @E[3],@C[7],@C[0]
e0584e96 300 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
aabfd329 301 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
e0584e96 302 str.h @E[3],[sp,#$D[4]+4]
aabfd329 303 eor @C[1],@C[9],@C[2]
e0584e96 304 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
aabfd329 305 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
e0584e96 306 ldr.l @C[7],[sp,#$A[3][3]]
aabfd329 307 eor @C[3],@C[3],@C[6]
e0584e96
AP
308 str.h @C[1],[sp,#$D[0]+4]
309 ldr.h @C[6],[sp,#$A[3][3]+4]
310 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
aabfd329 311 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
e0584e96 312 str.h @C[3],[sp,#$D[2]+4]
aabfd329 313 eor @C[5],@C[5],@C[8]
aabfd329 314
e0584e96
AP
315 ldr.l @C[8],[sp,#$A[4][4]]
316 ldr.h @C[9],[sp,#$A[4][4]+4]
317 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
d9ca12cb 318 eor @C[7],@C[7],@C[4]
e0584e96 319 str.h @C[5],[sp,#$D[3]+4]
d9ca12cb 320 eor @C[6],@C[6],@C[5]
e0584e96 321 ldr.l @C[4],[sp,#$A[0][0]]
d9ca12cb
AP
322 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
323 @ ror @C[6],@C[6],#32-11
e0584e96 324 ldr.h @C[5],[sp,#$A[0][0]+4]
aabfd329
AP
325 eor @C[8],@C[8],@E[2]
326 eor @C[9],@C[9],@E[3]
e0584e96 327 ldr.l @E[2],[sp,#$A[2][2]]
aabfd329 328 eor @C[0],@C[0],@C[4]
e0584e96 329 ldr.h @E[3],[sp,#$A[2][2]+4]
d9ca12cb
AP
330 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
331 @ ror @C[9],@C[9],#32-7
aabfd329 332 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
aabfd329 333 eor @E[2],@E[2],@C[2]
e0584e96 334 ldr.l @C[2],[sp,#$A[1][1]]
53718107 335 eor @E[3],@E[3],@C[3]
e0584e96 336 ldr.h @C[3],[sp,#$A[1][1]+4]
53718107 337 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
d9ca12cb 338 ldr @E[2],[sp,#444] @ load counter
aabfd329 339 eor @C[2],@C[2],@E[0]
d9ca12cb 340 adr @E[0],iotas32
53718107 341 ror @C[4],@E[3],#32-22
d9ca12cb 342 add @E[3],@E[0],@E[2]
aabfd329 343 eor @C[3],@C[3],@E[1]
d9ca12cb
AP
344___
345$code.=<<___ if ($A[0][0] != $T[0][0]);
346 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
347___
348$code.=<<___ if ($A[0][0] == $T[0][0]);
e0584e96 349 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
d9ca12cb 350 add @E[2],@E[2],#16
e0584e96 351 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
56676f87 352 cmp @E[2],#192
d9ca12cb
AP
353 str @E[2],[sp,#444] @ store counter
354___
355$code.=<<___;
356 bic @E[2],@C[4],@C[2],ror#32-22
357 bic @E[3],@C[5],@C[3],ror#32-22
358 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
359 ror @C[3],@C[3],#32-22
56676f87
AP
360 eor @E[2],@E[2],@C[0]
361 eor @E[3],@E[3],@C[1]
aabfd329
AP
362 eor @E[0],@E[0],@E[2]
363 eor @E[1],@E[1],@E[3]
e0584e96 364 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
d9ca12cb 365 bic @E[2],@C[6],@C[4],ror#11
e0584e96 366 str.h @E[1],[sp,#$R[0][0]+4]
d9ca12cb
AP
367 bic @E[3],@C[7],@C[5],ror#10
368 bic @E[0],@C[8],@C[6],ror#32-(11-7)
369 bic @E[1],@C[9],@C[7],ror#32-(10-7)
370 eor @E[2],@C[2],@E[2],ror#32-11
e0584e96 371 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 372 eor @E[3],@C[3],@E[3],ror#32-10
e0584e96 373 str.h @E[3],[sp,#$R[0][1]+4]
d9ca12cb 374 eor @E[0],@C[4],@E[0],ror#32-7
d9ca12cb 375 eor @E[1],@C[5],@E[1],ror#32-7
e0584e96 376 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
d9ca12cb 377 bic @E[2],@C[0],@C[8],ror#32-7
e0584e96 378 str.h @E[1],[sp,#$R[0][2]+4]
d9ca12cb
AP
379 bic @E[3],@C[1],@C[9],ror#32-7
380 eor @E[2],@E[2],@C[6],ror#32-11
e0584e96 381 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 382 eor @E[3],@E[3],@C[7],ror#32-10
e0584e96 383 str.h @E[3],[sp,#$R[0][3]+4]
aabfd329 384 bic @E[0],@C[2],@C[0]
d9ca12cb 385 add @E[3],sp,#$D[3]
e0584e96 386 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
aabfd329 387 bic @E[1],@C[3],@C[1]
e0584e96 388 ldr.h @C[1],[sp,#$A[0][3]+4]
d9ca12cb 389 eor @E[0],@E[0],@C[8],ror#32-7
d9ca12cb 390 eor @E[1],@E[1],@C[9],ror#32-7
e0584e96 391 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
d9ca12cb 392 add @C[9],sp,#$D[0]
e0584e96 393 str.h @E[1],[sp,#$R[0][4]+4]
d9ca12cb
AP
394
395 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
396 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
397
e0584e96 398 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
aabfd329 399 eor @C[0],@C[0],@E[0]
e0584e96 400 ldr.h @C[3],[sp,#$A[1][4]+4]
aabfd329 401 eor @C[1],@C[1],@E[1]
d9ca12cb 402 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
e0584e96 403 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
d9ca12cb 404 @ ror @C[1],@C[1],#32-14
e0584e96 405 ldr.h @E[1],[sp,#$A[3][1]+4]
d9ca12cb 406
aabfd329 407 eor @C[2],@C[2],@E[2]
e0584e96 408 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
aabfd329 409 eor @C[3],@C[3],@E[3]
e0584e96 410 ldr.h @C[5],[sp,#$A[2][0]+4]
d9ca12cb
AP
411 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
412 @ ror @C[3],@C[3],#32-10
413
aabfd329 414 eor @C[6],@C[6],@C[4]
e0584e96 415 ldr.l @E[2],[sp,#$D[2]] @ D[2]
aabfd329 416 eor @C[7],@C[7],@C[5]
e0584e96 417 ldr.h @E[3],[sp,#$D[2]+4]
d9ca12cb 418 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
53718107 419 ror @C[4],@C[7],#32-2
d9ca12cb 420
aabfd329 421 eor @E[0],@E[0],@C[8]
e0584e96 422 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
aabfd329 423 eor @E[1],@E[1],@C[9]
e0584e96 424 ldr.h @C[9],[sp,#$A[4][2]+4]
d9ca12cb
AP
425 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
426 ror @C[6],@E[1],#32-23
427
428 bic @E[0],@C[4],@C[2],ror#32-10
429 bic @E[1],@C[5],@C[3],ror#32-10
430 eor @E[2],@E[2],@C[8]
431 eor @E[3],@E[3],@C[9]
432 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
433 ror @C[8],@E[3],#32-31
434 eor @E[0],@E[0],@C[0],ror#32-14
435 eor @E[1],@E[1],@C[1],ror#32-14
e0584e96 436 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
aabfd329 437 bic @E[2],@C[6],@C[4]
e0584e96 438 str.h @E[1],[sp,#$R[1][0]+4]
aabfd329 439 bic @E[3],@C[7],@C[5]
d9ca12cb 440 eor @E[2],@E[2],@C[2],ror#32-10
e0584e96 441 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 442 eor @E[3],@E[3],@C[3],ror#32-10
e0584e96 443 str.h @E[3],[sp,#$R[1][1]+4]
aabfd329 444 bic @E[0],@C[8],@C[6]
aabfd329 445 bic @E[1],@C[9],@C[7]
d9ca12cb
AP
446 bic @E[2],@C[0],@C[8],ror#14
447 bic @E[3],@C[1],@C[9],ror#14
aabfd329
AP
448 eor @E[0],@E[0],@C[4]
449 eor @E[1],@E[1],@C[5]
e0584e96
AP
450 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
451 bic @C[2],@C[2],@C[0],ror#32-(14-10)
452 str.h @E[1],[sp,#$R[1][2]+4]
d9ca12cb 453 eor @E[2],@C[6],@E[2],ror#32-14
d9ca12cb 454 bic @E[1],@C[3],@C[1],ror#32-(14-10)
e0584e96 455 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 456 eor @E[3],@C[7],@E[3],ror#32-14
e0584e96 457 str.h @E[3],[sp,#$R[1][3]+4]
d9ca12cb 458 add @E[2],sp,#$D[1]
e0584e96
AP
459 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
460 eor @E[0],@C[8],@C[2],ror#32-10
461 ldr.h @C[0],[sp,#$A[0][1]+4]
d9ca12cb 462 eor @E[1],@C[9],@E[1],ror#32-10
e0584e96
AP
463 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
464 str.h @E[1],[sp,#$R[1][4]+4]
d9ca12cb 465
e0584e96 466 add @C[9],sp,#$D[3]
d9ca12cb 467 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
e0584e96
AP
468 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
469 ldr.h @C[3],[sp,#$A[1][2]+4]
d9ca12cb
AP
470 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
471
472 eor @C[1],@C[1],@E[0]
e0584e96 473 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
d9ca12cb 474 eor @C[0],@C[0],@E[1]
e0584e96 475 ldr.h @C[5],[sp,#$A[2][3]+4]
d9ca12cb
AP
476 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
477
478 eor @C[2],@C[2],@E[2]
e0584e96 479 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
d9ca12cb 480 eor @C[3],@C[3],@E[3]
e0584e96 481 ldr.h @E[1],[sp,#$A[3][4]+4]
d9ca12cb 482 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
e0584e96 483 ldr.l @E[2],[sp,#$D[0]] @ D[0]
d9ca12cb 484 @ ror @C[3],@C[3],#32-3
e0584e96 485 ldr.h @E[3],[sp,#$D[0]+4]
d9ca12cb
AP
486
487 eor @C[4],@C[4],@C[6]
488 eor @C[5],@C[5],@C[7]
489 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
490 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
491
aabfd329 492 eor @E[0],@E[0],@C[8]
e0584e96 493 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
aabfd329 494 eor @E[1],@E[1],@C[9]
e0584e96 495 ldr.h @C[9],[sp,#$A[4][0]+4]
d9ca12cb
AP
496 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
497 ror @C[7],@E[1],#32-4
aabfd329 498
d9ca12cb
AP
499 eor @E[2],@E[2],@C[8]
500 eor @E[3],@E[3],@C[9]
501 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
502 ror @C[9],@E[3],#32-9
503
504 bic @E[0],@C[5],@C[2],ror#13-3
505 bic @E[1],@C[4],@C[3],ror#12-3
506 bic @E[2],@C[6],@C[5],ror#32-13
507 bic @E[3],@C[7],@C[4],ror#32-12
508 eor @E[0],@C[0],@E[0],ror#32-13
509 eor @E[1],@C[1],@E[1],ror#32-12
e0584e96 510 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
d9ca12cb 511 eor @E[2],@E[2],@C[2],ror#32-3
e0584e96 512 str.h @E[1],[sp,#$R[2][0]+4]
d9ca12cb 513 eor @E[3],@E[3],@C[3],ror#32-3
e0584e96 514 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
aabfd329 515 bic @E[0],@C[8],@C[6]
aabfd329 516 bic @E[1],@C[9],@C[7]
e0584e96 517 str.h @E[3],[sp,#$R[2][1]+4]
d9ca12cb
AP
518 eor @E[0],@E[0],@C[5],ror#32-13
519 eor @E[1],@E[1],@C[4],ror#32-12
e0584e96 520 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
aabfd329 521 bic @E[2],@C[0],@C[8]
e0584e96 522 str.h @E[1],[sp,#$R[2][2]+4]
aabfd329
AP
523 bic @E[3],@C[1],@C[9]
524 eor @E[2],@E[2],@C[6]
525 eor @E[3],@E[3],@C[7]
e0584e96 526 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
d9ca12cb 527 bic @E[0],@C[2],@C[0],ror#3
e0584e96 528 str.h @E[3],[sp,#$R[2][3]+4]
d9ca12cb 529 bic @E[1],@C[3],@C[1],ror#3
e0584e96 530 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
d9ca12cb 531 eor @E[0],@C[8],@E[0],ror#32-3
e0584e96 532 ldr.h @C[0],[sp,#$A[0][4]+4]
d9ca12cb 533 eor @E[1],@C[9],@E[1],ror#32-3
e0584e96 534 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
d9ca12cb 535 add @C[9],sp,#$D[1]
e0584e96 536 str.h @E[1],[sp,#$R[2][4]+4]
d9ca12cb 537
e0584e96
AP
538 ldr.l @E[0],[sp,#$D[4]] @ D[4]
539 ldr.h @E[1],[sp,#$D[4]+4]
540 ldr.l @E[2],[sp,#$D[0]] @ D[0]
541 ldr.h @E[3],[sp,#$D[0]+4]
d9ca12cb
AP
542
543 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
544
545 eor @C[1],@C[1],@E[0]
e0584e96 546 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
d9ca12cb 547 eor @C[0],@C[0],@E[1]
e0584e96 548 ldr.h @C[3],[sp,#$A[1][0]+4]
d9ca12cb 549 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
e0584e96 550 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
d9ca12cb 551 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
e0584e96 552 ldr.h @C[5],[sp,#$A[2][1]+4]
d9ca12cb
AP
553
554 eor @C[2],@C[2],@E[2]
e0584e96 555 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
d9ca12cb 556 eor @C[3],@C[3],@E[3]
e0584e96 557 ldr.h @E[1],[sp,#$A[3][2]+4]
d9ca12cb 558 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
e0584e96 559 ldr.l @E[2],[sp,#$D[3]] @ D[3]
d9ca12cb 560 @ ror @C[3],@C[3],#32-18
e0584e96 561 ldr.h @E[3],[sp,#$D[3]+4]
d9ca12cb
AP
562
563 eor @C[6],@C[6],@C[4]
564 eor @C[7],@C[7],@C[5]
565 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
566 ror @C[5],@C[7],#32-5
567
aabfd329 568 eor @E[0],@E[0],@C[8]
e0584e96 569 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
aabfd329 570 eor @E[1],@E[1],@C[9]
e0584e96 571 ldr.h @C[9],[sp,#$A[4][3]+4]
aabfd329 572 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
53718107 573 ror @C[6],@E[1],#32-8
aabfd329 574
d9ca12cb
AP
575 eor @E[2],@E[2],@C[8]
576 eor @E[3],@E[3],@C[9]
577 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
578 ror @C[9],@E[3],#32-28
579
580 bic @E[0],@C[4],@C[2],ror#32-18
581 bic @E[1],@C[5],@C[3],ror#32-18
582 eor @E[0],@E[0],@C[0],ror#32-14
583 eor @E[1],@E[1],@C[1],ror#32-13
e0584e96 584 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
aabfd329 585 bic @E[2],@C[6],@C[4]
e0584e96 586 str.h @E[1],[sp,#$R[3][0]+4]
aabfd329 587 bic @E[3],@C[7],@C[5]
d9ca12cb 588 eor @E[2],@E[2],@C[2],ror#32-18
e0584e96 589 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 590 eor @E[3],@E[3],@C[3],ror#32-18
e0584e96 591 str.h @E[3],[sp,#$R[3][1]+4]
aabfd329 592 bic @E[0],@C[8],@C[6]
aabfd329 593 bic @E[1],@C[9],@C[7]
d9ca12cb
AP
594 bic @E[2],@C[0],@C[8],ror#14
595 bic @E[3],@C[1],@C[9],ror#13
aabfd329
AP
596 eor @E[0],@E[0],@C[4]
597 eor @E[1],@E[1],@C[5]
e0584e96
AP
598 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
599 bic @C[2],@C[2],@C[0],ror#18-14
600 str.h @E[1],[sp,#$R[3][2]+4]
d9ca12cb 601 eor @E[2],@C[6],@E[2],ror#32-14
d9ca12cb
AP
602 bic @E[1],@C[3],@C[1],ror#18-13
603 eor @E[3],@C[7],@E[3],ror#32-13
e0584e96
AP
604 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
605 str.h @E[3],[sp,#$R[3][3]+4]
d9ca12cb 606 add @E[3],sp,#$D[2]
e0584e96
AP
607 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
608 eor @E[0],@C[8],@C[2],ror#32-18
609 ldr.h @C[1],[sp,#$A[0][2]+4]
d9ca12cb 610 eor @E[1],@C[9],@E[1],ror#32-18
e0584e96
AP
611 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
612 str.h @E[1],[sp,#$R[3][4]+4]
d9ca12cb
AP
613
614 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
e0584e96
AP
615 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
616 ldr.h @C[3],[sp,#$A[1][3]+4]
617 ldr.l @C[6],[sp,#$D[4]] @ D[4]
618 ldr.h @C[7],[sp,#$D[4]+4]
d9ca12cb
AP
619
620 eor @C[0],@C[0],@E[0]
e0584e96 621 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
d9ca12cb 622 eor @C[1],@C[1],@E[1]
e0584e96 623 ldr.h @C[5],[sp,#$A[2][4]+4]
d9ca12cb 624 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
e0584e96 625 ldr.l @C[8],[sp,#$D[0]] @ D[0]
d9ca12cb 626 @ ror @C[1],@C[1],#32-31
e0584e96 627 ldr.h @C[9],[sp,#$D[0]+4]
d9ca12cb
AP
628
629 eor @E[2],@E[2],@C[2]
e0584e96 630 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
d9ca12cb 631 eor @E[3],@E[3],@C[3]
e0584e96 632 ldr.h @E[1],[sp,#$A[3][0]+4]
d9ca12cb 633 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
e0584e96 634 ldr.l @E[2],[sp,#$D[1]] @ D[1]
d9ca12cb 635 ror @C[2],@E[3],#32-28
e0584e96 636 ldr.h @E[3],[sp,#$D[1]+4]
d9ca12cb
AP
637
638 eor @C[6],@C[6],@C[4]
639 eor @C[7],@C[7],@C[5]
640 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
641 ror @C[4],@C[7],#32-20
642
aabfd329 643 eor @E[0],@E[0],@C[8]
e0584e96 644 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
aabfd329 645 eor @E[1],@E[1],@C[9]
e0584e96 646 ldr.h @C[9],[sp,#$A[4][1]+4]
d9ca12cb
AP
647 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
648 ror @C[6],@E[1],#32-21
649
650 eor @C[8],@C[8],@E[2]
651 eor @C[9],@C[9],@E[3]
652 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
653 @ ror @C[9],@C[3],#32-1
aabfd329
AP
654
655 bic @E[0],@C[4],@C[2]
656 bic @E[1],@C[5],@C[3]
d9ca12cb 657 eor @E[0],@E[0],@C[0],ror#32-31
e0584e96 658 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
d9ca12cb 659 eor @E[1],@E[1],@C[1],ror#32-31
e0584e96 660 str.h @E[1],[sp,#$R[4][0]+4]
aabfd329 661 bic @E[2],@C[6],@C[4]
aabfd329
AP
662 bic @E[3],@C[7],@C[5]
663 eor @E[2],@E[2],@C[2]
664 eor @E[3],@E[3],@C[3]
e0584e96 665 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
d9ca12cb 666 bic @E[0],@C[8],@C[6],ror#1
e0584e96 667 str.h @E[3],[sp,#$R[4][1]+4]
d9ca12cb
AP
668 bic @E[1],@C[9],@C[7],ror#1
669 bic @E[2],@C[0],@C[8],ror#31-1
670 bic @E[3],@C[1],@C[9],ror#31-1
671 eor @C[4],@C[4],@E[0],ror#32-1
e0584e96 672 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
d9ca12cb 673 eor @C[5],@C[5],@E[1],ror#32-1
e0584e96 674 str.h @C[5],[sp,#$R[4][2]+4]
d9ca12cb 675 eor @C[6],@C[6],@E[2],ror#32-31
d9ca12cb 676 eor @C[7],@C[7],@E[3],ror#32-31
e0584e96 677 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
d9ca12cb 678 bic @E[0],@C[2],@C[0],ror#32-31
e0584e96 679 str.h @C[7],[sp,#$R[4][3]+4]
d9ca12cb
AP
680 bic @E[1],@C[3],@C[1],ror#32-31
681 add @E[2],sp,#$R[0][0]
682 eor @C[8],@E[0],@C[8],ror#32-1
683 add @E[0],sp,#$R[1][0]
684 eor @C[9],@E[1],@C[9],ror#32-1
e0584e96
AP
685 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
686 str.h @C[9],[sp,#$R[4][4]+4]
d9ca12cb
AP
687___
688}
689 Round(@A,@T);
690 Round(@T,@A);
691$code.=<<___;
e0584e96 692 blo .Lround2x
aabfd329 693
d9ca12cb 694 ldr pc,[sp,#440]
56676f87
AP
695.size KeccakF1600_int,.-KeccakF1600_int
696
697.type KeccakF1600, %function
698.align 5
699KeccakF1600:
700 stmdb sp!,{r0,r4-r11,lr}
d9ca12cb 701 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
56676f87
AP
702
703 add @E[0],r0,#$A[1][0]
704 add @E[1],sp,#$A[1][0]
d9ca12cb
AP
705 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
706 stmia sp, {@C[0]-@C[9]}
707 ldmia @E[0]!,{@C[0]-@C[9]}
56676f87
AP
708 stmia @E[1]!,{@C[0]-@C[9]}
709 ldmia @E[0]!,{@C[0]-@C[9]}
710 stmia @E[1]!,{@C[0]-@C[9]}
711 ldmia @E[0]!,{@C[0]-@C[9]}
712 stmia @E[1]!,{@C[0]-@C[9]}
713 ldmia @E[0], {@C[0]-@C[9]}
d9ca12cb 714 add @E[2],sp,#$A[0][0]
56676f87 715 add @E[0],sp,#$A[1][0]
d9ca12cb 716 stmia @E[1], {@C[0]-@C[9]}
56676f87
AP
717
718 bl KeccakF1600_enter
719
d9ca12cb 720 ldr @E[1], [sp,#440+16] @ restore pointer to A
56676f87
AP
721 ldmia sp, {@C[0]-@C[9]}
722 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
aabfd329
AP
723 ldmia @E[0]!,{@C[0]-@C[9]}
724 stmia @E[1]!,{@C[0]-@C[9]}
725 ldmia @E[0]!,{@C[0]-@C[9]}
726 stmia @E[1]!,{@C[0]-@C[9]}
727 ldmia @E[0]!,{@C[0]-@C[9]}
728 stmia @E[1]!,{@C[0]-@C[9]}
56676f87
AP
729 ldmia @E[0], {@C[0]-@C[9]}
730 stmia @E[1], {@C[0]-@C[9]}
aabfd329 731
d9ca12cb 732 add sp,sp,#440+20
56676f87 733 ldmia sp!,{r4-r11,pc}
aabfd329
AP
734.size KeccakF1600,.-KeccakF1600
735___
6dad1efe 736{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
56676f87
AP
737
738########################################################################
739# Stack layout
740# ----->+-----------------------+
741# | uint64_t A[5][5] |
742# | ... |
743# | ... |
d9ca12cb 744# +456->+-----------------------+
6dad1efe 745# | 0x55555555 |
d9ca12cb 746# +460->+-----------------------+
6dad1efe 747# | 0x33333333 |
d9ca12cb 748# +464->+-----------------------+
6dad1efe 749# | 0x0f0f0f0f |
d9ca12cb 750# +468->+-----------------------+
6dad1efe 751# | 0x00ff00ff |
d9ca12cb 752# +472->+-----------------------+
6dad1efe 753# | uint64_t *A |
d9ca12cb 754# +476->+-----------------------+
6dad1efe 755# | const void *inp |
d9ca12cb 756# +480->+-----------------------+
6dad1efe 757# | size_t len |
d9ca12cb 758# +484->+-----------------------+
6dad1efe 759# | size_t bs |
d9ca12cb 760# +488->+-----------------------+
56676f87
AP
761# | ....
762
763$code.=<<___;
764.global SHA3_absorb
765.type SHA3_absorb,%function
766.align 5
767SHA3_absorb:
768 stmdb sp!,{r0-r12,lr}
d9ca12cb 769 sub sp,sp,#456+16
56676f87 770
6dad1efe
AP
771 add $A_flat,r0,#$A[1][0]
772 @ mov $inp,r1
56676f87
AP
773 mov $len,r2
774 mov $bsz,r3
6dad1efe
AP
775 cmp r2,r3
776 blo .Labsorb_abort
777
778 add $inp,sp,#0
779 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
780 stmia $inp!, {@C[0]-@C[9]}
781 ldmia $A_flat!,{@C[0]-@C[9]}
782 stmia $inp!, {@C[0]-@C[9]}
783 ldmia $A_flat!,{@C[0]-@C[9]}
784 stmia $inp!, {@C[0]-@C[9]}
785 ldmia $A_flat!,{@C[0]-@C[9]}
786 stmia $inp!, {@C[0]-@C[9]}
787 ldmia $A_flat!,{@C[0]-@C[9]}
788 stmia $inp, {@C[0]-@C[9]}
789
d9ca12cb 790 ldr $inp,[sp,#476] @ restore $inp
6dad1efe
AP
791#ifdef __thumb2__
792 mov r9,#0x00ff00ff
793 mov r8,#0x0f0f0f0f
794 mov r7,#0x33333333
795 mov r6,#0x55555555
796#else
797 mov r6,#0x11 @ compose constants
798 mov r8,#0x0f
799 mov r9,#0xff
800 orr r6,r6,r6,lsl#8
801 orr r8,r8,r8,lsl#8
802 orr r6,r6,r6,lsl#16 @ 0x11111111
803 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
804 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
805 orr r7,r6,r6,lsl#1 @ 0x33333333
806 orr r6,r6,r6,lsl#2 @ 0x55555555
807#endif
d9ca12cb
AP
808 str r9,[sp,#468]
809 str r8,[sp,#464]
810 str r7,[sp,#460]
811 str r6,[sp,#456]
6dad1efe 812 b .Loop_absorb
56676f87 813
6dad1efe 814.align 4
56676f87
AP
815.Loop_absorb:
816 subs r0,$len,$bsz
817 blo .Labsorbed
818 add $A_flat,sp,#0
d9ca12cb 819 str r0,[sp,#480] @ save len - bsz
56676f87 820
6dad1efe 821.align 4
56676f87 822.Loop_block:
6dad1efe
AP
823 ldrb r0,[$inp],#1
824 ldrb r1,[$inp],#1
825 ldrb r2,[$inp],#1
826 ldrb r3,[$inp],#1
827 ldrb r4,[$inp],#1
828 orr r0,r0,r1,lsl#8
829 ldrb r1,[$inp],#1
830 orr r0,r0,r2,lsl#16
831 ldrb r2,[$inp],#1
832 orr r0,r0,r3,lsl#24 @ lo
833 ldrb r3,[$inp],#1
834 orr r1,r4,r1,lsl#8
835 orr r1,r1,r2,lsl#16
836 orr r1,r1,r3,lsl#24 @ hi
837
838 and r2,r0,r6 @ &=0x55555555
839 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
840 and r3,r1,r6 @ &=0x55555555
841 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
842 orr r2,r2,r2,lsr#1
843 orr r0,r0,r0,lsl#1
844 orr r3,r3,r3,lsr#1
845 orr r1,r1,r1,lsl#1
846 and r2,r2,r7 @ &=0x33333333
847 and r0,r0,r7,lsl#2 @ &=0xcccccccc
848 and r3,r3,r7 @ &=0x33333333
849 and r1,r1,r7,lsl#2 @ &=0xcccccccc
850 orr r2,r2,r2,lsr#2
851 orr r0,r0,r0,lsl#2
852 orr r3,r3,r3,lsr#2
853 orr r1,r1,r1,lsl#2
854 and r2,r2,r8 @ &=0x0f0f0f0f
855 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
856 and r3,r3,r8 @ &=0x0f0f0f0f
857 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
858 ldmia $A_flat,{r4-r5} @ A_flat[i]
859 orr r2,r2,r2,lsr#4
860 orr r0,r0,r0,lsl#4
861 orr r3,r3,r3,lsr#4
862 orr r1,r1,r1,lsl#4
863 and r2,r2,r9 @ &=0x00ff00ff
864 and r0,r0,r9,lsl#8 @ &=0xff00ff00
865 and r3,r3,r9 @ &=0x00ff00ff
866 and r1,r1,r9,lsl#8 @ &=0xff00ff00
867 orr r2,r2,r2,lsr#8
868 orr r0,r0,r0,lsl#8
869 orr r3,r3,r3,lsr#8
870 orr r1,r1,r1,lsl#8
871
872 lsl r2,r2,#16
873 lsr r1,r1,#16
874 eor r4,r4,r3,lsl#16
875 eor r5,r5,r0,lsr#16
876 eor r4,r4,r2,lsr#16
877 eor r5,r5,r1,lsl#16
878 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
879
56676f87
AP
880 subs $bsz,$bsz,#8
881 bhi .Loop_block
882
d9ca12cb 883 str $inp,[sp,#476]
56676f87
AP
884
885 bl KeccakF1600_int
886
d9ca12cb 887 add r14,sp,#456
6dad1efe 888 ldmia r14,{r6-r12,r14} @ restore constants and variables
56676f87
AP
889 b .Loop_absorb
890
891.align 4
892.Labsorbed:
6dad1efe
AP
893 add $inp,sp,#$A[1][0]
894 ldmia sp, {@C[0]-@C[9]}
895 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
896 ldmia $inp!, {@C[0]-@C[9]}
897 stmia $A_flat!,{@C[0]-@C[9]}
898 ldmia $inp!, {@C[0]-@C[9]}
899 stmia $A_flat!,{@C[0]-@C[9]}
900 ldmia $inp!, {@C[0]-@C[9]}
901 stmia $A_flat!,{@C[0]-@C[9]}
902 ldmia $inp, {@C[0]-@C[9]}
903 stmia $A_flat, {@C[0]-@C[9]}
904
905.Labsorb_abort:
d9ca12cb 906 add sp,sp,#456+32
56676f87
AP
907 mov r0,$len @ return value
908 ldmia sp!,{r4-r12,pc}
909.size SHA3_absorb,.-SHA3_absorb
910___
911}
# SHA3_squeeze (non-NEON code path).
#
# Entry registers, as read by the prologue below: r0 = A_flat (state),
# r1 = out, r2 = len, r3 = bsz.  They are copied to $A_flat (r10),
# $out (r4), $len (r5) and $bsz (r12) respectively.
#
# Stack layout: the prologue pushes {r0,r3-r10,lr} and then the four
# mask constants r6-r9 (0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff).
# After the KeccakF1600 call a single "ldmia sp,{r6-r10,r12}" therefore
# reloads the masks, the original state pointer (saved r0 -> r10) and
# the block size (saved r3 -> r12).  The epilogue's "add sp,sp,#24"
# skips those six words before popping r4-r10 and pc.
#
# Each .Loop_squeeze iteration loads one 64-bit lane as two words,
# runs it through a shift/mask ladder that uses the same masks as the
# absorb path but in the opposite order, and writes the result out one
# byte at a time, least significant byte first.  When fewer than 8
# bytes remain, .Lsqueeze_tail stores only the remaining bytes.  Once
# $bsz is used up, KeccakF1600 is called with the original state
# pointer that was kept in r14, and the loop restarts.
6dad1efe 912{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
56676f87
AP
913
914$code.=<<___;
915.global SHA3_squeeze
916.type SHA3_squeeze,%function
917.align 5
918SHA3_squeeze:
6dad1efe
AP
919 stmdb sp!,{r0,r3-r10,lr}
920
56676f87
AP
921 mov $A_flat,r0
922 mov $out,r1
923 mov $len,r2
924 mov $bsz,r3
6dad1efe
AP
925
926#ifdef __thumb2__
927 mov r9,#0x00ff00ff
928 mov r8,#0x0f0f0f0f
929 mov r7,#0x33333333
930 mov r6,#0x55555555
931#else
932 mov r6,#0x11 @ compose constants
933 mov r8,#0x0f
934 mov r9,#0xff
935 orr r6,r6,r6,lsl#8
936 orr r8,r8,r8,lsl#8
937 orr r6,r6,r6,lsl#16 @ 0x11111111
938 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
939 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
940 orr r7,r6,r6,lsl#1 @ 0x33333333
941 orr r6,r6,r6,lsl#2 @ 0x55555555
942#endif
943 stmdb sp!,{r6-r9}
944
945 mov r14,$A_flat
56676f87
AP
946 b .Loop_squeeze
947
948.align 4
949.Loop_squeeze:
6dad1efe
AP
950 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
951
952 lsl r2,r0,#16
953 lsl r3,r1,#16 @ r3 = r1 << 16
954 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
955 lsr r1,r1,#16
956 lsr r0,r0,#16 @ r0 = r0 >> 16
957 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
958
959 orr r2,r2,r2,lsl#8
960 orr r3,r3,r3,lsr#8
961 orr r0,r0,r0,lsl#8
962 orr r1,r1,r1,lsr#8
963 and r2,r2,r9 @ &=0x00ff00ff
964 and r3,r3,r9,lsl#8 @ &=0xff00ff00
965 and r0,r0,r9 @ &=0x00ff00ff
966 and r1,r1,r9,lsl#8 @ &=0xff00ff00
967 orr r2,r2,r2,lsl#4
968 orr r3,r3,r3,lsr#4
969 orr r0,r0,r0,lsl#4
970 orr r1,r1,r1,lsr#4
971 and r2,r2,r8 @ &=0x0f0f0f0f
972 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
973 and r0,r0,r8 @ &=0x0f0f0f0f
974 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
975 orr r2,r2,r2,lsl#2
976 orr r3,r3,r3,lsr#2
977 orr r0,r0,r0,lsl#2
978 orr r1,r1,r1,lsr#2
979 and r2,r2,r7 @ &=0x33333333
980 and r3,r3,r7,lsl#2 @ &=0xcccccccc
981 and r0,r0,r7 @ &=0x33333333
982 and r1,r1,r7,lsl#2 @ &=0xcccccccc
983 orr r2,r2,r2,lsl#1
984 orr r3,r3,r3,lsr#1
985 orr r0,r0,r0,lsl#1
986 orr r1,r1,r1,lsr#1
987 and r2,r2,r6 @ &=0x55555555
988 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
989 and r0,r0,r6 @ &=0x55555555
990 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
991
992 orr r2,r2,r3
993 orr r0,r0,r1
994
995 cmp $len,#8
996 blo .Lsqueeze_tail
997 lsr r1,r2,#8
998 strb r2,[$out],#1
999 lsr r3,r2,#16
1000 strb r1,[$out],#1
1001 lsr r2,r2,#24
1002 strb r3,[$out],#1
1003 strb r2,[$out],#1
1004
1005 lsr r1,r0,#8
1006 strb r0,[$out],#1
1007 lsr r3,r0,#16
1008 strb r1,[$out],#1
1009 lsr r0,r0,#24
1010 strb r3,[$out],#1
1011 strb r0,[$out],#1
1012 subs $len,$len,#8
56676f87 1013 beq .Lsqueeze_done
56676f87 1014
6dad1efe 1015 subs $bsz,$bsz,#8 @ bsz -= 8
56676f87
AP
1016 bhi .Loop_squeeze
1017
6dad1efe 1018 mov r0,r14 @ original $A_flat
56676f87
AP
1019
1020 bl KeccakF1600
1021
6dad1efe
AP
1022 ldmia sp,{r6-r10,r12} @ restore constants and variables
1023 mov r14,$A_flat
56676f87
AP
1024 b .Loop_squeeze
1025
6dad1efe
AP
1026.align 4
1027.Lsqueeze_tail:
1028 strb r2,[$out],#1
1029 lsr r2,r2,#8
1030 subs $len,$len,#1
1031 beq .Lsqueeze_done
1032 strb r2,[$out],#1
1033 lsr r2,r2,#8
1034 subs $len,$len,#1
1035 beq .Lsqueeze_done
1036 strb r2,[$out],#1
1037 lsr r2,r2,#8
1038 subs $len,$len,#1
1039 beq .Lsqueeze_done
1040 strb r2,[$out],#1
1041 subs $len,$len,#1
1042 beq .Lsqueeze_done
1043
1044 strb r0,[$out],#1
1045 lsr r0,r0,#8
1046 subs $len,$len,#1
1047 beq .Lsqueeze_done
1048 strb r0,[$out],#1
1049 lsr r0,r0,#8
1050 subs $len,$len,#1
1051 beq .Lsqueeze_done
1052 strb r0,[$out]
1053 b .Lsqueeze_done
1054
1055.align 4
56676f87 1056.Lsqueeze_done:
6dad1efe 1057 add sp,sp,#24
56676f87
AP
1058 ldmia sp!,{r4-r10,pc}
1059.size SHA3_squeeze,.-SHA3_squeeze
56676f87
AP
1060___
1061}
1062
# NEON code path.  Everything up to the closing "#endif" is assembled
# only when __ARM_MAX_ARCH__>=7.  The heredoc below emits:
#
#   iotas64            table of 24 64-bit constants; one entry is
#                      consumed per iteration of .Loop_neon.
#   KeccakF1600_neon   runs .Loop_neon 24 times (counter in r3) on a
#                      state held in NEON registers; per the comments
#                      in the code, d0-d24 carry A[0..4][0..4].  The
#                      memory at [r0] and [r0+16] is used as scratch
#                      for spilling registers during a round, so its
#                      previous contents are overwritten.
#   SHA3_absorb_neon   r0 = state, r1 = inp, r2 = len, r3 = bsz.  Loads
#                      the state into registers, then, while
#                      len >= bsz, XORs bsz bytes of input into it
#                      8 bytes at a time and calls KeccakF1600_neon.
#                      Finally stores the state back and returns the
#                      leftover length in r0.  d8-d15 are saved and
#                      restored around the body.
#   SHA3_squeeze_neon  r0 = state, r1 = out, r2 = len, r3 = bsz.  Copies
#                      the state to out 8 bytes at a time; after every
#                      bsz bytes it loads the state into registers,
#                      calls KeccakF1600_neon and stores it back.  A
#                      final chunk shorter than 8 bytes is written
#                      bytewise by .Lsqueeze_neon_tail.
#
# The trailing .asciz identification string sits after the "#endif", so
# it is emitted regardless of __ARM_MAX_ARCH__.
367c5527 1063$code.=<<___;
e9afe7a1 1064#if __ARM_MAX_ARCH__>=7
367c5527
AP
1065.fpu neon
1066
1067.type iotas64, %object
1068.align 5
1069iotas64:
1070 .quad 0x0000000000000001
1071 .quad 0x0000000000008082
1072 .quad 0x800000000000808a
1073 .quad 0x8000000080008000
1074 .quad 0x000000000000808b
1075 .quad 0x0000000080000001
1076 .quad 0x8000000080008081
1077 .quad 0x8000000000008009
1078 .quad 0x000000000000008a
1079 .quad 0x0000000000000088
1080 .quad 0x0000000080008009
1081 .quad 0x000000008000000a
1082 .quad 0x000000008000808b
1083 .quad 0x800000000000008b
1084 .quad 0x8000000000008089
1085 .quad 0x8000000000008003
1086 .quad 0x8000000000008002
1087 .quad 0x8000000000000080
1088 .quad 0x000000000000800a
1089 .quad 0x800000008000000a
1090 .quad 0x8000000080008081
1091 .quad 0x8000000000008080
1092 .quad 0x0000000080000001
1093 .quad 0x8000000080008008
1094.size iotas64,.-iotas64
1095
1096.type KeccakF1600_neon, %function
1097.align 5
1098KeccakF1600_neon:
1099 add r1, r0, #16
1100 adr r2, iotas64
1101 mov r3, #24 @ loop counter
1102 b .Loop_neon
1103
1104.align 4
1105.Loop_neon:
1106 @ Theta
1107 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
1108 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1109 vst1.64 {d18}, [r1:64] @ offload A[2][4]
1110 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1111 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1112 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1113 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1114 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1115 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1116 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1117 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1118 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1119 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1120 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1121 veor d25, d25, d24 @ C[4]^=A[4][4]
1122
1123 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1124 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1125 vadd.u64 d18, d25, d25 @ C[4]<<1
1126 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1127 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1128 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1129 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1130 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1131 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1132 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1133
1134 veor d0, d0, d25 @ A[0][0] ^= C[4]
1135 veor d1, d1, d25 @ A[1][0] ^= C[4]
1136 veor d10, d10, d25 @ A[2][0] ^= C[4]
1137 veor d11, d11, d25 @ A[3][0] ^= C[4]
1138 veor d20, d20, d25 @ A[4][0] ^= C[4]
1139
1140 veor d2, d2, d26 @ A[0][1] ^= D[1]
1141 veor d3, d3, d26 @ A[1][1] ^= D[1]
1142 veor d12, d12, d26 @ A[2][1] ^= D[1]
1143 veor d13, d13, d26 @ A[3][1] ^= D[1]
1144 veor d21, d21, d26 @ A[4][1] ^= D[1]
1145 vmov d26, d27
1146
1147 veor d6, d6, d28 @ A[0][3] ^= C[2]
1148 veor d7, d7, d28 @ A[1][3] ^= C[2]
1149 veor d16, d16, d28 @ A[2][3] ^= C[2]
1150 veor d17, d17, d28 @ A[3][3] ^= C[2]
1151 veor d23, d23, d28 @ A[4][3] ^= C[2]
1152 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1153 vmov d28, d29
1154
1155 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1156 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1157 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1158 veor d22, d22, d27 @ A[4][2] ^= D[2]
1159
1160 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1161 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1162 veor d24, d24, d29 @ A[4][4] ^= C[3]
1163
1164 @ Rho + Pi
1165 vmov d26, d2 @ C[1] = A[0][1]
1166 vshl.u64 d2, d3, #44
1167 vmov d27, d4 @ C[2] = A[0][2]
1168 vshl.u64 d4, d14, #43
1169 vmov d28, d6 @ C[3] = A[0][3]
1170 vshl.u64 d6, d17, #21
1171 vmov d29, d8 @ C[4] = A[0][4]
1172 vshl.u64 d8, d24, #14
1173 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1174 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1175 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1176 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1177
1178 vshl.u64 d3, d9, #20
1179 vshl.u64 d14, d16, #25
1180 vshl.u64 d17, d15, #15
1181 vshl.u64 d24, d21, #2
1182 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1183 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1184 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1185 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1186
1187 vshl.u64 d9, d22, #61
1188 @ vshl.u64 d16, d19, #8
1189 vshl.u64 d15, d12, #10
1190 vshl.u64 d21, d7, #55
1191 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1192 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1193 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1194 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1195
1196 vshl.u64 d22, d18, #39
1197 @ vshl.u64 d19, d23, #56
1198 vshl.u64 d12, d5, #6
1199 vshl.u64 d7, d13, #45
1200 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1201 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1202 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1203 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1204
1205 vshl.u64 d18, d20, #18
1206 vshl.u64 d23, d11, #41
1207 vshl.u64 d5, d10, #3
1208 vshl.u64 d13, d1, #36
1209 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1210 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1211 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1212 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1213
1214 vshl.u64 d1, d28, #28
1215 vshl.u64 d10, d26, #1
1216 vshl.u64 d11, d29, #27
1217 vshl.u64 d20, d27, #62
1218 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1219 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1220 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1221 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
1222
1223 @ Chi + Iota
1224 vbic q13, q2, q1
1225 vbic q14, q3, q2
1226 vbic q15, q4, q3
1227 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1228 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1229 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1230 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1231 vbic q13, q0, q4
1232 vbic q15, q1, q0
1233 vmov q1, q14 @ A[0..1][1]
1234 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1235 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1236
1237 vbic q13, q7, q6
1238 vmov q0, q5 @ A[2..3][0]
1239 vbic q14, q8, q7
1240 vmov q15, q6 @ A[2..3][1]
1241 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1242 vbic q13, q9, q8
1243 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1244 vbic q14, q0, q9
1245 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1246 vbic q13, q15, q0
1247 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1248 vmov q14, q10 @ A[4][0..1]
1249 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1250
1251 vld1.64 d25, [r2:64]! @ Iota[i++]
1252 vbic d26, d22, d21
1253 vbic d27, d23, d22
1254 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1255 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1256 vbic d26, d24, d23
1257 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1258 vbic d27, d28, d24
1259 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1260 vbic d26, d29, d28
1261 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1262 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1263 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1264
1265 subs r3, r3, #1
1266 bne .Loop_neon
1267
1268 bx lr
1269.size KeccakF1600_neon,.-KeccakF1600_neon
1270
1271.global SHA3_absorb_neon
1272.type SHA3_absorb_neon, %function
1273.align 5
1274SHA3_absorb_neon:
1275 stmdb sp!, {r4-r6,lr}
1276 vstmdb sp!, {d8-d15}
1277
1278 mov r4, r1 @ inp
1279 mov r5, r2 @ len
1280 mov r6, r3 @ bsz
1281
1282 vld1.32 {d0}, [r0:64]! @ A[0][0]
1283 vld1.32 {d2}, [r0:64]! @ A[0][1]
1284 vld1.32 {d4}, [r0:64]! @ A[0][2]
1285 vld1.32 {d6}, [r0:64]! @ A[0][3]
1286 vld1.32 {d8}, [r0:64]! @ A[0][4]
1287
1288 vld1.32 {d1}, [r0:64]! @ A[1][0]
1289 vld1.32 {d3}, [r0:64]! @ A[1][1]
1290 vld1.32 {d5}, [r0:64]! @ A[1][2]
1291 vld1.32 {d7}, [r0:64]! @ A[1][3]
1292 vld1.32 {d9}, [r0:64]! @ A[1][4]
1293
1294 vld1.32 {d10}, [r0:64]! @ A[2][0]
1295 vld1.32 {d12}, [r0:64]! @ A[2][1]
1296 vld1.32 {d14}, [r0:64]! @ A[2][2]
1297 vld1.32 {d16}, [r0:64]! @ A[2][3]
1298 vld1.32 {d18}, [r0:64]! @ A[2][4]
1299
1300 vld1.32 {d11}, [r0:64]! @ A[3][0]
1301 vld1.32 {d13}, [r0:64]! @ A[3][1]
1302 vld1.32 {d15}, [r0:64]! @ A[3][2]
1303 vld1.32 {d17}, [r0:64]! @ A[3][3]
1304 vld1.32 {d19}, [r0:64]! @ A[3][4]
1305
1306 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1307 vld1.32 {d24}, [r0:64] @ A[4][4]
1308 sub r0, r0, #24*8 @ rewind
1309 b .Loop_absorb_neon
1310
1311.align 4
1312.Loop_absorb_neon:
1313 subs r12, r5, r6 @ len - bsz
1314 blo .Labsorbed_neon
1315 mov r5, r12
1316
1317 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1318 cmp r6, #8*2
1319 veor d0, d0, d31 @ A[0][0] ^= *inp++
1320 blo .Lprocess_neon
1321 vld1.8 {d31}, [r4]!
1322 veor d2, d2, d31 @ A[0][1] ^= *inp++
1323 beq .Lprocess_neon
1324 vld1.8 {d31}, [r4]!
1325 cmp r6, #8*4
1326 veor d4, d4, d31 @ A[0][2] ^= *inp++
1327 blo .Lprocess_neon
1328 vld1.8 {d31}, [r4]!
1329 veor d6, d6, d31 @ A[0][3] ^= *inp++
1330 beq .Lprocess_neon
1331 vld1.8 {d31},[r4]!
1332 cmp r6, #8*6
1333 veor d8, d8, d31 @ A[0][4] ^= *inp++
1334 blo .Lprocess_neon
1335
1336 vld1.8 {d31}, [r4]!
1337 veor d1, d1, d31 @ A[1][0] ^= *inp++
1338 beq .Lprocess_neon
1339 vld1.8 {d31}, [r4]!
1340 cmp r6, #8*8
1341 veor d3, d3, d31 @ A[1][1] ^= *inp++
1342 blo .Lprocess_neon
1343 vld1.8 {d31}, [r4]!
1344 veor d5, d5, d31 @ A[1][2] ^= *inp++
1345 beq .Lprocess_neon
1346 vld1.8 {d31}, [r4]!
1347 cmp r6, #8*10
1348 veor d7, d7, d31 @ A[1][3] ^= *inp++
1349 blo .Lprocess_neon
1350 vld1.8 {d31}, [r4]!
1351 veor d9, d9, d31 @ A[1][4] ^= *inp++
1352 beq .Lprocess_neon
1353
1354 vld1.8 {d31}, [r4]!
1355 cmp r6, #8*12
1356 veor d10, d10, d31 @ A[2][0] ^= *inp++
1357 blo .Lprocess_neon
1358 vld1.8 {d31}, [r4]!
1359 veor d12, d12, d31 @ A[2][1] ^= *inp++
1360 beq .Lprocess_neon
1361 vld1.8 {d31}, [r4]!
1362 cmp r6, #8*14
1363 veor d14, d14, d31 @ A[2][2] ^= *inp++
1364 blo .Lprocess_neon
1365 vld1.8 {d31}, [r4]!
1366 veor d16, d16, d31 @ A[2][3] ^= *inp++
1367 beq .Lprocess_neon
1368 vld1.8 {d31}, [r4]!
1369 cmp r6, #8*16
1370 veor d18, d18, d31 @ A[2][4] ^= *inp++
1371 blo .Lprocess_neon
1372
1373 vld1.8 {d31}, [r4]!
1374 veor d11, d11, d31 @ A[3][0] ^= *inp++
1375 beq .Lprocess_neon
1376 vld1.8 {d31}, [r4]!
1377 cmp r6, #8*18
1378 veor d13, d13, d31 @ A[3][1] ^= *inp++
1379 blo .Lprocess_neon
1380 vld1.8 {d31}, [r4]!
1381 veor d15, d15, d31 @ A[3][2] ^= *inp++
1382 beq .Lprocess_neon
1383 vld1.8 {d31}, [r4]!
1384 cmp r6, #8*20
1385 veor d17, d17, d31 @ A[3][3] ^= *inp++
1386 blo .Lprocess_neon
1387 vld1.8 {d31}, [r4]!
1388 veor d19, d19, d31 @ A[3][4] ^= *inp++
1389 beq .Lprocess_neon
1390
1391 vld1.8 {d31}, [r4]!
1392 cmp r6, #8*22
1393 veor d20, d20, d31 @ A[4][0] ^= *inp++
1394 blo .Lprocess_neon
1395 vld1.8 {d31}, [r4]!
1396 veor d21, d21, d31 @ A[4][1] ^= *inp++
1397 beq .Lprocess_neon
1398 vld1.8 {d31}, [r4]!
1399 cmp r6, #8*24
1400 veor d22, d22, d31 @ A[4][2] ^= *inp++
1401 blo .Lprocess_neon
1402 vld1.8 {d31}, [r4]!
1403 veor d23, d23, d31 @ A[4][3] ^= *inp++
1404 beq .Lprocess_neon
1405 vld1.8 {d31}, [r4]!
1406 veor d24, d24, d31 @ A[4][4] ^= *inp++
1407
1408.Lprocess_neon:
1409 bl KeccakF1600_neon
1410 b .Loop_absorb_neon
1411
1412.align 4
1413.Labsorbed_neon:
1414 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1415 vst1.32 {d2}, [r0:64]!
1416 vst1.32 {d4}, [r0:64]!
1417 vst1.32 {d6}, [r0:64]!
1418 vst1.32 {d8}, [r0:64]!
1419
1420 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1421 vst1.32 {d3}, [r0:64]!
1422 vst1.32 {d5}, [r0:64]!
1423 vst1.32 {d7}, [r0:64]!
1424 vst1.32 {d9}, [r0:64]!
1425
1426 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1427 vst1.32 {d12}, [r0:64]!
1428 vst1.32 {d14}, [r0:64]!
1429 vst1.32 {d16}, [r0:64]!
1430 vst1.32 {d18}, [r0:64]!
1431
1432 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1433 vst1.32 {d13}, [r0:64]!
1434 vst1.32 {d15}, [r0:64]!
1435 vst1.32 {d17}, [r0:64]!
1436 vst1.32 {d19}, [r0:64]!
1437
1438 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1439 vst1.32 {d24}, [r0:64]
1440
1441 mov r0, r5 @ return value
1442 vldmia sp!, {d8-d15}
1443 ldmia sp!, {r4-r6,pc}
1444.size SHA3_absorb_neon,.-SHA3_absorb_neon
1445
1446.global SHA3_squeeze_neon
1447.type SHA3_squeeze_neon, %function
1448.align 5
1449SHA3_squeeze_neon:
1450 stmdb sp!, {r4-r6,lr}
1451
1452 mov r4, r1 @ out
1453 mov r5, r2 @ len
1454 mov r6, r3 @ bsz
1455 mov r12, r0 @ A_flat
1456 mov r14, r3 @ bsz
1457 b .Loop_squeeze_neon
1458
1459.align 4
1460.Loop_squeeze_neon:
1461 cmp r5, #8
1462 blo .Lsqueeze_neon_tail
1463 vld1.32 {d0}, [r12]!
1464 vst1.8 {d0}, [r4]! @ endian-neutral store
1465
1466 subs r5, r5, #8 @ len -= 8
1467 beq .Lsqueeze_neon_done
1468
1469 subs r14, r14, #8 @ bsz -= 8
1470 bhi .Loop_squeeze_neon
1471
1472 vstmdb sp!, {d8-d15}
1473
1474 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1475 vld1.32 {d2}, [r0:64]!
1476 vld1.32 {d4}, [r0:64]!
1477 vld1.32 {d6}, [r0:64]!
1478 vld1.32 {d8}, [r0:64]!
1479
1480 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1481 vld1.32 {d3}, [r0:64]!
1482 vld1.32 {d5}, [r0:64]!
1483 vld1.32 {d7}, [r0:64]!
1484 vld1.32 {d9}, [r0:64]!
1485
1486 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1487 vld1.32 {d12}, [r0:64]!
1488 vld1.32 {d14}, [r0:64]!
1489 vld1.32 {d16}, [r0:64]!
1490 vld1.32 {d18}, [r0:64]!
1491
1492 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1493 vld1.32 {d13}, [r0:64]!
1494 vld1.32 {d15}, [r0:64]!
1495 vld1.32 {d17}, [r0:64]!
1496 vld1.32 {d19}, [r0:64]!
1497
1498 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1499 vld1.32 {d24}, [r0:64]
1500 sub r0, r0, #24*8 @ rewind
1501
1502 bl KeccakF1600_neon
1503
1504 mov r12, r0 @ A_flat
1505 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1506 vst1.32 {d2}, [r0:64]!
1507 vst1.32 {d4}, [r0:64]!
1508 vst1.32 {d6}, [r0:64]!
1509 vst1.32 {d8}, [r0:64]!
1510
1511 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1512 vst1.32 {d3}, [r0:64]!
1513 vst1.32 {d5}, [r0:64]!
1514 vst1.32 {d7}, [r0:64]!
1515 vst1.32 {d9}, [r0:64]!
1516
1517 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1518 vst1.32 {d12}, [r0:64]!
1519 vst1.32 {d14}, [r0:64]!
1520 vst1.32 {d16}, [r0:64]!
1521 vst1.32 {d18}, [r0:64]!
1522
1523 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1524 vst1.32 {d13}, [r0:64]!
1525 vst1.32 {d15}, [r0:64]!
1526 vst1.32 {d17}, [r0:64]!
1527 vst1.32 {d19}, [r0:64]!
1528
1529 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1530 mov r14, r6 @ bsz
1531 vst1.32 {d24}, [r0:64]
1532 mov r0, r12 @ rewind
1533
1534 vldmia sp!, {d8-d15}
1535 b .Loop_squeeze_neon
1536
1537.align 4
1538.Lsqueeze_neon_tail:
1539 ldmia r12, {r2,r3}
1540 cmp r5, #2
1541 strb r2, [r4],#1 @ endian-neutral store
1542 lsr r2, r2, #8
1543 blo .Lsqueeze_neon_done
1544 strb r2, [r4], #1
1545 lsr r2, r2, #8
1546 beq .Lsqueeze_neon_done
1547 strb r2, [r4], #1
1548 lsr r2, r2, #8
1549 cmp r5, #4
1550 blo .Lsqueeze_neon_done
1551 strb r2, [r4], #1
1552 beq .Lsqueeze_neon_done
1553
1554 strb r3, [r4], #1
1555 lsr r3, r3, #8
1556 cmp r5, #6
1557 blo .Lsqueeze_neon_done
1558 strb r3, [r4], #1
1559 lsr r3, r3, #8
1560 beq .Lsqueeze_neon_done
1561 strb r3, [r4], #1
1562
1563.Lsqueeze_neon_done:
1564 ldmia sp!, {r4-r6,pc}
1565.size SHA3_squeeze_neon,.-SHA3_squeeze_neon
e9afe7a1 1566#endif
367c5527
AP
1567.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1568.align 2
1569___
1570
e0584e96
AP
{
    # Register and effective address recorded by the most recent ".l"
    # (low half) pseudo-instruction, tracked separately for loads and
    # stores so the matching ".h" half can be fused into ldrd/strd.
    # The parentheses matter: "my %ldr, %str;" declares only %ldr
    # lexically and silently leaves %str a package global.
    my (%ldr, %str);

    # ldrd(MNEMONIC, HALF, REG, EA)
    #
    # Expand one "ldr.l", "ldr.h", "str.l" or "str.h" pseudo-instruction
    # into preprocessor-guarded assembly text.
    #
    #   MNEMONIC  "ldr" selects the load state; any other value selects
    #             the store state and is used verbatim as the mnemonic.
    #   HALF      "l" for the low half; any other value is treated as
    #             the high half.
    #   REG       register operand text, e.g. "r0".
    #   EA        effective-address operand text, e.g. "[sp,#0]".
    #
    # Returns the replacement text without a trailing newline.
    #
    # For the low half the register and address are remembered, and a
    # single-word access is emitted only when __thumb2__ is not defined.
    # For the high half the non-Thumb-2 branch is again a single-word
    # access, while the Thumb-2 branch is one doubleword access
    # (MNEMONIC . "d") built from the remembered low register, this
    # register and the remembered low address.
    sub ldrd {
        my ($mnemonic, $half, $reg, $ea) = @_;
        my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

        if ($half eq "l") {
            $$op{reg} = $reg;
            $$op{ea}  = $ea;
            return sprintf("#ifndef __thumb2__\n" .
                           " %s\t%s,%s\n" .
                           "#endif", $mnemonic, $reg, $ea);
        }

        return sprintf("#ifndef __thumb2__\n" .
                       " %s\t%s,%s\n" .
                       "#else\n" .
                       " %sd\t%s,%s,%s\n" .
                       "#endif", $mnemonic, $reg, $ea,
                                 $mnemonic, $$op{reg}, $reg, $$op{ea});
    }
}
1594
# Post-process the accumulated assembly text line by line and print it.
#
# First, every `...` span on a line is replaced by the result of
# eval-ing its contents.  Then at most one of the following rewrites is
# applied (the "or" chain stops at the first substitution that matches):
#
#   1. a leading-whitespace "ldr.l/.h" or "str.l/.h" pseudo-instruction
#      is expanded through ldrd();
#   2. "ror|lsr|lsl rX,...#" is turned into "mov rX,...<shift>#", e.g.
#      "lsr r1,r2,#8" becomes "mov r1,r2,lsr#8";
#   3. "ret" becomes "bx lr";
#   4. "bx lr" becomes the raw encoding ".word 0xe12fff1e".
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
	s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
	s/\bret\b/bx lr/g or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing flushes the buffered output, and that is where deferred write
# errors (full disk, broken pipe) surface.  Fail loudly instead of
# leaving a truncated assembly file behind with a success exit status.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush