#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited to 1 :-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# find yourself striving, as I did here, to achieve an IPC
# comparable to the one delivered by Cortex A8 [for reference, it's
# 0.5 for ILP of 1, and 1 for higher ILPs].
#
# (*) ILP, instruction-level parallelism, how many instructions
#     *can* execute at the same time. IPC, instructions per cycle,
#     indicates how many instructions actually execute.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below
# as 0 and 4. Now the caller is expected to maintain native byte
# order for whole 64-bit values.
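#
# Illustrative example (for exposition only, not used by the code):
# take h[0] = 0x6a09e667f3bcc908 stored at address p. A little-endian
# caller keeps 0xf3bcc908 at p+0 and 0x6a09e667 at p+4, so LO=0 and
# HI=4; a big-endian caller stores the halves the other way around,
# so HI=0 and LO=4. The loads and stores below then use
# [$ctx,#offset+$lo] and [$ctx,#offset+$hi] to pick the correct
# 32-bit half either way.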
54$hi="HI";
55$lo="LO";
74eb3e09 56# ====================================================================
1fa29843 57
396df731 58while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
4c7c5ff6
AP
59open STDOUT,">$output";
60
1e863180 61$ctx="r0"; # parameter block
1fa29843
AP
62$inp="r1";
63$len="r2";
1e863180 64
1fa29843
AP
65$Tlo="r3";
66$Thi="r4";
67$Alo="r5";
68$Ahi="r6";
69$Elo="r7";
70$Ehi="r8";
71$t0="r9";
72$t1="r10";
73$t2="r11";
74$t3="r12";
75############ r13 is stack pointer
76$Ktbl="r14";
77############ r15 is program counter
78
79$Aoff=8*0;
80$Boff=8*1;
81$Coff=8*2;
82$Doff=8*3;
83$Eoff=8*4;
84$Foff=8*5;
85$Goff=8*6;
86$Hoff=8*7;
87$Xoff=8*8;
88
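# Illustrative reference (not part of the emitted code): on entry to
# BODY_00_15 the register pair $Thi:$Tlo holds X[i], and each call
# computes one round as
#
#	T += h + Sigma1(e) + Ch(e,f,g) + K[i]
#	d += T
#	h  = T + Sigma0(a) + Maj(a,b,c)
#
# with every 64-bit quantity split into a lo/hi pair of 32-bit
# registers (adds/adc for carry propagation). Only a and e are kept
# in registers; the other six state words and the X[] window live on
# the stack at the offsets defined above.
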
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
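	@ (Each 64-bit rotate is split across the two 32-bit halves; the
	@ rotate by 41 = 32+9 effectively swaps lo and hi first, hence
	@ the hi>>9 and lo<<23 terms above.)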
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
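	@ (The rotates by 34 = 32+2 and 39 = 32+7 exceed 32 bits, so their
	@ lo/hi contributions swap relative to the rotate by 28.)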
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
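@ (The word above is the link-time distance from sha512_block_data_order
@ to OPENSSL_armcap_P; the entry code adds it to the function address
@ held in r3 to locate the capability flags at run time.)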
.skip	32-4

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
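	@ (Pre-ARMv7 path: assemble the big-endian 64-bit word one byte at
	@ a time, so unaligned input is handled without unaligned ldr.)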
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
	@ LO		lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
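	@ ((x)>>7 is a plain shift, not a rotate, so its only contribution
	@ to the high half is hi>>7, with no bits wrapping in from lo.)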
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
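	@ (The rotate by 61 = 32+29 swaps the halves, and (x)>>6 again
	@ leaves just hi>>6 in the high half.)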
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

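# NEON register allocation (as set up above): the 16-entry message
# schedule X[] lives in d0-d15, the eight state words a-h in d16-d23,
# and d24-d31 serve as temporaries; see the "temps" maps in the subs
# below.
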
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
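# (Even-numbered rounds compute two new message-schedule words at once
# with 128-bit q-register arithmetic, interleaving the opening Sigma1
# shifts of the accompanying round, marked "@ from NEON_00_15"; odd
# rounds, handled above, simply fall through to NEON_00_15.)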
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30		@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	bx	lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush