# Provenance: crypto/sha/asm/sha512-armv4.pl from the OpenSSL project
# (thirdparty/openssl.git; related change: "sha/asm/sha256-armv4.pl: fix
# compile issue in kernel").
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
# $hi/$lo are emitted as the assembler macros HI/LO, which the
# generated code defines as 0/4 or 4/0 depending on endianness.
$hi="HI";
$lo="LO";
# ====================================================================
# Pick the first command-line argument that looks like an output file
# name and redirect STDOUT to it; remaining (flag-like) arguments are
# skipped.  Die loudly if the file cannot be opened instead of silently
# generating no output (original used unchecked 2-arg open).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if (defined($output)) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Register allocation for the integer-only code path.
$ctx="r0";	# parameter block (hash state, h[0..7] as 64-bit values)
$inp="r1";	# input pointer
$len="r2";	# number of 128-byte blocks

$Tlo="r3";	# T accumulator, low word
$Thi="r4";	# T accumulator, high word
$Alo="r5";	# working variable a, low word
$Ahi="r6";	# working variable a, high word
$Elo="r7";	# working variable e, low word
$Ehi="r8";	# working variable e, high word
$t0="r9";	# scratch
$t1="r10";	# scratch
$t2="r11";	# scratch
$t3="r12";	# scratch
############	r13 is stack pointer
$Ktbl="r14";	# pointer into K512 constant table
############	r15 is program counter

# Byte offsets of the working variables a..h and the message schedule X
# within the stack frame (each entry is a 64-bit value).
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
# Emit assembly for one SHA-512 round on the 32-bit integer pipeline,
# appending to the global $code.  Each 64-bit operation is synthesized
# from lo/hi 32-bit register pairs (adds/adc, paired eor/shift).
#
# $magic is compared against the low byte of K[i].lo to detect the end
# of a round range: 0x94 matches K[15] (..cf692694) and 0x17 matches
# K[79] (..4a475817), see the K512 table.  On match, bit 0 of $Ktbl is
# set via orreq; the emitted "tst $Ktbl,#1" lets the caller's loop
# terminate on that flag.
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
# Emit the file header (endianness macros, K512 constant table) and the
# integer-only sha512_block_data_order entry point.  The 80 rounds are
# generated by two emitted loops (.L00_15 and .L16_79), each containing
# one BODY_00_15 expansion; loop exit is flagged via bit 0 of $Ktbl
# (see BODY_00_15).
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
# NEON implementation.  Scoped in a bare block so $Ktbl and the d-/q-
# register maps shadow the integer-path globals without leaking.
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

# Emit one NEON SHA-512 round (rounds 0..15, and every odd round of
# 16..79).  The Maj contribution is deferred: it is added to what has
# become $a on the *next* round ("h+=Maj from the past"), so the final
# accumulation after the loop must add d30 ($Maj) once more.
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

# Emit the message-schedule update plus a round for rounds 16..79.
# Odd rounds reduce to NEON_00_15; even rounds compute two schedule
# entries at once (2x-vectorized over q-registers) and then fall
# through to NEON_00_15.
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	ret				@ bx lr
#endif
___
}
589$code.=<<___;
590.size sha512_block_data_order,.-sha512_block_data_order
591.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
97a6a01f 592.align 2
c1669e1c 593#if __ARM_MAX_ARCH__>=7
87873f43 594.comm OPENSSL_armcap_P,4,4
c1669e1c 595#endif
1fa29843
AP
596___
597
598$code =~ s/\`([^\`]*)\`/eval $1/gem;
7722e53f 599$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
5dcf70a1 600$code =~ s/\bret\b/bx lr/gm;
1fa29843 601print $code;
4c7c5ff6 602close STDOUT; # enforce flush