#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
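
# Note: $hi and $lo expand to the literal tokens HI and LO, which the
# #ifdef __ARMEL__ block emitted further down maps to word offsets 4/0
# on little-endian targets and 0/4 on big-endian ones, so each 64-bit
# state word is addressed in native byte order. A minimal stand-alone
# illustration of the little-endian case, assuming a perl built with
# 64-bit integer support (not used by this module):
#
#	my @w = unpack("V2", pack("Q<", 0x0123456789abcdef));
#	# @w is (0x89abcdef, 0x01234567): low word at offset 0, i.e. LO=0, HI=4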

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

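# BODY_00_15 emits one integer-only SHA-512 round: every 64-bit
# quantity lives in a lo/hi pair of 32-bit registers or stack words
# (at the $Aoff..$Xoff offsets above), and each 64-bit rotation is
# built from two 32-bit shifts whose complementary halves are folded
# in with eor, as the LO/HI comment lines inside the routine spell out.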
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
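
# $magic is the least significant byte of the last K[i].lo in the
# current phase: 0x94 for K[15] and 0x17 for K[79] (see the K512 table
# below). BODY_00_15 compares it against K[i].lo&0xff and, on match,
# sets bit 0 of $Ktbl; the loop epilogues test that bit (tst $Ktbl,#1)
# to decide when to leave .L00_15 and .L16_79.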
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

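# NEON_00_15 emits one round of the NEON path: the eight state words
# live in d16-d23, each 64-bit rotation is a vshr.u64/vsli.64 pair,
# and Ch/Maj are formed with vbsl. Maj is not added to h here but at
# the start of the following round ("h+=Maj from the past"), as the
# inline comments note.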
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

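# NEON_16_79 interleaves the message-schedule update with the rounds:
# @X is re-viewed as eight 128-bit q registers, so one schedule step
# yields two 64-bit words and only runs on every second round, while
# odd rounds fall straight through to NEON_00_15.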
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adrl	$Ktbl,K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640		@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

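# Post-process the accumulated assembly: evaluate the backtick-quoted
# Perl expressions embedded above, then lower "bx lr" to its raw
# encoding for -march=armv4 and spell "ret" as "bx lr" (in that order,
# so a pre-existing "bx lr" gets encoded while the rewritten "ret"
# remains symbolic).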
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

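# Reproduce this script's leading comment block (everything up to the
# first line that is neither a "#" comment nor empty, minus the
# shebang) at the top of the output, converting "#" to the assembler's
# "@" comment character.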
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT;	# enforce flush