#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than the code
# generated by gcc 3.4 and spends ~72 clock cycles per byte [on a
# single-issue Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 6% improvement
# on the Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited to 1 :-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# even find yourself striving, as I did here, to achieve an IPC
# matching the one delivered by Cortex A8 [for reference, it's
# 0.5 for an ILP of 1, and 1 for higher ILPs].
#
# (*)	ILP, instruction-level parallelism, is how many instructions
#	*can* execute at the same time; IPC, instructions per cycle,
#	is how many instructions actually execute.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower*
# address, which was reflected in the two parameters below as 0 and 4.
# Now the caller is expected to maintain native byte order for whole
# 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
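#
# Worked example (illustrative only): on little-endian ARM the macros
# emitted below resolve to LO=0/HI=4, so a 64-bit h[0] of value
# 0x0123456789abcdef stored in native order keeps its low word 0x89abcdef
# at byte offset LO and its high word 0x01234567 at byte offset HI; on
# big-endian the two offsets simply swap.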

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

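# The offsets above describe the scratch frame kept below sp in the
# integer-only path: copies of a..h at $Aoff..$Hoff and the expanded
# message schedule starting at $Xoff, with sp sliding down 8 bytes per
# round so that older X[] entries stay addressable at fixed offsets.
#
# A minimal reference sketch of the SHA-512 Sigma/sigma functions in the
# same 32-bit half arithmetic that the hand-scheduled code in BODY_00_15
# and .L16_79 mimics. These helpers are illustrative only (hypothetical
# names, never called by the generator) and assume a 64-bit perl.
sub ref_rotr64 {			# ROTR of (hi,lo) by 0<r<64
my ($hi,$lo,$r)=@_;
   ($hi,$lo,$r)=($lo,$hi,$r-32) if ($r>=32);
my $m=0xffffffff;
   ((($hi>>$r)|($lo<<(32-$r)))&$m, (($lo>>$r)|($hi<<(32-$r)))&$m);
}
sub ref_Sigma1 {			# ROTR(x,14)^ROTR(x,18)^ROTR(x,41)
my @a=ref_rotr64(@_,14); my @b=ref_rotr64(@_,18); my @c=ref_rotr64(@_,41);
   ($a[0]^$b[0]^$c[0], $a[1]^$b[1]^$c[1]);
}
sub ref_sigma0 {			# ROTR(x,1)^ROTR(x,8)^(x>>7); the last
my ($hi,$lo)=@_;			# term is a shift, not a rotation
my @a=ref_rotr64($hi,$lo,1); my @b=ref_rotr64($hi,$lo,8);
my @c=($hi>>7, (($lo>>7)|($hi<<25))&0xffffffff);
   ($a[0]^$b[0]^$c[0], $a[1]^$b[1]^$c[1]);
}
# Sigma0 and sigma1 are analogous, with rotation counts (28,34,39) and
# (19,61) plus a plain >>6 respectively.
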
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
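	@ (note: >>7 is a plain shift, not a rotation, so the HI half has no <<25 cross term)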
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
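	@ (likewise the >>6 term is a plain shift, hence no <<26 cross term in the HI half)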
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
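# (each even call below advances the message schedule by two 64-bit words
# at once, viewing the d-register pairs as the q registers declared next,
# and then falls through to NEON_00_15 for the even-numbered round itself)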
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	ret				@ bx lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
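# note: this runs after the "bx lr" substitution above, so the NEON path's
# "ret" remains a literal "bx lr", which is fine since that path is only
# reached on ARMv7/NEON-capable cores.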
print $code;
close STDOUT;	# enforce flush