]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/sha512-armv4.pl
Doc nits cleanup, round 2
[thirdparty/openssl.git] / crypto / sha / asm / sha512-armv4.pl
CommitLineData
1fa29843
AP
1#!/usr/bin/env perl
2
3# ====================================================================
f26328c2 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1fa29843
AP
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
b1a5d1c6
AP
8#
9# Permission to use under GPL terms is granted.
1fa29843
AP
10# ====================================================================
11
12# SHA512 block procedure for ARMv4. September 2007.
13
14# This code is ~4.5 (four and a half) times faster than code generated
2d22e080
AP
15# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
16# Xscale PXA250 core].
17#
18# July 2010.
19#
20# Rescheduling for dual-issue pipeline resulted in 6% improvement on
21# Cortex A8 core and ~40 cycles per processed byte.
1fa29843 22
1e863180
AP
23# February 2011.
24#
25# Profiler-assisted and platform-specific optimization resulted in 7%
26# improvement on Cortex A8 core and ~38 cycles per byte.
27
28# March 2011.
29#
30# Add NEON implementation. On Cortex A8 it was measured to process
482a7d80 31# one byte in 23.3 cycles or ~60% faster than integer-only code.
1e863180 32
f26328c2
AP
33# August 2012.
34#
35# Improve NEON performance by 12% on Snapdragon S4. In absolute
36# terms it's 22.6 cycles per byte, which is a disappointing result.
37# Technical writers asserted that 3-way S4 pipeline can sustain
38# multiple NEON instructions per cycle, but dual NEON issue could
e390ae50
AP
39# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
40# for further details. On a side note, Cortex-A15 processes one byte in
41# 16 cycles.
f26328c2 42
74eb3e09
AP
43# Byte order [in]dependence. =========================================
44#
1e863180
AP
45# Originally caller was expected to maintain specific *dword* order in
46# h[0-7], namely with most significant dword at *lower* address, which
47# was reflected in below two parameters as 0 and 4. Now caller is
48# expected to maintain native byte order for whole 64-bit values.
49$hi="HI";
50$lo="LO";
74eb3e09 51# ====================================================================
1fa29843 52
# Command-line handling.
#
# The first argument is the assembler "flavour" unless it already looks like
# a file name (something with an extension), in which case it is taken as the
# output file and no flavour is set.  Otherwise the remaining arguments are
# scanned until one that looks like a file name is found.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm/ and pipe
    # everything printed on STDOUT through it.  When $0 carries no directory
    # part, fall back to an empty prefix instead of reading a stale capture.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write the raw template to the output file.
    # Without an output name STDOUT is left alone, as before.
    open STDOUT,'>',$output
        or die "can't open $output: $!";
}
4c7c5ff6 67
1e863180 68$ctx="r0"; # parameter block
1fa29843
AP
69$inp="r1";
70$len="r2";
1e863180 71
1fa29843
AP
72$Tlo="r3";
73$Thi="r4";
74$Alo="r5";
75$Ahi="r6";
76$Elo="r7";
77$Ehi="r8";
78$t0="r9";
79$t1="r10";
80$t2="r11";
81$t3="r12";
82############ r13 is stack pointer
83$Ktbl="r14";
84############ r15 is program counter
85
86$Aoff=8*0;
87$Boff=8*1;
88$Coff=8*2;
89$Doff=8*3;
90$Eoff=8*4;
91$Foff=8*5;
92$Goff=8*6;
93$Hoff=8*7;
94$Xoff=8*8;
95
# BODY_00_15($magic)
#
# Appends the integer-only ARM code for one SHA-512 round to $code.
#
# On entry the generated code expects the round's 64-bit input word in
# $Tlo/$Thi; it is first stored into the X[] slot at [sp,#$Xoff].  The code
# then accumulates T += Sigma1(e) + h + Ch(e,f,g) + K[i], performs d += T,
# adds Sigma0(a) to T and finally forms Maj(a,b,c) + T in $Alo/$Ahi.
#
# $magic is an 8-bit immediate that is compared (teq) with the low byte of
# the low word of the K[i] constant just loaded.  On a match, bit 0 of $Ktbl
# is set (orreq) as an "end of loop" marker.  The final "tst $Ktbl,#1" leaves
# the flags for the caller; the following "add" does not set flags.  The
# callers visible in this file clear the marker again with "bic $Ktbl,$Ktbl,#1".
#
# Every round also moves sp down by 8 bytes ("sub sp,sp,#8") and advances
# $Ktbl by 8 bytes to the next constant.
#
# The sub is declared with an empty prototype; the callers in this file
# invoke it as &BODY_00_15(...), which bypasses the prototype.
96sub BODY_00_15() {
97my $magic = shift;
98$code.=<<___;
1fa29843
AP
99 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
100 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
101 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
102 mov $t0,$Elo,lsr#14
1e863180 103 str $Tlo,[sp,#$Xoff+0]
1fa29843 104 mov $t1,$Ehi,lsr#14
1e863180 105 str $Thi,[sp,#$Xoff+4]
1fa29843 106 eor $t0,$t0,$Ehi,lsl#18
1e863180 107 ldr $t2,[sp,#$Hoff+0] @ h.lo
1fa29843 108 eor $t1,$t1,$Elo,lsl#18
1e863180 109 ldr $t3,[sp,#$Hoff+4] @ h.hi
1fa29843
AP
110 eor $t0,$t0,$Elo,lsr#18
111 eor $t1,$t1,$Ehi,lsr#18
112 eor $t0,$t0,$Ehi,lsl#14
113 eor $t1,$t1,$Elo,lsl#14
114 eor $t0,$t0,$Ehi,lsr#9
115 eor $t1,$t1,$Elo,lsr#9
116 eor $t0,$t0,$Elo,lsl#23
117 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
118 adds $Tlo,$Tlo,$t0
1fa29843 119 ldr $t0,[sp,#$Foff+0] @ f.lo
2d22e080 120 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
1fa29843 121 ldr $t1,[sp,#$Foff+4] @ f.hi
2d22e080 122 adds $Tlo,$Tlo,$t2
1fa29843 123 ldr $t2,[sp,#$Goff+0] @ g.lo
2d22e080 124 adc $Thi,$Thi,$t3 @ T += h
1fa29843 125 ldr $t3,[sp,#$Goff+4] @ g.hi
1fa29843
AP
126
127 eor $t0,$t0,$t2
2d22e080 128 str $Elo,[sp,#$Eoff+0]
1fa29843 129 eor $t1,$t1,$t3
2d22e080 130 str $Ehi,[sp,#$Eoff+4]
1fa29843 131 and $t0,$t0,$Elo
2d22e080 132 str $Alo,[sp,#$Aoff+0]
1fa29843 133 and $t1,$t1,$Ehi
2d22e080 134 str $Ahi,[sp,#$Aoff+4]
1fa29843 135 eor $t0,$t0,$t2
1e863180 136 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
2d22e080 137 eor $t1,$t1,$t3 @ Ch(e,f,g)
1e863180 138 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
1fa29843
AP
139
140 adds $Tlo,$Tlo,$t0
2d22e080 141 ldr $Elo,[sp,#$Doff+0] @ d.lo
1fa29843 142 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
2d22e080 143 ldr $Ehi,[sp,#$Doff+4] @ d.hi
1fa29843 144 adds $Tlo,$Tlo,$t2
1e863180 145 and $t0,$t2,#0xff
1fa29843
AP
146 adc $Thi,$Thi,$t3 @ T += K[i]
147 adds $Elo,$Elo,$Tlo
1e863180 148 ldr $t2,[sp,#$Boff+0] @ b.lo
1fa29843 149 adc $Ehi,$Ehi,$Thi @ d += T
1fa29843 150 teq $t0,#$magic
1fa29843 151
b5e5760d 152 ldr $t3,[sp,#$Coff+0] @ c.lo
b1a5d1c6
AP
153#if __ARM_ARCH__>=7
154 it eq @ Thumb2 thing, sanity check in ARM
155#endif
1e863180 156 orreq $Ktbl,$Ktbl,#1
1fa29843
AP
157 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
158 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
159 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
160 mov $t0,$Alo,lsr#28
161 mov $t1,$Ahi,lsr#28
162 eor $t0,$t0,$Ahi,lsl#4
163 eor $t1,$t1,$Alo,lsl#4
164 eor $t0,$t0,$Ahi,lsr#2
165 eor $t1,$t1,$Alo,lsr#2
166 eor $t0,$t0,$Alo,lsl#30
167 eor $t1,$t1,$Ahi,lsl#30
168 eor $t0,$t0,$Ahi,lsr#7
169 eor $t1,$t1,$Alo,lsr#7
170 eor $t0,$t0,$Alo,lsl#25
171 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
172 adds $Tlo,$Tlo,$t0
1e863180 173 and $t0,$Alo,$t2
1fa29843
AP
174 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
175
1fa29843 176 ldr $t1,[sp,#$Boff+4] @ b.hi
1e863180 177 orr $Alo,$Alo,$t2
1fa29843
AP
178 ldr $t2,[sp,#$Coff+4] @ c.hi
179 and $Alo,$Alo,$t3
1fa29843
AP
180 and $t3,$Ahi,$t1
181 orr $Ahi,$Ahi,$t1
1e863180 182 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
1fa29843 183 and $Ahi,$Ahi,$t2
1fa29843 184 adds $Alo,$Alo,$Tlo
1e863180 185 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
1fa29843 186 sub sp,sp,#8
1e863180
AP
187 adc $Ahi,$Ahi,$Thi @ h += T
188 tst $Ktbl,#1
1fa29843
AP
189 add $Ktbl,$Ktbl,#8
190___
191}
192$code=<<___;
b1a5d1c6
AP
193#ifndef __KERNEL__
194# include "arm_arch.h"
195# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
196# define VFP_ABI_POP vldmia sp!,{d8-d15}
197#else
198# define __ARM_ARCH__ __LINUX_ARM_ARCH__
199# define __ARM_MAX_ARCH__ 7
200# define VFP_ABI_PUSH
201# define VFP_ABI_POP
202#endif
203
1e863180
AP
204#ifdef __ARMEL__
205# define LO 0
206# define HI 4
207# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
208#else
209# define HI 0
210# define LO 4
211# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
212#endif
213
1fa29843 214.text
a2859927 215#if defined(__thumb2__)
b1a5d1c6 216.syntax unified
b1a5d1c6 217.thumb
11208dcf
AP
218# define adrl adr
219#else
220.code 32
b1a5d1c6
AP
221#endif
222
1fa29843
AP
223.type K512,%object
224.align 5
225K512:
1e863180
AP
226WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
227WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
228WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
229WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
230WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
231WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
232WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
233WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
234WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
235WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
236WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
237WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
238WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
239WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
240WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
241WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
242WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
243WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
244WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
245WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
246WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
247WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
248WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
249WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
250WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
251WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
252WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
253WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
254WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
255WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
256WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
257WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
258WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
259WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
260WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
261WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
262WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
263WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
264WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
265WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
1fa29843 266.size K512,.-K512
b1a5d1c6 267#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 268.LOPENSSL_armcap:
313e6ec1 269.word OPENSSL_armcap_P-.Lsha512_block_data_order
1e863180 270.skip 32-4
c1669e1c
AP
271#else
272.skip 32
273#endif
1fa29843
AP
274
275.global sha512_block_data_order
276.type sha512_block_data_order,%function
277sha512_block_data_order:
313e6ec1 278.Lsha512_block_data_order:
11208dcf 279#if __ARM_ARCH__<7 && !defined(__thumb2__)
1fa29843 280 sub r3,pc,#8 @ sha512_block_data_order
b1a5d1c6 281#else
11208dcf 282 adr r3,.Lsha512_block_data_order
b1a5d1c6
AP
283#endif
284#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 285 ldr r12,.LOPENSSL_armcap
87873f43 286 ldr r12,[r3,r12] @ OPENSSL_armcap_P
313e6ec1
AP
287#ifdef __APPLE__
288 ldr r12,[r12]
289#endif
bdbd3aea 290 tst r12,#ARMV7_NEON
1e863180
AP
291 bne .LNEON
292#endif
b1a5d1c6 293 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
1fa29843 294 stmdb sp!,{r4-r12,lr}
1e863180 295 sub $Ktbl,r3,#672 @ K512
1fa29843
AP
296 sub sp,sp,#9*8
297
298 ldr $Elo,[$ctx,#$Eoff+$lo]
299 ldr $Ehi,[$ctx,#$Eoff+$hi]
300 ldr $t0, [$ctx,#$Goff+$lo]
301 ldr $t1, [$ctx,#$Goff+$hi]
302 ldr $t2, [$ctx,#$Hoff+$lo]
303 ldr $t3, [$ctx,#$Hoff+$hi]
304.Loop:
305 str $t0, [sp,#$Goff+0]
306 str $t1, [sp,#$Goff+4]
307 str $t2, [sp,#$Hoff+0]
308 str $t3, [sp,#$Hoff+4]
309 ldr $Alo,[$ctx,#$Aoff+$lo]
310 ldr $Ahi,[$ctx,#$Aoff+$hi]
311 ldr $Tlo,[$ctx,#$Boff+$lo]
312 ldr $Thi,[$ctx,#$Boff+$hi]
313 ldr $t0, [$ctx,#$Coff+$lo]
314 ldr $t1, [$ctx,#$Coff+$hi]
315 ldr $t2, [$ctx,#$Doff+$lo]
316 ldr $t3, [$ctx,#$Doff+$hi]
317 str $Tlo,[sp,#$Boff+0]
318 str $Thi,[sp,#$Boff+4]
319 str $t0, [sp,#$Coff+0]
320 str $t1, [sp,#$Coff+4]
321 str $t2, [sp,#$Doff+0]
322 str $t3, [sp,#$Doff+4]
323 ldr $Tlo,[$ctx,#$Foff+$lo]
324 ldr $Thi,[$ctx,#$Foff+$hi]
325 str $Tlo,[sp,#$Foff+0]
326 str $Thi,[sp,#$Foff+4]
327
328.L00_15:
1e863180 329#if __ARM_ARCH__<7
1fa29843
AP
330 ldrb $Tlo,[$inp,#7]
331 ldrb $t0, [$inp,#6]
332 ldrb $t1, [$inp,#5]
333 ldrb $t2, [$inp,#4]
334 ldrb $Thi,[$inp,#3]
335 ldrb $t3, [$inp,#2]
336 orr $Tlo,$Tlo,$t0,lsl#8
337 ldrb $t0, [$inp,#1]
338 orr $Tlo,$Tlo,$t1,lsl#16
339 ldrb $t1, [$inp],#8
340 orr $Tlo,$Tlo,$t2,lsl#24
341 orr $Thi,$Thi,$t3,lsl#8
342 orr $Thi,$Thi,$t0,lsl#16
343 orr $Thi,$Thi,$t1,lsl#24
1e863180
AP
344#else
345 ldr $Tlo,[$inp,#4]
346 ldr $Thi,[$inp],#8
347#ifdef __ARMEL__
348 rev $Tlo,$Tlo
349 rev $Thi,$Thi
350#endif
351#endif
1fa29843
AP
352___
353 &BODY_00_15(0x94);
354$code.=<<___;
355 tst $Ktbl,#1
356 beq .L00_15
1fa29843
AP
357 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
358 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
1e863180
AP
359 bic $Ktbl,$Ktbl,#1
360.L16_79:
1fa29843
AP
361 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
362 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
363 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
364 mov $Tlo,$t0,lsr#1
1e863180 365 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
1fa29843 366 mov $Thi,$t1,lsr#1
1e863180 367 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
1fa29843
AP
368 eor $Tlo,$Tlo,$t1,lsl#31
369 eor $Thi,$Thi,$t0,lsl#31
370 eor $Tlo,$Tlo,$t0,lsr#8
371 eor $Thi,$Thi,$t1,lsr#8
372 eor $Tlo,$Tlo,$t1,lsl#24
373 eor $Thi,$Thi,$t0,lsl#24
374 eor $Tlo,$Tlo,$t0,lsr#7
375 eor $Thi,$Thi,$t1,lsr#7
376 eor $Tlo,$Tlo,$t1,lsl#25
377
378 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
379 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
380 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
381 mov $t0,$t2,lsr#19
382 mov $t1,$t3,lsr#19
383 eor $t0,$t0,$t3,lsl#13
384 eor $t1,$t1,$t2,lsl#13
385 eor $t0,$t0,$t3,lsr#29
386 eor $t1,$t1,$t2,lsr#29
387 eor $t0,$t0,$t2,lsl#3
388 eor $t1,$t1,$t3,lsl#3
389 eor $t0,$t0,$t2,lsr#6
390 eor $t1,$t1,$t3,lsr#6
1e863180 391 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
1fa29843
AP
392 eor $t0,$t0,$t3,lsl#26
393
1fa29843
AP
394 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
395 adds $Tlo,$Tlo,$t0
1e863180 396 ldr $t0,[sp,#`$Xoff+8*16`+0]
1fa29843
AP
397 adc $Thi,$Thi,$t1
398
1fa29843
AP
399 ldr $t1,[sp,#`$Xoff+8*16`+4]
400 adds $Tlo,$Tlo,$t2
401 adc $Thi,$Thi,$t3
402 adds $Tlo,$Tlo,$t0
403 adc $Thi,$Thi,$t1
1fa29843
AP
404___
405 &BODY_00_15(0x17);
406$code.=<<___;
b1a5d1c6
AP
407#if __ARM_ARCH__>=7
408 ittt eq @ Thumb2 thing, sanity check in ARM
409#endif
1e863180
AP
410 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
411 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
1fa29843
AP
412 beq .L16_79
413 bic $Ktbl,$Ktbl,#1
414
415 ldr $Tlo,[sp,#$Boff+0]
416 ldr $Thi,[sp,#$Boff+4]
417 ldr $t0, [$ctx,#$Aoff+$lo]
418 ldr $t1, [$ctx,#$Aoff+$hi]
419 ldr $t2, [$ctx,#$Boff+$lo]
420 ldr $t3, [$ctx,#$Boff+$hi]
421 adds $t0,$Alo,$t0
1fa29843 422 str $t0, [$ctx,#$Aoff+$lo]
1e863180 423 adc $t1,$Ahi,$t1
1fa29843 424 str $t1, [$ctx,#$Aoff+$hi]
1e863180 425 adds $t2,$Tlo,$t2
1fa29843 426 str $t2, [$ctx,#$Boff+$lo]
1e863180 427 adc $t3,$Thi,$t3
1fa29843
AP
428 str $t3, [$ctx,#$Boff+$hi]
429
430 ldr $Alo,[sp,#$Coff+0]
431 ldr $Ahi,[sp,#$Coff+4]
432 ldr $Tlo,[sp,#$Doff+0]
433 ldr $Thi,[sp,#$Doff+4]
434 ldr $t0, [$ctx,#$Coff+$lo]
435 ldr $t1, [$ctx,#$Coff+$hi]
436 ldr $t2, [$ctx,#$Doff+$lo]
437 ldr $t3, [$ctx,#$Doff+$hi]
438 adds $t0,$Alo,$t0
1fa29843 439 str $t0, [$ctx,#$Coff+$lo]
1e863180 440 adc $t1,$Ahi,$t1
1fa29843 441 str $t1, [$ctx,#$Coff+$hi]
1e863180 442 adds $t2,$Tlo,$t2
1fa29843 443 str $t2, [$ctx,#$Doff+$lo]
1e863180 444 adc $t3,$Thi,$t3
1fa29843
AP
445 str $t3, [$ctx,#$Doff+$hi]
446
447 ldr $Tlo,[sp,#$Foff+0]
448 ldr $Thi,[sp,#$Foff+4]
449 ldr $t0, [$ctx,#$Eoff+$lo]
450 ldr $t1, [$ctx,#$Eoff+$hi]
451 ldr $t2, [$ctx,#$Foff+$lo]
452 ldr $t3, [$ctx,#$Foff+$hi]
453 adds $Elo,$Elo,$t0
1fa29843 454 str $Elo,[$ctx,#$Eoff+$lo]
1e863180 455 adc $Ehi,$Ehi,$t1
1fa29843 456 str $Ehi,[$ctx,#$Eoff+$hi]
1e863180 457 adds $t2,$Tlo,$t2
1fa29843 458 str $t2, [$ctx,#$Foff+$lo]
1e863180 459 adc $t3,$Thi,$t3
1fa29843
AP
460 str $t3, [$ctx,#$Foff+$hi]
461
462 ldr $Alo,[sp,#$Goff+0]
463 ldr $Ahi,[sp,#$Goff+4]
464 ldr $Tlo,[sp,#$Hoff+0]
465 ldr $Thi,[sp,#$Hoff+4]
466 ldr $t0, [$ctx,#$Goff+$lo]
467 ldr $t1, [$ctx,#$Goff+$hi]
468 ldr $t2, [$ctx,#$Hoff+$lo]
469 ldr $t3, [$ctx,#$Hoff+$hi]
470 adds $t0,$Alo,$t0
1fa29843 471 str $t0, [$ctx,#$Goff+$lo]
1e863180 472 adc $t1,$Ahi,$t1
1fa29843 473 str $t1, [$ctx,#$Goff+$hi]
1e863180 474 adds $t2,$Tlo,$t2
1fa29843 475 str $t2, [$ctx,#$Hoff+$lo]
1e863180 476 adc $t3,$Thi,$t3
1fa29843
AP
477 str $t3, [$ctx,#$Hoff+$hi]
478
479 add sp,sp,#640
480 sub $Ktbl,$Ktbl,#640
481
482 teq $inp,$len
483 bne .Loop
484
485 add sp,sp,#8*9 @ destroy frame
1e863180
AP
486#if __ARM_ARCH__>=5
487 ldmia sp!,{r4-r12,pc}
488#else
1fa29843
AP
489 ldmia sp!,{r4-r12,lr}
490 tst lr,#1
491 moveq pc,lr @ be binary compatible with V4, yet
492 bx lr @ interoperable with Thumb ISA:-)
1e863180 493#endif
b1a5d1c6 494.size sha512_block_data_order,.-sha512_block_data_order
1e863180
AP
495___
496
497{
498my @Sigma0=(28,34,39);
499my @Sigma1=(14,18,41);
500my @sigma0=(1, 8, 7);
501my @sigma1=(19,61,6);
502
503my $Ktbl="r3";
504my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
505
506my @X=map("d$_",(0..15));
507my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
508
# NEON_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the NEON code for SHA-512 round $i to $code.  The eight arguments
# after $i are the names of the D registers currently playing the roles of
# the working variables a..h.
#
# Temporaries are taken from d24 upwards: $t0=d24, $t1=d25, $t2=d26, $T1=d27,
# $K=d28, $Ch=d29, $Maj=d30 (the map yields d31 as well, but no name is
# bound to it).
#
# The first template (the three Sigma1 right-shifts, the input load for
# $i<16 and the deferred "h+=Maj from the past" addition) is emitted only
# when $i<16 or $i is odd; for even rounds >= 16 NEON_16_79 emits the
# equivalent instructions itself before calling this sub.
#
# "#if $i<16" and "#if $i>0" are interpolated by Perl, so the emitted text
# contains a literal number in place of $i.
#
# The last addition of Maj into $h is left commented out in the template
# ("@ vadd.i64 $h,$Maj"); it is performed at the start of the next round.
509sub NEON_00_15() {
510my $i=shift;
511my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
512my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
513
514$code.=<<___ if ($i<16 || $i&1);
515 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
516#if $i<16
517 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
518#endif
519 vshr.u64 $t1,$e,#@Sigma1[1]
f26328c2
AP
520#if $i>0
521 vadd.i64 $a,$Maj @ h+=Maj from the past
522#endif
1e863180
AP
523 vshr.u64 $t2,$e,#@Sigma1[2]
524___
525$code.=<<___;
526 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
527 vsli.64 $t0,$e,#`64-@Sigma1[0]`
528 vsli.64 $t1,$e,#`64-@Sigma1[1]`
482a7d80 529 vmov $Ch,$e
1e863180
AP
530 vsli.64 $t2,$e,#`64-@Sigma1[2]`
531#if $i<16 && defined(__ARMEL__)
532 vrev64.8 @X[$i],@X[$i]
533#endif
482a7d80 534 veor $t1,$t0
f26328c2 535 vbsl $Ch,$f,$g @ Ch(e,f,g)
1e863180 536 vshr.u64 $t0,$a,#@Sigma0[0]
482a7d80 537 veor $t2,$t1 @ Sigma1(e)
f26328c2 538 vadd.i64 $T1,$Ch,$h
1e863180 539 vshr.u64 $t1,$a,#@Sigma0[1]
1e863180 540 vsli.64 $t0,$a,#`64-@Sigma0[0]`
f26328c2
AP
541 vadd.i64 $T1,$t2
542 vshr.u64 $t2,$a,#@Sigma0[2]
543 vadd.i64 $K,@X[$i%16]
1e863180 544 vsli.64 $t1,$a,#`64-@Sigma0[1]`
f26328c2 545 veor $Maj,$a,$b
1e863180 546 vsli.64 $t2,$a,#`64-@Sigma0[2]`
1e863180 547 veor $h,$t0,$t1
f26328c2 548 vadd.i64 $T1,$K
482a7d80 549 vbsl $Maj,$c,$b @ Maj(a,b,c)
f26328c2 550 veor $h,$t2 @ Sigma0(a)
1e863180 551 vadd.i64 $d,$T1
f26328c2
AP
552 vadd.i64 $Maj,$T1
553 @ vadd.i64 $h,$Maj
1e863180
AP
554___
555}
556
# NEON_16_79($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the NEON code for a round with message-schedule update to $code.
#
# Odd $i: no schedule work is emitted; the call is forwarded unchanged to
# NEON_00_15 and the sub returns.
#
# Even $i: the schedule is computed two words at a time, with X[] viewed as
# the 128-bit registers q0..q7 and $i halved to index that view.  The
# template applies sigma1 to X[($i+7)%8] and sigma0 to the vext'ed "X[i+1]"
# pair, and adds both plus the vext'ed "X[i+9]" pair into X[$i%8].
# Interleaved with that it emits the instructions NEON_00_15 skips for even
# rounds >= 16: the "h+=Maj from the past" addition (d30 is hard-coded here
# and is the register NEON_00_15 binds to $Maj) and the three Sigma1
# right-shifts of $e into d24..d26.  Finally NEON_00_15 is called with the
# original round number (2*$i) to emit the rest of the round.
#
# $e is picked out of the argument list as @_[4], a one-element slice that
# yields the same value as $_[4].
557sub NEON_16_79() {
558my $i=shift;
559
560if ($i&1) { &NEON_00_15($i,@_); return; }
561
562# 2x-vectorized, therefore runs every 2nd round
563my @X=map("q$_",(0..7)); # view @X as 128-bit vector
564my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
565my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
566my $e=@_[4]; # $e from NEON_00_15
567$i /= 2;
568$code.=<<___;
569 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
570 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
f26328c2 571 vadd.i64 @_[0],d30 @ h+=Maj from the past
1e863180
AP
572 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
573 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
574 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
575 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
576 veor $s1,$t0
577 vshr.u64 $t0,$s0,#@sigma0[0]
578 veor $s1,$t1 @ sigma1(X[i+14])
579 vshr.u64 $t1,$s0,#@sigma0[1]
580 vadd.i64 @X[$i%8],$s1
581 vshr.u64 $s1,$s0,#@sigma0[2]
582 vsli.64 $t0,$s0,#`64-@sigma0[0]`
583 vsli.64 $t1,$s0,#`64-@sigma0[1]`
584 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
585 veor $s1,$t0
586 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
587 vadd.i64 @X[$i%8],$s0
588 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
589 veor $s1,$t1 @ sigma0(X[i+1])
590 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
591 vadd.i64 @X[$i%8],$s1
592___
593 &NEON_00_15(2*$i,@_);
594}
595
596$code.=<<___;
c1669e1c
AP
597#if __ARM_MAX_ARCH__>=7
598.arch armv7-a
1e863180
AP
599.fpu neon
600
b1a5d1c6
AP
601.global sha512_block_data_order_neon
602.type sha512_block_data_order_neon,%function
1e863180 603.align 4
b1a5d1c6 604sha512_block_data_order_neon:
1e863180
AP
605.LNEON:
606 dmb @ errata #451034 on early Cortex A8
b1a5d1c6 607 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
313e6ec1 608 adr $Ktbl,K512
b1a5d1c6 609 VFP_ABI_PUSH
1e863180
AP
610 vldmia $ctx,{$A-$H} @ load context
611.Loop_neon:
612___
613for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
614$code.=<<___;
615 mov $cnt,#4
616.L16_79_neon:
617 subs $cnt,#1
618___
619for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
620$code.=<<___;
621 bne .L16_79_neon
622
f26328c2 623 vadd.i64 $A,d30 @ h+=Maj from the past
1e863180
AP
624 vldmia $ctx,{d24-d31} @ load context to temp
625 vadd.i64 q8,q12 @ vectorized accumulate
626 vadd.i64 q9,q13
627 vadd.i64 q10,q14
628 vadd.i64 q11,q15
629 vstmia $ctx,{$A-$H} @ save context
630 teq $inp,$len
631 sub $Ktbl,#640 @ rewind K512
632 bne .Loop_neon
633
b1a5d1c6 634 VFP_ABI_POP
5dcf70a1 635 ret @ bx lr
b1a5d1c6 636.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
1e863180
AP
637#endif
638___
639}
640$code.=<<___;
1e863180 641.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
97a6a01f 642.align 2
b1a5d1c6 643#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
87873f43 644.comm OPENSSL_armcap_P,4,4
c1669e1c 645#endif
1fa29843
AP
646___
647
# Post-process the accumulated assembly template.
#
# 1. Replace every `...` span with the result of evaluating its contents as
#    a Perl expression (used for computed offsets and shift counts).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Encode "bx lr" as a literal opcode word.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
# 3. Only now expand the "ret" pseudo-instruction, so that it stays a real
#    "bx lr" and is not turned into the opcode word by step 2.
$code =~ s/\bret\b/bx lr/gm;

# Copy this script's leading comment block into the output, turning the Perl
# comment marker '#' into the assembler comment marker '@'.  The shebang line
# is skipped and copying stops at the first line that is neither a comment
# nor empty.  This is best-effort: if the script cannot be re-opened the
# header is simply omitted.
if (open my $self,'<',$0) {
    while (my $line = <$self>) {
        next if ($line =~ /^#!/);
        last if (!($line =~ s/^#/@/) and !($line =~ /^$/));
        print $line;
    }
    close $self;
}

print $code;
# Closing flushes the output; a failure here (full disk, or the arm-xlate.pl
# pipe exiting non-zero) must not go unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";