]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/sha512-armv4.pl
Following the license change, modify the boilerplates in crypto/seed/
[thirdparty/openssl.git] / crypto / sha / asm / sha512-armv4.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
1212818e 2# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e
RS
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
1fa29843
AP
9
10# ====================================================================
f26328c2 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1fa29843
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
b1a5d1c6
AP
15#
16# Permission to use under GPL terms is granted.
1fa29843
AP
17# ====================================================================
18
19# SHA512 block procedure for ARMv4. September 2007.
20
21# This code is ~4.5 (four and a half) times faster than code generated
2d22e080
AP
22# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23# Xscale PXA250 core].
24#
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 6% improvement on
28# Cortex A8 core and ~40 cycles per processed byte.
1fa29843 29
1e863180
AP
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 7%
33# improvement on Cortex A8 core and ~38 cycles per byte.
34
35# March 2011.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process
482a7d80 38# one byte in 23.3 cycles or ~60% faster than integer-only code.
1e863180 39
f26328c2
AP
40# August 2012.
41#
42# Improve NEON performance by 12% on Snapdragon S4. In absolute
43# terms it's 22.6 cycles per byte, which is a disappointing result.
44# Technical writers asserted that 3-way S4 pipeline can sustain
45# multiple NEON instructions per cycle, but dual NEON issue could
e390ae50
AP
46# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
47# for further details. As a side note, Cortex-A15 processes one byte in
48# 16 cycles.
f26328c2 49
74eb3e09
AP
50# Byte order [in]dependence. =========================================
51#
1e863180
AP
52# Originally caller was expected to maintain specific *dword* order in
53# h[0-7], namely with most significant dword at *lower* address, which
54# was reflected in below two parameters as 0 and 4. Now caller is
55# expected to maintain native byte order for whole 64-bit values.
56$hi="HI";
57$lo="LO";
74eb3e09 58# ====================================================================
1fa29843 59
# Command line: [flavour] [...] [output-file]
#
# The first argument is taken as the output file if it looks like a file
# name (has an extension); otherwise it is the assembler flavour and the
# following arguments are scanned until one that looks like a file name.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm and pipe
    # everything printed to STDOUT through it.  Take the directory only if
    # the match succeeded, so a stale $1 can never leak in.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Quote the output name so that paths containing blanks survive the
    # shell, and fail loudly if the translator cannot be started.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} elsif (defined $output && $output ne "") {
    # No translation requested: write straight to the output file.
    open STDOUT,'>',$output
        or die "can't open $output: $!";
}
# With no output file named, the generated code goes to the inherited STDOUT.
4c7c5ff6 74
# Register allocation for the integer-only code path.
# r0-r2 carry the arguments: the context block, the input pointer and the
# length (the generated prologue turns $len into an end-of-input pointer
# with "add $len,$inp,$len,lsl#7").
# $Tlo/$Thi hold the 64-bit temporary T as a lo/hi register pair;
# $Alo/$Ahi and $Elo/$Ehi hold working variables a and e, while the other
# working variables are kept in the stack frame.  $t0-$t3 are scratch.
1e863180 75$ctx="r0"; # parameter block
1fa29843
AP
76$inp="r1";
77$len="r2";
1e863180 78
1fa29843
AP
79$Tlo="r3";
80$Thi="r4";
81$Alo="r5";
82$Ahi="r6";
83$Elo="r7";
84$Ehi="r8";
85$t0="r9";
86$t1="r10";
87$t2="r11";
88$t3="r12";
89############ r13 is stack pointer
90$Ktbl="r14";
91############ r15 is program counter
92
# Byte offsets of the eight 64-bit working variables a..h.  They are used
# both relative to $ctx and relative to sp (the on-stack copy); $Xoff is
# the offset at which the X[] entries start in the stack frame.
93$Aoff=8*0;
94$Boff=8*1;
95$Coff=8*2;
96$Doff=8*3;
97$Eoff=8*4;
98$Foff=8*5;
99$Goff=8*6;
100$Hoff=8*7;
101$Xoff=8*8;
102
# BODY_00_15($magic) - append the integer-only ARM code for one SHA-512
# round to $code.  It is called once for the rounds-0..15 loop body and
# once for the rounds-16..79 loop body.
#
# On entry to the emitted code $Tlo/$Thi hold the message word for this
# round; it is stored at [sp,#$Xoff].  The code then computes
#   T += Sigma1(e) + h + Ch(e,f,g) + K[i];  d += T;
#   a' = Maj(a,b,c) + Sigma0(a) + T
# leaving the new a in $Alo/$Ahi and d+T in $Elo/$Ehi, moves sp down by 8
# and advances $Ktbl by 8.
#
# $magic is compared (teq) with the low byte of K[i].lo; on a match bit 0
# of $Ktbl is set as an end-of-loop flag.  The emitted code ends with
# "tst $Ktbl,#1" followed by a non-flag-setting "add", so the caller's
# conditional instructions can act on that test.
103sub BODY_00_15() {
104my $magic = shift;
105$code.=<<___;
1fa29843
AP
106 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
107 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
108 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
109 mov $t0,$Elo,lsr#14
1e863180 110 str $Tlo,[sp,#$Xoff+0]
1fa29843 111 mov $t1,$Ehi,lsr#14
1e863180 112 str $Thi,[sp,#$Xoff+4]
1fa29843 113 eor $t0,$t0,$Ehi,lsl#18
1e863180 114 ldr $t2,[sp,#$Hoff+0] @ h.lo
1fa29843 115 eor $t1,$t1,$Elo,lsl#18
1e863180 116 ldr $t3,[sp,#$Hoff+4] @ h.hi
1fa29843
AP
117 eor $t0,$t0,$Elo,lsr#18
118 eor $t1,$t1,$Ehi,lsr#18
119 eor $t0,$t0,$Ehi,lsl#14
120 eor $t1,$t1,$Elo,lsl#14
121 eor $t0,$t0,$Ehi,lsr#9
122 eor $t1,$t1,$Elo,lsr#9
123 eor $t0,$t0,$Elo,lsl#23
124 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
125 adds $Tlo,$Tlo,$t0
1fa29843 126 ldr $t0,[sp,#$Foff+0] @ f.lo
2d22e080 127 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
1fa29843 128 ldr $t1,[sp,#$Foff+4] @ f.hi
2d22e080 129 adds $Tlo,$Tlo,$t2
1fa29843 130 ldr $t2,[sp,#$Goff+0] @ g.lo
2d22e080 131 adc $Thi,$Thi,$t3 @ T += h
1fa29843 132 ldr $t3,[sp,#$Goff+4] @ g.hi
1fa29843
AP
133
134 eor $t0,$t0,$t2
2d22e080 135 str $Elo,[sp,#$Eoff+0]
1fa29843 136 eor $t1,$t1,$t3
2d22e080 137 str $Ehi,[sp,#$Eoff+4]
1fa29843 138 and $t0,$t0,$Elo
2d22e080 139 str $Alo,[sp,#$Aoff+0]
1fa29843 140 and $t1,$t1,$Ehi
2d22e080 141 str $Ahi,[sp,#$Aoff+4]
1fa29843 142 eor $t0,$t0,$t2
1e863180 143 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
2d22e080 144 eor $t1,$t1,$t3 @ Ch(e,f,g)
1e863180 145 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
1fa29843
AP
146
147 adds $Tlo,$Tlo,$t0
2d22e080 148 ldr $Elo,[sp,#$Doff+0] @ d.lo
1fa29843 149 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
2d22e080 150 ldr $Ehi,[sp,#$Doff+4] @ d.hi
1fa29843 151 adds $Tlo,$Tlo,$t2
1e863180 152 and $t0,$t2,#0xff
1fa29843
AP
153 adc $Thi,$Thi,$t3 @ T += K[i]
154 adds $Elo,$Elo,$Tlo
1e863180 155 ldr $t2,[sp,#$Boff+0] @ b.lo
1fa29843 156 adc $Ehi,$Ehi,$Thi @ d += T
1fa29843 157 teq $t0,#$magic
1fa29843 158
b5e5760d 159 ldr $t3,[sp,#$Coff+0] @ c.lo
2e51557b 160#ifdef __thumb2__
b1a5d1c6
AP
161 it eq @ Thumb2 thing, sanity check in ARM
162#endif
1e863180 163 orreq $Ktbl,$Ktbl,#1
1fa29843
AP
164 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
165 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
166 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
167 mov $t0,$Alo,lsr#28
168 mov $t1,$Ahi,lsr#28
169 eor $t0,$t0,$Ahi,lsl#4
170 eor $t1,$t1,$Alo,lsl#4
171 eor $t0,$t0,$Ahi,lsr#2
172 eor $t1,$t1,$Alo,lsr#2
173 eor $t0,$t0,$Alo,lsl#30
174 eor $t1,$t1,$Ahi,lsl#30
175 eor $t0,$t0,$Ahi,lsr#7
176 eor $t1,$t1,$Alo,lsr#7
177 eor $t0,$t0,$Alo,lsl#25
178 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
179 adds $Tlo,$Tlo,$t0
1e863180 180 and $t0,$Alo,$t2
1fa29843
AP
181 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
182
1fa29843 183 ldr $t1,[sp,#$Boff+4] @ b.hi
1e863180 184 orr $Alo,$Alo,$t2
1fa29843
AP
185 ldr $t2,[sp,#$Coff+4] @ c.hi
186 and $Alo,$Alo,$t3
1fa29843
AP
187 and $t3,$Ahi,$t1
188 orr $Ahi,$Ahi,$t1
1e863180 189 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
1fa29843 190 and $Ahi,$Ahi,$t2
1fa29843 191 adds $Alo,$Alo,$Tlo
1e863180 192 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
1fa29843 193 sub sp,sp,#8
1e863180
AP
194 adc $Ahi,$Ahi,$Thi @ h += T
195 tst $Ktbl,#1
1fa29843
AP
196 add $Ktbl,$Ktbl,#8
197___
198}
199$code=<<___;
b1a5d1c6
AP
200#ifndef __KERNEL__
201# include "arm_arch.h"
202# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
203# define VFP_ABI_POP vldmia sp!,{d8-d15}
204#else
205# define __ARM_ARCH__ __LINUX_ARM_ARCH__
206# define __ARM_MAX_ARCH__ 7
207# define VFP_ABI_PUSH
208# define VFP_ABI_POP
209#endif
210
1e863180
AP
211#ifdef __ARMEL__
212# define LO 0
213# define HI 4
214# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
215#else
216# define HI 0
217# define LO 4
218# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
219#endif
220
1fa29843 221.text
a2859927 222#if defined(__thumb2__)
b1a5d1c6 223.syntax unified
b1a5d1c6 224.thumb
11208dcf
AP
225# define adrl adr
226#else
227.code 32
b1a5d1c6
AP
228#endif
229
1fa29843
AP
230.type K512,%object
231.align 5
232K512:
1e863180
AP
233WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
234WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
235WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
236WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
237WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
238WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
239WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
240WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
241WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
242WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
243WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
244WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
245WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
246WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
247WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
248WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
249WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
250WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
251WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
252WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
253WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
254WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
255WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
256WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
257WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
258WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
259WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
260WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
261WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
262WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
263WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
264WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
265WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
266WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
267WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
268WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
269WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
270WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
271WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
272WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
1fa29843 273.size K512,.-K512
b1a5d1c6 274#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 275.LOPENSSL_armcap:
313e6ec1 276.word OPENSSL_armcap_P-.Lsha512_block_data_order
1e863180 277.skip 32-4
c1669e1c
AP
278#else
279.skip 32
280#endif
1fa29843
AP
281
282.global sha512_block_data_order
283.type sha512_block_data_order,%function
284sha512_block_data_order:
313e6ec1 285.Lsha512_block_data_order:
11208dcf 286#if __ARM_ARCH__<7 && !defined(__thumb2__)
1fa29843 287 sub r3,pc,#8 @ sha512_block_data_order
b1a5d1c6 288#else
11208dcf 289 adr r3,.Lsha512_block_data_order
b1a5d1c6
AP
290#endif
291#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 292 ldr r12,.LOPENSSL_armcap
87873f43 293 ldr r12,[r3,r12] @ OPENSSL_armcap_P
313e6ec1
AP
294#ifdef __APPLE__
295 ldr r12,[r12]
296#endif
bdbd3aea 297 tst r12,#ARMV7_NEON
1e863180
AP
298 bne .LNEON
299#endif
b1a5d1c6 300 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
1fa29843 301 stmdb sp!,{r4-r12,lr}
1e863180 302 sub $Ktbl,r3,#672 @ K512
1fa29843
AP
303 sub sp,sp,#9*8
304
305 ldr $Elo,[$ctx,#$Eoff+$lo]
306 ldr $Ehi,[$ctx,#$Eoff+$hi]
307 ldr $t0, [$ctx,#$Goff+$lo]
308 ldr $t1, [$ctx,#$Goff+$hi]
309 ldr $t2, [$ctx,#$Hoff+$lo]
310 ldr $t3, [$ctx,#$Hoff+$hi]
311.Loop:
312 str $t0, [sp,#$Goff+0]
313 str $t1, [sp,#$Goff+4]
314 str $t2, [sp,#$Hoff+0]
315 str $t3, [sp,#$Hoff+4]
316 ldr $Alo,[$ctx,#$Aoff+$lo]
317 ldr $Ahi,[$ctx,#$Aoff+$hi]
318 ldr $Tlo,[$ctx,#$Boff+$lo]
319 ldr $Thi,[$ctx,#$Boff+$hi]
320 ldr $t0, [$ctx,#$Coff+$lo]
321 ldr $t1, [$ctx,#$Coff+$hi]
322 ldr $t2, [$ctx,#$Doff+$lo]
323 ldr $t3, [$ctx,#$Doff+$hi]
324 str $Tlo,[sp,#$Boff+0]
325 str $Thi,[sp,#$Boff+4]
326 str $t0, [sp,#$Coff+0]
327 str $t1, [sp,#$Coff+4]
328 str $t2, [sp,#$Doff+0]
329 str $t3, [sp,#$Doff+4]
330 ldr $Tlo,[$ctx,#$Foff+$lo]
331 ldr $Thi,[$ctx,#$Foff+$hi]
332 str $Tlo,[sp,#$Foff+0]
333 str $Thi,[sp,#$Foff+4]
334
335.L00_15:
1e863180 336#if __ARM_ARCH__<7
1fa29843
AP
337 ldrb $Tlo,[$inp,#7]
338 ldrb $t0, [$inp,#6]
339 ldrb $t1, [$inp,#5]
340 ldrb $t2, [$inp,#4]
341 ldrb $Thi,[$inp,#3]
342 ldrb $t3, [$inp,#2]
343 orr $Tlo,$Tlo,$t0,lsl#8
344 ldrb $t0, [$inp,#1]
345 orr $Tlo,$Tlo,$t1,lsl#16
346 ldrb $t1, [$inp],#8
347 orr $Tlo,$Tlo,$t2,lsl#24
348 orr $Thi,$Thi,$t3,lsl#8
349 orr $Thi,$Thi,$t0,lsl#16
350 orr $Thi,$Thi,$t1,lsl#24
1e863180
AP
351#else
352 ldr $Tlo,[$inp,#4]
353 ldr $Thi,[$inp],#8
354#ifdef __ARMEL__
355 rev $Tlo,$Tlo
356 rev $Thi,$Thi
357#endif
358#endif
1fa29843
AP
359___
# Loop body for rounds 0..15.  0x94 is the low byte of K[15]
# (0xc19bf174cf692694), so the round that consumes K[15] sets the
# end-of-loop flag in bit 0 of $Ktbl.
360 &BODY_00_15(0x94);
361$code.=<<___;
362 tst $Ktbl,#1
363 beq .L00_15
1fa29843
AP
364 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
365 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
1e863180
AP
366 bic $Ktbl,$Ktbl,#1
367.L16_79:
1fa29843
AP
368 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
369 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
370 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
371 mov $Tlo,$t0,lsr#1
1e863180 372 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
1fa29843 373 mov $Thi,$t1,lsr#1
1e863180 374 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
1fa29843
AP
375 eor $Tlo,$Tlo,$t1,lsl#31
376 eor $Thi,$Thi,$t0,lsl#31
377 eor $Tlo,$Tlo,$t0,lsr#8
378 eor $Thi,$Thi,$t1,lsr#8
379 eor $Tlo,$Tlo,$t1,lsl#24
380 eor $Thi,$Thi,$t0,lsl#24
381 eor $Tlo,$Tlo,$t0,lsr#7
382 eor $Thi,$Thi,$t1,lsr#7
383 eor $Tlo,$Tlo,$t1,lsl#25
384
385 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
386 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
387 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
388 mov $t0,$t2,lsr#19
389 mov $t1,$t3,lsr#19
390 eor $t0,$t0,$t3,lsl#13
391 eor $t1,$t1,$t2,lsl#13
392 eor $t0,$t0,$t3,lsr#29
393 eor $t1,$t1,$t2,lsr#29
394 eor $t0,$t0,$t2,lsl#3
395 eor $t1,$t1,$t3,lsl#3
396 eor $t0,$t0,$t2,lsr#6
397 eor $t1,$t1,$t3,lsr#6
1e863180 398 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
1fa29843
AP
399 eor $t0,$t0,$t3,lsl#26
400
1fa29843
AP
401 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
402 adds $Tlo,$Tlo,$t0
1e863180 403 ldr $t0,[sp,#`$Xoff+8*16`+0]
1fa29843
AP
404 adc $Thi,$Thi,$t1
405
1fa29843
AP
406 ldr $t1,[sp,#`$Xoff+8*16`+4]
407 adds $Tlo,$Tlo,$t2
408 adc $Thi,$Thi,$t3
409 adds $Tlo,$Tlo,$t0
410 adc $Thi,$Thi,$t1
1fa29843
AP
411___
# Loop body for rounds 16..79.  0x17 is the low byte of K[79]
# (0x6c44198c4a475817), so the round that consumes the last K512 entry
# sets the end-of-loop flag in bit 0 of $Ktbl.
412 &BODY_00_15(0x17);
413$code.=<<___;
2e51557b 414#ifdef __thumb2__
b1a5d1c6
AP
415 ittt eq @ Thumb2 thing, sanity check in ARM
416#endif
1e863180
AP
417 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
418 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
1fa29843
AP
419 beq .L16_79
420 bic $Ktbl,$Ktbl,#1
421
422 ldr $Tlo,[sp,#$Boff+0]
423 ldr $Thi,[sp,#$Boff+4]
424 ldr $t0, [$ctx,#$Aoff+$lo]
425 ldr $t1, [$ctx,#$Aoff+$hi]
426 ldr $t2, [$ctx,#$Boff+$lo]
427 ldr $t3, [$ctx,#$Boff+$hi]
428 adds $t0,$Alo,$t0
1fa29843 429 str $t0, [$ctx,#$Aoff+$lo]
1e863180 430 adc $t1,$Ahi,$t1
1fa29843 431 str $t1, [$ctx,#$Aoff+$hi]
1e863180 432 adds $t2,$Tlo,$t2
1fa29843 433 str $t2, [$ctx,#$Boff+$lo]
1e863180 434 adc $t3,$Thi,$t3
1fa29843
AP
435 str $t3, [$ctx,#$Boff+$hi]
436
437 ldr $Alo,[sp,#$Coff+0]
438 ldr $Ahi,[sp,#$Coff+4]
439 ldr $Tlo,[sp,#$Doff+0]
440 ldr $Thi,[sp,#$Doff+4]
441 ldr $t0, [$ctx,#$Coff+$lo]
442 ldr $t1, [$ctx,#$Coff+$hi]
443 ldr $t2, [$ctx,#$Doff+$lo]
444 ldr $t3, [$ctx,#$Doff+$hi]
445 adds $t0,$Alo,$t0
1fa29843 446 str $t0, [$ctx,#$Coff+$lo]
1e863180 447 adc $t1,$Ahi,$t1
1fa29843 448 str $t1, [$ctx,#$Coff+$hi]
1e863180 449 adds $t2,$Tlo,$t2
1fa29843 450 str $t2, [$ctx,#$Doff+$lo]
1e863180 451 adc $t3,$Thi,$t3
1fa29843
AP
452 str $t3, [$ctx,#$Doff+$hi]
453
454 ldr $Tlo,[sp,#$Foff+0]
455 ldr $Thi,[sp,#$Foff+4]
456 ldr $t0, [$ctx,#$Eoff+$lo]
457 ldr $t1, [$ctx,#$Eoff+$hi]
458 ldr $t2, [$ctx,#$Foff+$lo]
459 ldr $t3, [$ctx,#$Foff+$hi]
460 adds $Elo,$Elo,$t0
1fa29843 461 str $Elo,[$ctx,#$Eoff+$lo]
1e863180 462 adc $Ehi,$Ehi,$t1
1fa29843 463 str $Ehi,[$ctx,#$Eoff+$hi]
1e863180 464 adds $t2,$Tlo,$t2
1fa29843 465 str $t2, [$ctx,#$Foff+$lo]
1e863180 466 adc $t3,$Thi,$t3
1fa29843
AP
467 str $t3, [$ctx,#$Foff+$hi]
468
469 ldr $Alo,[sp,#$Goff+0]
470 ldr $Ahi,[sp,#$Goff+4]
471 ldr $Tlo,[sp,#$Hoff+0]
472 ldr $Thi,[sp,#$Hoff+4]
473 ldr $t0, [$ctx,#$Goff+$lo]
474 ldr $t1, [$ctx,#$Goff+$hi]
475 ldr $t2, [$ctx,#$Hoff+$lo]
476 ldr $t3, [$ctx,#$Hoff+$hi]
477 adds $t0,$Alo,$t0
1fa29843 478 str $t0, [$ctx,#$Goff+$lo]
1e863180 479 adc $t1,$Ahi,$t1
1fa29843 480 str $t1, [$ctx,#$Goff+$hi]
1e863180 481 adds $t2,$Tlo,$t2
1fa29843 482 str $t2, [$ctx,#$Hoff+$lo]
1e863180 483 adc $t3,$Thi,$t3
1fa29843
AP
484 str $t3, [$ctx,#$Hoff+$hi]
485
486 add sp,sp,#640
487 sub $Ktbl,$Ktbl,#640
488
489 teq $inp,$len
490 bne .Loop
491
492 add sp,sp,#8*9 @ destroy frame
1e863180
AP
493#if __ARM_ARCH__>=5
494 ldmia sp!,{r4-r12,pc}
495#else
1fa29843
AP
496 ldmia sp!,{r4-r12,lr}
497 tst lr,#1
498 moveq pc,lr @ be binary compatible with V4, yet
499 bx lr @ interoperable with Thumb ISA:-)
1e863180 500#endif
b1a5d1c6 501.size sha512_block_data_order,.-sha512_block_data_order
1e863180
AP
502___
503
504{
# Rotate/shift amounts for the four SHA-512 mixing functions.  For the
# lowercase sigma0/sigma1 the third entry is used by the NEON code with a
# plain right shift only (vshr without a matching vsli).
505my @Sigma0=(28,34,39);
506my @Sigma1=(14,18,41);
507my @sigma0=(1, 8, 7);
508my @sigma1=(19,61,6);
509
# Within this block $Ktbl is r3, shadowing the file-level $Ktbl (r14) used
# by the integer-only code.
510my $Ktbl="r3";
511my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
512

# @X names d0-d15 for the sixteen 64-bit message words X[0..15].
# @V names d16-d23 for the working variables; the same assignment also
# sets the package variables $A..$H, which the NEON heredocs use for the
# {$A-$H} register lists.
513my @X=map("d$_",(0..15));
514my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
515
# NEON_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h) - append the NEON code for round
# $i to $code.  The eight names after $i are the d registers currently
# playing the roles a..h.
#
# Temporaries are d24-d30 ($t0,$t1,$t2,$T1,$K,$Ch,$Maj); the map yields
# d31 as well but no variable receives it.
#
# The first heredoc (start of Sigma1(e), the X[i] load for $i<16, and the
# deferred "h+=Maj" add for $i>0) is emitted only for $i<16 or odd $i;
# for even $i>=16 NEON_16_79 emits the equivalent instructions itself.
# The "#if $i..." lines are written into the output with $i already
# interpolated, so they become C preprocessor conditionals on constants.
#
# The second heredoc computes T1 = h + Ch(e,f,g) + Sigma1(e) + K[i] + X[i],
# d += T1, h = Sigma0(a) and Maj = Maj(a,b,c) + T1.  Adding Maj into h is
# left to the following round (see the commented-out final vadd).
516sub NEON_00_15() {
517my $i=shift;
518my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
519my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
520
521$code.=<<___ if ($i<16 || $i&1);
522 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
523#if $i<16
524 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
525#endif
526 vshr.u64 $t1,$e,#@Sigma1[1]
f26328c2
AP
527#if $i>0
528 vadd.i64 $a,$Maj @ h+=Maj from the past
529#endif
1e863180
AP
530 vshr.u64 $t2,$e,#@Sigma1[2]
531___
532$code.=<<___;
533 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
534 vsli.64 $t0,$e,#`64-@Sigma1[0]`
535 vsli.64 $t1,$e,#`64-@Sigma1[1]`
482a7d80 536 vmov $Ch,$e
1e863180
AP
537 vsli.64 $t2,$e,#`64-@Sigma1[2]`
538#if $i<16 && defined(__ARMEL__)
539 vrev64.8 @X[$i],@X[$i]
540#endif
482a7d80 541 veor $t1,$t0
f26328c2 542 vbsl $Ch,$f,$g @ Ch(e,f,g)
1e863180 543 vshr.u64 $t0,$a,#@Sigma0[0]
482a7d80 544 veor $t2,$t1 @ Sigma1(e)
f26328c2 545 vadd.i64 $T1,$Ch,$h
1e863180 546 vshr.u64 $t1,$a,#@Sigma0[1]
1e863180 547 vsli.64 $t0,$a,#`64-@Sigma0[0]`
f26328c2
AP
548 vadd.i64 $T1,$t2
549 vshr.u64 $t2,$a,#@Sigma0[2]
550 vadd.i64 $K,@X[$i%16]
1e863180 551 vsli.64 $t1,$a,#`64-@Sigma0[1]`
f26328c2 552 veor $Maj,$a,$b
1e863180 553 vsli.64 $t2,$a,#`64-@Sigma0[2]`
1e863180 554 veor $h,$t0,$t1
f26328c2 555 vadd.i64 $T1,$K
482a7d80 556 vbsl $Maj,$c,$b @ Maj(a,b,c)
f26328c2 557 veor $h,$t2 @ Sigma0(a)
1e863180 558 vadd.i64 $d,$T1
f26328c2
AP
559 vadd.i64 $Maj,$T1
560 @ vadd.i64 $h,$Maj
1e863180
AP
561___
562}
563
# NEON_16_79($i, $a,$b,$c,$d,$e,$f,$g,$h) - append the NEON code for round
# $i of the message-expanding part of the loop to $code.
#
# The schedule update works on 128-bit q registers, i.e. on two X[] words
# at once, so it is emitted on even rounds only; odd rounds are passed
# straight to NEON_00_15.  On even rounds this sub also emits the three
# Sigma1(e) right shifts and the deferred "h+=Maj" add that NEON_00_15
# leaves out for even $i>=16, interleaved with the schedule update, and
# then calls NEON_00_15 for the rest of the round.
#
# No prototype is declared: the sub takes arguments and is always invoked
# with the &name(...) form.
sub NEON_16_79 {
    my $i = shift;

    if ($i & 1) { &NEON_00_15($i, @_); return; }

    # 2x-vectorized, therefore runs every 2nd round
    my @X = map("q$_", (0..7));                    # view @X as 128-bit vector
    my ($t0, $t1, $s0, $s1) = map("q$_", (12..15)); # temps
    my ($d0, $d1, $d2) = map("d$_", (24..26));     # temps from NEON_00_15
    my $e = $_[4];                                 # $e from NEON_00_15
    $i /= 2;                                       # index of the q register pair
    $code .= <<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	$_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
    # Emit the rest of the round; NEON_00_15 wants the round number again.
    &NEON_00_15(2*$i, @_);
}
602
603$code.=<<___;
c1669e1c
AP
604#if __ARM_MAX_ARCH__>=7
605.arch armv7-a
1e863180
AP
606.fpu neon
607
b1a5d1c6
AP
608.global sha512_block_data_order_neon
609.type sha512_block_data_order_neon,%function
1e863180 610.align 4
b1a5d1c6 611sha512_block_data_order_neon:
1e863180
AP
612.LNEON:
613 dmb @ errata #451034 on early Cortex A8
b1a5d1c6 614 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
313e6ec1 615 adr $Ktbl,K512
b1a5d1c6 616 VFP_ABI_PUSH
1e863180
AP
617 vldmia $ctx,{$A-$H} @ load context
618.Loop_neon:
619___
# Emit NEON rounds 0..15 unrolled.  After each round @V is rotated right by
# one, so the register that was passed as $h is passed as $a next time.
620for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
621$code.=<<___;
622 mov $cnt,#4
623.L16_79_neon:
624 subs $cnt,#1
625___
# Emit sixteen more unrolled rounds ($i continues from 16 to 31).  They
# form the body of .L16_79_neon, which the generated code runs four times
# ($cnt is set to 4 and counted down), covering rounds 16..79.
626for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
627$code.=<<___;
628 bne .L16_79_neon
629
f26328c2 630 vadd.i64 $A,d30 @ h+=Maj from the past
1e863180
AP
631 vldmia $ctx,{d24-d31} @ load context to temp
632 vadd.i64 q8,q12 @ vectorized accumulate
633 vadd.i64 q9,q13
634 vadd.i64 q10,q14
635 vadd.i64 q11,q15
636 vstmia $ctx,{$A-$H} @ save context
637 teq $inp,$len
638 sub $Ktbl,#640 @ rewind K512
639 bne .Loop_neon
640
b1a5d1c6 641 VFP_ABI_POP
5dcf70a1 642 ret @ bx lr
b1a5d1c6 643.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
1e863180
AP
644#endif
645___
646}
647$code.=<<___;
1e863180 648.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
97a6a01f 649.align 2
b1a5d1c6 650#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
87873f43 651.comm OPENSSL_armcap_P,4,4
c1669e1c 652#endif
1fa29843
AP
653___
654
# Post-process the generated text:
#  1. replace every `...` span with the result of evaluating it as a Perl
#     expression (e.g. the arithmetic in stack offsets and shift counts);
#  2. encode "bx lr" as a literal .word;
#  3. only afterwards expand the "ret" pseudo-mnemonic to "bx lr", so that
#     it is not caught by step 2.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

# Copy this script's leading comment block to the output, turning '#'
# comment markers into '@'.  The shebang line is skipped and copying stops
# at the first line that is neither a comment nor empty.  This is
# best-effort: if the script cannot be re-read, only the header is lost.
if (open(my $self, '<', $0)) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
} else {
    warn "can't open $0 to copy the header comment: $!";
}

print $code;
# Closing enforces the flush; checking the result catches write errors
# and, when STDOUT is the pipe to arm-xlate.pl, a failing translator.
close STDOUT or die "error closing STDOUT: $!";