]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/sha512-armv4.pl
Copyright year updates
[thirdparty/openssl.git] / crypto / sha / asm / sha512-armv4.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
da1c088f 2# Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
a598ed0d 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
1fa29843
AP
9
10# ====================================================================
f26328c2 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1fa29843
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
b1a5d1c6
AP
15#
16# Permission to use under GPL terms is granted.
1fa29843
AP
17# ====================================================================
18
19# SHA512 block procedure for ARMv4. September 2007.
20
21# This code is ~4.5 (four and a half) times faster than code generated
2d22e080
AP
22# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23# Xscale PXA250 core].
24#
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 6% improvement on
28# Cortex A8 core and ~40 cycles per processed byte.
1fa29843 29
1e863180
AP
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 7%
33# improvement on Cortex A8 core and ~38 cycles per byte.
34
35# March 2011.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process
482a7d80 38# one byte in 23.3 cycles or ~60% faster than integer-only code.
1e863180 39
f26328c2
AP
40# August 2012.
41#
42# Improve NEON performance by 12% on Snapdragon S4. In absolute
43# terms it's 22.6 cycles per byte, which is a disappointing result.
44# Technical writers asserted that 3-way S4 pipeline can sustain
45# multiple NEON instructions per cycle, but dual NEON issue could
e390ae50
AP
46# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
47# for further details. On a side note, Cortex-A15 processes one byte in
48# 16 cycles.
f26328c2 49
74eb3e09
AP
50# Byte order [in]dependence. =========================================
51#
1e863180
AP
52# Originally the caller was expected to maintain a specific *dword* order in
53# h[0-7], namely with the most significant dword at the *lower* address, which
54# was reflected in the two parameters below as 0 and 4. Now the caller is
55# expected to maintain native byte order for whole 64-bit values.
56$hi="HI";
57$lo="LO";
74eb3e09 58# ====================================================================
1fa29843 59
1aa89a7a
RL
60# $output is the last argument if it looks like a file (it has an extension)
61# $flavour is the first argument if it doesn't look like a file
# Take the output file name and the flavour off the command line; either one
# is left undefined when the corresponding argument does not look right.
62$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
63$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
313e6ec1
AP
64
# With a flavour other than void, everything printed to STDOUT is piped through
# arm-xlate.pl, which is looked up next to this script first and then under
# ../../perlasm/ relative to it.
65if ($flavour && $flavour ne "void") {
66 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
67 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
68 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
69 die "can't locate arm-xlate.pl";
70
1aa89a7a
RL
71 open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
72 or die "can't call $xlate: $!";
313e6ec1 73} else {
# No translation step: write straight to $output when one was given.
# NOTE(review): unlike the pipe above, this open is not checked for failure.
1aa89a7a 74 $output and open STDOUT,">$output";
313e6ec1 75}
4c7c5ff6 76
# Register names used by the integer-only code path.
1e863180 77$ctx="r0"; # parameter block
1fa29843
AP
78$inp="r1"; # input pointer, post-incremented as message words are loaded
79$len="r2"; # scaled by 128 and added to $inp to form the end-of-input pointer
1e863180 80
1fa29843
AP
81$Tlo="r3"; # T accumulator (holds the message word on entry to a round)
82$Thi="r4";
83$Alo="r5"; # working variable a
84$Ahi="r6";
85$Elo="r7"; # working variable e
86$Ehi="r8";
87$t0="r9"; # scratch registers
88$t1="r10";
89$t2="r11";
90$t3="r12";
91############ r13 is stack pointer
92$Ktbl="r14"; # pointer into K512; bit 0 is also used as a loop-exit flag
93############ r15 is program counter
94
# Byte offsets of the eight 64-bit working variables a..h, used both for the
# context block and for the stack frame, followed by the offset of the slot
# where the current message word is stored in the frame.
95$Aoff=8*0;
96$Boff=8*1;
97$Coff=8*2;
98$Doff=8*3;
99$Eoff=8*4;
100$Foff=8*5;
101$Goff=8*6;
102$Hoff=8*7;
103$Xoff=8*8;
104
# BODY_00_15($magic): append the integer-only ARM code for one SHA-512 round
# to $code.
#
# The emitted code expects the message word in $Thi:$Tlo, a in $Ahi:$Alo and
# e in $Ehi:$Elo; b, c, d, f, g and h are read from the stack frame at their
# $Boff..$Hoff offsets. It stores the message word at [sp,#$Xoff], builds
# T = X + Sigma1(e) + h + Ch(e,f,g) + K[i] + Sigma0(a), leaves d + T (taken
# before Sigma0 is added) in the e registers and Maj(a,b,c) + T in the a
# registers, saves the old a and e to the frame, then moves sp down by 8 and
# $Ktbl up by 8.
# NOTE(review): the sp decrement presumably makes every saved variable appear
# under the next letter's offset in the following round - confirm.
#
# $magic is compared with the low byte of K[i].lo; on a match bit 0 of $Ktbl
# is set, and the closing tst on that bit leaves the flags for the caller.
# The callers pass 0x94 and 0x17, the low bytes of K[15] and K[79] in K512.
#
# The empty prototype has no effect because callers use the &name(...) form.
105sub BODY_00_15() {
106my $magic = shift;
107$code.=<<___;
1fa29843
AP
108 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
109 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
110 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
111 mov $t0,$Elo,lsr#14
1e863180 112 str $Tlo,[sp,#$Xoff+0]
1fa29843 113 mov $t1,$Ehi,lsr#14
1e863180 114 str $Thi,[sp,#$Xoff+4]
1fa29843 115 eor $t0,$t0,$Ehi,lsl#18
1e863180 116 ldr $t2,[sp,#$Hoff+0] @ h.lo
1fa29843 117 eor $t1,$t1,$Elo,lsl#18
1e863180 118 ldr $t3,[sp,#$Hoff+4] @ h.hi
1fa29843
AP
119 eor $t0,$t0,$Elo,lsr#18
120 eor $t1,$t1,$Ehi,lsr#18
121 eor $t0,$t0,$Ehi,lsl#14
122 eor $t1,$t1,$Elo,lsl#14
123 eor $t0,$t0,$Ehi,lsr#9
124 eor $t1,$t1,$Elo,lsr#9
125 eor $t0,$t0,$Elo,lsl#23
126 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
127 adds $Tlo,$Tlo,$t0
1fa29843 128 ldr $t0,[sp,#$Foff+0] @ f.lo
2d22e080 129 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
1fa29843 130 ldr $t1,[sp,#$Foff+4] @ f.hi
2d22e080 131 adds $Tlo,$Tlo,$t2
1fa29843 132 ldr $t2,[sp,#$Goff+0] @ g.lo
2d22e080 133 adc $Thi,$Thi,$t3 @ T += h
1fa29843 134 ldr $t3,[sp,#$Goff+4] @ g.hi
1fa29843
AP
135
136 eor $t0,$t0,$t2
2d22e080 137 str $Elo,[sp,#$Eoff+0]
1fa29843 138 eor $t1,$t1,$t3
2d22e080 139 str $Ehi,[sp,#$Eoff+4]
1fa29843 140 and $t0,$t0,$Elo
2d22e080 141 str $Alo,[sp,#$Aoff+0]
1fa29843 142 and $t1,$t1,$Ehi
2d22e080 143 str $Ahi,[sp,#$Aoff+4]
1fa29843 144 eor $t0,$t0,$t2
1e863180 145 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
2d22e080 146 eor $t1,$t1,$t3 @ Ch(e,f,g)
1e863180 147 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
1fa29843
AP
148
149 adds $Tlo,$Tlo,$t0
2d22e080 150 ldr $Elo,[sp,#$Doff+0] @ d.lo
1fa29843 151 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
2d22e080 152 ldr $Ehi,[sp,#$Doff+4] @ d.hi
1fa29843 153 adds $Tlo,$Tlo,$t2
1e863180 154 and $t0,$t2,#0xff
1fa29843
AP
155 adc $Thi,$Thi,$t3 @ T += K[i]
156 adds $Elo,$Elo,$Tlo
1e863180 157 ldr $t2,[sp,#$Boff+0] @ b.lo
1fa29843 158 adc $Ehi,$Ehi,$Thi @ d += T
1fa29843 159 teq $t0,#$magic
1fa29843 160
b5e5760d 161 ldr $t3,[sp,#$Coff+0] @ c.lo
2e51557b 162#ifdef __thumb2__
b1a5d1c6
AP
163 it eq @ Thumb2 thing, sanity check in ARM
164#endif
1e863180 165 orreq $Ktbl,$Ktbl,#1
1fa29843
AP
166 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
167 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
168 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
169 mov $t0,$Alo,lsr#28
170 mov $t1,$Ahi,lsr#28
171 eor $t0,$t0,$Ahi,lsl#4
172 eor $t1,$t1,$Alo,lsl#4
173 eor $t0,$t0,$Ahi,lsr#2
174 eor $t1,$t1,$Alo,lsr#2
175 eor $t0,$t0,$Alo,lsl#30
176 eor $t1,$t1,$Ahi,lsl#30
177 eor $t0,$t0,$Ahi,lsr#7
178 eor $t1,$t1,$Alo,lsr#7
179 eor $t0,$t0,$Alo,lsl#25
180 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
181 adds $Tlo,$Tlo,$t0
1e863180 182 and $t0,$Alo,$t2
1fa29843
AP
183 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
184
1fa29843 185 ldr $t1,[sp,#$Boff+4] @ b.hi
1e863180 186 orr $Alo,$Alo,$t2
1fa29843
AP
187 ldr $t2,[sp,#$Coff+4] @ c.hi
188 and $Alo,$Alo,$t3
1fa29843
AP
189 and $t3,$Ahi,$t1
190 orr $Ahi,$Ahi,$t1
1e863180 191 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
1fa29843 192 and $Ahi,$Ahi,$t2
1fa29843 193 adds $Alo,$Alo,$Tlo
1e863180 194 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
1fa29843 195 sub sp,sp,#8
1e863180
AP
196 adc $Ahi,$Ahi,$Thi @ h += T
197 tst $Ktbl,#1
1fa29843
AP
198 add $Ktbl,$Ktbl,#8
199___
200}
3405db97
AP
201
# Data directive interpolated into the WORD64 macro definitions of the
# generated code: DCDU when the flavour matches /win/, otherwise .word.
202my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");
203
1fa29843 204$code=<<___;
b1a5d1c6
AP
205#ifndef __KERNEL__
206# include "arm_arch.h"
207# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
208# define VFP_ABI_POP vldmia sp!,{d8-d15}
209#else
210# define __ARM_ARCH__ __LINUX_ARM_ARCH__
211# define __ARM_MAX_ARCH__ 7
212# define VFP_ABI_PUSH
213# define VFP_ABI_POP
214#endif
215
1e863180
AP
216#ifdef __ARMEL__
217# define LO 0
218# define HI 4
3405db97 219# define WORD64(hi0,lo0,hi1,lo1) $_word lo0,hi0, lo1,hi1
1e863180
AP
220#else
221# define HI 0
222# define LO 4
3405db97 223# define WORD64(hi0,lo0,hi1,lo1) $_word hi0,lo0, hi1,lo1
1e863180
AP
224#endif
225
a2859927 226#if defined(__thumb2__)
b1a5d1c6 227.syntax unified
b1a5d1c6 228.thumb
11208dcf
AP
229# define adrl adr
230#else
231.code 32
b1a5d1c6
AP
232#endif
233
3405db97
AP
234.text
235
1fa29843
AP
236.type K512,%object
237.align 5
238K512:
1e863180
AP
239WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
240WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
241WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
242WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
243WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
244WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
245WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
246WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
247WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
248WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
249WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
250WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
251WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
252WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
253WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
254WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
255WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
256WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
257WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
258WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
259WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
260WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
261WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
262WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
263WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
264WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
265WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
266WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
267WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
268WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
269WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
270WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
271WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
272WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
273WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
274WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
275WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
276WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
277WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
278WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
1fa29843 279.size K512,.-K512
b1a5d1c6 280#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 281.LOPENSSL_armcap:
3405db97
AP
282# ifdef _WIN32
283.word OPENSSL_armcap_P
284# else
313e6ec1 285.word OPENSSL_armcap_P-.Lsha512_block_data_order
3405db97 286# endif
1e863180 287.skip 32-4
c1669e1c
AP
288#else
289.skip 32
290#endif
1fa29843
AP
291
292.global sha512_block_data_order
293.type sha512_block_data_order,%function
294sha512_block_data_order:
313e6ec1 295.Lsha512_block_data_order:
11208dcf 296#if __ARM_ARCH__<7 && !defined(__thumb2__)
1fa29843 297 sub r3,pc,#8 @ sha512_block_data_order
b1a5d1c6 298#else
11208dcf 299 adr r3,.Lsha512_block_data_order
b1a5d1c6
AP
300#endif
301#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1e863180 302 ldr r12,.LOPENSSL_armcap
3405db97 303# if !defined(_WIN32)
87873f43 304 ldr r12,[r3,r12] @ OPENSSL_armcap_P
3405db97
AP
305# endif
306# if defined(__APPLE__) || defined(_WIN32)
313e6ec1 307 ldr r12,[r12]
3405db97 308# endif
bdbd3aea 309 tst r12,#ARMV7_NEON
1e863180
AP
310 bne .LNEON
311#endif
b1a5d1c6 312 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
1fa29843 313 stmdb sp!,{r4-r12,lr}
1e863180 314 sub $Ktbl,r3,#672 @ K512
1fa29843
AP
315 sub sp,sp,#9*8
316
317 ldr $Elo,[$ctx,#$Eoff+$lo]
318 ldr $Ehi,[$ctx,#$Eoff+$hi]
319 ldr $t0, [$ctx,#$Goff+$lo]
320 ldr $t1, [$ctx,#$Goff+$hi]
321 ldr $t2, [$ctx,#$Hoff+$lo]
322 ldr $t3, [$ctx,#$Hoff+$hi]
323.Loop:
324 str $t0, [sp,#$Goff+0]
325 str $t1, [sp,#$Goff+4]
326 str $t2, [sp,#$Hoff+0]
327 str $t3, [sp,#$Hoff+4]
328 ldr $Alo,[$ctx,#$Aoff+$lo]
329 ldr $Ahi,[$ctx,#$Aoff+$hi]
330 ldr $Tlo,[$ctx,#$Boff+$lo]
331 ldr $Thi,[$ctx,#$Boff+$hi]
332 ldr $t0, [$ctx,#$Coff+$lo]
333 ldr $t1, [$ctx,#$Coff+$hi]
334 ldr $t2, [$ctx,#$Doff+$lo]
335 ldr $t3, [$ctx,#$Doff+$hi]
336 str $Tlo,[sp,#$Boff+0]
337 str $Thi,[sp,#$Boff+4]
338 str $t0, [sp,#$Coff+0]
339 str $t1, [sp,#$Coff+4]
340 str $t2, [sp,#$Doff+0]
341 str $t3, [sp,#$Doff+4]
342 ldr $Tlo,[$ctx,#$Foff+$lo]
343 ldr $Thi,[$ctx,#$Foff+$hi]
344 str $Tlo,[sp,#$Foff+0]
345 str $Thi,[sp,#$Foff+4]
346
347.L00_15:
1e863180 348#if __ARM_ARCH__<7
1fa29843
AP
349 ldrb $Tlo,[$inp,#7]
350 ldrb $t0, [$inp,#6]
351 ldrb $t1, [$inp,#5]
352 ldrb $t2, [$inp,#4]
353 ldrb $Thi,[$inp,#3]
354 ldrb $t3, [$inp,#2]
355 orr $Tlo,$Tlo,$t0,lsl#8
356 ldrb $t0, [$inp,#1]
357 orr $Tlo,$Tlo,$t1,lsl#16
358 ldrb $t1, [$inp],#8
359 orr $Tlo,$Tlo,$t2,lsl#24
360 orr $Thi,$Thi,$t3,lsl#8
361 orr $Thi,$Thi,$t0,lsl#16
362 orr $Thi,$Thi,$t1,lsl#24
1e863180
AP
363#else
364 ldr $Tlo,[$inp,#4]
365 ldr $Thi,[$inp],#8
366#ifdef __ARMEL__
367 rev $Tlo,$Tlo
368 rev $Thi,$Thi
369#endif
370#endif
1fa29843
AP
371___
372 &BODY_00_15(0x94);
373$code.=<<___;
374 tst $Ktbl,#1
375 beq .L00_15
1fa29843
AP
376 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
377 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
1e863180
AP
378 bic $Ktbl,$Ktbl,#1
379.L16_79:
1fa29843
AP
380 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
381 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
382 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
383 mov $Tlo,$t0,lsr#1
1e863180 384 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
1fa29843 385 mov $Thi,$t1,lsr#1
1e863180 386 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
1fa29843
AP
387 eor $Tlo,$Tlo,$t1,lsl#31
388 eor $Thi,$Thi,$t0,lsl#31
389 eor $Tlo,$Tlo,$t0,lsr#8
390 eor $Thi,$Thi,$t1,lsr#8
391 eor $Tlo,$Tlo,$t1,lsl#24
392 eor $Thi,$Thi,$t0,lsl#24
393 eor $Tlo,$Tlo,$t0,lsr#7
394 eor $Thi,$Thi,$t1,lsr#7
395 eor $Tlo,$Tlo,$t1,lsl#25
396
397 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
398 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
399 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
400 mov $t0,$t2,lsr#19
401 mov $t1,$t3,lsr#19
402 eor $t0,$t0,$t3,lsl#13
403 eor $t1,$t1,$t2,lsl#13
404 eor $t0,$t0,$t3,lsr#29
405 eor $t1,$t1,$t2,lsr#29
406 eor $t0,$t0,$t2,lsl#3
407 eor $t1,$t1,$t3,lsl#3
408 eor $t0,$t0,$t2,lsr#6
409 eor $t1,$t1,$t3,lsr#6
1e863180 410 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
1fa29843
AP
411 eor $t0,$t0,$t3,lsl#26
412
1fa29843
AP
413 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
414 adds $Tlo,$Tlo,$t0
1e863180 415 ldr $t0,[sp,#`$Xoff+8*16`+0]
1fa29843
AP
416 adc $Thi,$Thi,$t1
417
1fa29843
AP
418 ldr $t1,[sp,#`$Xoff+8*16`+4]
419 adds $Tlo,$Tlo,$t2
420 adc $Thi,$Thi,$t3
421 adds $Tlo,$Tlo,$t0
422 adc $Thi,$Thi,$t1
1fa29843
AP
423___
424 &BODY_00_15(0x17);
425$code.=<<___;
2e51557b 426#ifdef __thumb2__
b1a5d1c6
AP
427 ittt eq @ Thumb2 thing, sanity check in ARM
428#endif
1e863180
AP
429 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
430 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
1fa29843
AP
431 beq .L16_79
432 bic $Ktbl,$Ktbl,#1
433
434 ldr $Tlo,[sp,#$Boff+0]
435 ldr $Thi,[sp,#$Boff+4]
436 ldr $t0, [$ctx,#$Aoff+$lo]
437 ldr $t1, [$ctx,#$Aoff+$hi]
438 ldr $t2, [$ctx,#$Boff+$lo]
439 ldr $t3, [$ctx,#$Boff+$hi]
440 adds $t0,$Alo,$t0
1fa29843 441 str $t0, [$ctx,#$Aoff+$lo]
1e863180 442 adc $t1,$Ahi,$t1
1fa29843 443 str $t1, [$ctx,#$Aoff+$hi]
1e863180 444 adds $t2,$Tlo,$t2
1fa29843 445 str $t2, [$ctx,#$Boff+$lo]
1e863180 446 adc $t3,$Thi,$t3
1fa29843
AP
447 str $t3, [$ctx,#$Boff+$hi]
448
449 ldr $Alo,[sp,#$Coff+0]
450 ldr $Ahi,[sp,#$Coff+4]
451 ldr $Tlo,[sp,#$Doff+0]
452 ldr $Thi,[sp,#$Doff+4]
453 ldr $t0, [$ctx,#$Coff+$lo]
454 ldr $t1, [$ctx,#$Coff+$hi]
455 ldr $t2, [$ctx,#$Doff+$lo]
456 ldr $t3, [$ctx,#$Doff+$hi]
457 adds $t0,$Alo,$t0
1fa29843 458 str $t0, [$ctx,#$Coff+$lo]
1e863180 459 adc $t1,$Ahi,$t1
1fa29843 460 str $t1, [$ctx,#$Coff+$hi]
1e863180 461 adds $t2,$Tlo,$t2
1fa29843 462 str $t2, [$ctx,#$Doff+$lo]
1e863180 463 adc $t3,$Thi,$t3
1fa29843
AP
464 str $t3, [$ctx,#$Doff+$hi]
465
466 ldr $Tlo,[sp,#$Foff+0]
467 ldr $Thi,[sp,#$Foff+4]
468 ldr $t0, [$ctx,#$Eoff+$lo]
469 ldr $t1, [$ctx,#$Eoff+$hi]
470 ldr $t2, [$ctx,#$Foff+$lo]
471 ldr $t3, [$ctx,#$Foff+$hi]
472 adds $Elo,$Elo,$t0
1fa29843 473 str $Elo,[$ctx,#$Eoff+$lo]
1e863180 474 adc $Ehi,$Ehi,$t1
1fa29843 475 str $Ehi,[$ctx,#$Eoff+$hi]
1e863180 476 adds $t2,$Tlo,$t2
1fa29843 477 str $t2, [$ctx,#$Foff+$lo]
1e863180 478 adc $t3,$Thi,$t3
1fa29843
AP
479 str $t3, [$ctx,#$Foff+$hi]
480
481 ldr $Alo,[sp,#$Goff+0]
482 ldr $Ahi,[sp,#$Goff+4]
483 ldr $Tlo,[sp,#$Hoff+0]
484 ldr $Thi,[sp,#$Hoff+4]
485 ldr $t0, [$ctx,#$Goff+$lo]
486 ldr $t1, [$ctx,#$Goff+$hi]
487 ldr $t2, [$ctx,#$Hoff+$lo]
488 ldr $t3, [$ctx,#$Hoff+$hi]
489 adds $t0,$Alo,$t0
1fa29843 490 str $t0, [$ctx,#$Goff+$lo]
1e863180 491 adc $t1,$Ahi,$t1
1fa29843 492 str $t1, [$ctx,#$Goff+$hi]
1e863180 493 adds $t2,$Tlo,$t2
1fa29843 494 str $t2, [$ctx,#$Hoff+$lo]
1e863180 495 adc $t3,$Thi,$t3
1fa29843
AP
496 str $t3, [$ctx,#$Hoff+$hi]
497
498 add sp,sp,#640
499 sub $Ktbl,$Ktbl,#640
500
501 teq $inp,$len
502 bne .Loop
503
504 add sp,sp,#8*9 @ destroy frame
1e863180
AP
505#if __ARM_ARCH__>=5
506 ldmia sp!,{r4-r12,pc}
507#else
1fa29843
AP
508 ldmia sp!,{r4-r12,lr}
509 tst lr,#1
510 moveq pc,lr @ be binary compatible with V4, yet
511 bx lr @ interoperable with Thumb ISA:-)
1e863180 512#endif
b1a5d1c6 513.size sha512_block_data_order,.-sha512_block_data_order
1e863180
AP
514___
515
516{
# Rotate/shift amounts interpolated into the NEON code. For @sigma0 and
# @sigma1 the third entry is only ever used as a plain right-shift count.
517my @Sigma0=(28,34,39);
518my @Sigma1=(14,18,41);
519my @sigma0=(1, 8, 7);
520my @sigma1=(19,61,6);
521
# Within this block $Ktbl is r3 rather than the r14 used by the integer code.
522my $Ktbl="r3";
523my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
524
# Message words live in d0-d15; the working variables a..h live in d16-d23.
# @V is the rotating view of those eight names that is passed to the round
# generators, while $A..$H keep the fixed names.
525my @X=map("d$_",(0..15));
526my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
527
# NEON_00_15($i, a, b, c, d, e, f, g, h): append the NEON code for round $i
# to $code. The eight trailing arguments are the register names currently
# playing the roles of the working variables.
#
# The first heredoc (the three Sigma1 right shifts, the message load and the
# deferred Maj addition) is emitted only when $i<16 or $i is odd; for the
# remaining rounds NEON_16_79 emits the equivalent shifts itself before
# calling in here. The #if lines inside the heredocs are written to the
# output with $i interpolated, so they are evaluated when the generated file
# is preprocessed, not by this script.
#
# The emitted round leaves Sigma0(a) in $h and Maj(a,b,c)+T1 in $Maj; adding
# $Maj into $h is left to the start of the next round (or to the code after
# the last round), which is why the final vadd is commented out.
528sub NEON_00_15() {
529my $i=shift;
530my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Seven names are taken from the eight produced by the map, so $Maj is d30 and
# d31 is not named here.
531my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
532
533$code.=<<___ if ($i<16 || $i&1);
534 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
535#if $i<16
536 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
537#endif
538 vshr.u64 $t1,$e,#@Sigma1[1]
f26328c2
AP
539#if $i>0
540 vadd.i64 $a,$Maj @ h+=Maj from the past
541#endif
1e863180
AP
542 vshr.u64 $t2,$e,#@Sigma1[2]
543___
544$code.=<<___;
545 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
546 vsli.64 $t0,$e,#`64-@Sigma1[0]`
547 vsli.64 $t1,$e,#`64-@Sigma1[1]`
482a7d80 548 vmov $Ch,$e
1e863180
AP
549 vsli.64 $t2,$e,#`64-@Sigma1[2]`
550#if $i<16 && defined(__ARMEL__)
551 vrev64.8 @X[$i],@X[$i]
552#endif
482a7d80 553 veor $t1,$t0
f26328c2 554 vbsl $Ch,$f,$g @ Ch(e,f,g)
1e863180 555 vshr.u64 $t0,$a,#@Sigma0[0]
482a7d80 556 veor $t2,$t1 @ Sigma1(e)
f26328c2 557 vadd.i64 $T1,$Ch,$h
1e863180 558 vshr.u64 $t1,$a,#@Sigma0[1]
1e863180 559 vsli.64 $t0,$a,#`64-@Sigma0[0]`
f26328c2
AP
560 vadd.i64 $T1,$t2
561 vshr.u64 $t2,$a,#@Sigma0[2]
562 vadd.i64 $K,@X[$i%16]
1e863180 563 vsli.64 $t1,$a,#`64-@Sigma0[1]`
f26328c2 564 veor $Maj,$a,$b
1e863180 565 vsli.64 $t2,$a,#`64-@Sigma0[2]`
1e863180 566 veor $h,$t0,$t1
f26328c2 567 vadd.i64 $T1,$K
482a7d80 568 vbsl $Maj,$c,$b @ Maj(a,b,c)
f26328c2 569 veor $h,$t2 @ Sigma0(a)
1e863180 570 vadd.i64 $d,$T1
f26328c2
AP
571 vadd.i64 $Maj,$T1
572 @ vadd.i64 $h,$Maj
1e863180
AP
573___
574}
575
# NEON_16_79($i, a, b, c, d, e, f, g, h): append the NEON code for a round
# that also has to extend the message schedule.
#
# Odd $i is handed to NEON_00_15 unchanged. Even $i first emits the schedule
# update for one q register (two 64-bit X words at once), interleaved with
# the deferred Maj addition and the three Sigma1 right shifts that
# NEON_00_15 skips for such rounds, and then calls NEON_00_15 for the round
# itself.
576sub NEON_16_79() {
577my $i=shift;
578
579if ($i&1) { &NEON_00_15($i,@_); return; }
580
581# 2x-vectorized, therefore runs every 2nd round
582my @X=map("q$_",(0..7)); # view @X as 128-bit vector
583my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
584my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
585my $e=@_[4]; # $e from NEON_00_15
# From here on $i indexes q registers; 2*$i below restores the round number.
586$i /= 2;
587$code.=<<___;
588 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
589 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
f26328c2 590 vadd.i64 @_[0],d30 @ h+=Maj from the past
1e863180
AP
591 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
592 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
593 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
594 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
595 veor $s1,$t0
596 vshr.u64 $t0,$s0,#@sigma0[0]
597 veor $s1,$t1 @ sigma1(X[i+14])
598 vshr.u64 $t1,$s0,#@sigma0[1]
599 vadd.i64 @X[$i%8],$s1
600 vshr.u64 $s1,$s0,#@sigma0[2]
601 vsli.64 $t0,$s0,#`64-@sigma0[0]`
602 vsli.64 $t1,$s0,#`64-@sigma0[1]`
603 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
604 veor $s1,$t0
605 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
606 vadd.i64 @X[$i%8],$s0
607 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
608 veor $s1,$t1 @ sigma0(X[i+1])
609 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
610 vadd.i64 @X[$i%8],$s1
611___
612 &NEON_00_15(2*$i,@_);
613}
614
# Emit the NEON entry point: prologue, 80 rounds, context update and return.
615$code.=<<___;
c1669e1c
AP
616#if __ARM_MAX_ARCH__>=7
617.arch armv7-a
1e863180
AP
618.fpu neon
619
b1a5d1c6
AP
620.global sha512_block_data_order_neon
621.type sha512_block_data_order_neon,%function
1e863180 622.align 4
b1a5d1c6 623sha512_block_data_order_neon:
1e863180
AP
624.LNEON:
625 dmb @ errata #451034 on early Cortex A8
b1a5d1c6 626 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
313e6ec1 627 adr $Ktbl,K512
b1a5d1c6 628 VFP_ABI_PUSH
1e863180
AP
629 vldmia $ctx,{$A-$H} @ load context
630.Loop_neon:
631___
# Rounds 0..15 are emitted fully unrolled. After each round the last name in
# @V is moved to the front, so the register that was h is passed as a next.
632for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
633$code.=<<___;
634 mov $cnt,#4
635.L16_79_neon:
636 subs $cnt,#1
637___
# Sixteen more rounds are emitted once; the generated loop runs them four
# times ($cnt starts at 4), which gives rounds 16..79.
638for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
639$code.=<<___;
640 bne .L16_79_neon
641
f26328c2 642 vadd.i64 $A,d30 @ h+=Maj from the past
1e863180
AP
643 vldmia $ctx,{d24-d31} @ load context to temp
644 vadd.i64 q8,q12 @ vectorized accumulate
645 vadd.i64 q9,q13
646 vadd.i64 q10,q14
647 vadd.i64 q11,q15
648 vstmia $ctx,{$A-$H} @ save context
649 teq $inp,$len
650 sub $Ktbl,#640 @ rewind K512
651 bne .Loop_neon
652
b1a5d1c6 653 VFP_ABI_POP
5dcf70a1 654 ret @ bx lr
b1a5d1c6 655.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
1e863180
AP
656#endif
657___
658}
659$code.=<<___;
1e863180 660.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
97a6a01f 661.align 2
b1a5d1c6 662#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
7b508cd1 663.extern OPENSSL_armcap_P
c1669e1c 664#endif
1fa29843
AP
665___
666
# Post-process the generated text: evaluate every backtick-quoted expression
# and substitute its value.
667$code =~ s/\`([^\`]*)\`/eval $1/gem;
# The order of the next two substitutions matters: a literal bx lr is turned
# into its raw encoding first, and only then is ret rewritten to bx lr, so
# the instructions that came from ret stay in mnemonic form.
7722e53f 668$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
5dcf70a1 669$code =~ s/\bret\b/bx lr/gm;
b1a5d1c6
AP
670
# Copy the leading comment header of this script to the output with the
# leading # of each line changed to @, skipping the #! line and stopping at
# the first line that is neither a comment nor empty.
# NOTE(review): this open is unchecked; if it fails the header is silently
# omitted.
671open SELF,$0;
672while(<SELF>) {
673 next if (/^#!/);
674 last if (!s/^#/@/ and !/^$/);
675 print;
676}
677close SELF;
678
1fa29843 679print $code;
a21314db 680close STDOUT or die "error closing STDOUT: $!"; # enforce flush