#! /usr/bin/env perl
# Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
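#
# Illustration (not executed by this script; a sketch of what the lo/hi
# offsets mean): on a little-endian build LO/HI resolve to 0/4, so the
# 64-bit word h[0] is fetched as
#	ldr	r5,[r0,#0+LO]	@ low  half of h[0]
#	ldr	r6,[r0,#0+HI]	@ high half of h[0]
# and the halves swap on big-endian. The loads below compose their
# offsets from Aoff..Hoff plus lo/hi in the same way.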

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
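
# Typical invocation (an assumption, following the usual perlasm pattern;
# nothing in this file mandates it):
#	perl sha512-armv4.pl linux32 sha512-armv4.S
# where the flavour selects the arm-xlate.pl dialect and the output file
# receives the generated assembly.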

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

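# BODY_00_15 emits one SHA-512 round, with every 64-bit quantity split
# into lo/hi 32-bit halves and carries propagated via adds/adc:
#	T  = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T;  h = T + Sigma0(a) + Maj(a,b,c)
# where Maj(a,b,c) is computed as (a&b)|((a|b)&c). The $magic argument
# is matched against the low byte of K[i].lo; it is chosen to be unique
# to the last K[] entry of the respective loop (0x94 ends rounds 0-15,
# 0x17 ends rounds 16-79, see K512 below), and a match sets bit 0 of
# $Ktbl as the loop-exit flag tested after each round.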
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef __thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}

my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	hi0,lo0, hi1,lo1
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.text

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
# endif
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
	@ LO		lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#ifdef __thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

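# NEON_00_15 emits one round using 64-bit NEON lanes: vshr.u64/vsli.64
# pairs build the rotates of Sigma1(e) and Sigma0(a), and vbsl computes
# Ch(e,f,g) and Maj(a,b,c) as bit-selects. Note that the final h+=Maj
# addition is deferred to the start of the following round (the
# "h+=Maj from the past" comments) to shorten the dependency chain.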
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

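# NEON_16_79 extends the message schedule two entries at a time: the
# sixteen 64-bit X[] values are viewed as eight q registers, so sigma0
# and sigma1 are computed on 128-bit vectors. The schedule update runs
# only on even calls; odd calls fall straight through to NEON_00_15.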
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30		@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
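# The eight working variables a..h are never moved between registers;
# instead @V is rotated after every generated round (unshift/pop below),
# so each call to NEON_00_15/NEON_16_79 sees the registers in the role
# they play for that round.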
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern	OPENSSL_armcap_P
#endif
___

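# Post-process the generated code: evaluate the `...` constant
# expressions (such as the 64-n shift counts) at generation time,
# encode literal "bx lr" as a raw .word so the file still assembles
# with -march=armv4, and expand the "ret" pseudo-instruction to "bx lr".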
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

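# Reproduce this file's leading comment block in the output, rewriting
# the '#' comment leader as the assembler's '@' and stopping at the
# first line that is neither a comment nor blank.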
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT or die "error closing STDOUT: $!";	# enforce flush