#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 6% improvement
# on the Cortex-A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex-A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex-A8 it was measured to process
# one byte in 23.3 cycles, or ~60% faster than the integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms that is 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could not
# be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the *lower*
# address, which was reflected in the two parameters below as 0 and 4.
# Now the caller is expected to maintain native byte order for whole
# 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
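#
# For example, on a little-endian (__ARMEL__) build LO is defined as 0 and
# HI as 4, so the low 32 bits of a 64-bit word at address p sit at p+0 and
# the high 32 bits at p+4; a big-endian build swaps the two offsets. The
# $lo/$hi strings above are interpolated into the ldr/str offsets further
# down, so the same source emits correct accesses for either byte order.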

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
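
# The eight 64-bit working variables a..h live in the stack frame at the
# 8-byte offsets above, with a sliding 16-entry window of the message
# schedule X[] starting at $Xoff. Only a, e and the running value T are
# kept in registers across a round; ARMv4 simply has too few registers to
# hold eight 64-bit values at once.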

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef __thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
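
# BODY_00_15 emits one SHA-512 round on 32-bit register pairs:
#
#	T     = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	new e = d + T
#	new a = T + Sigma0(a) + Maj(a,b,c)
#
# 64-bit additions are carried out with adds/adc pairs and the 64-bit
# rotations are assembled from the lsr/lsl halves commented above. Each
# round also does "sub sp,sp,#8", so the 16 most recent X[] values stay at
# fixed offsets from sp; the resulting 80*8=640 bytes are released in one
# go after the last round. The $magic value is the low byte of the last
# K[i].lo of a phase (0x94 for K[15], 0x17 for K[79]); when it is seen,
# bit 0 of $Ktbl is set as an end-of-phase flag that the callers test
# with "tst $Ktbl,#1".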
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
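
	@ .LOPENSSL_armcap stores the distance from .Lsha512_block_data_order
	@ to OPENSSL_armcap_P, so the capability word can be located position-
	@ independently: r3 receives the label address and "ldr r12,[r3,r12]"
	@ below fetches the flags that decide whether to take the NEON path.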

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#ifdef __thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
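
# In the NEON path every 64-bit working variable occupies one d register,
# 64-bit rotations are formed with vshr.u64/vsli.64 pairs, and Ch/Maj are
# single vbsl bit-selects. The final "h += Maj" of a round is deferred: it
# is folded into the next round's "vadd.i64 $a,$Maj" ("from the past"),
# presumably to shorten the dependency chain, and the last outstanding Maj
# is flushed with "vadd.i64 $A,d30" before the context is saved.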

sub NEON_16_79() {
my $i=shift;

if ($i&1) { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
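
# NEON_16_79 updates the message schedule two entries at a time: the
# sixteen 64-bit X[] words are viewed as eight q registers, so one pass of
# sigma0/sigma1 arithmetic produces X[i] and X[i+1] together before
# falling through to NEON_00_15 for the round itself; odd-numbered rounds
# go straight to NEON_00_15.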

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640		@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

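# Copy the comment header of this script into the generated output,
# turning '#' comments into the assembler's '@' comments; stop at the
# first line that is neither a comment nor blank.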
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush