#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.

$output = pop and open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");		# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");			# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

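# For reference only: the SHA-256 round functions that the assembly below
# implements, written out as plain Perl. This sketch is not used by the
# generator; it merely documents the formulas referred to in the comments.
# Note that the assembly expresses each 32-bit ROTR(x,n) as ROTL(x,32-n),
# e.g. Sigma0's rotations by 2, 13 and 22 appear below as ROTL 30, 19, 10.
sub ROTR   { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub Ch     { my ($e,$f,$g)=@_; (($e&$f) ^ (~$e&$g)) & 0xffffffff; }
sub Maj    { my ($a,$b,$c)=@_; (($a|$b)&$c) | ($a&$b); }
sub Sigma0 { my ($a)=@_; ROTR($a,2)  ^ ROTR($a,13) ^ ROTR($a,22); }
sub Sigma1 { my ($e)=@_; ROTR($e,6)  ^ ROTR($e,11) ^ ROTR($e,25); }
sub sigma0 { my ($x)=@_; ROTR($x,7)  ^ ROTR($x,18) ^ ($x>>3); }
sub sigma1 { my ($x)=@_; ROTR($x,17) ^ ROTR($x,19) ^ ($x>>10); }
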
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	__sha256_block,B2
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
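; AMR below is loaded with what appears to be 0x50404: ring-buffer
; pointers A5/B5 in circular addressing mode with a BK0 block size of
; 64 bytes, which lines up with the 64-byte stack alignment of the X[]
; buffer above.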
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-__sha256_block),$K256
	.endif
   [A0]	MVC	B1,AMR			; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
	LDW	*${CTXA}[0],$A		; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn		; pre-fetch input
	LDW	*$K256++,$K		; pre-fetch K256[0]
	MVK	14,B0			; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8			; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
	MV	$B,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0		; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn		; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X15
	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
||	ROTL	$B,0,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2

	SPLOOPD	10			; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0		; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1		; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0		; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0		; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1		; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0		; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1		; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H			; h = g
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G			; g = f
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*++$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$B,$C			; c = b
	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
||	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn		; pre-fetch input
|| [A0]	ADDK	-260,$K256		; rewind K256
||	ADD	$Actx,$A,$A		; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K		; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP			; restore stack pointer
||[!A0]	LDW	*FP[0],FP		; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]		; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;
close STDOUT or die "error closing STDOUT: $!";