#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.

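# The prologue below programs AMR with 0x00050404 which, per the C64x+
# AMR field layout, selects circular addressing for A5 and B5 ($Xia/$Xib)
# with a BK0 block size of 2^(5+1) = 64 bytes -- hence the 64-byte stack
# alignment. An interrupt service routine following the advice above can,
# for instance, clear AMR on entry the same way this module's epilogue
# does (illustrative sketch only, not part of the emitted code):
#
#	MVK	0,B0
#	MVC	B0,AMR		; force linear addressing inside the ISR
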
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

($CTXA,$INP,$NUM) = ("A4","B4","A6");	# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");		# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

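# For reference, the rounds below implement the FIPS 180-4 recurrence
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
#
# where Ch(e,f,g) = (e&f)^(~e&g) and Maj(a,b,c) is computed as
# ((a|b)&c)|(a&b), an equivalent form of the textbook (a&b)^(a&c)^(b&c).
# The 16-word message schedule X[] lives in the 64-byte stack buffer and
# is walked through $Xia/$Xib, which wrap automatically thanks to the
# circular addressing configured via AMR.
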
$code.=<<___;
	.text
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	_sha256_block_data_order,B2
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
	.endif
   [A0]	MVC	B1,AMR			; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
	LDW	*${CTXA}[0],$A		; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn		; pre-fetch input
	LDW	*$K256++,$K		; pre-fetch K256[0]
	MVK	14,B0			; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8			; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
	MV	$B,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0		; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn		; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X15
	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
||	ROTL	$B,0,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2

	SPLOOPD	10			; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0		; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1		; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0		; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0		; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1		; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0		; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1		; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H			; h = g
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G			; g = f
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*++$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$B,$C			; c = b
	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
||	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn		; pre-fetch input
|| [A0]	ADDK	-260,$K256		; rewind K256
||	ADD	$Actx,$A,$A		; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K		; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP			; restore stack pointer
||[!A0]	LDW	*FP[0],FP		; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]		; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;
close STDOUT or die "error closing STDOUT: $!";