]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
f4e175e4 AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # ChaCha20 for C64x+. | |
18 | # | |
19 | # October 2015 | |
20 | # | |
21 | # Performance is 3.54 cycles per processed byte, which is ~4.3 times | |
22 | # faster than code generated by TI compiler. Compiler also disables | |
23 | # interrupts for some reason, thus making interrupt response time | |
24 | # dependent on input length. This module on the other hand is free | |
25 | # from such limitation. | |
26 | ||
df0cb57c RL |
27 | $output=pop; # last command-line argument names the file that receives the generated assembly |
28 | if (defined $output) { open STDOUT,">",$output or die "can't open $output: $!"; } # checked three-arg open: the name is never parsed for a mode, and a failed open no longer leaves STDOUT silently closed; with no argument the code goes to the inherited STDOUT | |
29 | ||
f4e175e4 AP |
30 | ($OUT,$INP,$LEN,$KEYB,$COUNTERA)=("A4","B4","A6","B6","A8"); # registers read at entry: output pointer, input pointer, byte length, key pointer, counter/nonce pointer |
31 | ($KEYA,$COUNTERB,$STEP)=("A7","B7","A3"); # opposite-side copies of the key and counter pointers; $STEP is the number of bytes consumed per outer iteration (128 or 64) | |
32 | ||
33 | @X= ("A16","B16","A17","B17","A18","B18","A19","B19", | |
34 | "A20","B20","A21","B21","A22","B22","A23","B23"); # 16-word working state that the rounds transform (first block in the 2x path) | |
35 | @Y= ("A24","B24","A25","B25","A26","B26","A27","B27", | |
36 | "A28","B28","A29","B29","A30","B30","A31","B31"); # 1x path: untouched key/counter block added back after the rounds; 2x path: working state of the second block | |
37 | @DAT=("A6", "A7", "B6", "B7", "A8", "A9", "B8", "B9", | |
38 | "A10","A11","B10","B11","A12","A13","B12","B13"); # destination registers for the 64 bytes of input loaded per block | |
39 | ||
40 | # yes, overlaps with @DAT, used only in 2x interleave code path... | |
41 | @K2x=("A6", "B6", "A7", "B7", "A8", "B8", "A9", "B9", | |
42 | "A10","B10","A11","B11","A2", "B2", "A13","B13"); # key material copy for the 2x path; words 12 and 13 use A2/B2, which are not in @DAT, so the counter kept in @K2x[12] is not overwritten by the input loads | |
43 | ||
44 | $code.=<<___; | |
45 | .text | |
46 | ||
47 | .if .ASSEMBLER_VERSION<7000000 | |
48 | .asg 0,__TI_EABI__ | |
49 | .endif | |
50 | .if __TI_EABI__ | |
51 | .asg ChaCha20_ctr32,_ChaCha20_ctr32 | |
52 | .endif | |
53 | ||
54 | .asg B3,RA | |
55 | .asg A15,FP | |
56 | .asg B15,SP | |
57 | ||
58 | .global _ChaCha20_ctr32 | |
59 | .align 32 | |
60 | _ChaCha20_ctr32: | |
61 | .asmfunc stack_usage(40+64) | |
62 | MV $LEN,A0 ; reassign | |
63 | [!A0] BNOP RA ; no data | |
64 | || [A0] STW FP,*SP--(40+64) ; save frame pointer and alloca(40+64) | |
65 | || [A0] MV SP,FP | |
66 | [A0] STDW B13:B12,*SP[4+8] ; ABI says so | |
67 | || [A0] MV $KEYB,$KEYA | |
68 | || [A0] MV $COUNTERA,$COUNTERB | |
69 | [A0] STDW B11:B10,*SP[3+8] | |
70 | || [A0] STDW A13:A12,*FP[-3] | |
71 | [A0] STDW A11:A10,*FP[-4] | |
72 | || [A0] MVK 128,$STEP ; 2 * input block size | |
73 | ||
74 | [A0] LDW *${KEYA}[0],@Y[4] ; load key | |
75 | || [A0] LDW *${KEYB}[1],@Y[5] | |
76 | || [A0] MVK 0x00007865,@Y[0] ; synthesize sigma | |
77 | || [A0] MVK 0x0000646e,@Y[1] | |
78 | [A0] LDW *${KEYA}[2],@Y[6] | |
79 | || [A0] LDW *${KEYB}[3],@Y[7] | |
80 | || [A0] MVKH 0x61700000,@Y[0] | |
81 | || [A0] MVKH 0x33200000,@Y[1] | |
82 | LDW *${KEYA}[4],@Y[8] | |
83 | || LDW *${KEYB}[5],@Y[9] | |
84 | || MVK 0x00002d32,@Y[2] | |
85 | || MVK 0x00006574,@Y[3] | |
86 | LDW *${KEYA}[6],@Y[10] | |
87 | || LDW *${KEYB}[7],@Y[11] | |
88 | || MVKH 0x79620000,@Y[2] | |
89 | || MVKH 0x6b200000,@Y[3] | |
90 | LDW *${COUNTERA}[0],@Y[12] ; load counter||nonce | |
91 | || LDW *${COUNTERB}[1],@Y[13] | |
92 | || CMPLTU A0,$STEP,A1 ; is length < 2*blocks? | |
93 | LDW *${COUNTERA}[2],@Y[14] | |
94 | || LDW *${COUNTERB}[3],@Y[15] | |
95 | || [A1] BNOP top1x? | |
96 | [A1] MVK 64,$STEP ; input block size | |
97 | || MVK 10,B0 ; inner loop counter | |
98 | ||
99 | DMV @Y[2],@Y[0],@X[2]:@X[0] ; copy block | |
100 | || DMV @Y[3],@Y[1],@X[3]:@X[1] | |
101 | ||[!A1] STDW @Y[2]:@Y[0],*FP[-12] ; offload key material to stack | |
102 | ||[!A1] STDW @Y[3]:@Y[1],*SP[2] | |
103 | DMV @Y[6],@Y[4],@X[6]:@X[4] | |
104 | || DMV @Y[7],@Y[5],@X[7]:@X[5] | |
105 | ||[!A1] STDW @Y[6]:@Y[4],*FP[-10] | |
106 | ||[!A1] STDW @Y[7]:@Y[5],*SP[4] | |
107 | DMV @Y[10],@Y[8],@X[10]:@X[8] | |
108 | || DMV @Y[11],@Y[9],@X[11]:@X[9] | |
109 | ||[!A1] STDW @Y[10]:@Y[8],*FP[-8] | |
110 | ||[!A1] STDW @Y[11]:@Y[9],*SP[6] | |
111 | DMV @Y[14],@Y[12],@X[14]:@X[12] | |
112 | || DMV @Y[15],@Y[13],@X[15]:@X[13] | |
113 | ||[!A1] MV @Y[12],@K2x[12] ; counter | |
114 | ||[!A1] MV @Y[13],@K2x[13] | |
115 | ||[!A1] STW @Y[14],*FP[-6*2] | |
116 | ||[!A1] STW @Y[15],*SP[8*2] | |
117 | ___ | |
118 | { ################################################################ | |
119 | # 2x interleave gives 50% performance improvement | |
120 | # | |
121 | my ($a0,$a1,$a2,$a3) = (0..3); | |
122 | my ($b0,$b1,$b2,$b3) = (4..7); | |
123 | my ($c0,$c1,$c2,$c3) = (8..11); | |
124 | my ($d0,$d1,$d2,$d3) = (12..15); | |
125 | ||
126 | $code.=<<___; | |
127 | outer2x?: | |
128 | ADD @X[$b1],@X[$a1],@X[$a1] | |
129 | || ADD @X[$b2],@X[$a2],@X[$a2] | |
130 | || ADD @X[$b0],@X[$a0],@X[$a0] | |
131 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
132 | || DMV @Y[2],@Y[0],@K2x[2]:@K2x[0] | |
133 | || DMV @Y[3],@Y[1],@K2x[3]:@K2x[1] | |
134 | XOR @X[$a1],@X[$d1],@X[$d1] | |
135 | || XOR @X[$a2],@X[$d2],@X[$d2] | |
136 | || XOR @X[$a0],@X[$d0],@X[$d0] | |
137 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
138 | || DMV @Y[6],@Y[4],@K2x[6]:@K2x[4] | |
139 | || DMV @Y[7],@Y[5],@K2x[7]:@K2x[5] | |
140 | SWAP2 @X[$d1],@X[$d1] ; rotate by 16 | |
141 | || SWAP2 @X[$d2],@X[$d2] | |
142 | || SWAP2 @X[$d0],@X[$d0] | |
143 | || SWAP2 @X[$d3],@X[$d3] | |
144 | ||
145 | ADD @X[$d1],@X[$c1],@X[$c1] | |
146 | || ADD @X[$d2],@X[$c2],@X[$c2] | |
147 | || ADD @X[$d0],@X[$c0],@X[$c0] | |
148 | || ADD @X[$d3],@X[$c3],@X[$c3] | |
149 | || DMV @Y[10],@Y[8],@K2x[10]:@K2x[8] | |
150 | || DMV @Y[11],@Y[9],@K2x[11]:@K2x[9] | |
151 | XOR @X[$c1],@X[$b1],@X[$b1] | |
152 | || XOR @X[$c2],@X[$b2],@X[$b2] | |
153 | || XOR @X[$c0],@X[$b0],@X[$b0] | |
154 | || XOR @X[$c3],@X[$b3],@X[$b3] | |
155 | || ADD 1,@Y[12],@Y[12] ; adjust counter for 2nd block | |
156 | ROTL @X[$b1],12,@X[$b1] | |
157 | || ROTL @X[$b2],12,@X[$b2] | |
158 | || MV @Y[14],@K2x[14] | |
159 | || MV @Y[15],@K2x[15] | |
160 | top2x?: | |
161 | ROTL @X[$b0],12,@X[$b0] | |
162 | || ROTL @X[$b3],12,@X[$b3] | |
163 | || ADD @Y[$b1],@Y[$a1],@Y[$a1] | |
164 | || ADD @Y[$b2],@Y[$a2],@Y[$a2] | |
165 | ADD @Y[$b0],@Y[$a0],@Y[$a0] | |
166 | || ADD @Y[$b3],@Y[$a3],@Y[$a3] | |
167 | ||
168 | || ADD @X[$b1],@X[$a1],@X[$a1] | |
169 | || ADD @X[$b2],@X[$a2],@X[$a2] | |
170 | || XOR @Y[$a1],@Y[$d1],@Y[$d1] | |
171 | || XOR @Y[$a2],@Y[$d2],@Y[$d2] | |
172 | XOR @Y[$a0],@Y[$d0],@Y[$d0] | |
173 | || XOR @Y[$a3],@Y[$d3],@Y[$d3] | |
174 | || ADD @X[$b0],@X[$a0],@X[$a0] | |
175 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
176 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
177 | || XOR @X[$a2],@X[$d2],@X[$d2] | |
178 | XOR @X[$a0],@X[$d0],@X[$d0] | |
179 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
180 | || ROTL @X[$d1],8,@X[$d1] | |
181 | || ROTL @X[$d2],8,@X[$d2] | |
182 | || SWAP2 @Y[$d1],@Y[$d1] ; rotate by 16 | |
183 | || SWAP2 @Y[$d2],@Y[$d2] | |
184 | || SWAP2 @Y[$d0],@Y[$d0] | |
185 | || SWAP2 @Y[$d3],@Y[$d3] | |
186 | ROTL @X[$d0],8,@X[$d0] | |
187 | || ROTL @X[$d3],8,@X[$d3] | |
188 | || ADD @Y[$d1],@Y[$c1],@Y[$c1] | |
189 | || ADD @Y[$d2],@Y[$c2],@Y[$c2] | |
190 | || ADD @Y[$d0],@Y[$c0],@Y[$c0] | |
191 | || ADD @Y[$d3],@Y[$c3],@Y[$c3] | |
192 | || BNOP middle2x1? ; protect from interrupt | |
193 | ||
194 | ADD @X[$d1],@X[$c1],@X[$c1] | |
195 | || ADD @X[$d2],@X[$c2],@X[$c2] | |
196 | || XOR @Y[$c1],@Y[$b1],@Y[$b1] | |
197 | || XOR @Y[$c2],@Y[$b2],@Y[$b2] | |
198 | || XOR @Y[$c0],@Y[$b0],@Y[$b0] | |
199 | || XOR @Y[$c3],@Y[$b3],@Y[$b3] | |
200 | ADD @X[$d0],@X[$c0],@X[$c0] | |
201 | || ADD @X[$d3],@X[$c3],@X[$c3] | |
202 | || XOR @X[$c1],@X[$b1],@X[$b1] | |
203 | || XOR @X[$c2],@X[$b2],@X[$b2] | |
204 | || ROTL @X[$d1],0,@X[$d2] ; moved to avoid cross-path stall | |
205 | || ROTL @X[$d2],0,@X[$d3] | |
206 | XOR @X[$c0],@X[$b0],@X[$b0] | |
207 | || XOR @X[$c3],@X[$b3],@X[$b3] | |
208 | || MV @X[$d0],@X[$d1] | |
209 | || MV @X[$d3],@X[$d0] | |
210 | || ROTL @Y[$b1],12,@Y[$b1] | |
211 | || ROTL @Y[$b2],12,@Y[$b2] | |
212 | ROTL @X[$b1],7,@X[$b0] ; avoided cross-path stall | |
213 | || ROTL @X[$b2],7,@X[$b1] | |
214 | ROTL @X[$b0],7,@X[$b3] | |
215 | || ROTL @X[$b3],7,@X[$b2] | |
216 | middle2x1?: | |
217 | ||
218 | ROTL @Y[$b0],12,@Y[$b0] | |
219 | || ROTL @Y[$b3],12,@Y[$b3] | |
220 | || ADD @X[$b0],@X[$a0],@X[$a0] | |
221 | || ADD @X[$b1],@X[$a1],@X[$a1] | |
222 | ADD @X[$b2],@X[$a2],@X[$a2] | |
223 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
224 | ||
225 | || ADD @Y[$b1],@Y[$a1],@Y[$a1] | |
226 | || ADD @Y[$b2],@Y[$a2],@Y[$a2] | |
227 | || XOR @X[$a0],@X[$d0],@X[$d0] | |
228 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
229 | XOR @X[$a2],@X[$d2],@X[$d2] | |
230 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
231 | || ADD @Y[$b0],@Y[$a0],@Y[$a0] | |
232 | || ADD @Y[$b3],@Y[$a3],@Y[$a3] | |
233 | || XOR @Y[$a1],@Y[$d1],@Y[$d1] | |
234 | || XOR @Y[$a2],@Y[$d2],@Y[$d2] | |
235 | XOR @Y[$a0],@Y[$d0],@Y[$d0] | |
236 | || XOR @Y[$a3],@Y[$d3],@Y[$d3] | |
237 | || ROTL @Y[$d1],8,@Y[$d1] | |
238 | || ROTL @Y[$d2],8,@Y[$d2] | |
239 | || SWAP2 @X[$d0],@X[$d0] ; rotate by 16 | |
240 | || SWAP2 @X[$d1],@X[$d1] | |
241 | || SWAP2 @X[$d2],@X[$d2] | |
242 | || SWAP2 @X[$d3],@X[$d3] | |
243 | ROTL @Y[$d0],8,@Y[$d0] | |
244 | || ROTL @Y[$d3],8,@Y[$d3] | |
245 | || ADD @X[$d0],@X[$c2],@X[$c2] | |
246 | || ADD @X[$d1],@X[$c3],@X[$c3] | |
247 | || ADD @X[$d2],@X[$c0],@X[$c0] | |
248 | || ADD @X[$d3],@X[$c1],@X[$c1] | |
249 | || BNOP middle2x2? ; protect from interrupt | |
250 | ||
251 | ADD @Y[$d1],@Y[$c1],@Y[$c1] | |
252 | || ADD @Y[$d2],@Y[$c2],@Y[$c2] | |
253 | || XOR @X[$c2],@X[$b0],@X[$b0] | |
254 | || XOR @X[$c3],@X[$b1],@X[$b1] | |
255 | || XOR @X[$c0],@X[$b2],@X[$b2] | |
256 | || XOR @X[$c1],@X[$b3],@X[$b3] | |
257 | ADD @Y[$d0],@Y[$c0],@Y[$c0] | |
258 | || ADD @Y[$d3],@Y[$c3],@Y[$c3] | |
259 | || XOR @Y[$c1],@Y[$b1],@Y[$b1] | |
260 | || XOR @Y[$c2],@Y[$b2],@Y[$b2] | |
261 | || ROTL @Y[$d1],0,@Y[$d2] ; moved to avoid cross-path stall | |
262 | || ROTL @Y[$d2],0,@Y[$d3] | |
263 | XOR @Y[$c0],@Y[$b0],@Y[$b0] | |
264 | || XOR @Y[$c3],@Y[$b3],@Y[$b3] | |
265 | || MV @Y[$d0],@Y[$d1] | |
266 | || MV @Y[$d3],@Y[$d0] | |
267 | || ROTL @X[$b0],12,@X[$b0] | |
268 | || ROTL @X[$b1],12,@X[$b1] | |
269 | ROTL @Y[$b1],7,@Y[$b0] ; avoided cross-path stall | |
270 | || ROTL @Y[$b2],7,@Y[$b1] | |
271 | ROTL @Y[$b0],7,@Y[$b3] | |
272 | || ROTL @Y[$b3],7,@Y[$b2] | |
273 | middle2x2?: | |
274 | ||
275 | ROTL @X[$b2],12,@X[$b2] | |
276 | || ROTL @X[$b3],12,@X[$b3] | |
277 | || ADD @Y[$b0],@Y[$a0],@Y[$a0] | |
278 | || ADD @Y[$b1],@Y[$a1],@Y[$a1] | |
279 | ADD @Y[$b2],@Y[$a2],@Y[$a2] | |
280 | || ADD @Y[$b3],@Y[$a3],@Y[$a3] | |
281 | ||
282 | || ADD @X[$b0],@X[$a0],@X[$a0] | |
283 | || ADD @X[$b1],@X[$a1],@X[$a1] | |
284 | || XOR @Y[$a0],@Y[$d0],@Y[$d0] | |
285 | || XOR @Y[$a1],@Y[$d1],@Y[$d1] | |
286 | XOR @Y[$a2],@Y[$d2],@Y[$d2] | |
287 | || XOR @Y[$a3],@Y[$d3],@Y[$d3] | |
288 | || ADD @X[$b2],@X[$a2],@X[$a2] | |
289 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
290 | || XOR @X[$a0],@X[$d0],@X[$d0] | |
291 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
292 | XOR @X[$a2],@X[$d2],@X[$d2] | |
293 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
294 | || ROTL @X[$d0],8,@X[$d0] | |
295 | || ROTL @X[$d1],8,@X[$d1] | |
296 | || SWAP2 @Y[$d0],@Y[$d0] ; rotate by 16 | |
297 | || SWAP2 @Y[$d1],@Y[$d1] | |
298 | || SWAP2 @Y[$d2],@Y[$d2] | |
299 | || SWAP2 @Y[$d3],@Y[$d3] | |
300 | ROTL @X[$d2],8,@X[$d2] | |
301 | || ROTL @X[$d3],8,@X[$d3] | |
302 | || ADD @Y[$d0],@Y[$c2],@Y[$c2] | |
303 | || ADD @Y[$d1],@Y[$c3],@Y[$c3] | |
304 | || ADD @Y[$d2],@Y[$c0],@Y[$c0] | |
305 | || ADD @Y[$d3],@Y[$c1],@Y[$c1] | |
306 | || BNOP bottom2x1? ; protect from interrupt | |
307 | ||
308 | ADD @X[$d0],@X[$c2],@X[$c2] | |
309 | || ADD @X[$d1],@X[$c3],@X[$c3] | |
310 | || XOR @Y[$c2],@Y[$b0],@Y[$b0] | |
311 | || XOR @Y[$c3],@Y[$b1],@Y[$b1] | |
312 | || XOR @Y[$c0],@Y[$b2],@Y[$b2] | |
313 | || XOR @Y[$c1],@Y[$b3],@Y[$b3] | |
314 | ADD @X[$d2],@X[$c0],@X[$c0] | |
315 | || ADD @X[$d3],@X[$c1],@X[$c1] | |
316 | || XOR @X[$c2],@X[$b0],@X[$b0] | |
317 | || XOR @X[$c3],@X[$b1],@X[$b1] | |
318 | || ROTL @X[$d0],0,@X[$d3] ; moved to avoid cross-path stall | |
319 | || ROTL @X[$d1],0,@X[$d0] | |
320 | XOR @X[$c0],@X[$b2],@X[$b2] | |
321 | || XOR @X[$c1],@X[$b3],@X[$b3] | |
322 | || MV @X[$d2],@X[$d1] | |
323 | || MV @X[$d3],@X[$d2] | |
324 | || ROTL @Y[$b0],12,@Y[$b0] | |
325 | || ROTL @Y[$b1],12,@Y[$b1] | |
326 | ROTL @X[$b0],7,@X[$b1] ; avoided cross-path stall | |
327 | || ROTL @X[$b1],7,@X[$b2] | |
328 | ROTL @X[$b2],7,@X[$b3] | |
329 | || ROTL @X[$b3],7,@X[$b0] | |
330 | || [B0] SUB B0,1,B0 ; decrement inner loop counter | |
331 | bottom2x1?: | |
332 | ||
333 | ROTL @Y[$b2],12,@Y[$b2] | |
334 | || ROTL @Y[$b3],12,@Y[$b3] | |
335 | || [B0] ADD @X[$b1],@X[$a1],@X[$a1] ; modulo-scheduled | |
336 | || [B0] ADD @X[$b2],@X[$a2],@X[$a2] | |
337 | [B0] ADD @X[$b0],@X[$a0],@X[$a0] | |
338 | || [B0] ADD @X[$b3],@X[$a3],@X[$a3] | |
339 | ||
340 | || ADD @Y[$b0],@Y[$a0],@Y[$a0] | |
341 | || ADD @Y[$b1],@Y[$a1],@Y[$a1] | |
342 | || [B0] XOR @X[$a1],@X[$d1],@X[$d1] | |
343 | || [B0] XOR @X[$a2],@X[$d2],@X[$d2] | |
344 | [B0] XOR @X[$a0],@X[$d0],@X[$d0] | |
345 | || [B0] XOR @X[$a3],@X[$d3],@X[$d3] | |
346 | || ADD @Y[$b2],@Y[$a2],@Y[$a2] | |
347 | || ADD @Y[$b3],@Y[$a3],@Y[$a3] | |
348 | || XOR @Y[$a0],@Y[$d0],@Y[$d0] | |
349 | || XOR @Y[$a1],@Y[$d1],@Y[$d1] | |
350 | XOR @Y[$a2],@Y[$d2],@Y[$d2] | |
351 | || XOR @Y[$a3],@Y[$d3],@Y[$d3] | |
352 | || ROTL @Y[$d0],8,@Y[$d0] | |
353 | || ROTL @Y[$d1],8,@Y[$d1] | |
354 | || [B0] SWAP2 @X[$d1],@X[$d1] ; rotate by 16 | |
355 | || [B0] SWAP2 @X[$d2],@X[$d2] | |
356 | || [B0] SWAP2 @X[$d0],@X[$d0] | |
357 | || [B0] SWAP2 @X[$d3],@X[$d3] | |
358 | ROTL @Y[$d2],8,@Y[$d2] | |
359 | || ROTL @Y[$d3],8,@Y[$d3] | |
360 | || [B0] ADD @X[$d1],@X[$c1],@X[$c1] | |
361 | || [B0] ADD @X[$d2],@X[$c2],@X[$c2] | |
362 | || [B0] ADD @X[$d0],@X[$c0],@X[$c0] | |
363 | || [B0] ADD @X[$d3],@X[$c3],@X[$c3] | |
364 | || [B0] BNOP top2x? ; even protects from interrupt | |
365 | ||
366 | ADD @Y[$d0],@Y[$c2],@Y[$c2] | |
367 | || ADD @Y[$d1],@Y[$c3],@Y[$c3] | |
368 | || [B0] XOR @X[$c1],@X[$b1],@X[$b1] | |
369 | || [B0] XOR @X[$c2],@X[$b2],@X[$b2] | |
370 | || [B0] XOR @X[$c0],@X[$b0],@X[$b0] | |
371 | || [B0] XOR @X[$c3],@X[$b3],@X[$b3] | |
372 | ADD @Y[$d2],@Y[$c0],@Y[$c0] | |
373 | || ADD @Y[$d3],@Y[$c1],@Y[$c1] | |
374 | || XOR @Y[$c2],@Y[$b0],@Y[$b0] | |
375 | || XOR @Y[$c3],@Y[$b1],@Y[$b1] | |
376 | || ROTL @Y[$d0],0,@Y[$d3] ; moved to avoid cross-path stall | |
377 | || ROTL @Y[$d1],0,@Y[$d0] | |
378 | XOR @Y[$c0],@Y[$b2],@Y[$b2] | |
379 | || XOR @Y[$c1],@Y[$b3],@Y[$b3] | |
380 | || MV @Y[$d2],@Y[$d1] | |
381 | || MV @Y[$d3],@Y[$d2] | |
382 | || [B0] ROTL @X[$b1],12,@X[$b1] | |
383 | || [B0] ROTL @X[$b2],12,@X[$b2] | |
384 | ROTL @Y[$b0],7,@Y[$b1] ; avoided cross-path stall | |
385 | || ROTL @Y[$b1],7,@Y[$b2] | |
386 | ROTL @Y[$b2],7,@Y[$b3] | |
387 | || ROTL @Y[$b3],7,@Y[$b0] | |
388 | bottom2x2?: | |
389 | ___ | |
390 | } | |
391 | ||
392 | $code.=<<___; | |
393 | ADD @K2x[0],@X[0],@X[0] ; accumulate key material | |
394 | || ADD @K2x[1],@X[1],@X[1] | |
395 | || ADD @K2x[2],@X[2],@X[2] | |
396 | || ADD @K2x[3],@X[3],@X[3] | |
397 | ADD @K2x[0],@Y[0],@Y[0] | |
398 | || ADD @K2x[1],@Y[1],@Y[1] | |
399 | || ADD @K2x[2],@Y[2],@Y[2] | |
400 | || ADD @K2x[3],@Y[3],@Y[3] | |
401 | || LDNDW *${INP}++[8],@DAT[1]:@DAT[0] | |
402 | ADD @K2x[4],@X[4],@X[4] | |
403 | || ADD @K2x[5],@X[5],@X[5] | |
404 | || ADD @K2x[6],@X[6],@X[6] | |
405 | || ADD @K2x[7],@X[7],@X[7] | |
406 | || LDNDW *${INP}[-7],@DAT[3]:@DAT[2] | |
407 | ADD @K2x[4],@Y[4],@Y[4] | |
408 | || ADD @K2x[5],@Y[5],@Y[5] | |
409 | || ADD @K2x[6],@Y[6],@Y[6] | |
410 | || ADD @K2x[7],@Y[7],@Y[7] | |
411 | || LDNDW *${INP}[-6],@DAT[5]:@DAT[4] | |
412 | ADD @K2x[8],@X[8],@X[8] | |
413 | || ADD @K2x[9],@X[9],@X[9] | |
414 | || ADD @K2x[10],@X[10],@X[10] | |
415 | || ADD @K2x[11],@X[11],@X[11] | |
416 | || LDNDW *${INP}[-5],@DAT[7]:@DAT[6] | |
417 | ADD @K2x[8],@Y[8],@Y[8] | |
418 | || ADD @K2x[9],@Y[9],@Y[9] | |
419 | || ADD @K2x[10],@Y[10],@Y[10] | |
420 | || ADD @K2x[11],@Y[11],@Y[11] | |
421 | || LDNDW *${INP}[-4],@DAT[9]:@DAT[8] | |
422 | ADD @K2x[12],@X[12],@X[12] | |
423 | || ADD @K2x[13],@X[13],@X[13] | |
424 | || ADD @K2x[14],@X[14],@X[14] | |
425 | || ADD @K2x[15],@X[15],@X[15] | |
426 | || LDNDW *${INP}[-3],@DAT[11]:@DAT[10] | |
427 | ADD @K2x[12],@Y[12],@Y[12] | |
428 | || ADD @K2x[13],@Y[13],@Y[13] | |
429 | || ADD @K2x[14],@Y[14],@Y[14] | |
430 | || ADD @K2x[15],@Y[15],@Y[15] | |
431 | || LDNDW *${INP}[-2],@DAT[13]:@DAT[12] | |
432 | ADD 1,@Y[12],@Y[12] ; adjust counter for 2nd block | |
433 | || ADD 2,@K2x[12],@K2x[12] ; increment counter | |
434 | || LDNDW *${INP}[-1],@DAT[15]:@DAT[14] | |
435 | ||
436 | .if .BIG_ENDIAN | |
437 | SWAP2 @X[0],@X[0] | |
438 | || SWAP2 @X[1],@X[1] | |
439 | || SWAP2 @X[2],@X[2] | |
440 | || SWAP2 @X[3],@X[3] | |
441 | SWAP2 @X[4],@X[4] | |
442 | || SWAP2 @X[5],@X[5] | |
443 | || SWAP2 @X[6],@X[6] | |
444 | || SWAP2 @X[7],@X[7] | |
445 | SWAP2 @X[8],@X[8] | |
446 | || SWAP2 @X[9],@X[9] | |
447 | || SWAP4 @X[0],@X[1] | |
448 | || SWAP4 @X[1],@X[0] | |
449 | SWAP2 @X[10],@X[10] | |
450 | || SWAP2 @X[11],@X[11] | |
451 | || SWAP4 @X[2],@X[3] | |
452 | || SWAP4 @X[3],@X[2] | |
453 | SWAP2 @X[12],@X[12] | |
454 | || SWAP2 @X[13],@X[13] | |
455 | || SWAP4 @X[4],@X[5] | |
456 | || SWAP4 @X[5],@X[4] | |
457 | SWAP2 @X[14],@X[14] | |
458 | || SWAP2 @X[15],@X[15] | |
459 | || SWAP4 @X[6],@X[7] | |
460 | || SWAP4 @X[7],@X[6] | |
461 | SWAP4 @X[8],@X[9] | |
462 | || SWAP4 @X[9],@X[8] | |
463 | || SWAP2 @Y[0],@Y[0] | |
464 | || SWAP2 @Y[1],@Y[1] | |
465 | SWAP4 @X[10],@X[11] | |
466 | || SWAP4 @X[11],@X[10] | |
467 | || SWAP2 @Y[2],@Y[2] | |
468 | || SWAP2 @Y[3],@Y[3] | |
469 | SWAP4 @X[12],@X[13] | |
470 | || SWAP4 @X[13],@X[12] | |
471 | || SWAP2 @Y[4],@Y[4] | |
472 | || SWAP2 @Y[5],@Y[5] | |
473 | SWAP4 @X[14],@X[15] | |
474 | || SWAP4 @X[15],@X[14] | |
475 | || SWAP2 @Y[6],@Y[6] | |
476 | || SWAP2 @Y[7],@Y[7] | |
477 | SWAP2 @Y[8],@Y[8] | |
478 | || SWAP2 @Y[9],@Y[9] | |
479 | || SWAP4 @Y[0],@Y[1] | |
480 | || SWAP4 @Y[1],@Y[0] | |
481 | SWAP2 @Y[10],@Y[10] | |
482 | || SWAP2 @Y[11],@Y[11] | |
483 | || SWAP4 @Y[2],@Y[3] | |
484 | || SWAP4 @Y[3],@Y[2] | |
485 | SWAP2 @Y[12],@Y[12] | |
486 | || SWAP2 @Y[13],@Y[13] | |
487 | || SWAP4 @Y[4],@Y[5] | |
488 | || SWAP4 @Y[5],@Y[4] | |
489 | SWAP2 @Y[14],@Y[14] | |
490 | || SWAP2 @Y[15],@Y[15] | |
491 | || SWAP4 @Y[6],@Y[7] | |
492 | || SWAP4 @Y[7],@Y[6] | |
493 | SWAP4 @Y[8],@Y[9] | |
494 | || SWAP4 @Y[9],@Y[8] | |
495 | SWAP4 @Y[10],@Y[11] | |
496 | || SWAP4 @Y[11],@Y[10] | |
497 | SWAP4 @Y[12],@Y[13] | |
498 | || SWAP4 @Y[13],@Y[12] | |
499 | SWAP4 @Y[14],@Y[15] | |
500 | || SWAP4 @Y[15],@Y[14] | |
501 | .endif | |
502 | ||
503 | XOR @DAT[0],@X[0],@X[0] ; xor 1st block | |
504 | || XOR @DAT[3],@X[3],@X[3] | |
505 | || XOR @DAT[2],@X[2],@X[1] | |
506 | || XOR @DAT[1],@X[1],@X[2] | |
507 | || LDNDW *${INP}++[8],@DAT[1]:@DAT[0] | |
508 | XOR @DAT[4],@X[4],@X[4] | |
509 | || XOR @DAT[7],@X[7],@X[7] | |
510 | || LDNDW *${INP}[-7],@DAT[3]:@DAT[2] | |
511 | XOR @DAT[6],@X[6],@X[5] | |
512 | || XOR @DAT[5],@X[5],@X[6] | |
513 | || LDNDW *${INP}[-6],@DAT[5]:@DAT[4] | |
514 | XOR @DAT[8],@X[8],@X[8] | |
515 | || XOR @DAT[11],@X[11],@X[11] | |
516 | || LDNDW *${INP}[-5],@DAT[7]:@DAT[6] | |
517 | XOR @DAT[10],@X[10],@X[9] | |
518 | || XOR @DAT[9],@X[9],@X[10] | |
519 | || LDNDW *${INP}[-4],@DAT[9]:@DAT[8] | |
520 | XOR @DAT[12],@X[12],@X[12] | |
521 | || XOR @DAT[15],@X[15],@X[15] | |
522 | || LDNDW *${INP}[-3],@DAT[11]:@DAT[10] | |
523 | XOR @DAT[14],@X[14],@X[13] | |
524 | || XOR @DAT[13],@X[13],@X[14] | |
525 | || LDNDW *${INP}[-2],@DAT[13]:@DAT[12] | |
526 | [A0] SUB A0,$STEP,A0 ; SUB A0,128,A0 | |
527 | || LDNDW *${INP}[-1],@DAT[15]:@DAT[14] | |
528 | ||
529 | XOR @Y[0],@DAT[0],@DAT[0] ; xor 2nd block | |
530 | || XOR @Y[1],@DAT[1],@DAT[1] | |
531 | || STNDW @X[2]:@X[0],*${OUT}++[8] | |
532 | XOR @Y[2],@DAT[2],@DAT[2] | |
533 | || XOR @Y[3],@DAT[3],@DAT[3] | |
534 | || STNDW @X[3]:@X[1],*${OUT}[-7] | |
535 | XOR @Y[4],@DAT[4],@DAT[4] | |
536 | || [A0] LDDW *FP[-12],@X[2]:@X[0] ; re-load key material from stack | |
537 | || [A0] LDDW *SP[2], @X[3]:@X[1] | |
538 | XOR @Y[5],@DAT[5],@DAT[5] | |
539 | || STNDW @X[6]:@X[4],*${OUT}[-6] | |
540 | XOR @Y[6],@DAT[6],@DAT[6] | |
541 | || XOR @Y[7],@DAT[7],@DAT[7] | |
542 | || STNDW @X[7]:@X[5],*${OUT}[-5] | |
543 | XOR @Y[8],@DAT[8],@DAT[8] | |
544 | || [A0] LDDW *FP[-10],@X[6]:@X[4] | |
545 | || [A0] LDDW *SP[4], @X[7]:@X[5] | |
546 | XOR @Y[9],@DAT[9],@DAT[9] | |
547 | || STNDW @X[10]:@X[8],*${OUT}[-4] | |
548 | XOR @Y[10],@DAT[10],@DAT[10] | |
549 | || XOR @Y[11],@DAT[11],@DAT[11] | |
550 | || STNDW @X[11]:@X[9],*${OUT}[-3] | |
551 | XOR @Y[12],@DAT[12],@DAT[12] | |
552 | || [A0] LDDW *FP[-8], @X[10]:@X[8] | |
553 | || [A0] LDDW *SP[6], @X[11]:@X[9] | |
554 | XOR @Y[13],@DAT[13],@DAT[13] | |
555 | || STNDW @X[14]:@X[12],*${OUT}[-2] | |
556 | XOR @Y[14],@DAT[14],@DAT[14] | |
557 | || XOR @Y[15],@DAT[15],@DAT[15] | |
558 | || STNDW @X[15]:@X[13],*${OUT}[-1] | |
559 | ||
560 | [A0] MV @K2x[12],@X[12] | |
561 | || [A0] MV @K2x[13],@X[13] | |
562 | || [A0] LDW *FP[-6*2], @X[14] | |
563 | || [A0] LDW *SP[8*2], @X[15] | |
564 | ||
565 | [A0] DMV @X[2],@X[0],@Y[2]:@Y[0] ; duplicate key material | |
566 | || STNDW @DAT[1]:@DAT[0],*${OUT}++[8] | |
567 | [A0] DMV @X[3],@X[1],@Y[3]:@Y[1] | |
568 | || STNDW @DAT[3]:@DAT[2],*${OUT}[-7] | |
569 | [A0] DMV @X[6],@X[4],@Y[6]:@Y[4] | |
570 | || STNDW @DAT[5]:@DAT[4],*${OUT}[-6] | |
571 | || CMPLTU A0,$STEP,A1 ; is remaining length < 2*blocks? | |
572 | ||[!A0] BNOP epilogue? | |
573 | [A0] DMV @X[7],@X[5],@Y[7]:@Y[5] | |
574 | || STNDW @DAT[7]:@DAT[6],*${OUT}[-5] | |
575 | ||[!A1] BNOP outer2x? | |
576 | [A0] DMV @X[10],@X[8],@Y[10]:@Y[8] | |
577 | || STNDW @DAT[9]:@DAT[8],*${OUT}[-4] | |
578 | [A0] DMV @X[11],@X[9],@Y[11]:@Y[9] | |
579 | || STNDW @DAT[11]:@DAT[10],*${OUT}[-3] | |
580 | [A0] DMV @X[14],@X[12],@Y[14]:@Y[12] | |
581 | || STNDW @DAT[13]:@DAT[12],*${OUT}[-2] | |
582 | [A0] DMV @X[15],@X[13],@Y[15]:@Y[13] | |
583 | || STNDW @DAT[15]:@DAT[14],*${OUT}[-1] | |
584 | ;;===== branch to epilogue? is taken here | |
585 | [A1] MVK 64,$STEP | |
586 | || [A0] MVK 10,B0 ; inner loop counter | |
587 | ;;===== branch to outer2x? is taken here | |
588 | ___ | |
589 | { | |
590 | my ($a0,$a1,$a2,$a3) = (0..3); | |
591 | my ($b0,$b1,$b2,$b3) = (4..7); | |
592 | my ($c0,$c1,$c2,$c3) = (8..11); | |
593 | my ($d0,$d1,$d2,$d3) = (12..15); | |
594 | ||
595 | $code.=<<___; | |
596 | top1x?: | |
597 | ADD @X[$b1],@X[$a1],@X[$a1] | |
598 | || ADD @X[$b2],@X[$a2],@X[$a2] | |
599 | ADD @X[$b0],@X[$a0],@X[$a0] | |
600 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
601 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
602 | || XOR @X[$a2],@X[$d2],@X[$d2] | |
603 | XOR @X[$a0],@X[$d0],@X[$d0] | |
604 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
605 | || SWAP2 @X[$d1],@X[$d1] ; rotate by 16 | |
606 | || SWAP2 @X[$d2],@X[$d2] | |
607 | SWAP2 @X[$d0],@X[$d0] | |
608 | || SWAP2 @X[$d3],@X[$d3] | |
609 | ||
610 | || ADD @X[$d1],@X[$c1],@X[$c1] | |
611 | || ADD @X[$d2],@X[$c2],@X[$c2] | |
612 | ADD @X[$d0],@X[$c0],@X[$c0] | |
613 | || ADD @X[$d3],@X[$c3],@X[$c3] | |
614 | || XOR @X[$c1],@X[$b1],@X[$b1] | |
615 | || XOR @X[$c2],@X[$b2],@X[$b2] | |
616 | XOR @X[$c0],@X[$b0],@X[$b0] | |
617 | || XOR @X[$c3],@X[$b3],@X[$b3] | |
618 | || ROTL @X[$b1],12,@X[$b1] | |
619 | || ROTL @X[$b2],12,@X[$b2] | |
620 | ROTL @X[$b0],12,@X[$b0] | |
621 | || ROTL @X[$b3],12,@X[$b3] | |
622 | ||
623 | ADD @X[$b1],@X[$a1],@X[$a1] | |
624 | || ADD @X[$b2],@X[$a2],@X[$a2] | |
625 | ADD @X[$b0],@X[$a0],@X[$a0] | |
626 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
627 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
628 | || XOR @X[$a2],@X[$d2],@X[$d2] | |
629 | XOR @X[$a0],@X[$d0],@X[$d0] | |
630 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
631 | || ROTL @X[$d1],8,@X[$d1] | |
632 | || ROTL @X[$d2],8,@X[$d2] | |
633 | ROTL @X[$d0],8,@X[$d0] | |
634 | || ROTL @X[$d3],8,@X[$d3] | |
635 | || BNOP middle1x? ; protect from interrupt | |
636 | ||
637 | ADD @X[$d1],@X[$c1],@X[$c1] | |
638 | || ADD @X[$d2],@X[$c2],@X[$c2] | |
639 | ADD @X[$d0],@X[$c0],@X[$c0] | |
640 | || ADD @X[$d3],@X[$c3],@X[$c3] | |
641 | || XOR @X[$c1],@X[$b1],@X[$b1] | |
642 | || XOR @X[$c2],@X[$b2],@X[$b2] | |
643 | || ROTL @X[$d1],0,@X[$d2] ; moved to avoid cross-path stall | |
644 | || ROTL @X[$d2],0,@X[$d3] | |
645 | XOR @X[$c0],@X[$b0],@X[$b0] | |
646 | || XOR @X[$c3],@X[$b3],@X[$b3] | |
647 | || ROTL @X[$d0],0,@X[$d1] | |
648 | || ROTL @X[$d3],0,@X[$d0] | |
649 | ROTL @X[$b1],7,@X[$b0] ; avoided cross-path stall | |
650 | || ROTL @X[$b2],7,@X[$b1] | |
651 | ROTL @X[$b0],7,@X[$b3] | |
652 | || ROTL @X[$b3],7,@X[$b2] | |
653 | middle1x?: | |
654 | ||
655 | ADD @X[$b0],@X[$a0],@X[$a0] | |
656 | || ADD @X[$b1],@X[$a1],@X[$a1] | |
657 | ADD @X[$b2],@X[$a2],@X[$a2] | |
658 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
659 | || XOR @X[$a0],@X[$d0],@X[$d0] | |
660 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
661 | XOR @X[$a2],@X[$d2],@X[$d2] | |
662 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
663 | || SWAP2 @X[$d0],@X[$d0] ; rotate by 16 | |
664 | || SWAP2 @X[$d1],@X[$d1] | |
665 | SWAP2 @X[$d2],@X[$d2] | |
666 | || SWAP2 @X[$d3],@X[$d3] | |
667 | ||
668 | || ADD @X[$d0],@X[$c2],@X[$c2] | |
669 | || ADD @X[$d1],@X[$c3],@X[$c3] | |
670 | ADD @X[$d2],@X[$c0],@X[$c0] | |
671 | || ADD @X[$d3],@X[$c1],@X[$c1] | |
672 | || XOR @X[$c2],@X[$b0],@X[$b0] | |
673 | || XOR @X[$c3],@X[$b1],@X[$b1] | |
674 | XOR @X[$c0],@X[$b2],@X[$b2] | |
675 | || XOR @X[$c1],@X[$b3],@X[$b3] | |
676 | || ROTL @X[$b0],12,@X[$b0] | |
677 | || ROTL @X[$b1],12,@X[$b1] | |
678 | ROTL @X[$b2],12,@X[$b2] | |
679 | || ROTL @X[$b3],12,@X[$b3] | |
680 | ||
681 | ADD @X[$b0],@X[$a0],@X[$a0] | |
682 | || ADD @X[$b1],@X[$a1],@X[$a1] | |
683 | || [B0] SUB B0,1,B0 ; decrement inner loop counter | |
684 | ADD @X[$b2],@X[$a2],@X[$a2] | |
685 | || ADD @X[$b3],@X[$a3],@X[$a3] | |
686 | || XOR @X[$a0],@X[$d0],@X[$d0] | |
687 | || XOR @X[$a1],@X[$d1],@X[$d1] | |
688 | XOR @X[$a2],@X[$d2],@X[$d2] | |
689 | || XOR @X[$a3],@X[$d3],@X[$d3] | |
690 | || ROTL @X[$d0],8,@X[$d0] | |
691 | || ROTL @X[$d1],8,@X[$d1] | |
692 | ROTL @X[$d2],8,@X[$d2] | |
693 | || ROTL @X[$d3],8,@X[$d3] | |
694 | || [B0] BNOP top1x? ; even protects from interrupt | |
695 | ||
696 | ADD @X[$d0],@X[$c2],@X[$c2] | |
697 | || ADD @X[$d1],@X[$c3],@X[$c3] | |
698 | ADD @X[$d2],@X[$c0],@X[$c0] | |
699 | || ADD @X[$d3],@X[$c1],@X[$c1] | |
700 | || XOR @X[$c2],@X[$b0],@X[$b0] | |
701 | || XOR @X[$c3],@X[$b1],@X[$b1] | |
702 | || ROTL @X[$d0],0,@X[$d3] ; moved to avoid cross-path stall | |
703 | || ROTL @X[$d1],0,@X[$d0] | |
704 | XOR @X[$c0],@X[$b2],@X[$b2] | |
705 | || XOR @X[$c1],@X[$b3],@X[$b3] | |
706 | || ROTL @X[$d2],0,@X[$d1] | |
707 | || ROTL @X[$d3],0,@X[$d2] | |
708 | ROTL @X[$b0],7,@X[$b1] ; avoided cross-path stall | |
709 | || ROTL @X[$b1],7,@X[$b2] | |
710 | ROTL @X[$b2],7,@X[$b3] | |
711 | || ROTL @X[$b3],7,@X[$b0] | |
712 | ||[!B0] CMPLTU A0,$STEP,A1 ; less than 64 bytes left? | |
713 | bottom1x?: | |
714 | ___ | |
715 | } | |
716 | ||
717 | $code.=<<___; | |
718 | ADD @Y[0],@X[0],@X[0] ; accumulate key material | |
719 | || ADD @Y[1],@X[1],@X[1] | |
720 | || ADD @Y[2],@X[2],@X[2] | |
721 | || ADD @Y[3],@X[3],@X[3] | |
722 | ||[!A1] LDNDW *${INP}++[8],@DAT[1]:@DAT[0] | |
723 | || [A1] BNOP tail? | |
724 | ADD @Y[4],@X[4],@X[4] | |
725 | || ADD @Y[5],@X[5],@X[5] | |
726 | || ADD @Y[6],@X[6],@X[6] | |
727 | || ADD @Y[7],@X[7],@X[7] | |
728 | ||[!A1] LDNDW *${INP}[-7],@DAT[3]:@DAT[2] | |
729 | ADD @Y[8],@X[8],@X[8] | |
730 | || ADD @Y[9],@X[9],@X[9] | |
731 | || ADD @Y[10],@X[10],@X[10] | |
732 | || ADD @Y[11],@X[11],@X[11] | |
733 | ||[!A1] LDNDW *${INP}[-6],@DAT[5]:@DAT[4] | |
734 | ADD @Y[12],@X[12],@X[12] | |
735 | || ADD @Y[13],@X[13],@X[13] | |
736 | || ADD @Y[14],@X[14],@X[14] | |
737 | || ADD @Y[15],@X[15],@X[15] | |
738 | ||[!A1] LDNDW *${INP}[-5],@DAT[7]:@DAT[6] | |
739 | [!A1] LDNDW *${INP}[-4],@DAT[9]:@DAT[8] | |
740 | [!A1] LDNDW *${INP}[-3],@DAT[11]:@DAT[10] | |
741 | LDNDW *${INP}[-2],@DAT[13]:@DAT[12] | |
742 | LDNDW *${INP}[-1],@DAT[15]:@DAT[14] | |
743 | ||
744 | .if .BIG_ENDIAN | |
745 | SWAP2 @X[0],@X[0] | |
746 | || SWAP2 @X[1],@X[1] | |
747 | || SWAP2 @X[2],@X[2] | |
748 | || SWAP2 @X[3],@X[3] | |
749 | SWAP2 @X[4],@X[4] | |
750 | || SWAP2 @X[5],@X[5] | |
751 | || SWAP2 @X[6],@X[6] | |
752 | || SWAP2 @X[7],@X[7] | |
753 | SWAP2 @X[8],@X[8] | |
754 | || SWAP2 @X[9],@X[9] | |
755 | || SWAP4 @X[0],@X[1] | |
756 | || SWAP4 @X[1],@X[0] | |
757 | SWAP2 @X[10],@X[10] | |
758 | || SWAP2 @X[11],@X[11] | |
759 | || SWAP4 @X[2],@X[3] | |
760 | || SWAP4 @X[3],@X[2] | |
761 | SWAP2 @X[12],@X[12] | |
762 | || SWAP2 @X[13],@X[13] | |
763 | || SWAP4 @X[4],@X[5] | |
764 | || SWAP4 @X[5],@X[4] | |
765 | SWAP2 @X[14],@X[14] | |
766 | || SWAP2 @X[15],@X[15] | |
767 | || SWAP4 @X[6],@X[7] | |
768 | || SWAP4 @X[7],@X[6] | |
769 | SWAP4 @X[8],@X[9] | |
770 | || SWAP4 @X[9],@X[8] | |
771 | SWAP4 @X[10],@X[11] | |
772 | || SWAP4 @X[11],@X[10] | |
773 | SWAP4 @X[12],@X[13] | |
774 | || SWAP4 @X[13],@X[12] | |
775 | SWAP4 @X[14],@X[15] | |
776 | || SWAP4 @X[15],@X[14] | |
777 | .else | |
778 | NOP 1 | |
779 | .endif | |
780 | ||
781 | XOR @X[0],@DAT[0],@DAT[0] ; xor with input | |
782 | || XOR @X[1],@DAT[1],@DAT[1] | |
783 | || XOR @X[2],@DAT[2],@DAT[2] | |
784 | || XOR @X[3],@DAT[3],@DAT[3] | |
785 | || [A0] SUB A0,$STEP,A0 ; SUB A0,64,A0 | |
786 | XOR @X[4],@DAT[4],@DAT[4] | |
787 | || XOR @X[5],@DAT[5],@DAT[5] | |
788 | || XOR @X[6],@DAT[6],@DAT[6] | |
789 | || XOR @X[7],@DAT[7],@DAT[7] | |
790 | || STNDW @DAT[1]:@DAT[0],*${OUT}++[8] | |
791 | XOR @X[8],@DAT[8],@DAT[8] | |
792 | || XOR @X[9],@DAT[9],@DAT[9] | |
793 | || XOR @X[10],@DAT[10],@DAT[10] | |
794 | || XOR @X[11],@DAT[11],@DAT[11] | |
795 | || STNDW @DAT[3]:@DAT[2],*${OUT}[-7] | |
796 | XOR @X[12],@DAT[12],@DAT[12] | |
797 | || XOR @X[13],@DAT[13],@DAT[13] | |
798 | || XOR @X[14],@DAT[14],@DAT[14] | |
799 | || XOR @X[15],@DAT[15],@DAT[15] | |
800 | || STNDW @DAT[5]:@DAT[4],*${OUT}[-6] | |
801 | || [A0] BNOP top1x? | |
802 | [A0] DMV @Y[2],@Y[0],@X[2]:@X[0] ; duplicate key material | |
803 | || [A0] DMV @Y[3],@Y[1],@X[3]:@X[1] | |
804 | || STNDW @DAT[7]:@DAT[6],*${OUT}[-5] | |
805 | [A0] DMV @Y[6],@Y[4],@X[6]:@X[4] | |
806 | || [A0] DMV @Y[7],@Y[5],@X[7]:@X[5] | |
807 | || STNDW @DAT[9]:@DAT[8],*${OUT}[-4] | |
808 | [A0] DMV @Y[10],@Y[8],@X[10]:@X[8] | |
809 | || [A0] DMV @Y[11],@Y[9],@X[11]:@X[9] | |
810 | || [A0] ADD 1,@Y[12],@Y[12] ; increment counter | |
811 | || STNDW @DAT[11]:@DAT[10],*${OUT}[-3] | |
812 | [A0] DMV @Y[14],@Y[12],@X[14]:@X[12] | |
813 | || [A0] DMV @Y[15],@Y[13],@X[15]:@X[13] | |
814 | || STNDW @DAT[13]:@DAT[12],*${OUT}[-2] | |
815 | [A0] MVK 10,B0 ; inner loop counter | |
816 | || STNDW @DAT[15]:@DAT[14],*${OUT}[-1] | |
817 | ;;===== branch to top1x? is taken here | |
818 | ||
819 | epilogue?: | |
820 | LDDW *FP[-4],A11:A10 ; ABI says so | |
821 | LDDW *FP[-3],A13:A12 | |
822 | || LDDW *SP[3+8],B11:B10 | |
823 | LDDW *SP[4+8],B13:B12 | |
824 | || BNOP RA | |
825 | LDW *++SP(40+64),FP ; restore frame pointer | |
826 | NOP 4 | |
827 | ||
828 | tail?: | |
829 | LDBU *${INP}++[1],B24 ; load byte by byte | |
830 | || SUB A0,1,A0 | |
831 | || SUB A0,1,B1 | |
832 | [!B1] BNOP epilogue? ; interrupts are disabled for whole time | |
833 | || [A0] LDBU *${INP}++[1],B24 | |
834 | || [A0] SUB A0,1,A0 | |
835 | || SUB B1,1,B1 | |
836 | [!B1] BNOP epilogue? | |
837 | || [A0] LDBU *${INP}++[1],B24 | |
838 | || [A0] SUB A0,1,A0 | |
839 | || SUB B1,1,B1 | |
840 | [!B1] BNOP epilogue? | |
841 | || ROTL @X[0],0,A24 | |
842 | || [A0] LDBU *${INP}++[1],B24 | |
843 | || [A0] SUB A0,1,A0 | |
844 | || SUB B1,1,B1 | |
845 | [!B1] BNOP epilogue? | |
846 | || ROTL @X[0],24,A24 | |
847 | || [A0] LDBU *${INP}++[1],A24 | |
848 | || [A0] SUB A0,1,A0 | |
849 | || SUB B1,1,B1 | |
850 | [!B1] BNOP epilogue? | |
851 | || ROTL @X[0],16,A24 | |
852 | || [A0] LDBU *${INP}++[1],A24 | |
853 | || [A0] SUB A0,1,A0 | |
854 | || SUB B1,1,B1 | |
855 | || XOR A24,B24,B25 | |
856 | STB B25,*${OUT}++[1] ; store byte by byte | |
857 | ||[!B1] BNOP epilogue? | |
858 | || ROTL @X[0],8,A24 | |
859 | || [A0] LDBU *${INP}++[1],A24 | |
860 | || [A0] SUB A0,1,A0 | |
861 | || SUB B1,1,B1 | |
862 | || XOR A24,B24,B25 | |
863 | STB B25,*${OUT}++[1] | |
864 | ___ | |
865 | sub TAIL_STEP { # appends to $code the tail-path instructions for one state word: four byte steps, each rotating the word (by 0, 24, 16, 8), XORing and storing one byte | |
866 | my $Xi= shift; # register name of the state word to consume, e.g. an element of @X | |
867 | my $T = ($Xi=~/^B/?"B24":"A24"); # match @X[i] to avoid cross path: scratch register 24 on the same side as $Xi, target of the ROTLs | |
868 | my $D = $T; $D=~tr/AB/BA/; # register 24 on the opposite side; destination of the first LDBU in the group | |
869 | my $O = $D; $O=~s/24/25/; # register 25 on $D's side; receives the XOR result that STB writes out | |
870 | ||
871 | $code.=<<___; | |
872 | ||[!B1] BNOP epilogue? | |
873 | || ROTL $Xi,0,$T | |
874 | || [A0] LDBU *${INP}++[1],$D | |
875 | || [A0] SUB A0,1,A0 | |
876 | || SUB B1,1,B1 | |
877 | || XOR A24,B24,$O | |
878 | STB $O,*${OUT}++[1] | |
879 | ||[!B1] BNOP epilogue? | |
880 | || ROTL $Xi,24,$T | |
881 | || [A0] LDBU *${INP}++[1],$T | |
882 | || [A0] SUB A0,1,A0 | |
883 | || SUB B1,1,B1 | |
884 | || XOR A24,B24,$O | |
885 | STB $O,*${OUT}++[1] | |
886 | ||[!B1] BNOP epilogue? | |
887 | || ROTL $Xi,16,$T | |
888 | || [A0] LDBU *${INP}++[1],$T | |
889 | || [A0] SUB A0,1,A0 | |
890 | || SUB B1,1,B1 | |
891 | || XOR A24,B24,$O | |
892 | STB $O,*${OUT}++[1] | |
893 | ||[!B1] BNOP epilogue? | |
894 | || ROTL $Xi,8,$T | |
895 | || [A0] LDBU *${INP}++[1],$T | |
896 | || [A0] SUB A0,1,A0 | |
897 | || SUB B1,1,B1 | |
898 | || XOR A24,B24,$O | |
899 | STB $O,*${OUT}++[1] | |
900 | ___ | |
901 | } | |
902 | foreach (1..14) { TAIL_STEP(@X[$_]); } # generated steps cover state words 1..14; words 0 and 15 are written out by hand before and after this loop | |
903 | $code.=<<___; | |
904 | ||[!B1] BNOP epilogue? | |
905 | || ROTL @X[15],0,B24 | |
906 | || XOR A24,B24,A25 | |
907 | STB A25,*${OUT}++[1] | |
908 | || ROTL @X[15],24,B24 | |
909 | || XOR A24,B24,A25 | |
910 | STB A25,*${OUT}++[1] | |
911 | || ROTL @X[15],16,B24 | |
912 | || XOR A24,B24,A25 | |
913 | STB A25,*${OUT}++[1] | |
914 | || XOR A24,B24,A25 | |
915 | STB A25,*${OUT}++[1] | |
916 | || XOR A24,B24,B25 | |
917 | STB B25,*${OUT}++[1] | |
918 | .endasmfunc | |
919 | ||
920 | .sect .const | |
921 | .cstring "ChaCha20 for C64x+, CRYPTOGAMS by <appro\@openssl.org>" | |
922 | .align 4 | |
923 | ___ | |
924 | ||
925 | print $code; | |
926 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors (e.g. disk full) only surface at close, so a truncated assembly file must not pass as success |