]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
5593d9c9 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
4739ccdb AP |
9 | |
10 | # ==================================================================== | |
e3713c36 RS |
11 | # Written by David S. Miller and Andy Polyakov. |
12 | # The module is licensed under 2-clause BSD | |
4739ccdb AP |
13 | # license. October 2012. All rights reserved. |
14 | # ==================================================================== | |
15 | ||
16 | ###################################################################### | |
17 | # Camellia for SPARC T4. | |
18 | # | |
19 | # As with AES, the results below [for aligned data] are virtually identical | |
46f4e1be | 20 | # to critical path lengths for 3-cycle instruction latency: |
4739ccdb AP |
21 | # |
22 | # 128-bit key 192/256- | |
23 | # CBC encrypt 4.14/4.21(*) 5.46/5.52 | |
24 | # (*) numbers after slash are for | |
25 | # misaligned data. | |
26 | # | |
27 | # As with Intel AES-NI, question is if it's possible to improve | |
46f4e1be | 28 | # performance of parallelizable modes by interleaving round |
4739ccdb AP |
29 | # instructions. In Camellia every instruction is dependent on |
30 | # previous, which means that there is room for 2 additional ones | |
31 | # in between two dependent ones. Can we expect 3x performance improvement? | |
32 | # At least one can argue that it should be possible to break 2x | |
33 | # barrier... For some reason not even 2x appears to be possible: | |
34 | # | |
35 | # 128-bit key 192/256- | |
36 | # CBC decrypt 2.21/2.74 2.99/3.40 | |
37 | # CTR 2.15/2.68(*) 2.93/3.34 | |
38 | # (*) numbers after slash are for | |
39 | # misaligned data. | |
40 | # | |
41 | # This is for 2x interleave. But compared to 1x interleave CBC decrypt | |
42 | # improved by ... 0% for 128-bit key, and 11% for 192/256-bit one. | |
43 | # So that out-of-order execution logic can take non-interleaved code | |
44 | # to 1.87x, but can't take 2x interleaved one any further. There | |
45 | # surely is some explanation... As a result, 3x interleave was not even | |
46 | # attempted. Instead an effort was made to share specific modes | |
47 | # implementations with AES module (therefore sparcv9_modes.pl). | |
48 | # | |
49 | # To anchor to something else, software C implementation processes | |
50 | # one byte in 38 cycles with 128-bit key on same processor. | |
51 | ||
# Derive the directory holding this script from $0 and add it, together with
# the sibling perlasm directory, to @INC so that sparcv9_modes.pl can be
# required.
52 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
53 | push(@INC,"${dir}","${dir}../../perlasm"); | |
54 | require "sparcv9_modes.pl"; | |
55 | ||
# The last command-line argument, when present, names the output file and
# STDOUT is redirected to it. The three-argument form of open keeps the file
# name from being interpreted as part of the mode, and a failed open is now
# fatal (matching the checked close of STDOUT at the end of the script)
# instead of silently leaving the generated code on the original STDOUT.
# The parentheses ensure that a missing argument does not trigger the die.
1aa89a7a | 56 | $output = pop and (open(STDOUT,">",$output) or die "can't open $output: $!"); |
4739ccdb AP |
57 | |
# $::evp also gates the generation of the CTR32 routines and of the
# Camellia_* compatibility wrappers further down in this script.
58 | $::evp=1; # if $evp is set to 0, script generates module with | |
59 | # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt | |
60 | # entry points. These are fully compatible with openssl/camellia.h. | |
61 | ||
62 | ###################################################################### | |
63 | # single-round subroutines | |
64 | # | |
# Generates cmll_t4_encrypt and cmll_t4_decrypt, the single-block routines.
# This heredoc initialises $code (plain assignment, not append).
#
# Common structure of both emitted routines:
#  - The input pointer is tested for 8-byte alignment. It is then rounded
#    down; when it was misaligned a third dword is loaded and the two data
#    dwords are reassembled with shifts in integer registers.
#  - Two key dwords are XORed into the data in integer registers, and the
#    result is moved to %f0/%f2 with movxtod.
#  - The 32-bit word at [$key + 272] ("grandRounds, 3 or 4") controls the
#    loop. Each pass issues six camellia_f, then camellia_fl and
#    camellia_fli, while preloading the next eight key dwords. The last
#    group is unrolled after the loop and ends with two fxor instead.
#  - The output is written with two std when $out is 8-byte aligned.
#    Otherwise alignaddrl/faligndata shift the result and the first and last
#    dwords are written with stda ... 0xc0 ("partial store") under a byte
#    mask derived from the misalignment.
#
# Differences in cmll_t4_decrypt:
#  - $key is first advanced by grandRounds*64 and the schedule is walked
#    downwards using negative offsets.
#  - The operands of the two final fxor (%f26/%f24) are swapped relative to
#    the encrypt routine.
65 | { | |
# Symbolic names for %o0..%o5. $rounds, $tmp and $mask are scratch registers
# in the emitted code.
66 | my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); | |
67 | ||
68 | $code=<<___; | |
eb77e888 AP |
69 | #include "sparc_arch.h" |
70 | ||
4739ccdb AP |
71 | .text |
72 | ||
73 | .globl cmll_t4_encrypt | |
74 | .align 32 | |
75 | cmll_t4_encrypt: | |
76 | andcc $inp, 7, %g1 ! is input aligned? | |
77 | andn $inp, 7, $inp | |
78 | ||
79 | ldx [$key + 0], %g4 | |
80 | ldx [$key + 8], %g5 | |
81 | ||
82 | ldx [$inp + 0], %o4 | |
83 | bz,pt %icc, 1f | |
84 | ldx [$inp + 8], %o5 | |
85 | ldx [$inp + 16], $inp | |
86 | sll %g1, 3, %g1 | |
87 | sub %g0, %g1, %o3 | |
88 | sllx %o4, %g1, %o4 | |
89 | sllx %o5, %g1, %g1 | |
90 | srlx %o5, %o3, %o5 | |
91 | srlx $inp, %o3, %o3 | |
92 | or %o5, %o4, %o4 | |
93 | or %o3, %g1, %o5 | |
94 | 1: | |
95 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
96 | ldd [$key + 16], %f12 | |
97 | ldd [$key + 24], %f14 | |
98 | xor %g4, %o4, %o4 | |
99 | xor %g5, %o5, %o5 | |
100 | ldd [$key + 32], %f16 | |
101 | ldd [$key + 40], %f18 | |
102 | movxtod %o4, %f0 | |
103 | movxtod %o5, %f2 | |
104 | ldd [$key + 48], %f20 | |
105 | ldd [$key + 56], %f22 | |
106 | sub $rounds, 1, $rounds | |
107 | ldd [$key + 64], %f24 | |
108 | ldd [$key + 72], %f26 | |
109 | add $key, 80, $key | |
110 | ||
111 | .Lenc: | |
112 | camellia_f %f12, %f2, %f0, %f2 | |
113 | ldd [$key + 0], %f12 | |
114 | sub $rounds,1,$rounds | |
115 | camellia_f %f14, %f0, %f2, %f0 | |
116 | ldd [$key + 8], %f14 | |
117 | camellia_f %f16, %f2, %f0, %f2 | |
118 | ldd [$key + 16], %f16 | |
119 | camellia_f %f18, %f0, %f2, %f0 | |
120 | ldd [$key + 24], %f18 | |
121 | camellia_f %f20, %f2, %f0, %f2 | |
122 | ldd [$key + 32], %f20 | |
123 | camellia_f %f22, %f0, %f2, %f0 | |
124 | ldd [$key + 40], %f22 | |
125 | camellia_fl %f24, %f0, %f0 | |
126 | ldd [$key + 48], %f24 | |
127 | camellia_fli %f26, %f2, %f2 | |
128 | ldd [$key + 56], %f26 | |
129 | brnz,pt $rounds, .Lenc | |
130 | add $key, 64, $key | |
131 | ||
132 | andcc $out, 7, $tmp ! is output aligned? | |
133 | camellia_f %f12, %f2, %f0, %f2 | |
134 | camellia_f %f14, %f0, %f2, %f0 | |
135 | camellia_f %f16, %f2, %f0, %f2 | |
136 | camellia_f %f18, %f0, %f2, %f0 | |
137 | camellia_f %f20, %f2, %f0, %f4 | |
138 | camellia_f %f22, %f0, %f4, %f2 | |
139 | fxor %f24, %f4, %f0 | |
140 | fxor %f26, %f2, %f2 | |
141 | ||
142 | bnz,pn %icc, 2f | |
143 | nop | |
144 | ||
145 | std %f0, [$out + 0] | |
146 | retl | |
147 | std %f2, [$out + 8] | |
148 | ||
149 | 2: alignaddrl $out, %g0, $out | |
150 | mov 0xff, $mask | |
151 | srl $mask, $tmp, $mask | |
152 | ||
153 | faligndata %f0, %f0, %f4 | |
154 | faligndata %f0, %f2, %f6 | |
155 | faligndata %f2, %f2, %f8 | |
156 | ||
157 | stda %f4, [$out + $mask]0xc0 ! partial store | |
158 | std %f6, [$out + 8] | |
159 | add $out, 16, $out | |
160 | orn %g0, $mask, $mask | |
161 | retl | |
162 | stda %f8, [$out + $mask]0xc0 ! partial store | |
163 | .type cmll_t4_encrypt,#function | |
164 | .size cmll_t4_encrypt,.-cmll_t4_encrypt | |
165 | ||
166 | .globl cmll_t4_decrypt | |
167 | .align 32 | |
168 | cmll_t4_decrypt: | |
169 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
170 | andcc $inp, 7, %g1 ! is input aligned? | |
171 | andn $inp, 7, $inp | |
172 | ||
173 | sll $rounds, 6, $rounds | |
174 | add $rounds, $key, $key | |
175 | ||
176 | ldx [$inp + 0], %o4 | |
177 | bz,pt %icc, 1f | |
178 | ldx [$inp + 8], %o5 | |
179 | ldx [$inp + 16], $inp | |
180 | sll %g1, 3, %g1 | |
181 | sub %g0, %g1, %g4 | |
182 | sllx %o4, %g1, %o4 | |
183 | sllx %o5, %g1, %g1 | |
184 | srlx %o5, %g4, %o5 | |
185 | srlx $inp, %g4, %g4 | |
186 | or %o5, %o4, %o4 | |
187 | or %g4, %g1, %o5 | |
188 | 1: | |
189 | ldx [$key + 0], %g4 | |
190 | ldx [$key + 8], %g5 | |
191 | ldd [$key - 8], %f12 | |
192 | ldd [$key - 16], %f14 | |
193 | xor %g4, %o4, %o4 | |
194 | xor %g5, %o5, %o5 | |
195 | ldd [$key - 24], %f16 | |
196 | ldd [$key - 32], %f18 | |
197 | movxtod %o4, %f0 | |
198 | movxtod %o5, %f2 | |
199 | ldd [$key - 40], %f20 | |
200 | ldd [$key - 48], %f22 | |
201 | sub $rounds, 64, $rounds | |
202 | ldd [$key - 56], %f24 | |
203 | ldd [$key - 64], %f26 | |
204 | sub $key, 64, $key | |
205 | ||
206 | .Ldec: | |
207 | camellia_f %f12, %f2, %f0, %f2 | |
208 | ldd [$key - 8], %f12 | |
209 | sub $rounds, 64, $rounds | |
210 | camellia_f %f14, %f0, %f2, %f0 | |
211 | ldd [$key - 16], %f14 | |
212 | camellia_f %f16, %f2, %f0, %f2 | |
213 | ldd [$key - 24], %f16 | |
214 | camellia_f %f18, %f0, %f2, %f0 | |
215 | ldd [$key - 32], %f18 | |
216 | camellia_f %f20, %f2, %f0, %f2 | |
217 | ldd [$key - 40], %f20 | |
218 | camellia_f %f22, %f0, %f2, %f0 | |
219 | ldd [$key - 48], %f22 | |
220 | camellia_fl %f24, %f0, %f0 | |
221 | ldd [$key - 56], %f24 | |
222 | camellia_fli %f26, %f2, %f2 | |
223 | ldd [$key - 64], %f26 | |
224 | brnz,pt $rounds, .Ldec | |
225 | sub $key, 64, $key | |
226 | ||
227 | andcc $out, 7, $tmp ! is output aligned? | |
228 | camellia_f %f12, %f2, %f0, %f2 | |
229 | camellia_f %f14, %f0, %f2, %f0 | |
230 | camellia_f %f16, %f2, %f0, %f2 | |
231 | camellia_f %f18, %f0, %f2, %f0 | |
232 | camellia_f %f20, %f2, %f0, %f4 | |
233 | camellia_f %f22, %f0, %f4, %f2 | |
234 | fxor %f26, %f4, %f0 | |
235 | fxor %f24, %f2, %f2 | |
236 | ||
237 | bnz,pn %icc, 2f | |
238 | nop | |
239 | ||
240 | std %f0, [$out + 0] | |
241 | retl | |
242 | std %f2, [$out + 8] | |
243 | ||
244 | 2: alignaddrl $out, %g0, $out | |
245 | mov 0xff, $mask | |
246 | srl $mask, $tmp, $mask | |
247 | ||
248 | faligndata %f0, %f0, %f4 | |
249 | faligndata %f0, %f2, %f6 | |
250 | faligndata %f2, %f2, %f8 | |
251 | ||
252 | stda %f4, [$out + $mask]0xc0 ! partial store | |
253 | std %f6, [$out + 8] | |
254 | add $out, 16, $out | |
255 | orn %g0, $mask, $mask | |
256 | retl | |
257 | stda %f8, [$out + $mask]0xc0 ! partial store | |
258 | .type cmll_t4_decrypt,#function | |
259 | .size cmll_t4_decrypt,.-cmll_t4_decrypt | |
260 | ___ | |
261 | } | |
262 | ||
263 | ###################################################################### | |
264 | # key setup subroutines | |
265 | # | |
# Generates cmll_t4_set_key together with the SIGMA constant table, and
# defines the ROTL128 text helper that the generated code is built from.
266 | { | |
# ROTL128($rot): returns assembly text (no trailing newline) that rotates the
# 128-bit quantity %o4:%o5 (%o4 being the more significant half) left by
# $rot bits, using %g4/%g5 as scratch. The "64-$rot" expression is left in
# the text unevaluated. Every call below passes a value between 15 and 51.
# The helper is invoked from backtick-quoted spans inside the heredoc;
# presumably those are evaluated by emit_assembler() - confirm in
# sparcv9_modes.pl.
267 | sub ROTL128 { | |
268 | my $rot = shift; | |
269 | ||
270 | "srlx %o4, 64-$rot, %g4\n\t". | |
271 | "sllx %o4, $rot, %o4\n\t". | |
272 | "srlx %o5, 64-$rot, %g5\n\t". | |
273 | "sllx %o5, $rot, %o5\n\t". | |
274 | "or %o4, %g5, %o4\n\t". | |
275 | "or %o5, %g4, %o5"; | |
276 | } | |
277 | ||
# Only four names are taken from the six %o registers produced by map; the
# emitted code uses %o4/%o5 directly as working registers.
#  $inp  - pointer the key bytes are loaded from
#  $bits - compared against 192 to select the key size
#  $out  - pointer the key schedule is stored to
#  $tmp  - scratch
278 | my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); | |
# Flow of the emitted cmll_t4_set_key:
#  - "cmp $bits, 192" selects .L128 (less than), .L192 (equal) or the
#    256-bit path. Misaligned input is realigned with alignaddr/faligndata.
#    The 192-bit path builds its fourth dword with fnot2 from the third.
#  - The first two key dwords are stored at $out+0/+8 and kept in %f28/%f30.
#  - At .L128key the address of SIGMA is computed PC-relatively
#    ("call .+8", with %o7 preserved in %o5), and four camellia_f steps are
#    run on the SIGMA constants.
#  - "bge,pn %icc, .L256key" relies on the condition codes from the initial
#    cmp still being intact; none of the instructions in between is a
#    cc-setting form.
#  - Each path stores rotated copies produced by ROTL128 at fixed offsets
#    (see the k[..] annotations), stores 3 or 4 at $out+0x110 and returns
#    with %o0 cleared in the delay slot.
279 | $code.=<<___; | |
280 | .globl cmll_t4_set_key | |
281 | .align 32 | |
282 | cmll_t4_set_key: | |
283 | and $inp, 7, $tmp | |
284 | alignaddr $inp, %g0, $inp | |
285 | cmp $bits, 192 | |
286 | ldd [$inp + 0], %f0 | |
287 | bl,pt %icc,.L128 | |
288 | ldd [$inp + 8], %f2 | |
289 | ||
290 | be,pt %icc,.L192 | |
291 | ldd [$inp + 16], %f4 | |
292 | ||
293 | brz,pt $tmp, .L256aligned | |
294 | ldd [$inp + 24], %f6 | |
295 | ||
296 | ldd [$inp + 32], %f8 | |
297 | faligndata %f0, %f2, %f0 | |
298 | faligndata %f2, %f4, %f2 | |
299 | faligndata %f4, %f6, %f4 | |
300 | b .L256aligned | |
301 | faligndata %f6, %f8, %f6 | |
302 | ||
303 | .align 16 | |
304 | .L192: | |
305 | brz,a,pt $tmp, .L256aligned | |
306 | fnot2 %f4, %f6 | |
307 | ||
308 | ldd [$inp + 24], %f6 | |
309 | nop | |
310 | faligndata %f0, %f2, %f0 | |
311 | faligndata %f2, %f4, %f2 | |
312 | faligndata %f4, %f6, %f4 | |
313 | fnot2 %f4, %f6 | |
314 | ||
315 | .L256aligned: | |
316 | std %f0, [$out + 0] ! k[0, 1] | |
317 | fsrc2 %f0, %f28 | |
318 | std %f2, [$out + 8] ! k[2, 3] | |
319 | fsrc2 %f2, %f30 | |
320 | fxor %f4, %f0, %f0 | |
321 | b .L128key | |
322 | fxor %f6, %f2, %f2 | |
323 | ||
324 | .align 16 | |
325 | .L128: | |
326 | brz,pt $tmp, .L128aligned | |
327 | nop | |
328 | ||
329 | ldd [$inp + 16], %f4 | |
330 | nop | |
331 | faligndata %f0, %f2, %f0 | |
332 | faligndata %f2, %f4, %f2 | |
333 | ||
334 | .L128aligned: | |
335 | std %f0, [$out + 0] ! k[0, 1] | |
336 | fsrc2 %f0, %f28 | |
337 | std %f2, [$out + 8] ! k[2, 3] | |
338 | fsrc2 %f2, %f30 | |
339 | ||
340 | .L128key: | |
341 | mov %o7, %o5 | |
342 | 1: call .+8 | |
343 | add %o7, SIGMA-1b, %o4 | |
344 | mov %o5, %o7 | |
345 | ||
346 | ldd [%o4 + 0], %f16 | |
347 | ldd [%o4 + 8], %f18 | |
348 | ldd [%o4 + 16], %f20 | |
349 | ldd [%o4 + 24], %f22 | |
350 | ||
351 | camellia_f %f16, %f2, %f0, %f2 | |
352 | camellia_f %f18, %f0, %f2, %f0 | |
353 | fxor %f28, %f0, %f0 | |
354 | fxor %f30, %f2, %f2 | |
355 | camellia_f %f20, %f2, %f0, %f2 | |
356 | camellia_f %f22, %f0, %f2, %f0 | |
357 | ||
358 | bge,pn %icc, .L256key | |
359 | nop | |
360 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
361 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
362 | ||
363 | movdtox %f0, %o4 | |
364 | movdtox %f2, %o5 | |
365 | `&ROTL128(15)` | |
366 | stx %o4, [$out + 0x30] ! k[12, 13] | |
367 | stx %o5, [$out + 0x38] ! k[14, 15] | |
368 | `&ROTL128(15)` | |
369 | stx %o4, [$out + 0x40] ! k[16, 17] | |
370 | stx %o5, [$out + 0x48] ! k[18, 19] | |
371 | `&ROTL128(15)` | |
372 | stx %o4, [$out + 0x60] ! k[24, 25] | |
373 | `&ROTL128(15)` | |
374 | stx %o4, [$out + 0x70] ! k[28, 29] | |
375 | stx %o5, [$out + 0x78] ! k[30, 31] | |
376 | `&ROTL128(34)` | |
377 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
378 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
379 | `&ROTL128(17)` | |
380 | stx %o4, [$out + 0xc0] ! k[48, 49] | |
381 | stx %o5, [$out + 0xc8] ! k[50, 51] | |
382 | ||
383 | movdtox %f28, %o4 ! k[ 0, 1] | |
384 | movdtox %f30, %o5 ! k[ 2, 3] | |
385 | `&ROTL128(15)` | |
386 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
387 | stx %o5, [$out + 0x28] ! k[10, 11] | |
388 | `&ROTL128(30)` | |
389 | stx %o4, [$out + 0x50] ! k[20, 21] | |
390 | stx %o5, [$out + 0x58] ! k[22, 23] | |
391 | `&ROTL128(15)` | |
392 | stx %o5, [$out + 0x68] ! k[26, 27] | |
393 | `&ROTL128(17)` | |
394 | stx %o4, [$out + 0x80] ! k[32, 33] | |
395 | stx %o5, [$out + 0x88] ! k[34, 35] | |
396 | `&ROTL128(17)` | |
397 | stx %o4, [$out + 0x90] ! k[36, 37] | |
398 | stx %o5, [$out + 0x98] ! k[38, 39] | |
399 | `&ROTL128(17)` | |
400 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
401 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
402 | ||
403 | mov 3, $tmp | |
404 | st $tmp, [$out + 0x110] | |
405 | retl | |
406 | xor %o0, %o0, %o0 | |
407 | ||
408 | .align 16 | |
409 | .L256key: | |
410 | ldd [%o4 + 32], %f24 | |
411 | ldd [%o4 + 40], %f26 | |
412 | ||
413 | std %f0, [$out + 0x30] ! k[12, 13] | |
414 | std %f2, [$out + 0x38] ! k[14, 15] | |
415 | ||
416 | fxor %f4, %f0, %f0 | |
417 | fxor %f6, %f2, %f2 | |
418 | camellia_f %f24, %f2, %f0, %f2 | |
419 | camellia_f %f26, %f0, %f2, %f0 | |
420 | ||
421 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
422 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
423 | ||
424 | movdtox %f0, %o4 | |
425 | movdtox %f2, %o5 | |
426 | `&ROTL128(30)` | |
427 | stx %o4, [$out + 0x50] ! k[20, 21] | |
428 | stx %o5, [$out + 0x58] ! k[22, 23] | |
429 | `&ROTL128(30)` | |
430 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
431 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
432 | `&ROTL128(51)` | |
433 | stx %o4, [$out + 0x100] ! k[64, 65] | |
434 | stx %o5, [$out + 0x108] ! k[66, 67] | |
435 | ||
436 | movdtox %f4, %o4 ! k[ 8, 9] | |
437 | movdtox %f6, %o5 ! k[10, 11] | |
438 | `&ROTL128(15)` | |
439 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
440 | stx %o5, [$out + 0x28] ! k[10, 11] | |
441 | `&ROTL128(15)` | |
442 | stx %o4, [$out + 0x40] ! k[16, 17] | |
443 | stx %o5, [$out + 0x48] ! k[18, 19] | |
444 | `&ROTL128(30)` | |
445 | stx %o4, [$out + 0x90] ! k[36, 37] | |
446 | stx %o5, [$out + 0x98] ! k[38, 39] | |
447 | `&ROTL128(34)` | |
448 | stx %o4, [$out + 0xd0] ! k[52, 53] | |
449 | stx %o5, [$out + 0xd8] ! k[54, 55] | |
450 | ldx [$out + 0x30], %o4 ! k[12, 13] | |
451 | ldx [$out + 0x38], %o5 ! k[14, 15] | |
452 | `&ROTL128(15)` | |
453 | stx %o4, [$out + 0x30] ! k[12, 13] | |
454 | stx %o5, [$out + 0x38] ! k[14, 15] | |
455 | `&ROTL128(30)` | |
456 | stx %o4, [$out + 0x70] ! k[28, 29] | |
457 | stx %o5, [$out + 0x78] ! k[30, 31] | |
458 | srlx %o4, 32, %g4 | |
459 | srlx %o5, 32, %g5 | |
460 | st %o4, [$out + 0xc0] ! k[48] | |
461 | st %g5, [$out + 0xc4] ! k[49] | |
462 | st %o5, [$out + 0xc8] ! k[50] | |
463 | st %g4, [$out + 0xcc] ! k[51] | |
464 | `&ROTL128(49)` | |
465 | stx %o4, [$out + 0xe0] ! k[56, 57] | |
466 | stx %o5, [$out + 0xe8] ! k[58, 59] | |
467 | ||
468 | movdtox %f28, %o4 ! k[ 0, 1] | |
469 | movdtox %f30, %o5 ! k[ 2, 3] | |
470 | `&ROTL128(45)` | |
471 | stx %o4, [$out + 0x60] ! k[24, 25] | |
472 | stx %o5, [$out + 0x68] ! k[26, 27] | |
473 | `&ROTL128(15)` | |
474 | stx %o4, [$out + 0x80] ! k[32, 33] | |
475 | stx %o5, [$out + 0x88] ! k[34, 35] | |
476 | `&ROTL128(17)` | |
477 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
478 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
479 | `&ROTL128(34)` | |
480 | stx %o4, [$out + 0xf0] ! k[60, 61] | |
481 | stx %o5, [$out + 0xf8] ! k[62, 63] | |
482 | ||
483 | mov 4, $tmp | |
484 | st $tmp, [$out + 0x110] | |
485 | retl | |
486 | xor %o0, %o0, %o0 | |
487 | .type cmll_t4_set_key,#function | |
488 | .size cmll_t4_set_key,.-cmll_t4_set_key | |
489 | .align 32 | |
490 | SIGMA: | |
491 | .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2 | |
492 | .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c | |
493 | .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd | |
494 | .type SIGMA,#object | |
495 | .size SIGMA,.-SIGMA | |
496 | .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov" | |
497 | ___ | |
498 | } | |
499 | ||
# Generates the internal subroutines used by the CBC (and, when $::evp is
# set, CTR32) drivers: key-schedule loaders plus 1x and 2x interleaved block
# transforms for both schedule sizes. It then instantiates the drivers.
500 | {{{ | |
# Of the names declared on the next two lines only $key (%i3) is
# interpolated by the heredocs in this block. The others are not referenced
# here.
501 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); | |
502 | my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); | |
503 | ||
# _cmll128_load_enckey: %g4/%g5 receive the first two schedule dwords.
504 | $code.=<<___; | |
505 | .align 32 | |
506 | _cmll128_load_enckey: | |
507 | ldx [$key + 0], %g4 | |
508 | ldx [$key + 8], %g5 | |
509 | ___ | |
# Schedule dwords 2..25 are loaded into %f16, %f18, ..., %f62.
510 | for ($i=2; $i<26;$i++) { # load key schedule | |
511 | $code.=<<___; | |
512 | ldd [$key + `8*$i`], %f`12+2*$i` | |
513 | ___ | |
514 | } | |
# End of the encrypt loader (also exported as _cmll256_load_enckey) and
# start of the decrypt loaders. The 256 entry loads the dwords at +64/+72
# into %f62/%f60, advances $key by 64 in the branch delay slot and joins the
# common tail. The 128 entry loads +0/+8 into %f60/%f62.
515 | $code.=<<___; | |
516 | retl | |
517 | nop | |
518 | .type _cmll128_load_enckey,#function | |
519 | .size _cmll128_load_enckey,.-_cmll128_load_enckey | |
520 | _cmll256_load_enckey=_cmll128_load_enckey | |
521 | ||
522 | .align 32 | |
523 | _cmll256_load_deckey: | |
524 | ldd [$key + 64], %f62 | |
525 | ldd [$key + 72], %f60 | |
526 | b .Load_deckey | |
527 | add $key, 64, $key | |
528 | _cmll128_load_deckey: | |
529 | ldd [$key + 0], %f60 | |
530 | ldd [$key + 8], %f62 | |
531 | .Load_deckey: | |
532 | ___ | |
# Dwords 2..23 are loaded into %f58 down to %f16, i.e. in the opposite
# register order to the encrypt loader.
533 | for ($i=2; $i<24;$i++) { # load key schedule | |
534 | $code.=<<___; | |
535 | ldd [$key + `8*$i`], %f`62-2*$i` | |
536 | ___ | |
537 | } | |
# The decrypt loader finishes by loading %g4/%g5 from +192/+200 (relative to
# the possibly advanced $key). The label of the 128-bit 1x transform
# follows.
538 | $code.=<<___; | |
539 | ldx [$key + 192], %g4 | |
540 | retl | |
541 | ldx [$key + 200], %g5 | |
542 | .type _cmll256_load_deckey,#function | |
543 | .size _cmll256_load_deckey,.-_cmll256_load_deckey | |
544 | ||
545 | .align 32 | |
546 | _cmll128_encrypt_1x: | |
547 | ___ | |
# Three groups keyed from %f16.., %f32.. and %f48... The first two emit six
# camellia_f followed by camellia_fl/camellia_fli. The third emits only four
# camellia_f here; its last two rounds are in the tail below.
548 | for ($i=0; $i<3; $i++) { | |
549 | $code.=<<___; | |
550 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
551 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
552 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
553 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
554 | ___ | |
555 | $code.=<<___ if ($i<2); | |
556 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
557 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
558 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
559 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
560 | ___ | |
561 | } | |
# Last two rounds (%f56, %f58) and the final fxor with %f60/%f62. The decrypt
# entry point is an alias for the same code; presumably it relies on the
# reversed register order set up by the decrypt loader - confirm.
562 | $code.=<<___; | |
563 | camellia_f %f56, %f2, %f0, %f4 | |
564 | camellia_f %f58, %f0, %f4, %f2 | |
565 | fxor %f60, %f4, %f0 | |
566 | retl | |
567 | fxor %f62, %f2, %f2 | |
568 | .type _cmll128_encrypt_1x,#function | |
569 | .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x | |
570 | _cmll128_decrypt_1x=_cmll128_encrypt_1x | |
571 | ||
572 | .align 32 | |
573 | _cmll128_encrypt_2x: | |
574 | ___ | |
# Same round sequence for two blocks at once: one block lives in %f0/%f2,
# the other in %f4/%f6, and every round is issued for both.
575 | for ($i=0; $i<3; $i++) { | |
576 | $code.=<<___; | |
577 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
578 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
579 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
580 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
581 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
582 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
583 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
584 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
585 | ___ | |
586 | $code.=<<___ if ($i<2); | |
587 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
588 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
589 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
590 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
591 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
592 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
593 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
594 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
595 | ___ | |
596 | } | |
# Tail of the 2x transform (%f8/%f10 are temporaries), followed by the start
# of _cmll256_encrypt_1x. The 256-bit transform consumes %f16..%f30 for its
# first group and reloads them from $key+208..264 as it goes.
597 | $code.=<<___; | |
598 | camellia_f %f56, %f2, %f0, %f8 | |
599 | camellia_f %f56, %f6, %f4, %f10 | |
600 | camellia_f %f58, %f0, %f8, %f2 | |
601 | camellia_f %f58, %f4, %f10, %f6 | |
602 | fxor %f60, %f8, %f0 | |
603 | fxor %f60, %f10, %f4 | |
604 | fxor %f62, %f2, %f2 | |
605 | retl | |
606 | fxor %f62, %f6, %f6 | |
607 | .type _cmll128_encrypt_2x,#function | |
608 | .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x | |
609 | _cmll128_decrypt_2x=_cmll128_encrypt_2x | |
610 | ||
611 | .align 32 | |
612 | _cmll256_encrypt_1x: | |
613 | camellia_f %f16, %f2, %f0, %f2 | |
614 | camellia_f %f18, %f0, %f2, %f0 | |
615 | ldd [$key + 208], %f16 | |
616 | ldd [$key + 216], %f18 | |
617 | camellia_f %f20, %f2, %f0, %f2 | |
618 | camellia_f %f22, %f0, %f2, %f0 | |
619 | ldd [$key + 224], %f20 | |
620 | ldd [$key + 232], %f22 | |
621 | camellia_f %f24, %f2, %f0, %f2 | |
622 | camellia_f %f26, %f0, %f2, %f0 | |
623 | ldd [$key + 240], %f24 | |
624 | ldd [$key + 248], %f26 | |
625 | camellia_fl %f28, %f0, %f0 | |
626 | camellia_fli %f30, %f2, %f2 | |
627 | ldd [$key + 256], %f28 | |
628 | ldd [$key + 264], %f30 | |
629 | ___ | |
# The middle two groups are keyed from %f32..%f46 and %f48..%f62.
630 | for ($i=1; $i<3; $i++) { | |
631 | $code.=<<___; | |
632 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
633 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
634 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
635 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
636 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
637 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
638 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
639 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
640 | ___ | |
641 | } | |
# The final group uses the reloaded %f16..%f30 and, interleaved with the
# rounds, puts back the values from $key+16..72 that the encrypt loader
# originally placed in those registers. _cmll256_encrypt_2x then starts the
# same way for two blocks.
642 | $code.=<<___; | |
643 | camellia_f %f16, %f2, %f0, %f2 | |
644 | camellia_f %f18, %f0, %f2, %f0 | |
645 | ldd [$key + 16], %f16 | |
646 | ldd [$key + 24], %f18 | |
647 | camellia_f %f20, %f2, %f0, %f2 | |
648 | camellia_f %f22, %f0, %f2, %f0 | |
649 | ldd [$key + 32], %f20 | |
650 | ldd [$key + 40], %f22 | |
651 | camellia_f %f24, %f2, %f0, %f4 | |
652 | camellia_f %f26, %f0, %f4, %f2 | |
653 | ldd [$key + 48], %f24 | |
654 | ldd [$key + 56], %f26 | |
655 | fxor %f28, %f4, %f0 | |
656 | fxor %f30, %f2, %f2 | |
657 | ldd [$key + 64], %f28 | |
658 | retl | |
659 | ldd [$key + 72], %f30 | |
660 | .type _cmll256_encrypt_1x,#function | |
661 | .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x | |
662 | ||
663 | .align 32 | |
664 | _cmll256_encrypt_2x: | |
665 | camellia_f %f16, %f2, %f0, %f2 | |
666 | camellia_f %f16, %f6, %f4, %f6 | |
667 | camellia_f %f18, %f0, %f2, %f0 | |
668 | camellia_f %f18, %f4, %f6, %f4 | |
669 | ldd [$key + 208], %f16 | |
670 | ldd [$key + 216], %f18 | |
671 | camellia_f %f20, %f2, %f0, %f2 | |
672 | camellia_f %f20, %f6, %f4, %f6 | |
673 | camellia_f %f22, %f0, %f2, %f0 | |
674 | camellia_f %f22, %f4, %f6, %f4 | |
675 | ldd [$key + 224], %f20 | |
676 | ldd [$key + 232], %f22 | |
677 | camellia_f %f24, %f2, %f0, %f2 | |
678 | camellia_f %f24, %f6, %f4, %f6 | |
679 | camellia_f %f26, %f0, %f2, %f0 | |
680 | camellia_f %f26, %f4, %f6, %f4 | |
681 | ldd [$key + 240], %f24 | |
682 | ldd [$key + 248], %f26 | |
683 | camellia_fl %f28, %f0, %f0 | |
684 | camellia_fl %f28, %f4, %f4 | |
685 | camellia_fli %f30, %f2, %f2 | |
686 | camellia_fli %f30, %f6, %f6 | |
687 | ldd [$key + 256], %f28 | |
688 | ldd [$key + 264], %f30 | |
689 | ___ | |
# Middle two groups of the 256-bit 2x transform.
690 | for ($i=1; $i<3; $i++) { | |
691 | $code.=<<___; | |
692 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
693 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
694 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
695 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
696 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
697 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
698 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
699 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
700 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
701 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
702 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
703 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
704 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
705 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
706 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
707 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
708 | ___ | |
709 | } | |
# Final group of the 256-bit 2x encrypt, again putting back the values from
# $key+16..72 into %f16..%f30. _cmll256_decrypt_1x follows; its first group
# reloads %f16..%f30 from $key-8 down to $key-64.
710 | $code.=<<___; | |
711 | camellia_f %f16, %f2, %f0, %f2 | |
712 | camellia_f %f16, %f6, %f4, %f6 | |
713 | camellia_f %f18, %f0, %f2, %f0 | |
714 | camellia_f %f18, %f4, %f6, %f4 | |
715 | ldd [$key + 16], %f16 | |
716 | ldd [$key + 24], %f18 | |
717 | camellia_f %f20, %f2, %f0, %f2 | |
718 | camellia_f %f20, %f6, %f4, %f6 | |
719 | camellia_f %f22, %f0, %f2, %f0 | |
720 | camellia_f %f22, %f4, %f6, %f4 | |
721 | ldd [$key + 32], %f20 | |
722 | ldd [$key + 40], %f22 | |
723 | camellia_f %f24, %f2, %f0, %f8 | |
724 | camellia_f %f24, %f6, %f4, %f10 | |
725 | camellia_f %f26, %f0, %f8, %f2 | |
726 | camellia_f %f26, %f4, %f10, %f6 | |
727 | ldd [$key + 48], %f24 | |
728 | ldd [$key + 56], %f26 | |
729 | fxor %f28, %f8, %f0 | |
730 | fxor %f28, %f10, %f4 | |
731 | fxor %f30, %f2, %f2 | |
732 | fxor %f30, %f6, %f6 | |
733 | ldd [$key + 64], %f28 | |
734 | retl | |
735 | ldd [$key + 72], %f30 | |
736 | .type _cmll256_encrypt_2x,#function | |
737 | .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x | |
738 | ||
739 | .align 32 | |
740 | _cmll256_decrypt_1x: | |
741 | camellia_f %f16, %f2, %f0, %f2 | |
742 | camellia_f %f18, %f0, %f2, %f0 | |
743 | ldd [$key - 8], %f16 | |
744 | ldd [$key - 16], %f18 | |
745 | camellia_f %f20, %f2, %f0, %f2 | |
746 | camellia_f %f22, %f0, %f2, %f0 | |
747 | ldd [$key - 24], %f20 | |
748 | ldd [$key - 32], %f22 | |
749 | camellia_f %f24, %f2, %f0, %f2 | |
750 | camellia_f %f26, %f0, %f2, %f0 | |
751 | ldd [$key - 40], %f24 | |
752 | ldd [$key - 48], %f26 | |
753 | camellia_fl %f28, %f0, %f0 | |
754 | camellia_fli %f30, %f2, %f2 | |
755 | ldd [$key - 56], %f28 | |
756 | ldd [$key - 64], %f30 | |
757 | ___ | |
# Middle two groups of the 256-bit 1x decrypt.
758 | for ($i=1; $i<3; $i++) { | |
759 | $code.=<<___; | |
760 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
761 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
762 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
763 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
764 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
765 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
766 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
767 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
768 | ___ | |
769 | } | |
# Final group of the 1x decrypt. %f16..%f30 are put back from $key+184 down
# to $key+128, which matches what the decrypt loader placed there, and the
# final fxor operands (%f30/%f28) are swapped relative to the encrypt
# transform. _cmll256_decrypt_2x then starts.
770 | $code.=<<___; | |
771 | camellia_f %f16, %f2, %f0, %f2 | |
772 | camellia_f %f18, %f0, %f2, %f0 | |
773 | ldd [$key + 184], %f16 | |
774 | ldd [$key + 176], %f18 | |
775 | camellia_f %f20, %f2, %f0, %f2 | |
776 | camellia_f %f22, %f0, %f2, %f0 | |
777 | ldd [$key + 168], %f20 | |
778 | ldd [$key + 160], %f22 | |
779 | camellia_f %f24, %f2, %f0, %f4 | |
780 | camellia_f %f26, %f0, %f4, %f2 | |
781 | ldd [$key + 152], %f24 | |
782 | ldd [$key + 144], %f26 | |
783 | fxor %f30, %f4, %f0 | |
784 | fxor %f28, %f2, %f2 | |
785 | ldd [$key + 136], %f28 | |
786 | retl | |
787 | ldd [$key + 128], %f30 | |
788 | .type _cmll256_decrypt_1x,#function | |
789 | .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x | |
790 | ||
791 | .align 32 | |
792 | _cmll256_decrypt_2x: | |
793 | camellia_f %f16, %f2, %f0, %f2 | |
794 | camellia_f %f16, %f6, %f4, %f6 | |
795 | camellia_f %f18, %f0, %f2, %f0 | |
796 | camellia_f %f18, %f4, %f6, %f4 | |
797 | ldd [$key - 8], %f16 | |
798 | ldd [$key - 16], %f18 | |
799 | camellia_f %f20, %f2, %f0, %f2 | |
800 | camellia_f %f20, %f6, %f4, %f6 | |
801 | camellia_f %f22, %f0, %f2, %f0 | |
802 | camellia_f %f22, %f4, %f6, %f4 | |
803 | ldd [$key - 24], %f20 | |
804 | ldd [$key - 32], %f22 | |
805 | camellia_f %f24, %f2, %f0, %f2 | |
806 | camellia_f %f24, %f6, %f4, %f6 | |
807 | camellia_f %f26, %f0, %f2, %f0 | |
808 | camellia_f %f26, %f4, %f6, %f4 | |
809 | ldd [$key - 40], %f24 | |
810 | ldd [$key - 48], %f26 | |
811 | camellia_fl %f28, %f0, %f0 | |
812 | camellia_fl %f28, %f4, %f4 | |
813 | camellia_fli %f30, %f2, %f2 | |
814 | camellia_fli %f30, %f6, %f6 | |
815 | ldd [$key - 56], %f28 | |
816 | ldd [$key - 64], %f30 | |
817 | ___ | |
# Middle two groups of the 256-bit 2x decrypt.
818 | for ($i=1; $i<3; $i++) { | |
819 | $code.=<<___; | |
820 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
821 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
822 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
823 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
824 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
825 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
826 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
827 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
828 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
829 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
830 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
831 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
832 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
833 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
834 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
835 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
836 | ___ | |
837 | } | |
# Final group of the 2x decrypt, with the same register restoration as the
# 1x decrypt variant.
838 | $code.=<<___; | |
839 | camellia_f %f16, %f2, %f0, %f2 | |
840 | camellia_f %f16, %f6, %f4, %f6 | |
841 | camellia_f %f18, %f0, %f2, %f0 | |
842 | camellia_f %f18, %f4, %f6, %f4 | |
843 | ldd [$key + 184], %f16 | |
844 | ldd [$key + 176], %f18 | |
845 | camellia_f %f20, %f2, %f0, %f2 | |
846 | camellia_f %f20, %f6, %f4, %f6 | |
847 | camellia_f %f22, %f0, %f2, %f0 | |
848 | camellia_f %f22, %f4, %f6, %f4 | |
849 | ldd [$key + 168], %f20 | |
850 | ldd [$key + 160], %f22 | |
851 | camellia_f %f24, %f2, %f0, %f8 | |
852 | camellia_f %f24, %f6, %f4, %f10 | |
853 | camellia_f %f26, %f0, %f8, %f2 | |
854 | camellia_f %f26, %f4, %f10, %f6 | |
855 | ldd [$key + 152], %f24 | |
856 | ldd [$key + 144], %f26 | |
857 | fxor %f30, %f8, %f0 | |
858 | fxor %f30, %f10, %f4 | |
859 | fxor %f28, %f2, %f2 | |
860 | fxor %f28, %f6, %f6 | |
861 | ldd [$key + 136], %f28 | |
862 | retl | |
863 | ldd [$key + 128], %f30 | |
864 | .type _cmll256_decrypt_2x,#function | |
865 | .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x | |
866 | ___ | |
867 | ||
# Instantiate the mode drivers for both schedule sizes. The alg_*_implement
# generators are not defined in this file; presumably they come from
# sparcv9_modes.pl, required at the top - confirm. CTR32 is generated only
# for the EVP build.
868 | &alg_cbc_encrypt_implement("cmll",128); | |
869 | &alg_cbc_encrypt_implement("cmll",256); | |
870 | ||
871 | &alg_cbc_decrypt_implement("cmll",128); | |
872 | &alg_cbc_decrypt_implement("cmll",256); | |
873 | ||
874 | if ($::evp) { | |
875 | &alg_ctr32_implement("cmll",128); | |
876 | &alg_ctr32_implement("cmll",256); | |
877 | } | |
878 | }}} | |
879 | ||
# Non-EVP build only: emit the Camellia_* entry points named in the $::evp
# comment near the top of the script.
#  - Camellia_encrypt and Camellia_decrypt are plain aliases of the
#    cmll_t4_* routines.
#  - Camellia_set_key validates its arguments before tail-branching to
#    cmll_t4_set_key. It returns -1 when %o2 is not 8-byte aligned or when
#    %o0 or %o2 is zero. It returns -2 when %o1 has bits set outside 0x1c0
#    or is below 128. The return values are set in annulled delay slots.
880 | if (!$::evp) { | |
881 | $code.=<<___; | |
882 | .global Camellia_encrypt | |
883 | Camellia_encrypt=cmll_t4_encrypt | |
884 | .global Camellia_decrypt | |
885 | Camellia_decrypt=cmll_t4_decrypt | |
886 | .global Camellia_set_key | |
887 | .align 32 | |
888 | Camellia_set_key: | |
889 | andcc %o2, 7, %g0 ! double-check alignment | |
890 | bnz,a,pn %icc, 1f | |
891 | mov -1, %o0 | |
892 | brz,a,pn %o0, 1f | |
893 | mov -1, %o0 | |
894 | brz,a,pn %o2, 1f | |
895 | mov -1, %o0 | |
896 | andncc %o1, 0x1c0, %g0 | |
897 | bnz,a,pn %icc, 1f | |
898 | mov -2, %o0 | |
899 | cmp %o1, 128 | |
900 | bl,a,pn %icc, 1f | |
901 | mov -2, %o0 | |
902 | b cmll_t4_set_key | |
903 | nop | |
904 | 1: retl | |
905 | nop | |
906 | .type Camellia_set_key,#function | |
907 | .size Camellia_set_key,.-Camellia_set_key | |
908 | ___ | |
909 | ||
# Argument registers of Camellia_cbc_encrypt. Only $key and $enc are used by
# the dispatcher below.
910 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); | |
911 | ||
# Camellia_cbc_encrypt dispatches on two values: $enc (zero selects decrypt)
# and the word at [$key + 272] (3 selects the cmll128 routines, anything
# else the cmll256 ones). The "cmp %g1, 3" sits in the delay slot of the
# brz, so its condition codes are valid on both paths.
912 | $code.=<<___; | |
913 | .globl Camellia_cbc_encrypt | |
914 | .align 32 | |
915 | Camellia_cbc_encrypt: | |
916 | ld [$key + 272], %g1 | |
917 | nop | |
918 | brz $enc, .Lcbc_decrypt | |
919 | cmp %g1, 3 | |
920 | ||
921 | be,pt %icc, cmll128_t4_cbc_encrypt | |
922 | nop | |
923 | ba cmll256_t4_cbc_encrypt | |
924 | nop | |
925 | ||
926 | .Lcbc_decrypt: | |
927 | be,pt %icc, cmll128_t4_cbc_decrypt | |
928 | nop | |
929 | ba cmll256_t4_cbc_decrypt | |
930 | nop | |
931 | .type Camellia_cbc_encrypt,#function | |
932 | .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt | |
933 | ___ | |
934 | } | |
935 | ||
# Hand the accumulated $code over for output. emit_assembler is not defined
# in this file; presumably it comes from sparcv9_modes.pl and also expands
# the backtick-quoted expressions used in the heredocs - confirm.
936 | &emit_assembler(); | |
937 | ||
# Closing STDOUT is checked so that a write error surfacing at close time
# makes the script die.
a21314db | 938 | close STDOUT or die "error closing STDOUT: $!"; |