]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
4739ccdb AP |
9 | |
10 | # ==================================================================== | |
e3713c36 RS |
11 | # Written by David S. Miller and Andy Polyakov. |
12 | # The module is licensed under 2-clause BSD | |
4739ccdb AP |
13 | # license. October 2012. All rights reserved. |
14 | # ==================================================================== | |
15 | ||
16 | ###################################################################### | |
17 | # Camellia for SPARC T4. | |
18 | # | |
19 | # As with AES, the results below [for aligned data] are virtually identical | |
46f4e1be | 20 | # to critical path lengths for 3-cycle instruction latency: |
4739ccdb AP |
21 | # |
22 | # 128-bit key 192/256- | |
23 | # CBC encrypt 4.14/4.21(*) 5.46/5.52 | |
24 | # (*) numbers after slash are for | |
25 | # misaligned data. | |
26 | # | |
27 | # As with Intel AES-NI, question is if it's possible to improve | |
46f4e1be | 28 | # performance of parallelizable modes by interleaving round |
4739ccdb AP |
29 | # instructions. In Camellia every instruction depends on the |
30 | # previous one, which means that there is room for 2 additional ones | |
31 | # between two dependent instructions. Can we expect 3x performance improvement? | |
32 | # At least one can argue that it should be possible to break 2x | |
33 | # barrier... For some reason not even 2x appears to be possible: | |
34 | # | |
35 | # 128-bit key 192/256- | |
36 | # CBC decrypt 2.21/2.74 2.99/3.40 | |
37 | # CTR 2.15/2.68(*) 2.93/3.34 | |
38 | # (*) numbers after slash are for | |
39 | # misaligned data. | |
40 | # | |
41 | # This is for 2x interleave. But compared to 1x interleave CBC decrypt | |
42 | # improved by ... 0% for 128-bit key, and 11% for 192/256-bit one. | |
43 | # So out-of-order execution logic can take non-interleaved code | |
44 | # to 1.87x, but can't take the 2x interleaved one any further. There | |
45 | # surely is some explanation... As a result, 3x interleave was not even | |
46 | # attempted. Instead an effort was made to share implementations of | |
47 | # specific modes with the AES module (hence sparcv9_modes.pl). | |
48 | # | |
49 | # To anchor to something else, the software C implementation processes | |
50 | # one byte in 38 cycles with a 128-bit key on the same processor. | |
51 | ||
# Locate helper modules relative to this script.
# $dir receives the part of $0 up to and including its last "/" or "\".  The
# assignment is unconditional, so when $0 contains no directory separator $dir
# is whatever $1 happened to hold.  Both $dir and $dir../../perlasm are appended
# to @INC before sparcv9_modes.pl is required.
# NOTE(review): alg_cbc_encrypt_implement, alg_cbc_decrypt_implement,
# alg_ctr32_implement and emit_assembler are called by this script but are not
# defined in it -- presumably they come from sparcv9_modes.pl; confirm.
52 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
53 | push(@INC,"${dir}","${dir}../../perlasm"); | |
54 | require "sparcv9_modes.pl"; | |
55 | ||
eb77e888 AP |
# The last command-line argument names the output file.  STDOUT is redirected
# to it so that everything the script prints lands in that file.
# The three-argument form keeps the file name from being parsed as a mode, and
# a failed open is fatal instead of silently discarding the generated code.
# With no argument STDOUT is left untouched; passing undef to a three-argument
# open would otherwise open an anonymous temporary file.
56 | $output = pop; |
57 | open STDOUT,'>',$output or die "can't open $output: $!" if defined $output; |
4739ccdb AP |
58 | |
59 | $::evp=1; # if $::evp is set to 0, script generates module with | |
60 | # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt | |
61 | # entry points (and skips the alg_ctr32_implement calls). These are fully compatible with openssl/camellia.h. | |
62 | ||
63 | ###################################################################### | |
64 | # single-round subroutines | |
65 | # | |
# Template for the single-block entry points cmll_t4_encrypt and
# cmll_t4_decrypt.  The text between the ___ markers is assigned to $code
# verbatim (after Perl interpolation of the register variables), so nothing may
# be added inside it without changing the generated assembly.
#
# Register names come from the map below:
#   $inp=%o0  $out=%o1  $key=%o2  $rounds=%o3  $tmp=%o4  $mask=%o5
# %o3/%o4/%o5 are also used under their raw names as scratch while the input
# block is being assembled, before $rounds (encrypt) or $tmp/$mask are needed.
#
# Shape shared by both routines:
#   * the input pointer is rounded down to 8 bytes; if it was misaligned a
#     third 64-bit word is loaded and the block is rebuilt in %o4:%o5 with
#     shift/or pairs;
#   * the block is XORed with two 64-bit key words and moved to %f0/%f2;
#   * the 32-bit word at $key+272 is the "grandRounds" count (3 or 4); the
#     loop body (six camellia_f, then camellia_fl/camellia_fli) executes
#     grandRounds-1 times, followed by a tail of six camellia_f and two fxor;
#   * an 8-byte aligned output is written with two std; otherwise the result
#     is realigned with faligndata and the edges are written with partial
#     stores (stda ... 0xc0) under $mask and its complement.
# Encryption reads the key schedule upward from $key (+64 per iteration);
# decryption first adds 64*grandRounds to $key and reads it downward.
66 | { | |
67 | my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); | |
68 | ||
69 | $code=<<___; | |
eb77e888 AP |
70 | #include "sparc_arch.h" |
71 | ||
4739ccdb AP |
72 | .text |
73 | ||
74 | .globl cmll_t4_encrypt | |
75 | .align 32 | |
76 | cmll_t4_encrypt: | |
77 | andcc $inp, 7, %g1 ! is input aligned? | |
78 | andn $inp, 7, $inp | |
79 | ||
80 | ldx [$key + 0], %g4 | |
81 | ldx [$key + 8], %g5 | |
82 | ||
83 | ldx [$inp + 0], %o4 | |
84 | bz,pt %icc, 1f | |
85 | ldx [$inp + 8], %o5 | |
86 | ldx [$inp + 16], $inp | |
87 | sll %g1, 3, %g1 | |
88 | sub %g0, %g1, %o3 | |
89 | sllx %o4, %g1, %o4 | |
90 | sllx %o5, %g1, %g1 | |
91 | srlx %o5, %o3, %o5 | |
92 | srlx $inp, %o3, %o3 | |
93 | or %o5, %o4, %o4 | |
94 | or %o3, %g1, %o5 | |
95 | 1: | |
96 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
97 | ldd [$key + 16], %f12 | |
98 | ldd [$key + 24], %f14 | |
99 | xor %g4, %o4, %o4 | |
100 | xor %g5, %o5, %o5 | |
101 | ldd [$key + 32], %f16 | |
102 | ldd [$key + 40], %f18 | |
103 | movxtod %o4, %f0 | |
104 | movxtod %o5, %f2 | |
105 | ldd [$key + 48], %f20 | |
106 | ldd [$key + 56], %f22 | |
107 | sub $rounds, 1, $rounds | |
108 | ldd [$key + 64], %f24 | |
109 | ldd [$key + 72], %f26 | |
110 | add $key, 80, $key | |
111 | ||
112 | .Lenc: | |
113 | camellia_f %f12, %f2, %f0, %f2 | |
114 | ldd [$key + 0], %f12 | |
115 | sub $rounds,1,$rounds | |
116 | camellia_f %f14, %f0, %f2, %f0 | |
117 | ldd [$key + 8], %f14 | |
118 | camellia_f %f16, %f2, %f0, %f2 | |
119 | ldd [$key + 16], %f16 | |
120 | camellia_f %f18, %f0, %f2, %f0 | |
121 | ldd [$key + 24], %f18 | |
122 | camellia_f %f20, %f2, %f0, %f2 | |
123 | ldd [$key + 32], %f20 | |
124 | camellia_f %f22, %f0, %f2, %f0 | |
125 | ldd [$key + 40], %f22 | |
126 | camellia_fl %f24, %f0, %f0 | |
127 | ldd [$key + 48], %f24 | |
128 | camellia_fli %f26, %f2, %f2 | |
129 | ldd [$key + 56], %f26 | |
130 | brnz,pt $rounds, .Lenc | |
131 | add $key, 64, $key | |
132 | ||
133 | andcc $out, 7, $tmp ! is output aligned? | |
134 | camellia_f %f12, %f2, %f0, %f2 | |
135 | camellia_f %f14, %f0, %f2, %f0 | |
136 | camellia_f %f16, %f2, %f0, %f2 | |
137 | camellia_f %f18, %f0, %f2, %f0 | |
138 | camellia_f %f20, %f2, %f0, %f4 | |
139 | camellia_f %f22, %f0, %f4, %f2 | |
140 | fxor %f24, %f4, %f0 | |
141 | fxor %f26, %f2, %f2 | |
142 | ||
143 | bnz,pn %icc, 2f | |
144 | nop | |
145 | ||
146 | std %f0, [$out + 0] | |
147 | retl | |
148 | std %f2, [$out + 8] | |
149 | ||
150 | 2: alignaddrl $out, %g0, $out | |
151 | mov 0xff, $mask | |
152 | srl $mask, $tmp, $mask | |
153 | ||
154 | faligndata %f0, %f0, %f4 | |
155 | faligndata %f0, %f2, %f6 | |
156 | faligndata %f2, %f2, %f8 | |
157 | ||
158 | stda %f4, [$out + $mask]0xc0 ! partial store | |
159 | std %f6, [$out + 8] | |
160 | add $out, 16, $out | |
161 | orn %g0, $mask, $mask | |
162 | retl | |
163 | stda %f8, [$out + $mask]0xc0 ! partial store | |
164 | .type cmll_t4_encrypt,#function | |
165 | .size cmll_t4_encrypt,.-cmll_t4_encrypt | |
166 | ||
167 | .globl cmll_t4_decrypt | |
168 | .align 32 | |
169 | cmll_t4_decrypt: | |
170 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
171 | andcc $inp, 7, %g1 ! is input aligned? | |
172 | andn $inp, 7, $inp | |
173 | ||
174 | sll $rounds, 6, $rounds | |
175 | add $rounds, $key, $key | |
176 | ||
177 | ldx [$inp + 0], %o4 | |
178 | bz,pt %icc, 1f | |
179 | ldx [$inp + 8], %o5 | |
180 | ldx [$inp + 16], $inp | |
181 | sll %g1, 3, %g1 | |
182 | sub %g0, %g1, %g4 | |
183 | sllx %o4, %g1, %o4 | |
184 | sllx %o5, %g1, %g1 | |
185 | srlx %o5, %g4, %o5 | |
186 | srlx $inp, %g4, %g4 | |
187 | or %o5, %o4, %o4 | |
188 | or %g4, %g1, %o5 | |
189 | 1: | |
190 | ldx [$key + 0], %g4 | |
191 | ldx [$key + 8], %g5 | |
192 | ldd [$key - 8], %f12 | |
193 | ldd [$key - 16], %f14 | |
194 | xor %g4, %o4, %o4 | |
195 | xor %g5, %o5, %o5 | |
196 | ldd [$key - 24], %f16 | |
197 | ldd [$key - 32], %f18 | |
198 | movxtod %o4, %f0 | |
199 | movxtod %o5, %f2 | |
200 | ldd [$key - 40], %f20 | |
201 | ldd [$key - 48], %f22 | |
202 | sub $rounds, 64, $rounds | |
203 | ldd [$key - 56], %f24 | |
204 | ldd [$key - 64], %f26 | |
205 | sub $key, 64, $key | |
206 | ||
207 | .Ldec: | |
208 | camellia_f %f12, %f2, %f0, %f2 | |
209 | ldd [$key - 8], %f12 | |
210 | sub $rounds, 64, $rounds | |
211 | camellia_f %f14, %f0, %f2, %f0 | |
212 | ldd [$key - 16], %f14 | |
213 | camellia_f %f16, %f2, %f0, %f2 | |
214 | ldd [$key - 24], %f16 | |
215 | camellia_f %f18, %f0, %f2, %f0 | |
216 | ldd [$key - 32], %f18 | |
217 | camellia_f %f20, %f2, %f0, %f2 | |
218 | ldd [$key - 40], %f20 | |
219 | camellia_f %f22, %f0, %f2, %f0 | |
220 | ldd [$key - 48], %f22 | |
221 | camellia_fl %f24, %f0, %f0 | |
222 | ldd [$key - 56], %f24 | |
223 | camellia_fli %f26, %f2, %f2 | |
224 | ldd [$key - 64], %f26 | |
225 | brnz,pt $rounds, .Ldec | |
226 | sub $key, 64, $key | |
227 | ||
228 | andcc $out, 7, $tmp ! is output aligned? | |
229 | camellia_f %f12, %f2, %f0, %f2 | |
230 | camellia_f %f14, %f0, %f2, %f0 | |
231 | camellia_f %f16, %f2, %f0, %f2 | |
232 | camellia_f %f18, %f0, %f2, %f0 | |
233 | camellia_f %f20, %f2, %f0, %f4 | |
234 | camellia_f %f22, %f0, %f4, %f2 | |
235 | fxor %f26, %f4, %f0 | |
236 | fxor %f24, %f2, %f2 | |
237 | ||
238 | bnz,pn %icc, 2f | |
239 | nop | |
240 | ||
241 | std %f0, [$out + 0] | |
242 | retl | |
243 | std %f2, [$out + 8] | |
244 | ||
245 | 2: alignaddrl $out, %g0, $out | |
246 | mov 0xff, $mask | |
247 | srl $mask, $tmp, $mask | |
248 | ||
249 | faligndata %f0, %f0, %f4 | |
250 | faligndata %f0, %f2, %f6 | |
251 | faligndata %f2, %f2, %f8 | |
252 | ||
253 | stda %f4, [$out + $mask]0xc0 ! partial store | |
254 | std %f6, [$out + 8] | |
255 | add $out, 16, $out | |
256 | orn %g0, $mask, $mask | |
257 | retl | |
258 | stda %f8, [$out + $mask]0xc0 ! partial store | |
259 | .type cmll_t4_decrypt,#function | |
260 | .size cmll_t4_decrypt,.-cmll_t4_decrypt | |
261 | ___ | |
262 | } | |
263 | ||
264 | ###################################################################### | |
265 | # key setup subroutines | |
266 | # | |
# Template for cmll_t4_set_key and the SIGMA constant table it reads.
# The text between the ___ markers is appended to $code verbatim, so no
# comments may be added inside it.
267 | { | |
# ROTL128($rot) returns assembly text (instructions joined by "\n\t") that
# rotates the 128-bit quantity held in %o4 (upper half) : %o5 (lower half) left
# by $rot bits.  The bits shifted out of the top of %o5 are OR-ed into the low
# end of %o4, and the bits shifted out of %o4 wrap around into %o5.  %g4 and
# %g5 are overwritten as temporaries.  "64-$rot" is emitted as text, i.e. the
# subtraction is left for the assembler.  Callers in this file pass 15..51.
# The template below applies it repeatedly without reloading %o4/%o5, so the
# rotations accumulate.
# NOTE(review): the `&ROTL128(n)` spans inside the template are not expanded in
# this file -- presumably emit_assembler evaluates backtick spans; confirm.
268 | sub ROTL128 { | |
269 | my $rot = shift; | |
270 | ||
271 | "srlx %o4, 64-$rot, %g4\n\t". | |
272 | "sllx %o4, $rot, %o4\n\t". | |
273 | "srlx %o5, 64-$rot, %g5\n\t". | |
274 | "sllx %o5, $rot, %o5\n\t". | |
275 | "or %o4, %g5, %o4\n\t". | |
276 | "or %o5, %g4, %o5"; | |
277 | } | |
278 | ||
# Register names: $inp=%o0 (user key), $bits=%o1, $out=%o2 (key schedule),
# $tmp=%o3.  The map produces six names; the last two are discarded.
#
# What the template does, as far as it shows:
#   * "cmp $bits, 192" selects .L128 (below 192), .L192 (equal) or the 256-bit
#     path.  The same condition codes are consumed again by
#     "bge,pn %icc, .L256key" after the SIGMA rounds, so the instructions in
#     between must leave %icc alone.
#   * a misaligned user key is realigned with alignaddr/faligndata; for 192-bit
#     keys the fourth 64-bit word is synthesised with fnot2 from the third.
#   * the address of SIGMA is computed pc-relative via "call .+8", with %o7
#     saved in %o5 and restored afterwards.
#   * the word stored at $out+0x110 (= 272) is 3 on the 128-bit path and 4 on
#     the .L256key path; it is the grandRounds value the encrypt/decrypt
#     routines load from $key+272.  %o0 is zeroed before both returns.
279 | my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); | |
280 | $code.=<<___; | |
281 | .globl cmll_t4_set_key | |
282 | .align 32 | |
283 | cmll_t4_set_key: | |
284 | and $inp, 7, $tmp | |
285 | alignaddr $inp, %g0, $inp | |
286 | cmp $bits, 192 | |
287 | ldd [$inp + 0], %f0 | |
288 | bl,pt %icc,.L128 | |
289 | ldd [$inp + 8], %f2 | |
290 | ||
291 | be,pt %icc,.L192 | |
292 | ldd [$inp + 16], %f4 | |
293 | ||
294 | brz,pt $tmp, .L256aligned | |
295 | ldd [$inp + 24], %f6 | |
296 | ||
297 | ldd [$inp + 32], %f8 | |
298 | faligndata %f0, %f2, %f0 | |
299 | faligndata %f2, %f4, %f2 | |
300 | faligndata %f4, %f6, %f4 | |
301 | b .L256aligned | |
302 | faligndata %f6, %f8, %f6 | |
303 | ||
304 | .align 16 | |
305 | .L192: | |
306 | brz,a,pt $tmp, .L256aligned | |
307 | fnot2 %f4, %f6 | |
308 | ||
309 | ldd [$inp + 24], %f6 | |
310 | nop | |
311 | faligndata %f0, %f2, %f0 | |
312 | faligndata %f2, %f4, %f2 | |
313 | faligndata %f4, %f6, %f4 | |
314 | fnot2 %f4, %f6 | |
315 | ||
316 | .L256aligned: | |
317 | std %f0, [$out + 0] ! k[0, 1] | |
318 | fsrc2 %f0, %f28 | |
319 | std %f2, [$out + 8] ! k[2, 3] | |
320 | fsrc2 %f2, %f30 | |
321 | fxor %f4, %f0, %f0 | |
322 | b .L128key | |
323 | fxor %f6, %f2, %f2 | |
324 | ||
325 | .align 16 | |
326 | .L128: | |
327 | brz,pt $tmp, .L128aligned | |
328 | nop | |
329 | ||
330 | ldd [$inp + 16], %f4 | |
331 | nop | |
332 | faligndata %f0, %f2, %f0 | |
333 | faligndata %f2, %f4, %f2 | |
334 | ||
335 | .L128aligned: | |
336 | std %f0, [$out + 0] ! k[0, 1] | |
337 | fsrc2 %f0, %f28 | |
338 | std %f2, [$out + 8] ! k[2, 3] | |
339 | fsrc2 %f2, %f30 | |
340 | ||
341 | .L128key: | |
342 | mov %o7, %o5 | |
343 | 1: call .+8 | |
344 | add %o7, SIGMA-1b, %o4 | |
345 | mov %o5, %o7 | |
346 | ||
347 | ldd [%o4 + 0], %f16 | |
348 | ldd [%o4 + 8], %f18 | |
349 | ldd [%o4 + 16], %f20 | |
350 | ldd [%o4 + 24], %f22 | |
351 | ||
352 | camellia_f %f16, %f2, %f0, %f2 | |
353 | camellia_f %f18, %f0, %f2, %f0 | |
354 | fxor %f28, %f0, %f0 | |
355 | fxor %f30, %f2, %f2 | |
356 | camellia_f %f20, %f2, %f0, %f2 | |
357 | camellia_f %f22, %f0, %f2, %f0 | |
358 | ||
359 | bge,pn %icc, .L256key | |
360 | nop | |
361 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
362 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
363 | ||
364 | movdtox %f0, %o4 | |
365 | movdtox %f2, %o5 | |
366 | `&ROTL128(15)` | |
367 | stx %o4, [$out + 0x30] ! k[12, 13] | |
368 | stx %o5, [$out + 0x38] ! k[14, 15] | |
369 | `&ROTL128(15)` | |
370 | stx %o4, [$out + 0x40] ! k[16, 17] | |
371 | stx %o5, [$out + 0x48] ! k[18, 19] | |
372 | `&ROTL128(15)` | |
373 | stx %o4, [$out + 0x60] ! k[24, 25] | |
374 | `&ROTL128(15)` | |
375 | stx %o4, [$out + 0x70] ! k[28, 29] | |
376 | stx %o5, [$out + 0x78] ! k[30, 31] | |
377 | `&ROTL128(34)` | |
378 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
379 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
380 | `&ROTL128(17)` | |
381 | stx %o4, [$out + 0xc0] ! k[48, 49] | |
382 | stx %o5, [$out + 0xc8] ! k[50, 51] | |
383 | ||
384 | movdtox %f28, %o4 ! k[ 0, 1] | |
385 | movdtox %f30, %o5 ! k[ 2, 3] | |
386 | `&ROTL128(15)` | |
387 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
388 | stx %o5, [$out + 0x28] ! k[10, 11] | |
389 | `&ROTL128(30)` | |
390 | stx %o4, [$out + 0x50] ! k[20, 21] | |
391 | stx %o5, [$out + 0x58] ! k[22, 23] | |
392 | `&ROTL128(15)` | |
393 | stx %o5, [$out + 0x68] ! k[26, 27] | |
394 | `&ROTL128(17)` | |
395 | stx %o4, [$out + 0x80] ! k[32, 33] | |
396 | stx %o5, [$out + 0x88] ! k[34, 35] | |
397 | `&ROTL128(17)` | |
398 | stx %o4, [$out + 0x90] ! k[36, 37] | |
399 | stx %o5, [$out + 0x98] ! k[38, 39] | |
400 | `&ROTL128(17)` | |
401 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
402 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
403 | ||
404 | mov 3, $tmp | |
405 | st $tmp, [$out + 0x110] | |
406 | retl | |
407 | xor %o0, %o0, %o0 | |
408 | ||
409 | .align 16 | |
410 | .L256key: | |
411 | ldd [%o4 + 32], %f24 | |
412 | ldd [%o4 + 40], %f26 | |
413 | ||
414 | std %f0, [$out + 0x30] ! k[12, 13] | |
415 | std %f2, [$out + 0x38] ! k[14, 15] | |
416 | ||
417 | fxor %f4, %f0, %f0 | |
418 | fxor %f6, %f2, %f2 | |
419 | camellia_f %f24, %f2, %f0, %f2 | |
420 | camellia_f %f26, %f0, %f2, %f0 | |
421 | ||
422 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
423 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
424 | ||
425 | movdtox %f0, %o4 | |
426 | movdtox %f2, %o5 | |
427 | `&ROTL128(30)` | |
428 | stx %o4, [$out + 0x50] ! k[20, 21] | |
429 | stx %o5, [$out + 0x58] ! k[22, 23] | |
430 | `&ROTL128(30)` | |
431 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
432 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
433 | `&ROTL128(51)` | |
434 | stx %o4, [$out + 0x100] ! k[64, 65] | |
435 | stx %o5, [$out + 0x108] ! k[66, 67] | |
436 | ||
437 | movdtox %f4, %o4 ! k[ 8, 9] | |
438 | movdtox %f6, %o5 ! k[10, 11] | |
439 | `&ROTL128(15)` | |
440 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
441 | stx %o5, [$out + 0x28] ! k[10, 11] | |
442 | `&ROTL128(15)` | |
443 | stx %o4, [$out + 0x40] ! k[16, 17] | |
444 | stx %o5, [$out + 0x48] ! k[18, 19] | |
445 | `&ROTL128(30)` | |
446 | stx %o4, [$out + 0x90] ! k[36, 37] | |
447 | stx %o5, [$out + 0x98] ! k[38, 39] | |
448 | `&ROTL128(34)` | |
449 | stx %o4, [$out + 0xd0] ! k[52, 53] | |
450 | stx %o5, [$out + 0xd8] ! k[54, 55] | |
451 | ldx [$out + 0x30], %o4 ! k[12, 13] | |
452 | ldx [$out + 0x38], %o5 ! k[14, 15] | |
453 | `&ROTL128(15)` | |
454 | stx %o4, [$out + 0x30] ! k[12, 13] | |
455 | stx %o5, [$out + 0x38] ! k[14, 15] | |
456 | `&ROTL128(30)` | |
457 | stx %o4, [$out + 0x70] ! k[28, 29] | |
458 | stx %o5, [$out + 0x78] ! k[30, 31] | |
459 | srlx %o4, 32, %g4 | |
460 | srlx %o5, 32, %g5 | |
461 | st %o4, [$out + 0xc0] ! k[48] | |
462 | st %g5, [$out + 0xc4] ! k[49] | |
463 | st %o5, [$out + 0xc8] ! k[50] | |
464 | st %g4, [$out + 0xcc] ! k[51] | |
465 | `&ROTL128(49)` | |
466 | stx %o4, [$out + 0xe0] ! k[56, 57] | |
467 | stx %o5, [$out + 0xe8] ! k[58, 59] | |
468 | ||
469 | movdtox %f28, %o4 ! k[ 0, 1] | |
470 | movdtox %f30, %o5 ! k[ 2, 3] | |
471 | `&ROTL128(45)` | |
472 | stx %o4, [$out + 0x60] ! k[24, 25] | |
473 | stx %o5, [$out + 0x68] ! k[26, 27] | |
474 | `&ROTL128(15)` | |
475 | stx %o4, [$out + 0x80] ! k[32, 33] | |
476 | stx %o5, [$out + 0x88] ! k[34, 35] | |
477 | `&ROTL128(17)` | |
478 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
479 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
480 | `&ROTL128(34)` | |
481 | stx %o4, [$out + 0xf0] ! k[60, 61] | |
482 | stx %o5, [$out + 0xf8] ! k[62, 63] | |
483 | ||
484 | mov 4, $tmp | |
485 | st $tmp, [$out + 0x110] | |
486 | retl | |
487 | xor %o0, %o0, %o0 | |
488 | .type cmll_t4_set_key,#function | |
489 | .size cmll_t4_set_key,.-cmll_t4_set_key | |
490 | .align 32 | |
491 | SIGMA: | |
492 | .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2 | |
493 | .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c | |
494 | .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd | |
495 | .type SIGMA,#object | |
496 | .size SIGMA,.-SIGMA | |
497 | .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov" | |
498 | ___ | |
499 | } | |
500 | ||
# Helper routines for the shared mode code, followed by the calls that
# instantiate the CBC (and, when $::evp is set, CTR32) implementations.
# All assembly below is appended to $code verbatim; comments may only be added
# at Perl level, outside the ___ delimited templates.
#
# Register names: $inp,$out,$len,$key,$ivec,$enc = %i0..%i5 and
# $ileft,$iright,$ooff,$omask,$ivoff = %l1..%l5 (the map yields seven names,
# the last two are discarded).  Only $key is referenced by the templates in
# this block.
# NOTE(review): the alg_*_implement routines are not defined in this file;
# presumably they emit cmll{128,256}_t4_* entry points that call the
# _cmll* helpers generated here -- confirm against sparcv9_modes.pl.
501 | {{{ | |
502 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); | |
503 | my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); | |
504 | ||
# _cmll128_load_enckey: key words 0 and 1 go to %g4/%g5; the loop then emits
# one ldd per key word 2..25, landing in %f16..%f62 in ascending order.
# _cmll256_load_enckey is defined as an alias of the same routine.
505 | $code.=<<___; | |
506 | .align 32 | |
507 | _cmll128_load_enckey: | |
508 | ldx [$key + 0], %g4 | |
509 | ldx [$key + 8], %g5 | |
510 | ___ | |
511 | for ($i=2; $i<26;$i++) { # load key schedule | |
512 | $code.=<<___; | |
513 | ldd [$key + `8*$i`], %f`12+2*$i` | |
514 | ___ | |
515 | } | |
# _cmll256_load_deckey / _cmll128_load_deckey: the decryption loaders fill the
# registers in descending order (%f58 down to %f16 for words 2..23), which is
# why the 128-bit decrypt helpers further down can simply alias the encrypt
# ones.  The 256-bit entry first advances $key by 64 and loads its leading two
# words into %f62/%f60, i.e. swapped relative to the 128-bit entry.
516 | $code.=<<___; | |
517 | retl | |
518 | nop | |
519 | .type _cmll128_load_enckey,#function | |
520 | .size _cmll128_load_enckey,.-_cmll128_load_enckey | |
521 | _cmll256_load_enckey=_cmll128_load_enckey | |
522 | ||
523 | .align 32 | |
524 | _cmll256_load_deckey: | |
525 | ldd [$key + 64], %f62 | |
526 | ldd [$key + 72], %f60 | |
527 | b .Load_deckey | |
528 | add $key, 64, $key | |
529 | _cmll128_load_deckey: | |
530 | ldd [$key + 0], %f60 | |
531 | ldd [$key + 8], %f62 | |
532 | .Load_deckey: | |
533 | ___ | |
534 | for ($i=2; $i<24;$i++) { # load key schedule | |
535 | $code.=<<___; | |
536 | ldd [$key + `8*$i`], %f`62-2*$i` | |
537 | ___ | |
538 | } | |
# _cmll128_encrypt_1x: three generated groups using key registers starting at
# %f16, %f32 and %f48.  The first two groups are six camellia_f followed by
# camellia_fl/camellia_fli; the third has only four camellia_f and is followed
# by the hand-written tail (two camellia_f with %f56/%f58, then fxor with
# %f60/%f62).
539 | $code.=<<___; | |
540 | ldx [$key + 192], %g4 | |
541 | retl | |
542 | ldx [$key + 200], %g5 | |
543 | .type _cmll256_load_deckey,#function | |
544 | .size _cmll256_load_deckey,.-_cmll256_load_deckey | |
545 | ||
546 | .align 32 | |
547 | _cmll128_encrypt_1x: | |
548 | ___ | |
549 | for ($i=0; $i<3; $i++) { | |
550 | $code.=<<___; | |
551 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
552 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
553 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
554 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
555 | ___ | |
556 | $code.=<<___ if ($i<2); | |
557 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
558 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
559 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
560 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
561 | ___ | |
562 | } | |
# _cmll128_encrypt_2x: same schedule as the 1x routine with every instruction
# duplicated for a second block held in %f4/%f6 (temporaries %f8/%f10).
563 | $code.=<<___; | |
564 | camellia_f %f56, %f2, %f0, %f4 | |
565 | camellia_f %f58, %f0, %f4, %f2 | |
566 | fxor %f60, %f4, %f0 | |
567 | retl | |
568 | fxor %f62, %f2, %f2 | |
569 | .type _cmll128_encrypt_1x,#function | |
570 | .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x | |
571 | _cmll128_decrypt_1x=_cmll128_encrypt_1x | |
572 | ||
573 | .align 32 | |
574 | _cmll128_encrypt_2x: | |
575 | ___ | |
576 | for ($i=0; $i<3; $i++) { | |
577 | $code.=<<___; | |
578 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
579 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
580 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
581 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
582 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
583 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
584 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
585 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
586 | ___ | |
587 | $code.=<<___ if ($i<2); | |
588 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
589 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
590 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
591 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
592 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
593 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
594 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
595 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
596 | ___ | |
597 | } | |
# _cmll256_encrypt_1x: the first group uses %f16..%f30 and, as each pair is
# consumed, reloads it from [$key + 208..264] for the fourth group.  The loop
# emits the two middle groups (%f32.. and %f48..).  The tail uses the reloaded
# %f16..%f30 and restores them from [$key + 16..72] before returning, so the
# register contents on exit match what the key loader left there.
598 | $code.=<<___; | |
599 | camellia_f %f56, %f2, %f0, %f8 | |
600 | camellia_f %f56, %f6, %f4, %f10 | |
601 | camellia_f %f58, %f0, %f8, %f2 | |
602 | camellia_f %f58, %f4, %f10, %f6 | |
603 | fxor %f60, %f8, %f0 | |
604 | fxor %f60, %f10, %f4 | |
605 | fxor %f62, %f2, %f2 | |
606 | retl | |
607 | fxor %f62, %f6, %f6 | |
608 | .type _cmll128_encrypt_2x,#function | |
609 | .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x | |
610 | _cmll128_decrypt_2x=_cmll128_encrypt_2x | |
611 | ||
612 | .align 32 | |
613 | _cmll256_encrypt_1x: | |
614 | camellia_f %f16, %f2, %f0, %f2 | |
615 | camellia_f %f18, %f0, %f2, %f0 | |
616 | ldd [$key + 208], %f16 | |
617 | ldd [$key + 216], %f18 | |
618 | camellia_f %f20, %f2, %f0, %f2 | |
619 | camellia_f %f22, %f0, %f2, %f0 | |
620 | ldd [$key + 224], %f20 | |
621 | ldd [$key + 232], %f22 | |
622 | camellia_f %f24, %f2, %f0, %f2 | |
623 | camellia_f %f26, %f0, %f2, %f0 | |
624 | ldd [$key + 240], %f24 | |
625 | ldd [$key + 248], %f26 | |
626 | camellia_fl %f28, %f0, %f0 | |
627 | camellia_fli %f30, %f2, %f2 | |
628 | ldd [$key + 256], %f28 | |
629 | ldd [$key + 264], %f30 | |
630 | ___ | |
631 | for ($i=1; $i<3; $i++) { | |
632 | $code.=<<___; | |
633 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
634 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
635 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
636 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
637 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
638 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
639 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
640 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
641 | ___ | |
642 | } | |
# _cmll256_encrypt_2x: two-block variant of the routine above (second block in
# %f4/%f6, temporaries %f8/%f10), with the same reload/restore pattern.
643 | $code.=<<___; | |
644 | camellia_f %f16, %f2, %f0, %f2 | |
645 | camellia_f %f18, %f0, %f2, %f0 | |
646 | ldd [$key + 16], %f16 | |
647 | ldd [$key + 24], %f18 | |
648 | camellia_f %f20, %f2, %f0, %f2 | |
649 | camellia_f %f22, %f0, %f2, %f0 | |
650 | ldd [$key + 32], %f20 | |
651 | ldd [$key + 40], %f22 | |
652 | camellia_f %f24, %f2, %f0, %f4 | |
653 | camellia_f %f26, %f0, %f4, %f2 | |
654 | ldd [$key + 48], %f24 | |
655 | ldd [$key + 56], %f26 | |
656 | fxor %f28, %f4, %f0 | |
657 | fxor %f30, %f2, %f2 | |
658 | ldd [$key + 64], %f28 | |
659 | retl | |
660 | ldd [$key + 72], %f30 | |
661 | .type _cmll256_encrypt_1x,#function | |
662 | .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x | |
663 | ||
664 | .align 32 | |
665 | _cmll256_encrypt_2x: | |
666 | camellia_f %f16, %f2, %f0, %f2 | |
667 | camellia_f %f16, %f6, %f4, %f6 | |
668 | camellia_f %f18, %f0, %f2, %f0 | |
669 | camellia_f %f18, %f4, %f6, %f4 | |
670 | ldd [$key + 208], %f16 | |
671 | ldd [$key + 216], %f18 | |
672 | camellia_f %f20, %f2, %f0, %f2 | |
673 | camellia_f %f20, %f6, %f4, %f6 | |
674 | camellia_f %f22, %f0, %f2, %f0 | |
675 | camellia_f %f22, %f4, %f6, %f4 | |
676 | ldd [$key + 224], %f20 | |
677 | ldd [$key + 232], %f22 | |
678 | camellia_f %f24, %f2, %f0, %f2 | |
679 | camellia_f %f24, %f6, %f4, %f6 | |
680 | camellia_f %f26, %f0, %f2, %f0 | |
681 | camellia_f %f26, %f4, %f6, %f4 | |
682 | ldd [$key + 240], %f24 | |
683 | ldd [$key + 248], %f26 | |
684 | camellia_fl %f28, %f0, %f0 | |
685 | camellia_fl %f28, %f4, %f4 | |
686 | camellia_fli %f30, %f2, %f2 | |
687 | camellia_fli %f30, %f6, %f6 | |
688 | ldd [$key + 256], %f28 | |
689 | ldd [$key + 264], %f30 | |
690 | ___ | |
691 | for ($i=1; $i<3; $i++) { | |
692 | $code.=<<___; | |
693 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
694 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
695 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
696 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
697 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
698 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
699 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
700 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
701 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
702 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
703 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
704 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
705 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
706 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
707 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
708 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
709 | ___ | |
710 | } | |
# _cmll256_decrypt_1x: mirror image of the 256-bit encrypt helper.  The first
# group's registers are reloaded from below $key ([$key - 8] .. [$key - 64]) and
# restored from [$key + 184] .. [$key + 128] before returning; the final fxor
# pair uses %f30 then %f28, the reverse of the encrypt order.
711 | $code.=<<___; | |
712 | camellia_f %f16, %f2, %f0, %f2 | |
713 | camellia_f %f16, %f6, %f4, %f6 | |
714 | camellia_f %f18, %f0, %f2, %f0 | |
715 | camellia_f %f18, %f4, %f6, %f4 | |
716 | ldd [$key + 16], %f16 | |
717 | ldd [$key + 24], %f18 | |
718 | camellia_f %f20, %f2, %f0, %f2 | |
719 | camellia_f %f20, %f6, %f4, %f6 | |
720 | camellia_f %f22, %f0, %f2, %f0 | |
721 | camellia_f %f22, %f4, %f6, %f4 | |
722 | ldd [$key + 32], %f20 | |
723 | ldd [$key + 40], %f22 | |
724 | camellia_f %f24, %f2, %f0, %f8 | |
725 | camellia_f %f24, %f6, %f4, %f10 | |
726 | camellia_f %f26, %f0, %f8, %f2 | |
727 | camellia_f %f26, %f4, %f10, %f6 | |
728 | ldd [$key + 48], %f24 | |
729 | ldd [$key + 56], %f26 | |
730 | fxor %f28, %f8, %f0 | |
731 | fxor %f28, %f10, %f4 | |
732 | fxor %f30, %f2, %f2 | |
733 | fxor %f30, %f6, %f6 | |
734 | ldd [$key + 64], %f28 | |
735 | retl | |
736 | ldd [$key + 72], %f30 | |
737 | .type _cmll256_encrypt_2x,#function | |
738 | .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x | |
739 | ||
740 | .align 32 | |
741 | _cmll256_decrypt_1x: | |
742 | camellia_f %f16, %f2, %f0, %f2 | |
743 | camellia_f %f18, %f0, %f2, %f0 | |
744 | ldd [$key - 8], %f16 | |
745 | ldd [$key - 16], %f18 | |
746 | camellia_f %f20, %f2, %f0, %f2 | |
747 | camellia_f %f22, %f0, %f2, %f0 | |
748 | ldd [$key - 24], %f20 | |
749 | ldd [$key - 32], %f22 | |
750 | camellia_f %f24, %f2, %f0, %f2 | |
751 | camellia_f %f26, %f0, %f2, %f0 | |
752 | ldd [$key - 40], %f24 | |
753 | ldd [$key - 48], %f26 | |
754 | camellia_fl %f28, %f0, %f0 | |
755 | camellia_fli %f30, %f2, %f2 | |
756 | ldd [$key - 56], %f28 | |
757 | ldd [$key - 64], %f30 | |
758 | ___ | |
759 | for ($i=1; $i<3; $i++) { | |
760 | $code.=<<___; | |
761 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
762 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
763 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
764 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
765 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
766 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
767 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
768 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
769 | ___ | |
770 | } | |
# _cmll256_decrypt_2x: two-block variant of _cmll256_decrypt_1x.
771 | $code.=<<___; | |
772 | camellia_f %f16, %f2, %f0, %f2 | |
773 | camellia_f %f18, %f0, %f2, %f0 | |
774 | ldd [$key + 184], %f16 | |
775 | ldd [$key + 176], %f18 | |
776 | camellia_f %f20, %f2, %f0, %f2 | |
777 | camellia_f %f22, %f0, %f2, %f0 | |
778 | ldd [$key + 168], %f20 | |
779 | ldd [$key + 160], %f22 | |
780 | camellia_f %f24, %f2, %f0, %f4 | |
781 | camellia_f %f26, %f0, %f4, %f2 | |
782 | ldd [$key + 152], %f24 | |
783 | ldd [$key + 144], %f26 | |
784 | fxor %f30, %f4, %f0 | |
785 | fxor %f28, %f2, %f2 | |
786 | ldd [$key + 136], %f28 | |
787 | retl | |
788 | ldd [$key + 128], %f30 | |
789 | .type _cmll256_decrypt_1x,#function | |
790 | .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x | |
791 | ||
792 | .align 32 | |
793 | _cmll256_decrypt_2x: | |
794 | camellia_f %f16, %f2, %f0, %f2 | |
795 | camellia_f %f16, %f6, %f4, %f6 | |
796 | camellia_f %f18, %f0, %f2, %f0 | |
797 | camellia_f %f18, %f4, %f6, %f4 | |
798 | ldd [$key - 8], %f16 | |
799 | ldd [$key - 16], %f18 | |
800 | camellia_f %f20, %f2, %f0, %f2 | |
801 | camellia_f %f20, %f6, %f4, %f6 | |
802 | camellia_f %f22, %f0, %f2, %f0 | |
803 | camellia_f %f22, %f4, %f6, %f4 | |
804 | ldd [$key - 24], %f20 | |
805 | ldd [$key - 32], %f22 | |
806 | camellia_f %f24, %f2, %f0, %f2 | |
807 | camellia_f %f24, %f6, %f4, %f6 | |
808 | camellia_f %f26, %f0, %f2, %f0 | |
809 | camellia_f %f26, %f4, %f6, %f4 | |
810 | ldd [$key - 40], %f24 | |
811 | ldd [$key - 48], %f26 | |
812 | camellia_fl %f28, %f0, %f0 | |
813 | camellia_fl %f28, %f4, %f4 | |
814 | camellia_fli %f30, %f2, %f2 | |
815 | camellia_fli %f30, %f6, %f6 | |
816 | ldd [$key - 56], %f28 | |
817 | ldd [$key - 64], %f30 | |
818 | ___ | |
819 | for ($i=1; $i<3; $i++) { | |
820 | $code.=<<___; | |
821 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
822 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
823 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
824 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
825 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
826 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
827 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
828 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
829 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
830 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
831 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
832 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
833 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
834 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
835 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
836 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
837 | ___ | |
838 | } | |
839 | $code.=<<___; | |
840 | camellia_f %f16, %f2, %f0, %f2 | |
841 | camellia_f %f16, %f6, %f4, %f6 | |
842 | camellia_f %f18, %f0, %f2, %f0 | |
843 | camellia_f %f18, %f4, %f6, %f4 | |
844 | ldd [$key + 184], %f16 | |
845 | ldd [$key + 176], %f18 | |
846 | camellia_f %f20, %f2, %f0, %f2 | |
847 | camellia_f %f20, %f6, %f4, %f6 | |
848 | camellia_f %f22, %f0, %f2, %f0 | |
849 | camellia_f %f22, %f4, %f6, %f4 | |
850 | ldd [$key + 168], %f20 | |
851 | ldd [$key + 160], %f22 | |
852 | camellia_f %f24, %f2, %f0, %f8 | |
853 | camellia_f %f24, %f6, %f4, %f10 | |
854 | camellia_f %f26, %f0, %f8, %f2 | |
855 | camellia_f %f26, %f4, %f10, %f6 | |
856 | ldd [$key + 152], %f24 | |
857 | ldd [$key + 144], %f26 | |
858 | fxor %f30, %f8, %f0 | |
859 | fxor %f30, %f10, %f4 | |
860 | fxor %f28, %f2, %f2 | |
861 | fxor %f28, %f6, %f6 | |
862 | ldd [$key + 136], %f28 | |
863 | retl | |
864 | ldd [$key + 128], %f30 | |
865 | .type _cmll256_decrypt_2x,#function | |
866 | .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x | |
867 | ___ | |
868 | ||
# Instantiate the shared mode implementations for both key-size classes.
# CTR32 is generated only when $::evp is set.
869 | &alg_cbc_encrypt_implement("cmll",128); | |
870 | &alg_cbc_encrypt_implement("cmll",256); | |
871 | ||
872 | &alg_cbc_decrypt_implement("cmll",128); | |
873 | &alg_cbc_decrypt_implement("cmll",256); | |
874 | ||
875 | if ($::evp) { | |
876 | &alg_ctr32_implement("cmll",128); | |
877 | &alg_ctr32_implement("cmll",256); | |
878 | } | |
879 | }}} | |
880 | ||
# Extra entry points, emitted only when $::evp is 0.
#   * Camellia_encrypt / Camellia_decrypt are plain aliases of
#     cmll_t4_encrypt / cmll_t4_decrypt.
#   * Camellia_set_key validates its arguments before branching to
#     cmll_t4_set_key: it returns -1 when %o2 is not 8-byte aligned or when
#     %o0 or %o2 is zero, and -2 when %o1 has any bit set outside 0x1c0 or is
#     below 128.
#   * Camellia_cbc_encrypt loads the word at $key+272 and compares it with 3;
#     a zero $enc selects the decrypt pair, and within each pair "equal to 3"
#     selects the cmll128 routine, anything else the cmll256 one.
# NOTE(review): the cmll{128,256}_t4_cbc_{en,de}crypt labels are not defined in
# this file -- presumably generated by the alg_cbc_*_implement calls; confirm.
881 | if (!$::evp) { | |
882 | $code.=<<___; | |
883 | .global Camellia_encrypt | |
884 | Camellia_encrypt=cmll_t4_encrypt | |
885 | .global Camellia_decrypt | |
886 | Camellia_decrypt=cmll_t4_decrypt | |
887 | .global Camellia_set_key | |
888 | .align 32 | |
889 | Camellia_set_key: | |
890 | andcc %o2, 7, %g0 ! double-check alignment | |
891 | bnz,a,pn %icc, 1f | |
892 | mov -1, %o0 | |
893 | brz,a,pn %o0, 1f | |
894 | mov -1, %o0 | |
895 | brz,a,pn %o2, 1f | |
896 | mov -1, %o0 | |
897 | andncc %o1, 0x1c0, %g0 | |
898 | bnz,a,pn %icc, 1f | |
899 | mov -2, %o0 | |
900 | cmp %o1, 128 | |
901 | bl,a,pn %icc, 1f | |
902 | mov -2, %o0 | |
903 | b cmll_t4_set_key | |
904 | nop | |
905 | 1: retl | |
906 | nop | |
907 | .type Camellia_set_key,#function | |
908 | .size Camellia_set_key,.-Camellia_set_key | |
909 | ___ | |
910 | ||
# Register names for the dispatcher: $inp..$enc = %o0..%o5, so $key is %o3 and
# $enc is %o5.  Only $key and $enc are referenced by the template.
911 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); | |
912 | ||
913 | $code.=<<___; | |
914 | .globl Camellia_cbc_encrypt | |
915 | .align 32 | |
916 | Camellia_cbc_encrypt: | |
917 | ld [$key + 272], %g1 | |
918 | nop | |
919 | brz $enc, .Lcbc_decrypt | |
920 | cmp %g1, 3 | |
921 | ||
922 | be,pt %icc, cmll128_t4_cbc_encrypt | |
923 | nop | |
924 | ba cmll256_t4_cbc_encrypt | |
925 | nop | |
926 | ||
927 | .Lcbc_decrypt: | |
928 | be,pt %icc, cmll128_t4_cbc_decrypt | |
929 | nop | |
930 | ba cmll256_t4_cbc_decrypt | |
931 | nop | |
932 | .type Camellia_cbc_encrypt,#function | |
933 | .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt | |
934 | ___ | |
935 | } | |
936 | ||
# Hand the accumulated $code to emit_assembler, which is not defined in this
# file (presumably provided by sparcv9_modes.pl -- confirm).
937 | &emit_assembler(); | |
938 | ||
# Buffered write errors (full disk, broken pipe) only surface when the handle
# is closed, so check the result instead of silently leaving a truncated
# assembly file behind for the build to pick up.
939 | close STDOUT or die "error closing STDOUT: $!"; |