]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
5593d9c9 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
4739ccdb AP |
9 | |
10 | # ==================================================================== | |
e3713c36 RS |
11 | # Written by David S. Miller and Andy Polyakov. |
12 | # The module is licensed under 2-clause BSD | |
4739ccdb AP |
13 | # license. October 2012. All rights reserved. |
14 | # ==================================================================== | |
15 | ||
16 | ###################################################################### | |
17 | # Camellia for SPARC T4. | |
18 | # | |
19 | # As with AES below results [for aligned data] are virtually identical | |
46f4e1be | 20 | # to critical path lengths for 3-cycle instruction latency: |
4739ccdb AP |
21 | # |
22 | # 128-bit key 192/256- | |
23 | # CBC encrypt 4.14/4.21(*) 5.46/5.52 | |
24 | # (*) numbers after slash are for | |
25 | # misaligned data. | |
26 | # | |
27 | # As with Intel AES-NI, question is if it's possible to improve | |
46f4e1be | 28 | # performance of parallelizable modes by interleaving round |
4739ccdb AP |
29 | # instructions. In Camellia every instruction is dependent on |
30 | # previous, which means that there is place for 2 additional ones | |
31 | # in between two dependent. Can we expect 3x performance improvement? | |
32 | # At least one can argue that it should be possible to break 2x | |
33 | # barrier... For some reason not even 2x appears to be possible: | |
34 | # | |
35 | # 128-bit key 192/256- | |
36 | # CBC decrypt 2.21/2.74 2.99/3.40 | |
37 | # CTR 2.15/2.68(*) 2.93/3.34 | |
38 | # (*) numbers after slash are for | |
39 | # misaligned data. | |
40 | # | |
41 | # This is for 2x interleave. But compared to 1x interleave CBC decrypt | |
42 | # improved by ... 0% for 128-bit key, and 11% for 192/256-bit one. | |
43 | # So that out-of-order execution logic can take non-interleaved code | |
44 | # to 1.87x, but can't take 2x interleaved one any further. There | |
45 | # surely is some explanation... As a result, 3x interleave was not even | |
46 | # attempted. Instead an effort was made to share specific modes | |
47 | # implementations with AES module (therefore sparcv9_modes.pl). | |
48 | # | |
49 | # To anchor to something else, software C implementation processes | |
50 | # one byte in 38 cycles with 128-bit key on same processor. | |
51 | ||
# Derive the directory this script lives in from $0 and add it, together
# with the sibling perlasm directory, to the module search path so that the
# mode-implementation helper file can be loaded from any working directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# The last command-line argument, if present, names the output file; with no
# argument the generated assembly goes to the inherited STDOUT.  The
# three-argument open keeps the file name from being parsed for mode
# characters, and the explicit check makes a failed open fatal instead of
# silently writing the module to the wrong stream (the script already checks
# the matching close of STDOUT at the end).
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
4739ccdb AP |
57 | |
58 | $::evp=1; # if $evp is set to 0, script generates module with | |
59 | # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt | |
60 | # entry points. These are fully compatible with openssl/camellia.h. | |
61 | ||
62 | ###################################################################### | |
63 | # single-round subroutines | |
64 | # | |
65 | { | |
66 | my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); | |
67 | ||
68 | $code=<<___; | |
52f7e44e TM |
69 | #ifndef __ASSEMBLER__ |
70 | # define __ASSEMBLER__ 1 | |
71 | #endif | |
72 | #include "crypto/sparc_arch.h" | |
eb77e888 | 73 | |
4739ccdb AP |
74 | .text |
75 | ||
76 | .globl cmll_t4_encrypt | |
77 | .align 32 | |
78 | cmll_t4_encrypt: | |
79 | andcc $inp, 7, %g1 ! is input aligned? | |
80 | andn $inp, 7, $inp | |
81 | ||
82 | ldx [$key + 0], %g4 | |
83 | ldx [$key + 8], %g5 | |
84 | ||
85 | ldx [$inp + 0], %o4 | |
86 | bz,pt %icc, 1f | |
87 | ldx [$inp + 8], %o5 | |
88 | ldx [$inp + 16], $inp | |
89 | sll %g1, 3, %g1 | |
90 | sub %g0, %g1, %o3 | |
91 | sllx %o4, %g1, %o4 | |
92 | sllx %o5, %g1, %g1 | |
93 | srlx %o5, %o3, %o5 | |
94 | srlx $inp, %o3, %o3 | |
95 | or %o5, %o4, %o4 | |
96 | or %o3, %g1, %o5 | |
97 | 1: | |
98 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
99 | ldd [$key + 16], %f12 | |
100 | ldd [$key + 24], %f14 | |
101 | xor %g4, %o4, %o4 | |
102 | xor %g5, %o5, %o5 | |
103 | ldd [$key + 32], %f16 | |
104 | ldd [$key + 40], %f18 | |
105 | movxtod %o4, %f0 | |
106 | movxtod %o5, %f2 | |
107 | ldd [$key + 48], %f20 | |
108 | ldd [$key + 56], %f22 | |
109 | sub $rounds, 1, $rounds | |
110 | ldd [$key + 64], %f24 | |
111 | ldd [$key + 72], %f26 | |
112 | add $key, 80, $key | |
113 | ||
114 | .Lenc: | |
115 | camellia_f %f12, %f2, %f0, %f2 | |
116 | ldd [$key + 0], %f12 | |
117 | sub $rounds,1,$rounds | |
118 | camellia_f %f14, %f0, %f2, %f0 | |
119 | ldd [$key + 8], %f14 | |
120 | camellia_f %f16, %f2, %f0, %f2 | |
121 | ldd [$key + 16], %f16 | |
122 | camellia_f %f18, %f0, %f2, %f0 | |
123 | ldd [$key + 24], %f18 | |
124 | camellia_f %f20, %f2, %f0, %f2 | |
125 | ldd [$key + 32], %f20 | |
126 | camellia_f %f22, %f0, %f2, %f0 | |
127 | ldd [$key + 40], %f22 | |
128 | camellia_fl %f24, %f0, %f0 | |
129 | ldd [$key + 48], %f24 | |
130 | camellia_fli %f26, %f2, %f2 | |
131 | ldd [$key + 56], %f26 | |
132 | brnz,pt $rounds, .Lenc | |
133 | add $key, 64, $key | |
134 | ||
135 | andcc $out, 7, $tmp ! is output aligned? | |
136 | camellia_f %f12, %f2, %f0, %f2 | |
137 | camellia_f %f14, %f0, %f2, %f0 | |
138 | camellia_f %f16, %f2, %f0, %f2 | |
139 | camellia_f %f18, %f0, %f2, %f0 | |
140 | camellia_f %f20, %f2, %f0, %f4 | |
141 | camellia_f %f22, %f0, %f4, %f2 | |
142 | fxor %f24, %f4, %f0 | |
143 | fxor %f26, %f2, %f2 | |
144 | ||
145 | bnz,pn %icc, 2f | |
146 | nop | |
147 | ||
148 | std %f0, [$out + 0] | |
149 | retl | |
150 | std %f2, [$out + 8] | |
151 | ||
152 | 2: alignaddrl $out, %g0, $out | |
153 | mov 0xff, $mask | |
154 | srl $mask, $tmp, $mask | |
155 | ||
156 | faligndata %f0, %f0, %f4 | |
157 | faligndata %f0, %f2, %f6 | |
158 | faligndata %f2, %f2, %f8 | |
159 | ||
160 | stda %f4, [$out + $mask]0xc0 ! partial store | |
161 | std %f6, [$out + 8] | |
162 | add $out, 16, $out | |
163 | orn %g0, $mask, $mask | |
164 | retl | |
165 | stda %f8, [$out + $mask]0xc0 ! partial store | |
166 | .type cmll_t4_encrypt,#function | |
167 | .size cmll_t4_encrypt,.-cmll_t4_encrypt | |
168 | ||
169 | .globl cmll_t4_decrypt | |
170 | .align 32 | |
171 | cmll_t4_decrypt: | |
172 | ld [$key + 272], $rounds ! grandRounds, 3 or 4 | |
173 | andcc $inp, 7, %g1 ! is input aligned? | |
174 | andn $inp, 7, $inp | |
175 | ||
176 | sll $rounds, 6, $rounds | |
177 | add $rounds, $key, $key | |
178 | ||
179 | ldx [$inp + 0], %o4 | |
180 | bz,pt %icc, 1f | |
181 | ldx [$inp + 8], %o5 | |
182 | ldx [$inp + 16], $inp | |
183 | sll %g1, 3, %g1 | |
184 | sub %g0, %g1, %g4 | |
185 | sllx %o4, %g1, %o4 | |
186 | sllx %o5, %g1, %g1 | |
187 | srlx %o5, %g4, %o5 | |
188 | srlx $inp, %g4, %g4 | |
189 | or %o5, %o4, %o4 | |
190 | or %g4, %g1, %o5 | |
191 | 1: | |
192 | ldx [$key + 0], %g4 | |
193 | ldx [$key + 8], %g5 | |
194 | ldd [$key - 8], %f12 | |
195 | ldd [$key - 16], %f14 | |
196 | xor %g4, %o4, %o4 | |
197 | xor %g5, %o5, %o5 | |
198 | ldd [$key - 24], %f16 | |
199 | ldd [$key - 32], %f18 | |
200 | movxtod %o4, %f0 | |
201 | movxtod %o5, %f2 | |
202 | ldd [$key - 40], %f20 | |
203 | ldd [$key - 48], %f22 | |
204 | sub $rounds, 64, $rounds | |
205 | ldd [$key - 56], %f24 | |
206 | ldd [$key - 64], %f26 | |
207 | sub $key, 64, $key | |
208 | ||
209 | .Ldec: | |
210 | camellia_f %f12, %f2, %f0, %f2 | |
211 | ldd [$key - 8], %f12 | |
212 | sub $rounds, 64, $rounds | |
213 | camellia_f %f14, %f0, %f2, %f0 | |
214 | ldd [$key - 16], %f14 | |
215 | camellia_f %f16, %f2, %f0, %f2 | |
216 | ldd [$key - 24], %f16 | |
217 | camellia_f %f18, %f0, %f2, %f0 | |
218 | ldd [$key - 32], %f18 | |
219 | camellia_f %f20, %f2, %f0, %f2 | |
220 | ldd [$key - 40], %f20 | |
221 | camellia_f %f22, %f0, %f2, %f0 | |
222 | ldd [$key - 48], %f22 | |
223 | camellia_fl %f24, %f0, %f0 | |
224 | ldd [$key - 56], %f24 | |
225 | camellia_fli %f26, %f2, %f2 | |
226 | ldd [$key - 64], %f26 | |
227 | brnz,pt $rounds, .Ldec | |
228 | sub $key, 64, $key | |
229 | ||
230 | andcc $out, 7, $tmp ! is output aligned? | |
231 | camellia_f %f12, %f2, %f0, %f2 | |
232 | camellia_f %f14, %f0, %f2, %f0 | |
233 | camellia_f %f16, %f2, %f0, %f2 | |
234 | camellia_f %f18, %f0, %f2, %f0 | |
235 | camellia_f %f20, %f2, %f0, %f4 | |
236 | camellia_f %f22, %f0, %f4, %f2 | |
237 | fxor %f26, %f4, %f0 | |
238 | fxor %f24, %f2, %f2 | |
239 | ||
240 | bnz,pn %icc, 2f | |
241 | nop | |
242 | ||
243 | std %f0, [$out + 0] | |
244 | retl | |
245 | std %f2, [$out + 8] | |
246 | ||
247 | 2: alignaddrl $out, %g0, $out | |
248 | mov 0xff, $mask | |
249 | srl $mask, $tmp, $mask | |
250 | ||
251 | faligndata %f0, %f0, %f4 | |
252 | faligndata %f0, %f2, %f6 | |
253 | faligndata %f2, %f2, %f8 | |
254 | ||
255 | stda %f4, [$out + $mask]0xc0 ! partial store | |
256 | std %f6, [$out + 8] | |
257 | add $out, 16, $out | |
258 | orn %g0, $mask, $mask | |
259 | retl | |
260 | stda %f8, [$out + $mask]0xc0 ! partial store | |
261 | .type cmll_t4_decrypt,#function | |
262 | .size cmll_t4_decrypt,.-cmll_t4_decrypt | |
263 | ___ | |
264 | } | |
265 | ||
266 | ###################################################################### | |
267 | # key setup subroutines | |
268 | # | |
269 | { | |
# Return the six-instruction SPARC sequence that rotates the 128-bit value
# held in the register pair %o4:%o5 left by $rot bits, using %g4/%g5 as
# scratch.  The instructions are joined with "\n\t" and the result carries no
# trailing newline.  The complementary shift count is emitted as the literal
# text "64-$rot", i.e. it is left for the assembler to evaluate.
sub ROTL128 {
    my ($rot) = @_;
    my $inv = "64-$rot";    # shift count for the bits carried across halves

    my @insns = (
        "srlx %o4, $inv, %g4",    # bits leaving the top of %o4
        "sllx %o4, $rot, %o4",
        "srlx %o5, $inv, %g5",    # bits leaving the top of %o5
        "sllx %o5, $rot, %o5",
        "or %o4, %g5, %o4",       # %o4 takes the bits shifted out of %o5
        "or %o5, %g4, %o5",       # %o5 takes the bits shifted out of %o4
    );

    join("\n\t", @insns);
}
280 | ||
281 | my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); | |
282 | $code.=<<___; | |
283 | .globl cmll_t4_set_key | |
284 | .align 32 | |
285 | cmll_t4_set_key: | |
286 | and $inp, 7, $tmp | |
287 | alignaddr $inp, %g0, $inp | |
288 | cmp $bits, 192 | |
289 | ldd [$inp + 0], %f0 | |
290 | bl,pt %icc,.L128 | |
291 | ldd [$inp + 8], %f2 | |
292 | ||
293 | be,pt %icc,.L192 | |
294 | ldd [$inp + 16], %f4 | |
295 | ||
296 | brz,pt $tmp, .L256aligned | |
297 | ldd [$inp + 24], %f6 | |
298 | ||
299 | ldd [$inp + 32], %f8 | |
300 | faligndata %f0, %f2, %f0 | |
301 | faligndata %f2, %f4, %f2 | |
302 | faligndata %f4, %f6, %f4 | |
303 | b .L256aligned | |
304 | faligndata %f6, %f8, %f6 | |
305 | ||
306 | .align 16 | |
307 | .L192: | |
308 | brz,a,pt $tmp, .L256aligned | |
309 | fnot2 %f4, %f6 | |
310 | ||
311 | ldd [$inp + 24], %f6 | |
312 | nop | |
313 | faligndata %f0, %f2, %f0 | |
314 | faligndata %f2, %f4, %f2 | |
315 | faligndata %f4, %f6, %f4 | |
316 | fnot2 %f4, %f6 | |
317 | ||
318 | .L256aligned: | |
319 | std %f0, [$out + 0] ! k[0, 1] | |
320 | fsrc2 %f0, %f28 | |
321 | std %f2, [$out + 8] ! k[2, 3] | |
322 | fsrc2 %f2, %f30 | |
323 | fxor %f4, %f0, %f0 | |
324 | b .L128key | |
325 | fxor %f6, %f2, %f2 | |
326 | ||
327 | .align 16 | |
328 | .L128: | |
329 | brz,pt $tmp, .L128aligned | |
330 | nop | |
331 | ||
332 | ldd [$inp + 16], %f4 | |
333 | nop | |
334 | faligndata %f0, %f2, %f0 | |
335 | faligndata %f2, %f4, %f2 | |
336 | ||
337 | .L128aligned: | |
338 | std %f0, [$out + 0] ! k[0, 1] | |
339 | fsrc2 %f0, %f28 | |
340 | std %f2, [$out + 8] ! k[2, 3] | |
341 | fsrc2 %f2, %f30 | |
342 | ||
343 | .L128key: | |
344 | mov %o7, %o5 | |
345 | 1: call .+8 | |
346 | add %o7, SIGMA-1b, %o4 | |
347 | mov %o5, %o7 | |
348 | ||
349 | ldd [%o4 + 0], %f16 | |
350 | ldd [%o4 + 8], %f18 | |
351 | ldd [%o4 + 16], %f20 | |
352 | ldd [%o4 + 24], %f22 | |
353 | ||
354 | camellia_f %f16, %f2, %f0, %f2 | |
355 | camellia_f %f18, %f0, %f2, %f0 | |
356 | fxor %f28, %f0, %f0 | |
357 | fxor %f30, %f2, %f2 | |
358 | camellia_f %f20, %f2, %f0, %f2 | |
359 | camellia_f %f22, %f0, %f2, %f0 | |
360 | ||
361 | bge,pn %icc, .L256key | |
362 | nop | |
363 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
364 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
365 | ||
366 | movdtox %f0, %o4 | |
367 | movdtox %f2, %o5 | |
368 | `&ROTL128(15)` | |
369 | stx %o4, [$out + 0x30] ! k[12, 13] | |
370 | stx %o5, [$out + 0x38] ! k[14, 15] | |
371 | `&ROTL128(15)` | |
372 | stx %o4, [$out + 0x40] ! k[16, 17] | |
373 | stx %o5, [$out + 0x48] ! k[18, 19] | |
374 | `&ROTL128(15)` | |
375 | stx %o4, [$out + 0x60] ! k[24, 25] | |
376 | `&ROTL128(15)` | |
377 | stx %o4, [$out + 0x70] ! k[28, 29] | |
378 | stx %o5, [$out + 0x78] ! k[30, 31] | |
379 | `&ROTL128(34)` | |
380 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
381 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
382 | `&ROTL128(17)` | |
383 | stx %o4, [$out + 0xc0] ! k[48, 49] | |
384 | stx %o5, [$out + 0xc8] ! k[50, 51] | |
385 | ||
386 | movdtox %f28, %o4 ! k[ 0, 1] | |
387 | movdtox %f30, %o5 ! k[ 2, 3] | |
388 | `&ROTL128(15)` | |
389 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
390 | stx %o5, [$out + 0x28] ! k[10, 11] | |
391 | `&ROTL128(30)` | |
392 | stx %o4, [$out + 0x50] ! k[20, 21] | |
393 | stx %o5, [$out + 0x58] ! k[22, 23] | |
394 | `&ROTL128(15)` | |
395 | stx %o5, [$out + 0x68] ! k[26, 27] | |
396 | `&ROTL128(17)` | |
397 | stx %o4, [$out + 0x80] ! k[32, 33] | |
398 | stx %o5, [$out + 0x88] ! k[34, 35] | |
399 | `&ROTL128(17)` | |
400 | stx %o4, [$out + 0x90] ! k[36, 37] | |
401 | stx %o5, [$out + 0x98] ! k[38, 39] | |
402 | `&ROTL128(17)` | |
403 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
404 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
405 | ||
406 | mov 3, $tmp | |
407 | st $tmp, [$out + 0x110] | |
408 | retl | |
409 | xor %o0, %o0, %o0 | |
410 | ||
411 | .align 16 | |
412 | .L256key: | |
413 | ldd [%o4 + 32], %f24 | |
414 | ldd [%o4 + 40], %f26 | |
415 | ||
416 | std %f0, [$out + 0x30] ! k[12, 13] | |
417 | std %f2, [$out + 0x38] ! k[14, 15] | |
418 | ||
419 | fxor %f4, %f0, %f0 | |
420 | fxor %f6, %f2, %f2 | |
421 | camellia_f %f24, %f2, %f0, %f2 | |
422 | camellia_f %f26, %f0, %f2, %f0 | |
423 | ||
424 | std %f0, [$out + 0x10] ! k[ 4, 5] | |
425 | std %f2, [$out + 0x18] ! k[ 6, 7] | |
426 | ||
427 | movdtox %f0, %o4 | |
428 | movdtox %f2, %o5 | |
429 | `&ROTL128(30)` | |
430 | stx %o4, [$out + 0x50] ! k[20, 21] | |
431 | stx %o5, [$out + 0x58] ! k[22, 23] | |
432 | `&ROTL128(30)` | |
433 | stx %o4, [$out + 0xa0] ! k[40, 41] | |
434 | stx %o5, [$out + 0xa8] ! k[42, 43] | |
435 | `&ROTL128(51)` | |
436 | stx %o4, [$out + 0x100] ! k[64, 65] | |
437 | stx %o5, [$out + 0x108] ! k[66, 67] | |
438 | ||
439 | movdtox %f4, %o4 ! k[ 8, 9] | |
440 | movdtox %f6, %o5 ! k[10, 11] | |
441 | `&ROTL128(15)` | |
442 | stx %o4, [$out + 0x20] ! k[ 8, 9] | |
443 | stx %o5, [$out + 0x28] ! k[10, 11] | |
444 | `&ROTL128(15)` | |
445 | stx %o4, [$out + 0x40] ! k[16, 17] | |
446 | stx %o5, [$out + 0x48] ! k[18, 19] | |
447 | `&ROTL128(30)` | |
448 | stx %o4, [$out + 0x90] ! k[36, 37] | |
449 | stx %o5, [$out + 0x98] ! k[38, 39] | |
450 | `&ROTL128(34)` | |
451 | stx %o4, [$out + 0xd0] ! k[52, 53] | |
452 | stx %o5, [$out + 0xd8] ! k[54, 55] | |
453 | ldx [$out + 0x30], %o4 ! k[12, 13] | |
454 | ldx [$out + 0x38], %o5 ! k[14, 15] | |
455 | `&ROTL128(15)` | |
456 | stx %o4, [$out + 0x30] ! k[12, 13] | |
457 | stx %o5, [$out + 0x38] ! k[14, 15] | |
458 | `&ROTL128(30)` | |
459 | stx %o4, [$out + 0x70] ! k[28, 29] | |
460 | stx %o5, [$out + 0x78] ! k[30, 31] | |
461 | srlx %o4, 32, %g4 | |
462 | srlx %o5, 32, %g5 | |
463 | st %o4, [$out + 0xc0] ! k[48] | |
464 | st %g5, [$out + 0xc4] ! k[49] | |
465 | st %o5, [$out + 0xc8] ! k[50] | |
466 | st %g4, [$out + 0xcc] ! k[51] | |
467 | `&ROTL128(49)` | |
468 | stx %o4, [$out + 0xe0] ! k[56, 57] | |
469 | stx %o5, [$out + 0xe8] ! k[58, 59] | |
470 | ||
471 | movdtox %f28, %o4 ! k[ 0, 1] | |
472 | movdtox %f30, %o5 ! k[ 2, 3] | |
473 | `&ROTL128(45)` | |
474 | stx %o4, [$out + 0x60] ! k[24, 25] | |
475 | stx %o5, [$out + 0x68] ! k[26, 27] | |
476 | `&ROTL128(15)` | |
477 | stx %o4, [$out + 0x80] ! k[32, 33] | |
478 | stx %o5, [$out + 0x88] ! k[34, 35] | |
479 | `&ROTL128(17)` | |
480 | stx %o4, [$out + 0xb0] ! k[44, 45] | |
481 | stx %o5, [$out + 0xb8] ! k[46, 47] | |
482 | `&ROTL128(34)` | |
483 | stx %o4, [$out + 0xf0] ! k[60, 61] | |
484 | stx %o5, [$out + 0xf8] ! k[62, 63] | |
485 | ||
486 | mov 4, $tmp | |
487 | st $tmp, [$out + 0x110] | |
488 | retl | |
489 | xor %o0, %o0, %o0 | |
490 | .type cmll_t4_set_key,#function | |
491 | .size cmll_t4_set_key,.-cmll_t4_set_key | |
492 | .align 32 | |
493 | SIGMA: | |
494 | .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2 | |
495 | .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c | |
496 | .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd | |
497 | .type SIGMA,#object | |
498 | .size SIGMA,.-SIGMA | |
499 | .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov" | |
500 | ___ | |
501 | } | |
502 | ||
503 | {{{ | |
504 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); | |
505 | my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); | |
506 | ||
507 | $code.=<<___; | |
508 | .align 32 | |
509 | _cmll128_load_enckey: | |
510 | ldx [$key + 0], %g4 | |
511 | ldx [$key + 8], %g5 | |
512 | ___ | |
513 | for ($i=2; $i<26;$i++) { # load key schedule | |
514 | $code.=<<___; | |
515 | ldd [$key + `8*$i`], %f`12+2*$i` | |
516 | ___ | |
517 | } | |
518 | $code.=<<___; | |
519 | retl | |
520 | nop | |
521 | .type _cmll128_load_enckey,#function | |
522 | .size _cmll128_load_enckey,.-_cmll128_load_enckey | |
523 | _cmll256_load_enckey=_cmll128_load_enckey | |
524 | ||
525 | .align 32 | |
526 | _cmll256_load_deckey: | |
527 | ldd [$key + 64], %f62 | |
528 | ldd [$key + 72], %f60 | |
529 | b .Load_deckey | |
530 | add $key, 64, $key | |
531 | _cmll128_load_deckey: | |
532 | ldd [$key + 0], %f60 | |
533 | ldd [$key + 8], %f62 | |
534 | .Load_deckey: | |
535 | ___ | |
536 | for ($i=2; $i<24;$i++) { # load key schedule | |
537 | $code.=<<___; | |
538 | ldd [$key + `8*$i`], %f`62-2*$i` | |
539 | ___ | |
540 | } | |
541 | $code.=<<___; | |
542 | ldx [$key + 192], %g4 | |
543 | retl | |
544 | ldx [$key + 200], %g5 | |
545 | .type _cmll256_load_deckey,#function | |
546 | .size _cmll256_load_deckey,.-_cmll256_load_deckey | |
547 | ||
548 | .align 32 | |
549 | _cmll128_encrypt_1x: | |
550 | ___ | |
551 | for ($i=0; $i<3; $i++) { | |
552 | $code.=<<___; | |
553 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
554 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
555 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
556 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
557 | ___ | |
558 | $code.=<<___ if ($i<2); | |
559 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
560 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
561 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
562 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
563 | ___ | |
564 | } | |
565 | $code.=<<___; | |
566 | camellia_f %f56, %f2, %f0, %f4 | |
567 | camellia_f %f58, %f0, %f4, %f2 | |
568 | fxor %f60, %f4, %f0 | |
569 | retl | |
570 | fxor %f62, %f2, %f2 | |
571 | .type _cmll128_encrypt_1x,#function | |
572 | .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x | |
573 | _cmll128_decrypt_1x=_cmll128_encrypt_1x | |
574 | ||
575 | .align 32 | |
576 | _cmll128_encrypt_2x: | |
577 | ___ | |
578 | for ($i=0; $i<3; $i++) { | |
579 | $code.=<<___; | |
580 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
581 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
582 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
583 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
584 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
585 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
586 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
587 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
588 | ___ | |
589 | $code.=<<___ if ($i<2); | |
590 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
591 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
592 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
593 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
594 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
595 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
596 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
597 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
598 | ___ | |
599 | } | |
600 | $code.=<<___; | |
601 | camellia_f %f56, %f2, %f0, %f8 | |
602 | camellia_f %f56, %f6, %f4, %f10 | |
603 | camellia_f %f58, %f0, %f8, %f2 | |
604 | camellia_f %f58, %f4, %f10, %f6 | |
605 | fxor %f60, %f8, %f0 | |
606 | fxor %f60, %f10, %f4 | |
607 | fxor %f62, %f2, %f2 | |
608 | retl | |
609 | fxor %f62, %f6, %f6 | |
610 | .type _cmll128_encrypt_2x,#function | |
611 | .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x | |
612 | _cmll128_decrypt_2x=_cmll128_encrypt_2x | |
613 | ||
614 | .align 32 | |
615 | _cmll256_encrypt_1x: | |
616 | camellia_f %f16, %f2, %f0, %f2 | |
617 | camellia_f %f18, %f0, %f2, %f0 | |
618 | ldd [$key + 208], %f16 | |
619 | ldd [$key + 216], %f18 | |
620 | camellia_f %f20, %f2, %f0, %f2 | |
621 | camellia_f %f22, %f0, %f2, %f0 | |
622 | ldd [$key + 224], %f20 | |
623 | ldd [$key + 232], %f22 | |
624 | camellia_f %f24, %f2, %f0, %f2 | |
625 | camellia_f %f26, %f0, %f2, %f0 | |
626 | ldd [$key + 240], %f24 | |
627 | ldd [$key + 248], %f26 | |
628 | camellia_fl %f28, %f0, %f0 | |
629 | camellia_fli %f30, %f2, %f2 | |
630 | ldd [$key + 256], %f28 | |
631 | ldd [$key + 264], %f30 | |
632 | ___ | |
633 | for ($i=1; $i<3; $i++) { | |
634 | $code.=<<___; | |
635 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
636 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
637 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
638 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
639 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
640 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
641 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
642 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
643 | ___ | |
644 | } | |
645 | $code.=<<___; | |
646 | camellia_f %f16, %f2, %f0, %f2 | |
647 | camellia_f %f18, %f0, %f2, %f0 | |
648 | ldd [$key + 16], %f16 | |
649 | ldd [$key + 24], %f18 | |
650 | camellia_f %f20, %f2, %f0, %f2 | |
651 | camellia_f %f22, %f0, %f2, %f0 | |
652 | ldd [$key + 32], %f20 | |
653 | ldd [$key + 40], %f22 | |
654 | camellia_f %f24, %f2, %f0, %f4 | |
655 | camellia_f %f26, %f0, %f4, %f2 | |
656 | ldd [$key + 48], %f24 | |
657 | ldd [$key + 56], %f26 | |
658 | fxor %f28, %f4, %f0 | |
659 | fxor %f30, %f2, %f2 | |
660 | ldd [$key + 64], %f28 | |
661 | retl | |
662 | ldd [$key + 72], %f30 | |
663 | .type _cmll256_encrypt_1x,#function | |
664 | .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x | |
665 | ||
666 | .align 32 | |
667 | _cmll256_encrypt_2x: | |
668 | camellia_f %f16, %f2, %f0, %f2 | |
669 | camellia_f %f16, %f6, %f4, %f6 | |
670 | camellia_f %f18, %f0, %f2, %f0 | |
671 | camellia_f %f18, %f4, %f6, %f4 | |
672 | ldd [$key + 208], %f16 | |
673 | ldd [$key + 216], %f18 | |
674 | camellia_f %f20, %f2, %f0, %f2 | |
675 | camellia_f %f20, %f6, %f4, %f6 | |
676 | camellia_f %f22, %f0, %f2, %f0 | |
677 | camellia_f %f22, %f4, %f6, %f4 | |
678 | ldd [$key + 224], %f20 | |
679 | ldd [$key + 232], %f22 | |
680 | camellia_f %f24, %f2, %f0, %f2 | |
681 | camellia_f %f24, %f6, %f4, %f6 | |
682 | camellia_f %f26, %f0, %f2, %f0 | |
683 | camellia_f %f26, %f4, %f6, %f4 | |
684 | ldd [$key + 240], %f24 | |
685 | ldd [$key + 248], %f26 | |
686 | camellia_fl %f28, %f0, %f0 | |
687 | camellia_fl %f28, %f4, %f4 | |
688 | camellia_fli %f30, %f2, %f2 | |
689 | camellia_fli %f30, %f6, %f6 | |
690 | ldd [$key + 256], %f28 | |
691 | ldd [$key + 264], %f30 | |
692 | ___ | |
693 | for ($i=1; $i<3; $i++) { | |
694 | $code.=<<___; | |
695 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
696 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
697 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
698 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
699 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
700 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
701 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
702 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
703 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
704 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
705 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
706 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
707 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
708 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
709 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
710 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
711 | ___ | |
712 | } | |
713 | $code.=<<___; | |
714 | camellia_f %f16, %f2, %f0, %f2 | |
715 | camellia_f %f16, %f6, %f4, %f6 | |
716 | camellia_f %f18, %f0, %f2, %f0 | |
717 | camellia_f %f18, %f4, %f6, %f4 | |
718 | ldd [$key + 16], %f16 | |
719 | ldd [$key + 24], %f18 | |
720 | camellia_f %f20, %f2, %f0, %f2 | |
721 | camellia_f %f20, %f6, %f4, %f6 | |
722 | camellia_f %f22, %f0, %f2, %f0 | |
723 | camellia_f %f22, %f4, %f6, %f4 | |
724 | ldd [$key + 32], %f20 | |
725 | ldd [$key + 40], %f22 | |
726 | camellia_f %f24, %f2, %f0, %f8 | |
727 | camellia_f %f24, %f6, %f4, %f10 | |
728 | camellia_f %f26, %f0, %f8, %f2 | |
729 | camellia_f %f26, %f4, %f10, %f6 | |
730 | ldd [$key + 48], %f24 | |
731 | ldd [$key + 56], %f26 | |
732 | fxor %f28, %f8, %f0 | |
733 | fxor %f28, %f10, %f4 | |
734 | fxor %f30, %f2, %f2 | |
735 | fxor %f30, %f6, %f6 | |
736 | ldd [$key + 64], %f28 | |
737 | retl | |
738 | ldd [$key + 72], %f30 | |
739 | .type _cmll256_encrypt_2x,#function | |
740 | .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x | |
741 | ||
742 | .align 32 | |
743 | _cmll256_decrypt_1x: | |
744 | camellia_f %f16, %f2, %f0, %f2 | |
745 | camellia_f %f18, %f0, %f2, %f0 | |
746 | ldd [$key - 8], %f16 | |
747 | ldd [$key - 16], %f18 | |
748 | camellia_f %f20, %f2, %f0, %f2 | |
749 | camellia_f %f22, %f0, %f2, %f0 | |
750 | ldd [$key - 24], %f20 | |
751 | ldd [$key - 32], %f22 | |
752 | camellia_f %f24, %f2, %f0, %f2 | |
753 | camellia_f %f26, %f0, %f2, %f0 | |
754 | ldd [$key - 40], %f24 | |
755 | ldd [$key - 48], %f26 | |
756 | camellia_fl %f28, %f0, %f0 | |
757 | camellia_fli %f30, %f2, %f2 | |
758 | ldd [$key - 56], %f28 | |
759 | ldd [$key - 64], %f30 | |
760 | ___ | |
761 | for ($i=1; $i<3; $i++) { | |
762 | $code.=<<___; | |
763 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
764 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
765 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
766 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
767 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
768 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
769 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
770 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
771 | ___ | |
772 | } | |
773 | $code.=<<___; | |
774 | camellia_f %f16, %f2, %f0, %f2 | |
775 | camellia_f %f18, %f0, %f2, %f0 | |
776 | ldd [$key + 184], %f16 | |
777 | ldd [$key + 176], %f18 | |
778 | camellia_f %f20, %f2, %f0, %f2 | |
779 | camellia_f %f22, %f0, %f2, %f0 | |
780 | ldd [$key + 168], %f20 | |
781 | ldd [$key + 160], %f22 | |
782 | camellia_f %f24, %f2, %f0, %f4 | |
783 | camellia_f %f26, %f0, %f4, %f2 | |
784 | ldd [$key + 152], %f24 | |
785 | ldd [$key + 144], %f26 | |
786 | fxor %f30, %f4, %f0 | |
787 | fxor %f28, %f2, %f2 | |
788 | ldd [$key + 136], %f28 | |
789 | retl | |
790 | ldd [$key + 128], %f30 | |
791 | .type _cmll256_decrypt_1x,#function | |
792 | .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x | |
793 | ||
794 | .align 32 | |
795 | _cmll256_decrypt_2x: | |
796 | camellia_f %f16, %f2, %f0, %f2 | |
797 | camellia_f %f16, %f6, %f4, %f6 | |
798 | camellia_f %f18, %f0, %f2, %f0 | |
799 | camellia_f %f18, %f4, %f6, %f4 | |
800 | ldd [$key - 8], %f16 | |
801 | ldd [$key - 16], %f18 | |
802 | camellia_f %f20, %f2, %f0, %f2 | |
803 | camellia_f %f20, %f6, %f4, %f6 | |
804 | camellia_f %f22, %f0, %f2, %f0 | |
805 | camellia_f %f22, %f4, %f6, %f4 | |
806 | ldd [$key - 24], %f20 | |
807 | ldd [$key - 32], %f22 | |
808 | camellia_f %f24, %f2, %f0, %f2 | |
809 | camellia_f %f24, %f6, %f4, %f6 | |
810 | camellia_f %f26, %f0, %f2, %f0 | |
811 | camellia_f %f26, %f4, %f6, %f4 | |
812 | ldd [$key - 40], %f24 | |
813 | ldd [$key - 48], %f26 | |
814 | camellia_fl %f28, %f0, %f0 | |
815 | camellia_fl %f28, %f4, %f4 | |
816 | camellia_fli %f30, %f2, %f2 | |
817 | camellia_fli %f30, %f6, %f6 | |
818 | ldd [$key - 56], %f28 | |
819 | ldd [$key - 64], %f30 | |
820 | ___ | |
821 | for ($i=1; $i<3; $i++) { | |
822 | $code.=<<___; | |
823 | camellia_f %f`16+16*$i+0`, %f2, %f0, %f2 | |
824 | camellia_f %f`16+16*$i+0`, %f6, %f4, %f6 | |
825 | camellia_f %f`16+16*$i+2`, %f0, %f2, %f0 | |
826 | camellia_f %f`16+16*$i+2`, %f4, %f6, %f4 | |
827 | camellia_f %f`16+16*$i+4`, %f2, %f0, %f2 | |
828 | camellia_f %f`16+16*$i+4`, %f6, %f4, %f6 | |
829 | camellia_f %f`16+16*$i+6`, %f0, %f2, %f0 | |
830 | camellia_f %f`16+16*$i+6`, %f4, %f6, %f4 | |
831 | camellia_f %f`16+16*$i+8`, %f2, %f0, %f2 | |
832 | camellia_f %f`16+16*$i+8`, %f6, %f4, %f6 | |
833 | camellia_f %f`16+16*$i+10`, %f0, %f2, %f0 | |
834 | camellia_f %f`16+16*$i+10`, %f4, %f6, %f4 | |
835 | camellia_fl %f`16+16*$i+12`, %f0, %f0 | |
836 | camellia_fl %f`16+16*$i+12`, %f4, %f4 | |
837 | camellia_fli %f`16+16*$i+14`, %f2, %f2 | |
838 | camellia_fli %f`16+16*$i+14`, %f6, %f6 | |
839 | ___ | |
840 | } | |
841 | $code.=<<___; | |
842 | camellia_f %f16, %f2, %f0, %f2 | |
843 | camellia_f %f16, %f6, %f4, %f6 | |
844 | camellia_f %f18, %f0, %f2, %f0 | |
845 | camellia_f %f18, %f4, %f6, %f4 | |
846 | ldd [$key + 184], %f16 | |
847 | ldd [$key + 176], %f18 | |
848 | camellia_f %f20, %f2, %f0, %f2 | |
849 | camellia_f %f20, %f6, %f4, %f6 | |
850 | camellia_f %f22, %f0, %f2, %f0 | |
851 | camellia_f %f22, %f4, %f6, %f4 | |
852 | ldd [$key + 168], %f20 | |
853 | ldd [$key + 160], %f22 | |
854 | camellia_f %f24, %f2, %f0, %f8 | |
855 | camellia_f %f24, %f6, %f4, %f10 | |
856 | camellia_f %f26, %f0, %f8, %f2 | |
857 | camellia_f %f26, %f4, %f10, %f6 | |
858 | ldd [$key + 152], %f24 | |
859 | ldd [$key + 144], %f26 | |
860 | fxor %f30, %f8, %f0 | |
861 | fxor %f30, %f10, %f4 | |
862 | fxor %f28, %f2, %f2 | |
863 | fxor %f28, %f6, %f6 | |
864 | ldd [$key + 136], %f28 | |
865 | retl | |
866 | ldd [$key + 128], %f30 | |
867 | .type _cmll256_decrypt_2x,#function | |
868 | .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x | |
869 | ___ | |
870 | ||
871 | &alg_cbc_encrypt_implement("cmll",128); | |
872 | &alg_cbc_encrypt_implement("cmll",256); | |
873 | ||
874 | &alg_cbc_decrypt_implement("cmll",128); | |
875 | &alg_cbc_decrypt_implement("cmll",256); | |
876 | ||
877 | if ($::evp) { | |
878 | &alg_ctr32_implement("cmll",128); | |
879 | &alg_ctr32_implement("cmll",256); | |
880 | } | |
881 | }}} | |
882 | ||
883 | if (!$::evp) { | |
884 | $code.=<<___; | |
885 | .global Camellia_encrypt | |
886 | Camellia_encrypt=cmll_t4_encrypt | |
887 | .global Camellia_decrypt | |
888 | Camellia_decrypt=cmll_t4_decrypt | |
889 | .global Camellia_set_key | |
890 | .align 32 | |
891 | Camellia_set_key: | |
892 | andcc %o2, 7, %g0 ! double-check alignment | |
893 | bnz,a,pn %icc, 1f | |
894 | mov -1, %o0 | |
895 | brz,a,pn %o0, 1f | |
896 | mov -1, %o0 | |
897 | brz,a,pn %o2, 1f | |
898 | mov -1, %o0 | |
899 | andncc %o1, 0x1c0, %g0 | |
900 | bnz,a,pn %icc, 1f | |
901 | mov -2, %o0 | |
902 | cmp %o1, 128 | |
903 | bl,a,pn %icc, 1f | |
904 | mov -2, %o0 | |
905 | b cmll_t4_set_key | |
906 | nop | |
907 | 1: retl | |
908 | nop | |
909 | .type Camellia_set_key,#function | |
910 | .size Camellia_set_key,.-Camellia_set_key | |
911 | ___ | |
912 | ||
913 | my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); | |
914 | ||
915 | $code.=<<___; | |
916 | .globl Camellia_cbc_encrypt | |
917 | .align 32 | |
918 | Camellia_cbc_encrypt: | |
919 | ld [$key + 272], %g1 | |
920 | nop | |
921 | brz $enc, .Lcbc_decrypt | |
922 | cmp %g1, 3 | |
923 | ||
924 | be,pt %icc, cmll128_t4_cbc_encrypt | |
925 | nop | |
926 | ba cmll256_t4_cbc_encrypt | |
927 | nop | |
928 | ||
929 | .Lcbc_decrypt: | |
930 | be,pt %icc, cmll128_t4_cbc_decrypt | |
931 | nop | |
932 | ba cmll256_t4_cbc_decrypt | |
933 | nop | |
934 | .type Camellia_cbc_encrypt,#function | |
935 | .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt | |
936 | ___ | |
937 | } | |
938 | ||
939 | &emit_assembler(); | |
940 | ||
a21314db | 941 | close STDOUT or die "error closing STDOUT: $!"; |