]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/camellia/asm/cmll-x86_64.pl
36fd7cbfd4eedddb83ea98b34c57e489f90411e2
[thirdparty/openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 # AMD64 Core2 EM64T
25 # -evp camellia-128-ecb 16.7 21.0 22.7
26 # + over gcc 3.4.6 +25% +5% 0%
27 #
28 # camellia-128-cbc 15.7 20.4 21.1
29 #
30 # 128-bit key setup 128 216 205 cycles/key
31 # + over gcc 3.4.6 +54% +39% +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output.  $flavour is handed to x86_64-xlate.pl
# unchanged; when the only argument contains a dot it is taken to be the
# output file name and $flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 output is selected either by flavour or by a ".asm" output name;
# it changes the first-argument register and enables the SEH section below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail immediately if the pipe cannot be started instead of discovering it
# only when STDOUT is closed.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): rewrite a %e?x/%r?x register name (a-d) to its high-byte alias,
# e.g. "%eax" -> "%ah".  Other names are returned unchanged.
# The empty prototype was dropped because the sub takes one argument, and
# the replacement text uses ${1} rather than the sed-style \1.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }

# lo(REG): rewrite a register name to its low-byte alias:
#   %eax/%rax..%edx/%rdx -> %al..%dl
#   %esi/%edi/%rsi/%rdi  -> %sil/%dil
#   %rN or %rNd          -> %rNb
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}
57
# Register allocation shared by every code generator in this file.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# round temporaries
@S=map { "%r${_}d" } (8..11);				# four 32-bit state words
($i0,$i1)=("%esi","%edi");				# table indices
$Tbl="%rbp";	# size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
# 32-bit register holding the first argument: %ecx for Win64, %edi otherwise.
if ($win64)	{ $arg0d="%ecx"; }
else		{ $arg0d="%edi"; }

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte displacements from .LCamellia_SBOX; entries are
# addressed with a scale of 8, hence the +4 for the interleaved partner.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
76
# Camellia_Feistel(ROUND [,SEED]): append one Feistel round to $code.
#
#   ROUND  round index; its parity selects which half of @S is the input
#          (even: @S[0],@S[1]; odd: @S[2],@S[3]) and which half is updated.
#   SEED   byte displacement added to the prefetch address of the next
#          round key; defaults to 0.  A negative SEED also flips the key
#          stride from +8 to -8 bytes per round (used for decryption).
#
# On entry $t0/$t1 are expected to hold the current round key; on exit they
# hold the prefetched key for round ROUND+1.  The `...` spans are left in
# the text and evaluated when $code is finally printed.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } (0..3)];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
114
# First chunk of generated assembly; this statement (re)initialises $code.
# It emits:
#  * Camellia_EncryptBlock (V1.x API): converts keyBitLength in the first
#    argument register into a grandRounds count (3, plus the borrow from
#    128-keyBitLength, i.e. 4 for longer keys) and jumps into the V2 body.
#  * Camellia_EncryptBlock_Rounds (V2): saves %rbx,%rbp,%r13-%r15, sets
#    $keyend = keyTable + grandRounds*64, loads the 16-byte input as four
#    byte-swapped 32-bit words into @S, calls _x86_64_Camellia_encrypt and
#    stores the byte-swapped result to the output pointer.
#  * the head of _x86_64_Camellia_encrypt: the initial XOR with key words
#    0-3 and the .Leloop label with the first round-key prefetch.
# The heredoc deliberately stops inside _x86_64_Camellia_encrypt; the round
# bodies are appended by the statements that follow it.
115 # void Camellia_EncryptBlock_Rounds(
116 # int grandRounds,
117 # const Byte plaintext[],
118 # const KEY_TABLE_TYPE keyTable,
119 # Byte ciphertext[])
120 $code=<<___;
121 .text
122
123 # V1.x API
124 .globl Camellia_EncryptBlock
125 .type Camellia_EncryptBlock,\@abi-omnipotent
126 .align 16
127 Camellia_EncryptBlock:
128 .cfi_startproc
129 movl \$128,%eax
130 subl $arg0d,%eax
131 movl \$3,$arg0d
132 adcl \$0,$arg0d # keyBitLength==128?3:4
133 jmp .Lenc_rounds
134 .cfi_endproc
135 .size Camellia_EncryptBlock,.-Camellia_EncryptBlock
136 # V2
137 .globl Camellia_EncryptBlock_Rounds
138 .type Camellia_EncryptBlock_Rounds,\@function,4
139 .align 16
140 .Lenc_rounds:
141 Camellia_EncryptBlock_Rounds:
142 .cfi_startproc
143 push %rbx
144 .cfi_push %rbx
145 push %rbp
146 .cfi_push %rbp
147 push %r13
148 .cfi_push %r13
149 push %r14
150 .cfi_push %r14
151 push %r15
152 .cfi_push %r15
153 .Lenc_prologue:
154
155 #mov %rsi,$inp # put away arguments
156 mov %rcx,$out
157 mov %rdx,$key
158
159 shl \$6,%edi # process grandRounds
160 lea .LCamellia_SBOX(%rip),$Tbl
161 lea ($key,%rdi),$keyend
162
163 mov 0(%rsi),@S[0] # load plaintext
164 mov 4(%rsi),@S[1]
165 mov 8(%rsi),@S[2]
166 bswap @S[0]
167 mov 12(%rsi),@S[3]
168 bswap @S[1]
169 bswap @S[2]
170 bswap @S[3]
171
172 call _x86_64_Camellia_encrypt
173
174 bswap @S[0]
175 bswap @S[1]
176 bswap @S[2]
177 mov @S[0],0($out)
178 bswap @S[3]
179 mov @S[1],4($out)
180 mov @S[2],8($out)
181 mov @S[3],12($out)
182
183 mov 0(%rsp),%r15
184 .cfi_restore %r15
185 mov 8(%rsp),%r14
186 .cfi_restore %r14
187 mov 16(%rsp),%r13
188 .cfi_restore %r13
189 mov 24(%rsp),%rbp
190 .cfi_restore %rbp
191 mov 32(%rsp),%rbx
192 .cfi_restore %rbx
193 lea 40(%rsp),%rsp
194 .cfi_adjust_cfa_offset -40
195 .Lenc_epilogue:
196 ret
197 .cfi_endproc
198 .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
199
200 .type _x86_64_Camellia_encrypt,\@abi-omnipotent
201 .align 16
202 _x86_64_Camellia_encrypt:
203 .cfi_startproc
204 xor 0($key),@S[1]
205 xor 4($key),@S[0] # ^=key[0-3]
206 xor 8($key),@S[3]
207 xor 12($key),@S[2]
208 .align 16
209 .Leloop:
210 mov 16($key),$t1 # prefetch key[4-5]
211 mov 20($key),$t0
212
213 ___
# Body of the .Leloop iteration: six Feistel rounds.  The seed of 16 makes
# round i prefetch the next round key from 16+(i+1)*8($key).
214 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Remainder of _x86_64_Camellia_encrypt: advance $key by 16*4 bytes and
# compare with $keyend; if not finished, apply the and/rol/xor and or/xor
# mixing step described by the inline comments and loop again.  .Ledone
# XORs the last prefetched key words into the state with the halves swapped.
# The same heredoc then emits the decrypt entry points, which mirror the
# encrypt ones except that $keyend is set to keyTable and $key to
# keyTable + grandRounds*64, and it stops inside _x86_64_Camellia_decrypt
# right after the first prefetch at .Ldloop.
215 $code.=<<___;
216 lea 16*4($key),$key
217 cmp $keyend,$key
218 mov 8($key),$t3 # prefetch key[2-3]
219 mov 12($key),$t2
220 je .Ledone
221
222 and @S[0],$t0
223 or @S[3],$t3
224 rol \$1,$t0
225 xor $t3,@S[2] # s2^=s3|key[3];
226 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
227 and @S[2],$t2
228 or @S[1],$t1
229 rol \$1,$t2
230 xor $t1,@S[0] # s0^=s1|key[1];
231 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
232 jmp .Leloop
233
234 .align 16
235 .Ledone:
236 xor @S[2],$t0 # SwapHalf
237 xor @S[3],$t1
238 xor @S[0],$t2
239 xor @S[1],$t3
240
241 mov $t0,@S[0]
242 mov $t1,@S[1]
243 mov $t2,@S[2]
244 mov $t3,@S[3]
245
246 .byte 0xf3,0xc3 # rep ret
247 .cfi_endproc
248 .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
249
250 # V1.x API
251 .globl Camellia_DecryptBlock
252 .type Camellia_DecryptBlock,\@abi-omnipotent
253 .align 16
254 Camellia_DecryptBlock:
255 .cfi_startproc
256 movl \$128,%eax
257 subl $arg0d,%eax
258 movl \$3,$arg0d
259 adcl \$0,$arg0d # keyBitLength==128?3:4
260 jmp .Ldec_rounds
261 .cfi_endproc
262 .size Camellia_DecryptBlock,.-Camellia_DecryptBlock
263 # V2
264 .globl Camellia_DecryptBlock_Rounds
265 .type Camellia_DecryptBlock_Rounds,\@function,4
266 .align 16
267 .Ldec_rounds:
268 Camellia_DecryptBlock_Rounds:
269 .cfi_startproc
270 push %rbx
271 .cfi_push %rbx
272 push %rbp
273 .cfi_push %rbp
274 push %r13
275 .cfi_push %r13
276 push %r14
277 .cfi_push %r14
278 push %r15
279 .cfi_push %r15
280 .Ldec_prologue:
281
282 #mov %rsi,$inp # put away arguments
283 mov %rcx,$out
284 mov %rdx,$keyend
285
286 shl \$6,%edi # process grandRounds
287 lea .LCamellia_SBOX(%rip),$Tbl
288 lea ($keyend,%rdi),$key
289
290 mov 0(%rsi),@S[0] # load plaintext
291 mov 4(%rsi),@S[1]
292 mov 8(%rsi),@S[2]
293 bswap @S[0]
294 mov 12(%rsi),@S[3]
295 bswap @S[1]
296 bswap @S[2]
297 bswap @S[3]
298
299 call _x86_64_Camellia_decrypt
300
301 bswap @S[0]
302 bswap @S[1]
303 bswap @S[2]
304 mov @S[0],0($out)
305 bswap @S[3]
306 mov @S[1],4($out)
307 mov @S[2],8($out)
308 mov @S[3],12($out)
309
310 mov 0(%rsp),%r15
311 .cfi_restore %r15
312 mov 8(%rsp),%r14
313 .cfi_restore %r14
314 mov 16(%rsp),%r13
315 .cfi_restore %r13
316 mov 24(%rsp),%rbp
317 .cfi_restore %rbp
318 mov 32(%rsp),%rbx
319 .cfi_restore %rbx
320 lea 40(%rsp),%rsp
321 .cfi_adjust_cfa_offset -40
322 .Ldec_epilogue:
323 ret
324 .cfi_endproc
325 .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
326
327 .type _x86_64_Camellia_decrypt,\@abi-omnipotent
328 .align 16
329 _x86_64_Camellia_decrypt:
330 .cfi_startproc
331 xor 0($key),@S[1]
332 xor 4($key),@S[0] # ^=key[0-3]
333 xor 8($key),@S[3]
334 xor 12($key),@S[2]
335 .align 16
336 .Ldloop:
337 mov -8($key),$t1 # prefetch key[4-5]
338 mov -4($key),$t0
339
340 ___
# Body of the .Ldloop iteration: six Feistel rounds.  The negative seed
# makes Camellia_Feistel use a stride of -8, so round i prefetches the next
# round key from -8+(i+1)*-8($key), i.e. the key schedule is walked
# backwards.
341 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Remainder of _x86_64_Camellia_decrypt: step $key back by 16*4 bytes and
# compare with $keyend; if not finished, apply the same and/rol/xor and
# or/xor mixing step as the encrypt loop and iterate.  .Lddone XORs the last
# prefetched key words into the state and swaps the halves.
342 $code.=<<___;
343 lea -16*4($key),$key
344 cmp $keyend,$key
345 mov 0($key),$t3 # prefetch key[2-3]
346 mov 4($key),$t2
347 je .Lddone
348
349 and @S[0],$t0
350 or @S[3],$t3
351 rol \$1,$t0
352 xor $t3,@S[2] # s2^=s3|key[3];
353 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
354 and @S[2],$t2
355 or @S[1],$t1
356 rol \$1,$t2
357 xor $t1,@S[0] # s0^=s1|key[1];
358 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
359
360 jmp .Ldloop
361
362 .align 16
363 .Lddone:
364 xor @S[2],$t2
365 xor @S[3],$t3
366 xor @S[0],$t0
367 xor @S[1],$t1
368
369 mov $t2,@S[0] # SwapHalf
370 mov $t3,@S[1]
371 mov $t0,@S[2]
372 mov $t1,@S[3]
373
374 .byte 0xf3,0xc3 # rep ret
375 .cfi_endproc
376 .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
377 ___
378
# _saveround(RND, KEY, [BIAS,] REG...): append stores of REG... to the key
# table slot at BIAS+RND*8(KEY).
#
#   RND   slot index, in units of 8 bytes
#   KEY   base register of the key table
#   BIAS  optional integer byte displacement (default 0)
#   REG   four 32-bit registers: stored pairwise swapped at +0,+4,+8,+12;
#         one or two registers: stored at +0 and +8
#
# BIAS is recognised by being an integer literal rather than by being
# "true", so an explicit bias of 0 is no longer mistaken for a register.
# The displacement arithmetic is left inside `...` for the final eval pass.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/) ? shift(@T) : 0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}
395
# _loadround(RND, KEY, [BIAS,] REG [,REG2]): append loads of the key table
# slot at BIAS+RND*8(KEY) into REG and, when given, of the following
# 8 bytes into REG2.
#
# BIAS is an optional integer byte displacement (default 0).  It is
# recognised by being an integer literal rather than by being "true", so an
# explicit bias of 0 is no longer mistaken for a register name.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/) ? shift(@T) : 0;

	$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
	$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}
403
# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
#
# __rotl128(HI, LO, COUNT): append a 128-bit left rotate of the register
# pair HI:LO by COUNT bits using shld, with %r11 as scratch.  Nothing is
# emitted when COUNT is 0.
sub __rotl128 {
    my ($hi,$lo,$count)=@_;

    return unless ($count);

    $code.=join("", map { "\t$_\n" }
	"mov\t$hi,%r11",
	"shld\t\$$count,$lo,$hi",
	"shld\t\$$count,%r11,$lo");
}
418
# ... Implementing 128-bit rotate without shld gives 80% better
# performance EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128(HI, LO, COUNT): append a 128-bit left rotate of the register
# pair HI:LO by COUNT bits built from shl/shr/or, with %r9 and %r11 as
# scratch.  The complementary shift count is left as `64-COUNT` for the
# final eval pass.  Nothing is emitted when COUNT is 0.
sub _rotl128 {
    my ($hi,$lo,$count)=@_;

    return unless ($count);

    $code.=join("", map { "\t$_\n" }
	"mov\t$hi,%r11",
	"shl\t\$$count,$hi",
	"mov\t$lo,%r9",
	"shr\t\$`64-$count`,%r9",
	"shr\t\$`64-$count`,%r11",
	"or\t%r9,$hi",
	"shl\t\$$count,$lo",
	"or\t%r11,$lo");
}
438
# Generator for Camellia_Ekeygen, declared to the translator as a
# three-argument function.  The emitted code reads keyBitLength from %edi,
# the raw key from (%rsi) and writes the key table at (%rdx); it returns 3
# in %eax for 128-bit keys and 4 otherwise.
{ my $round=0;	# Feistel round counter, advanced by every round emitted

# Emit a run of "rotate a 128-bit register pair, then store it" steps.
# Each step is [ \@pair, rotate count, key table slot, registers to store ];
# all stores use $out as base with a bias of -128.
my $emit_schedule=sub {
    foreach my $stage (@_) {
	my ($pair,$rot,$slot,@regs)=@{$stage};
	_rotl128(@{$pair},$rot);
	_saveround($slot,$out,-128,@regs);
    }
};

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
_saveround(0,$out,@S);			# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
_saveround(4,$out,@S);			# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
Camellia_Feistel($round++) for (1..2);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
Camellia_Feistel($round++) for (1..2);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
    {	# 128-bit keys: KL lives in %rax:%rbx, KA in %r8:%r10
	my @KL=("%rax","%rbx");
	my @KA=("%r8","%r10");

	_loadround(0,$out,-128,@KL);		# KL
	_saveround(2,$out,-128,@KA);		# KA<<<0
	$emit_schedule->(
	    [\@KL,15, 4,@KL],			# KL<<<15
	    [\@KA,15, 6,@KA],			# KA<<<15
	    [\@KA,15, 8,@KA],			# 15+15=30, KA<<<30
	    [\@KL,30,10,@KL],			# 15+30=45, KL<<<45
	    [\@KA,15,12,"%r8"],			# 30+15=45, KA<<<45 (only %r8 stored)
	    [\@KL,15,13,"%rbx"],		# 45+15=60, KL<<<60 (only %rbx stored)
	    [\@KA,15,14,@KA],			# 45+15=60, KA<<<60
	    [\@KL,17,16,@KL],			# 60+17=77, KL<<<77
	    [\@KL,17,18,@KL],			# 77+17=94, KL<<<94
	    [\@KA,34,20,@KA],			# 60+34=94, KA<<<94
	    [\@KL,17,22,@KL],			# 94+17=111, KL<<<111
	    [\@KA,17,24,@KA],			# 94+17=111, KA<<<111
	);
    }
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
_saveround(6,$out,@S);			# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
Camellia_Feistel($round++) for (1..2);
    {	# 192/256-bit keys: four 128-bit quantities are kept in registers
	my @KL=("%rax","%rbx");
	my @KR=("%rcx","%rdx");
	my @KA=("%r14","%r15");
	my @KB=("%r8","%r10");

	_loadround(0,$out,@KL);			# KL
	_loadround(4,$out,@KR);			# KR
	_loadround(6,$out,@KA);			# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	_saveround(2,$out,-128,@KB);		# KB<<<0
	$emit_schedule->(
	    [\@KR,15, 4,@KR],			# KR<<<15
	    [\@KA,15, 6,@KA],			# KA<<<15
	    [\@KR,15, 8,@KR],			# 15+15=30, KR<<<30
	    [\@KB,30,10,@KB],			# KB<<<30
	    [\@KL,45,12,@KL],			# KL<<<45
	    [\@KA,30,14,@KA],			# 15+30=45, KA<<<45
	    [\@KL,15,16,@KL],			# 45+15=60, KL<<<60
	    [\@KR,30,18,@KR],			# 30+30=60, KR<<<60
	    [\@KB,30,20,@KB],			# 30+30=60, KB<<<60
	    [\@KL,17,22,@KL],			# 60+17=77, KL<<<77
	    [\@KA,32,24,@KA],			# 45+32=77, KA<<<77
	    [\@KR,34,26,@KR],			# 60+34=94, KR<<<94
	    [\@KA,17,28,@KA],			# 77+17=94, KA<<<94
	    [\@KL,34,30,@KL],			# 77+34=111, KL<<<111
	    [\@KB,51,32,@KB],			# 60+51=111, KB<<<111
	);
    }
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
633
# The 256-entry byte table from which all four word tables are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each generator below takes a table index 0..255 and returns the 32-bit
# table word as a "0x%08x" string.  Single elements of @SBOX are read with
# $SBOX[...] rather than the one-element slice @SBOX[...].

# S1110: v=SBOX[i], replicated into the three upper bytes.
sub S1110 {
    my $i=shift;
    my $v=$SBOX[$i];
    sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}

# S4404: v=SBOX[i rotated left by one bit], placed in bytes 3, 2 and 0.
sub S4404 {
    my $i=shift;
    my $v=$SBOX[($i<<1|$i>>7)&0xff];
    sprintf("0x%08x",$v<<24|$v<<16|$v);
}

# S0222: v=SBOX[i] rotated left by one bit, replicated into the three
# lower bytes.
sub S0222 {
    my $i=shift;
    my $v=$SBOX[$i];
    $v=($v<<1|$v>>7)&0xff;
    sprintf("0x%08x",$v<<16|$v<<8|$v);
}

# S3033: v=SBOX[i] rotated right by one bit, placed in bytes 3, 1 and 0.
sub S3033 {
    my $i=shift;
    my $v=$SBOX[$i];
    $v=($v>>1|$v<<7)&0xff;
    sprintf("0x%08x",$v<<24|$v<<8|$v);
}
656
# Constant data: the SIGMA words read by the key schedule, padded with one
# row of zeros, immediately followed by the .LCamellia_SBOX label.
$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(WORD...): append one ".long" directive listing the given words.
sub data_word {
    my @words=@_;
    $code.=sprintf(".long\t%s\n",join(',',@words));
}
# First 256 rows pair SBOX1_1110 with SBOX4_4404, the next 256 rows pair
# SBOX2_0222 with SBOX3_3033.
foreach my $idx (0..255) { data_word(S1110($idx),S4404($idx)); }
foreach my $idx (0..255) { data_word(S0222($idx),S3033($idx)); }
670
# Generator for Camellia_cbc_encrypt, declared to the translator as a
# six-argument function.  The emitted code returns immediately when the
# length is zero; otherwise it builds an aligned stack frame, touches the
# whole S-box table once, and runs either the encrypt loop (.Lcbc_eloop) or
# the decrypt loop (.Lcbc_dloop) depending on the enc argument.  A trailing
# partial block is handled through the 16-byte $ivec scratch buffer with
# "rep movsb".
671 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
672 # size_t length, const CAMELLIA_KEY *key,
673 # unsigned char *ivp,const int enc);
674 {
# Stack frame slots, interpolated into the heredoc as memory operands.
# saved key schedule pointer, reloaded after every block ("rewind")
675 $_key="0(%rsp)";
676 $_end="8(%rsp)"; # inp+len&~15
677 $_res="16(%rsp)"; # len&15
# 16-byte scratch block at 24..39(%rsp)
678 $ivec="24(%rsp)";
# caller's ivp argument, used to write the IV back
679 $_ivp="40(%rsp)";
# stack pointer as it was right after the register pushes
680 $_rsp="48(%rsp)";
681
682 $code.=<<___;
683 .globl Camellia_cbc_encrypt
684 .type Camellia_cbc_encrypt,\@function,6
685 .align 16
686 Camellia_cbc_encrypt:
687 .cfi_startproc
688 cmp \$0,%rdx
689 je .Lcbc_abort
690 push %rbx
691 .cfi_push %rbx
692 push %rbp
693 .cfi_push %rbp
694 push %r12
695 .cfi_push %r12
696 push %r13
697 .cfi_push %r13
698 push %r14
699 .cfi_push %r14
700 push %r15
701 .cfi_push %r15
702 .Lcbc_prologue:
703
704 mov %rsp,%rbp
705 .cfi_def_cfa_register %rbp
706 sub \$64,%rsp
707 and \$-64,%rsp
708
709 # place stack frame just "above mod 1024" the key schedule,
710 # this ensures that cache associativity suffices
711 lea -64-63(%rcx),%r10
712 sub %rsp,%r10
713 neg %r10
714 and \$0x3C0,%r10
715 sub %r10,%rsp
716 #add \$8,%rsp # 8 is reserved for callee's ra
717
718 mov %rdi,$inp # inp argument
719 mov %rsi,$out # out argument
720 mov %r8,%rbx # ivp argument
721 mov %rcx,$key # key argument
722 mov 272(%rcx),${keyend}d # grandRounds
723
724 mov %r8,$_ivp
725 mov %rbp,$_rsp
726 .cfi_cfa_expression $_rsp,deref,+56
727
728 .Lcbc_body:
729 lea .LCamellia_SBOX(%rip),$Tbl
730
731 mov \$32,%ecx
732 .align 4
733 .Lcbc_prefetch_sbox:
734 mov 0($Tbl),%rax
735 mov 32($Tbl),%rsi
736 mov 64($Tbl),%rdi
737 mov 96($Tbl),%r11
738 lea 128($Tbl),$Tbl
739 loop .Lcbc_prefetch_sbox
740 sub \$4096,$Tbl
741 shl \$6,$keyend
742 mov %rdx,%rcx # len argument
743 lea ($key,$keyend),$keyend
744
745 cmp \$0,%r9d # enc argument
746 je .LCBC_DECRYPT
747
748 and \$-16,%rdx
749 and \$15,%rcx # length residue
750 lea ($inp,%rdx),%rdx
751 mov $key,$_key
752 mov %rdx,$_end
753 mov %rcx,$_res
754
755 cmp $inp,%rdx
756 mov 0(%rbx),@S[0] # load IV
757 mov 4(%rbx),@S[1]
758 mov 8(%rbx),@S[2]
759 mov 12(%rbx),@S[3]
760 je .Lcbc_enc_tail
761 jmp .Lcbc_eloop
762
763 .align 16
764 .Lcbc_eloop:
765 xor 0($inp),@S[0]
766 xor 4($inp),@S[1]
767 xor 8($inp),@S[2]
768 bswap @S[0]
769 xor 12($inp),@S[3]
770 bswap @S[1]
771 bswap @S[2]
772 bswap @S[3]
773
774 call _x86_64_Camellia_encrypt
775
776 mov $_key,$key # "rewind" the key
777 bswap @S[0]
778 mov $_end,%rdx
779 bswap @S[1]
780 mov $_res,%rcx
781 bswap @S[2]
782 mov @S[0],0($out)
783 bswap @S[3]
784 mov @S[1],4($out)
785 mov @S[2],8($out)
786 lea 16($inp),$inp
787 mov @S[3],12($out)
788 cmp %rdx,$inp
789 lea 16($out),$out
790 jne .Lcbc_eloop
791
792 cmp \$0,%rcx
793 jne .Lcbc_enc_tail
794
795 mov $_ivp,$out
796 mov @S[0],0($out) # write out IV residue
797 mov @S[1],4($out)
798 mov @S[2],8($out)
799 mov @S[3],12($out)
800 jmp .Lcbc_done
801
802 .align 16
803 .Lcbc_enc_tail:
804 xor %rax,%rax
805 mov %rax,0+$ivec
806 mov %rax,8+$ivec
807 mov %rax,$_res
808
809 .Lcbc_enc_pushf:
810 pushfq
811 cld
812 mov $inp,%rsi
813 lea 8+$ivec,%rdi
814 .long 0x9066A4F3 # rep movsb
815 popfq
816 .Lcbc_enc_popf:
817
818 lea $ivec,$inp
819 lea 16+$ivec,%rax
820 mov %rax,$_end
821 jmp .Lcbc_eloop # one more time
822
823 .align 16
824 .LCBC_DECRYPT:
825 xchg $key,$keyend
826 add \$15,%rdx
827 and \$15,%rcx # length residue
828 and \$-16,%rdx
829 mov $key,$_key
830 lea ($inp,%rdx),%rdx
831 mov %rdx,$_end
832 mov %rcx,$_res
833
834 mov (%rbx),%rax # load IV
835 mov 8(%rbx),%rbx
836 jmp .Lcbc_dloop
837 .align 16
838 .Lcbc_dloop:
839 mov 0($inp),@S[0]
840 mov 4($inp),@S[1]
841 mov 8($inp),@S[2]
842 bswap @S[0]
843 mov 12($inp),@S[3]
844 bswap @S[1]
845 mov %rax,0+$ivec # save IV to temporary storage
846 bswap @S[2]
847 mov %rbx,8+$ivec
848 bswap @S[3]
849
850 call _x86_64_Camellia_decrypt
851
852 mov $_key,$key # "rewind" the key
853 mov $_end,%rdx
854 mov $_res,%rcx
855
856 bswap @S[0]
857 mov ($inp),%rax # load IV for next iteration
858 bswap @S[1]
859 mov 8($inp),%rbx
860 bswap @S[2]
861 xor 0+$ivec,@S[0]
862 bswap @S[3]
863 xor 4+$ivec,@S[1]
864 xor 8+$ivec,@S[2]
865 lea 16($inp),$inp
866 xor 12+$ivec,@S[3]
867 cmp %rdx,$inp
868 je .Lcbc_ddone
869
870 mov @S[0],0($out)
871 mov @S[1],4($out)
872 mov @S[2],8($out)
873 mov @S[3],12($out)
874
875 lea 16($out),$out
876 jmp .Lcbc_dloop
877
878 .align 16
879 .Lcbc_ddone:
880 mov $_ivp,%rdx
881 cmp \$0,%rcx
882 jne .Lcbc_dec_tail
883
884 mov @S[0],0($out)
885 mov @S[1],4($out)
886 mov @S[2],8($out)
887 mov @S[3],12($out)
888
889 mov %rax,(%rdx) # write out IV residue
890 mov %rbx,8(%rdx)
891 jmp .Lcbc_done
892 .align 16
893 .Lcbc_dec_tail:
894 mov @S[0],0+$ivec
895 mov @S[1],4+$ivec
896 mov @S[2],8+$ivec
897 mov @S[3],12+$ivec
898
899 .Lcbc_dec_pushf:
900 pushfq
901 cld
902 lea 8+$ivec,%rsi
903 lea ($out),%rdi
904 .long 0x9066A4F3 # rep movsb
905 popfq
906 .Lcbc_dec_popf:
907
908 mov %rax,(%rdx) # write out IV residue
909 mov %rbx,8(%rdx)
910 jmp .Lcbc_done
911
912 .align 16
913 .Lcbc_done:
914 mov $_rsp,%rcx
915 .cfi_def_cfa %rcx,56
916 mov 0(%rcx),%r15
917 .cfi_restore %r15
918 mov 8(%rcx),%r14
919 .cfi_restore %r14
920 mov 16(%rcx),%r13
921 .cfi_restore %r13
922 mov 24(%rcx),%r12
923 .cfi_restore %r12
924 mov 32(%rcx),%rbp
925 .cfi_restore %rbp
926 mov 40(%rcx),%rbx
927 .cfi_restore %rbx
928 lea 48(%rcx),%rsp
929 .cfi_def_cfa %rsp,8
930 .Lcbc_abort:
931 ret
932 .cfi_endproc
933 .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
934
935 .asciz "Camellia for x86_64 by <appro\@openssl.org>"
936 ___
937 }
938
# Win64-only section: structured exception handling support, emitted only
# when $win64 is set.  It appends
#  * common_se_handler, parameterised through HandlerData[] with a
#    prologue/epilogue label pair; between those labels it restores
#    %rbx,%rbp,%r13-%r15 into the CONTEXT from the five pushed slots,
#  * cbc_se_handler, which does the same for Camellia_cbc_encrypt's frame
#    (six saved registers reached through the saved-%rsp slot, plus a
#    correction while the flags are pushed around "rep movsb"),
#  * the shared tail .Lcommon_seh_exit, which copies the CONTEXT, calls
#    RtlVirtualUnwind and returns ExceptionContinueSearch,
#  * the .pdata/.xdata tables binding the handlers to the four public
#    functions.
# Note that common_se_handler has no return of its own: it jumps to
# .Lcommon_seh_exit, which is located inside cbc_se_handler.
939 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
940 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
941 if ($win64) {
# Registers carrying the four handler arguments listed above.
942 $rec="%rcx";
943 $frame="%rdx";
944 $context="%r8";
945 $disp="%r9";
946
947 $code.=<<___;
948 .extern __imp_RtlVirtualUnwind
949 .type common_se_handler,\@abi-omnipotent
950 .align 16
951 common_se_handler:
952 push %rsi
953 push %rdi
954 push %rbx
955 push %rbp
956 push %r12
957 push %r13
958 push %r14
959 push %r15
960 pushfq
961 lea -64(%rsp),%rsp
962
963 mov 120($context),%rax # pull context->Rax
964 mov 248($context),%rbx # pull context->Rip
965
966 mov 8($disp),%rsi # disp->ImageBase
967 mov 56($disp),%r11 # disp->HandlerData
968
969 mov 0(%r11),%r10d # HandlerData[0]
970 lea (%rsi,%r10),%r10 # prologue label
971 cmp %r10,%rbx # context->Rip<prologue label
972 jb .Lin_prologue
973
974 mov 152($context),%rax # pull context->Rsp
975
976 mov 4(%r11),%r10d # HandlerData[1]
977 lea (%rsi,%r10),%r10 # epilogue label
978 cmp %r10,%rbx # context->Rip>=epilogue label
979 jae .Lin_prologue
980
981 lea 40(%rax),%rax
982 mov -8(%rax),%rbx
983 mov -16(%rax),%rbp
984 mov -24(%rax),%r13
985 mov -32(%rax),%r14
986 mov -40(%rax),%r15
987 mov %rbx,144($context) # restore context->Rbx
988 mov %rbp,160($context) # restore context->Rbp
989 mov %r13,224($context) # restore context->R13
990 mov %r14,232($context) # restore context->R14
991 mov %r15,240($context) # restore context->R15
992
993 .Lin_prologue:
994 mov 8(%rax),%rdi
995 mov 16(%rax),%rsi
996 mov %rax,152($context) # restore context->Rsp
997 mov %rsi,168($context) # restore context->Rsi
998 mov %rdi,176($context) # restore context->Rdi
999
1000 jmp .Lcommon_seh_exit
1001 .size common_se_handler,.-common_se_handler
1002
1003 .type cbc_se_handler,\@abi-omnipotent
1004 .align 16
1005 cbc_se_handler:
1006 push %rsi
1007 push %rdi
1008 push %rbx
1009 push %rbp
1010 push %r12
1011 push %r13
1012 push %r14
1013 push %r15
1014 pushfq
1015 lea -64(%rsp),%rsp
1016
1017 mov 120($context),%rax # pull context->Rax
1018 mov 248($context),%rbx # pull context->Rip
1019
1020 lea .Lcbc_prologue(%rip),%r10
1021 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
1022 jb .Lin_cbc_prologue
1023
1024 lea .Lcbc_body(%rip),%r10
1025 cmp %r10,%rbx # context->Rip<.Lcbc_body
1026 jb .Lin_cbc_frame_setup
1027
1028 mov 152($context),%rax # pull context->Rsp
1029
1030 lea .Lcbc_abort(%rip),%r10
1031 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
1032 jae .Lin_cbc_prologue
1033
1034 # handle pushf/popf in Camellia_cbc_encrypt
1035 lea .Lcbc_enc_pushf(%rip),%r10
1036 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
1037 jbe .Lin_cbc_no_flag
1038 lea 8(%rax),%rax
1039 lea .Lcbc_enc_popf(%rip),%r10
1040 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
1041 jb .Lin_cbc_no_flag
1042 lea -8(%rax),%rax
1043 lea .Lcbc_dec_pushf(%rip),%r10
1044 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
1045 jbe .Lin_cbc_no_flag
1046 lea 8(%rax),%rax
1047 lea .Lcbc_dec_popf(%rip),%r10
1048 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
1049 jb .Lin_cbc_no_flag
1050 lea -8(%rax),%rax
1051
1052 .Lin_cbc_no_flag:
1053 mov 48(%rax),%rax # $_rsp
1054 lea 48(%rax),%rax
1055
1056 .Lin_cbc_frame_setup:
1057 mov -8(%rax),%rbx
1058 mov -16(%rax),%rbp
1059 mov -24(%rax),%r12
1060 mov -32(%rax),%r13
1061 mov -40(%rax),%r14
1062 mov -48(%rax),%r15
1063 mov %rbx,144($context) # restore context->Rbx
1064 mov %rbp,160($context) # restore context->Rbp
1065 mov %r12,216($context) # restore context->R12
1066 mov %r13,224($context) # restore context->R13
1067 mov %r14,232($context) # restore context->R14
1068 mov %r15,240($context) # restore context->R15
1069
1070 .Lin_cbc_prologue:
1071 mov 8(%rax),%rdi
1072 mov 16(%rax),%rsi
1073 mov %rax,152($context) # restore context->Rsp
1074 mov %rsi,168($context) # restore context->Rsi
1075 mov %rdi,176($context) # restore context->Rdi
1076
1077 .align 4
1078 .Lcommon_seh_exit:
1079
1080 mov 40($disp),%rdi # disp->ContextRecord
1081 mov $context,%rsi # context
1082 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1083 .long 0xa548f3fc # cld; rep movsq
1084
1085 mov $disp,%rsi
1086 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1087 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1088 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1089 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1090 mov 40(%rsi),%r10 # disp->ContextRecord
1091 lea 56(%rsi),%r11 # &disp->HandlerData
1092 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1093 mov %r10,32(%rsp) # arg5
1094 mov %r11,40(%rsp) # arg6
1095 mov %r12,48(%rsp) # arg7
1096 mov %rcx,56(%rsp) # arg8, (NULL)
1097 call *__imp_RtlVirtualUnwind(%rip)
1098
1099 mov \$1,%eax # ExceptionContinueSearch
1100 lea 64(%rsp),%rsp
1101 popfq
1102 pop %r15
1103 pop %r14
1104 pop %r13
1105 pop %r12
1106 pop %rbp
1107 pop %rbx
1108 pop %rdi
1109 pop %rsi
1110 ret
1111 .size cbc_se_handler,.-cbc_se_handler
1112
1113 .section .pdata
1114 .align 4
1115 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1116 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1117 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1118
1119 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1120 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1121 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1122
1123 .rva .LSEH_begin_Camellia_Ekeygen
1124 .rva .LSEH_end_Camellia_Ekeygen
1125 .rva .LSEH_info_Camellia_Ekeygen
1126
1127 .rva .LSEH_begin_Camellia_cbc_encrypt
1128 .rva .LSEH_end_Camellia_cbc_encrypt
1129 .rva .LSEH_info_Camellia_cbc_encrypt
1130
1131 .section .xdata
1132 .align 8
1133 .LSEH_info_Camellia_EncryptBlock_Rounds:
1134 .byte 9,0,0,0
1135 .rva common_se_handler
1136 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1137 .LSEH_info_Camellia_DecryptBlock_Rounds:
1138 .byte 9,0,0,0
1139 .rva common_se_handler
1140 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1141 .LSEH_info_Camellia_Ekeygen:
1142 .byte 9,0,0,0
1143 .rva common_se_handler
1144 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1145 .LSEH_info_Camellia_cbc_encrypt:
1146 .byte 9,0,0,0
1147 .rva cbc_se_handler
1148 ___
1149 }
1150
# Final pass: replace every `expr` span left in the generated text by the
# value of evaluating expr as Perl (register aliases via hi()/lo(), key
# table displacements, shift counts), then write the result to STDOUT,
# which at this point is the pipe into the translator.
$code =~ s{\`([^\`]*)\`}{eval $1}gem;
print $code;
close(STDOUT) or die "error closing STDOUT: $!";