]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/camellia/asm/cmll-x86_64.pl
x86_64 assembly pack: tolerate spaces in source directory name.
[thirdparty/openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 # AMD64 Core2 EM64T
25 # -evp camellia-128-ecb 16.7 21.0 22.7
26 # + over gcc 3.4.6 +25% +5% 0%
27 #
28 # camellia-128-cbc 15.7 20.4 21.1
29 #
30 # 128-bit key setup 128 216 205 cycles/key
31 # + over gcc 3.4.6 +54% +39% +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output
# file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Abort if the pipe cannot be started instead of silently
# generating nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): rewrite the first %eax/%rax .. %edx/%rdx style register name in
# REG to its high-byte form (%ah .. %dh).  Strings that do not match are
# returned unchanged.  No prototype is declared because the sub takes one
# argument; an empty prototype would reject a plain hi("...") call.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(REG): rewrite a register name to its low-byte form:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN or %rNd            -> %rNb
# Strings that match none of the patterns are returned unchanged.  No
# prototype is declared because the sub takes one argument.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}
57
# Register aliases used by all code generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# temporaries
@S=qw(%r8d %r9d %r10d %r11d);				# 128-bit state, 4 dwords
($i0,$i1)=("%esi","%edi");				# table indices
$Tbl="%rbp";	# size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# dword register holding the first argument: %ecx on Win64, %edi otherwise
$arg0d="%edi";
$arg0d="%ecx" if ($win64);

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from $Tbl.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
76
# Camellia_Feistel(ROUND[,SEED]): append one Feistel round to $code.
#
# ROUND selects which half of @S is the input: even rounds read
# @S[0],@S[1] and update @S[2],@S[3]; odd rounds do the opposite.
# The round expects the current subkey already loaded in $t1/$t0 and
# prefetches the next one from `SEED+(ROUND+1)*scale`($key), where scale
# is -8 for a negative SEED and 8 otherwise.  SEED defaults to 0.
sub Camellia_Feistel {
    my ($i,$seed)=@_;
    $seed=0 unless (defined($seed));
    my $scale=8;
    $scale=-8 if ($seed<0);
    my $j=($i&1)*2;
    my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } (0..3);

    $code.=<<___;
xor $s0,$t0 # t0^=key[0]
xor $s1,$t1 # t1^=key[1]
movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
shr \$16,$t0
movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
shr \$16,$t1
xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
mov `$seed+($i+1)*$scale+4`($key),$t0
xor $t3,$t2 # t2^=t3
ror \$8,$t3 # t3=RightRotate(t3,8)
xor $t2,$s2
xor $t2,$s3
xor $t3,$s3
___
}
114
# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# This section starts $code and emits, for each direction:
#  - the V1.x entry point (Camellia_EncryptBlock/Camellia_DecryptBlock),
#    which turns keyBitLength in the first argument into 3 or 4 and jumps
#    to the V2 entry point;
#  - the V2 entry point (Camellia_*Block_Rounds), which loads and
#    byte-swaps the 16-byte block, calls the inner routine and stores
#    the result;
#  - the inner routine (_x86_64_Camellia_encrypt/_decrypt), six Feistel
#    rounds per loop iteration generated by Camellia_Feistel.
$code=<<___;
.text

# V1.x API
.globl Camellia_EncryptBlock
.type Camellia_EncryptBlock,\@abi-omnipotent
.align 16
Camellia_EncryptBlock:
movl \$128,%eax
subl $arg0d,%eax
movl \$3,$arg0d
adcl \$0,$arg0d # keyBitLength==128?3:4
jmp .Lenc_rounds
.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl Camellia_EncryptBlock_Rounds
.type Camellia_EncryptBlock_Rounds,\@function,4
.align 16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
push %rbx
push %rbp
push %r13
push %r14
push %r15
.Lenc_prologue:

#mov %rsi,$inp # put away arguments
mov %rcx,$out
mov %rdx,$key

shl \$6,%edi # process grandRounds
lea .LCamellia_SBOX(%rip),$Tbl
lea ($key,%rdi),$keyend

mov 0(%rsi),@S[0] # load plaintext
mov 4(%rsi),@S[1]
mov 8(%rsi),@S[2]
bswap @S[0]
mov 12(%rsi),@S[3]
bswap @S[1]
bswap @S[2]
bswap @S[3]

call _x86_64_Camellia_encrypt

bswap @S[0]
bswap @S[1]
bswap @S[2]
mov @S[0],0($out)
bswap @S[3]
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)

mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%rbp
mov 32(%rsp),%rbx
lea 40(%rsp),%rsp
.Lenc_epilogue:
ret
.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type _x86_64_Camellia_encrypt,\@abi-omnipotent
.align 16
_x86_64_Camellia_encrypt:
xor 0($key),@S[1]
xor 4($key),@S[0] # ^=key[0-3]
xor 8($key),@S[3]
xor 12($key),@S[2]
.align 16
.Leloop:
mov 16($key),$t1 # prefetch key[4-5]
mov 20($key),$t0

___
# six encryption rounds, subkeys read forwards starting at 16($key)
foreach my $round (0..5) {
    Camellia_Feistel($round,16);
}
$code.=<<___;
lea 16*4($key),$key
cmp $keyend,$key
mov 8($key),$t3 # prefetch key[2-3]
mov 12($key),$t2
je .Ledone

and @S[0],$t0
or @S[3],$t3
rol \$1,$t0
xor $t3,@S[2] # s2^=s3|key[3];
xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
and @S[2],$t2
or @S[1],$t1
rol \$1,$t2
xor $t1,@S[0] # s0^=s1|key[1];
xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
jmp .Leloop

.align 16
.Ledone:
xor @S[2],$t0 # SwapHalf
xor @S[3],$t1
xor @S[0],$t2
xor @S[1],$t3

mov $t0,@S[0]
mov $t1,@S[1]
mov $t2,@S[2]
mov $t3,@S[3]

.byte 0xf3,0xc3 # rep ret
.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl Camellia_DecryptBlock
.type Camellia_DecryptBlock,\@abi-omnipotent
.align 16
Camellia_DecryptBlock:
movl \$128,%eax
subl $arg0d,%eax
movl \$3,$arg0d
adcl \$0,$arg0d # keyBitLength==128?3:4
jmp .Ldec_rounds
.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl Camellia_DecryptBlock_Rounds
.type Camellia_DecryptBlock_Rounds,\@function,4
.align 16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
push %rbx
push %rbp
push %r13
push %r14
push %r15
.Ldec_prologue:

#mov %rsi,$inp # put away arguments
mov %rcx,$out
mov %rdx,$keyend

shl \$6,%edi # process grandRounds
lea .LCamellia_SBOX(%rip),$Tbl
lea ($keyend,%rdi),$key

mov 0(%rsi),@S[0] # load plaintext
mov 4(%rsi),@S[1]
mov 8(%rsi),@S[2]
bswap @S[0]
mov 12(%rsi),@S[3]
bswap @S[1]
bswap @S[2]
bswap @S[3]

call _x86_64_Camellia_decrypt

bswap @S[0]
bswap @S[1]
bswap @S[2]
mov @S[0],0($out)
bswap @S[3]
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)

mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%rbp
mov 32(%rsp),%rbx
lea 40(%rsp),%rsp
.Ldec_epilogue:
ret
.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type _x86_64_Camellia_decrypt,\@abi-omnipotent
.align 16
_x86_64_Camellia_decrypt:
xor 0($key),@S[1]
xor 4($key),@S[0] # ^=key[0-3]
xor 8($key),@S[3]
xor 12($key),@S[2]
.align 16
.Ldloop:
mov -8($key),$t1 # prefetch key[4-5]
mov -4($key),$t0

___
# six decryption rounds, subkeys read backwards starting at -8($key)
foreach my $round (0..5) {
    Camellia_Feistel($round,-8);
}
$code.=<<___;
lea -16*4($key),$key
cmp $keyend,$key
mov 0($key),$t3 # prefetch key[2-3]
mov 4($key),$t2
je .Lddone

and @S[0],$t0
or @S[3],$t3
rol \$1,$t0
xor $t3,@S[2] # s2^=s3|key[3];
xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
and @S[2],$t2
or @S[1],$t1
rol \$1,$t2
xor $t1,@S[0] # s0^=s1|key[1];
xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);

jmp .Ldloop

.align 16
.Lddone:
xor @S[2],$t2
xor @S[3],$t3
xor @S[0],$t0
xor @S[1],$t1

mov $t2,@S[0] # SwapHalf
mov $t3,@S[1]
mov $t0,@S[2]
mov $t1,@S[3]

.byte 0xf3,0xc3 # rep ret
.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___
344
# _saveround(RND,KEY[,BIAS],REG...): append stores of REG... to the key
# schedule slot `BIAS+RND*8`(KEY).
#
# BIAS is optional and recognised by being a non-zero number in the first
# position after KEY.  With four registers (32-bit state words) the words
# are stored pairwise swapped: REG1,REG0,REG3,REG2 at +0,+4,+8,+12.  With
# one or two registers they are stored at +0 and +8.
sub _saveround {
    my ($rnd,$key,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if (int($regs[0]));

    if (@regs==4) {
	$code.=<<___;
mov $regs[1],`$bias+$rnd*8+0`($key)
mov $regs[0],`$bias+$rnd*8+4`($key)
mov $regs[3],`$bias+$rnd*8+8`($key)
mov $regs[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $regs[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $regs[1],`$bias+$rnd*8+8`($key)\n" if (@regs>=2);
    }
}
361
# _loadround(RND,KEY[,BIAS],REG0[,REG1]): append loads of REG0 (and REG1,
# when given) from `BIAS+RND*8+0`(KEY) and `BIAS+RND*8+8`(KEY).
# BIAS is optional and recognised by being a non-zero number in the first
# position after KEY.
sub _loadround {
    my ($rnd,$key,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if (int($regs[0]));

    $code.=" mov `$bias+$rnd*8+0`($key),$regs[0]\n";
    $code.=" mov `$bias+$rnd*8+8`($key),$regs[1]\n" if (@regs>=2);
}
369
# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
#
# __rotl128(HI,LO,ROT): append a shld-based left rotation of the 128-bit
# register pair HI:LO by ROT bits, using %r11 as scratch.  Nothing is
# emitted when ROT is 0.
sub __rotl128 {
    my ($x0,$x1,$n)=@_;

    if ($n) {
	$code.=<<___;
mov $x0,%r11
shld \$$n,$x1,$x0
shld \$$n,%r11,$x1
___
    }
}
384
# ... Implementing 128-bit rotate without shld gives 80% better
# performance EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128(HI,LO,ROT): append a shift/or based left rotation of the
# 128-bit register pair HI:LO by ROT bits.  %r9 and %r11 are used as
# scratch.  The emitted right-shift count is 64-ROT, so the sequence is
# meant for ROT in 1..63; nothing is emitted when ROT is 0.
sub _rotl128 {
    my ($x0,$x1,$n)=@_;

    if ($n) {
	$code.=<<___;
mov $x0,%r11
shl \$$n,$x0
mov $x1,%r9
shr \$`64-$n`,%r9
shr \$`64-$n`,%r11
or %r9,$x0
shl \$$n,$x1
or %r11,$x1
___
    }
}
404
# Generator for Camellia_Ekeygen.  The emitted function loads the user key
# (keyBitLength in the first argument, raw key in the second, key table
# in the third), derives KA (and KB for keys other than 128 bits) with
# Feistel rounds keyed from .LCamellia_SIGMA, then stores the rotated
# subkeys into the key table.  It returns 3 in %eax on the 128-bit path
# and 4 otherwise.
{ my $step=0;	# running Feistel round number, also indexes SIGMA

$code.=<<___;
.globl Camellia_Ekeygen
.type Camellia_Ekeygen,\@function,3
.align 16
Camellia_Ekeygen:
push %rbx
push %rbp
push %r13
push %r14
push %r15
.Lkey_prologue:

mov %edi,${keyend}d # put away arguments, keyBitLength
mov %rdx,$out # keyTable

mov 0(%rsi),@S[0] # load 0-127 bits
mov 4(%rsi),@S[1]
mov 8(%rsi),@S[2]
mov 12(%rsi),@S[3]

bswap @S[0]
bswap @S[1]
bswap @S[2]
bswap @S[3]
___
	_saveround(0,$out,@S);				# KL<<<0
$code.=<<___;
cmp \$128,$keyend # check keyBitLength
je .L1st128

mov 16(%rsi),@S[0] # load 128-191 bits
mov 20(%rsi),@S[1]
cmp \$192,$keyend
je .L1st192
mov 24(%rsi),@S[2] # load 192-255 bits
mov 28(%rsi),@S[3]
jmp .L1st256
.L1st192:
mov @S[0],@S[2]
mov @S[1],@S[3]
not @S[2]
not @S[3]
.L1st256:
bswap @S[0]
bswap @S[1]
bswap @S[2]
bswap @S[3]
___
	_saveround(4,$out,@S);				# temp storage for KR!
$code.=<<___;
xor 0($out),@S[1] # KR^KL
xor 4($out),@S[0]
xor 8($out),@S[3]
xor 12($out),@S[2]

.L1st128:
lea .LCamellia_SIGMA(%rip),$key
lea .LCamellia_SBOX(%rip),$Tbl

mov 0($key),$t1
mov 4($key),$t0
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
xor 0($out),@S[1] # ^KL
xor 4($out),@S[0]
xor 8($out),@S[3]
xor 12($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
cmp \$128,$keyend
jne .L2nd256

lea 128($out),$out # size optimization
shl \$32,%r8 # @S[0]||
shl \$32,%r10 # @S[2]||
or %r9,%r8 # ||@S[1]
or %r11,%r10 # ||@S[3]
___
	# 128-bit key: subkeys are rotations of KL (%rax:%rbx) and KA (%r8:%r10)
	_loadround(0,$out,-128,"%rax","%rbx");		# KL
	_saveround(2,$out,-128,"%r8","%r10");		# KA<<<0
	_rotl128("%rax","%rbx",15);
	_saveround(4,$out,-128,"%rax","%rbx");		# KL<<<15
	_rotl128("%r8","%r10",15);
	_saveround(6,$out,-128,"%r8","%r10");		# KA<<<15
	_rotl128("%r8","%r10",15);			# 15+15=30
	_saveround(8,$out,-128,"%r8","%r10");		# KA<<<30
	_rotl128("%rax","%rbx",30);			# 15+30=45
	_saveround(10,$out,-128,"%rax","%rbx");		# KL<<<45
	_rotl128("%r8","%r10",15);			# 30+15=45
	_saveround(12,$out,-128,"%r8");			# KA<<<45
	_rotl128("%rax","%rbx",15);			# 45+15=60
	_saveround(13,$out,-128,"%rbx");		# KL<<<60
	_rotl128("%r8","%r10",15);			# 45+15=60
	_saveround(14,$out,-128,"%r8","%r10");		# KA<<<60
	_rotl128("%rax","%rbx",17);			# 60+17=77
	_saveround(16,$out,-128,"%rax","%rbx");		# KL<<<77
	_rotl128("%rax","%rbx",17);			# 77+17=94
	_saveround(18,$out,-128,"%rax","%rbx");		# KL<<<94
	_rotl128("%r8","%r10",34);			# 60+34=94
	_saveround(20,$out,-128,"%r8","%r10");		# KA<<<94
	_rotl128("%rax","%rbx",17);			# 94+17=111
	_saveround(22,$out,-128,"%rax","%rbx");		# KL<<<111
	_rotl128("%r8","%r10",17);			# 94+17=111
	_saveround(24,$out,-128,"%r8","%r10");		# KA<<<111
$code.=<<___;
mov \$3,%eax
jmp .Ldone
.align 16
.L2nd256:
___
	_saveround(6,$out,@S);				# temp storage for KA!
$code.=<<___;
xor `4*8+0`($out),@S[1] # KA^KR
xor `4*8+4`($out),@S[0]
xor `5*8+0`($out),@S[3]
xor `5*8+4`($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);

	# 192/256-bit key: KL in %rax:%rbx, KR in %rcx:%rdx, KA in %r14:%r15,
	# KB in %r8:%r10
	_loadround(0,$out,"%rax","%rbx");		# KL
	_loadround(4,$out,"%rcx","%rdx");		# KR
	_loadround(6,$out,"%r14","%r15");		# KA
$code.=<<___;
lea 128($out),$out # size optimization
shl \$32,%r8 # @S[0]||
shl \$32,%r10 # @S[2]||
or %r9,%r8 # ||@S[1]
or %r11,%r10 # ||@S[3]
___
	_saveround(2,$out,-128,"%r8","%r10");		# KB<<<0
	_rotl128("%rcx","%rdx",15);
	_saveround(4,$out,-128,"%rcx","%rdx");		# KR<<<15
	_rotl128("%r14","%r15",15);
	_saveround(6,$out,-128,"%r14","%r15");		# KA<<<15
	_rotl128("%rcx","%rdx",15);			# 15+15=30
	_saveround(8,$out,-128,"%rcx","%rdx");		# KR<<<30
	_rotl128("%r8","%r10",30);
	_saveround(10,$out,-128,"%r8","%r10");		# KB<<<30
	_rotl128("%rax","%rbx",45);
	_saveround(12,$out,-128,"%rax","%rbx");		# KL<<<45
	_rotl128("%r14","%r15",30);			# 15+30=45
	_saveround(14,$out,-128,"%r14","%r15");		# KA<<<45
	_rotl128("%rax","%rbx",15);			# 45+15=60
	_saveround(16,$out,-128,"%rax","%rbx");		# KL<<<60
	_rotl128("%rcx","%rdx",30);			# 30+30=60
	_saveround(18,$out,-128,"%rcx","%rdx");		# KR<<<60
	_rotl128("%r8","%r10",30);			# 30+30=60
	_saveround(20,$out,-128,"%r8","%r10");		# KB<<<60
	_rotl128("%rax","%rbx",17);			# 60+17=77
	_saveround(22,$out,-128,"%rax","%rbx");		# KL<<<77
	_rotl128("%r14","%r15",32);			# 45+32=77
	_saveround(24,$out,-128,"%r14","%r15");		# KA<<<77
	_rotl128("%rcx","%rdx",34);			# 60+34=94
	_saveround(26,$out,-128,"%rcx","%rdx");		# KR<<<94
	_rotl128("%r14","%r15",17);			# 77+17=94
	_saveround(28,$out,-128,"%r14","%r15");		# KA<<<94
	_rotl128("%rax","%rbx",34);			# 77+34=111
	_saveround(30,$out,-128,"%rax","%rbx");		# KL<<<111
	_rotl128("%r8","%r10",51);			# 60+51=111
	_saveround(32,$out,-128,"%r8","%r10");		# KB<<<111
$code.=<<___;
mov \$4,%eax
.Ldone:
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%rbp
mov 32(%rsp),%rbx
lea 40(%rsp),%rsp
.Lkey_epilogue:
ret
.size Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
586
# Base 8-bit substitution table (256 entries, indexed by input byte).
# The S1110/S4404/S0222/S3033 helpers expand it into the 32-bit lookup
# tables emitted at .LCamellia_SBOX.
@SBOX=(
    112,130,44,236,179,39,192,229,228,133,87,53,234,12,174,65,
    35,239,107,147,69,25,165,33,237,14,79,78,29,101,146,189,
    134,184,175,143,124,235,31,206,62,48,220,95,94,197,11,26,
    166,225,57,202,213,71,93,61,217,1,90,214,81,86,108,77,
    139,13,154,102,251,204,176,45,116,18,43,32,240,177,132,153,
    223,76,203,194,52,126,118,5,109,183,169,49,209,23,4,215,
    20,88,58,97,222,27,17,28,50,15,156,22,83,24,242,34,
    254,68,207,178,195,181,122,145,36,8,232,168,96,252,105,80,
    170,208,160,125,161,137,98,151,84,91,30,149,224,255,100,210,
    16,196,0,72,163,247,117,219,138,3,230,218,9,63,221,148,
    135,92,131,2,205,74,144,51,115,103,246,243,157,127,191,226,
    82,155,216,38,200,55,198,59,129,150,111,75,19,190,99,46,
    233,121,167,140,159,110,188,142,41,245,249,182,47,253,180,89,
    120,152,6,106,231,70,113,186,212,37,171,66,136,162,141,250,
    114,7,185,85,248,238,172,10,54,73,42,104,60,56,241,164,
    64,40,211,123,187,201,67,193,21,227,173,244,119,199,128,158,
);
604
# S1110(I): table word for index I with @SBOX[I] replicated into the three
# upper bytes and a zero low byte, formatted as "0x%08x".
sub S1110 {
    my $idx=shift;
    my $v=$SBOX[$idx];
    sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}
# S4404(I): rotate the 8-bit index I left by one, look it up in @SBOX, and
# place the value in bytes 3, 2 and 0 (byte 1 zero), formatted "0x%08x".
sub S4404 {
    my $idx=shift;
    $idx=($idx<<1|$idx>>7)&0xff;
    my $v=$SBOX[$idx];
    sprintf("0x%08x",$v<<24|$v<<16|$v);
}
# S0222(I): look up @SBOX[I], rotate the 8-bit value left by one, and place
# it in the three lower bytes (top byte zero), formatted "0x%08x".
sub S0222 {
    my $idx=shift;
    my $v=$SBOX[$idx];
    $v=($v<<1|$v>>7)&0xff;
    sprintf("0x%08x",$v<<16|$v<<8|$v);
}
# S3033(I): look up @SBOX[I], rotate the 8-bit value right by one, and
# place it in bytes 3, 1 and 0 (byte 2 zero), formatted "0x%08x".
sub S3033 {
    my $idx=shift;
    my $v=$SBOX[$idx];
    $v=($v>>1|$v<<7)&0xff;
    sprintf("0x%08x",$v<<24|$v<<8|$v);
}
609
# Emit the 64-byte aligned constant area: the SIGMA key-schedule constants
# (three rows of data plus one row of zeros), followed by the
# .LCamellia_SBOX label under which the lookup tables are emitted next.
$code.=join("\n",
    ".align 64",
    ".LCamellia_SIGMA:",
    ".long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858",
    ".long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5",
    ".long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2",
    ".long 0, 0, 0, 0",
    ".LCamellia_SBOX:",
    "");
# data_word(WORD...): append one ".long" directive listing the given
# words, comma-separated, to $code.  Callers pass two words per call so
# that pairs of tables end up interleaved in memory.
sub data_word {
    my $row=join(',',@_);
    $code.=".long\t".$row."\n";
}
# Emit the lookup tables: 256 interleaved (S1110,S4404) pairs followed by
# 256 interleaved (S0222,S3033) pairs.
foreach my $idx (0..255) {
    data_word(S1110($idx),S4404($idx));
}
foreach my $idx (0..255) {
    data_word(S0222($idx),S3033($idx));
}
623
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
#
# Generator for the CBC entry point.  The emitted function returns
# immediately for a zero length, builds a 64-byte aligned stack frame,
# touches the whole S-box table once, and then runs either the encrypt
# loop (.Lcbc_eloop) or the decrypt loop (.Lcbc_dloop), with separate tail
# code for a length that is not a multiple of 16.
{
# Stack frame slots, as offsets from %rsp:
#   $_key  saved key pointer
#   $_end  inp+len&~15
#   $_res  len&15
#   $ivec  16-byte scratch block
#   $_ivp  caller's ivp argument
#   $_rsp  saved frame base used by the epilogue
($_key,$_end,$_res,$ivec,$_ivp,$_rsp)=map { "$_(%rsp)" } (0,8,16,24,40,48);

$code.=<<___;
.globl Camellia_cbc_encrypt
.type Camellia_cbc_encrypt,\@function,6
.align 16
Camellia_cbc_encrypt:
cmp \$0,%rdx
je .Lcbc_abort
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lcbc_prologue:

mov %rsp,%rbp
sub \$64,%rsp
and \$-64,%rsp

# place stack frame just "above mod 1024" the key schedule,
# this ensures that cache associativity suffices
lea -64-63(%rcx),%r10
sub %rsp,%r10
neg %r10
and \$0x3C0,%r10
sub %r10,%rsp
#add \$8,%rsp # 8 is reserved for callee's ra

mov %rdi,$inp # inp argument
mov %rsi,$out # out argument
mov %r8,%rbx # ivp argument
mov %rcx,$key # key argument
mov 272(%rcx),${keyend}d # grandRounds

mov %r8,$_ivp
mov %rbp,$_rsp

.Lcbc_body:
lea .LCamellia_SBOX(%rip),$Tbl

mov \$32,%ecx
.align 4
.Lcbc_prefetch_sbox:
mov 0($Tbl),%rax
mov 32($Tbl),%rsi
mov 64($Tbl),%rdi
mov 96($Tbl),%r11
lea 128($Tbl),$Tbl
loop .Lcbc_prefetch_sbox
sub \$4096,$Tbl
shl \$6,$keyend
mov %rdx,%rcx # len argument
lea ($key,$keyend),$keyend

cmp \$0,%r9d # enc argument
je .LCBC_DECRYPT

and \$-16,%rdx
and \$15,%rcx # length residue
lea ($inp,%rdx),%rdx
mov $key,$_key
mov %rdx,$_end
mov %rcx,$_res

cmp $inp,%rdx
mov 0(%rbx),@S[0] # load IV
mov 4(%rbx),@S[1]
mov 8(%rbx),@S[2]
mov 12(%rbx),@S[3]
je .Lcbc_enc_tail
jmp .Lcbc_eloop

.align 16
.Lcbc_eloop:
xor 0($inp),@S[0]
xor 4($inp),@S[1]
xor 8($inp),@S[2]
bswap @S[0]
xor 12($inp),@S[3]
bswap @S[1]
bswap @S[2]
bswap @S[3]

call _x86_64_Camellia_encrypt

mov $_key,$key # "rewind" the key
bswap @S[0]
mov $_end,%rdx
bswap @S[1]
mov $_res,%rcx
bswap @S[2]
mov @S[0],0($out)
bswap @S[3]
mov @S[1],4($out)
mov @S[2],8($out)
lea 16($inp),$inp
mov @S[3],12($out)
cmp %rdx,$inp
lea 16($out),$out
jne .Lcbc_eloop

cmp \$0,%rcx
jne .Lcbc_enc_tail

mov $_ivp,$out
mov @S[0],0($out) # write out IV residue
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)
jmp .Lcbc_done

.align 16
.Lcbc_enc_tail:
xor %rax,%rax
mov %rax,0+$ivec
mov %rax,8+$ivec
mov %rax,$_res

.Lcbc_enc_pushf:
pushfq
cld
mov $inp,%rsi
lea 8+$ivec,%rdi
.long 0x9066A4F3 # rep movsb
popfq
.Lcbc_enc_popf:

lea $ivec,$inp
lea 16+$ivec,%rax
mov %rax,$_end
jmp .Lcbc_eloop # one more time

.align 16
.LCBC_DECRYPT:
xchg $key,$keyend
add \$15,%rdx
and \$15,%rcx # length residue
and \$-16,%rdx
mov $key,$_key
lea ($inp,%rdx),%rdx
mov %rdx,$_end
mov %rcx,$_res

mov (%rbx),%rax # load IV
mov 8(%rbx),%rbx
jmp .Lcbc_dloop
.align 16
.Lcbc_dloop:
mov 0($inp),@S[0]
mov 4($inp),@S[1]
mov 8($inp),@S[2]
bswap @S[0]
mov 12($inp),@S[3]
bswap @S[1]
mov %rax,0+$ivec # save IV to temporary storage
bswap @S[2]
mov %rbx,8+$ivec
bswap @S[3]

call _x86_64_Camellia_decrypt

mov $_key,$key # "rewind" the key
mov $_end,%rdx
mov $_res,%rcx

bswap @S[0]
mov ($inp),%rax # load IV for next iteration
bswap @S[1]
mov 8($inp),%rbx
bswap @S[2]
xor 0+$ivec,@S[0]
bswap @S[3]
xor 4+$ivec,@S[1]
xor 8+$ivec,@S[2]
lea 16($inp),$inp
xor 12+$ivec,@S[3]
cmp %rdx,$inp
je .Lcbc_ddone

mov @S[0],0($out)
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)

lea 16($out),$out
jmp .Lcbc_dloop

.align 16
.Lcbc_ddone:
mov $_ivp,%rdx
cmp \$0,%rcx
jne .Lcbc_dec_tail

mov @S[0],0($out)
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)

mov %rax,(%rdx) # write out IV residue
mov %rbx,8(%rdx)
jmp .Lcbc_done
.align 16
.Lcbc_dec_tail:
mov @S[0],0+$ivec
mov @S[1],4+$ivec
mov @S[2],8+$ivec
mov @S[3],12+$ivec

.Lcbc_dec_pushf:
pushfq
cld
lea 8+$ivec,%rsi
lea ($out),%rdi
.long 0x9066A4F3 # rep movsb
popfq
.Lcbc_dec_popf:

mov %rax,(%rdx) # write out IV residue
mov %rbx,8(%rdx)
jmp .Lcbc_done

.align 16
.Lcbc_done:
mov $_rsp,%rcx
mov 0(%rcx),%r15
mov 8(%rcx),%r14
mov 16(%rcx),%r13
mov 24(%rcx),%r12
mov 32(%rcx),%rbp
mov 40(%rcx),%rbx
lea 48(%rcx),%rsp
.Lcbc_abort:
ret
.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz "Camellia for x86_64 by <appro\@openssl.org>"
___
}
873
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: emit the structured-exception handlers and the .pdata/.xdata
# tables.  common_se_handler serves the block and key-setup functions,
# using the prologue/epilogue labels passed as HandlerData[];
# cbc_se_handler serves Camellia_cbc_encrypt and additionally accounts
# for the pushfq/popfq windows in its tail code.  Both restore the saved
# non-volatile registers into the CONTEXT record and then share the
# .Lcommon_seh_exit tail, which calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type common_se_handler,\@abi-omnipotent
.align 16
common_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
lea -64(%rsp),%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

lea 40(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r13
mov -32(%rax),%r14
mov -40(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

jmp .Lcommon_seh_exit
.size common_se_handler,.-common_se_handler

.type cbc_se_handler,\@abi-omnipotent
.align 16
cbc_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
lea -64(%rsp),%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

lea .Lcbc_prologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lcbc_prologue
jb .Lin_cbc_prologue

lea .Lcbc_body(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lcbc_body
jb .Lin_cbc_frame_setup

mov 152($context),%rax # pull context->Rsp

lea .Lcbc_abort(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lcbc_abort
jae .Lin_cbc_prologue

# handle pushf/popf in Camellia_cbc_encrypt
lea .Lcbc_enc_pushf(%rip),%r10
cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
jbe .Lin_cbc_no_flag
lea 8(%rax),%rax
lea .Lcbc_enc_popf(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
jb .Lin_cbc_no_flag
lea -8(%rax),%rax
lea .Lcbc_dec_pushf(%rip),%r10
cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
jbe .Lin_cbc_no_flag
lea 8(%rax),%rax
lea .Lcbc_dec_popf(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
jb .Lin_cbc_no_flag
lea -8(%rax),%rax

.Lin_cbc_no_flag:
mov 48(%rax),%rax # $_rsp
lea 48(%rax),%rax

.Lin_cbc_frame_setup:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

.Lin_cbc_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

.align 4
.Lcommon_seh_exit:

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$`1232/8`,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
lea 64(%rsp),%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size cbc_se_handler,.-cbc_se_handler

.section .pdata
.align 4
.rva .LSEH_begin_Camellia_EncryptBlock_Rounds
.rva .LSEH_end_Camellia_EncryptBlock_Rounds
.rva .LSEH_info_Camellia_EncryptBlock_Rounds

.rva .LSEH_begin_Camellia_DecryptBlock_Rounds
.rva .LSEH_end_Camellia_DecryptBlock_Rounds
.rva .LSEH_info_Camellia_DecryptBlock_Rounds

.rva .LSEH_begin_Camellia_Ekeygen
.rva .LSEH_end_Camellia_Ekeygen
.rva .LSEH_info_Camellia_Ekeygen

.rva .LSEH_begin_Camellia_cbc_encrypt
.rva .LSEH_end_Camellia_cbc_encrypt
.rva .LSEH_info_Camellia_cbc_encrypt

.section .xdata
.align 8
.LSEH_info_Camellia_EncryptBlock_Rounds:
.byte 9,0,0,0
.rva common_se_handler
.rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
.byte 9,0,0,0
.rva common_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
.LSEH_info_Camellia_Ekeygen:
.byte 9,0,0,0
.rva common_se_handler
.rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
.byte 9,0,0,0
.rva cbc_se_handler
___
}
1085
# Replace every `...` span in the generated text with the result of
# evaluating it as Perl (constant arithmetic and the hi()/lo() register
# helpers), then send the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to x86_64-xlate.pl; close reports a failed write or a
# non-zero exit of the translator, so check it instead of exiting
# successfully with truncated or missing output.
close STDOUT or die "error closing STDOUT: $!";