# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.
# If no explicit flavour was given, the first argument is the output file.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows-style assemblers (nasm/masm/mingw64) need Win64 calling convention.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator; fail loudly if the
# pipe cannot be opened instead of silently emitting nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
# Per-mode prefetch distances (bytes) used to work around the PadLock
# prefetch errata near page boundaries (see the page-boundary distance
# checks emitted by generate_mode below).
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
# First four integer argument registers per platform ABI; $win64 is set
# during flavour detection above.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
51 .globl padlock_capability
52 .type padlock_capability
,\
@abi-omnipotent
59 cmp \
$`"0x".unpack("H*",'tneC')`,%ebx
61 cmp \
$`"0x".unpack("H*",'Hrua')`,%edx
63 cmp \
$`"0x".unpack("H*",'slua')`,%ecx
67 cmp \
$`"0x".unpack("H*",'hS ')`,%ebx
69 cmp \
$`"0x".unpack("H*",'hgna')`,%edx
71 cmp \
$`"0x".unpack("H*",' ia')`,%ecx
84 or \
$0x10,%eax # set Nano bit#4
88 .size padlock_capability
,.-padlock_capability
90 .globl padlock_key_bswap
91 .type padlock_key_bswap
,\
@abi-omnipotent
,0
103 .size padlock_key_bswap
,.-padlock_key_bswap
105 .globl padlock_verify_context
106 .type padlock_verify_context
,\
@abi-omnipotent
108 padlock_verify_context
:
111 lea
.Lpadlock_saved_context
(%rip),%rax
112 call _padlock_verify_ctx
115 .size padlock_verify_context
,.-padlock_verify_context
117 .type _padlock_verify_ctx
,\
@abi-omnipotent
130 .size _padlock_verify_ctx
,.-_padlock_verify_ctx
132 .globl padlock_reload_key
133 .type padlock_reload_key
,\
@abi-omnipotent
139 .size padlock_reload_key
,.-padlock_reload_key
141 .globl padlock_aes_block
142 .type padlock_aes_block
,\
@function,3
147 lea
32($ctx),%rbx # key
148 lea
16($ctx),$ctx # control word
149 .byte
0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
152 .size padlock_aes_block
,.-padlock_aes_block
154 .globl padlock_xstore
155 .type padlock_xstore
,\
@function,2
159 .byte
0x0f,0xa7,0xc0 # xstore
161 .size padlock_xstore
,.-padlock_xstore
163 .globl padlock_sha1_oneshot
164 .type padlock_sha1_oneshot
,\
@function,3
166 padlock_sha1_oneshot
:
168 mov
%rdi,%rdx # put aside %rdi
169 movups
(%rdi),%xmm0 # copy-in context
176 .byte
0xf3,0x0f,0xa6,0xc8 # rep xsha1
180 movups
%xmm0,(%rdx) # copy-out context
183 .size padlock_sha1_oneshot
,.-padlock_sha1_oneshot
185 .globl padlock_sha1_blocks
186 .type padlock_sha1_blocks
,\
@function,3
190 mov
%rdi,%rdx # put aside %rdi
191 movups
(%rdi),%xmm0 # copy-in context
198 .byte
0xf3,0x0f,0xa6,0xc8 # rep xsha1
202 movups
%xmm0,(%rdx) # copy-out context
205 .size padlock_sha1_blocks
,.-padlock_sha1_blocks
207 .globl padlock_sha256_oneshot
208 .type padlock_sha256_oneshot
,\
@function,3
210 padlock_sha256_oneshot
:
212 mov
%rdi,%rdx # put aside %rdi
213 movups
(%rdi),%xmm0 # copy-in context
215 movups
16(%rdi),%xmm1
218 movaps
%xmm1,16(%rsp)
220 .byte
0xf3,0x0f,0xa6,0xd0 # rep xsha256
222 movaps
16(%rsp),%xmm1
224 movups
%xmm0,(%rdx) # copy-out context
225 movups
%xmm1,16(%rdx)
227 .size padlock_sha256_oneshot
,.-padlock_sha256_oneshot
229 .globl padlock_sha256_blocks
230 .type padlock_sha256_blocks
,\
@function,3
232 padlock_sha256_blocks
:
234 mov
%rdi,%rdx # put aside %rdi
235 movups
(%rdi),%xmm0 # copy-in context
237 movups
16(%rdi),%xmm1
240 movaps
%xmm1,16(%rsp)
242 .byte
0xf3,0x0f,0xa6,0xd0 # rep xsha256
244 movaps
16(%rsp),%xmm1
246 movups
%xmm0,(%rdx) # copy-out context
247 movups
%xmm1,16(%rdx)
249 .size padlock_sha256_blocks
,.-padlock_sha256_blocks
251 .globl padlock_sha512_blocks
252 .type padlock_sha512_blocks
,\
@function,3
254 padlock_sha512_blocks
:
256 mov
%rdi,%rdx # put aside %rdi
257 movups
(%rdi),%xmm0 # copy-in context
259 movups
16(%rdi),%xmm1
260 movups
32(%rdi),%xmm2
261 movups
48(%rdi),%xmm3
264 movaps
%xmm1,16(%rsp)
265 movaps
%xmm2,32(%rsp)
266 movaps
%xmm3,48(%rsp)
267 .byte
0xf3,0x0f,0xa6,0xe0 # rep xha512
269 movaps
16(%rsp),%xmm1
270 movaps
32(%rsp),%xmm2
271 movaps
48(%rsp),%xmm3
273 movups
%xmm0,(%rdx) # copy-out context
274 movups
%xmm1,16(%rdx)
275 movups
%xmm2,32(%rdx)
276 movups
%xmm3,48(%rdx)
278 .size padlock_sha512_blocks
,.-padlock_sha512_blocks
282 my ($mode,$opcode) = @_;
283 # int padlock_$mode_encrypt(void *out, const void *inp,
284 # struct padlock_cipher_data *ctx, size_t len);
286 .globl padlock_
${mode
}_encrypt
287 .type padlock_
${mode
}_encrypt
,\
@function,4
289 padlock_
${mode
}_encrypt
:
298 lea
.Lpadlock_saved_context
(%rip),%rax
301 call _padlock_verify_ctx
302 lea
16($ctx),$ctx # control word
305 testl \
$`1<<5`,($ctx) # align bit in control word
306 jnz
.L
${mode
}_aligned
308 setz
%al # !out_misaligned
310 setz
%bl # !inp_misaligned
312 jnz
.L
${mode
}_aligned
314 mov \
$$PADLOCK_CHUNK,$chunk
315 not %rax # out_misaligned?-1:0
318 cmovc
$len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
319 and $chunk,%rax # out_misaligned?chunk:0
322 and \
$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
324 mov \
$$PADLOCK_CHUNK,%rax
325 cmovz
%rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
327 $code.=<<___
if ($mode eq "ctr32");
329 mov
-4($ctx),%eax # pull 32-bit counter
332 and \
$`$PADLOCK_CHUNK/16-1`,%eax
333 mov \
$$PADLOCK_CHUNK,$chunk
337 cmova
%rax,$chunk # don't let counter cross PADLOCK_CHUNK
340 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
343 mov
$inp,%rax # check if prefetch crosses page
348 and \
$0xfff,%rax # distance to page boundary
349 cmp \
$$PADLOCK_PREFETCH{$mode},%rax
350 mov \
$-$PADLOCK_PREFETCH{$mode},%rax
351 cmovae
$chunk,%rax # mask=distance<prefetch?-prefetch:-1
353 jz
.L
${mode
}_unaligned_tail
359 cmp $len,$chunk # ctr32 artefact
360 cmova
$len,$chunk # ctr32 artefact
361 mov
$out,%r8 # save parameters
366 test \
$0x0f,$out # out_misaligned
368 test \
$0x0f,$inp # inp_misaligned
369 jz
.L
${mode
}_inp_aligned
371 .byte
0xf3,0x48,0xa5 # rep movsq
375 .L
${mode
}_inp_aligned
:
376 lea
-16($ctx),%rax # ivp
377 lea
16($ctx),%rbx # key
379 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
381 $code.=<<___
if ($mode !~ /ecb|ctr/);
383 movdqa
%xmm0,-16($ctx) # copy [or refresh] iv
385 $code.=<<___
if ($mode eq "ctr32");
386 mov
-4($ctx),%eax # pull 32-bit counter
387 test \
$0xffff0000,%eax
388 jnz
.L
${mode
}_no_carry
396 mov
%r8,$out # restore parameters
399 jz
.L
${mode
}_out_aligned
403 .byte
0xf3,0x48,0xa5 # rep movsq
405 .L
${mode
}_out_aligned
:
411 mov \
$$PADLOCK_CHUNK,$chunk
413 if (!$PADLOCK_PREFETCH{$mode}) {
423 $code.=<<___
if ($mode eq "ctr32");
425 mov
$inp,%rax # check if prefetch crosses page
430 and \
$0xfff,%rax # distance to page boundary
431 cmp \
$$PADLOCK_PREFETCH{$mode},%rax
432 mov \
$-$PADLOCK_PREFETCH{$mode},%rax
438 .L
${mode
}_unaligned_tail
:
442 mov
$out,%r8 # save parameters
444 sub %rax,%rsp # alloca
447 .byte
0xf3,0x48,0xa5 # rep movsq
449 mov
%r8, $out # restore parameters
475 $code.=<<___
if ($mode eq "ctr32");
476 mov
-4($ctx),%eax # pull 32-bit counter
480 mov \
$`16*0x10000`,$chunk
484 cmova
%rax,$chunk # don't let counter cross 2^16
486 jbe
.L
${mode
}_aligned_skip
488 .L
${mode
}_aligned_loop
:
489 mov
$len,%r10 # save parameters
493 lea
-16($ctx),%rax # ivp
494 lea
16($ctx),%rbx # key
495 shr \
$4,$len # len/=AES_BLOCK_SIZE
496 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
498 mov
-4($ctx),%eax # pull 32-bit counter
504 mov
%r10,$len # restore parameters
506 mov \
$`16*0x10000`,$chunk
509 jae
.L
${mode
}_aligned_loop
511 .L
${mode
}_aligned_skip
:
513 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
516 and \
$0xfff,%rbp # distance to page boundary
518 cmp \
$$PADLOCK_PREFETCH{$mode},%rbp
519 mov \
$$PADLOCK_PREFETCH{$mode}-1,%rbp
521 and $len,%rbp # remainder
523 jz
.L
${mode
}_aligned_tail
526 lea
-16($ctx),%rax # ivp
527 lea
16($ctx),%rbx # key
528 shr \
$4,$len # len/=AES_BLOCK_SIZE
529 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
531 $code.=<<___
if ($mode !~ /ecb|ctr/);
533 movdqa
%xmm0,-16($ctx) # copy [or refresh] iv
535 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
536 test
%rbp,%rbp # check remainder
539 .L
${mode
}_aligned_tail
:
547 .byte
0xf3,0x48,0xa5 # rep movsq
561 .size padlock_
${mode
}_encrypt
,.-padlock_
${mode
}_encrypt
# Emit one padlock_${mode}_encrypt stub per supported mode; the second
# argument is the final opcode byte of the emitted "rep xcrypt*"
# instruction (0xf3,0x0f,0xa7,<op>).
generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...
572 .asciz
"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
576 .Lpadlock_saved_context
:
# Expand every `...` construct in the accumulated code by evaluating its
# contents as Perl and substituting the result (compile-time constants
# such as `1<<5` or `16*0x10000` used in the templates above).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;