#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See e_padlock-x86.pl for further
# details.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
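# Note: the PadLock unit reads ahead of the block it is currently
# processing; on affected steppings this prefetch can fault if it runs
# past the last mapped page of the input. %PADLOCK_PREFETCH records,
# per mode, how far ahead the unit may read (used below to keep the
# prefetch window away from page boundaries), and $PADLOCK_CHUNK bounds
# the on-stack bounce buffer used for misaligned input/output.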

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                          ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
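# padlock_capability() probes CPUID for a VIA ("CentaurHauls") or
# Zhaoxin vendor string and, if found, returns the PadLock feature
# flags reported by extended leaf 0xC0000001 (with bit #4, the "Nano"
# bit, forced on); it returns 0 when no PadLock-capable CPU is present.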
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

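# padlock_key_bswap() byte-swaps an expanded AES key schedule in place:
# the round count is read from offset 240 of the key structure and
# (rounds+1)*4 32-bit words are converted.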
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

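# padlock_verify_context() and its inner helper _padlock_verify_ctx
# implement the key re-load workaround: if bit #30 of the caller's
# saved EFLAGS image is set and the context pointer differs from the
# one last recorded in .Lpadlock_saved_context, a pushf/popf pair is
# executed, which makes the PadLock unit re-read the key/control word
# on the next xcrypt; the current context pointer is then recorded.
# padlock_reload_key() below forces such a re-load unconditionally.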
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

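# padlock_aes_block() processes a single 16-byte block with
# "rep xcryptecb" (count forced to 1): the key schedule is taken from
# ctx+32 and the control word, which also selects the direction, from
# ctx+16.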
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

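# padlock_xstore() is a thin wrapper around the PadLock RNG "xstore"
# instruction: the second argument is moved into %edx and random bytes
# are stored at the buffer pointed to by %rdi.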
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

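# The SHA helpers below share one pattern: the caller's digest state is
# copied into a scratch area on the stack, "rep xsha*" is executed with
# %rcx holding the byte count and %rsi the input pointer, and the
# updated state is copied back out. The *_oneshot variants pass
# %rax = 0 (hash-and-finalize, padding applied by hardware), while the
# *_blocks variants pass %rax = -1 (intermediate full blocks only,
# finalization left to the caller).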
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#	struct padlock_cipher_data *ctx, size_t len);
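# The generated function returns 1 on success and 0 when ctx is not
# 16-byte aligned or len is not a multiple of 16, so a caller would use
# it roughly like this (illustrative sketch only, not the engine's
# literal code):
#
#	if (!padlock_${mode}_encrypt(out, in, &cdata, nbytes))
#		/* fall back to a software implementation */
#
# Misaligned input/output is bounced through an on-stack buffer of at
# most $PADLOCK_CHUNK bytes per iteration; aligned buffers (or contexts
# with the "align" bit set in the control word) take the
# .L${mode}_aligned path and are processed in place.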
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
	if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
	} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
	}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

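# The second argument to generate_mode() is the final opcode byte of
# the corresponding "rep xcrypt*" instruction (0xf3 0x0f 0xa7 <byte>):
# 0xc8 selects ECB, 0xd0 CBC, 0xd8 CTR, 0xe0 CFB and 0xe8 OFB.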
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;