#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for the Padlock engine. See e_padlock-x86.pl for
# details.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

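# An illustrative guard for the constraint stated above (an editorial
# addition; the stock value of 512 never trips it). The unaligned code
# path below reduces lengths with "and \$$PADLOCK_CHUNK-1", which is
# only equivalent to a modulo for power-of-two chunk sizes.
die "PADLOCK_CHUNK must be a power of 2 between 32 and 2^20"
    if ($PADLOCK_CHUNK<32 || $PADLOCK_CHUNK>2**20 ||
	($PADLOCK_CHUNK & ($PADLOCK_CHUNK-1)));
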
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx	# "Cent" of "CentaurHauls"
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx	# "aurH"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx	# "auls"
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx	# "  Sh" of "  Shanghai  "
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx	# "angh"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx	# "ai  "
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability
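
# Informational note on the value padlock_capability leaves in %eax:
# it is the Centaur extended-feature word from cpuid 0xC0000001 (the
# %edx output), with bit#4 forced on for Nano-compatible cores.  On
# VIA parts the PadLock-related bits are commonly documented as 2/3
# RNG present/enabled, 6/7 ACE, 8/9 ACE2, 10/11 PHE, 12/13 PMM (a
# summary for the reader, not normative here).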

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx		# AES_KEY->rounds
	inc	%edx
	shl	\$2,%edx		# 4*(rounds+1) key schedule words
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

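# Note on padlock_key_bswap above: it flips the expanded AES key
# schedule in place between host byte order and the byte order the
# xcrypt unit expects; the count comes from the AES_KEY rounds field
# at offset 240, and the schedule holds 4*(rounds+1) 32-bit words.
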
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

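# Background for the pushf/popf pairs above (an explanatory note): the
# PadLock unit caches the most recently loaded key material and marks
# a possible context switch in EFLAGS bit 30.  Rewriting EFLAGS via
# popf makes the next xcrypt reload its key, and the quadword at
# .Lpadlock_saved_context remembers which context was seen last, so a
# reload is forced only when the context really changed.
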
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

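# C-level view of the two entries above, as the engine is commonly
# understood to declare them (informational, not authoritative):
#	void padlock_aes_block(void *out, const void *inp,
#				struct padlock_cipher_data *ctx);
#	unsigned int padlock_xstore(void *buf, unsigned int edx);
# where the xstore edx argument selects the RNG quality factor.
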
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

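# The oneshot variant above enters rep xsha1 with %rax = 0, letting
# the hardware pad and finalize the whole %rcx-byte message, while
# padlock_sha1_blocks below passes %rax = -1 so that only complete
# 64-byte blocks are compressed and padding stays with the caller.
# In both, the state is bounced through the stack because the
# instruction updates it in place; the 128+8 stack adjustment also
# brings %rsp back to 16-byte alignment for movaps.
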
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

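# padlock_sha512_blocks above follows the same stack-bounce pattern
# with a 64-byte state held in four xmm registers; no oneshot variant
# is provided for SHA-512 in this module.
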
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
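# Returns 1 on success and 0 (the .L${mode}_abort path below) when
# ctx or len is not 16-byte aligned.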
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
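# A note on the prologue just emitted: unless the control word says
# alignment does not matter, a misaligned output buffer is redirected
# into a stack bounce buffer of at most PADLOCK_CHUNK bytes (a
# misaligned input is first copied into the aligned destination and
# processed in place), with %rbp preserving the original %rsp so the
# buffer can be wiped and released at .L${mode}_done.  The first loop
# pass handles len%PADLOCK_CHUNK bytes; subsequent passes handle full
# PADLOCK_CHUNK chunks.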
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
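# ctr32 chunk sizing above: chunks are cut so that the low 16 bits of
# the big-endian counter never wrap in the middle of one; the carry
# into the upper bits is then applied in software after each chunk
# (the bswap/add \$0x10000 sequence further down) rather than trusted
# to the hardware.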
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
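# Prefetch-erratum guard (cf. the PADLOCK_PREFETCH table at the top):
# on affected parts rep xcrypt may read a little past the end of its
# input, so if that over-read could cross into another page the tail
# of the buffer is routed through the stack copy at
# .L${mode}_unaligned_tail instead of being processed in place.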
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
    if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
    } else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
    }
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

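# The second argument is the ModRM-style byte that the hand-emitted
# "rep xcrypt" sequences (0xf3 0x0f 0xa7 followed by this byte) use to
# select the chaining mode: 0xc8 ecb, 0xd0 cbc, 0xd8 ctr, 0xe0 cfb,
# 0xe8 ofb.
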
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
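# The substitution below evaluates every back-quoted expression still
# embedded in the text before it reaches x86_64-xlate.pl, e.g. `1<<5`
# becomes 32 and `"0x".unpack("H*",'tneC')` becomes 0x746e6543, the
# dword a little-endian load of "Cent" produces.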
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;