# NOTE(review): the lines below are an artifact of a git.ipfire.org blame/web
# export of thirdparty/openssl.git, file engines/asm/e_padlock-x86_64.pl
# (commit subject: "Clarify a couple of details around 'make variables'").
# They are not part of the original source.
1#! /usr/bin/env perl
2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
ed28aef8
AP
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# September 2011
18#
50452b2e
AP
19# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
20# details.
ed28aef8
AP
21
# Command-line handling, shared by all perlasm scripts: the "flavour"
# (elf, macosx, mingw64, nasm, ...) selects the assembler dialect/ABI and
# the second argument is the output path.  A single argument containing a
# dot is taken to be the output file with no explicit flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is used when targeting Windows assemblers.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either next to this script or in
# the perlasm directory, and pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Fail immediately if the translator pipe cannot be spawned, instead of
# silently printing into a dead handle and emitting an empty output file.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
# Perl-level aliases for the registers used throughout the PadLock glue.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

# First four integer argument registers for the target ABI.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
			  ("%rdi","%rsi","%rdx","%rcx");	# Unix order

# Small helper entry points: CPUID capability probe, key-schedule byte
# swap, context verification, and thin wrappers around the PadLock
# XCRYPT/XSTORE/XSHA instructions.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
# Emit padlock_${mode}_encrypt for one cipher mode.  $opcode is the last
# byte of the "rep xcrypt*" instruction encoding for that mode.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32: size the first chunk so the 32-bit big-endian counter cannot wrap
# inside a single xcrypt invocation.
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Prefetch-errata modes: if the trailing chunk's hardware prefetch would
# cross a page boundary, divert it through the bounce-buffer tail path.
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# ctr32: propagate carry into the upper half of the counter word.
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
# Wipe the stack bounce buffer (it held plaintext/key-dependent data).
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
# Aligned ctr32 path: process in spans that keep the low 16 counter bits
# from wrapping, stepping the upper half manually between spans.
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
555
# Instantiate one encrypt routine per mode; the second argument is the
# xcrypt opcode byte for that mode.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___

# Evaluate the `...` constant expressions embedded in the assembly text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe into x86_64-xlate.pl; an unchecked close would hide a
# failed or partially-written translation and let the build consume a
# truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";