1 #! /usr/bin/env perl
2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # September 2011
18 #
19 # Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
20 # details.
21
# Command line: [flavour] output-file.  $flavour selects the perlasm
# dialect (elf, macosx, mingw64, nasm, ...); a lone dotted argument is
# taken as the output file name instead.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets follow the Microsoft x64 calling convention.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either next to this script or in
# the perlasm directory of an OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# All generated code is piped through the translator.  Die on failure
# instead of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
35
$code=".text\n";

# Per-mode software prefetch distances (bytes) used to work around
# PadLock prefetch errata; modes absent from this hash need none.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
# Largest amount of data bounced through the aligned stack buffer in
# one xcrypt invocation.
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register roles shared by all generated mode subroutines.
$ctx="%rdx";	# struct padlock_cipher_data pointer
$out="%rdi";	# destination buffer
$inp="%rsi";	# source buffer
$len="%rcx";	# byte count
$chunk="%rbx";	# bytes processed per loop iteration

# First four integer-argument registers of the target calling convention.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
49
# unsigned int padlock_capability(void)
# Returns the Centaur extended-feature flags (with the Nano bit forced
# on), or 0 if the CPU is not a "CentaurHauls" part or lacks extended
# CPUID leaf 0xC0000001.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8		# preserve callee-saved %rbx across cpuid
	xor	%eax,%eax
	cpuid				# leaf 0: vendor string in ebx:edx:ecx
	xor	%eax,%eax		# default return value: no capability
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx	# "Cent"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx	# "aurH"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx	# "auls" => "CentaurHauls"
	jne	.Lnoluck
	mov	\$0xC0000000,%eax	# Centaur extended CPUID range
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx	# extended feature leaf supported?
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax		# return feature flags from %edx
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx		# restore %rbx
	ret
.size	padlock_capability,.-padlock_capability
80
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx		# pull #rounds from AES_KEY
	inc	%edx			# AES key schedule occupies
	shl	\$2,%edx		# 4*(rounds+1) 32-bit words;
					# previously only "rounds" words
					# were swapped, leaving most of
					# the key schedule untouched
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap
95
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx		# context pointer into $ctx
	pushf				# saved flags examined by helper
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp		# drop saved flags
	ret
.size	padlock_verify_context,.-padlock_verify_context

# Internal helper: %rax points at .Lpadlock_saved_context, $ctx is the
# current context, 8(%rsp) holds the caller's saved flags image.  If the
# context changed since the previous call, execute pushf/popf, which
# forces PadLock to reload key material on the next xcrypt.
.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8		# caller's saved flags image
	bt	\$30,%r8		# skip reload when bit 30 is clear
	jnc	.Lverified
	cmp	(%rax),$ctx		# same context as last time?
	je	.Lverified
	pushf				# serialize flags to make PadLock
	popf				# re-read the control word/key
.Lverified:
	mov	$ctx,(%rax)		# remember current context
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx
122
.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf				# pushf/popf sequence forces PadLock
	popf				# to reload key material
	ret
.size	padlock_reload_key,.-padlock_reload_key
131
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8		# preserve callee-saved %rbx
	mov	\$1,$len		# process a single 16-byte block
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx		# restore %rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block
144
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx		# 2nd argument into %edx for xstore
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore
153
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx		# 3rd argument is the rep xsha1 count
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp		# scratch frame for the context copy
	mov	16(%rdi),%eax		# 5th 32-bit word of SHA-1 state
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi		# xsha1 updates the state at (%rdi)
	mov	%eax,16(%rsp)
	xor	%rax,%rax		# %rax=0, cf. -1 in padlock_sha1_blocks
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot
175
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx		# 3rd argument is the rep xsha1 count
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp		# scratch frame for the context copy
	mov	16(%rdi),%eax		# 5th 32-bit word of SHA-1 state
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi		# xsha1 updates the state at (%rdi)
	mov	%eax,16(%rsp)
	mov	\$-1,%rax		# cf. 0 in padlock_sha1_oneshot
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks
197
.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx		# 3rd argument is the rep xsha256 count
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context (32-byte state
	sub	\$128+8,%rsp		# in two xmm registers)
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi		# xsha256 updates the state at (%rdi)
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax		# %rax=0, cf. -1 in padlock_sha256_blocks
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot
219
.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx		# 3rd argument is the rep xsha256 count
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context (32-byte state
	sub	\$128+8,%rsp		# in two xmm registers)
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi		# xsha256 updates the state at (%rdi)
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax		# cf. 0 in padlock_sha256_oneshot
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks
241
.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx		# 3rd argument is the rep xsha512 count
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context (64-byte state
	sub	\$128+8,%rsp		# in four xmm registers)
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi		# xsha512 updates the state at (%rdi)
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
271
# Emit one padlock_${mode}_encrypt subroutine.  $opcode is the ModR/M
# byte of the corresponding "rep xcrypt*" instruction.  The generated
# code handles three cases: hardware that tolerates misaligned buffers
# (control-word align bit set), misaligned buffers bounced through an
# aligned stack buffer in $PADLOCK_CHUNK pieces, and fully aligned
# buffers processed in place.  ctr32 additionally limits each chunk so
# the hardware's 32-bit big-endian counter never wraps mid-call, and
# prefetch-errata modes shorten chunks that would prefetch past a page
# boundary.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
# Prologue: validate 16-byte alignment of $ctx and $len, verify the
# context, then classify buffer alignment and carve out a stack bounce
# buffer if either buffer is misaligned.
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32 only: clamp the first chunk so the 32-bit counter cannot wrap
# within one xcrypt invocation.
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Prefetch-errata modes: if the final chunk would prefetch across a page
# boundary, trim it (or fall into the unaligned-tail path).
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
# Main misaligned-path loop: copy input into the bounce buffer when
# needed, run xcrypt on one chunk, then copy output back out.
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# ctr32 only: bump bits 16..31 of the big-endian counter once the low
# 16 bits have wrapped to zero.
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
# Copy the result out of the bounce buffer (if output was misaligned)
# and advance the pointers/counters.
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
# Loop back-edge.  Prefetch-errata modes re-check the page-boundary
# condition for the final short chunk before continuing.
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
# Tail that would prefetch past a page: bounce the remainder through a
# freshly allocated stack buffer and rejoin the main loop.
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
}
# Wipe the bounce buffer (it held key-dependent data) and release it.
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
# Aligned ctr32 path: process in pieces small enough that the 16-bit
# low half of the counter cannot wrap, bumping the high half manually.
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
# Prefetch-errata modes: split off a remainder that would prefetch past
# a page boundary; it is handled via the bounce buffer below.
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
# Aligned bulk processing: a single in-place rep xcrypt.
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
# Epilogue: return 1 on success (0 was loaded before the abort checks).
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
555
# Emit one encryption entry point per supported xcrypt mode; the second
# element of each pair is the ModR/M byte of the mode's "rep xcrypt*".
for my $m (["ecb", 0xc8],
           ["cbc", 0xd0],
           ["cfb", 0xe0],
           ["ofb", 0xe8],
           ["ctr32",0xd8]) {	# all 64-bit CPUs have working CTR...
	generate_mode(@$m);
}
561
# Module banner plus the data slot used by _padlock_verify_ctx to track
# the most recently used cipher context.
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___

# Evaluate backticked constructs (compile-time constant arithmetic)
# before emitting the code.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the translator/assembler; an undetected failure
# on close would silently yield a truncated object file.
close STDOUT or die "error closing STDOUT: $!";