#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for the Padlock engine; see e_padlock-x86.pl for
# details.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
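# Data is moved through an on-stack bounce buffer in pieces of at most
# $PADLOCK_CHUNK bytes whenever the caller's buffers are misaligned, so
# that the xcrypt instructions always see 16-byte-aligned operands.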

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
			   ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
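	# CPUID leaf 0 returns the vendor string in EBX:EDX:ECX. Accept
	# "CentaurHauls" (VIA) or "  Shanghai  " (Zhaoxin), then probe the
	# Centaur extended leaves 0xC0000000/0xC0000001 for PadLock flags.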
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
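	# 240(key) holds the AES round count; byte-swap that many 32-bit
	# words of the key schedule in place.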
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
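	# 8(%rsp) is the EFLAGS image pushed by the caller and %rax points
	# at .Lpadlock_saved_context. pushf/popf is executed only when the
	# context actually changed, forcing xcrypt to reload key material.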
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
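	# Touching EFLAGS invalidates the key cached inside the PadLock
	# unit, so the next xcrypt reloads it from the context.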
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
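	# xstore deposits hardware RNG output at (%rdi); %edx, copied from
	# the second argument, carries the quality-factor bits.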
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

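	# The SHA helpers below stage the hash state in an aligned stack
	# buffer, since the xsha instructions update it in place (the extra
	# 128 bytes apparently leave slack for hardware scratch). A zero in
	# %rax requests a complete, padded hash; %rax = -1 in the _blocks
	# variants processes full blocks only, leaving padding to the caller.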
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

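# Emit one encryption entry point per xcrypt mode. Misaligned input or
# output is bounced through an on-stack buffer in $PADLOCK_CHUNK pieces,
# and for the modes listed in %PADLOCK_PREFETCH chunks are trimmed near
# page boundaries to sidestep the hardware prefetch errata.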
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
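	# Stop chunks at counter boundaries so the hardware's 32-bit
	# big-endian counter never crosses a PADLOCK_CHUNK multiple within
	# one xcrypt call; carries are fixed up at .L${mode}_no_carry.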
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
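	# The engine may prefetch up to $PADLOCK_PREFETCH{$mode} bytes past
	# the input; if the tail sits that close to a page boundary, route
	# it through the bounce buffer to avoid faulting on the next page.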
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

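	# Wipe the bounce buffer before releasing it so no plaintext is
	# left behind on the stack.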
	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
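# Expand the `...` constructs (compile-time arithmetic and the packed
# vendor strings) before emitting the generated code.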
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";