]> git.ipfire.org Git - thirdparty/openssl.git/blame - engines/asm/e_padlock-x86_64.pl
Doc nits cleanup, round 2
[thirdparty/openssl.git] / engines / asm / e_padlock-x86_64.pl
CommitLineData
ed28aef8
AP
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# Command line: flavour (elf|macosx|mingw64|nasm|...) and output file.
# A single dotted argument is treated as the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI is selected either by flavour or by an .asm output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the flavour-specific translator.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$code=".text\n";

# Per-mode prefetch distances working around PadLock prefetch errata.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register roles used throughout the generated code.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

# First four integer argument registers for the target ABI.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

# Stand-alone PadLock helpers: capability probe, key byte-swap, context
# verification/reload, single-block AES, RNG xstore and the XSHA/XSHA512
# one-shot and multi-block digest entry points.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# Emit one padlock_${mode}_encrypt entry point for the given XCRYPT
# opcode byte. Handles misaligned input/output via an on-stack bounce
# buffer, chops work into $PADLOCK_CHUNK pieces, applies the prefetch
# errata workarounds from %PADLOCK_PREFETCH, and for ctr32 keeps the
# hardware's 16-bit counter from wrapping mid-operation.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
	if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
	} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
	}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

# Instantiate one encrypt entry point per cipher mode; the second
# argument is the XCRYPT opcode byte for that mode.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Resolve `...` constructs (compile-time constant expressions) before
# handing the text to the translator.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;