#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.
# Command-line handling: perlasm scripts take an optional assembler
# "flavour" (elf, macosx, mingw64, nasm, ...) followed by the output
# file name; a lone argument containing a dot is the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected by flavour or by .asm output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script's path.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator into $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";
# Per-mode prefetch distances working around PadLock prefetch errata.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Symbolic register names used throughout the generated assembly.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

# First four integer argument registers for the target ABI.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
			 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
# Leaf helper routines: CPU capability detection, AES key byte-swap,
# SSE context verification, single-block AES, xstore RNG and the
# PadLock SHA one-shot/multi-block primitives.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
# Emit one padlock_${mode}_encrypt entry point for the given cipher
# mode; $opcode is the final byte of the xcrypt* instruction encoding.
# The generated routine handles misaligned in/out buffers by bouncing
# through a stack buffer, chunks work to $PADLOCK_CHUNK, applies the
# prefetch-errata workaround per %PADLOCK_PREFETCH, and for ctr32
# manages the 32-bit big-endian counter carry manually.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___	if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___	if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___	if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___	if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___	if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___	if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___	if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___	if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___	if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
# Instantiate one encrypt entry point per supported cipher mode; the
# second argument is the xcrypt* opcode byte for that mode.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
.quad	0
___
# Expand `...` constructs (compile-time arithmetic) before emitting.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the xlate script; buffered write errors only
# surface at close, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";