]>
Commit | Line | Data |
---|---|---|
ed28aef8 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | ||
10 | # September 2011 | |
11 | # | |
50452b2e AP |
12 | # Assembler helpers for Padlock engine. See even e_padlock-x86.pl for |
13 | # details. | |
ed28aef8 AP |
14 | |
15 | $flavour = shift; | |
16 | $output = shift; | |
17 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
18 | ||
19 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
20 | ||
21 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
22 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
23 | ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or | |
24 | die "can't locate x86_64-xlate.pl"; | |
25 | ||
46bf83f0 AP |
26 | open OUT,"| \"$^X\" $xlate $flavour $output"; |
27 | *STDOUT=*OUT; | |
ed28aef8 AP |
28 | |
29 | $code=".text\n"; | |
30 | ||
ed998634 | 31 | %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata |
50452b2e | 32 | $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 |
ed28aef8 AP |
33 | |
34 | $ctx="%rdx"; | |
35 | $out="%rdi"; | |
36 | $inp="%rsi"; | |
37 | $len="%rcx"; | |
38 | $chunk="%rbx"; | |
39 | ||
40 | ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | |
41 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | |
42 | ||
43 | $code.=<<___; | |
44 | .globl padlock_capability | |
45 | .type padlock_capability,\@abi-omnipotent | |
46 | .align 16 | |
47 | padlock_capability: | |
48 | mov %rbx,%r8 | |
49 | xor %eax,%eax | |
50 | cpuid | |
51 | xor %eax,%eax | |
52 | cmp \$`"0x".unpack("H*",'tneC')`,%ebx | |
53 | jne .Lnoluck | |
54 | cmp \$`"0x".unpack("H*",'Hrua')`,%edx | |
55 | jne .Lnoluck | |
56 | cmp \$`"0x".unpack("H*",'slua')`,%ecx | |
57 | jne .Lnoluck | |
58 | mov \$0xC0000000,%eax | |
59 | cpuid | |
60 | mov %eax,%edx | |
61 | xor %eax,%eax | |
62 | cmp \$0xC0000001,%edx | |
63 | jb .Lnoluck | |
64 | mov \$0xC0000001,%eax | |
65 | cpuid | |
66 | mov %edx,%eax | |
67 | and \$0xffffffef,%eax | |
68 | or \$0x10,%eax # set Nano bit#4 | |
69 | .Lnoluck: | |
70 | mov %r8,%rbx | |
71 | ret | |
72 | .size padlock_capability,.-padlock_capability | |
73 | ||
74 | .globl padlock_key_bswap | |
75 | .type padlock_key_bswap,\@abi-omnipotent,0 | |
76 | .align 16 | |
77 | padlock_key_bswap: | |
78 | mov 240($arg1),%edx | |
79 | .Lbswap_loop: | |
80 | mov ($arg1),%eax | |
81 | bswap %eax | |
82 | mov %eax,($arg1) | |
83 | lea 4($arg1),$arg1 | |
84 | sub \$1,%edx | |
85 | jnz .Lbswap_loop | |
86 | ret | |
87 | .size padlock_key_bswap,.-padlock_key_bswap | |
88 | ||
89 | .globl padlock_verify_context | |
90 | .type padlock_verify_context,\@abi-omnipotent | |
91 | .align 16 | |
92 | padlock_verify_context: | |
93 | mov $arg1,$ctx | |
94 | pushf | |
95 | lea .Lpadlock_saved_context(%rip),%rax | |
96 | call _padlock_verify_ctx | |
97 | lea 8(%rsp),%rsp | |
98 | ret | |
99 | .size padlock_verify_context,.-padlock_verify_context | |
100 | ||
101 | .type _padlock_verify_ctx,\@abi-omnipotent | |
102 | .align 16 | |
103 | _padlock_verify_ctx: | |
104 | mov 8(%rsp),%r8 | |
105 | bt \$30,%r8 | |
106 | jnc .Lverified | |
107 | cmp (%rax),$ctx | |
108 | je .Lverified | |
109 | pushf | |
110 | popf | |
111 | .Lverified: | |
112 | mov $ctx,(%rax) | |
113 | ret | |
114 | .size _padlock_verify_ctx,.-_padlock_verify_ctx | |
115 | ||
116 | .globl padlock_reload_key | |
117 | .type padlock_reload_key,\@abi-omnipotent | |
118 | .align 16 | |
119 | padlock_reload_key: | |
120 | pushf | |
121 | popf | |
122 | ret | |
123 | .size padlock_reload_key,.-padlock_reload_key | |
124 | ||
125 | .globl padlock_aes_block | |
126 | .type padlock_aes_block,\@function,3 | |
127 | .align 16 | |
128 | padlock_aes_block: | |
129 | mov %rbx,%r8 | |
130 | mov \$1,$len | |
131 | lea 32($ctx),%rbx # key | |
132 | lea 16($ctx),$ctx # control word | |
133 | .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb | |
134 | mov %r8,%rbx | |
135 | ret | |
136 | .size padlock_aes_block,.-padlock_aes_block | |
137 | ||
138 | .globl padlock_xstore | |
139 | .type padlock_xstore,\@function,2 | |
140 | .align 16 | |
141 | padlock_xstore: | |
142 | mov %esi,%edx | |
143 | .byte 0x0f,0xa7,0xc0 # xstore | |
144 | ret | |
145 | .size padlock_xstore,.-padlock_xstore | |
146 | ||
147 | .globl padlock_sha1_oneshot | |
148 | .type padlock_sha1_oneshot,\@function,3 | |
149 | .align 16 | |
150 | padlock_sha1_oneshot: | |
ed28aef8 | 151 | mov %rdx,%rcx |
08d62e9f AP |
152 | mov %rdi,%rdx # put aside %rdi |
153 | movups (%rdi),%xmm0 # copy-in context | |
154 | sub \$128+8,%rsp | |
155 | mov 16(%rdi),%eax | |
156 | movaps %xmm0,(%rsp) | |
157 | mov %rsp,%rdi | |
158 | mov %eax,16(%rsp) | |
159 | xor %rax,%rax | |
ed28aef8 | 160 | .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 |
08d62e9f AP |
161 | movaps (%rsp),%xmm0 |
162 | mov 16(%rsp),%eax | |
b1d3e9de | 163 | add \$128+8,%rsp |
08d62e9f AP |
164 | movups %xmm0,(%rdx) # copy-out context |
165 | mov %eax,16(%rdx) | |
ed28aef8 AP |
166 | ret |
167 | .size padlock_sha1_oneshot,.-padlock_sha1_oneshot | |
168 | ||
149ca712 AP |
169 | .globl padlock_sha1_blocks |
170 | .type padlock_sha1_blocks,\@function,3 | |
ed28aef8 | 171 | .align 16 |
149ca712 | 172 | padlock_sha1_blocks: |
ed28aef8 | 173 | mov %rdx,%rcx |
08d62e9f AP |
174 | mov %rdi,%rdx # put aside %rdi |
175 | movups (%rdi),%xmm0 # copy-in context | |
176 | sub \$128+8,%rsp | |
177 | mov 16(%rdi),%eax | |
178 | movaps %xmm0,(%rsp) | |
179 | mov %rsp,%rdi | |
180 | mov %eax,16(%rsp) | |
181 | mov \$-1,%rax | |
ed28aef8 | 182 | .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 |
08d62e9f AP |
183 | movaps (%rsp),%xmm0 |
184 | mov 16(%rsp),%eax | |
b1d3e9de | 185 | add \$128+8,%rsp |
08d62e9f AP |
186 | movups %xmm0,(%rdx) # copy-out context |
187 | mov %eax,16(%rdx) | |
ed28aef8 | 188 | ret |
149ca712 | 189 | .size padlock_sha1_blocks,.-padlock_sha1_blocks |
ed28aef8 AP |
190 | |
191 | .globl padlock_sha256_oneshot | |
192 | .type padlock_sha256_oneshot,\@function,3 | |
193 | .align 16 | |
194 | padlock_sha256_oneshot: | |
ed28aef8 | 195 | mov %rdx,%rcx |
08d62e9f AP |
196 | mov %rdi,%rdx # put aside %rdi |
197 | movups (%rdi),%xmm0 # copy-in context | |
198 | sub \$128+8,%rsp | |
199 | movups 16(%rdi),%xmm1 | |
200 | movaps %xmm0,(%rsp) | |
201 | mov %rsp,%rdi | |
202 | movaps %xmm1,16(%rsp) | |
203 | xor %rax,%rax | |
ed28aef8 | 204 | .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 |
08d62e9f AP |
205 | movaps (%rsp),%xmm0 |
206 | movaps 16(%rsp),%xmm1 | |
b1d3e9de | 207 | add \$128+8,%rsp |
08d62e9f AP |
208 | movups %xmm0,(%rdx) # copy-out context |
209 | movups %xmm1,16(%rdx) | |
ed28aef8 AP |
210 | ret |
211 | .size padlock_sha256_oneshot,.-padlock_sha256_oneshot | |
212 | ||
149ca712 AP |
213 | .globl padlock_sha256_blocks |
214 | .type padlock_sha256_blocks,\@function,3 | |
ed28aef8 | 215 | .align 16 |
149ca712 | 216 | padlock_sha256_blocks: |
ed28aef8 | 217 | mov %rdx,%rcx |
08d62e9f AP |
218 | mov %rdi,%rdx # put aside %rdi |
219 | movups (%rdi),%xmm0 # copy-in context | |
220 | sub \$128+8,%rsp | |
221 | movups 16(%rdi),%xmm1 | |
222 | movaps %xmm0,(%rsp) | |
223 | mov %rsp,%rdi | |
224 | movaps %xmm1,16(%rsp) | |
225 | mov \$-1,%rax | |
ed28aef8 | 226 | .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 |
08d62e9f AP |
227 | movaps (%rsp),%xmm0 |
228 | movaps 16(%rsp),%xmm1 | |
b1d3e9de | 229 | add \$128+8,%rsp |
08d62e9f AP |
230 | movups %xmm0,(%rdx) # copy-out context |
231 | movups %xmm1,16(%rdx) | |
ed28aef8 | 232 | ret |
149ca712 AP |
233 | .size padlock_sha256_blocks,.-padlock_sha256_blocks |
234 | ||
d18762f7 AP |
235 | .globl padlock_sha512_blocks |
236 | .type padlock_sha512_blocks,\@function,3 | |
149ca712 AP |
237 | .align 16 |
238 | padlock_sha512_blocks: | |
239 | mov %rdx,%rcx | |
08d62e9f AP |
240 | mov %rdi,%rdx # put aside %rdi |
241 | movups (%rdi),%xmm0 # copy-in context | |
242 | sub \$128+8,%rsp | |
243 | movups 16(%rdi),%xmm1 | |
244 | movups 32(%rdi),%xmm2 | |
245 | movups 48(%rdi),%xmm3 | |
246 | movaps %xmm0,(%rsp) | |
247 | mov %rsp,%rdi | |
248 | movaps %xmm1,16(%rsp) | |
249 | movaps %xmm2,32(%rsp) | |
250 | movaps %xmm3,48(%rsp) | |
149ca712 | 251 | .byte 0xf3,0x0f,0xa6,0xe0 # rep xha512 |
08d62e9f AP |
252 | movaps (%rsp),%xmm0 |
253 | movaps 16(%rsp),%xmm1 | |
254 | movaps 32(%rsp),%xmm2 | |
255 | movaps 48(%rsp),%xmm3 | |
b1d3e9de | 256 | add \$128+8,%rsp |
08d62e9f AP |
257 | movups %xmm0,(%rdx) # copy-out context |
258 | movups %xmm1,16(%rdx) | |
259 | movups %xmm2,32(%rdx) | |
260 | movups %xmm3,48(%rdx) | |
149ca712 AP |
261 | ret |
262 | .size padlock_sha512_blocks,.-padlock_sha512_blocks | |
ed28aef8 AP |
263 | ___ |
264 | ||
265 | sub generate_mode { | |
266 | my ($mode,$opcode) = @_; | |
267 | # int padlock_$mode_encrypt(void *out, const void *inp, | |
268 | # struct padlock_cipher_data *ctx, size_t len); | |
269 | $code.=<<___; | |
270 | .globl padlock_${mode}_encrypt | |
271 | .type padlock_${mode}_encrypt,\@function,4 | |
272 | .align 16 | |
273 | padlock_${mode}_encrypt: | |
274 | push %rbp | |
275 | push %rbx | |
276 | ||
277 | xor %eax,%eax | |
278 | test \$15,$ctx | |
279 | jnz .L${mode}_abort | |
280 | test \$15,$len | |
281 | jnz .L${mode}_abort | |
282 | lea .Lpadlock_saved_context(%rip),%rax | |
283 | pushf | |
284 | cld | |
285 | call _padlock_verify_ctx | |
286 | lea 16($ctx),$ctx # control word | |
287 | xor %eax,%eax | |
288 | xor %ebx,%ebx | |
33987f2f | 289 | testl \$`1<<5`,($ctx) # align bit in control word |
149ca712 | 290 | jnz .L${mode}_aligned |
ed28aef8 AP |
291 | test \$0x0f,$out |
292 | setz %al # !out_misaligned | |
293 | test \$0x0f,$inp | |
294 | setz %bl # !inp_misaligned | |
295 | test %ebx,%eax | |
296 | jnz .L${mode}_aligned | |
297 | neg %rax | |
298 | mov \$$PADLOCK_CHUNK,$chunk | |
299 | not %rax # out_misaligned?-1:0 | |
300 | lea (%rsp),%rbp | |
301 | cmp $chunk,$len | |
302 | cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len | |
303 | and $chunk,%rax # out_misaligned?chunk:0 | |
304 | mov $len,$chunk | |
305 | neg %rax | |
306 | and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK | |
307 | lea (%rax,%rbp),%rsp | |
ed998634 AP |
308 | mov \$$PADLOCK_CHUNK,%rax |
309 | cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK | |
50452b2e AP |
310 | ___ |
311 | $code.=<<___ if ($mode eq "ctr32"); | |
6c8ce3c2 | 312 | .L${mode}_reenter: |
50452b2e AP |
313 | mov -4($ctx),%eax # pull 32-bit counter |
314 | bswap %eax | |
315 | neg %eax | |
316 | and \$`$PADLOCK_CHUNK/16-1`,%eax | |
ed998634 | 317 | mov \$$PADLOCK_CHUNK,$chunk |
50452b2e | 318 | shl \$4,%eax |
ed998634 | 319 | cmovz $chunk,%rax |
50452b2e AP |
320 | cmp %rax,$len |
321 | cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK | |
ed998634 AP |
322 | cmovbe $len,$chunk |
323 | ___ | |
324 | $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); | |
325 | cmp $chunk,$len | |
326 | ja .L${mode}_loop | |
327 | mov $inp,%rax # check if prefetch crosses page | |
328 | cmp %rsp,%rbp | |
329 | cmove $out,%rax | |
330 | add $len,%rax | |
331 | neg %rax | |
332 | and \$0xfff,%rax # distance to page boundary | |
333 | cmp \$$PADLOCK_PREFETCH{$mode},%rax | |
334 | mov \$-$PADLOCK_PREFETCH{$mode},%rax | |
335 | cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 | |
336 | and %rax,$chunk | |
337 | jz .L${mode}_unaligned_tail | |
50452b2e AP |
338 | ___ |
339 | $code.=<<___; | |
ed28aef8 AP |
340 | jmp .L${mode}_loop |
341 | .align 16 | |
342 | .L${mode}_loop: | |
50452b2e AP |
343 | cmp $len,$chunk # ctr32 artefact |
344 | cmova $len,$chunk # ctr32 artefact | |
ed28aef8 AP |
345 | mov $out,%r8 # save parameters |
346 | mov $inp,%r9 | |
347 | mov $len,%r10 | |
348 | mov $chunk,$len | |
349 | mov $chunk,%r11 | |
350 | test \$0x0f,$out # out_misaligned | |
351 | cmovnz %rsp,$out | |
352 | test \$0x0f,$inp # inp_misaligned | |
353 | jz .L${mode}_inp_aligned | |
354 | shr \$3,$len | |
355 | .byte 0xf3,0x48,0xa5 # rep movsq | |
356 | sub $chunk,$out | |
357 | mov $chunk,$len | |
358 | mov $out,$inp | |
359 | .L${mode}_inp_aligned: | |
360 | lea -16($ctx),%rax # ivp | |
361 | lea 16($ctx),%rbx # key | |
362 | shr \$4,$len | |
363 | .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* | |
364 | ___ | |
365 | $code.=<<___ if ($mode !~ /ecb|ctr/); | |
366 | movdqa (%rax),%xmm0 | |
367 | movdqa %xmm0,-16($ctx) # copy [or refresh] iv | |
368 | ___ | |
50452b2e AP |
369 | $code.=<<___ if ($mode eq "ctr32"); |
370 | mov -4($ctx),%eax # pull 32-bit counter | |
371 | test \$0xffff0000,%eax | |
ed998634 | 372 | jnz .L${mode}_no_carry |
50452b2e AP |
373 | bswap %eax |
374 | add \$0x10000,%eax | |
375 | bswap %eax | |
376 | mov %eax,-4($ctx) | |
ed998634 | 377 | .L${mode}_no_carry: |
50452b2e | 378 | ___ |
ed28aef8 AP |
379 | $code.=<<___; |
380 | mov %r8,$out # restore paramters | |
381 | mov %r11,$chunk | |
382 | test \$0x0f,$out | |
383 | jz .L${mode}_out_aligned | |
384 | mov $chunk,$len | |
ed28aef8 | 385 | lea (%rsp),$inp |
ed998634 | 386 | shr \$3,$len |
ed28aef8 AP |
387 | .byte 0xf3,0x48,0xa5 # rep movsq |
388 | sub $chunk,$out | |
389 | .L${mode}_out_aligned: | |
390 | mov %r9,$inp | |
391 | mov %r10,$len | |
392 | add $chunk,$out | |
393 | add $chunk,$inp | |
394 | sub $chunk,$len | |
395 | mov \$$PADLOCK_CHUNK,$chunk | |
ed998634 AP |
396 | ___ |
397 | if (!$PADLOCK_PREFETCH{$mode}) { | |
398 | $code.=<<___; | |
ed28aef8 | 399 | jnz .L${mode}_loop |
ed998634 AP |
400 | ___ |
401 | } else { | |
402 | $code.=<<___; | |
403 | jz .L${mode}_break | |
404 | cmp $chunk,$len | |
405 | jae .L${mode}_loop | |
406 | ___ | |
407 | $code.=<<___ if ($mode eq "ctr32"); | |
408 | mov $len,$chunk | |
409 | mov $inp,%rax # check if prefetch crosses page | |
6c8ce3c2 | 410 | cmp %rsp,%rbp |
ed998634 AP |
411 | cmove $out,%rax |
412 | add $len,%rax | |
413 | neg %rax | |
414 | and \$0xfff,%rax # distance to page boundary | |
415 | cmp \$$PADLOCK_PREFETCH{$mode},%rax | |
416 | mov \$-$PADLOCK_PREFETCH{$mode},%rax | |
417 | cmovae $chunk,%rax | |
418 | and %rax,$chunk | |
419 | jnz .L${mode}_loop | |
420 | ___ | |
421 | $code.=<<___; | |
422 | .L${mode}_unaligned_tail: | |
423 | xor %eax,%eax | |
424 | cmp %rsp,%rbp | |
425 | cmove $len,%rax | |
426 | mov $out,%r8 # save parameters | |
427 | mov $len,$chunk | |
428 | sub %rax,%rsp # alloca | |
429 | shr \$3,$len | |
430 | lea (%rsp),$out | |
431 | .byte 0xf3,0x48,0xa5 # rep movsq | |
432 | mov %rsp,$inp | |
433 | mov %r8, $out # restore parameters | |
434 | mov $chunk,$len | |
435 | jmp .L${mode}_loop | |
436 | .align 16 | |
437 | .L${mode}_break: | |
438 | ___ | |
439 | } | |
440 | $code.=<<___; | |
441 | cmp %rbp,%rsp | |
6c8ce3c2 AP |
442 | je .L${mode}_done |
443 | ||
444 | pxor %xmm0,%xmm0 | |
445 | lea (%rsp),%rax | |
446 | .L${mode}_bzero: | |
447 | movaps %xmm0,(%rax) | |
448 | lea 16(%rax),%rax | |
449 | cmp %rax,%rbp | |
450 | ja .L${mode}_bzero | |
ed28aef8 | 451 | |
ed28aef8 AP |
452 | .L${mode}_done: |
453 | lea (%rbp),%rsp | |
454 | jmp .L${mode}_exit | |
ed998634 | 455 | |
ed28aef8 AP |
456 | .align 16 |
457 | .L${mode}_aligned: | |
50452b2e AP |
458 | ___ |
459 | $code.=<<___ if ($mode eq "ctr32"); | |
460 | mov -4($ctx),%eax # pull 32-bit counter | |
50452b2e | 461 | bswap %eax |
50452b2e AP |
462 | neg %eax |
463 | and \$0xffff,%eax | |
ed998634 | 464 | mov \$`16*0x10000`,$chunk |
50452b2e | 465 | shl \$4,%eax |
ed998634 | 466 | cmovz $chunk,%rax |
50452b2e AP |
467 | cmp %rax,$len |
468 | cmova %rax,$chunk # don't let counter cross 2^16 | |
ed998634 AP |
469 | cmovbe $len,$chunk |
470 | jbe .L${mode}_aligned_skip | |
471 | ||
50452b2e | 472 | .L${mode}_aligned_loop: |
50452b2e AP |
473 | mov $len,%r10 # save parameters |
474 | mov $chunk,$len | |
475 | mov $chunk,%r11 | |
ed998634 | 476 | |
ed28aef8 AP |
477 | lea -16($ctx),%rax # ivp |
478 | lea 16($ctx),%rbx # key | |
479 | shr \$4,$len # len/=AES_BLOCK_SIZE | |
480 | .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* | |
ed998634 | 481 | |
50452b2e AP |
482 | mov -4($ctx),%eax # pull 32-bit counter |
483 | bswap %eax | |
484 | add \$0x10000,%eax | |
485 | bswap %eax | |
486 | mov %eax,-4($ctx) | |
487 | ||
ed998634 AP |
488 | mov %r10,$len # restore paramters |
489 | sub %r11,$len | |
50452b2e | 490 | mov \$`16*0x10000`,$chunk |
ed998634 AP |
491 | jz .L${mode}_exit |
492 | cmp $chunk,$len | |
493 | jae .L${mode}_aligned_loop | |
494 | ||
495 | .L${mode}_aligned_skip: | |
496 | ___ | |
497 | $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); | |
498 | lea ($inp,$len),%rbp | |
499 | neg %rbp | |
500 | and \$0xfff,%rbp # distance to page boundary | |
501 | xor %eax,%eax | |
502 | cmp \$$PADLOCK_PREFETCH{$mode},%rbp | |
503 | mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp | |
504 | cmovae %rax,%rbp | |
505 | and $len,%rbp # remainder | |
506 | sub %rbp,$len | |
507 | jz .L${mode}_aligned_tail | |
508 | ___ | |
509 | $code.=<<___; | |
510 | lea -16($ctx),%rax # ivp | |
511 | lea 16($ctx),%rbx # key | |
512 | shr \$4,$len # len/=AES_BLOCK_SIZE | |
513 | .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* | |
514 | ___ | |
515 | $code.=<<___ if ($mode !~ /ecb|ctr/); | |
516 | movdqa (%rax),%xmm0 | |
517 | movdqa %xmm0,-16($ctx) # copy [or refresh] iv | |
518 | ___ | |
519 | $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); | |
520 | test %rbp,%rbp # check remainder | |
521 | jz .L${mode}_exit | |
522 | ||
523 | .L${mode}_aligned_tail: | |
524 | mov $out,%r8 | |
525 | mov %rbp,$chunk | |
526 | mov %rbp,$len | |
527 | lea (%rsp),%rbp | |
528 | sub $len,%rsp | |
529 | shr \$3,$len | |
530 | lea (%rsp),$out | |
531 | .byte 0xf3,0x48,0xa5 # rep movsq | |
532 | lea (%r8),$out | |
533 | lea (%rsp),$inp | |
534 | mov $chunk,$len | |
535 | jmp .L${mode}_loop | |
50452b2e | 536 | ___ |
ed28aef8 AP |
537 | $code.=<<___; |
538 | .L${mode}_exit: | |
539 | mov \$1,%eax | |
540 | lea 8(%rsp),%rsp | |
541 | .L${mode}_abort: | |
542 | pop %rbx | |
543 | pop %rbp | |
544 | ret | |
545 | .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt | |
546 | ___ | |
547 | } | |
548 | ||
549 | &generate_mode("ecb",0xc8); | |
550 | &generate_mode("cbc",0xd0); | |
551 | &generate_mode("cfb",0xe0); | |
552 | &generate_mode("ofb",0xe8); | |
50452b2e | 553 | &generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR... |
ed28aef8 AP |
554 | |
555 | $code.=<<___; | |
556 | .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>" | |
557 | .align 16 | |
558 | .data | |
559 | .align 8 | |
560 | .Lpadlock_saved_context: | |
561 | .quad 0 | |
562 | ___ | |
563 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | |
564 | ||
565 | print $code; | |
566 | ||
567 | close STDOUT; |