#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# Command line: either "flavour output" or just "output" (flavour is
# then derived by the xlate script from the output file name).
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected for [nm]asm/mingw64 flavours.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# perlasm directory of the source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all of our output through the translator; failure to spawn it
# must be fatal, otherwise we would silently emit nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register aliases used throughout the generated code.
$ctx="%rdx";		# struct padlock_cipher_data *
$out="%rdi";		# output buffer
$inp="%rsi";		# input buffer
$len="%rcx";		# length in bytes
$chunk="%rbx";		# per-iteration chunk size

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
49 | ||
# Leaf assembly routines: capability detection, key byte-swapping,
# SDBP-workaround context verification, single-block AES, xstore RNG
# and the PadLock SHA transforms.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx	# "Cent"aurHauls?
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx	# "  Shanghai  "?
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax	# Centaur extended CPUID range
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax	# PadLock feature flags
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx		# number of rounds
	inc	%edx			# (rounds+1) round keys of 16 bytes,
	shl	\$2,%edx		# i.e. 4*(rounds+1) dwords to swap
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8		# caller-pushed EFLAGS
	bt	\$30,%r8		# EFLAGS bit 30 set by SDBP errata?
	jnc	.Lverified
	cmp	(%rax),$ctx		# same context as last time?
	je	.Lverified
	pushf				# serialize, clearing bit 30
	popf
.Lverified:
	mov	$ctx,(%rax)		# remember current context
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf				# force key reload on next xcrypt
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len		# single block
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax		# "from scratch" mode
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax		# "multi-block" mode
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax		# "from scratch" mode
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax		# "multi-block" mode
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
280 | ||
# Emit one padlock_${mode}_encrypt routine into $code.  $opcode is the
# ModR/M byte of the corresponding "rep xcrypt*" instruction.  The
# routine copies misaligned and/or prefetch-errata-affected data
# through an on-stack bounce buffer in $PADLOCK_CHUNK-sized pieces,
# and for ctr32 additionally keeps the hardware's big-endian 32-bit
# counter from wrapping within one xcrypt invocation.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx		# ctx must be 16-byte aligned
	jnz	.L${mode}_abort
	test	\$15,$len		# len must be multiple of 16
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp	# alloca bounce buffer if needed
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
$code.=<<___;
	cmp	%rbp,%rsp		# did we use the bounce buffer?
	je	.L${mode}_done

	pxor	%xmm0,%xmm0		# scrub key material off the stack
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp		# drop saved EFLAGS
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
564 | ||
# Instantiate one routine per cipher mode; the second argument is the
# ModR/M byte of the matching "rep xcrypt*" instruction.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
.quad	0
___
# Evaluate `...` constant expressions embedded in the assembly text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the xlate script; buffered write errors only
# surface at close, so a failed close must be fatal.
close STDOUT or die "error closing STDOUT: $!";