# NOTE(review): stray "]>" and a "Commit | Line | Data" git-blame table header
# appeared here — artifacts of the blame-view export, not part of the script.
#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows targets use a different calling convention and .asm output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script, falling back to the
# in-tree crypto/perlasm copy.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT is piped through the translator so one
# source serves every assembler dialect.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register aliases used throughout the generated code.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                          ("%rdi","%rsi","%rdx","%rcx");	# Unix order
51 | ||
# Fixed helper routines: CPU capability probe (VIA Centaur and Zhaoxin
# vendor strings), AES key byte-swap, context verification, single-block
# AES, XSTORE RNG access, and the PadLock SHA-1/SHA-256/SHA-512 one-shot
# and multi-block entry points.  The vendor-string immediates are built
# with the unpack("H*",...) trick: the 4-char string is byte-reversed so
# its little-endian hex form matches the register contents after CPUID
# (e.g. 'tneC' -> "Cent", 'hS  ' -> "  Sh" of "  Shanghai  ").
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
284 | ||
# Emit padlock_${mode}_encrypt for one cipher mode.  $opcode selects the
# rep xcrypt* variant.  The generated routine handles three paths:
# misaligned in/out buffers (bounce through an on-stack chunk buffer),
# the prefetch-errata tail (%PADLOCK_PREFETCH modes must not let the
# hardware prefetch cross a page boundary near the end of the input),
# and the fully aligned fast path.  ctr32 additionally clamps each pass
# so the hardware's 16-bit big-endian counter never wraps mid-chunk.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
# Loop back-edge differs for prefetch-errata modes, which must divert
# short tails through an on-stack bounce buffer.
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
}
# Wipe the on-stack bounce buffer (if one was carved out) before
# restoring %rsp — it held plaintext/key-stream material.
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
568 | ||
# Instantiate one encrypt routine per mode; the byte is the rep xcrypt*
# opcode for that mode.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
.quad	0
___
# Late-evaluate `...` fragments (constant folding of the perlish
# expressions embedded in the assembly).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe into the xlate process; an unchecked close would
# silently discard buffered output and translator failures.
close STDOUT or die "error closing STDOUT: $!";