]>
Commit | Line | Data |
---|---|---|
ed28aef8 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | ||
10 | # September 2011 | |
11 | # | |
50452b2e AP |
12 | # Assembler helpers for Padlock engine. See also e_padlock-x86.pl for |
13 | # details. | |
ed28aef8 AP |
14 | |
# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument order is selected for nasm/masm/mingw64 flavours and
# for any output file ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script first, then in
# ../../crypto/perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator.
# The interpreter and translator paths are quoted so that directories
# containing spaces survive the shell, and a failure to start the
# pipeline is fatal instead of silently producing no output.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";

$code=".text\n";
29 | ||
$PADLOCK_CHUNK = 512;	# Must be a power of 2 between 32 and 2^20

# Fixed register roles used by the generated bulk-encryption code.
($ctx, $out, $inp, $len, $chunk) = ("%rdx", "%rdi", "%rsi", "%rcx", "%rbx");

# Registers carrying the first four integer arguments, chosen by the
# calling convention detected above.
if ($win64) {
    ($arg1, $arg2, $arg3, $arg4) = ("%rcx", "%rdx", "%r8", "%r9");	# Win64 order
}
else {
    ($arg1, $arg2, $arg3, $arg4) = ("%rdi", "%rsi", "%rdx", "%rcx");	# Unix order
}
40 | ||
# Mode-independent helper routines, appended to $code as one heredoc:
#
#   padlock_capability      executes CPUID leaf 0 and compares the
#                           vendor string in EBX/EDX/ECX against
#                           "CentaurHauls"; on a match, and if leaf
#                           0xC0000000 reports at least 0xC0000001,
#                           returns EDX of leaf 0xC0000001 with bit 4
#                           forced on, otherwise returns 0.  %rbx is
#                           preserved through %r8.
#   padlock_key_bswap       byte-swaps consecutive 32-bit words starting
#                           at the first argument; the word count is
#                           read from offset 240 of that argument.
#   padlock_verify_context  pushes the flags and calls
#                           _padlock_verify_ctx with the first argument
#                           as context.
#   _padlock_verify_ctx     if bit 30 of the saved flags is set and the
#                           context differs from .Lpadlock_saved_context,
#                           executes pushf/popf; always records the
#                           context as the saved one.
#   padlock_reload_key      executes pushf/popf.
#   padlock_aes_block       one "rep xcryptecb" with count 1, key at
#                           ctx+32 and control word at ctx+16.
#   padlock_xstore          "xstore" with %edx loaded from the second
#                           argument.
#   padlock_sha1_oneshot, padlock_sha256_oneshot
#                           "rep xsha1"/"rep xsha256" with %rax = 0 and
#                           %rcx taken from the third argument.
#   padlock_sha1_blocks, padlock_sha256_blocks
#                           same instructions with %rax = -1.
#   padlock_sha512_blocks   "rep xsha512" with %rcx taken from the third
#                           argument; %rax is not modified beforehand.
#
# The backtick expressions are evaluated by the substitution applied to
# $code just before it is printed.
41 | $code.=<<___; | |
42 | .globl padlock_capability | |
43 | .type padlock_capability,\@abi-omnipotent | |
44 | .align 16 | |
45 | padlock_capability: | |
46 | mov %rbx,%r8 | |
47 | xor %eax,%eax | |
48 | cpuid | |
49 | xor %eax,%eax | |
50 | cmp \$`"0x".unpack("H*",'tneC')`,%ebx | |
51 | jne .Lnoluck | |
52 | cmp \$`"0x".unpack("H*",'Hrua')`,%edx | |
53 | jne .Lnoluck | |
54 | cmp \$`"0x".unpack("H*",'slua')`,%ecx | |
55 | jne .Lnoluck | |
56 | mov \$0xC0000000,%eax | |
57 | cpuid | |
58 | mov %eax,%edx | |
59 | xor %eax,%eax | |
60 | cmp \$0xC0000001,%edx | |
61 | jb .Lnoluck | |
62 | mov \$0xC0000001,%eax | |
63 | cpuid | |
64 | mov %edx,%eax | |
65 | and \$0xffffffef,%eax | |
66 | or \$0x10,%eax # set Nano bit#4 | |
67 | .Lnoluck: | |
68 | mov %r8,%rbx | |
69 | ret | |
70 | .size padlock_capability,.-padlock_capability | |
71 | ||
72 | .globl padlock_key_bswap | |
73 | .type padlock_key_bswap,\@abi-omnipotent,0 | |
74 | .align 16 | |
75 | padlock_key_bswap: | |
76 | mov 240($arg1),%edx | |
77 | .Lbswap_loop: | |
78 | mov ($arg1),%eax | |
79 | bswap %eax | |
80 | mov %eax,($arg1) | |
81 | lea 4($arg1),$arg1 | |
82 | sub \$1,%edx | |
83 | jnz .Lbswap_loop | |
84 | ret | |
85 | .size padlock_key_bswap,.-padlock_key_bswap | |
86 | ||
87 | .globl padlock_verify_context | |
88 | .type padlock_verify_context,\@abi-omnipotent | |
89 | .align 16 | |
90 | padlock_verify_context: | |
91 | mov $arg1,$ctx | |
92 | pushf | |
93 | lea .Lpadlock_saved_context(%rip),%rax | |
94 | call _padlock_verify_ctx | |
95 | lea 8(%rsp),%rsp | |
96 | ret | |
97 | .size padlock_verify_context,.-padlock_verify_context | |
98 | ||
99 | .type _padlock_verify_ctx,\@abi-omnipotent | |
100 | .align 16 | |
101 | _padlock_verify_ctx: | |
102 | mov 8(%rsp),%r8 | |
103 | bt \$30,%r8 | |
104 | jnc .Lverified | |
105 | cmp (%rax),$ctx | |
106 | je .Lverified | |
107 | pushf | |
108 | popf | |
109 | .Lverified: | |
110 | mov $ctx,(%rax) | |
111 | ret | |
112 | .size _padlock_verify_ctx,.-_padlock_verify_ctx | |
113 | ||
114 | .globl padlock_reload_key | |
115 | .type padlock_reload_key,\@abi-omnipotent | |
116 | .align 16 | |
117 | padlock_reload_key: | |
118 | pushf | |
119 | popf | |
120 | ret | |
121 | .size padlock_reload_key,.-padlock_reload_key | |
122 | ||
123 | .globl padlock_aes_block | |
124 | .type padlock_aes_block,\@function,3 | |
125 | .align 16 | |
126 | padlock_aes_block: | |
127 | mov %rbx,%r8 | |
128 | mov \$1,$len | |
129 | lea 32($ctx),%rbx # key | |
130 | lea 16($ctx),$ctx # control word | |
131 | .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb | |
132 | mov %r8,%rbx | |
133 | ret | |
134 | .size padlock_aes_block,.-padlock_aes_block | |
135 | ||
136 | .globl padlock_xstore | |
137 | .type padlock_xstore,\@function,2 | |
138 | .align 16 | |
139 | padlock_xstore: | |
140 | mov %esi,%edx | |
141 | .byte 0x0f,0xa7,0xc0 # xstore | |
142 | ret | |
143 | .size padlock_xstore,.-padlock_xstore | |
144 | ||
145 | .globl padlock_sha1_oneshot | |
146 | .type padlock_sha1_oneshot,\@function,3 | |
147 | .align 16 | |
148 | padlock_sha1_oneshot: | |
149 | xor %rax,%rax | |
150 | mov %rdx,%rcx | |
151 | .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 | |
152 | ret | |
153 | .size padlock_sha1_oneshot,.-padlock_sha1_oneshot | |
154 | ||
149ca712 AP |
155 | .globl padlock_sha1_blocks |
156 | .type padlock_sha1_blocks,\@function,3 | |
ed28aef8 | 157 | .align 16 |
149ca712 | 158 | padlock_sha1_blocks: |
ed28aef8 AP |
159 | mov \$-1,%rax |
160 | mov %rdx,%rcx | |
161 | .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 | |
162 | ret | |
149ca712 | 163 | .size padlock_sha1_blocks,.-padlock_sha1_blocks |
ed28aef8 AP |
164 | |
165 | .globl padlock_sha256_oneshot | |
166 | .type padlock_sha256_oneshot,\@function,3 | |
167 | .align 16 | |
168 | padlock_sha256_oneshot: | |
169 | xor %rax,%rax | |
170 | mov %rdx,%rcx | |
171 | .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 | |
172 | ret | |
173 | .size padlock_sha256_oneshot,.-padlock_sha256_oneshot | |
174 | ||
149ca712 AP |
175 | .globl padlock_sha256_blocks |
176 | .type padlock_sha256_blocks,\@function,3 | |
ed28aef8 | 177 | .align 16 |
149ca712 | 178 | padlock_sha256_blocks: |
ed28aef8 AP |
179 | mov \$-1,%rax |
180 | mov %rdx,%rcx | |
181 | .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 | |
182 | ret | |
149ca712 AP |
183 | .size padlock_sha256_blocks,.-padlock_sha256_blocks |
184 | ||
d18762f7 AP |
185 | .globl padlock_sha512_blocks |
186 | .type padlock_sha512_blocks,\@function,3 | |
149ca712 AP |
187 | .align 16 |
188 | padlock_sha512_blocks: | |
189 | mov %rdx,%rcx | |
190 | .byte 0xf3,0x0f,0xa6,0xe0 # rep xha512 | |
191 | ret | |
192 | .size padlock_sha512_blocks,.-padlock_sha512_blocks | |
ed28aef8 AP |
193 | ___ |
194 | ||
# generate_mode($mode, $opcode)
#
# Appends to the global $code the assembly for
#
#	int padlock_${mode}_encrypt(void *out, const void *inp,
#			struct padlock_cipher_data *ctx, size_t len);
#
#   $mode    suffix used in the symbol and local label names
#            ("ecb", "cbc", "cfb", "ofb" or "ctr32");
#   $opcode  last byte of the "rep xcrypt*" encoding for that mode.
#
# The generated routine returns 0 without touching any data when ctx or
# len is not a multiple of 16, and 1 otherwise.  len == 0 is not
# special-cased.
#
# Two code paths are generated:
#
#  - "aligned": taken when bit 5 of the control word is set or when
#    both inp and out are 16-byte aligned; the data is processed in
#    place.
#  - chunked: data is processed at most PADLOCK_CHUNK bytes at a time.
#    Misaligned input is first copied to the destination of the chunk;
#    when the output is misaligned that destination is a stack buffer,
#    which is copied out after each chunk and zeroed before returning.
#
# For every mode except ecb and ctr32 the 16 bytes at the iv pointer
# left in %rax by xcrypt are copied back to ctx after each xcrypt.
#
# ctr32 only: chunks are additionally cut so that the 32-bit big-endian
# counter stored at the end of the iv does not cross a chunk boundary
# (2^16 blocks on the aligned path), and a carry into the upper 16 bits
# is applied by hand when the low 16 bits read back as zero.
# NOTE(review): this presumes the hardware advances only the low 16
# bits of the counter, as the original fix-up code implies; confirm
# against the VIA documentation.
#
# Changes relative to the previous revision:
#  - the aligned ctr32 path applied the carry after every chunk, even
#    after a final chunk that never reached a 2^16 boundary; the carry
#    is now conditional, as it already was on the chunked path;
#  - the chunked path started with a zero-length chunk whenever len was
#    a multiple of PADLOCK_CHUNK, which in ctr32 mode could trigger the
#    carry fix-up without any block having been processed; the first
#    chunk now defaults to PADLOCK_CHUNK in that case.
sub generate_mode {
my ($mode,$opcode) = @_;

# Prologue, parameter checks and set-up of the chunked path.
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax	# lea/mov leave ZF from the and intact
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___

# ctr32: shorten the first chunk so that later chunks start on a
# counter value that is a multiple of PADLOCK_CHUNK/16.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	jz	.L${mode}_loop
	shl	\$4,%eax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
___

# Chunked loop: bounce misaligned input, then run xcrypt on the chunk.
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___

# Chaining modes: write the iv that %rax points at back into ctx.
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___

# ctr32: the counter is loaded without byte swap, so the mask below
# covers its low-order 16 bits; zero there means a carry is due.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_corr
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_corr:
___

# Copy a bounced chunk out, advance, and on completion wipe the stack
# buffer.  The label for the aligned path follows.
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	shr	\$3,$len
	lea	(%rsp),$inp
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
	jnz	.L${mode}_loop

	test	\$0x0f,$out
	jz	.L${mode}_done

	mov	%rbp,$len
	mov	%rsp,$out
	sub	%rsp,$len
	xor	%rax,%rax
	shr	\$3,$len
	.byte	0xf3,0x48,0xab		# rep stosq
.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___

# ctr32, aligned path: process at most 2^16 blocks per xcrypt and cut
# the first chunk at the next multiple of 2^16 of the counter.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	mov	\$`16*0x10000`,$chunk
	bswap	%eax
	cmp	$len,$chunk
	cmova	$len,$chunk
	neg	%eax
	and	\$0xffff,%eax
	jz	.L${mode}_aligned_loop
	shl	\$4,%eax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	jmp	.L${mode}_aligned_loop
.align	16
.L${mode}_aligned_loop:
	cmp	$len,$chunk
	cmova	$len,$chunk
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11
___

# Aligned path proper: a single xcrypt over the data in place.
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___

$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___

# ctr32, aligned path: carry only when the low 16 bits of the counter
# read back as zero, then loop over the remaining data.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_aligned_no_corr
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_aligned_no_corr:

	mov	%r11,$chunk		# restore parameters
	mov	%r10,$len
	sub	$chunk,$len
	mov	\$`16*0x10000`,$chunk
	jnz	.L${mode}_aligned_loop
___

# Common exit: drop the flags pushed before _padlock_verify_ctx and
# return 1; the abort label is reached with %eax already zero.
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
379 | ||
# Instantiate the bulk routine for every supported mode; the second
# argument is the final opcode byte of the matching "rep xcrypt*".
generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

# Identification string and the slot in which _padlock_verify_ctx
# records the most recently used context.
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___

# Replace every `expression` in the accumulated text with its value.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the translator: close reports both write errors
# and a non-zero exit of the child, so a failure here must be fatal
# rather than leave a truncated or missing output file unnoticed.
close STDOUT or die "error closing STDOUT: $!";