]>
Commit | Line | Data |
---|---|---|
0b4bb91d AP |
1 | #!/usr/bin/env perl |
2 | ||
31ed9a21 AP |
3 | ############################################################################## |
4 | # # | |
5 | # Copyright (c) 2012, Intel Corporation # | |
6 | # # | |
7 | # All rights reserved. # | |
8 | # # | |
9 | # Redistribution and use in source and binary forms, with or without # | |
10 | # modification, are permitted provided that the following conditions are # | |
11 | # met: # | |
12 | # # | |
13 | # * Redistributions of source code must retain the above copyright # | |
14 | # notice, this list of conditions and the following disclaimer. # | |
15 | # # | |
16 | # * Redistributions in binary form must reproduce the above copyright # | |
17 | # notice, this list of conditions and the following disclaimer in the # | |
18 | # documentation and/or other materials provided with the # | |
19 | # distribution. # | |
20 | # # | |
21 | # * Neither the name of the Intel Corporation nor the names of its # | |
22 | # contributors may be used to endorse or promote products derived from # | |
23 | # this software without specific prior written permission. # | |
24 | # # | |
25 | # # | |
26 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # | |
27 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # | |
28 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # | |
29 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # | |
30 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # | |
31 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # | |
32 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # | |
33 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # | |
34 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # | |
35 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # | |
36 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # | |
37 | # # | |
38 | ############################################################################## | |
39 | # Developers and authors: # | |
40 | # Shay Gueron (1, 2), and Vlad Krasnov (1) # | |
41 | # (1) Intel Architecture Group, Microprocessor and Chipset Development, # | |
42 | # Israel Development Center, Haifa, Israel # | |
43 | # (2) University of Haifa # | |
44 | ############################################################################## | |
45 | # Reference: # | |
46 | # [1] S. Gueron, "Efficient Software Implementations of Modular # | |
47 | # Exponentiation", http://eprint.iacr.org/2011/239 # | |
48 | # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # | |
49 | # IEEE Proceedings of 9th International Conference on Information # | |
50 | # Technology: New Generations (ITNG 2012), 821-823 (2012). # | |
51 | # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation# | |
52 | # Journal of Cryptographic Engineering 2:31-43 (2012). # | |
53 | # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # | |
54 | # resistant 512-bit and 1024-bit modular exponentiation for optimizing # | |
55 | # RSA1024 and RSA2048 on x86_64 platforms", # | |
56 | # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest# | |
57 | ############################################################################## | |
0b4bb91d AP |
58 | |
59 | # While the original submission covers 512- and 1024-bit exponentiation, | |
60 | # this module is limited to the 512-bit version only (and as such | |
61 | # accelerates RSA1024 sign). This is because the improvement for longer | |
62 | # keys is not high enough to justify the effort; the highest measured | |
63 | # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming | |
64 | # at the time of this writing!] Nor does this module implement a | |
65 | # "monolithic" complete exponentiation jumbo-subroutine; it adheres | |
66 | # to a more modular mixture of C and assembly. It is also optimized | |
67 | # for processors other than the Intel Core family (see table below for | |
68 | # improvement coefficients). | |
69 | # <appro@openssl.org> | |
70 | # | |
71 | # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) | |
72 | # ----------------+--------------------------- | |
73 | # Opteron +13% |+5% +20% | |
74 | # Bulldozer -0% |-1% +10% | |
75 | # P4 +11% |+7% +8% | |
76 | # Westmere +5% |+14% +17% | |
77 | # Sandy Bridge +2% |+12% +29% | |
78 | # Ivy Bridge +1% |+11% +35% | |
79 | # Haswell(**) -0% |+12% +39% | |
80 | # Atom +13% |+11% +4% | |
81 | # VIA Nano +70% |+9% +25% | |
82 | # | |
83 | # (*) rsax engine and fips numbers are presented for reference | |
84 | # purposes; | |
87954638 | 85 | # (**) MULX was attempted, but found to give only marginal improvement; |
0b4bb91d AP |
86 | |
# Command line: [flavour] [output]. A first argument that contains a dot is
# taken to be the output file name, in which case no flavour was given.
87 | $flavour = shift; | |
88 | $output = shift; | |
89 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
90 | ||
# $win64 is set for nasm/masm/mingw64 flavours or when the output ends in .asm.
91 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
92 | ||
# Locate the perlasm translator next to this script or under ../../perlasm/.
93 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
94 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
95 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
96 | die "can't locate x86_64-xlate.pl"; | |
97 | ||
# Pipe everything printed on STDOUT through the translator. The translator
# path is quoted so that directories containing spaces work, and failing to
# start the pipe is fatal instead of silently producing no output.
15735e4f | 98 | open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!"; |
0b4bb91d AP |
99 | *STDOUT=*OUT; |
100 | ||
87954638 AP |
# Probe the toolchain. $addx becomes true only when the assembler reports a
# version at or above the thresholds below; it gates emission of the
# MULX/ADCX/ADOX code paths (.Loop_sqrx, .Lmulx, ...) further down.
#
# GNU assembler, queried through the C compiler driver: 2.23 or later.
101 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
102 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
30b9c234 | 103 | $addx = ($1>=2.23); |
87954638 AP |
104 | } |
105 | ||
# NASM (Win64 builds only): 2.10 or later.
106 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
107 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
108 | $addx = ($1>=2.10); | |
109 | } | |
110 | ||
# MASM ml64 (Win64 builds only): major version 12 or later.
111 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
112 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
1b0fe79f | 113 | $addx = ($1>=12); |
87954638 AP |
114 | } |
115 | ||
# clang/LLVM: 3.3 or later. The minor number is scaled by 1/100 so that
# two-digit minors order correctly (3.3 becomes 3.03, 3.10 stays 3.10).
b9749432 | 116 | if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { |
a356e488 AP |
117 | my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 |
118 | $addx = ($ver>=3.03); | |
119 | } | |
120 | ||
0b4bb91d AP |
# Register names of the "common internal API"; presumably consumed by the
# __rsaz_512_* helper routines, which are defined outside this excerpt. The
# lexical declarations inside each block below shadow these names with that
# routine's own argument registers.
121 | ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API |
122 | { | |
# Argument registers of rsaz_512_sqr, in order: out, inp, mod, n0, times.
123 | my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); | |
124 | ||
# Prologue: save the callee-saved registers and reserve 128+24 bytes of
# stack. As used below, 0..120(%rsp) receives the 16-limb square, 128(%rsp)
# keeps n0 and 128+8(%rsp) keeps the remaining iteration count. mod is moved
# to %rbp because %rdx is immediately reloaded with a[0]; %rax gets a[1].
125 | $code.=<<___; | |
126 | .text | |
127 | ||
87954638 AP |
128 | .extern OPENSSL_ia32cap_P |
129 | ||
0b4bb91d | 130 | .globl rsaz_512_sqr |
6efef384 | 131 | .type rsaz_512_sqr,\@function,5 |
0b4bb91d AP |
132 | .align 32 |
133 | rsaz_512_sqr: # 25-29% faster than rsaz_512_mul | |
134 | push %rbx | |
135 | push %rbp | |
136 | push %r12 | |
137 | push %r13 | |
138 | push %r14 | |
139 | push %r15 | |
140 | ||
141 | subq \$128+24, %rsp | |
142 | .Lsqr_body: | |
143 | movq $mod, %rbp # common argument | |
144 | movq ($inp), %rdx | |
145 | movq 8($inp), %rax | |
146 | movq $n0, 128(%rsp) | |
87954638 AP |
147 | ___ |
# Emitted only when the assembler supports ADX. At run time the MULX/ADCX/ADOX
# loop is taken when both bits of mask 0x80100 are set in the 32-bit word at
# byte offset 8 of OPENSSL_ia32cap_P; otherwise execution falls through to
# the generic MULQ loop.
148 | $code.=<<___ if ($addx); | |
149 | movl \$0x80100,%r11d | |
150 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
151 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
152 | je .Loop_sqrx | |
153 | ___ | |
# Generic (MULQ) squaring loop. Each pass computes the 16-limb square of the
# 8-limb input into 0..120(%rsp): iteration i accumulates the cross products
# a[i]*a[i+1..7], doubles two limbs of the running sum and adds the diagonal
# term a[i]^2. On entry to every pass %rdx holds a[0] and %rax holds a[1].
# NOTE(review): after the doubling, the trailing "adcq \$0" of each iteration
# can itself carry out of the limb, and the following instructions overwrite
# the flags without consuming that carry. Upstream OpenSSL later reworked the
# carry handling of this routine (CVE-2019-1551, overflow bug in
# rsaz_512_sqr) - confirm whether that fix must be applied to this revision.
154 | $code.=<<___; | |
0b4bb91d AP |
155 | jmp .Loop_sqr |
156 | ||
157 | .align 32 | |
158 | .Loop_sqr: | |
# spill the iteration count: r8 is reused as an accumulator below
159 | movl $times,128+8(%rsp) | |
0b4bb91d AP |
160 | #first iteration |
# rbx = a[0]; r8..r15 collect the cross products a[0]*a[1..7]
161 | movq %rdx, %rbx | |
162 | mulq %rdx | |
163 | movq %rax, %r8 | |
164 | movq 16($inp), %rax | |
165 | movq %rdx, %r9 | |
166 | ||
167 | mulq %rbx | |
168 | addq %rax, %r9 | |
169 | movq 24($inp), %rax | |
170 | movq %rdx, %r10 | |
171 | adcq \$0, %r10 | |
172 | ||
173 | mulq %rbx | |
174 | addq %rax, %r10 | |
175 | movq 32($inp), %rax | |
176 | movq %rdx, %r11 | |
177 | adcq \$0, %r11 | |
178 | ||
179 | mulq %rbx | |
180 | addq %rax, %r11 | |
181 | movq 40($inp), %rax | |
182 | movq %rdx, %r12 | |
183 | adcq \$0, %r12 | |
184 | ||
185 | mulq %rbx | |
186 | addq %rax, %r12 | |
187 | movq 48($inp), %rax | |
188 | movq %rdx, %r13 | |
189 | adcq \$0, %r13 | |
190 | ||
191 | mulq %rbx | |
192 | addq %rax, %r13 | |
193 | movq 56($inp), %rax | |
194 | movq %rdx, %r14 | |
195 | adcq \$0, %r14 | |
196 | ||
197 | mulq %rbx | |
198 | addq %rax, %r14 | |
199 | movq %rbx, %rax | |
200 | movq %rdx, %r15 | |
201 | adcq \$0, %r15 | |
202 | ||
# double r8 and r9; rcx keeps the pre-doubling r9 so that its top bit
# (extracted by the shrq below) can be shifted into r10 next iteration
203 | addq %r8, %r8 #shlq \$1, %r8 | |
204 | movq %r9, %rcx | |
205 | adcq %r9, %r9 #shld \$1, %r8, %r9 | |
206 | ||
# rax = a[0]: add a[0]^2 and store result limbs 0 and 1
207 | mulq %rax | |
208 | movq %rax, (%rsp) | |
209 | addq %rdx, %r8 | |
210 | adcq \$0, %r9 | |
211 | ||
212 | movq %r8, 8(%rsp) | |
213 | shrq \$63, %rcx | |
214 | ||
215 | #second iteration | |
# r8 = a[1]; a[1]*a[2..7] is added into r10..r15 with rbx carrying between
# limbs, and the new top limb is left in r8
216 | movq 8($inp), %r8 | |
217 | movq 16($inp), %rax | |
218 | mulq %r8 | |
219 | addq %rax, %r10 | |
220 | movq 24($inp), %rax | |
221 | movq %rdx, %rbx | |
222 | adcq \$0, %rbx | |
223 | ||
224 | mulq %r8 | |
225 | addq %rax, %r11 | |
226 | movq 32($inp), %rax | |
227 | adcq \$0, %rdx | |
228 | addq %rbx, %r11 | |
229 | movq %rdx, %rbx | |
230 | adcq \$0, %rbx | |
231 | ||
232 | mulq %r8 | |
233 | addq %rax, %r12 | |
234 | movq 40($inp), %rax | |
235 | adcq \$0, %rdx | |
236 | addq %rbx, %r12 | |
237 | movq %rdx, %rbx | |
238 | adcq \$0, %rbx | |
239 | ||
240 | mulq %r8 | |
241 | addq %rax, %r13 | |
242 | movq 48($inp), %rax | |
243 | adcq \$0, %rdx | |
244 | addq %rbx, %r13 | |
245 | movq %rdx, %rbx | |
246 | adcq \$0, %rbx | |
247 | ||
248 | mulq %r8 | |
249 | addq %rax, %r14 | |
250 | movq 56($inp), %rax | |
251 | adcq \$0, %rdx | |
252 | addq %rbx, %r14 | |
253 | movq %rdx, %rbx | |
254 | adcq \$0, %rbx | |
255 | ||
256 | mulq %r8 | |
257 | addq %rax, %r15 | |
258 | movq %r8, %rax | |
259 | adcq \$0, %rdx | |
260 | addq %rbx, %r15 | |
261 | movq %rdx, %r8 | |
262 | movq %r10, %rdx | |
263 | adcq \$0, %r8 | |
264 | ||
# double r10 (shifting in the bit held in rcx) and r11: the add on the copy
# in rdx only produces the carry for the adcq, since lea leaves flags alone;
# rbx keeps the pre-doubling r11 for its top bit
265 | add %rdx, %rdx | |
266 | lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
267 | movq %r11, %rbx | |
268 | adcq %r11, %r11 #shld \$1, %r10, %r11 | |
269 | ||
# rax = a[1]: add a[1]^2 and store result limbs 2 and 3
270 | mulq %rax | |
271 | addq %rax, %r9 | |
272 | adcq %rdx, %r10 | |
273 | adcq \$0, %r11 | |
274 | ||
275 | movq %r9, 16(%rsp) | |
276 | movq %r10, 24(%rsp) | |
277 | shrq \$63, %rbx | |
278 | ||
279 | #third iteration | |
# r9 = a[2]; a[2]*a[3..7] is added into r12..r15 and r8 with rcx carrying
# between limbs, and the new top limb is left in r9. The doubling of r12
# (bit from rbx) is interleaved with the multiplications; r10 keeps the
# pre-doubling r12 for its top bit.
280 | movq 16($inp), %r9 | |
281 | movq 24($inp), %rax | |
282 | mulq %r9 | |
283 | addq %rax, %r12 | |
284 | movq 32($inp), %rax | |
285 | movq %rdx, %rcx | |
286 | adcq \$0, %rcx | |
287 | ||
288 | mulq %r9 | |
289 | addq %rax, %r13 | |
290 | movq 40($inp), %rax | |
291 | adcq \$0, %rdx | |
292 | addq %rcx, %r13 | |
293 | movq %rdx, %rcx | |
294 | adcq \$0, %rcx | |
295 | ||
296 | mulq %r9 | |
297 | addq %rax, %r14 | |
298 | movq 48($inp), %rax | |
299 | adcq \$0, %rdx | |
300 | addq %rcx, %r14 | |
301 | movq %rdx, %rcx | |
302 | adcq \$0, %rcx | |
303 | ||
304 | mulq %r9 | |
305 | movq %r12, %r10 | |
306 | lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 | |
307 | addq %rax, %r15 | |
308 | movq 56($inp), %rax | |
309 | adcq \$0, %rdx | |
310 | addq %rcx, %r15 | |
311 | movq %rdx, %rcx | |
312 | adcq \$0, %rcx | |
313 | ||
314 | mulq %r9 | |
315 | shrq \$63, %r10 | |
316 | addq %rax, %r8 | |
317 | movq %r9, %rax | |
318 | adcq \$0, %rdx | |
319 | addq %rcx, %r8 | |
320 | movq %rdx, %r9 | |
321 | adcq \$0, %r9 | |
322 | ||
# double r13 (bit from r10); rcx keeps the pre-doubling r13 for its top bit
323 | movq %r13, %rcx | |
324 | leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 | |
325 | ||
# rax = a[2]: add a[2]^2 and store result limbs 4 and 5
326 | mulq %rax | |
327 | addq %rax, %r11 | |
328 | adcq %rdx, %r12 | |
329 | adcq \$0, %r13 | |
330 | ||
331 | movq %r11, 32(%rsp) | |
332 | movq %r12, 40(%rsp) | |
333 | shrq \$63, %rcx | |
334 | ||
335 | #fourth iteration | |
# r10 = a[3]; a[3]*a[4..7] is added into r14, r15, r8 and r9 with rbx
# carrying between limbs, and the new top limb is left in r10. r14 is
# doubled (bit from rcx) along the way; r12 keeps its pre-doubling value.
336 | movq 24($inp), %r10 | |
337 | movq 32($inp), %rax | |
338 | mulq %r10 | |
339 | addq %rax, %r14 | |
340 | movq 40($inp), %rax | |
341 | movq %rdx, %rbx | |
342 | adcq \$0, %rbx | |
343 | ||
344 | mulq %r10 | |
345 | addq %rax, %r15 | |
346 | movq 48($inp), %rax | |
347 | adcq \$0, %rdx | |
348 | addq %rbx, %r15 | |
349 | movq %rdx, %rbx | |
350 | adcq \$0, %rbx | |
351 | ||
352 | mulq %r10 | |
353 | movq %r14, %r12 | |
354 | leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 | |
355 | addq %rax, %r8 | |
356 | movq 56($inp), %rax | |
357 | adcq \$0, %rdx | |
358 | addq %rbx, %r8 | |
359 | movq %rdx, %rbx | |
360 | adcq \$0, %rbx | |
361 | ||
362 | mulq %r10 | |
363 | shrq \$63, %r12 | |
364 | addq %rax, %r9 | |
365 | movq %r10, %rax | |
366 | adcq \$0, %rdx | |
367 | addq %rbx, %r9 | |
368 | movq %rdx, %r10 | |
369 | adcq \$0, %r10 | |
370 | ||
# double r15 (bit from r12); rbx keeps the pre-doubling r15 for its top bit
371 | movq %r15, %rbx | |
372 | leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 | |
373 | ||
# rax = a[3]: add a[3]^2 and store result limbs 6 and 7
374 | mulq %rax | |
375 | addq %rax, %r13 | |
376 | adcq %rdx, %r14 | |
377 | adcq \$0, %r15 | |
378 | ||
379 | movq %r13, 48(%rsp) | |
380 | movq %r14, 56(%rsp) | |
381 | shrq \$63, %rbx | |
382 | ||
383 | #fifth iteration | |
# r11 = a[4]; a[4]*a[5..7] is added into r8, r9 and r10 with rcx carrying
# between limbs, and the new top limb is left in r11. r8 is doubled (bit
# from rbx) along the way; r12 keeps its pre-doubling value.
384 | movq 32($inp), %r11 | |
385 | movq 40($inp), %rax | |
386 | mulq %r11 | |
387 | addq %rax, %r8 | |
388 | movq 48($inp), %rax | |
389 | movq %rdx, %rcx | |
390 | adcq \$0, %rcx | |
391 | ||
392 | mulq %r11 | |
393 | addq %rax, %r9 | |
394 | movq 56($inp), %rax | |
395 | adcq \$0, %rdx | |
396 | movq %r8, %r12 | |
397 | leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 | |
398 | addq %rcx, %r9 | |
399 | movq %rdx, %rcx | |
400 | adcq \$0, %rcx | |
401 | ||
402 | mulq %r11 | |
403 | shrq \$63, %r12 | |
404 | addq %rax, %r10 | |
405 | movq %r11, %rax | |
406 | adcq \$0, %rdx | |
407 | addq %rcx, %r10 | |
408 | movq %rdx, %r11 | |
409 | adcq \$0, %r11 | |
410 | ||
# double r9 (bit from r12); rcx keeps the pre-doubling r9 for its top bit
411 | movq %r9, %rcx | |
412 | leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 | |
413 | ||
# rax = a[4]: add a[4]^2 and store result limbs 8 and 9
414 | mulq %rax | |
415 | addq %rax, %r15 | |
416 | adcq %rdx, %r8 | |
417 | adcq \$0, %r9 | |
418 | ||
419 | movq %r15, 64(%rsp) | |
420 | movq %r8, 72(%rsp) | |
421 | shrq \$63, %rcx | |
422 | ||
423 | #sixth iteration | |
# r12 = a[5]; a[5]*a[6..7] is added into r10 and r11, and the new top limb
# is left in r12. r10 is doubled (bit from rcx) along the way; r15 keeps
# its pre-doubling value.
424 | movq 40($inp), %r12 | |
425 | movq 48($inp), %rax | |
426 | mulq %r12 | |
427 | addq %rax, %r10 | |
428 | movq 56($inp), %rax | |
429 | movq %rdx, %rbx | |
430 | adcq \$0, %rbx | |
431 | ||
432 | mulq %r12 | |
433 | addq %rax, %r11 | |
434 | movq %r12, %rax | |
435 | movq %r10, %r15 | |
436 | leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
437 | adcq \$0, %rdx | |
438 | shrq \$63, %r15 | |
439 | addq %rbx, %r11 | |
440 | movq %rdx, %r12 | |
441 | adcq \$0, %r12 | |
442 | ||
# double r11 (bit from r15); rbx keeps the pre-doubling r11, whose top bit
# is shifted out into the carry flag at the start of the next doubling
443 | movq %r11, %rbx | |
444 | leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 | |
445 | ||
# rax = a[5]: add a[5]^2 and store result limbs 10 and 11
446 | mulq %rax | |
447 | addq %rax, %r9 | |
448 | adcq %rdx, %r10 | |
449 | adcq \$0, %r11 | |
450 | ||
451 | movq %r9, 80(%rsp) | |
452 | movq %r10, 88(%rsp) | |
453 | ||
454 | #seventh iteration | |
# r13 = a[6]; the single cross product a[6]*a[7] is added into r12 and its
# high half is left in r13
455 | movq 48($inp), %r13 | |
456 | movq 56($inp), %rax | |
457 | mulq %r13 | |
458 | addq %rax, %r12 | |
459 | movq %r13, %rax | |
460 | movq %rdx, %r13 | |
461 | adcq \$0, %r13 | |
462 | ||
# double the remaining limbs through the carry flag: shlq moves the top bit
# of the saved r11 into CF, the adcq chain doubles r12 and r13, and r14
# (zeroed first) receives the final carry
463 | xorq %r14, %r14 | |
464 | shlq \$1, %rbx | |
465 | adcq %r12, %r12 #shld \$1, %rbx, %r12 | |
466 | adcq %r13, %r13 #shld \$1, %r12, %r13 | |
467 | adcq %r14, %r14 #shld \$1, %r13, %r14 | |
468 | ||
# rax = a[6]: add a[6]^2 and store result limbs 12 and 13
469 | mulq %rax | |
470 | addq %rax, %r11 | |
471 | adcq %rdx, %r12 | |
472 | adcq \$0, %r13 | |
473 | ||
474 | movq %r11, 96(%rsp) | |
475 | movq %r12, 104(%rsp) | |
476 | ||
477 | #eighth iteration | |
# only the diagonal term a[7]^2 is left; store result limbs 14 and 15
478 | movq 56($inp), %rax | |
479 | mulq %rax | |
480 | addq %rax, %r13 | |
481 | adcq \$0, %rdx | |
482 | ||
483 | addq %rdx, %r14 | |
484 | ||
485 | movq %r13, 112(%rsp) | |
486 | movq %r14, 120(%rsp) | |
87954638 AP |
487 | |
# load the low eight limbs of the square into r8..r15 for
# __rsaz_512_reduce (defined outside this excerpt; presumably the
# Montgomery reduction step - confirm against its definition)
488 | movq (%rsp), %r8 | |
489 | movq 8(%rsp), %r9 | |
490 | movq 16(%rsp), %r10 | |
491 | movq 24(%rsp), %r11 | |
492 | movq 32(%rsp), %r12 | |
493 | movq 40(%rsp), %r13 | |
494 | movq 48(%rsp), %r14 | |
495 | movq 56(%rsp), %r15 | |
496 | ||
497 | call __rsaz_512_reduce | |
498 | ||
# add the high eight limbs; the sbbq turns the carry out of that addition
# into rcx = 0 or all-ones before __rsaz_512_subtract is called
499 | addq 64(%rsp), %r8 | |
500 | adcq 72(%rsp), %r9 | |
501 | adcq 80(%rsp), %r10 | |
502 | adcq 88(%rsp), %r11 | |
503 | adcq 96(%rsp), %r12 | |
504 | adcq 104(%rsp), %r13 | |
505 | adcq 112(%rsp), %r14 | |
506 | adcq 120(%rsp), %r15 | |
507 | sbbq %rcx, %rcx | |
508 | ||
509 | call __rsaz_512_subtract | |
510 | ||
# set up the next pass: rdx and rax are preloaded from r8 and r9
# (presumably limbs 0 and 1 of the value just produced - confirm), the
# iteration count is reloaded from the stack and the output buffer becomes
# the input
511 | movq %r8, %rdx | |
512 | movq %r9, %rax | |
513 | movl 128+8(%rsp), $times | |
514 | movq $out, $inp | |
515 | ||
516 | decl $times | |
517 | jnz .Loop_sqr | |
0b4bb91d | 518 | ___ |
# MULX/ADCX/ADOX squaring loop, emitted only when the assembler supports ADX.
# It follows the same schedule as the MULQ loop (cross products, doubling,
# diagonal term) and stores the 16-limb square at 0..120(%rsp). The generic
# loop jumps over it to .Lsqr_tail. Several instructions are emitted as raw
# .byte sequences; the trailing comment on each gives the intended mnemonic.
87954638 | 519 | if ($addx) { |
0b4bb91d | 520 | $code.=<<___; |
87954638 AP |
521 | jmp .Lsqr_tail |
522 | ||
523 | .align 32 | |
524 | .Loop_sqrx: | |
# spill the iteration count, and park rdi and rbp in xmm0/xmm1 because
# both are used as scratch registers inside the loop
525 | movl $times,128+8(%rsp) | |
0b4bb91d | 526 | movq $out, %xmm0 # off-load |
87954638 | 527 | movq %rbp, %xmm1 # off-load |
0b4bb91d AP |
528 | #first iteration |
# rdx = a[0], rax = a[1]; r8..r15 collect a[0]*a[1..7], with the low halves
# folded in through the adcx chain and rbp serving as a zero register
529 | mulx %rax, %r8, %r9 | |
530 | ||
531 | mulx 16($inp), %rcx, %r10 | |
87954638 | 532 | xor %rbp, %rbp # cf=0, of=0 |
0b4bb91d AP |
533 | |
534 | mulx 24($inp), %rax, %r11 | |
87954638 | 535 | adcx %rcx, %r9 |
0b4bb91d AP |
536 | |
537 | mulx 32($inp), %rcx, %r12 | |
87954638 | 538 | adcx %rax, %r10 |
0b4bb91d AP |
539 | |
540 | mulx 40($inp), %rax, %r13 | |
87954638 | 541 | adcx %rcx, %r11 |
0b4bb91d | 542 | |
87954638 AP |
543 | .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 |
544 | adcx %rax, %r12 | |
545 | adcx %rcx, %r13 | |
0b4bb91d | 546 | |
87954638 AP |
547 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 |
548 | adcx %rax, %r14 | |
549 | adcx %rbp, %r15 # %rbp is 0 | |
0b4bb91d | 550 | |
# double r9:r8 with shld/shl; rcx keeps the pre-doubling r9, whose top bit
# is shifted into r10 by the next iteration
87954638 | 551 | mov %r9, %rcx |
0b4bb91d AP |
552 | shld \$1, %r8, %r9 |
553 | shl \$1, %r8 | |
554 | ||
# add a[0]^2 (flags cleared first), preload rdx with a[1] and store result
# limbs 0 and 1
87954638 | 555 | xor %ebp, %ebp |
0b4bb91d | 556 | mulx %rdx, %rax, %rdx |
87954638 AP |
557 | adcx %rdx, %r8 |
558 | mov 8($inp), %rdx | |
559 | adcx %rbp, %r9 | |
0b4bb91d AP |
560 | |
561 | mov %rax, (%rsp) | |
562 | mov %r8, 8(%rsp) | |
563 | ||
564 | #second iteration | |
# rdx = a[1]; a[1]*a[2..7] is added into r10..r15 and r8: low halves go
# through the OF chain (adox), high halves through the CF chain (adcx),
# with rax and rdi alternating as low-half temporaries
0b4bb91d | 565 | mulx 16($inp), %rax, %rbx |
87954638 AP |
566 | adox %rax, %r10 |
567 | adcx %rbx, %r11 | |
0b4bb91d | 568 | |
87954638 AP |
569 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 |
570 | adox $out, %r11 | |
571 | adcx %r8, %r12 | |
0b4bb91d AP |
572 | |
573 | mulx 32($inp), %rax, %rbx | |
87954638 AP |
574 | adox %rax, %r12 |
575 | adcx %rbx, %r13 | |
0b4bb91d AP |
576 | |
577 | mulx 40($inp), $out, %r8 | |
87954638 AP |
578 | adox $out, %r13 |
579 | adcx %r8, %r14 | |
0b4bb91d | 580 | |
87954638 AP |
581 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
582 | adox %rax, %r14 | |
583 | adcx %rbx, %r15 | |
0b4bb91d | 584 | |
87954638 AP |
585 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 |
586 | adox $out, %r15 | |
587 | adcx %rbp, %r8 | |
588 | adox %rbp, %r8 | |
0b4bb91d | 589 | |
# double r11:r10, shifting in the top bit of the value saved in rcx; rbx
# keeps the pre-doubling r11
87954638 | 590 | mov %r11, %rbx |
0b4bb91d AP |
591 | shld \$1, %r10, %r11 |
592 | shld \$1, %rcx, %r10 | |
593 | ||
# add a[1]^2, preload rdx with a[2] and store result limbs 2 and 3
87954638 | 594 | xor %ebp,%ebp |
0b4bb91d | 595 | mulx %rdx, %rax, %rcx |
87954638 AP |
596 | mov 16($inp), %rdx |
597 | adcx %rax, %r9 | |
598 | adcx %rcx, %r10 | |
599 | adcx %rbp, %r11 | |
0b4bb91d AP |
600 | |
601 | mov %r9, 16(%rsp) | |
87954638 | 602 | .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) |
0b4bb91d AP |
603 | |
604 | #third iteration | |
# rdx = a[2]; a[2]*a[3..7] is added into r12..r15, r8 and r9 using the same
# adox/adcx split as the previous iteration
87954638 AP |
605 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 |
606 | adox $out, %r12 | |
607 | adcx %r9, %r13 | |
0b4bb91d AP |
608 | |
609 | mulx 32($inp), %rax, %rcx | |
87954638 AP |
610 | adox %rax, %r13 |
611 | adcx %rcx, %r14 | |
0b4bb91d AP |
612 | |
613 | mulx 40($inp), $out, %r9 | |
87954638 AP |
614 | adox $out, %r14 |
615 | adcx %r9, %r15 | |
0b4bb91d | 616 | |
87954638 AP |
617 | .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx |
618 | adox %rax, %r15 | |
619 | adcx %rcx, %r8 | |
0b4bb91d | 620 | |
87954638 AP |
621 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 |
622 | adox $out, %r8 | |
623 | adcx %rbp, %r9 | |
624 | adox %rbp, %r9 | |
0b4bb91d | 625 | |
# double r13:r12, shifting in the top bit of the value saved in rbx; rcx
# keeps the pre-doubling r13
87954638 | 626 | mov %r13, %rcx |
0b4bb91d AP |
627 | shld \$1, %r12, %r13 |
628 | shld \$1, %rbx, %r12 | |
629 | ||
# add a[2]^2, preload rdx with a[3] and store result limbs 4 and 5
87954638 | 630 | xor %ebp, %ebp |
0b4bb91d | 631 | mulx %rdx, %rax, %rdx |
87954638 AP |
632 | adcx %rax, %r11 |
633 | adcx %rdx, %r12 | |
634 | mov 24($inp), %rdx | |
635 | adcx %rbp, %r13 | |
0b4bb91d AP |
636 | |
637 | mov %r11, 32(%rsp) | |
87954638 | 638 | .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) |
0b4bb91d AP |
639 | |
640 | #fourth iteration | |
# rdx = a[3]; a[3]*a[4..7] is added into r14, r15, r8, r9 and r10
87954638 AP |
641 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx |
642 | adox %rax, %r14 | |
643 | adcx %rbx, %r15 | |
0b4bb91d AP |
644 | |
645 | mulx 40($inp), $out, %r10 | |
87954638 AP |
646 | adox $out, %r15 |
647 | adcx %r10, %r8 | |
0b4bb91d AP |
648 | |
649 | mulx 48($inp), %rax, %rbx | |
87954638 AP |
650 | adox %rax, %r8 |
651 | adcx %rbx, %r9 | |
0b4bb91d AP |
652 | |
653 | mulx 56($inp), $out, %r10 | |
87954638 AP |
654 | adox $out, %r9 |
655 | adcx %rbp, %r10 | |
656 | adox %rbp, %r10 | |
0b4bb91d | 657 | |
# double r15:r14, shifting in the top bit of the value saved in rcx; rbx
# keeps the pre-doubling r15
87954638 AP |
658 | .byte 0x66 |
659 | mov %r15, %rbx | |
0b4bb91d AP |
660 | shld \$1, %r14, %r15 |
661 | shld \$1, %rcx, %r14 | |
662 | ||
# add a[3]^2, preload rdx with a[4] and store result limbs 6 and 7
87954638 | 663 | xor %ebp, %ebp |
0b4bb91d | 664 | mulx %rdx, %rax, %rdx |
87954638 AP |
665 | adcx %rax, %r13 |
666 | adcx %rdx, %r14 | |
667 | mov 32($inp), %rdx | |
668 | adcx %rbp, %r15 | |
0b4bb91d AP |
669 | |
670 | mov %r13, 48(%rsp) | |
671 | mov %r14, 56(%rsp) | |
672 | ||
673 | #fifth iteration | |
# rdx = a[4]; a[4]*a[5..7] is added into r8, r9, r10 and r11
87954638 AP |
674 | .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 |
675 | adox $out, %r8 | |
676 | adcx %r11, %r9 | |
0b4bb91d AP |
677 | |
678 | mulx 48($inp), %rax, %rcx | |
87954638 AP |
679 | adox %rax, %r9 |
680 | adcx %rcx, %r10 | |
0b4bb91d AP |
681 | |
682 | mulx 56($inp), $out, %r11 | |
87954638 AP |
683 | adox $out, %r10 |
684 | adcx %rbp, %r11 | |
685 | adox %rbp, %r11 | |
0b4bb91d AP |
686 | |
# double r9:r8, shifting in the top bit of the value saved in rbx; rcx
# keeps the pre-doubling r9
687 | mov %r9, %rcx | |
688 | shld \$1, %r8, %r9 | |
689 | shld \$1, %rbx, %r8 | |
690 | ||
# add a[4]^2, preload rdx with a[5] and store result limbs 8 and 9
87954638 | 691 | xor %ebp, %ebp |
0b4bb91d | 692 | mulx %rdx, %rax, %rdx |
87954638 AP |
693 | adcx %rax, %r15 |
694 | adcx %rdx, %r8 | |
695 | mov 40($inp), %rdx | |
696 | adcx %rbp, %r9 | |
0b4bb91d AP |
697 | |
698 | mov %r15, 64(%rsp) | |
699 | mov %r8, 72(%rsp) | |
700 | ||
701 | #sixth iteration | |
# rdx = a[5]; a[5]*a[6..7] is added into r10, r11 and r12
87954638 AP |
702 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
703 | adox %rax, %r10 | |
704 | adcx %rbx, %r11 | |
0b4bb91d | 705 | |
87954638 AP |
706 | .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 |
707 | adox $out, %r11 | |
708 | adcx %rbp, %r12 | |
709 | adox %rbp, %r12 | |
0b4bb91d AP |
710 | |
# double r11:r10, shifting in the top bit of the value saved in rcx; rbx
# keeps the pre-doubling r11
711 | mov %r11, %rbx | |
712 | shld \$1, %r10, %r11 | |
713 | shld \$1, %rcx, %r10 | |
714 | ||
# add a[5]^2, preload rdx with a[6] and store result limbs 10 and 11
87954638 | 715 | xor %ebp, %ebp |
0b4bb91d | 716 | mulx %rdx, %rax, %rdx |
87954638 AP |
717 | adcx %rax, %r9 |
718 | adcx %rdx, %r10 | |
719 | mov 48($inp), %rdx | |
720 | adcx %rbp, %r11 | |
0b4bb91d AP |
721 | |
722 | mov %r9, 80(%rsp) | |
723 | mov %r10, 88(%rsp) | |
724 | ||
725 | #seventh iteration | |
# rdx = a[6]; the single cross product a[6]*a[7] is added into r12 and r13
87954638 AP |
726 | .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 |
727 | adox %rax, %r12 | |
728 | adox %rbp, %r13 | |
0b4bb91d AP |
729 | |
# double r14:r13:r12 from the top limb down (r14 is zeroed first and ends
# up holding the bit shifted out of r13), shifting the top bit of the
# value saved in rbx into r12
730 | xor %r14, %r14 | |
731 | shld \$1, %r13, %r14 | |
732 | shld \$1, %r12, %r13 | |
733 | shld \$1, %rbx, %r12 | |
734 | ||
# add a[6]^2, preload rdx with a[7] and store result limbs 12 and 13
87954638 | 735 | xor %ebp, %ebp |
0b4bb91d | 736 | mulx %rdx, %rax, %rdx |
87954638 AP |
737 | adcx %rax, %r11 |
738 | adcx %rdx, %r12 | |
739 | mov 56($inp), %rdx | |
740 | adcx %rbp, %r13 | |
0b4bb91d | 741 | |
87954638 AP |
742 | .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) |
743 | .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) | |
0b4bb91d AP |
744 | |
745 | #eighth iteration | |
# only the diagonal term a[7]^2 is left; store result limbs 14 and 15
0b4bb91d | 746 | mulx %rdx, %rax, %rdx |
87954638 AP |
747 | adox %rax, %r13 |
748 | adox %rbp, %rdx | |
749 | ||
750 | .byte 0x66 | |
0b4bb91d AP |
751 | add %rdx, %r14 |
752 | ||
753 | movq %r13, 112(%rsp) | |
754 | movq %r14, 120(%rsp) | |
# restore rdi and rbp from xmm0/xmm1, reload n0 into rdx and the low eight
# limbs of the square into r8..r15 for __rsaz_512_reducex (defined outside
# this excerpt; presumably the MULX flavour of the reduction - confirm)
755 | movq %xmm0, $out | |
87954638 AP |
756 | movq %xmm1, %rbp |
757 | ||
758 | movq 128(%rsp), %rdx # pull $n0 | |
0b4bb91d AP |
759 | movq (%rsp), %r8 |
760 | movq 8(%rsp), %r9 | |
761 | movq 16(%rsp), %r10 | |
762 | movq 24(%rsp), %r11 | |
763 | movq 32(%rsp), %r12 | |
764 | movq 40(%rsp), %r13 | |
765 | movq 48(%rsp), %r14 | |
766 | movq 56(%rsp), %r15 | |
767 | ||
87954638 | 768 | call __rsaz_512_reducex |
0b4bb91d AP |
769 | |
# add the high eight limbs; the sbbq turns the carry out of that addition
# into rcx = 0 or all-ones before __rsaz_512_subtract is called
770 | addq 64(%rsp), %r8 | |
771 | adcq 72(%rsp), %r9 | |
772 | adcq 80(%rsp), %r10 | |
773 | adcq 88(%rsp), %r11 | |
774 | adcq 96(%rsp), %r12 | |
775 | adcq 104(%rsp), %r13 | |
776 | adcq 112(%rsp), %r14 | |
777 | adcq 120(%rsp), %r15 | |
778 | sbbq %rcx, %rcx | |
779 | ||
87954638 | 780 | call __rsaz_512_subtract |
0b4bb91d AP |
781 | |
# set up the next pass: rdx and rax are preloaded from r8 and r9, the
# iteration count is reloaded from the stack and the output buffer becomes
# the input
782 | movq %r8, %rdx | |
783 | movq %r9, %rax | |
784 | movl 128+8(%rsp), $times | |
785 | movq $out, $inp | |
786 | ||
787 | decl $times | |
87954638 AP |
788 | jnz .Loop_sqrx |
789 | ||
790 | .Lsqr_tail: | |
791 | ___ | |
792 | } | |
# Common exit of rsaz_512_sqr: %rax is pointed just past the six registers
# pushed in the prologue (128+24 bytes of locals plus 48 bytes of saved
# registers), they are reloaded and the stack pointer is restored.
793 | $code.=<<___; | |
0b4bb91d AP |
794 | |
795 | leaq 128+24+48(%rsp), %rax | |
796 | movq -48(%rax), %r15 | |
797 | movq -40(%rax), %r14 | |
798 | movq -32(%rax), %r13 | |
799 | movq -24(%rax), %r12 | |
800 | movq -16(%rax), %rbp | |
801 | movq -8(%rax), %rbx | |
802 | leaq (%rax), %rsp | |
803 | .Lsqr_epilogue: | |
804 | ret | |
805 | .size rsaz_512_sqr,.-rsaz_512_sqr | |
806 | ___ | |
807 | } | |
808 | { | |
# Argument registers of rsaz_512_mul, in order: out, ap, bp, mod, n0.
809 | my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); | |
# Prologue: save the callee-saved registers, reserve 128+24 bytes of stack,
# park out and mod in xmm0/xmm1 and keep n0 at 128(%rsp). The 16-limb
# product is later read back from 0..120(%rsp).
810 | $code.=<<___; | |
fd8ad019 | 811 | .globl rsaz_512_mul |
0b4bb91d AP |
812 | .type rsaz_512_mul,\@function,5 |
813 | .align 32 | |
814 | rsaz_512_mul: | |
815 | push %rbx | |
816 | push %rbp | |
817 | push %r12 | |
818 | push %r13 | |
819 | push %r14 | |
820 | push %r15 | |
821 | ||
822 | subq \$128+24, %rsp | |
823 | .Lmul_body: | |
824 | movq $out, %xmm0 # off-load arguments | |
825 | movq $mod, %xmm1 | |
826 | movq $n0, 128(%rsp) | |
87954638 AP |
827 | ___ |
# Emitted only when the assembler supports ADX: run-time dispatch to the MULX
# path when both bits of mask 0x80100 are set in the 32-bit word at byte
# offset 8 of OPENSSL_ia32cap_P.
828 | $code.=<<___ if ($addx); | |
829 | movl \$0x80100,%r11d | |
830 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
831 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
832 | je .Lmulx | |
833 | ___ | |
# Generic path: b[0] goes to %rbx and the bp pointer to %rbp for
# __rsaz_512_mul (defined outside this excerpt). Afterwards out and mod are
# restored, and the low eight product limbs are loaded into r8..r15 for
# __rsaz_512_reduce.
834 | $code.=<<___; | |
835 | movq ($bp), %rbx # pass b[0] | |
0b4bb91d | 836 | movq $bp, %rbp # pass argument |
fd8ad019 | 837 | call __rsaz_512_mul |
0b4bb91d AP |
838 | |
839 | movq %xmm0, $out | |
840 | movq %xmm1, %rbp | |
841 | ||
842 | movq (%rsp), %r8 | |
843 | movq 8(%rsp), %r9 | |
844 | movq 16(%rsp), %r10 | |
845 | movq 24(%rsp), %r11 | |
846 | movq 32(%rsp), %r12 | |
847 | movq 40(%rsp), %r13 | |
848 | movq 48(%rsp), %r14 | |
849 | movq 56(%rsp), %r15 | |
850 | ||
87954638 AP |
851 | call __rsaz_512_reduce |
852 | ___ | |
# MULX path (ADX-capable assemblers only). The pointer is copied to %rbp
# before b[0] is loaded because bp lives in %rdx, which the load overwrites.
# n0 is reloaded into %rdx ahead of __rsaz_512_reducex. The generic path
# jumps over this code to .Lmul_tail.
853 | $code.=<<___ if ($addx); | |
854 | jmp .Lmul_tail | |
0b4bb91d | 855 | |
87954638 AP |
856 | .align 32 |
857 | .Lmulx: | |
858 | movq $bp, %rbp # pass argument | |
859 | movq ($bp), %rdx # pass b[0] | |
860 | call __rsaz_512_mulx | |
861 | ||
862 | movq %xmm0, $out | |
863 | movq %xmm1, %rbp | |
864 | ||
865 | movq 128(%rsp), %rdx # pull $n0 | |
866 | movq (%rsp), %r8 | |
867 | movq 8(%rsp), %r9 | |
868 | movq 16(%rsp), %r10 | |
869 | movq 24(%rsp), %r11 | |
870 | movq 32(%rsp), %r12 | |
871 | movq 40(%rsp), %r13 | |
872 | movq 48(%rsp), %r14 | |
873 | movq 56(%rsp), %r15 | |
874 | ||
875 | call __rsaz_512_reducex | |
876 | .Lmul_tail: | |
877 | ___ | |
# Common tail: add the high eight product limbs to the reduced value, turn
# the carry out of that addition into %rcx = 0 or all-ones (sbbq), call
# __rsaz_512_subtract, then restore the saved registers and release the frame.
878 | $code.=<<___; | |
0b4bb91d AP |
879 | addq 64(%rsp), %r8 |
880 | adcq 72(%rsp), %r9 | |
881 | adcq 80(%rsp), %r10 | |
882 | adcq 88(%rsp), %r11 | |
883 | adcq 96(%rsp), %r12 | |
884 | adcq 104(%rsp), %r13 | |
885 | adcq 112(%rsp), %r14 | |
886 | adcq 120(%rsp), %r15 | |
887 | sbbq %rcx, %rcx | |
888 | ||
87954638 | 889 | call __rsaz_512_subtract |
0b4bb91d AP |
890 | |
891 | leaq 128+24+48(%rsp), %rax | |
892 | movq -48(%rax), %r15 | |
893 | movq -40(%rax), %r14 | |
894 | movq -32(%rax), %r13 | |
895 | movq -24(%rax), %r12 | |
896 | movq -16(%rax), %rbp | |
897 | movq -8(%rax), %rbx | |
898 | leaq (%rax), %rsp | |
899 | .Lmul_epilogue: | |
900 | ret | |
901 | .size rsaz_512_mul,.-rsaz_512_mul | |
902 | ___ | |
903 | } | |
904 | { | |
905 | my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
906 | $code.=<<___; | |
fd8ad019 | 907 | .globl rsaz_512_mul_gather4 |
0b4bb91d AP |
908 | .type rsaz_512_mul_gather4,\@function,6 |
909 | .align 32 | |
910 | rsaz_512_mul_gather4: | |
911 | push %rbx | |
912 | push %rbp | |
913 | push %r12 | |
914 | push %r13 | |
915 | push %r14 | |
916 | push %r15 | |
917 | ||
8bd7ca99 | 918 | mov $pwr, $pwr |
0b4bb91d AP |
919 | subq \$128+24, %rsp |
920 | .Lmul_gather4_body: | |
87954638 AP |
921 | ___ |
922 | $code.=<<___ if ($addx); | |
923 | movl \$0x80100,%r11d | |
924 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
925 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
926 | je .Lmulx_gather | |
927 | ___ | |
928 | $code.=<<___; | |
0b4bb91d AP |
929 | movl 64($bp,$pwr,4), %eax |
930 | movq $out, %xmm0 # off-load arguments | |
931 | movl ($bp,$pwr,4), %ebx | |
932 | movq $mod, %xmm1 | |
933 | movq $n0, 128(%rsp) | |
934 | ||
935 | shlq \$32, %rax | |
936 | or %rax, %rbx | |
937 | movq ($ap), %rax | |
938 | movq 8($ap), %rcx | |
939 | leaq 128($bp,$pwr,4), %rbp | |
940 | mulq %rbx # 0 iteration | |
941 | movq %rax, (%rsp) | |
942 | movq %rcx, %rax | |
943 | movq %rdx, %r8 | |
944 | ||
945 | mulq %rbx | |
946 | movd (%rbp), %xmm4 | |
947 | addq %rax, %r8 | |
948 | movq 16($ap), %rax | |
949 | movq %rdx, %r9 | |
950 | adcq \$0, %r9 | |
951 | ||
952 | mulq %rbx | |
953 | movd 64(%rbp), %xmm5 | |
954 | addq %rax, %r9 | |
955 | movq 24($ap), %rax | |
956 | movq %rdx, %r10 | |
957 | adcq \$0, %r10 | |
958 | ||
959 | mulq %rbx | |
960 | pslldq \$4, %xmm5 | |
961 | addq %rax, %r10 | |
962 | movq 32($ap), %rax | |
963 | movq %rdx, %r11 | |
964 | adcq \$0, %r11 | |
965 | ||
966 | mulq %rbx | |
967 | por %xmm5, %xmm4 | |
968 | addq %rax, %r11 | |
969 | movq 40($ap), %rax | |
970 | movq %rdx, %r12 | |
971 | adcq \$0, %r12 | |
972 | ||
973 | mulq %rbx | |
974 | addq %rax, %r12 | |
975 | movq 48($ap), %rax | |
976 | movq %rdx, %r13 | |
977 | adcq \$0, %r13 | |
978 | ||
979 | mulq %rbx | |
980 | leaq 128(%rbp), %rbp | |
981 | addq %rax, %r13 | |
982 | movq 56($ap), %rax | |
983 | movq %rdx, %r14 | |
984 | adcq \$0, %r14 | |
985 | ||
986 | mulq %rbx | |
987 | movq %xmm4, %rbx | |
988 | addq %rax, %r14 | |
989 | movq ($ap), %rax | |
990 | movq %rdx, %r15 | |
991 | adcq \$0, %r15 | |
992 | ||
993 | leaq 8(%rsp), %rdi | |
994 | movl \$7, %ecx | |
995 | jmp .Loop_mul_gather | |
996 | ||
997 | .align 32 | |
998 | .Loop_mul_gather: | |
999 | mulq %rbx | |
1000 | addq %rax, %r8 | |
1001 | movq 8($ap), %rax | |
1002 | movq %r8, (%rdi) | |
1003 | movq %rdx, %r8 | |
1004 | adcq \$0, %r8 | |
1005 | ||
1006 | mulq %rbx | |
1007 | movd (%rbp), %xmm4 | |
1008 | addq %rax, %r9 | |
1009 | movq 16($ap), %rax | |
1010 | adcq \$0, %rdx | |
1011 | addq %r9, %r8 | |
1012 | movq %rdx, %r9 | |
1013 | adcq \$0, %r9 | |
1014 | ||
1015 | mulq %rbx | |
1016 | movd 64(%rbp), %xmm5 | |
1017 | addq %rax, %r10 | |
1018 | movq 24($ap), %rax | |
1019 | adcq \$0, %rdx | |
1020 | addq %r10, %r9 | |
1021 | movq %rdx, %r10 | |
1022 | adcq \$0, %r10 | |
1023 | ||
1024 | mulq %rbx | |
1025 | pslldq \$4, %xmm5 | |
1026 | addq %rax, %r11 | |
1027 | movq 32($ap), %rax | |
1028 | adcq \$0, %rdx | |
1029 | addq %r11, %r10 | |
1030 | movq %rdx, %r11 | |
1031 | adcq \$0, %r11 | |
1032 | ||
1033 | mulq %rbx | |
1034 | por %xmm5, %xmm4 | |
1035 | addq %rax, %r12 | |
1036 | movq 40($ap), %rax | |
1037 | adcq \$0, %rdx | |
1038 | addq %r12, %r11 | |
1039 | movq %rdx, %r12 | |
1040 | adcq \$0, %r12 | |
1041 | ||
1042 | mulq %rbx | |
1043 | addq %rax, %r13 | |
1044 | movq 48($ap), %rax | |
1045 | adcq \$0, %rdx | |
1046 | addq %r13, %r12 | |
1047 | movq %rdx, %r13 | |
1048 | adcq \$0, %r13 | |
1049 | ||
1050 | mulq %rbx | |
1051 | addq %rax, %r14 | |
1052 | movq 56($ap), %rax | |
1053 | adcq \$0, %rdx | |
1054 | addq %r14, %r13 | |
1055 | movq %rdx, %r14 | |
1056 | adcq \$0, %r14 | |
1057 | ||
1058 | mulq %rbx | |
1059 | movq %xmm4, %rbx | |
1060 | addq %rax, %r15 | |
1061 | movq ($ap), %rax | |
1062 | adcq \$0, %rdx | |
1063 | addq %r15, %r14 | |
1064 | movq %rdx, %r15 | |
1065 | adcq \$0, %r15 | |
1066 | ||
1067 | leaq 128(%rbp), %rbp | |
1068 | leaq 8(%rdi), %rdi | |
1069 | ||
1070 | decl %ecx | |
1071 | jnz .Loop_mul_gather | |
1072 | ||
1073 | movq %r8, (%rdi) | |
1074 | movq %r9, 8(%rdi) | |
1075 | movq %r10, 16(%rdi) | |
1076 | movq %r11, 24(%rdi) | |
1077 | movq %r12, 32(%rdi) | |
1078 | movq %r13, 40(%rdi) | |
1079 | movq %r14, 48(%rdi) | |
1080 | movq %r15, 56(%rdi) | |
1081 | ||
1082 | movq %xmm0, $out | |
1083 | movq %xmm1, %rbp | |
1084 | ||
1085 | movq (%rsp), %r8 | |
1086 | movq 8(%rsp), %r9 | |
1087 | movq 16(%rsp), %r10 | |
1088 | movq 24(%rsp), %r11 | |
1089 | movq 32(%rsp), %r12 | |
1090 | movq 40(%rsp), %r13 | |
1091 | movq 48(%rsp), %r14 | |
1092 | movq 56(%rsp), %r15 | |
1093 | ||
87954638 AP |
1094 | call __rsaz_512_reduce |
1095 | ___ | |
1096 | $code.=<<___ if ($addx); | |
1097 | jmp .Lmul_gather_tail | |
1098 | ||
1099 | .align 32 | |
1100 | .Lmulx_gather: | |
1101 | mov 64($bp,$pwr,4), %eax | |
1102 | movq $out, %xmm0 # off-load arguments | |
1103 | lea 128($bp,$pwr,4), %rbp | |
1104 | mov ($bp,$pwr,4), %edx | |
1105 | movq $mod, %xmm1 | |
1106 | mov $n0, 128(%rsp) | |
1107 | ||
1108 | shl \$32, %rax | |
1109 | or %rax, %rdx | |
1110 | mulx ($ap), %rbx, %r8 # 0 iteration | |
1111 | mov %rbx, (%rsp) | |
1112 | xor %edi, %edi # cf=0, of=0 | |
1113 | ||
1114 | mulx 8($ap), %rax, %r9 | |
1115 | movd (%rbp), %xmm4 | |
1116 | ||
1117 | mulx 16($ap), %rbx, %r10 | |
1118 | movd 64(%rbp), %xmm5 | |
1119 | adcx %rax, %r8 | |
1120 | ||
1121 | mulx 24($ap), %rax, %r11 | |
1122 | pslldq \$4, %xmm5 | |
1123 | adcx %rbx, %r9 | |
1124 | ||
1125 | mulx 32($ap), %rbx, %r12 | |
1126 | por %xmm5, %xmm4 | |
1127 | adcx %rax, %r10 | |
1128 | ||
1129 | mulx 40($ap), %rax, %r13 | |
1130 | adcx %rbx, %r11 | |
1131 | ||
1132 | mulx 48($ap), %rbx, %r14 | |
1133 | lea 128(%rbp), %rbp | |
1134 | adcx %rax, %r12 | |
1135 | ||
1136 | mulx 56($ap), %rax, %r15 | |
1137 | movq %xmm4, %rdx | |
1138 | adcx %rbx, %r13 | |
1139 | adcx %rax, %r14 | |
1140 | mov %r8, %rbx | |
1141 | adcx %rdi, %r15 # %rdi is 0 | |
1142 | ||
1143 | mov \$-7, %rcx | |
1144 | jmp .Loop_mulx_gather | |
1145 | ||
1146 | .align 32 | |
1147 | .Loop_mulx_gather: | |
1148 | mulx ($ap), %rax, %r8 | |
1149 | adcx %rax, %rbx | |
1150 | adox %r9, %r8 | |
1151 | ||
1152 | mulx 8($ap), %rax, %r9 | |
1153 | .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 | |
1154 | adcx %rax, %r8 | |
1155 | adox %r10, %r9 | |
0b4bb91d | 1156 | |
87954638 AP |
1157 | mulx 16($ap), %rax, %r10 |
1158 | movd 64(%rbp), %xmm5 | |
1159 | lea 128(%rbp), %rbp | |
1160 | adcx %rax, %r9 | |
1161 | adox %r11, %r10 | |
1162 | ||
1163 | .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 | |
1164 | pslldq \$4, %xmm5 | |
1165 | por %xmm5, %xmm4 | |
1166 | adcx %rax, %r10 | |
1167 | adox %r12, %r11 | |
1168 | ||
1169 | mulx 32($ap), %rax, %r12 | |
1170 | adcx %rax, %r11 | |
1171 | adox %r13, %r12 | |
1172 | ||
1173 | mulx 40($ap), %rax, %r13 | |
1174 | adcx %rax, %r12 | |
1175 | adox %r14, %r13 | |
1176 | ||
1177 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
1178 | adcx %rax, %r13 | |
1179 | adox %r15, %r14 | |
1180 | ||
1181 | mulx 56($ap), %rax, %r15 | |
1182 | movq %xmm4, %rdx | |
1183 | mov %rbx, 64(%rsp,%rcx,8) | |
1184 | adcx %rax, %r14 | |
1185 | adox %rdi, %r15 | |
1186 | mov %r8, %rbx | |
1187 | adcx %rdi, %r15 # cf=0 | |
1188 | ||
1189 | inc %rcx # of=0 | |
1190 | jnz .Loop_mulx_gather | |
1191 | ||
1192 | mov %r8, 64(%rsp) | |
1193 | mov %r9, 64+8(%rsp) | |
1194 | mov %r10, 64+16(%rsp) | |
1195 | mov %r11, 64+24(%rsp) | |
1196 | mov %r12, 64+32(%rsp) | |
1197 | mov %r13, 64+40(%rsp) | |
1198 | mov %r14, 64+48(%rsp) | |
1199 | mov %r15, 64+56(%rsp) | |
1200 | ||
1201 | movq %xmm0, $out | |
1202 | movq %xmm1, %rbp | |
1203 | ||
1204 | mov 128(%rsp), %rdx # pull $n0 | |
1205 | mov (%rsp), %r8 | |
1206 | mov 8(%rsp), %r9 | |
1207 | mov 16(%rsp), %r10 | |
1208 | mov 24(%rsp), %r11 | |
1209 | mov 32(%rsp), %r12 | |
1210 | mov 40(%rsp), %r13 | |
1211 | mov 48(%rsp), %r14 | |
1212 | mov 56(%rsp), %r15 | |
1213 | ||
1214 | call __rsaz_512_reducex | |
1215 | ||
1216 | .Lmul_gather_tail: | |
1217 | ___ | |
1218 | $code.=<<___; | |
0b4bb91d AP |
1219 | addq 64(%rsp), %r8 |
1220 | adcq 72(%rsp), %r9 | |
1221 | adcq 80(%rsp), %r10 | |
1222 | adcq 88(%rsp), %r11 | |
1223 | adcq 96(%rsp), %r12 | |
1224 | adcq 104(%rsp), %r13 | |
1225 | adcq 112(%rsp), %r14 | |
1226 | adcq 120(%rsp), %r15 | |
1227 | sbbq %rcx, %rcx | |
1228 | ||
87954638 | 1229 | call __rsaz_512_subtract |
0b4bb91d AP |
1230 | |
1231 | leaq 128+24+48(%rsp), %rax | |
1232 | movq -48(%rax), %r15 | |
1233 | movq -40(%rax), %r14 | |
1234 | movq -32(%rax), %r13 | |
1235 | movq -24(%rax), %r12 | |
1236 | movq -16(%rax), %rbp | |
1237 | movq -8(%rax), %rbx | |
1238 | leaq (%rax), %rsp | |
1239 | .Lmul_gather4_epilogue: | |
1240 | ret | |
1241 | .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 | |
1242 | ___ | |
1243 | } | |
1244 | { | |
1245 | my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
1246 | $code.=<<___; | |
fd8ad019 | 1247 | .globl rsaz_512_mul_scatter4 |
0b4bb91d AP |
1248 | .type rsaz_512_mul_scatter4,\@function,6 |
1249 | .align 32 | |
1250 | rsaz_512_mul_scatter4: | |
1251 | push %rbx | |
1252 | push %rbp | |
1253 | push %r12 | |
1254 | push %r13 | |
1255 | push %r14 | |
1256 | push %r15 | |
1257 | ||
8bd7ca99 | 1258 | mov $pwr, $pwr |
0b4bb91d AP |
1259 | subq \$128+24, %rsp |
1260 | .Lmul_scatter4_body: | |
1261 | leaq ($tbl,$pwr,4), $tbl | |
1262 | movq $out, %xmm0 # off-load arguments | |
1263 | movq $mod, %xmm1 | |
1264 | movq $tbl, %xmm2 | |
1265 | movq $n0, 128(%rsp) | |
1266 | ||
1267 | movq $out, %rbp | |
87954638 AP |
1268 | ___ |
1269 | $code.=<<___ if ($addx); | |
1270 | movl \$0x80100,%r11d | |
1271 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
1272 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
1273 | je .Lmulx_scatter | |
1274 | ___ | |
1275 | $code.=<<___; | |
1276 | movq ($out),%rbx # pass b[0] | |
fd8ad019 | 1277 | call __rsaz_512_mul |
0b4bb91d AP |
1278 | |
1279 | movq %xmm0, $out | |
1280 | movq %xmm1, %rbp | |
1281 | ||
1282 | movq (%rsp), %r8 | |
1283 | movq 8(%rsp), %r9 | |
1284 | movq 16(%rsp), %r10 | |
1285 | movq 24(%rsp), %r11 | |
1286 | movq 32(%rsp), %r12 | |
1287 | movq 40(%rsp), %r13 | |
1288 | movq 48(%rsp), %r14 | |
1289 | movq 56(%rsp), %r15 | |
1290 | ||
87954638 AP |
1291 | call __rsaz_512_reduce |
1292 | ___ | |
1293 | $code.=<<___ if ($addx); | |
1294 | jmp .Lmul_scatter_tail | |
1295 | ||
1296 | .align 32 | |
1297 | .Lmulx_scatter: | |
1298 | movq ($out), %rdx # pass b[0] | |
1299 | call __rsaz_512_mulx | |
1300 | ||
1301 | movq %xmm0, $out | |
1302 | movq %xmm1, %rbp | |
1303 | ||
1304 | movq 128(%rsp), %rdx # pull $n0 | |
1305 | movq (%rsp), %r8 | |
1306 | movq 8(%rsp), %r9 | |
1307 | movq 16(%rsp), %r10 | |
1308 | movq 24(%rsp), %r11 | |
1309 | movq 32(%rsp), %r12 | |
1310 | movq 40(%rsp), %r13 | |
1311 | movq 48(%rsp), %r14 | |
1312 | movq 56(%rsp), %r15 | |
1313 | ||
1314 | call __rsaz_512_reducex | |
0b4bb91d | 1315 | |
87954638 AP |
1316 | .Lmul_scatter_tail: |
1317 | ___ | |
1318 | $code.=<<___; | |
0b4bb91d AP |
1319 | addq 64(%rsp), %r8 |
1320 | adcq 72(%rsp), %r9 | |
1321 | adcq 80(%rsp), %r10 | |
1322 | adcq 88(%rsp), %r11 | |
1323 | adcq 96(%rsp), %r12 | |
1324 | adcq 104(%rsp), %r13 | |
1325 | adcq 112(%rsp), %r14 | |
1326 | adcq 120(%rsp), %r15 | |
1327 | movq %xmm2, $inp | |
1328 | sbbq %rcx, %rcx | |
1329 | ||
87954638 | 1330 | call __rsaz_512_subtract |
0b4bb91d AP |
1331 | |
1332 | movl %r8d, 64*0($inp) # scatter | |
1333 | shrq \$32, %r8 | |
1334 | movl %r9d, 64*2($inp) | |
1335 | shrq \$32, %r9 | |
1336 | movl %r10d, 64*4($inp) | |
1337 | shrq \$32, %r10 | |
1338 | movl %r11d, 64*6($inp) | |
1339 | shrq \$32, %r11 | |
1340 | movl %r12d, 64*8($inp) | |
1341 | shrq \$32, %r12 | |
1342 | movl %r13d, 64*10($inp) | |
1343 | shrq \$32, %r13 | |
1344 | movl %r14d, 64*12($inp) | |
1345 | shrq \$32, %r14 | |
1346 | movl %r15d, 64*14($inp) | |
1347 | shrq \$32, %r15 | |
1348 | movl %r8d, 64*1($inp) | |
1349 | movl %r9d, 64*3($inp) | |
1350 | movl %r10d, 64*5($inp) | |
1351 | movl %r11d, 64*7($inp) | |
1352 | movl %r12d, 64*9($inp) | |
1353 | movl %r13d, 64*11($inp) | |
1354 | movl %r14d, 64*13($inp) | |
1355 | movl %r15d, 64*15($inp) | |
1356 | ||
1357 | leaq 128+24+48(%rsp), %rax | |
1358 | movq -48(%rax), %r15 | |
1359 | movq -40(%rax), %r14 | |
1360 | movq -32(%rax), %r13 | |
1361 | movq -24(%rax), %r12 | |
1362 | movq -16(%rax), %rbp | |
1363 | movq -8(%rax), %rbx | |
1364 | leaq (%rax), %rsp | |
1365 | .Lmul_scatter4_epilogue: | |
1366 | ret | |
1367 | .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 | |
1368 | ___ | |
1369 | } | |
1370 | { | |
1371 | my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); | |
1372 | $code.=<<___; | |
1373 | .globl rsaz_512_mul_by_one | |
1374 | .type rsaz_512_mul_by_one,\@function,4 | |
1375 | .align 32 | |
1376 | rsaz_512_mul_by_one: | |
1377 | push %rbx | |
1378 | push %rbp | |
1379 | push %r12 | |
1380 | push %r13 | |
1381 | push %r14 | |
1382 | push %r15 | |
1383 | ||
1384 | subq \$128+24, %rsp | |
1385 | .Lmul_by_one_body: | |
87954638 AP |
1386 | ___ |
1387 | $code.=<<___ if ($addx); | |
1388 | movl OPENSSL_ia32cap_P+8(%rip),%eax | |
1389 | ___ | |
1390 | $code.=<<___; | |
0b4bb91d AP |
1391 | movq $mod, %rbp # reassign argument |
1392 | movq $n0, 128(%rsp) | |
1393 | ||
1394 | movq ($inp), %r8 | |
1395 | pxor %xmm0, %xmm0 | |
1396 | movq 8($inp), %r9 | |
1397 | movq 16($inp), %r10 | |
1398 | movq 24($inp), %r11 | |
1399 | movq 32($inp), %r12 | |
1400 | movq 40($inp), %r13 | |
1401 | movq 48($inp), %r14 | |
1402 | movq 56($inp), %r15 | |
1403 | ||
1404 | movdqa %xmm0, (%rsp) | |
1405 | movdqa %xmm0, 16(%rsp) | |
1406 | movdqa %xmm0, 32(%rsp) | |
1407 | movdqa %xmm0, 48(%rsp) | |
1408 | movdqa %xmm0, 64(%rsp) | |
1409 | movdqa %xmm0, 80(%rsp) | |
1410 | movdqa %xmm0, 96(%rsp) | |
87954638 AP |
1411 | ___ |
1412 | $code.=<<___ if ($addx); | |
1413 | andl \$0x80100,%eax | |
1414 | cmpl \$0x80100,%eax # check for MULX and ADO/CX | |
1415 | je .Lby_one_callx | |
1416 | ___ | |
1417 | $code.=<<___; | |
1418 | call __rsaz_512_reduce | |
1419 | ___ | |
1420 | $code.=<<___ if ($addx); | |
1421 | jmp .Lby_one_tail | |
1422 | .align 32 | |
1423 | .Lby_one_callx: | |
1424 | movq 128(%rsp), %rdx # pull $n0 | |
1425 | call __rsaz_512_reducex | |
1426 | .Lby_one_tail: | |
1427 | ___ | |
1428 | $code.=<<___; | |
0b4bb91d AP |
1429 | movq %r8, ($out) |
1430 | movq %r9, 8($out) | |
1431 | movq %r10, 16($out) | |
1432 | movq %r11, 24($out) | |
1433 | movq %r12, 32($out) | |
1434 | movq %r13, 40($out) | |
1435 | movq %r14, 48($out) | |
1436 | movq %r15, 56($out) | |
1437 | ||
1438 | leaq 128+24+48(%rsp), %rax | |
1439 | movq -48(%rax), %r15 | |
1440 | movq -40(%rax), %r14 | |
1441 | movq -32(%rax), %r13 | |
1442 | movq -24(%rax), %r12 | |
1443 | movq -16(%rax), %rbp | |
1444 | movq -8(%rax), %rbx | |
1445 | leaq (%rax), %rsp | |
1446 | .Lmul_by_one_epilogue: | |
1447 | ret | |
1448 | .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one | |
1449 | ___ | |
1450 | } | |
87954638 | 1451 | { # __rsaz_512_reduce |
0b4bb91d AP |
1452 | # |
1453 | # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 | |
1454 | # output: %r8-%r15 | |
1455 | # clobbers: everything except %rbp and %rdi | |
1456 | $code.=<<___; | |
87954638 | 1457 | .type __rsaz_512_reduce,\@abi-omnipotent |
0b4bb91d | 1458 | .align 32 |
87954638 | 1459 | __rsaz_512_reduce: |
0b4bb91d AP |
1460 | movq %r8, %rbx |
1461 | imulq 128+8(%rsp), %rbx | |
1462 | movq 0(%rbp), %rax | |
1463 | movl \$8, %ecx | |
1464 | jmp .Lreduction_loop | |
1465 | ||
1466 | .align 32 | |
1467 | .Lreduction_loop: | |
1468 | mulq %rbx | |
1469 | movq 8(%rbp), %rax | |
1470 | negq %r8 | |
1471 | movq %rdx, %r8 | |
1472 | adcq \$0, %r8 | |
1473 | ||
1474 | mulq %rbx | |
1475 | addq %rax, %r9 | |
1476 | movq 16(%rbp), %rax | |
1477 | adcq \$0, %rdx | |
1478 | addq %r9, %r8 | |
1479 | movq %rdx, %r9 | |
1480 | adcq \$0, %r9 | |
1481 | ||
1482 | mulq %rbx | |
1483 | addq %rax, %r10 | |
1484 | movq 24(%rbp), %rax | |
1485 | adcq \$0, %rdx | |
1486 | addq %r10, %r9 | |
1487 | movq %rdx, %r10 | |
1488 | adcq \$0, %r10 | |
1489 | ||
1490 | mulq %rbx | |
1491 | addq %rax, %r11 | |
1492 | movq 32(%rbp), %rax | |
1493 | adcq \$0, %rdx | |
1494 | addq %r11, %r10 | |
1495 | movq 128+8(%rsp), %rsi | |
87954638 AP |
1496 | #movq %rdx, %r11 |
1497 | #adcq \$0, %r11 | |
1498 | adcq \$0, %rdx | |
0b4bb91d | 1499 | movq %rdx, %r11 |
0b4bb91d AP |
1500 | |
1501 | mulq %rbx | |
1502 | addq %rax, %r12 | |
1503 | movq 40(%rbp), %rax | |
1504 | adcq \$0, %rdx | |
1505 | imulq %r8, %rsi | |
1506 | addq %r12, %r11 | |
1507 | movq %rdx, %r12 | |
1508 | adcq \$0, %r12 | |
1509 | ||
1510 | mulq %rbx | |
1511 | addq %rax, %r13 | |
1512 | movq 48(%rbp), %rax | |
1513 | adcq \$0, %rdx | |
1514 | addq %r13, %r12 | |
1515 | movq %rdx, %r13 | |
1516 | adcq \$0, %r13 | |
1517 | ||
1518 | mulq %rbx | |
1519 | addq %rax, %r14 | |
1520 | movq 56(%rbp), %rax | |
1521 | adcq \$0, %rdx | |
1522 | addq %r14, %r13 | |
1523 | movq %rdx, %r14 | |
1524 | adcq \$0, %r14 | |
1525 | ||
1526 | mulq %rbx | |
1527 | movq %rsi, %rbx | |
1528 | addq %rax, %r15 | |
1529 | movq 0(%rbp), %rax | |
1530 | adcq \$0, %rdx | |
1531 | addq %r15, %r14 | |
1532 | movq %rdx, %r15 | |
1533 | adcq \$0, %r15 | |
1534 | ||
1535 | decl %ecx | |
1536 | jne .Lreduction_loop | |
87954638 AP |
1537 | |
1538 | ret | |
1539 | .size __rsaz_512_reduce,.-__rsaz_512_reduce | |
0b4bb91d | 1540 | ___ |
87954638 AP |
1541 | } |
1542 | if ($addx) { | |
1543 | # __rsaz_512_reducex | |
1544 | # | |
1545 | # input: %r8-%r15, %rbp - mod, %rdx - n0 (preloaded by caller), 128(%rsp) - n0 | |
1546 | # output: %r8-%r15 | |
1547 | # clobbers: everything except %rbp and %rdi | |
0b4bb91d | 1548 | $code.=<<___; |
87954638 AP |
1549 | .type __rsaz_512_reducex,\@abi-omnipotent |
1550 | .align 32 | |
1551 | __rsaz_512_reducex: | |
1552 | #movq 128+8(%rsp), %rdx # pull $n0 | |
0b4bb91d | 1553 | imulq %r8, %rdx |
87954638 | 1554 | xorq %rsi, %rsi # cf=0,of=0 |
0b4bb91d | 1555 | movl \$8, %ecx |
87954638 | 1556 | jmp .Lreduction_loopx |
0b4bb91d AP |
1557 | |
1558 | .align 32 | |
87954638 AP |
1559 | .Lreduction_loopx: |
1560 | mov %r8, %rbx | |
0b4bb91d | 1561 | mulx 0(%rbp), %rax, %r8 |
87954638 AP |
1562 | adcx %rbx, %rax |
1563 | adox %r9, %r8 | |
0b4bb91d AP |
1564 | |
1565 | mulx 8(%rbp), %rax, %r9 | |
87954638 AP |
1566 | adcx %rax, %r8 |
1567 | adox %r10, %r9 | |
1568 | ||
1569 | mulx 16(%rbp), %rbx, %r10 | |
1570 | adcx %rbx, %r9 | |
1571 | adox %r11, %r10 | |
1572 | ||
1573 | mulx 24(%rbp), %rbx, %r11 | |
1574 | adcx %rbx, %r10 | |
1575 | adox %r12, %r11 | |
1576 | ||
1577 | .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 | |
1578 | mov %rdx, %rax | |
1579 | mov %r8, %rdx | |
1580 | adcx %rbx, %r11 | |
1581 | adox %r13, %r12 | |
1582 | ||
1583 | mulx 128+8(%rsp), %rbx, %rdx | |
1584 | mov %rax, %rdx | |
0b4bb91d AP |
1585 | |
1586 | mulx 40(%rbp), %rax, %r13 | |
87954638 AP |
1587 | adcx %rax, %r12 |
1588 | adox %r14, %r13 | |
0b4bb91d | 1589 | |
87954638 AP |
1590 | .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 |
1591 | adcx %rax, %r13 | |
1592 | adox %r15, %r14 | |
0b4bb91d AP |
1593 | |
1594 | mulx 56(%rbp), %rax, %r15 | |
1595 | mov %rbx, %rdx | |
87954638 AP |
1596 | adcx %rax, %r14 |
1597 | adox %rsi, %r15 # %rsi is 0 | |
1598 | adcx %rsi, %r15 # cf=0 | |
1599 | ||
1600 | decl %ecx # of=0 | |
1601 | jne .Lreduction_loopx | |
0b4bb91d | 1602 | |
0b4bb91d | 1603 | ret |
87954638 | 1604 | .size __rsaz_512_reducex,.-__rsaz_512_reducex |
0b4bb91d AP |
1605 | ___ |
1606 | } | |
87954638 | 1607 | { # __rsaz_512_subtract |
0b4bb91d AP |
1608 | # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask |
1609 | # output: %r8-%r15, also stored at ($out) | |
1610 | # clobbers: everything but %rdi, %rsi and %rbp | |
1611 | $code.=<<___; | |
87954638 | 1612 | .type __rsaz_512_subtract,\@abi-omnipotent |
0b4bb91d | 1613 | .align 32 |
87954638 | 1614 | __rsaz_512_subtract: |
0b4bb91d AP |
1615 | movq %r8, ($out) |
1616 | movq %r9, 8($out) | |
1617 | movq %r10, 16($out) | |
1618 | movq %r11, 24($out) | |
1619 | movq %r12, 32($out) | |
1620 | movq %r13, 40($out) | |
1621 | movq %r14, 48($out) | |
1622 | movq %r15, 56($out) | |
1623 | ||
1624 | movq 0($mod), %r8 | |
1625 | movq 8($mod), %r9 | |
1626 | negq %r8 | |
1627 | notq %r9 | |
1628 | andq %rcx, %r8 | |
1629 | movq 16($mod), %r10 | |
1630 | andq %rcx, %r9 | |
1631 | notq %r10 | |
1632 | movq 24($mod), %r11 | |
1633 | andq %rcx, %r10 | |
1634 | notq %r11 | |
1635 | movq 32($mod), %r12 | |
1636 | andq %rcx, %r11 | |
1637 | notq %r12 | |
1638 | movq 40($mod), %r13 | |
1639 | andq %rcx, %r12 | |
1640 | notq %r13 | |
1641 | movq 48($mod), %r14 | |
1642 | andq %rcx, %r13 | |
1643 | notq %r14 | |
1644 | movq 56($mod), %r15 | |
1645 | andq %rcx, %r14 | |
1646 | notq %r15 | |
1647 | andq %rcx, %r15 | |
1648 | ||
1649 | addq ($out), %r8 | |
1650 | adcq 8($out), %r9 | |
1651 | adcq 16($out), %r10 | |
1652 | adcq 24($out), %r11 | |
1653 | adcq 32($out), %r12 | |
1654 | adcq 40($out), %r13 | |
1655 | adcq 48($out), %r14 | |
1656 | adcq 56($out), %r15 | |
1657 | ||
1658 | movq %r8, ($out) | |
1659 | movq %r9, 8($out) | |
1660 | movq %r10, 16($out) | |
1661 | movq %r11, 24($out) | |
1662 | movq %r12, 32($out) | |
1663 | movq %r13, 40($out) | |
1664 | movq %r14, 48($out) | |
1665 | movq %r15, 56($out) | |
1666 | ||
1667 | ret | |
87954638 | 1668 | .size __rsaz_512_subtract,.-__rsaz_512_subtract |
0b4bb91d AP |
1669 | ___ |
1670 | } | |
fd8ad019 | 1671 | { # __rsaz_512_mul |
0b4bb91d AP |
1672 | # |
1673 | # input: %rsi - ap, %rbp - bp, %rbx - b[0] (preloaded by caller) | |
0d4fb843 | 1674 | # output: 16-qword product at 8(%rsp), i.e. caller's (%rsp) |
0b4bb91d AP |
1675 | # clobbers: everything |
1676 | my ($ap,$bp) = ("%rsi","%rbp"); | |
1677 | $code.=<<___; | |
fd8ad019 | 1678 | .type __rsaz_512_mul,\@abi-omnipotent |
0b4bb91d | 1679 | .align 32 |
fd8ad019 | 1680 | __rsaz_512_mul: |
0b4bb91d AP |
1681 | leaq 8(%rsp), %rdi |
1682 | ||
0b4bb91d AP |
1683 | movq ($ap), %rax |
1684 | mulq %rbx | |
1685 | movq %rax, (%rdi) | |
1686 | movq 8($ap), %rax | |
1687 | movq %rdx, %r8 | |
1688 | ||
1689 | mulq %rbx | |
1690 | addq %rax, %r8 | |
1691 | movq 16($ap), %rax | |
1692 | movq %rdx, %r9 | |
1693 | adcq \$0, %r9 | |
1694 | ||
1695 | mulq %rbx | |
1696 | addq %rax, %r9 | |
1697 | movq 24($ap), %rax | |
1698 | movq %rdx, %r10 | |
1699 | adcq \$0, %r10 | |
1700 | ||
1701 | mulq %rbx | |
1702 | addq %rax, %r10 | |
1703 | movq 32($ap), %rax | |
1704 | movq %rdx, %r11 | |
1705 | adcq \$0, %r11 | |
1706 | ||
1707 | mulq %rbx | |
1708 | addq %rax, %r11 | |
1709 | movq 40($ap), %rax | |
1710 | movq %rdx, %r12 | |
1711 | adcq \$0, %r12 | |
1712 | ||
1713 | mulq %rbx | |
1714 | addq %rax, %r12 | |
1715 | movq 48($ap), %rax | |
1716 | movq %rdx, %r13 | |
1717 | adcq \$0, %r13 | |
1718 | ||
1719 | mulq %rbx | |
1720 | addq %rax, %r13 | |
1721 | movq 56($ap), %rax | |
1722 | movq %rdx, %r14 | |
1723 | adcq \$0, %r14 | |
1724 | ||
1725 | mulq %rbx | |
1726 | addq %rax, %r14 | |
1727 | movq ($ap), %rax | |
1728 | movq %rdx, %r15 | |
1729 | adcq \$0, %r15 | |
1730 | ||
1731 | leaq 8($bp), $bp | |
1732 | leaq 8(%rdi), %rdi | |
1733 | ||
1734 | movl \$7, %ecx | |
1735 | jmp .Loop_mul | |
1736 | ||
1737 | .align 32 | |
1738 | .Loop_mul: | |
1739 | movq ($bp), %rbx | |
1740 | mulq %rbx | |
1741 | addq %rax, %r8 | |
1742 | movq 8($ap), %rax | |
1743 | movq %r8, (%rdi) | |
1744 | movq %rdx, %r8 | |
1745 | adcq \$0, %r8 | |
1746 | ||
1747 | mulq %rbx | |
1748 | addq %rax, %r9 | |
1749 | movq 16($ap), %rax | |
1750 | adcq \$0, %rdx | |
1751 | addq %r9, %r8 | |
1752 | movq %rdx, %r9 | |
1753 | adcq \$0, %r9 | |
1754 | ||
1755 | mulq %rbx | |
1756 | addq %rax, %r10 | |
1757 | movq 24($ap), %rax | |
1758 | adcq \$0, %rdx | |
1759 | addq %r10, %r9 | |
1760 | movq %rdx, %r10 | |
1761 | adcq \$0, %r10 | |
1762 | ||
1763 | mulq %rbx | |
1764 | addq %rax, %r11 | |
1765 | movq 32($ap), %rax | |
1766 | adcq \$0, %rdx | |
1767 | addq %r11, %r10 | |
1768 | movq %rdx, %r11 | |
1769 | adcq \$0, %r11 | |
1770 | ||
1771 | mulq %rbx | |
1772 | addq %rax, %r12 | |
1773 | movq 40($ap), %rax | |
1774 | adcq \$0, %rdx | |
1775 | addq %r12, %r11 | |
1776 | movq %rdx, %r12 | |
1777 | adcq \$0, %r12 | |
1778 | ||
1779 | mulq %rbx | |
1780 | addq %rax, %r13 | |
1781 | movq 48($ap), %rax | |
1782 | adcq \$0, %rdx | |
1783 | addq %r13, %r12 | |
1784 | movq %rdx, %r13 | |
1785 | adcq \$0, %r13 | |
1786 | ||
1787 | mulq %rbx | |
1788 | addq %rax, %r14 | |
1789 | movq 56($ap), %rax | |
1790 | adcq \$0, %rdx | |
1791 | addq %r14, %r13 | |
1792 | movq %rdx, %r14 | |
1793 | leaq 8($bp), $bp | |
1794 | adcq \$0, %r14 | |
1795 | ||
1796 | mulq %rbx | |
1797 | addq %rax, %r15 | |
1798 | movq ($ap), %rax | |
1799 | adcq \$0, %rdx | |
1800 | addq %r15, %r14 | |
1801 | movq %rdx, %r15 | |
1802 | adcq \$0, %r15 | |
1803 | ||
1804 | leaq 8(%rdi), %rdi | |
1805 | ||
1806 | decl %ecx | |
1807 | jnz .Loop_mul | |
1808 | ||
1809 | movq %r8, (%rdi) | |
1810 | movq %r9, 8(%rdi) | |
1811 | movq %r10, 16(%rdi) | |
1812 | movq %r11, 24(%rdi) | |
1813 | movq %r12, 32(%rdi) | |
1814 | movq %r13, 40(%rdi) | |
1815 | movq %r14, 48(%rdi) | |
1816 | movq %r15, 56(%rdi) | |
1817 | ||
1818 | ret | |
fd8ad019 | 1819 | .size __rsaz_512_mul,.-__rsaz_512_mul |
0b4bb91d AP |
1820 | ___ |
1821 | } | |
87954638 AP |
1822 | if ($addx) { |
1823 | # __rsaz_512_mulx | |
1824 | # | |
1825 | # input: %rsi - ap, %rbp - bp, %rdx - b[0] (preloaded by caller) | |
0d4fb843 | 1826 | # output: 16-qword product at 8(%rsp), i.e. caller's (%rsp) |
87954638 AP |
1827 | # clobbers: everything |
1828 | my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); | |
1829 | $code.=<<___; | |
1830 | .type __rsaz_512_mulx,\@abi-omnipotent | |
1831 | .align 32 | |
1832 | __rsaz_512_mulx: | |
1833 | mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller | |
31ed9a21 | 1834 | mov \$-6, %rcx |
87954638 AP |
1835 | |
1836 | mulx 8($ap), %rax, %r9 | |
1837 | movq %rbx, 8(%rsp) | |
1838 | ||
1839 | mulx 16($ap), %rbx, %r10 | |
31ed9a21 | 1840 | adc %rax, %r8 |
87954638 AP |
1841 | |
1842 | mulx 24($ap), %rax, %r11 | |
31ed9a21 | 1843 | adc %rbx, %r9 |
87954638 | 1844 | |
31ed9a21 AP |
1845 | mulx 32($ap), %rbx, %r12 |
1846 | adc %rax, %r10 | |
87954638 AP |
1847 | |
1848 | mulx 40($ap), %rax, %r13 | |
31ed9a21 | 1849 | adc %rbx, %r11 |
87954638 AP |
1850 | |
1851 | mulx 48($ap), %rbx, %r14 | |
31ed9a21 | 1852 | adc %rax, %r12 |
87954638 AP |
1853 | |
1854 | mulx 56($ap), %rax, %r15 | |
1855 | mov 8($bp), %rdx | |
31ed9a21 AP |
1856 | adc %rbx, %r13 |
1857 | adc %rax, %r14 | |
1858 | adc \$0, %r15 | |
87954638 | 1859 | |
31ed9a21 | 1860 | xor $zero, $zero # cf=0,of=0 |
87954638 AP |
1861 | jmp .Loop_mulx |
1862 | ||
1863 | .align 32 | |
1864 | .Loop_mulx: | |
1865 | movq %r8, %rbx | |
1866 | mulx ($ap), %rax, %r8 | |
1867 | adcx %rax, %rbx | |
1868 | adox %r9, %r8 | |
1869 | ||
1870 | mulx 8($ap), %rax, %r9 | |
1871 | adcx %rax, %r8 | |
1872 | adox %r10, %r9 | |
1873 | ||
1874 | mulx 16($ap), %rax, %r10 | |
1875 | adcx %rax, %r9 | |
1876 | adox %r11, %r10 | |
1877 | ||
1878 | mulx 24($ap), %rax, %r11 | |
1879 | adcx %rax, %r10 | |
1880 | adox %r12, %r11 | |
1881 | ||
1882 | .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 | |
1883 | adcx %rax, %r11 | |
1884 | adox %r13, %r12 | |
1885 | ||
1886 | mulx 40($ap), %rax, %r13 | |
1887 | adcx %rax, %r12 | |
1888 | adox %r14, %r13 | |
1889 | ||
1890 | mulx 48($ap), %rax, %r14 | |
1891 | adcx %rax, %r13 | |
1892 | adox %r15, %r14 | |
1893 | ||
1894 | mulx 56($ap), %rax, %r15 | |
1895 | movq 64($bp,%rcx,8), %rdx | |
1896 | movq %rbx, 8+64-8(%rsp,%rcx,8) | |
1897 | adcx %rax, %r14 | |
1898 | adox $zero, %r15 | |
1899 | adcx $zero, %r15 # cf=0 | |
1900 | ||
1901 | inc %rcx # of=0 | |
1902 | jnz .Loop_mulx | |
1903 | ||
1904 | movq %r8, %rbx | |
1905 | mulx ($ap), %rax, %r8 | |
1906 | adcx %rax, %rbx | |
1907 | adox %r9, %r8 | |
1908 | ||
1909 | .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 | |
1910 | adcx %rax, %r8 | |
1911 | adox %r10, %r9 | |
1912 | ||
1913 | .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 | |
1914 | adcx %rax, %r9 | |
1915 | adox %r11, %r10 | |
1916 | ||
1917 | mulx 24($ap), %rax, %r11 | |
1918 | adcx %rax, %r10 | |
1919 | adox %r12, %r11 | |
1920 | ||
1921 | mulx 32($ap), %rax, %r12 | |
1922 | adcx %rax, %r11 | |
1923 | adox %r13, %r12 | |
1924 | ||
1925 | mulx 40($ap), %rax, %r13 | |
1926 | adcx %rax, %r12 | |
1927 | adox %r14, %r13 | |
1928 | ||
1929 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
1930 | adcx %rax, %r13 | |
1931 | adox %r15, %r14 | |
1932 | ||
1933 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 | |
1934 | adcx %rax, %r14 | |
1935 | adox $zero, %r15 | |
1936 | adcx $zero, %r15 | |
1937 | ||
1938 | mov %rbx, 8+64-8(%rsp) | |
1939 | mov %r8, 8+64(%rsp) | |
1940 | mov %r9, 8+64+8(%rsp) | |
1941 | mov %r10, 8+64+16(%rsp) | |
1942 | mov %r11, 8+64+24(%rsp) | |
1943 | mov %r12, 8+64+32(%rsp) | |
1944 | mov %r13, 8+64+40(%rsp) | |
1945 | mov %r14, 8+64+48(%rsp) | |
1946 | mov %r15, 8+64+56(%rsp) | |
1947 | ||
1948 | ret | |
1949 | .size __rsaz_512_mulx,.-__rsaz_512_mulx | |
1950 | ___ | |
1951 | } | |
0b4bb91d AP |
1952 | { |
1953 | my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); | |
1954 | $code.=<<___; | |
1955 | .globl rsaz_512_scatter4 | |
1956 | .type rsaz_512_scatter4,\@abi-omnipotent | |
1957 | .align 16 | |
1958 | rsaz_512_scatter4: | |
1959 | leaq ($out,$power,4), $out | |
1960 | movl \$8, %r9d | |
1961 | jmp .Loop_scatter | |
1962 | .align 16 | |
1963 | .Loop_scatter: | |
1964 | movq ($inp), %rax | |
1965 | leaq 8($inp), $inp | |
1966 | movl %eax, ($out) | |
1967 | shrq \$32, %rax | |
1968 | movl %eax, 64($out) | |
1969 | leaq 128($out), $out | |
1970 | decl %r9d | |
1971 | jnz .Loop_scatter | |
1972 | ret | |
1973 | .size rsaz_512_scatter4,.-rsaz_512_scatter4 | |
1974 | ||
1975 | .globl rsaz_512_gather4 | |
1976 | .type rsaz_512_gather4,\@abi-omnipotent | |
1977 | .align 16 | |
1978 | rsaz_512_gather4: | |
1979 | leaq ($inp,$power,4), $inp | |
1980 | movl \$8, %r9d | |
1981 | jmp .Loop_gather | |
1982 | .align 16 | |
1983 | .Loop_gather: | |
1984 | movl ($inp), %eax | |
1985 | movl 64($inp), %r8d | |
1986 | leaq 128($inp), $inp | |
1987 | shlq \$32, %r8 | |
1988 | or %r8, %rax | |
1989 | movq %rax, ($out) | |
1990 | leaq 8($out), $out | |
1991 | decl %r9d | |
1992 | jnz .Loop_gather | |
1993 | ret | |
1994 | .size rsaz_512_gather4,.-rsaz_512_gather4 | |
1995 | ___ | |
1996 | } | |
1997 | ||
1998 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
1999 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
2000 | if ($win64) { | |
2001 | $rec="%rcx"; | |
2002 | $frame="%rdx"; | |
2003 | $context="%r8"; | |
2004 | $disp="%r9"; | |
2005 | ||
2006 | $code.=<<___; | |
2007 | .extern __imp_RtlVirtualUnwind | |
2008 | .type se_handler,\@abi-omnipotent | |
2009 | .align 16 | |
2010 | se_handler: | |
2011 | push %rsi | |
2012 | push %rdi | |
2013 | push %rbx | |
2014 | push %rbp | |
2015 | push %r12 | |
2016 | push %r13 | |
2017 | push %r14 | |
2018 | push %r15 | |
2019 | pushfq | |
2020 | sub \$64,%rsp # outgoing argument area, arg5-arg8 live at 32-56(%rsp) | |
2021 | ||
2022 | mov 120($context),%rax # pull context->Rax | |
2023 | mov 248($context),%rbx # pull context->Rip | |
2024 | ||
2025 | mov 8($disp),%rsi # disp->ImageBase | |
2026 | mov 56($disp),%r11 # disp->HandlerData | |
2027 | ||
2028 | mov 0(%r11),%r10d # HandlerData[0] | |
2029 | lea (%rsi,%r10),%r10 # end of prologue label | |
2030 | cmp %r10,%rbx # context->Rip<end of prologue label | |
2031 | jb .Lcommon_seh_tail | |
2032 | ||
2033 | mov 152($context),%rax # pull context->Rsp | |
2034 | ||
2035 | mov 4(%r11),%r10d # HandlerData[1] | |
2036 | lea (%rsi,%r10),%r10 # epilogue label | |
2037 | cmp %r10,%rbx # context->Rip>=epilogue label | |
2038 | jae .Lcommon_seh_tail | |
2039 | ||
2040 | lea 128+24+48(%rax),%rax # step over the 128+24-byte frame and six pushed registers | |
2041 | ||
2042 | mov -8(%rax),%rbx | |
2043 | mov -16(%rax),%rbp | |
2044 | mov -24(%rax),%r12 | |
2045 | mov -32(%rax),%r13 | |
2046 | mov -40(%rax),%r14 | |
2047 | mov -48(%rax),%r15 | |
2048 | mov %rbx,144($context) # restore context->Rbx | |
2049 | mov %rbp,160($context) # restore context->Rbp | |
2050 | mov %r12,216($context) # restore context->R12 | |
2051 | mov %r13,224($context) # restore context->R13 | |
2052 | mov %r14,232($context) # restore context->R14 | |
2053 | mov %r15,240($context) # restore context->R15 | |
2054 | ||
2055 | .Lcommon_seh_tail: | |
2056 | mov 8(%rax),%rdi | |
2057 | mov 16(%rax),%rsi | |
2058 | mov %rax,152($context) # restore context->Rsp | |
2059 | mov %rsi,168($context) # restore context->Rsi | |
2060 | mov %rdi,176($context) # restore context->Rdi | |
2061 | ||
2062 | mov 40($disp),%rdi # disp->ContextRecord | |
2063 | mov $context,%rsi # context | |
2064 | mov \$154,%ecx # sizeof(CONTEXT) | |
2065 | .long 0xa548f3fc # cld; rep movsq | |
2066 | ||
2067 | mov $disp,%rsi | |
2068 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
2069 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
2070 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
2071 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
2072 | mov 40(%rsi),%r10 # disp->ContextRecord | |
2073 | lea 56(%rsi),%r11 # &disp->HandlerData | |
2074 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
2075 | mov %r10,32(%rsp) # arg5 | |
2076 | mov %r11,40(%rsp) # arg6 | |
2077 | mov %r12,48(%rsp) # arg7 | |
2078 | mov %rcx,56(%rsp) # arg8, (NULL) | |
2079 | call *__imp_RtlVirtualUnwind(%rip) | |
2080 | ||
2081 | mov \$1,%eax # ExceptionContinueSearch | |
2082 | add \$64,%rsp | |
2083 | popfq | |
2084 | pop %r15 | |
2085 | pop %r14 | |
2086 | pop %r13 | |
2087 | pop %r12 | |
2088 | pop %rbp | |
2089 | pop %rbx | |
2090 | pop %rdi | |
2091 | pop %rsi | |
2092 | ret | |
2093 | .size se_handler,.-se_handler | |
2094 | ||
2095 | .section .pdata | |
2096 | .align 4 | |
2097 | .rva .LSEH_begin_rsaz_512_sqr | |
2098 | .rva .LSEH_end_rsaz_512_sqr | |
2099 | .rva .LSEH_info_rsaz_512_sqr | |
2100 | ||
2101 | .rva .LSEH_begin_rsaz_512_mul | |
2102 | .rva .LSEH_end_rsaz_512_mul | |
2103 | .rva .LSEH_info_rsaz_512_mul | |
2104 | ||
2105 | .rva .LSEH_begin_rsaz_512_mul_gather4 | |
2106 | .rva .LSEH_end_rsaz_512_mul_gather4 | |
2107 | .rva .LSEH_info_rsaz_512_mul_gather4 | |
2108 | ||
2109 | .rva .LSEH_begin_rsaz_512_mul_scatter4 | |
2110 | .rva .LSEH_end_rsaz_512_mul_scatter4 | |
2111 | .rva .LSEH_info_rsaz_512_mul_scatter4 | |
2112 | ||
2113 | .rva .LSEH_begin_rsaz_512_mul_by_one | |
2114 | .rva .LSEH_end_rsaz_512_mul_by_one | |
2115 | .rva .LSEH_info_rsaz_512_mul_by_one | |
2116 | ||
2117 | .section .xdata | |
2118 | .align 8 | |
2119 | .LSEH_info_rsaz_512_sqr: | |
2120 | .byte 9,0,0,0 | |
2121 | .rva se_handler | |
2122 | .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] | |
2123 | .LSEH_info_rsaz_512_mul: | |
2124 | .byte 9,0,0,0 | |
2125 | .rva se_handler | |
2126 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | |
2127 | .LSEH_info_rsaz_512_mul_gather4: | |
2128 | .byte 9,0,0,0 | |
2129 | .rva se_handler | |
2130 | .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] | |
2131 | .LSEH_info_rsaz_512_mul_scatter4: | |
2132 | .byte 9,0,0,0 | |
2133 | .rva se_handler | |
2134 | .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] | |
2135 | .LSEH_info_rsaz_512_mul_by_one: | |
2136 | .byte 9,0,0,0 | |
2137 | .rva se_handler | |
2138 | .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] | |
2139 | ___ | |
2140 | } | |
2141 | ||
2142 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
2143 | print $code; | |
2144 | close STDOUT; |