]>
Commit | Line | Data |
---|---|---|
0b4bb91d AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | #******************************************************************************# | |
4 | #* Copyright(c) 2012, Intel Corp. *# | |
5 | #* Developers and authors: *# | |
6 | #* Shay Gueron (1, 2), and Vlad Krasnov (1) *# | |
7 | #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *# | |
8 | #* Israel Development Center, Haifa, Israel *# | |
9 | #* (2) University of Haifa *# | |
10 | #******************************************************************************# | |
11 | #* This submission to OpenSSL is to be made available under the OpenSSL *# | |
12 | #* license, and only to the OpenSSL project, in order to allow integration *# | |
13 | #* into the publicly distributed code. *# | |
14 | #* The use of this code, or portions of this code, or concepts embedded in *# | |
15 | #* this code, or modification of this code and/or algorithm(s) in it, or the *# | |
16 | #* use of this code for any other purpose than stated above, requires special *# | |
17 | #* licensing. *# | |
18 | #******************************************************************************# | |
19 | #******************************************************************************# | |
20 | #* DISCLAIMER: *# | |
21 | #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *# | |
22 | #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *# | |
23 | #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *# | |
24 | #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*# | |
25 | #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *# | |
26 | #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *# | |
27 | #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *# | |
28 | #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *# | |
29 | #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *# | |
30 | #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *# | |
31 | #* POSSIBILITY OF SUCH DAMAGE. *# | |
32 | #******************************************************************************# | |
33 | #* Reference: *# | |
34 | #* [1] S. Gueron, "Efficient Software Implementations of Modular *# | |
35 | #* Exponentiation", http://eprint.iacr.org/2011/239 *# | |
36 | #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *# | |
37 | #* IEEE Proceedings of 9th International Conference on Information *# | |
38 | #* Technology: New Generations (ITNG 2012), 821-823 (2012). *# | |
39 | #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*# | |
40 | #* Journal of Cryptographic Engineering 2:31-43 (2012). *# | |
41 | #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *# | |
42 | #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *# | |
43 | #* RSA1024 and RSA2048 on x86_64 platforms", *# | |
44 | #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*# | |
45 | ################################################################################ | |
46 | ||
47 | # While original submission covers 512- and 1024-bit exponentiation, | |
48 | # this module is limited to 512-bit version only (and as such | |
49 | # accelerates RSA1024 sign). This is because the improvement for longer | |
50 | # keys is not high enough to justify the effort; the highest measured | |
51 | # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was | |
52 | # still upcoming at the time of this writing!] Nor does this module implement a | |
53 | # "monolithic" complete exponentiation jumbo-subroutine; it adheres | |
54 | # to a more modular mixture of C and assembly. And it's optimized even | |
55 | # for processors other than Intel Core family (see table below for | |
56 | # improvement coefficients). | |
57 | # <appro@openssl.org> | |
58 | # | |
59 | # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) | |
60 | # ----------------+--------------------------- | |
61 | # Opteron +13% |+5% +20% | |
62 | # Bulldozer -0% |-1% +10% | |
63 | # P4 +11% |+7% +8% | |
64 | # Westmere +5% |+14% +17% | |
65 | # Sandy Bridge +2% |+12% +29% | |
66 | # Ivy Bridge +1% |+11% +35% | |
67 | # Haswell(**) -0% |+12% +39% | |
68 | # Atom +13% |+11% +4% | |
69 | # VIA Nano +70% |+9% +25% | |
70 | # | |
71 | # (*) rsax engine and fips numbers are presented for reference | |
72 | # purposes; | |
87954638 | 73 | # (**) MULX was attempted, but found to give only marginal improvement; |
0b4bb91d AP |
74 | |
# Command line: [flavour] output. If the first argument contains a dot it is
# taken to be the output file name and the flavour is left undefined.
75 | $flavour = shift; | |
76 | $output = shift; | |
77 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
78 | ||
# Win64 calling convention is assumed for nasm/masm/mingw64 flavours or when
# the output file ends in ".asm".
79 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
80 | ||
# Look for the translator next to this script first, then in ../../perlasm.
81 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
82 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
83 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
84 | die "can't locate x86_64-xlate.pl"; | |
85 | ||
# Everything printed to STDOUT from here on is piped through the translator.
# The interpreter, translator and output paths are quoted so that paths with
# spaces survive the shell, and a failed spawn is fatal instead of silently
# producing no output.
86 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; | |
87 | *STDOUT=*OUT; | |
88 | ||
88 | ||
87954638 AP |
# Decide whether the assembler can be trusted with MULX/ADCX/ADOX; $addx
# gates every "if ($addx)" code path emitted further down.
# Each probe runs the tool, captures its banner, and compares the captured
# "major.minor" text numerically (i.e. as a decimal fraction).
# 1) GNU as, reached through $ENV{CC}: needs version 2.23 or newer.
89 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
90 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
30b9c234 | 91 | $addx = ($1>=2.23); |
87954638 AP |
92 | } | |
93 | ||
# 2) NASM (Win64 only, selected by flavour or $ENV{ASM}): needs 2.10 or newer.
94 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
95 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
96 | $addx = ($1>=2.10); | |
97 | } | |
98 | ||
# 3) MASM ml64 (Win64 only): major version 11 or newer.
# NOTE(review): later upstream revisions of this probe are believed to
# require ml64 >= 12 for ADX support -- confirm before relying on 11.
99 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
100 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
101 | $addx = ($1>=11); | |
102 | } | |
103 | ||
0b4bb91d AP |
# File-level register names for the "common internal API" (presumably the
# __rsaz_512_* helpers called below, which are defined outside this view).
104 | ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API |
# rsaz_512_sqr: exported entry point taking FIVE arguments, mapped here to
#   $out=%rdi, $inp=%rsi, $mod=%rdx, $n0=%rcx, $times=%r8d.
# The prologue saves the six callee-saved registers, reserves 128+24 bytes of
# stack, copies $mod into %rbp, preloads the first two input words into
# %rdx/%rax and parks $n0 at 128(%rsp).
# The .type directive must declare all five arguments: the sibling entry
# points declare 5 and 6 to match their argument lists, and the translator
# uses that count when marshalling arguments for Win64. With the former
# value of 4, $times would never be loaded on Win64.
105 | { | |
106 | my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); | |
107 | ||
108 | $code.=<<___; | |
109 | .text | |
110 | ||
87954638 AP |
111 | .extern OPENSSL_ia32cap_P |
112 | ||
0b4bb91d AP |
113 | .globl rsaz_512_sqr |
114 | .type rsaz_512_sqr,\@function,5 | |
115 | .align 32 | |
116 | rsaz_512_sqr: # 25-29% faster than rsaz_512_mul | |
117 | push %rbx | |
118 | push %rbp | |
119 | push %r12 | |
120 | push %r13 | |
121 | push %r14 | |
122 | push %r15 | |
123 | ||
124 | subq \$128+24, %rsp | |
125 | .Lsqr_body: | |
126 | movq $mod, %rbp # common argument | |
127 | movq ($inp), %rdx | |
128 | movq 8($inp), %rax | |
129 | movq $n0, 128(%rsp) | |
87954638 AP |
130 | ___ |
# Run-time dispatch, emitted only when the assembler supports ADX: if both
# bits of mask 0x80100 are set in the third dword of OPENSSL_ia32cap_P,
# branch to the MULX/ADCX/ADOX loop.
131 | $code.=<<___ if ($addx); | |
132 | movl \$0x80100,%r11d | |
133 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
134 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
135 | je .Loop_sqrx | |
136 | ___ | |
# .Loop_sqr: squaring loop built from plain MULQ. It is reached when the
# MULX/ADCX/ADOX dispatch is not emitted or does not branch away.
# Each pass saves the remaining pass count at 128+8(%rsp), then runs eight
# "iterations": iteration i multiplies word i of the input by the higher
# words, doubles the accumulated cross products, adds the square of word i,
# and stores two finished result words. The 16-word product ends up at
# (%rsp)..120(%rsp).
# The low eight words are then reloaded into %r8-%r15 for __rsaz_512_reduce,
# the high eight words are added back in, the carry is captured as a mask by
# "sbbq %rcx,%rcx", and __rsaz_512_subtract is called. Finally $out is fed
# back as $inp and the loop repeats until the saved count reaches zero.
# NOTE(review): the doubling steps use "lea (reg,reg,2)" / "shrq 63" pairs,
# and the trailing "adcq \$0" of each iteration has no consumer for its
# carry-out. This looks like the pattern addressed upstream under
# CVE-2019-1551 -- confirm against the fixed upstream rsaz_512_sqr.
137 | $code.=<<___; |
0b4bb91d AP |
138 | jmp .Loop_sqr |
139 | ||
140 | .align 32 | |
141 | .Loop_sqr: | |
142 | movl $times,128+8(%rsp) | |
0b4bb91d AP |
143 | #first iteration |
144 | movq %rdx, %rbx | |
145 | mulq %rdx | |
146 | movq %rax, %r8 | |
147 | movq 16($inp), %rax | |
148 | movq %rdx, %r9 | |
149 | ||
150 | mulq %rbx | |
151 | addq %rax, %r9 | |
152 | movq 24($inp), %rax | |
153 | movq %rdx, %r10 | |
154 | adcq \$0, %r10 | |
155 | ||
156 | mulq %rbx | |
157 | addq %rax, %r10 | |
158 | movq 32($inp), %rax | |
159 | movq %rdx, %r11 | |
160 | adcq \$0, %r11 | |
161 | ||
162 | mulq %rbx | |
163 | addq %rax, %r11 | |
164 | movq 40($inp), %rax | |
165 | movq %rdx, %r12 | |
166 | adcq \$0, %r12 | |
167 | ||
168 | mulq %rbx | |
169 | addq %rax, %r12 | |
170 | movq 48($inp), %rax | |
171 | movq %rdx, %r13 | |
172 | adcq \$0, %r13 | |
173 | ||
174 | mulq %rbx | |
175 | addq %rax, %r13 | |
176 | movq 56($inp), %rax | |
177 | movq %rdx, %r14 | |
178 | adcq \$0, %r14 | |
179 | ||
180 | mulq %rbx | |
181 | addq %rax, %r14 | |
182 | movq %rbx, %rax | |
183 | movq %rdx, %r15 | |
184 | adcq \$0, %r15 | |
185 | ||
186 | addq %r8, %r8 #shlq \$1, %r8 | |
187 | movq %r9, %rcx | |
188 | adcq %r9, %r9 #shld \$1, %r8, %r9 | |
189 | ||
190 | mulq %rax | |
191 | movq %rax, (%rsp) | |
192 | addq %rdx, %r8 | |
193 | adcq \$0, %r9 | |
194 | ||
195 | movq %r8, 8(%rsp) | |
196 | shrq \$63, %rcx | |
197 | ||
198 | #second iteration | |
199 | movq 8($inp), %r8 | |
200 | movq 16($inp), %rax | |
201 | mulq %r8 | |
202 | addq %rax, %r10 | |
203 | movq 24($inp), %rax | |
204 | movq %rdx, %rbx | |
205 | adcq \$0, %rbx | |
206 | ||
207 | mulq %r8 | |
208 | addq %rax, %r11 | |
209 | movq 32($inp), %rax | |
210 | adcq \$0, %rdx | |
211 | addq %rbx, %r11 | |
212 | movq %rdx, %rbx | |
213 | adcq \$0, %rbx | |
214 | ||
215 | mulq %r8 | |
216 | addq %rax, %r12 | |
217 | movq 40($inp), %rax | |
218 | adcq \$0, %rdx | |
219 | addq %rbx, %r12 | |
220 | movq %rdx, %rbx | |
221 | adcq \$0, %rbx | |
222 | ||
223 | mulq %r8 | |
224 | addq %rax, %r13 | |
225 | movq 48($inp), %rax | |
226 | adcq \$0, %rdx | |
227 | addq %rbx, %r13 | |
228 | movq %rdx, %rbx | |
229 | adcq \$0, %rbx | |
230 | ||
231 | mulq %r8 | |
232 | addq %rax, %r14 | |
233 | movq 56($inp), %rax | |
234 | adcq \$0, %rdx | |
235 | addq %rbx, %r14 | |
236 | movq %rdx, %rbx | |
237 | adcq \$0, %rbx | |
238 | ||
239 | mulq %r8 | |
240 | addq %rax, %r15 | |
241 | movq %r8, %rax | |
242 | adcq \$0, %rdx | |
243 | addq %rbx, %r15 | |
244 | movq %rdx, %r8 | |
245 | movq %r10, %rdx | |
246 | adcq \$0, %r8 | |
247 | ||
248 | add %rdx, %rdx | |
249 | lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
250 | movq %r11, %rbx | |
251 | adcq %r11, %r11 #shld \$1, %r10, %r11 | |
252 | ||
253 | mulq %rax | |
254 | addq %rax, %r9 | |
255 | adcq %rdx, %r10 | |
256 | adcq \$0, %r11 | |
257 | ||
258 | movq %r9, 16(%rsp) | |
259 | movq %r10, 24(%rsp) | |
260 | shrq \$63, %rbx | |
261 | ||
262 | #third iteration | |
263 | movq 16($inp), %r9 | |
264 | movq 24($inp), %rax | |
265 | mulq %r9 | |
266 | addq %rax, %r12 | |
267 | movq 32($inp), %rax | |
268 | movq %rdx, %rcx | |
269 | adcq \$0, %rcx | |
270 | ||
271 | mulq %r9 | |
272 | addq %rax, %r13 | |
273 | movq 40($inp), %rax | |
274 | adcq \$0, %rdx | |
275 | addq %rcx, %r13 | |
276 | movq %rdx, %rcx | |
277 | adcq \$0, %rcx | |
278 | ||
279 | mulq %r9 | |
280 | addq %rax, %r14 | |
281 | movq 48($inp), %rax | |
282 | adcq \$0, %rdx | |
283 | addq %rcx, %r14 | |
284 | movq %rdx, %rcx | |
285 | adcq \$0, %rcx | |
286 | ||
287 | mulq %r9 | |
288 | movq %r12, %r10 | |
289 | lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 | |
290 | addq %rax, %r15 | |
291 | movq 56($inp), %rax | |
292 | adcq \$0, %rdx | |
293 | addq %rcx, %r15 | |
294 | movq %rdx, %rcx | |
295 | adcq \$0, %rcx | |
296 | ||
297 | mulq %r9 | |
298 | shrq \$63, %r10 | |
299 | addq %rax, %r8 | |
300 | movq %r9, %rax | |
301 | adcq \$0, %rdx | |
302 | addq %rcx, %r8 | |
303 | movq %rdx, %r9 | |
304 | adcq \$0, %r9 | |
305 | ||
306 | movq %r13, %rcx | |
307 | leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 | |
308 | ||
309 | mulq %rax | |
310 | addq %rax, %r11 | |
311 | adcq %rdx, %r12 | |
312 | adcq \$0, %r13 | |
313 | ||
314 | movq %r11, 32(%rsp) | |
315 | movq %r12, 40(%rsp) | |
316 | shrq \$63, %rcx | |
317 | ||
318 | #fourth iteration | |
319 | movq 24($inp), %r10 | |
320 | movq 32($inp), %rax | |
321 | mulq %r10 | |
322 | addq %rax, %r14 | |
323 | movq 40($inp), %rax | |
324 | movq %rdx, %rbx | |
325 | adcq \$0, %rbx | |
326 | ||
327 | mulq %r10 | |
328 | addq %rax, %r15 | |
329 | movq 48($inp), %rax | |
330 | adcq \$0, %rdx | |
331 | addq %rbx, %r15 | |
332 | movq %rdx, %rbx | |
333 | adcq \$0, %rbx | |
334 | ||
335 | mulq %r10 | |
336 | movq %r14, %r12 | |
337 | leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 | |
338 | addq %rax, %r8 | |
339 | movq 56($inp), %rax | |
340 | adcq \$0, %rdx | |
341 | addq %rbx, %r8 | |
342 | movq %rdx, %rbx | |
343 | adcq \$0, %rbx | |
344 | ||
345 | mulq %r10 | |
346 | shrq \$63, %r12 | |
347 | addq %rax, %r9 | |
348 | movq %r10, %rax | |
349 | adcq \$0, %rdx | |
350 | addq %rbx, %r9 | |
351 | movq %rdx, %r10 | |
352 | adcq \$0, %r10 | |
353 | ||
354 | movq %r15, %rbx | |
355 | leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 | |
356 | ||
357 | mulq %rax | |
358 | addq %rax, %r13 | |
359 | adcq %rdx, %r14 | |
360 | adcq \$0, %r15 | |
361 | ||
362 | movq %r13, 48(%rsp) | |
363 | movq %r14, 56(%rsp) | |
364 | shrq \$63, %rbx | |
365 | ||
366 | #fifth iteration | |
367 | movq 32($inp), %r11 | |
368 | movq 40($inp), %rax | |
369 | mulq %r11 | |
370 | addq %rax, %r8 | |
371 | movq 48($inp), %rax | |
372 | movq %rdx, %rcx | |
373 | adcq \$0, %rcx | |
374 | ||
375 | mulq %r11 | |
376 | addq %rax, %r9 | |
377 | movq 56($inp), %rax | |
378 | adcq \$0, %rdx | |
379 | movq %r8, %r12 | |
380 | leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 | |
381 | addq %rcx, %r9 | |
382 | movq %rdx, %rcx | |
383 | adcq \$0, %rcx | |
384 | ||
385 | mulq %r11 | |
386 | shrq \$63, %r12 | |
387 | addq %rax, %r10 | |
388 | movq %r11, %rax | |
389 | adcq \$0, %rdx | |
390 | addq %rcx, %r10 | |
391 | movq %rdx, %r11 | |
392 | adcq \$0, %r11 | |
393 | ||
394 | movq %r9, %rcx | |
395 | leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 | |
396 | ||
397 | mulq %rax | |
398 | addq %rax, %r15 | |
399 | adcq %rdx, %r8 | |
400 | adcq \$0, %r9 | |
401 | ||
402 | movq %r15, 64(%rsp) | |
403 | movq %r8, 72(%rsp) | |
404 | shrq \$63, %rcx | |
405 | ||
406 | #sixth iteration | |
407 | movq 40($inp), %r12 | |
408 | movq 48($inp), %rax | |
409 | mulq %r12 | |
410 | addq %rax, %r10 | |
411 | movq 56($inp), %rax | |
412 | movq %rdx, %rbx | |
413 | adcq \$0, %rbx | |
414 | ||
415 | mulq %r12 | |
416 | addq %rax, %r11 | |
417 | movq %r12, %rax | |
418 | movq %r10, %r15 | |
419 | leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
420 | adcq \$0, %rdx | |
421 | shrq \$63, %r15 | |
422 | addq %rbx, %r11 | |
423 | movq %rdx, %r12 | |
424 | adcq \$0, %r12 | |
425 | ||
426 | movq %r11, %rbx | |
427 | leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 | |
428 | ||
429 | mulq %rax | |
430 | addq %rax, %r9 | |
431 | adcq %rdx, %r10 | |
432 | adcq \$0, %r11 | |
433 | ||
434 | movq %r9, 80(%rsp) | |
435 | movq %r10, 88(%rsp) | |
436 | ||
437 | #seventh iteration | |
438 | movq 48($inp), %r13 | |
439 | movq 56($inp), %rax | |
440 | mulq %r13 | |
441 | addq %rax, %r12 | |
442 | movq %r13, %rax | |
443 | movq %rdx, %r13 | |
444 | adcq \$0, %r13 | |
445 | ||
446 | xorq %r14, %r14 | |
447 | shlq \$1, %rbx | |
448 | adcq %r12, %r12 #shld \$1, %rbx, %r12 | |
449 | adcq %r13, %r13 #shld \$1, %r12, %r13 | |
450 | adcq %r14, %r14 #shld \$1, %r13, %r14 | |
451 | ||
452 | mulq %rax | |
453 | addq %rax, %r11 | |
454 | adcq %rdx, %r12 | |
455 | adcq \$0, %r13 | |
456 | ||
457 | movq %r11, 96(%rsp) | |
458 | movq %r12, 104(%rsp) | |
459 | ||
460 | #eighth iteration | |
461 | movq 56($inp), %rax | |
462 | mulq %rax | |
463 | addq %rax, %r13 | |
464 | adcq \$0, %rdx | |
465 | ||
466 | addq %rdx, %r14 | |
467 | ||
468 | movq %r13, 112(%rsp) | |
469 | movq %r14, 120(%rsp) | |
87954638 AP |
470 | |
471 | movq (%rsp), %r8 | |
472 | movq 8(%rsp), %r9 | |
473 | movq 16(%rsp), %r10 | |
474 | movq 24(%rsp), %r11 | |
475 | movq 32(%rsp), %r12 | |
476 | movq 40(%rsp), %r13 | |
477 | movq 48(%rsp), %r14 | |
478 | movq 56(%rsp), %r15 | |
479 | ||
480 | call __rsaz_512_reduce | |
481 | ||
482 | addq 64(%rsp), %r8 | |
483 | adcq 72(%rsp), %r9 | |
484 | adcq 80(%rsp), %r10 | |
485 | adcq 88(%rsp), %r11 | |
486 | adcq 96(%rsp), %r12 | |
487 | adcq 104(%rsp), %r13 | |
488 | adcq 112(%rsp), %r14 | |
489 | adcq 120(%rsp), %r15 | |
490 | sbbq %rcx, %rcx | |
491 | ||
492 | call __rsaz_512_subtract | |
493 | ||
494 | movq %r8, %rdx | |
495 | movq %r9, %rax | |
496 | movl 128+8(%rsp), $times | |
497 | movq $out, $inp | |
498 | ||
499 | decl $times | |
500 | jnz .Loop_sqr | |
0b4bb91d | 501 | ___ |
# .Loop_sqrx: MULX/ADCX/ADOX flavour of the squaring loop. It is emitted only
# when $addx is true and is entered from the run-time dispatch in the
# prologue. The leading "jmp .Lsqr_tail" belongs to the MULQ loop's
# fall-through path and skips this body.
# $out (%rdi) and %rbp are parked in %xmm0/%xmm1 for the duration of a pass
# because both registers are reused as scratch (MULX destination and zero
# register respectively); they are restored before the reduction.
# $n0 is reloaded from 128(%rsp) into %rdx ahead of __rsaz_512_reducex.
# The ".byte" rows are hand-encoded instructions; the trailing comment on
# each gives the intended mnemonic. The lone ".byte 0x66" rows emit an
# operand-size prefix in front of the following instruction
# (presumably for code alignment -- TODO confirm).
87954638 | 502 | if ($addx) { |
0b4bb91d | 503 | $code.=<<___; |
87954638 AP |
504 | jmp .Lsqr_tail |
505 | ||
506 | .align 32 | |
507 | .Loop_sqrx: | |
508 | movl $times,128+8(%rsp) | |
0b4bb91d | 509 | movq $out, %xmm0 # off-load |
87954638 | 510 | movq %rbp, %xmm1 # off-load |
0b4bb91d AP |
511 | #first iteration |
512 | mulx %rax, %r8, %r9 | |
513 | ||
514 | mulx 16($inp), %rcx, %r10 | |
87954638 | 515 | xor %rbp, %rbp # cf=0, of=0 |
0b4bb91d AP |
516 | |
517 | mulx 24($inp), %rax, %r11 | |
87954638 | 518 | adcx %rcx, %r9 |
0b4bb91d AP |
519 | |
520 | mulx 32($inp), %rcx, %r12 | |
87954638 | 521 | adcx %rax, %r10 |
0b4bb91d AP |
522 | |
523 | mulx 40($inp), %rax, %r13 | |
87954638 | 524 | adcx %rcx, %r11 |
0b4bb91d | 525 | |
87954638 AP |
526 | .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 |
527 | adcx %rax, %r12 | |
528 | adcx %rcx, %r13 | |
0b4bb91d | 529 | |
87954638 AP |
530 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 |
531 | adcx %rax, %r14 | |
532 | adcx %rbp, %r15 # %rbp is 0 | |
0b4bb91d | 533 | |
87954638 | 534 | mov %r9, %rcx |
0b4bb91d AP |
535 | shld \$1, %r8, %r9 |
536 | shl \$1, %r8 | |
537 | ||
87954638 | 538 | xor %ebp, %ebp |
0b4bb91d | 539 | mulx %rdx, %rax, %rdx |
87954638 AP |
540 | adcx %rdx, %r8 |
541 | mov 8($inp), %rdx | |
542 | adcx %rbp, %r9 | |
0b4bb91d AP |
543 | |
544 | mov %rax, (%rsp) | |
545 | mov %r8, 8(%rsp) | |
546 | ||
547 | #second iteration | |
0b4bb91d | 548 | mulx 16($inp), %rax, %rbx |
87954638 AP |
549 | adox %rax, %r10 |
550 | adcx %rbx, %r11 | |
0b4bb91d | 551 | |
87954638 AP |
552 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 |
553 | adox $out, %r11 | |
554 | adcx %r8, %r12 | |
0b4bb91d AP |
555 | |
556 | mulx 32($inp), %rax, %rbx | |
87954638 AP |
557 | adox %rax, %r12 |
558 | adcx %rbx, %r13 | |
0b4bb91d AP |
559 | |
560 | mulx 40($inp), $out, %r8 | |
87954638 AP |
561 | adox $out, %r13 |
562 | adcx %r8, %r14 | |
0b4bb91d | 563 | |
87954638 AP |
564 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
565 | adox %rax, %r14 | |
566 | adcx %rbx, %r15 | |
0b4bb91d | 567 | |
87954638 AP |
568 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 |
569 | adox $out, %r15 | |
570 | adcx %rbp, %r8 | |
571 | adox %rbp, %r8 | |
0b4bb91d | 572 | |
87954638 | 573 | mov %r11, %rbx |
0b4bb91d AP |
574 | shld \$1, %r10, %r11 |
575 | shld \$1, %rcx, %r10 | |
576 | ||
87954638 | 577 | xor %ebp,%ebp |
0b4bb91d | 578 | mulx %rdx, %rax, %rcx |
87954638 AP |
579 | mov 16($inp), %rdx |
580 | adcx %rax, %r9 | |
581 | adcx %rcx, %r10 | |
582 | adcx %rbp, %r11 | |
0b4bb91d AP |
583 | |
584 | mov %r9, 16(%rsp) | |
87954638 | 585 | .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) |
0b4bb91d AP |
586 | |
587 | #third iteration | |
87954638 AP |
588 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 |
589 | adox $out, %r12 | |
590 | adcx %r9, %r13 | |
0b4bb91d AP |
591 | |
592 | mulx 32($inp), %rax, %rcx | |
87954638 AP |
593 | adox %rax, %r13 |
594 | adcx %rcx, %r14 | |
0b4bb91d AP |
595 | |
596 | mulx 40($inp), $out, %r9 | |
87954638 AP |
597 | adox $out, %r14 |
598 | adcx %r9, %r15 | |
0b4bb91d | 599 | |
87954638 AP |
600 | .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx |
601 | adox %rax, %r15 | |
602 | adcx %rcx, %r8 | |
0b4bb91d | 603 | |
87954638 AP |
604 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 |
605 | adox $out, %r8 | |
606 | adcx %rbp, %r9 | |
607 | adox %rbp, %r9 | |
0b4bb91d | 608 | |
87954638 | 609 | mov %r13, %rcx |
0b4bb91d AP |
610 | shld \$1, %r12, %r13 |
611 | shld \$1, %rbx, %r12 | |
612 | ||
87954638 | 613 | xor %ebp, %ebp |
0b4bb91d | 614 | mulx %rdx, %rax, %rdx |
87954638 AP |
615 | adcx %rax, %r11 |
616 | adcx %rdx, %r12 | |
617 | mov 24($inp), %rdx | |
618 | adcx %rbp, %r13 | |
0b4bb91d AP |
619 | |
620 | mov %r11, 32(%rsp) | |
87954638 | 621 | .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) |
0b4bb91d AP |
622 | |
623 | #fourth iteration | |
87954638 AP |
624 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx |
625 | adox %rax, %r14 | |
626 | adcx %rbx, %r15 | |
0b4bb91d AP |
627 | |
628 | mulx 40($inp), $out, %r10 | |
87954638 AP |
629 | adox $out, %r15 |
630 | adcx %r10, %r8 | |
0b4bb91d AP |
631 | |
632 | mulx 48($inp), %rax, %rbx | |
87954638 AP |
633 | adox %rax, %r8 |
634 | adcx %rbx, %r9 | |
0b4bb91d AP |
635 | |
636 | mulx 56($inp), $out, %r10 | |
87954638 AP |
637 | adox $out, %r9 |
638 | adcx %rbp, %r10 | |
639 | adox %rbp, %r10 | |
0b4bb91d | 640 | |
87954638 AP |
641 | .byte 0x66 |
642 | mov %r15, %rbx | |
0b4bb91d AP |
643 | shld \$1, %r14, %r15 |
644 | shld \$1, %rcx, %r14 | |
645 | ||
87954638 | 646 | xor %ebp, %ebp |
0b4bb91d | 647 | mulx %rdx, %rax, %rdx |
87954638 AP |
648 | adcx %rax, %r13 |
649 | adcx %rdx, %r14 | |
650 | mov 32($inp), %rdx | |
651 | adcx %rbp, %r15 | |
0b4bb91d AP |
652 | |
653 | mov %r13, 48(%rsp) | |
654 | mov %r14, 56(%rsp) | |
655 | ||
656 | #fifth iteration | |
87954638 AP |
657 | .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 |
658 | adox $out, %r8 | |
659 | adcx %r11, %r9 | |
0b4bb91d AP |
660 | |
661 | mulx 48($inp), %rax, %rcx | |
87954638 AP |
662 | adox %rax, %r9 |
663 | adcx %rcx, %r10 | |
0b4bb91d AP |
664 | |
665 | mulx 56($inp), $out, %r11 | |
87954638 AP |
666 | adox $out, %r10 |
667 | adcx %rbp, %r11 | |
668 | adox %rbp, %r11 | |
0b4bb91d AP |
669 | |
670 | mov %r9, %rcx | |
671 | shld \$1, %r8, %r9 | |
672 | shld \$1, %rbx, %r8 | |
673 | ||
87954638 | 674 | xor %ebp, %ebp |
0b4bb91d | 675 | mulx %rdx, %rax, %rdx |
87954638 AP |
676 | adcx %rax, %r15 |
677 | adcx %rdx, %r8 | |
678 | mov 40($inp), %rdx | |
679 | adcx %rbp, %r9 | |
0b4bb91d AP |
680 | |
681 | mov %r15, 64(%rsp) | |
682 | mov %r8, 72(%rsp) | |
683 | ||
684 | #sixth iteration | |
87954638 AP |
685 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
686 | adox %rax, %r10 | |
687 | adcx %rbx, %r11 | |
0b4bb91d | 688 | |
87954638 AP |
689 | .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 |
690 | adox $out, %r11 | |
691 | adcx %rbp, %r12 | |
692 | adox %rbp, %r12 | |
0b4bb91d AP |
693 | |
694 | mov %r11, %rbx | |
695 | shld \$1, %r10, %r11 | |
696 | shld \$1, %rcx, %r10 | |
697 | ||
87954638 | 698 | xor %ebp, %ebp |
0b4bb91d | 699 | mulx %rdx, %rax, %rdx |
87954638 AP |
700 | adcx %rax, %r9 |
701 | adcx %rdx, %r10 | |
702 | mov 48($inp), %rdx | |
703 | adcx %rbp, %r11 | |
0b4bb91d AP |
704 | |
705 | mov %r9, 80(%rsp) | |
706 | mov %r10, 88(%rsp) | |
707 | ||
708 | #seventh iteration | |
87954638 AP |
709 | .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 |
710 | adox %rax, %r12 | |
711 | adox %rbp, %r13 | |
0b4bb91d AP |
712 | |
713 | xor %r14, %r14 | |
714 | shld \$1, %r13, %r14 | |
715 | shld \$1, %r12, %r13 | |
716 | shld \$1, %rbx, %r12 | |
717 | ||
87954638 | 718 | xor %ebp, %ebp |
0b4bb91d | 719 | mulx %rdx, %rax, %rdx |
87954638 AP |
720 | adcx %rax, %r11 |
721 | adcx %rdx, %r12 | |
722 | mov 56($inp), %rdx | |
723 | adcx %rbp, %r13 | |
0b4bb91d | 724 | |
87954638 AP |
725 | .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) |
726 | .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) | |
0b4bb91d AP |
727 | |
728 | #eighth iteration | |
0b4bb91d | 729 | mulx %rdx, %rax, %rdx |
87954638 AP |
730 | adox %rax, %r13 |
731 | adox %rbp, %rdx | |
732 | ||
733 | .byte 0x66 | |
0b4bb91d AP |
734 | add %rdx, %r14 |
735 | ||
736 | movq %r13, 112(%rsp) | |
737 | movq %r14, 120(%rsp) | |
738 | movq %xmm0, $out | |
87954638 AP |
739 | movq %xmm1, %rbp |
740 | ||
741 | movq 128(%rsp), %rdx # pull $n0 | |
0b4bb91d AP |
742 | movq (%rsp), %r8 |
743 | movq 8(%rsp), %r9 | |
744 | movq 16(%rsp), %r10 | |
745 | movq 24(%rsp), %r11 | |
746 | movq 32(%rsp), %r12 | |
747 | movq 40(%rsp), %r13 | |
748 | movq 48(%rsp), %r14 | |
749 | movq 56(%rsp), %r15 | |
750 | ||
87954638 | 751 | call __rsaz_512_reducex |
0b4bb91d AP |
752 | |
753 | addq 64(%rsp), %r8 | |
754 | adcq 72(%rsp), %r9 | |
755 | adcq 80(%rsp), %r10 | |
756 | adcq 88(%rsp), %r11 | |
757 | adcq 96(%rsp), %r12 | |
758 | adcq 104(%rsp), %r13 | |
759 | adcq 112(%rsp), %r14 | |
760 | adcq 120(%rsp), %r15 | |
761 | sbbq %rcx, %rcx | |
762 | ||
87954638 | 763 | call __rsaz_512_subtract |
0b4bb91d AP |
764 | |
765 | movq %r8, %rdx | |
766 | movq %r9, %rax | |
767 | movl 128+8(%rsp), $times | |
768 | movq $out, $inp | |
769 | ||
770 | decl $times | |
87954638 AP |
771 | jnz .Loop_sqrx |
772 | ||
773 | .Lsqr_tail: | |
774 | ___ | |
775 | } | |
# Common tail of rsaz_512_sqr: point %rax just above the six saved registers
# (128+24 bytes of locals plus 48 bytes of pushes), reload %r15..%rbx in the
# reverse of the prologue's push order, restore %rsp and return.
776 | $code.=<<___; |
0b4bb91d AP |
777 | |
778 | leaq 128+24+48(%rsp), %rax | |
779 | movq -48(%rax), %r15 | |
780 | movq -40(%rax), %r14 | |
781 | movq -32(%rax), %r13 | |
782 | movq -24(%rax), %r12 | |
783 | movq -16(%rax), %rbp | |
784 | movq -8(%rax), %rbx | |
785 | leaq (%rax), %rsp | |
786 | .Lsqr_epilogue: | |
787 | ret | |
788 | .size rsaz_512_sqr,.-rsaz_512_sqr | |
789 | ___ | |
790 | } | |
# rsaz_512_mul: emits the exported five-argument multiply entry point, with
#   $out=%rdi, $ap=%rsi, $bp=%rdx, $mod=%rcx, $n0=%r8.
# Flow of the generated code:
#   - save callee-saved registers and reserve 128+24 bytes of stack;
#   - park $out and $mod in %xmm0/%xmm1 and $n0 at 128(%rsp);
#   - call __rsaz_512_mul (or __rsaz_512_mulx when the ADX dispatch fires);
#   - reload the low eight product words from the stack into %r8-%r15 and
#     call __rsaz_512_reduce (or __rsaz_512_reducex, after pulling $n0 into
#     %rdx);
#   - add the high eight product words, capture the carry as a mask with
#     "sbbq %rcx,%rcx", and call __rsaz_512_subtract;
#   - restore registers and return.
791 | { | |
792 | my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); | |
793 | $code.=<<___; | |
fd8ad019 | 794 | .globl rsaz_512_mul |
0b4bb91d AP |
795 | .type rsaz_512_mul,\@function,5 |
796 | .align 32 | |
797 | rsaz_512_mul: | |
798 | push %rbx | |
799 | push %rbp | |
800 | push %r12 | |
801 | push %r13 | |
802 | push %r14 | |
803 | push %r15 | |
804 | ||
805 | subq \$128+24, %rsp | |
806 | .Lmul_body: | |
807 | movq $out, %xmm0 # off-load arguments | |
808 | movq $mod, %xmm1 | |
809 | movq $n0, 128(%rsp) | |
87954638 AP |
810 | ___ |
# Run-time dispatch to the MULX/ADCX/ADOX path; emitted only when the
# assembler supports those instructions ($addx).
811 | $code.=<<___ if ($addx); | |
812 | movl \$0x80100,%r11d | |
813 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
814 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
815 | je .Lmulx | |
816 | ___ | |
# MULQ path: b[0] is handed over in %rbx and the $bp pointer in %rbp.
817 | $code.=<<___; | |
818 | movq ($bp), %rbx # pass b[0] | |
0b4bb91d | 819 | movq $bp, %rbp # pass argument |
fd8ad019 | 820 | call __rsaz_512_mul |
0b4bb91d AP |
821 | |
822 | movq %xmm0, $out | |
823 | movq %xmm1, %rbp | |
824 | ||
825 | movq (%rsp), %r8 | |
826 | movq 8(%rsp), %r9 | |
827 | movq 16(%rsp), %r10 | |
828 | movq 24(%rsp), %r11 | |
829 | movq 32(%rsp), %r12 | |
830 | movq 40(%rsp), %r13 | |
831 | movq 48(%rsp), %r14 | |
832 | movq 56(%rsp), %r15 | |
833 | ||
87954638 AP |
834 | call __rsaz_512_reduce |
835 | ___ | |
# MULX path: b[0] is handed over in %rdx instead, and $n0 is reloaded into
# %rdx before the reduction. Both paths rejoin at .Lmul_tail.
836 | $code.=<<___ if ($addx); | |
837 | jmp .Lmul_tail | |
0b4bb91d | 838 | |
87954638 AP |
839 | .align 32 |
840 | .Lmulx: | |
841 | movq $bp, %rbp # pass argument | |
842 | movq ($bp), %rdx # pass b[0] | |
843 | call __rsaz_512_mulx | |
844 | ||
845 | movq %xmm0, $out | |
846 | movq %xmm1, %rbp | |
847 | ||
848 | movq 128(%rsp), %rdx # pull $n0 | |
849 | movq (%rsp), %r8 | |
850 | movq 8(%rsp), %r9 | |
851 | movq 16(%rsp), %r10 | |
852 | movq 24(%rsp), %r11 | |
853 | movq 32(%rsp), %r12 | |
854 | movq 40(%rsp), %r13 | |
855 | movq 48(%rsp), %r14 | |
856 | movq 56(%rsp), %r15 | |
857 | ||
858 | call __rsaz_512_reducex | |
859 | .Lmul_tail: | |
860 | ___ | |
# Shared tail: fold in the upper half of the product, final subtraction call,
# then the epilogue.
861 | $code.=<<___; | |
0b4bb91d AP |
862 | addq 64(%rsp), %r8 |
863 | adcq 72(%rsp), %r9 | |
864 | adcq 80(%rsp), %r10 | |
865 | adcq 88(%rsp), %r11 | |
866 | adcq 96(%rsp), %r12 | |
867 | adcq 104(%rsp), %r13 | |
868 | adcq 112(%rsp), %r14 | |
869 | adcq 120(%rsp), %r15 | |
870 | sbbq %rcx, %rcx | |
871 | ||
87954638 | 872 | call __rsaz_512_subtract |
0b4bb91d AP |
873 | |
874 | leaq 128+24+48(%rsp), %rax | |
875 | movq -48(%rax), %r15 | |
876 | movq -40(%rax), %r14 | |
877 | movq -32(%rax), %r13 | |
878 | movq -24(%rax), %r12 | |
879 | movq -16(%rax), %rbp | |
880 | movq -8(%rax), %rbx | |
881 | leaq (%rax), %rsp | |
882 | .Lmul_epilogue: | |
883 | ret | |
884 | .size rsaz_512_mul,.-rsaz_512_mul | |
885 | ___ | |
886 | } | |
887 | { | |
888 | my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
889 | $code.=<<___; | |
fd8ad019 | 890 | .globl rsaz_512_mul_gather4 |
0b4bb91d AP |
891 | .type rsaz_512_mul_gather4,\@function,6 |
892 | .align 32 | |
893 | rsaz_512_mul_gather4: | |
894 | push %rbx | |
895 | push %rbp | |
896 | push %r12 | |
897 | push %r13 | |
898 | push %r14 | |
899 | push %r15 | |
900 | ||
901 | subq \$128+24, %rsp | |
902 | .Lmul_gather4_body: | |
87954638 AP |
903 | ___ |
904 | $code.=<<___ if ($addx); | |
905 | movl \$0x80100,%r11d | |
906 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
907 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
908 | je .Lmulx_gather | |
909 | ___ | |
910 | $code.=<<___; | |
0b4bb91d AP |
911 | movl 64($bp,$pwr,4), %eax |
912 | movq $out, %xmm0 # off-load arguments | |
913 | movl ($bp,$pwr,4), %ebx | |
914 | movq $mod, %xmm1 | |
915 | movq $n0, 128(%rsp) | |
916 | ||
917 | shlq \$32, %rax | |
918 | or %rax, %rbx | |
919 | movq ($ap), %rax | |
920 | movq 8($ap), %rcx | |
921 | leaq 128($bp,$pwr,4), %rbp | |
922 | mulq %rbx # 0 iteration | |
923 | movq %rax, (%rsp) | |
924 | movq %rcx, %rax | |
925 | movq %rdx, %r8 | |
926 | ||
927 | mulq %rbx | |
928 | movd (%rbp), %xmm4 | |
929 | addq %rax, %r8 | |
930 | movq 16($ap), %rax | |
931 | movq %rdx, %r9 | |
932 | adcq \$0, %r9 | |
933 | ||
934 | mulq %rbx | |
935 | movd 64(%rbp), %xmm5 | |
936 | addq %rax, %r9 | |
937 | movq 24($ap), %rax | |
938 | movq %rdx, %r10 | |
939 | adcq \$0, %r10 | |
940 | ||
941 | mulq %rbx | |
942 | pslldq \$4, %xmm5 | |
943 | addq %rax, %r10 | |
944 | movq 32($ap), %rax | |
945 | movq %rdx, %r11 | |
946 | adcq \$0, %r11 | |
947 | ||
948 | mulq %rbx | |
949 | por %xmm5, %xmm4 | |
950 | addq %rax, %r11 | |
951 | movq 40($ap), %rax | |
952 | movq %rdx, %r12 | |
953 | adcq \$0, %r12 | |
954 | ||
955 | mulq %rbx | |
956 | addq %rax, %r12 | |
957 | movq 48($ap), %rax | |
958 | movq %rdx, %r13 | |
959 | adcq \$0, %r13 | |
960 | ||
961 | mulq %rbx | |
962 | leaq 128(%rbp), %rbp | |
963 | addq %rax, %r13 | |
964 | movq 56($ap), %rax | |
965 | movq %rdx, %r14 | |
966 | adcq \$0, %r14 | |
967 | ||
968 | mulq %rbx | |
969 | movq %xmm4, %rbx | |
970 | addq %rax, %r14 | |
971 | movq ($ap), %rax | |
972 | movq %rdx, %r15 | |
973 | adcq \$0, %r15 | |
974 | ||
975 | leaq 8(%rsp), %rdi | |
976 | movl \$7, %ecx | |
977 | jmp .Loop_mul_gather | |
978 | ||
979 | .align 32 | |
980 | .Loop_mul_gather: | |
981 | mulq %rbx | |
982 | addq %rax, %r8 | |
983 | movq 8($ap), %rax | |
984 | movq %r8, (%rdi) | |
985 | movq %rdx, %r8 | |
986 | adcq \$0, %r8 | |
987 | ||
988 | mulq %rbx | |
989 | movd (%rbp), %xmm4 | |
990 | addq %rax, %r9 | |
991 | movq 16($ap), %rax | |
992 | adcq \$0, %rdx | |
993 | addq %r9, %r8 | |
994 | movq %rdx, %r9 | |
995 | adcq \$0, %r9 | |
996 | ||
997 | mulq %rbx | |
998 | movd 64(%rbp), %xmm5 | |
999 | addq %rax, %r10 | |
1000 | movq 24($ap), %rax | |
1001 | adcq \$0, %rdx | |
1002 | addq %r10, %r9 | |
1003 | movq %rdx, %r10 | |
1004 | adcq \$0, %r10 | |
1005 | ||
1006 | mulq %rbx | |
1007 | pslldq \$4, %xmm5 | |
1008 | addq %rax, %r11 | |
1009 | movq 32($ap), %rax | |
1010 | adcq \$0, %rdx | |
1011 | addq %r11, %r10 | |
1012 | movq %rdx, %r11 | |
1013 | adcq \$0, %r11 | |
1014 | ||
1015 | mulq %rbx | |
1016 | por %xmm5, %xmm4 | |
1017 | addq %rax, %r12 | |
1018 | movq 40($ap), %rax | |
1019 | adcq \$0, %rdx | |
1020 | addq %r12, %r11 | |
1021 | movq %rdx, %r12 | |
1022 | adcq \$0, %r12 | |
1023 | ||
1024 | mulq %rbx | |
1025 | addq %rax, %r13 | |
1026 | movq 48($ap), %rax | |
1027 | adcq \$0, %rdx | |
1028 | addq %r13, %r12 | |
1029 | movq %rdx, %r13 | |
1030 | adcq \$0, %r13 | |
1031 | ||
1032 | mulq %rbx | |
1033 | addq %rax, %r14 | |
1034 | movq 56($ap), %rax | |
1035 | adcq \$0, %rdx | |
1036 | addq %r14, %r13 | |
1037 | movq %rdx, %r14 | |
1038 | adcq \$0, %r14 | |
1039 | ||
1040 | mulq %rbx | |
1041 | movq %xmm4, %rbx | |
1042 | addq %rax, %r15 | |
1043 | movq ($ap), %rax | |
1044 | adcq \$0, %rdx | |
1045 | addq %r15, %r14 | |
1046 | movq %rdx, %r15 | |
1047 | adcq \$0, %r15 | |
1048 | ||
1049 | leaq 128(%rbp), %rbp | |
1050 | leaq 8(%rdi), %rdi | |
1051 | ||
1052 | decl %ecx | |
1053 | jnz .Loop_mul_gather | |
1054 | ||
1055 | movq %r8, (%rdi) | |
1056 | movq %r9, 8(%rdi) | |
1057 | movq %r10, 16(%rdi) | |
1058 | movq %r11, 24(%rdi) | |
1059 | movq %r12, 32(%rdi) | |
1060 | movq %r13, 40(%rdi) | |
1061 | movq %r14, 48(%rdi) | |
1062 | movq %r15, 56(%rdi) | |
1063 | ||
1064 | movq %xmm0, $out | |
1065 | movq %xmm1, %rbp | |
1066 | ||
1067 | movq (%rsp), %r8 | |
1068 | movq 8(%rsp), %r9 | |
1069 | movq 16(%rsp), %r10 | |
1070 | movq 24(%rsp), %r11 | |
1071 | movq 32(%rsp), %r12 | |
1072 | movq 40(%rsp), %r13 | |
1073 | movq 48(%rsp), %r14 | |
1074 | movq 56(%rsp), %r15 | |
1075 | ||
87954638 AP |
1076 | call __rsaz_512_reduce |
1077 | ___ | |
1078 | $code.=<<___ if ($addx); | |
1079 | jmp .Lmul_gather_tail | |
1080 | ||
1081 | .align 32 | |
1082 | .Lmulx_gather: | |
1083 | mov 64($bp,$pwr,4), %eax | |
1084 | movq $out, %xmm0 # off-load arguments | |
1085 | lea 128($bp,$pwr,4), %rbp | |
1086 | mov ($bp,$pwr,4), %edx | |
1087 | movq $mod, %xmm1 | |
1088 | mov $n0, 128(%rsp) | |
1089 | ||
1090 | shl \$32, %rax | |
1091 | or %rax, %rdx | |
1092 | mulx ($ap), %rbx, %r8 # 0 iteration | |
1093 | mov %rbx, (%rsp) | |
1094 | xor %edi, %edi # cf=0, of=0 | |
1095 | ||
1096 | mulx 8($ap), %rax, %r9 | |
1097 | movd (%rbp), %xmm4 | |
1098 | ||
1099 | mulx 16($ap), %rbx, %r10 | |
1100 | movd 64(%rbp), %xmm5 | |
1101 | adcx %rax, %r8 | |
1102 | ||
1103 | mulx 24($ap), %rax, %r11 | |
1104 | pslldq \$4, %xmm5 | |
1105 | adcx %rbx, %r9 | |
1106 | ||
1107 | mulx 32($ap), %rbx, %r12 | |
1108 | por %xmm5, %xmm4 | |
1109 | adcx %rax, %r10 | |
1110 | ||
1111 | mulx 40($ap), %rax, %r13 | |
1112 | adcx %rbx, %r11 | |
1113 | ||
1114 | mulx 48($ap), %rbx, %r14 | |
1115 | lea 128(%rbp), %rbp | |
1116 | adcx %rax, %r12 | |
1117 | ||
1118 | mulx 56($ap), %rax, %r15 | |
1119 | movq %xmm4, %rdx | |
1120 | adcx %rbx, %r13 | |
1121 | adcx %rax, %r14 | |
1122 | mov %r8, %rbx | |
1123 | adcx %rdi, %r15 # %rdi is 0 | |
1124 | ||
1125 | mov \$-7, %rcx | |
1126 | jmp .Loop_mulx_gather | |
1127 | ||
1128 | .align 32 | |
1129 | .Loop_mulx_gather: | |
1130 | mulx ($ap), %rax, %r8 | |
1131 | adcx %rax, %rbx | |
1132 | adox %r9, %r8 | |
1133 | ||
1134 | mulx 8($ap), %rax, %r9 | |
1135 | .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 | |
1136 | adcx %rax, %r8 | |
1137 | adox %r10, %r9 | |
0b4bb91d | 1138 | |
87954638 AP |
1139 | mulx 16($ap), %rax, %r10 |
1140 | movd 64(%rbp), %xmm5 | |
1141 | lea 128(%rbp), %rbp | |
1142 | adcx %rax, %r9 | |
1143 | adox %r11, %r10 | |
1144 | ||
1145 | .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 | |
1146 | pslldq \$4, %xmm5 | |
1147 | por %xmm5, %xmm4 | |
1148 | adcx %rax, %r10 | |
1149 | adox %r12, %r11 | |
1150 | ||
1151 | mulx 32($ap), %rax, %r12 | |
1152 | adcx %rax, %r11 | |
1153 | adox %r13, %r12 | |
1154 | ||
1155 | mulx 40($ap), %rax, %r13 | |
1156 | adcx %rax, %r12 | |
1157 | adox %r14, %r13 | |
1158 | ||
1159 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
1160 | adcx %rax, %r13 | |
1161 | adox %r15, %r14 | |
1162 | ||
1163 | mulx 56($ap), %rax, %r15 | |
1164 | movq %xmm4, %rdx | |
1165 | mov %rbx, 64(%rsp,%rcx,8) | |
1166 | adcx %rax, %r14 | |
1167 | adox %rdi, %r15 | |
1168 | mov %r8, %rbx | |
1169 | adcx %rdi, %r15 # cf=0 | |
1170 | ||
1171 | inc %rcx # of=0 | |
1172 | jnz .Loop_mulx_gather | |
1173 | ||
1174 | mov %r8, 64(%rsp) | |
1175 | mov %r9, 64+8(%rsp) | |
1176 | mov %r10, 64+16(%rsp) | |
1177 | mov %r11, 64+24(%rsp) | |
1178 | mov %r12, 64+32(%rsp) | |
1179 | mov %r13, 64+40(%rsp) | |
1180 | mov %r14, 64+48(%rsp) | |
1181 | mov %r15, 64+56(%rsp) | |
1182 | ||
1183 | movq %xmm0, $out | |
1184 | movq %xmm1, %rbp | |
1185 | ||
1186 | mov 128(%rsp), %rdx # pull $n0 | |
1187 | mov (%rsp), %r8 | |
1188 | mov 8(%rsp), %r9 | |
1189 | mov 16(%rsp), %r10 | |
1190 | mov 24(%rsp), %r11 | |
1191 | mov 32(%rsp), %r12 | |
1192 | mov 40(%rsp), %r13 | |
1193 | mov 48(%rsp), %r14 | |
1194 | mov 56(%rsp), %r15 | |
1195 | ||
1196 | call __rsaz_512_reducex | |
1197 | ||
1198 | .Lmul_gather_tail: | |
1199 | ___ | |
1200 | $code.=<<___; | |
0b4bb91d AP |
1201 | addq 64(%rsp), %r8 |
1202 | adcq 72(%rsp), %r9 | |
1203 | adcq 80(%rsp), %r10 | |
1204 | adcq 88(%rsp), %r11 | |
1205 | adcq 96(%rsp), %r12 | |
1206 | adcq 104(%rsp), %r13 | |
1207 | adcq 112(%rsp), %r14 | |
1208 | adcq 120(%rsp), %r15 | |
1209 | sbbq %rcx, %rcx | |
1210 | ||
87954638 | 1211 | call __rsaz_512_subtract |
0b4bb91d AP |
1212 | |
1213 | leaq 128+24+48(%rsp), %rax | |
1214 | movq -48(%rax), %r15 | |
1215 | movq -40(%rax), %r14 | |
1216 | movq -32(%rax), %r13 | |
1217 | movq -24(%rax), %r12 | |
1218 | movq -16(%rax), %rbp | |
1219 | movq -8(%rax), %rbx | |
1220 | leaq (%rax), %rsp | |
1221 | .Lmul_gather4_epilogue: | |
1222 | ret | |
1223 | .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 | |
1224 | ___ | |
1225 | } | |
1226 | { | |
# rsaz_512_mul_scatter4: emits a routine that multiplies the 8-limb value at
# $ap by the 8-limb value currently stored at $out (read through %rbp),
# reduces the 16-limb product with __rsaz_512_reduce or __rsaz_512_reducex,
# applies the masked correction in __rsaz_512_subtract (which also stores
# the result to $out), and finally scatters the result into the table.
#
# Table layout used by the scatter: $tbl is first advanced by 4*$pwr bytes;
# each 64-bit limb is then written as two 32-bit halves 64 bytes apart, so
# consecutive limbs are 128 bytes apart.
#
# Stack frame (128+24 bytes): 0..127(%rsp) receive the 16-limb product and
# 128(%rsp) holds $n0.  $out, $mod and the adjusted $tbl are parked in
# %xmm0-%xmm2 because the helpers clobber the general-purpose registers.
#
# NOTE(review): $inp used in the tail is not declared in this block; it
# presumably resolves to a file-level variable defined outside this view
# - confirm.
1227 | my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
1228 | $code.=<<___; | |
fd8ad019 | 1229 | .globl rsaz_512_mul_scatter4 |
0b4bb91d AP |
1230 | .type rsaz_512_mul_scatter4,\@function,6 |
1231 | .align 32 | |
1232 | rsaz_512_mul_scatter4: | |
1233 | push %rbx | |
1234 | push %rbp | |
1235 | push %r12 | |
1236 | push %r13 | |
1237 | push %r14 | |
1238 | push %r15 | |
1239 | ||
1240 | subq \$128+24, %rsp | |
1241 | .Lmul_scatter4_body: | |
1242 | leaq ($tbl,$pwr,4), $tbl | |
1243 | movq $out, %xmm0 # off-load arguments | |
1244 | movq $mod, %xmm1 | |
1245 | movq $tbl, %xmm2 | |
1246 | movq $n0, 128(%rsp) | |
1247 | ||
1248 | movq $out, %rbp | |
87954638 AP |
1249 | ___ |
# Runtime dispatch (only generated when $addx is set): branch to the
# MULX/ADCX/ADOX path when OPENSSL_ia32cap_P reports both feature bits.
1250 | $code.=<<___ if ($addx); | |
1251 | movl \$0x80100,%r11d | |
1252 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
1253 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
1254 | je .Lmulx_scatter | |
1255 | ___ | |
# Legacy MULQ path: multiply, reload the low eight limbs of the product from
# the frame and reduce them.
1256 | $code.=<<___; | |
1257 | movq ($out),%rbx # pass b[0] | |
fd8ad019 | 1258 | call __rsaz_512_mul |
0b4bb91d AP |
1259 | |
1260 | movq %xmm0, $out | |
1261 | movq %xmm1, %rbp | |
1262 | ||
1263 | movq (%rsp), %r8 | |
1264 | movq 8(%rsp), %r9 | |
1265 | movq 16(%rsp), %r10 | |
1266 | movq 24(%rsp), %r11 | |
1267 | movq 32(%rsp), %r12 | |
1268 | movq 40(%rsp), %r13 | |
1269 | movq 48(%rsp), %r14 | |
1270 | movq 56(%rsp), %r15 | |
1271 | ||
87954638 AP |
1272 | call __rsaz_512_reduce |
1273 | ___ | |
# MULX/ADX path; the legacy path above jumps over it to the shared tail.
# __rsaz_512_reducex additionally expects n0 preloaded in %rdx.
1274 | $code.=<<___ if ($addx); | |
1275 | jmp .Lmul_scatter_tail | |
1276 | ||
1277 | .align 32 | |
1278 | .Lmulx_scatter: | |
1279 | movq ($out), %rdx # pass b[0] | |
1280 | call __rsaz_512_mulx | |
1281 | ||
1282 | movq %xmm0, $out | |
1283 | movq %xmm1, %rbp | |
1284 | ||
1285 | movq 128(%rsp), %rdx # pull $n0 | |
1286 | movq (%rsp), %r8 | |
1287 | movq 8(%rsp), %r9 | |
1288 | movq 16(%rsp), %r10 | |
1289 | movq 24(%rsp), %r11 | |
1290 | movq 32(%rsp), %r12 | |
1291 | movq 40(%rsp), %r13 | |
1292 | movq 48(%rsp), %r14 | |
1293 | movq 56(%rsp), %r15 | |
1294 | ||
1295 | call __rsaz_512_reducex | |
0b4bb91d | 1296 | |
87954638 AP |
1297 | .Lmul_scatter_tail: |
1298 | ___ | |
# Shared tail: add the upper eight limbs of the product (64..120(%rsp)) to
# the reduced value, turn the final carry into an all-ones/zero mask in %rcx
# via sbbq, let __rsaz_512_subtract apply the masked correction, scatter the
# limbs left in %r8-%r15 into the table, then restore the callee-saved
# registers and release the frame.
1299 | $code.=<<___; | |
0b4bb91d AP |
1300 | addq 64(%rsp), %r8 |
1301 | adcq 72(%rsp), %r9 | |
1302 | adcq 80(%rsp), %r10 | |
1303 | adcq 88(%rsp), %r11 | |
1304 | adcq 96(%rsp), %r12 | |
1305 | adcq 104(%rsp), %r13 | |
1306 | adcq 112(%rsp), %r14 | |
1307 | adcq 120(%rsp), %r15 | |
1308 | movq %xmm2, $inp | |
1309 | sbbq %rcx, %rcx | |
1310 | ||
87954638 | 1311 | call __rsaz_512_subtract |
0b4bb91d AP |
1312 | |
1313 | movl %r8d, 64*0($inp) # scatter | |
1314 | shrq \$32, %r8 | |
1315 | movl %r9d, 64*2($inp) | |
1316 | shrq \$32, %r9 | |
1317 | movl %r10d, 64*4($inp) | |
1318 | shrq \$32, %r10 | |
1319 | movl %r11d, 64*6($inp) | |
1320 | shrq \$32, %r11 | |
1321 | movl %r12d, 64*8($inp) | |
1322 | shrq \$32, %r12 | |
1323 | movl %r13d, 64*10($inp) | |
1324 | shrq \$32, %r13 | |
1325 | movl %r14d, 64*12($inp) | |
1326 | shrq \$32, %r14 | |
1327 | movl %r15d, 64*14($inp) | |
1328 | shrq \$32, %r15 | |
1329 | movl %r8d, 64*1($inp) | |
1330 | movl %r9d, 64*3($inp) | |
1331 | movl %r10d, 64*5($inp) | |
1332 | movl %r11d, 64*7($inp) | |
1333 | movl %r12d, 64*9($inp) | |
1334 | movl %r13d, 64*11($inp) | |
1335 | movl %r14d, 64*13($inp) | |
1336 | movl %r15d, 64*15($inp) | |
1337 | ||
1338 | leaq 128+24+48(%rsp), %rax | |
1339 | movq -48(%rax), %r15 | |
1340 | movq -40(%rax), %r14 | |
1341 | movq -32(%rax), %r13 | |
1342 | movq -24(%rax), %r12 | |
1343 | movq -16(%rax), %rbp | |
1344 | movq -8(%rax), %rbx | |
1345 | leaq (%rax), %rsp | |
1346 | .Lmul_scatter4_epilogue: | |
1347 | ret | |
1348 | .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 | |
1349 | ___ | |
1350 | } | |
1351 | { | |
# rsaz_512_mul_by_one: emits a routine that loads the eight 64-bit limbs at
# $inp into %r8-%r15, runs them through __rsaz_512_reduce (or
# __rsaz_512_reducex when MULX/ADX are available) with modulus $mod and
# constant $n0, and stores the eight resulting limbs to $out.
#
# Unlike rsaz_512_mul_scatter4 it does not call __rsaz_512_subtract
# afterwards.  $mod is moved to %rbp and $n0 to 128(%rsp) because that is
# where the reduce helpers expect them; the seven movdqa stores zero the
# first 112 bytes of the frame.
1352 | my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); | |
1353 | $code.=<<___; | |
1354 | .globl rsaz_512_mul_by_one | |
1355 | .type rsaz_512_mul_by_one,\@function,4 | |
1356 | .align 32 | |
1357 | rsaz_512_mul_by_one: | |
1358 | push %rbx | |
1359 | push %rbp | |
1360 | push %r12 | |
1361 | push %r13 | |
1362 | push %r14 | |
1363 | push %r15 | |
1364 | ||
1365 | subq \$128+24, %rsp | |
1366 | .Lmul_by_one_body: | |
87954638 AP |
1367 | ___ |
# The capability word is fetched early into %eax; it is tested further down,
# after the input limbs have been loaded.
1368 | $code.=<<___ if ($addx); | |
1369 | movl OPENSSL_ia32cap_P+8(%rip),%eax | |
1370 | ___ | |
1371 | $code.=<<___; | |
0b4bb91d AP |
1372 | movq $mod, %rbp # reassign argument |
1373 | movq $n0, 128(%rsp) | |
1374 | ||
1375 | movq ($inp), %r8 | |
1376 | pxor %xmm0, %xmm0 | |
1377 | movq 8($inp), %r9 | |
1378 | movq 16($inp), %r10 | |
1379 | movq 24($inp), %r11 | |
1380 | movq 32($inp), %r12 | |
1381 | movq 40($inp), %r13 | |
1382 | movq 48($inp), %r14 | |
1383 | movq 56($inp), %r15 | |
1384 | ||
1385 | movdqa %xmm0, (%rsp) | |
1386 | movdqa %xmm0, 16(%rsp) | |
1387 | movdqa %xmm0, 32(%rsp) | |
1388 | movdqa %xmm0, 48(%rsp) | |
1389 | movdqa %xmm0, 64(%rsp) | |
1390 | movdqa %xmm0, 80(%rsp) | |
1391 | movdqa %xmm0, 96(%rsp) | |
87954638 AP |
1392 | ___ |
# Runtime dispatch to the MULX/ADX reduction when both feature bits are set.
1393 | $code.=<<___ if ($addx); | |
1394 | andl \$0x80100,%eax | |
1395 | cmpl \$0x80100,%eax # check for MULX and ADO/CX | |
1396 | je .Lby_one_callx | |
1397 | ___ | |
1398 | $code.=<<___; | |
1399 | call __rsaz_512_reduce | |
1400 | ___ | |
# MULX/ADX variant: __rsaz_512_reducex wants n0 preloaded in %rdx.
1401 | $code.=<<___ if ($addx); | |
1402 | jmp .Lby_one_tail | |
1403 | .align 32 | |
1404 | .Lby_one_callx: | |
1405 | movq 128(%rsp), %rdx # pull $n0 | |
1406 | call __rsaz_512_reducex | |
1407 | .Lby_one_tail: | |
1408 | ___ | |
# Store the reduced limbs, restore callee-saved registers, release the frame.
1409 | $code.=<<___; | |
0b4bb91d AP |
1410 | movq %r8, ($out) |
1411 | movq %r9, 8($out) | |
1412 | movq %r10, 16($out) | |
1413 | movq %r11, 24($out) | |
1414 | movq %r12, 32($out) | |
1415 | movq %r13, 40($out) | |
1416 | movq %r14, 48($out) | |
1417 | movq %r15, 56($out) | |
1418 | ||
1419 | leaq 128+24+48(%rsp), %rax | |
1420 | movq -48(%rax), %r15 | |
1421 | movq -40(%rax), %r14 | |
1422 | movq -32(%rax), %r13 | |
1423 | movq -24(%rax), %r12 | |
1424 | movq -16(%rax), %rbp | |
1425 | movq -8(%rax), %rbx | |
1426 | leaq (%rax), %rsp | |
1427 | .Lmul_by_one_epilogue: | |
1428 | ret | |
1429 | .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one | |
1430 | ___ | |
1431 | } | |
87954638 | 1432 | { # __rsaz_512_reduce |
0b4bb91d AP |
1433 | # |
1434 | # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 | |
1435 | # output: %r8-%r15 | |
1436 | # clobbers: everything except %rbp and %rdi | |
#
# The 128(%rsp) offset above is relative to the caller's frame; inside the
# routine n0 is addressed as 128+8(%rsp) because the call instruction pushed
# the return address.
#
# The loop runs eight passes (%ecx counts down from 8).  Each pass
# multiplies the eight modulus limbs at 0..56(%rbp) by %rbx, the low 64 bits
# of (%r8 * n0), and folds the products into %r8-%r15, moving the
# accumulator down by one limb.  The multiplier for the next pass is
# computed in %rsi in the middle of the pass, once the new %r8 is final.
# negq %r8 is used only for its carry flag (set when %r8 is non-zero).
1437 | $code.=<<___; | |
87954638 | 1438 | .type __rsaz_512_reduce,\@abi-omnipotent |
0b4bb91d | 1439 | .align 32 |
87954638 | 1440 | __rsaz_512_reduce: |
0b4bb91d AP |
1441 | movq %r8, %rbx |
1442 | imulq 128+8(%rsp), %rbx | |
1443 | movq 0(%rbp), %rax | |
1444 | movl \$8, %ecx | |
1445 | jmp .Lreduction_loop | |
1446 | ||
1447 | .align 32 | |
1448 | .Lreduction_loop: | |
1449 | mulq %rbx | |
1450 | movq 8(%rbp), %rax | |
1451 | negq %r8 | |
1452 | movq %rdx, %r8 | |
1453 | adcq \$0, %r8 | |
1454 | ||
1455 | mulq %rbx | |
1456 | addq %rax, %r9 | |
1457 | movq 16(%rbp), %rax | |
1458 | adcq \$0, %rdx | |
1459 | addq %r9, %r8 | |
1460 | movq %rdx, %r9 | |
1461 | adcq \$0, %r9 | |
1462 | ||
1463 | mulq %rbx | |
1464 | addq %rax, %r10 | |
1465 | movq 24(%rbp), %rax | |
1466 | adcq \$0, %rdx | |
1467 | addq %r10, %r9 | |
1468 | movq %rdx, %r10 | |
1469 | adcq \$0, %r10 | |
1470 | ||
1471 | mulq %rbx | |
1472 | addq %rax, %r11 | |
1473 | movq 32(%rbp), %rax | |
1474 | adcq \$0, %rdx | |
1475 | addq %r11, %r10 | |
1476 | movq 128+8(%rsp), %rsi | |
87954638 AP |
1477 | #movq %rdx, %r11 |
1478 | #adcq \$0, %r11 | |
1479 | adcq \$0, %rdx | |
0b4bb91d | 1480 | movq %rdx, %r11 |
0b4bb91d AP |
1481 | |
1482 | mulq %rbx | |
1483 | addq %rax, %r12 | |
1484 | movq 40(%rbp), %rax | |
1485 | adcq \$0, %rdx | |
1486 | imulq %r8, %rsi | |
1487 | addq %r12, %r11 | |
1488 | movq %rdx, %r12 | |
1489 | adcq \$0, %r12 | |
1490 | ||
1491 | mulq %rbx | |
1492 | addq %rax, %r13 | |
1493 | movq 48(%rbp), %rax | |
1494 | adcq \$0, %rdx | |
1495 | addq %r13, %r12 | |
1496 | movq %rdx, %r13 | |
1497 | adcq \$0, %r13 | |
1498 | ||
1499 | mulq %rbx | |
1500 | addq %rax, %r14 | |
1501 | movq 56(%rbp), %rax | |
1502 | adcq \$0, %rdx | |
1503 | addq %r14, %r13 | |
1504 | movq %rdx, %r14 | |
1505 | adcq \$0, %r14 | |
1506 | ||
1507 | mulq %rbx | |
1508 | movq %rsi, %rbx | |
1509 | addq %rax, %r15 | |
1510 | movq 0(%rbp), %rax | |
1511 | adcq \$0, %rdx | |
1512 | addq %r15, %r14 | |
1513 | movq %rdx, %r15 | |
1514 | adcq \$0, %r15 | |
1515 | ||
1516 | decl %ecx | |
1517 | jne .Lreduction_loop | |
87954638 AP |
1518 | |
1519 | ret | |
1520 | .size __rsaz_512_reduce,.-__rsaz_512_reduce | |
0b4bb91d | 1521 | ___ |
87954638 AP |
1522 | } |
1523 | if ($addx) { | |
1524 | # __rsaz_512_reducex | |
1525 | # | |
1526 | # input: %r8-%r15, %rbp - mod, %rdx - n0, 128(%rsp) - n0 | |
1527 | # output: %r8-%r15 | |
1528 | # clobbers: everything except %rbp and %rdi | |
#
# MULX/ADCX/ADOX counterpart of __rsaz_512_reduce; it is only generated
# when $addx is set.  The caller must preload n0 into %rdx: the load at the
# top of the routine is commented out, and the first instruction multiplies
# %rdx by %r8 directly.  Inside the loop n0 is read again from 128+8(%rsp)
# (caller's 128(%rsp) plus the pushed return address) to compute the
# multiplier for the next pass.
#
# %rsi is zeroed and serves as the zero operand that terminates the two
# carry chains; the xorq also clears CF and OF before the first pass.
0b4bb91d | 1529 | $code.=<<___; |
87954638 AP |
1530 | .type __rsaz_512_reducex,\@abi-omnipotent |
1531 | .align 32 | |
1532 | __rsaz_512_reducex: | |
1533 | #movq 128+8(%rsp), %rdx # pull $n0 | |
0b4bb91d | 1534 | imulq %r8, %rdx |
87954638 | 1535 | xorq %rsi, %rsi # cf=0,of=0 |
0b4bb91d | 1536 | movl \$8, %ecx |
87954638 | 1537 | jmp .Lreduction_loopx |
0b4bb91d AP |
1538 | |
1539 | .align 32 | |
87954638 AP |
1540 | .Lreduction_loopx: |
1541 | mov %r8, %rbx | |
0b4bb91d | 1542 | mulx 0(%rbp), %rax, %r8 |
87954638 AP |
1543 | adcx %rbx, %rax |
1544 | adox %r9, %r8 | |
0b4bb91d AP |
1545 | |
1546 | mulx 8(%rbp), %rax, %r9 | |
87954638 AP |
1547 | adcx %rax, %r8 |
1548 | adox %r10, %r9 | |
1549 | ||
1550 | mulx 16(%rbp), %rbx, %r10 | |
1551 | adcx %rbx, %r9 | |
1552 | adox %r11, %r10 | |
1553 | ||
1554 | mulx 24(%rbp), %rbx, %r11 | |
1555 | adcx %rbx, %r10 | |
1556 | adox %r12, %r11 | |
1557 | ||
1558 | .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 | |
1559 | mov %rdx, %rax | |
1560 | mov %r8, %rdx | |
1561 | adcx %rbx, %r11 | |
1562 | adox %r13, %r12 | |
1563 | ||
1564 | mulx 128+8(%rsp), %rbx, %rdx | |
1565 | mov %rax, %rdx | |
0b4bb91d AP |
1566 | |
1567 | mulx 40(%rbp), %rax, %r13 | |
87954638 AP |
1568 | adcx %rax, %r12 |
1569 | adox %r14, %r13 | |
0b4bb91d | 1570 | |
87954638 AP |
1571 | .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 |
1572 | adcx %rax, %r13 | |
1573 | adox %r15, %r14 | |
0b4bb91d AP |
1574 | |
1575 | mulx 56(%rbp), %rax, %r15 | |
1576 | mov %rbx, %rdx | |
87954638 AP |
1577 | adcx %rax, %r14 |
1578 | adox %rsi, %r15 # %rsi is 0 | |
1579 | adcx %rsi, %r15 # cf=0 | |
1580 | ||
1581 | decl %ecx # of=0 | |
1582 | jne .Lreduction_loopx | |
0b4bb91d | 1583 | |
0b4bb91d | 1584 | ret |
87954638 | 1585 | .size __rsaz_512_reducex,.-__rsaz_512_reducex |
0b4bb91d AP |
1586 | ___ |
1587 | } | |
87954638 | 1588 | { # __rsaz_512_subtract |
0b4bb91d AP |
1589 | # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask |
1590 | # output: %r8-%r15 and 0..56($out) - input + ((negated $mod) & mask) | |
1591 | # clobbers: everything but %rdi, %rsi and %rbp | |
#
# Branch-free conditional correction.  The input limbs are first stored to
# $out.  The modulus is then negated limb-wise (negq on the lowest limb,
# notq on the other seven), every limb is ANDed with the mask in %rcx, and
# the masked value is added to the stored input with a single carry chain;
# the carry out of the top limb is discarded.  With an all-ones mask this
# amounts to subtracting $mod, with a zero mask the value is unchanged.
# NOTE(review): neg/not only forms the two's complement when the lowest
# modulus limb is non-zero - presumably guaranteed by the callers; confirm.
#
# $out and $mod are not declared in this block; per the header above they
# are expected to name %rdi and %rbp - presumably file-level variables set
# outside this view.
1592 | $code.=<<___; | |
87954638 | 1593 | .type __rsaz_512_subtract,\@abi-omnipotent |
0b4bb91d | 1594 | .align 32 |
87954638 | 1595 | __rsaz_512_subtract: |
0b4bb91d AP |
1596 | movq %r8, ($out) |
1597 | movq %r9, 8($out) | |
1598 | movq %r10, 16($out) | |
1599 | movq %r11, 24($out) | |
1600 | movq %r12, 32($out) | |
1601 | movq %r13, 40($out) | |
1602 | movq %r14, 48($out) | |
1603 | movq %r15, 56($out) | |
1604 | ||
1605 | movq 0($mod), %r8 | |
1606 | movq 8($mod), %r9 | |
1607 | negq %r8 | |
1608 | notq %r9 | |
1609 | andq %rcx, %r8 | |
1610 | movq 16($mod), %r10 | |
1611 | andq %rcx, %r9 | |
1612 | notq %r10 | |
1613 | movq 24($mod), %r11 | |
1614 | andq %rcx, %r10 | |
1615 | notq %r11 | |
1616 | movq 32($mod), %r12 | |
1617 | andq %rcx, %r11 | |
1618 | notq %r12 | |
1619 | movq 40($mod), %r13 | |
1620 | andq %rcx, %r12 | |
1621 | notq %r13 | |
1622 | movq 48($mod), %r14 | |
1623 | andq %rcx, %r13 | |
1624 | notq %r14 | |
1625 | movq 56($mod), %r15 | |
1626 | andq %rcx, %r14 | |
1627 | notq %r15 | |
1628 | andq %rcx, %r15 | |
1629 | ||
1630 | addq ($out), %r8 | |
1631 | adcq 8($out), %r9 | |
1632 | adcq 16($out), %r10 | |
1633 | adcq 24($out), %r11 | |
1634 | adcq 32($out), %r12 | |
1635 | adcq 40($out), %r13 | |
1636 | adcq 48($out), %r14 | |
1637 | adcq 56($out), %r15 | |
1638 | ||
1639 | movq %r8, ($out) | |
1640 | movq %r9, 8($out) | |
1641 | movq %r10, 16($out) | |
1642 | movq %r11, 24($out) | |
1643 | movq %r12, 32($out) | |
1644 | movq %r13, 40($out) | |
1645 | movq %r14, 48($out) | |
1646 | movq %r15, 56($out) | |
1647 | ||
1648 | ret | |
87954638 | 1649 | .size __rsaz_512_subtract,.-__rsaz_512_subtract |
0b4bb91d AP |
1650 | ___ |
1651 | } | |
fd8ad019 | 1652 | { # __rsaz_512_mul |
0b4bb91d AP |
1653 | # |
1654 | # input: %rsi - ap, %rbp - bp, %rbx - b[0] (preloaded by caller) | |
1655 | # output: 16-limb product in the caller's 0..127(%rsp) | |
1656 | # clobbers: everything | |
#
# Schoolbook 8x8-limb multiplication built on MULQ.  The first pass
# multiplies a[0..7] by the b[0] value already in %rbx; the loop then runs
# seven more passes, loading b[1]..b[7] from ($bp) after advancing it by 8.
# Each pass writes one finished low limb through %rdi and keeps the running
# upper eight limbs in %r8-%r15, which are stored after the last pass.
#
# %rdi starts at 8(%rsp) as seen inside the routine, which is the caller's
# 0(%rsp) once the pushed return address is accounted for.
1657 | my ($ap,$bp) = ("%rsi","%rbp"); | |
1658 | $code.=<<___; | |
fd8ad019 | 1659 | .type __rsaz_512_mul,\@abi-omnipotent |
0b4bb91d | 1660 | .align 32 |
fd8ad019 | 1661 | __rsaz_512_mul: |
0b4bb91d AP |
1662 | leaq 8(%rsp), %rdi |
1663 | ||
0b4bb91d AP |
1664 | movq ($ap), %rax |
1665 | mulq %rbx | |
1666 | movq %rax, (%rdi) | |
1667 | movq 8($ap), %rax | |
1668 | movq %rdx, %r8 | |
1669 | ||
1670 | mulq %rbx | |
1671 | addq %rax, %r8 | |
1672 | movq 16($ap), %rax | |
1673 | movq %rdx, %r9 | |
1674 | adcq \$0, %r9 | |
1675 | ||
1676 | mulq %rbx | |
1677 | addq %rax, %r9 | |
1678 | movq 24($ap), %rax | |
1679 | movq %rdx, %r10 | |
1680 | adcq \$0, %r10 | |
1681 | ||
1682 | mulq %rbx | |
1683 | addq %rax, %r10 | |
1684 | movq 32($ap), %rax | |
1685 | movq %rdx, %r11 | |
1686 | adcq \$0, %r11 | |
1687 | ||
1688 | mulq %rbx | |
1689 | addq %rax, %r11 | |
1690 | movq 40($ap), %rax | |
1691 | movq %rdx, %r12 | |
1692 | adcq \$0, %r12 | |
1693 | ||
1694 | mulq %rbx | |
1695 | addq %rax, %r12 | |
1696 | movq 48($ap), %rax | |
1697 | movq %rdx, %r13 | |
1698 | adcq \$0, %r13 | |
1699 | ||
1700 | mulq %rbx | |
1701 | addq %rax, %r13 | |
1702 | movq 56($ap), %rax | |
1703 | movq %rdx, %r14 | |
1704 | adcq \$0, %r14 | |
1705 | ||
1706 | mulq %rbx | |
1707 | addq %rax, %r14 | |
1708 | movq ($ap), %rax | |
1709 | movq %rdx, %r15 | |
1710 | adcq \$0, %r15 | |
1711 | ||
1712 | leaq 8($bp), $bp | |
1713 | leaq 8(%rdi), %rdi | |
1714 | ||
1715 | movl \$7, %ecx | |
1716 | jmp .Loop_mul | |
1717 | ||
1718 | .align 32 | |
1719 | .Loop_mul: | |
1720 | movq ($bp), %rbx | |
1721 | mulq %rbx | |
1722 | addq %rax, %r8 | |
1723 | movq 8($ap), %rax | |
1724 | movq %r8, (%rdi) | |
1725 | movq %rdx, %r8 | |
1726 | adcq \$0, %r8 | |
1727 | ||
1728 | mulq %rbx | |
1729 | addq %rax, %r9 | |
1730 | movq 16($ap), %rax | |
1731 | adcq \$0, %rdx | |
1732 | addq %r9, %r8 | |
1733 | movq %rdx, %r9 | |
1734 | adcq \$0, %r9 | |
1735 | ||
1736 | mulq %rbx | |
1737 | addq %rax, %r10 | |
1738 | movq 24($ap), %rax | |
1739 | adcq \$0, %rdx | |
1740 | addq %r10, %r9 | |
1741 | movq %rdx, %r10 | |
1742 | adcq \$0, %r10 | |
1743 | ||
1744 | mulq %rbx | |
1745 | addq %rax, %r11 | |
1746 | movq 32($ap), %rax | |
1747 | adcq \$0, %rdx | |
1748 | addq %r11, %r10 | |
1749 | movq %rdx, %r11 | |
1750 | adcq \$0, %r11 | |
1751 | ||
1752 | mulq %rbx | |
1753 | addq %rax, %r12 | |
1754 | movq 40($ap), %rax | |
1755 | adcq \$0, %rdx | |
1756 | addq %r12, %r11 | |
1757 | movq %rdx, %r12 | |
1758 | adcq \$0, %r12 | |
1759 | ||
1760 | mulq %rbx | |
1761 | addq %rax, %r13 | |
1762 | movq 48($ap), %rax | |
1763 | adcq \$0, %rdx | |
1764 | addq %r13, %r12 | |
1765 | movq %rdx, %r13 | |
1766 | adcq \$0, %r13 | |
1767 | ||
1768 | mulq %rbx | |
1769 | addq %rax, %r14 | |
1770 | movq 56($ap), %rax | |
1771 | adcq \$0, %rdx | |
1772 | addq %r14, %r13 | |
1773 | movq %rdx, %r14 | |
1774 | leaq 8($bp), $bp | |
1775 | adcq \$0, %r14 | |
1776 | ||
1777 | mulq %rbx | |
1778 | addq %rax, %r15 | |
1779 | movq ($ap), %rax | |
1780 | adcq \$0, %rdx | |
1781 | addq %r15, %r14 | |
1782 | movq %rdx, %r15 | |
1783 | adcq \$0, %r15 | |
1784 | ||
1785 | leaq 8(%rdi), %rdi | |
1786 | ||
1787 | decl %ecx | |
1788 | jnz .Loop_mul | |
1789 | ||
1790 | movq %r8, (%rdi) | |
1791 | movq %r9, 8(%rdi) | |
1792 | movq %r10, 16(%rdi) | |
1793 | movq %r11, 24(%rdi) | |
1794 | movq %r12, 32(%rdi) | |
1795 | movq %r13, 40(%rdi) | |
1796 | movq %r14, 48(%rdi) | |
1797 | movq %r15, 56(%rdi) | |
1798 | ||
1799 | ret | |
fd8ad019 | 1800 | .size __rsaz_512_mul,.-__rsaz_512_mul |
0b4bb91d AP |
1801 | ___ |
1802 | } | |
87954638 AP |
1803 | if ($addx) { |
1804 | # __rsaz_512_mulx | |
1805 | # | |
1806 | # input: %rsi - ap, %rbp - bp, %rdx - b[0] (preloaded by caller) | |
1807 | # output: 16-limb product in the caller's 0..127(%rsp) | |
1808 | # clobbers: everything | |
#
# MULX/ADCX/ADOX counterpart of __rsaz_512_mul; it is only generated when
# $addx is set.  The first pass uses the b[0] value already in %rdx and
# loads b[1] from 8($bp).  The loop counter %rcx runs from -6 up to 0 and
# doubles as the index for fetching the next multiplier from
# 64($bp,%rcx,8) and for storing each finished low limb; the last pass is
# unrolled after the loop because it has no further multiplier to fetch.
#
# $zero (%rdi) is cleared up front and serves as the zero operand that
# terminates the ADCX and ADOX carry chains.  All stores use an extra
# 8-byte displacement to skip the return address pushed by the call.
1809 | my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); | |
1810 | $code.=<<___; | |
1811 | .type __rsaz_512_mulx,\@abi-omnipotent | |
1812 | .align 32 | |
1813 | __rsaz_512_mulx: | |
1814 | mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller | |
1815 | xor $zero, $zero # cf=0,of=0 | |
1816 | ||
1817 | mulx 8($ap), %rax, %r9 | |
1818 | movq %rbx, 8(%rsp) | |
1819 | ||
1820 | mulx 16($ap), %rbx, %r10 | |
1821 | adcx %rax, %r8 | |
1822 | ||
1823 | mulx 24($ap), %rax, %r11 | |
1824 | adcx %rbx, %r9 | |
1825 | ||
1826 | .byte 0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rbx, %r12 | |
1827 | adcx %rax, %r10 | |
1828 | ||
1829 | mulx 40($ap), %rax, %r13 | |
1830 | adcx %rbx, %r11 | |
1831 | ||
1832 | mulx 48($ap), %rbx, %r14 | |
1833 | adcx %rax, %r12 | |
1834 | ||
1835 | mulx 56($ap), %rax, %r15 | |
1836 | mov 8($bp), %rdx | |
1837 | adcx %rbx, %r13 | |
1838 | adcx %rax, %r14 | |
1839 | adcx $zero, %r15 # cf=0 | |
1840 | ||
1841 | mov \$-6, %rcx | |
1842 | jmp .Loop_mulx | |
1843 | ||
1844 | .align 32 | |
1845 | .Loop_mulx: | |
1846 | movq %r8, %rbx | |
1847 | mulx ($ap), %rax, %r8 | |
1848 | adcx %rax, %rbx | |
1849 | adox %r9, %r8 | |
1850 | ||
1851 | mulx 8($ap), %rax, %r9 | |
1852 | adcx %rax, %r8 | |
1853 | adox %r10, %r9 | |
1854 | ||
1855 | mulx 16($ap), %rax, %r10 | |
1856 | adcx %rax, %r9 | |
1857 | adox %r11, %r10 | |
1858 | ||
1859 | mulx 24($ap), %rax, %r11 | |
1860 | adcx %rax, %r10 | |
1861 | adox %r12, %r11 | |
1862 | ||
1863 | .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 | |
1864 | adcx %rax, %r11 | |
1865 | adox %r13, %r12 | |
1866 | ||
1867 | mulx 40($ap), %rax, %r13 | |
1868 | adcx %rax, %r12 | |
1869 | adox %r14, %r13 | |
1870 | ||
1871 | mulx 48($ap), %rax, %r14 | |
1872 | adcx %rax, %r13 | |
1873 | adox %r15, %r14 | |
1874 | ||
1875 | mulx 56($ap), %rax, %r15 | |
1876 | movq 64($bp,%rcx,8), %rdx | |
1877 | movq %rbx, 8+64-8(%rsp,%rcx,8) | |
1878 | adcx %rax, %r14 | |
1879 | adox $zero, %r15 | |
1880 | adcx $zero, %r15 # cf=0 | |
1881 | ||
1882 | inc %rcx # of=0 | |
1883 | jnz .Loop_mulx | |
1884 | ||
1885 | movq %r8, %rbx | |
1886 | mulx ($ap), %rax, %r8 | |
1887 | adcx %rax, %rbx | |
1888 | adox %r9, %r8 | |
1889 | ||
1890 | .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 | |
1891 | adcx %rax, %r8 | |
1892 | adox %r10, %r9 | |
1893 | ||
1894 | .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 | |
1895 | adcx %rax, %r9 | |
1896 | adox %r11, %r10 | |
1897 | ||
1898 | mulx 24($ap), %rax, %r11 | |
1899 | adcx %rax, %r10 | |
1900 | adox %r12, %r11 | |
1901 | ||
1902 | mulx 32($ap), %rax, %r12 | |
1903 | adcx %rax, %r11 | |
1904 | adox %r13, %r12 | |
1905 | ||
1906 | mulx 40($ap), %rax, %r13 | |
1907 | adcx %rax, %r12 | |
1908 | adox %r14, %r13 | |
1909 | ||
1910 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
1911 | adcx %rax, %r13 | |
1912 | adox %r15, %r14 | |
1913 | ||
1914 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 | |
1915 | adcx %rax, %r14 | |
1916 | adox $zero, %r15 | |
1917 | adcx $zero, %r15 | |
1918 | ||
1919 | mov %rbx, 8+64-8(%rsp) | |
1920 | mov %r8, 8+64(%rsp) | |
1921 | mov %r9, 8+64+8(%rsp) | |
1922 | mov %r10, 8+64+16(%rsp) | |
1923 | mov %r11, 8+64+24(%rsp) | |
1924 | mov %r12, 8+64+32(%rsp) | |
1925 | mov %r13, 8+64+40(%rsp) | |
1926 | mov %r14, 8+64+48(%rsp) | |
1927 | mov %r15, 8+64+56(%rsp) | |
1928 | ||
1929 | ret | |
1930 | .size __rsaz_512_mulx,.-__rsaz_512_mulx | |
1931 | ___ | |
1932 | } | |
0b4bb91d AP |
1933 | { |
# rsaz_512_scatter4($out, $inp, $power) and rsaz_512_gather4($out, $inp,
# $power): emit a pair of leaf routines that move one 8-limb value into or
# out of an interleaved table.
#
# Both are declared abi-omnipotent, so the argument registers are chosen
# here according to $win64 rather than being remapped by the translator.
#
# scatter4: advances $out by 4*$power bytes, then for each of the eight
#   64-bit limbs read from $inp writes the low 32 bits at ($out) and the
#   high 32 bits at 64($out), stepping $out by 128 bytes per limb.
# gather4:  the inverse - advances $inp by 4*$power bytes, then rebuilds
#   each limb from the 32-bit words at ($inp) and 64($inp) and stores it to
#   consecutive 8-byte slots at $out.
#
# Scratch registers used: %rax, %r8 and %r9d (loop counter, 8 iterations).
1934 | my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); | |
1935 | $code.=<<___; | |
1936 | .globl rsaz_512_scatter4 | |
1937 | .type rsaz_512_scatter4,\@abi-omnipotent | |
1938 | .align 16 | |
1939 | rsaz_512_scatter4: | |
1940 | leaq ($out,$power,4), $out | |
1941 | movl \$8, %r9d | |
1942 | jmp .Loop_scatter | |
1943 | .align 16 | |
1944 | .Loop_scatter: | |
1945 | movq ($inp), %rax | |
1946 | leaq 8($inp), $inp | |
1947 | movl %eax, ($out) | |
1948 | shrq \$32, %rax | |
1949 | movl %eax, 64($out) | |
1950 | leaq 128($out), $out | |
1951 | decl %r9d | |
1952 | jnz .Loop_scatter | |
1953 | ret | |
1954 | .size rsaz_512_scatter4,.-rsaz_512_scatter4 | |
1955 | ||
1956 | .globl rsaz_512_gather4 | |
1957 | .type rsaz_512_gather4,\@abi-omnipotent | |
1958 | .align 16 | |
1959 | rsaz_512_gather4: | |
1960 | leaq ($inp,$power,4), $inp | |
1961 | movl \$8, %r9d | |
1962 | jmp .Loop_gather | |
1963 | .align 16 | |
1964 | .Loop_gather: | |
1965 | movl ($inp), %eax | |
1966 | movl 64($inp), %r8d | |
1967 | leaq 128($inp), $inp | |
1968 | shlq \$32, %r8 | |
1969 | or %r8, %rax | |
1970 | movq %rax, ($out) | |
1971 | leaq 8($out), $out | |
1972 | decl %r9d | |
1973 | jnz .Loop_gather | |
1974 | ret | |
1975 | .size rsaz_512_gather4,.-rsaz_512_gather4 | |
1976 | ___ | |
1977 | } | |
1978 | ||
1979 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
1980 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
#
# Win64-only structured-exception support, generated when $win64 is set.
#
# se_handler is shared by all five public routines registered in .pdata
# below.  HandlerData[0]/[1] from each .xdata entry give the routine's
# "body" and "epilogue" labels.  When the faulting Rip lies between them
# the frame of 128+24 bytes plus six pushed registers is still live, so the
# handler recovers %rbx, %rbp and %r12-%r15 from the stack and writes them
# back into the CONTEXT.  In every case it then restores Rsp/Rsi/Rdi in the
# CONTEXT, copies the CONTEXT to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
#
# The frame size used here (128+24+48) must match the prologues of the
# routines listed in .pdata.
#
# The .size directive for the handler names se_handler, matching its label;
# it previously referred to a symbol called sqr_handler, which is not
# defined in the code visible here.
1981 | if ($win64) { | |
1982 | $rec="%rcx"; | |
1983 | $frame="%rdx"; | |
1984 | $context="%r8"; | |
1985 | $disp="%r9"; | |
1986 | ||
1987 | $code.=<<___; | |
1988 | .extern __imp_RtlVirtualUnwind | |
1989 | .type se_handler,\@abi-omnipotent | |
1990 | .align 16 | |
1991 | se_handler: | |
1992 | push %rsi | |
1993 | push %rdi | |
1994 | push %rbx | |
1995 | push %rbp | |
1996 | push %r12 | |
1997 | push %r13 | |
1998 | push %r14 | |
1999 | push %r15 | |
2000 | pushfq | |
2001 | sub \$64,%rsp | |
2002 | ||
2003 | mov 120($context),%rax # pull context->Rax | |
2004 | mov 248($context),%rbx # pull context->Rip | |
2005 | ||
2006 | mov 8($disp),%rsi # disp->ImageBase | |
2007 | mov 56($disp),%r11 # disp->HandlerData | |
2008 | ||
2009 | mov 0(%r11),%r10d # HandlerData[0] | |
2010 | lea (%rsi,%r10),%r10 # end of prologue label | |
2011 | cmp %r10,%rbx # context->Rip<end of prologue label | |
2012 | jb .Lcommon_seh_tail | |
2013 | ||
2014 | mov 152($context),%rax # pull context->Rsp | |
2015 | ||
2016 | mov 4(%r11),%r10d # HandlerData[1] | |
2017 | lea (%rsi,%r10),%r10 # epilogue label | |
2018 | cmp %r10,%rbx # context->Rip>=epilogue label | |
2019 | jae .Lcommon_seh_tail | |
2020 | ||
2021 | lea 128+24+48(%rax),%rax | |
2022 | ||
2023 | mov -8(%rax),%rbx | |
2024 | mov -16(%rax),%rbp | |
2025 | mov -24(%rax),%r12 | |
2026 | mov -32(%rax),%r13 | |
2027 | mov -40(%rax),%r14 | |
2028 | mov -48(%rax),%r15 | |
2029 | mov %rbx,144($context) # restore context->Rbx | |
2030 | mov %rbp,160($context) # restore context->Rbp | |
2031 | mov %r12,216($context) # restore context->R12 | |
2032 | mov %r13,224($context) # restore context->R13 | |
2033 | mov %r14,232($context) # restore context->R14 | |
2034 | mov %r15,240($context) # restore context->R15 | |
2035 | ||
2036 | .Lcommon_seh_tail: | |
2037 | mov 8(%rax),%rdi | |
2038 | mov 16(%rax),%rsi | |
2039 | mov %rax,152($context) # restore context->Rsp | |
2040 | mov %rsi,168($context) # restore context->Rsi | |
2041 | mov %rdi,176($context) # restore context->Rdi | |
2042 | ||
2043 | mov 40($disp),%rdi # disp->ContextRecord | |
2044 | mov $context,%rsi # context | |
2045 | mov \$154,%ecx # sizeof(CONTEXT) | |
2046 | .long 0xa548f3fc # cld; rep movsq | |
2047 | ||
2048 | mov $disp,%rsi | |
2049 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
2050 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
2051 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
2052 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
2053 | mov 40(%rsi),%r10 # disp->ContextRecord | |
2054 | lea 56(%rsi),%r11 # &disp->HandlerData | |
2055 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
2056 | mov %r10,32(%rsp) # arg5 | |
2057 | mov %r11,40(%rsp) # arg6 | |
2058 | mov %r12,48(%rsp) # arg7 | |
2059 | mov %rcx,56(%rsp) # arg8, (NULL) | |
2060 | call *__imp_RtlVirtualUnwind(%rip) | |
2061 | ||
2062 | mov \$1,%eax # ExceptionContinueSearch | |
2063 | add \$64,%rsp | |
2064 | popfq | |
2065 | pop %r15 | |
2066 | pop %r14 | |
2067 | pop %r13 | |
2068 | pop %r12 | |
2069 | pop %rbp | |
2070 | pop %rbx | |
2071 | pop %rdi | |
2072 | pop %rsi | |
2073 | ret | |
2074 | .size se_handler,.-se_handler | |
2075 | ||
2076 | .section .pdata | |
2077 | .align 4 | |
2078 | .rva .LSEH_begin_rsaz_512_sqr | |
2079 | .rva .LSEH_end_rsaz_512_sqr | |
2080 | .rva .LSEH_info_rsaz_512_sqr | |
2081 | ||
2082 | .rva .LSEH_begin_rsaz_512_mul | |
2083 | .rva .LSEH_end_rsaz_512_mul | |
2084 | .rva .LSEH_info_rsaz_512_mul | |
2085 | ||
2086 | .rva .LSEH_begin_rsaz_512_mul_gather4 | |
2087 | .rva .LSEH_end_rsaz_512_mul_gather4 | |
2088 | .rva .LSEH_info_rsaz_512_mul_gather4 | |
2089 | ||
2090 | .rva .LSEH_begin_rsaz_512_mul_scatter4 | |
2091 | .rva .LSEH_end_rsaz_512_mul_scatter4 | |
2092 | .rva .LSEH_info_rsaz_512_mul_scatter4 | |
2093 | ||
2094 | .rva .LSEH_begin_rsaz_512_mul_by_one | |
2095 | .rva .LSEH_end_rsaz_512_mul_by_one | |
2096 | .rva .LSEH_info_rsaz_512_mul_by_one | |
2097 | ||
2098 | .section .xdata | |
2099 | .align 8 | |
2100 | .LSEH_info_rsaz_512_sqr: | |
2101 | .byte 9,0,0,0 | |
2102 | .rva se_handler | |
2103 | .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] | |
2104 | .LSEH_info_rsaz_512_mul: | |
2105 | .byte 9,0,0,0 | |
2106 | .rva se_handler | |
2107 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | |
2108 | .LSEH_info_rsaz_512_mul_gather4: | |
2109 | .byte 9,0,0,0 | |
2110 | .rva se_handler | |
2111 | .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] | |
2112 | .LSEH_info_rsaz_512_mul_scatter4: | |
2113 | .byte 9,0,0,0 | |
2114 | .rva se_handler | |
2115 | .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] | |
2116 | .LSEH_info_rsaz_512_mul_by_one: | |
2117 | .byte 9,0,0,0 | |
2118 | .rva se_handler | |
2119 | .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] | |
2120 | ___ | |
2121 | } | |
2122 | ||
# Final stage: evaluate every `...` expression embedded in the generated
# text, write the result to STDOUT and close it.
2123 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
2124 | print $code; | |
# Buffered write errors only surface when the handle is closed, and STDOUT
# is presumably redirected into the assembler translator earlier in the
# file (outside this view), in which case close also reports a failing
# child.  Die instead of silently leaving truncated or missing output.
2125 | close STDOUT or die "error closing STDOUT: $!"; | |