1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 ##############################################################################
11 # #
12 # Copyright (c) 2012, Intel Corporation #
13 # #
14 # All rights reserved. #
15 # #
16 # Redistribution and use in source and binary forms, with or without #
17 # modification, are permitted provided that the following conditions are #
18 # met: #
19 # #
20 # * Redistributions of source code must retain the above copyright #
21 # notice, this list of conditions and the following disclaimer. #
22 # #
23 # * Redistributions in binary form must reproduce the above copyright #
24 # notice, this list of conditions and the following disclaimer in the #
25 # documentation and/or other materials provided with the #
26 # distribution. #
27 # #
28 # * Neither the name of the Intel Corporation nor the names of its #
29 # contributors may be used to endorse or promote products derived from #
30 # this software without specific prior written permission. #
31 # #
32 # #
33 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
34 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
35 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
36 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
37 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
38 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
39 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
40 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
41 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
42 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
43 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
44 # #
45 ##############################################################################
46 # Developers and authors: #
47 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
48 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
49 # Israel Development Center, Haifa, Israel #
50 # (2) University of Haifa #
51 ##############################################################################
52 # Reference: #
53 # [1] S. Gueron, "Efficient Software Implementations of Modular #
54 # Exponentiation", http://eprint.iacr.org/2011/239 #
55 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
56 # IEEE Proceedings of 9th International Conference on Information #
57 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
58 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
59 # Journal of Cryptographic Engineering 2:31-43 (2012). #
60 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
61 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
62 # RSA1024 and RSA2048 on x86_64 platforms", #
63 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
64 ##############################################################################
65
66 # While the original submission covers 512- and 1024-bit exponentiation,
67 # this module is limited to the 512-bit version only (and as such
68 # accelerates RSA1024 sign). This is because the improvement for longer
69 # keys is not high enough to justify the effort; the highest measured
70 # gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
71 # at the time of this writing!] Nor does this module implement a
72 # "monolithic" complete exponentiation jumbo-subroutine; it adheres
73 # to a more modular mixture of C and assembly. And it's optimized even
74 # for processors other than the Intel Core family (see the table below
75 # for improvement coefficients).
76 # <appro@openssl.org>
77 #
78 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
79 # ----------------+---------------------------
80 # Opteron +13% |+5% +20%
81 # Bulldozer -0% |-1% +10%
82 # P4 +11% |+7% +8%
83 # Westmere +5% |+14% +17%
84 # Sandy Bridge +2% |+12% +29%
85 # Ivy Bridge +1% |+11% +35%
86 # Haswell(**) -0% |+12% +39%
87 # Atom +13% |+11% +4%
88 # VIA Nano +70% |+9% +25%
89 #
90 # (*) rsax engine and fips numbers are presented for reference
91 # purposes;
92 # (**) MULX was attempted, but found to give only marginal improvement;
93
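#
# The module exposes a set of 512-bit (8x64-bit limb) primitives -
# rsaz_512_sqr, rsaz_512_mul, rsaz_512_mul_gather4, rsaz_512_mul_scatter4,
# rsaz_512_mul_by_one, rsaz_512_scatter4 and rsaz_512_gather4 - and leaves
# it to the C glue (rsaz_exp.c) to string them together into the actual
# windowed Montgomery exponentiation.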
94 $flavour = shift;
95 $output = shift;
96 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
97
98 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
99
100 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
101 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
102 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
103 die "can't locate x86_64-xlate.pl";
104
105 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
106 *STDOUT=*OUT;
107
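# Probe the assembler for ADCX/ADOX/MULX support: the checks below ask the
# GNU assembler, nasm, ml64 or clang's integrated assembler for its version
# and set $addx when it is new enough to encode these instructions, in
# which case the alternative ADX/MULX code paths are emitted as well.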
108 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
109 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
110 $addx = ($1>=2.23);
111 }
112
113 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
114 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
115 $addx = ($1>=2.10);
116 }
117
118 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
119 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
120 $addx = ($1>=12);
121 }
122
123 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
124 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
125 $addx = ($ver>=3.03);
126 }
127
128 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
129 {
130 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
131
132 $code.=<<___;
133 .text
134
135 .extern OPENSSL_ia32cap_P
136
137 .globl rsaz_512_sqr
138 .type rsaz_512_sqr,\@function,5
139 .align 32
140 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
141 push %rbx
142 push %rbp
143 push %r12
144 push %r13
145 push %r14
146 push %r15
147
148 subq \$128+24, %rsp
149 .Lsqr_body:
150 movq $mod, %rbp # common argument
151 movq ($inp), %rdx
152 movq 8($inp), %rax
153 movq $n0, 128(%rsp)
154 ___
155 $code.=<<___ if ($addx);
156 movl \$0x80100,%r11d
157 andl OPENSSL_ia32cap_P+8(%rip),%r11d
158 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
159 je .Loop_sqrx
160 ___
161 $code.=<<___;
162 jmp .Loop_sqr
163
164 .align 32
165 .Loop_sqr:
166 movl $times,128+8(%rsp)
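	# The eight "iteration" blocks below implement one schoolbook squaring
	# of the 8-limb input: each block accumulates the cross products
	# a[i]*a[j] (j>i), doubles the running value (the add/adc and lea-by-2
	# sequences annotated "#shld" in the comments) and folds in the
	# diagonal square a[i]*a[i].  Roughly, as illustrative pseudo-code
	# only (not part of the generated code):
	#
	#	for (i = 0; i < 8; i++)
	#		for (j = i+1; j < 8; j++)
	#			acc[i+j] += a[i]*a[j];	# cross products
	#	acc *= 2;				# double them
	#	for (i = 0; i < 8; i++)
	#		acc[2*i] += a[i]*a[i];		# add the squares
	#
	# The outer .Loop_sqr simply repeats this, feeding the reduced result
	# back in, as many times as the caller's fifth argument requests.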
167 #first iteration
168 movq %rdx, %rbx
169 mulq %rdx
170 movq %rax, %r8
171 movq 16($inp), %rax
172 movq %rdx, %r9
173
174 mulq %rbx
175 addq %rax, %r9
176 movq 24($inp), %rax
177 movq %rdx, %r10
178 adcq \$0, %r10
179
180 mulq %rbx
181 addq %rax, %r10
182 movq 32($inp), %rax
183 movq %rdx, %r11
184 adcq \$0, %r11
185
186 mulq %rbx
187 addq %rax, %r11
188 movq 40($inp), %rax
189 movq %rdx, %r12
190 adcq \$0, %r12
191
192 mulq %rbx
193 addq %rax, %r12
194 movq 48($inp), %rax
195 movq %rdx, %r13
196 adcq \$0, %r13
197
198 mulq %rbx
199 addq %rax, %r13
200 movq 56($inp), %rax
201 movq %rdx, %r14
202 adcq \$0, %r14
203
204 mulq %rbx
205 addq %rax, %r14
206 movq %rbx, %rax
207 movq %rdx, %r15
208 adcq \$0, %r15
209
210 addq %r8, %r8 #shlq \$1, %r8
211 movq %r9, %rcx
212 adcq %r9, %r9 #shld \$1, %r8, %r9
213
214 mulq %rax
215 movq %rax, (%rsp)
216 addq %rdx, %r8
217 adcq \$0, %r9
218
219 movq %r8, 8(%rsp)
220 shrq \$63, %rcx
221
222 #second iteration
223 movq 8($inp), %r8
224 movq 16($inp), %rax
225 mulq %r8
226 addq %rax, %r10
227 movq 24($inp), %rax
228 movq %rdx, %rbx
229 adcq \$0, %rbx
230
231 mulq %r8
232 addq %rax, %r11
233 movq 32($inp), %rax
234 adcq \$0, %rdx
235 addq %rbx, %r11
236 movq %rdx, %rbx
237 adcq \$0, %rbx
238
239 mulq %r8
240 addq %rax, %r12
241 movq 40($inp), %rax
242 adcq \$0, %rdx
243 addq %rbx, %r12
244 movq %rdx, %rbx
245 adcq \$0, %rbx
246
247 mulq %r8
248 addq %rax, %r13
249 movq 48($inp), %rax
250 adcq \$0, %rdx
251 addq %rbx, %r13
252 movq %rdx, %rbx
253 adcq \$0, %rbx
254
255 mulq %r8
256 addq %rax, %r14
257 movq 56($inp), %rax
258 adcq \$0, %rdx
259 addq %rbx, %r14
260 movq %rdx, %rbx
261 adcq \$0, %rbx
262
263 mulq %r8
264 addq %rax, %r15
265 movq %r8, %rax
266 adcq \$0, %rdx
267 addq %rbx, %r15
268 movq %rdx, %r8
269 movq %r10, %rdx
270 adcq \$0, %r8
271
272 add %rdx, %rdx
273 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
274 movq %r11, %rbx
275 adcq %r11, %r11 #shld \$1, %r10, %r11
276
277 mulq %rax
278 addq %rax, %r9
279 adcq %rdx, %r10
280 adcq \$0, %r11
281
282 movq %r9, 16(%rsp)
283 movq %r10, 24(%rsp)
284 shrq \$63, %rbx
285
286 #third iteration
287 movq 16($inp), %r9
288 movq 24($inp), %rax
289 mulq %r9
290 addq %rax, %r12
291 movq 32($inp), %rax
292 movq %rdx, %rcx
293 adcq \$0, %rcx
294
295 mulq %r9
296 addq %rax, %r13
297 movq 40($inp), %rax
298 adcq \$0, %rdx
299 addq %rcx, %r13
300 movq %rdx, %rcx
301 adcq \$0, %rcx
302
303 mulq %r9
304 addq %rax, %r14
305 movq 48($inp), %rax
306 adcq \$0, %rdx
307 addq %rcx, %r14
308 movq %rdx, %rcx
309 adcq \$0, %rcx
310
311 mulq %r9
312 movq %r12, %r10
313 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
314 addq %rax, %r15
315 movq 56($inp), %rax
316 adcq \$0, %rdx
317 addq %rcx, %r15
318 movq %rdx, %rcx
319 adcq \$0, %rcx
320
321 mulq %r9
322 shrq \$63, %r10
323 addq %rax, %r8
324 movq %r9, %rax
325 adcq \$0, %rdx
326 addq %rcx, %r8
327 movq %rdx, %r9
328 adcq \$0, %r9
329
330 movq %r13, %rcx
331 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
332
333 mulq %rax
334 addq %rax, %r11
335 adcq %rdx, %r12
336 adcq \$0, %r13
337
338 movq %r11, 32(%rsp)
339 movq %r12, 40(%rsp)
340 shrq \$63, %rcx
341
342 #fourth iteration
343 movq 24($inp), %r10
344 movq 32($inp), %rax
345 mulq %r10
346 addq %rax, %r14
347 movq 40($inp), %rax
348 movq %rdx, %rbx
349 adcq \$0, %rbx
350
351 mulq %r10
352 addq %rax, %r15
353 movq 48($inp), %rax
354 adcq \$0, %rdx
355 addq %rbx, %r15
356 movq %rdx, %rbx
357 adcq \$0, %rbx
358
359 mulq %r10
360 movq %r14, %r12
361 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
362 addq %rax, %r8
363 movq 56($inp), %rax
364 adcq \$0, %rdx
365 addq %rbx, %r8
366 movq %rdx, %rbx
367 adcq \$0, %rbx
368
369 mulq %r10
370 shrq \$63, %r12
371 addq %rax, %r9
372 movq %r10, %rax
373 adcq \$0, %rdx
374 addq %rbx, %r9
375 movq %rdx, %r10
376 adcq \$0, %r10
377
378 movq %r15, %rbx
379 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
380
381 mulq %rax
382 addq %rax, %r13
383 adcq %rdx, %r14
384 adcq \$0, %r15
385
386 movq %r13, 48(%rsp)
387 movq %r14, 56(%rsp)
388 shrq \$63, %rbx
389
390 #fifth iteration
391 movq 32($inp), %r11
392 movq 40($inp), %rax
393 mulq %r11
394 addq %rax, %r8
395 movq 48($inp), %rax
396 movq %rdx, %rcx
397 adcq \$0, %rcx
398
399 mulq %r11
400 addq %rax, %r9
401 movq 56($inp), %rax
402 adcq \$0, %rdx
403 movq %r8, %r12
404 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
405 addq %rcx, %r9
406 movq %rdx, %rcx
407 adcq \$0, %rcx
408
409 mulq %r11
410 shrq \$63, %r12
411 addq %rax, %r10
412 movq %r11, %rax
413 adcq \$0, %rdx
414 addq %rcx, %r10
415 movq %rdx, %r11
416 adcq \$0, %r11
417
418 movq %r9, %rcx
419 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
420
421 mulq %rax
422 addq %rax, %r15
423 adcq %rdx, %r8
424 adcq \$0, %r9
425
426 movq %r15, 64(%rsp)
427 movq %r8, 72(%rsp)
428 shrq \$63, %rcx
429
430 #sixth iteration
431 movq 40($inp), %r12
432 movq 48($inp), %rax
433 mulq %r12
434 addq %rax, %r10
435 movq 56($inp), %rax
436 movq %rdx, %rbx
437 adcq \$0, %rbx
438
439 mulq %r12
440 addq %rax, %r11
441 movq %r12, %rax
442 movq %r10, %r15
443 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
444 adcq \$0, %rdx
445 shrq \$63, %r15
446 addq %rbx, %r11
447 movq %rdx, %r12
448 adcq \$0, %r12
449
450 movq %r11, %rbx
451 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
452
453 mulq %rax
454 addq %rax, %r9
455 adcq %rdx, %r10
456 adcq \$0, %r11
457
458 movq %r9, 80(%rsp)
459 movq %r10, 88(%rsp)
460
461 #seventh iteration
462 movq 48($inp), %r13
463 movq 56($inp), %rax
464 mulq %r13
465 addq %rax, %r12
466 movq %r13, %rax
467 movq %rdx, %r13
468 adcq \$0, %r13
469
470 xorq %r14, %r14
471 shlq \$1, %rbx
472 adcq %r12, %r12 #shld \$1, %rbx, %r12
473 adcq %r13, %r13 #shld \$1, %r12, %r13
474 adcq %r14, %r14 #shld \$1, %r13, %r14
475
476 mulq %rax
477 addq %rax, %r11
478 adcq %rdx, %r12
479 adcq \$0, %r13
480
481 movq %r11, 96(%rsp)
482 movq %r12, 104(%rsp)
483
484 #eighth iteration
485 movq 56($inp), %rax
486 mulq %rax
487 addq %rax, %r13
488 adcq \$0, %rdx
489
490 addq %rdx, %r14
491
492 movq %r13, 112(%rsp)
493 movq %r14, 120(%rsp)
494
495 movq (%rsp), %r8
496 movq 8(%rsp), %r9
497 movq 16(%rsp), %r10
498 movq 24(%rsp), %r11
499 movq 32(%rsp), %r12
500 movq 40(%rsp), %r13
501 movq 48(%rsp), %r14
502 movq 56(%rsp), %r15
503
504 call __rsaz_512_reduce
505
506 addq 64(%rsp), %r8
507 adcq 72(%rsp), %r9
508 adcq 80(%rsp), %r10
509 adcq 88(%rsp), %r11
510 adcq 96(%rsp), %r12
511 adcq 104(%rsp), %r13
512 adcq 112(%rsp), %r14
513 adcq 120(%rsp), %r15
514 sbbq %rcx, %rcx
515
516 call __rsaz_512_subtract
517
518 movq %r8, %rdx
519 movq %r9, %rax
520 movl 128+8(%rsp), $times
521 movq $out, $inp
522
523 decl $times
524 jnz .Loop_sqr
525 ___
526 if ($addx) {
527 $code.=<<___;
528 jmp .Lsqr_tail
529
530 .align 32
531 .Loop_sqrx:
532 movl $times,128+8(%rsp)
533 movq $out, %xmm0 # off-load
534 movq %rbp, %xmm1 # off-load
535 #first iteration
536 mulx %rax, %r8, %r9
537
538 mulx 16($inp), %rcx, %r10
539 xor %rbp, %rbp # cf=0, of=0
540
541 mulx 24($inp), %rax, %r11
542 adcx %rcx, %r9
543
544 mulx 32($inp), %rcx, %r12
545 adcx %rax, %r10
546
547 mulx 40($inp), %rax, %r13
548 adcx %rcx, %r11
549
550 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
551 adcx %rax, %r12
552 adcx %rcx, %r13
553
554 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
555 adcx %rax, %r14
556 adcx %rbp, %r15 # %rbp is 0
557
558 mov %r9, %rcx
559 shld \$1, %r8, %r9
560 shl \$1, %r8
561
562 xor %ebp, %ebp
563 mulx %rdx, %rax, %rdx
564 adcx %rdx, %r8
565 mov 8($inp), %rdx
566 adcx %rbp, %r9
567
568 mov %rax, (%rsp)
569 mov %r8, 8(%rsp)
570
571 #second iteration
572 mulx 16($inp), %rax, %rbx
573 adox %rax, %r10
574 adcx %rbx, %r11
575
576 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
577 adox $out, %r11
578 adcx %r8, %r12
579
580 mulx 32($inp), %rax, %rbx
581 adox %rax, %r12
582 adcx %rbx, %r13
583
584 mulx 40($inp), $out, %r8
585 adox $out, %r13
586 adcx %r8, %r14
587
588 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
589 adox %rax, %r14
590 adcx %rbx, %r15
591
592 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
593 adox $out, %r15
594 adcx %rbp, %r8
595 adox %rbp, %r8
596
597 mov %r11, %rbx
598 shld \$1, %r10, %r11
599 shld \$1, %rcx, %r10
600
601 xor %ebp,%ebp
602 mulx %rdx, %rax, %rcx
603 mov 16($inp), %rdx
604 adcx %rax, %r9
605 adcx %rcx, %r10
606 adcx %rbp, %r11
607
608 mov %r9, 16(%rsp)
609 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
610
611 #third iteration
612 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
613 adox $out, %r12
614 adcx %r9, %r13
615
616 mulx 32($inp), %rax, %rcx
617 adox %rax, %r13
618 adcx %rcx, %r14
619
620 mulx 40($inp), $out, %r9
621 adox $out, %r14
622 adcx %r9, %r15
623
624 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
625 adox %rax, %r15
626 adcx %rcx, %r8
627
628 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
629 adox $out, %r8
630 adcx %rbp, %r9
631 adox %rbp, %r9
632
633 mov %r13, %rcx
634 shld \$1, %r12, %r13
635 shld \$1, %rbx, %r12
636
637 xor %ebp, %ebp
638 mulx %rdx, %rax, %rdx
639 adcx %rax, %r11
640 adcx %rdx, %r12
641 mov 24($inp), %rdx
642 adcx %rbp, %r13
643
644 mov %r11, 32(%rsp)
645 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
646
647 #fourth iteration
648 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
649 adox %rax, %r14
650 adcx %rbx, %r15
651
652 mulx 40($inp), $out, %r10
653 adox $out, %r15
654 adcx %r10, %r8
655
656 mulx 48($inp), %rax, %rbx
657 adox %rax, %r8
658 adcx %rbx, %r9
659
660 mulx 56($inp), $out, %r10
661 adox $out, %r9
662 adcx %rbp, %r10
663 adox %rbp, %r10
664
665 .byte 0x66
666 mov %r15, %rbx
667 shld \$1, %r14, %r15
668 shld \$1, %rcx, %r14
669
670 xor %ebp, %ebp
671 mulx %rdx, %rax, %rdx
672 adcx %rax, %r13
673 adcx %rdx, %r14
674 mov 32($inp), %rdx
675 adcx %rbp, %r15
676
677 mov %r13, 48(%rsp)
678 mov %r14, 56(%rsp)
679
680 #fifth iteration
681 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
682 adox $out, %r8
683 adcx %r11, %r9
684
685 mulx 48($inp), %rax, %rcx
686 adox %rax, %r9
687 adcx %rcx, %r10
688
689 mulx 56($inp), $out, %r11
690 adox $out, %r10
691 adcx %rbp, %r11
692 adox %rbp, %r11
693
694 mov %r9, %rcx
695 shld \$1, %r8, %r9
696 shld \$1, %rbx, %r8
697
698 xor %ebp, %ebp
699 mulx %rdx, %rax, %rdx
700 adcx %rax, %r15
701 adcx %rdx, %r8
702 mov 40($inp), %rdx
703 adcx %rbp, %r9
704
705 mov %r15, 64(%rsp)
706 mov %r8, 72(%rsp)
707
708 #sixth iteration
709 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
710 adox %rax, %r10
711 adcx %rbx, %r11
712
713 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
714 adox $out, %r11
715 adcx %rbp, %r12
716 adox %rbp, %r12
717
718 mov %r11, %rbx
719 shld \$1, %r10, %r11
720 shld \$1, %rcx, %r10
721
722 xor %ebp, %ebp
723 mulx %rdx, %rax, %rdx
724 adcx %rax, %r9
725 adcx %rdx, %r10
726 mov 48($inp), %rdx
727 adcx %rbp, %r11
728
729 mov %r9, 80(%rsp)
730 mov %r10, 88(%rsp)
731
732 #seventh iteration
733 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
734 adox %rax, %r12
735 adox %rbp, %r13
736
737 xor %r14, %r14
738 shld \$1, %r13, %r14
739 shld \$1, %r12, %r13
740 shld \$1, %rbx, %r12
741
742 xor %ebp, %ebp
743 mulx %rdx, %rax, %rdx
744 adcx %rax, %r11
745 adcx %rdx, %r12
746 mov 56($inp), %rdx
747 adcx %rbp, %r13
748
749 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
750 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
751
752 #eighth iteration
753 mulx %rdx, %rax, %rdx
754 adox %rax, %r13
755 adox %rbp, %rdx
756
757 .byte 0x66
758 add %rdx, %r14
759
760 movq %r13, 112(%rsp)
761 movq %r14, 120(%rsp)
762 movq %xmm0, $out
763 movq %xmm1, %rbp
764
765 movq 128(%rsp), %rdx # pull $n0
766 movq (%rsp), %r8
767 movq 8(%rsp), %r9
768 movq 16(%rsp), %r10
769 movq 24(%rsp), %r11
770 movq 32(%rsp), %r12
771 movq 40(%rsp), %r13
772 movq 48(%rsp), %r14
773 movq 56(%rsp), %r15
774
775 call __rsaz_512_reducex
776
777 addq 64(%rsp), %r8
778 adcq 72(%rsp), %r9
779 adcq 80(%rsp), %r10
780 adcq 88(%rsp), %r11
781 adcq 96(%rsp), %r12
782 adcq 104(%rsp), %r13
783 adcq 112(%rsp), %r14
784 adcq 120(%rsp), %r15
785 sbbq %rcx, %rcx
786
787 call __rsaz_512_subtract
788
789 movq %r8, %rdx
790 movq %r9, %rax
791 movl 128+8(%rsp), $times
792 movq $out, $inp
793
794 decl $times
795 jnz .Loop_sqrx
796
797 .Lsqr_tail:
798 ___
799 }
800 $code.=<<___;
801
802 leaq 128+24+48(%rsp), %rax
803 movq -48(%rax), %r15
804 movq -40(%rax), %r14
805 movq -32(%rax), %r13
806 movq -24(%rax), %r12
807 movq -16(%rax), %rbp
808 movq -8(%rax), %rbx
809 leaq (%rax), %rsp
810 .Lsqr_epilogue:
811 ret
812 .size rsaz_512_sqr,.-rsaz_512_sqr
813 ___
814 }
815 {
816 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
817 $code.=<<___;
818 .globl rsaz_512_mul
819 .type rsaz_512_mul,\@function,5
820 .align 32
821 rsaz_512_mul:
822 push %rbx
823 push %rbp
824 push %r12
825 push %r13
826 push %r14
827 push %r15
828
829 subq \$128+24, %rsp
830 .Lmul_body:
831 movq $out, %xmm0 # off-load arguments
832 movq $mod, %xmm1
833 movq $n0, 128(%rsp)
834 ___
835 $code.=<<___ if ($addx);
836 movl \$0x80100,%r11d
837 andl OPENSSL_ia32cap_P+8(%rip),%r11d
838 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
839 je .Lmulx
840 ___
841 $code.=<<___;
842 movq ($bp), %rbx # pass b[0]
843 movq $bp, %rbp # pass argument
844 call __rsaz_512_mul
845
846 movq %xmm0, $out
847 movq %xmm1, %rbp
848
849 movq (%rsp), %r8
850 movq 8(%rsp), %r9
851 movq 16(%rsp), %r10
852 movq 24(%rsp), %r11
853 movq 32(%rsp), %r12
854 movq 40(%rsp), %r13
855 movq 48(%rsp), %r14
856 movq 56(%rsp), %r15
857
858 call __rsaz_512_reduce
859 ___
860 $code.=<<___ if ($addx);
861 jmp .Lmul_tail
862
863 .align 32
864 .Lmulx:
865 movq $bp, %rbp # pass argument
866 movq ($bp), %rdx # pass b[0]
867 call __rsaz_512_mulx
868
869 movq %xmm0, $out
870 movq %xmm1, %rbp
871
872 movq 128(%rsp), %rdx # pull $n0
873 movq (%rsp), %r8
874 movq 8(%rsp), %r9
875 movq 16(%rsp), %r10
876 movq 24(%rsp), %r11
877 movq 32(%rsp), %r12
878 movq 40(%rsp), %r13
879 movq 48(%rsp), %r14
880 movq 56(%rsp), %r15
881
882 call __rsaz_512_reducex
883 .Lmul_tail:
884 ___
885 $code.=<<___;
886 addq 64(%rsp), %r8
887 adcq 72(%rsp), %r9
888 adcq 80(%rsp), %r10
889 adcq 88(%rsp), %r11
890 adcq 96(%rsp), %r12
891 adcq 104(%rsp), %r13
892 adcq 112(%rsp), %r14
893 adcq 120(%rsp), %r15
894 sbbq %rcx, %rcx
895
896 call __rsaz_512_subtract
897
898 leaq 128+24+48(%rsp), %rax
899 movq -48(%rax), %r15
900 movq -40(%rax), %r14
901 movq -32(%rax), %r13
902 movq -24(%rax), %r12
903 movq -16(%rax), %rbp
904 movq -8(%rax), %rbx
905 leaq (%rax), %rsp
906 .Lmul_epilogue:
907 ret
908 .size rsaz_512_mul,.-rsaz_512_mul
909 ___
910 }
911 {
912 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
913 $code.=<<___;
914 .globl rsaz_512_mul_gather4
915 .type rsaz_512_mul_gather4,\@function,6
916 .align 32
917 rsaz_512_mul_gather4:
918 push %rbx
919 push %rbp
920 push %r12
921 push %r13
922 push %r14
923 push %r15
924
925 subq \$`128+24+($win64?0xb0:0)`, %rsp
926 ___
927 $code.=<<___ if ($win64);
928 movaps %xmm6,0xa0(%rsp)
929 movaps %xmm7,0xb0(%rsp)
930 movaps %xmm8,0xc0(%rsp)
931 movaps %xmm9,0xd0(%rsp)
932 movaps %xmm10,0xe0(%rsp)
933 movaps %xmm11,0xf0(%rsp)
934 movaps %xmm12,0x100(%rsp)
935 movaps %xmm13,0x110(%rsp)
936 movaps %xmm14,0x120(%rsp)
937 movaps %xmm15,0x130(%rsp)
938 ___
939 $code.=<<___;
940 .Lmul_gather4_body:
941 movd $pwr,%xmm8
942 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
943 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
944
945 pshufd \$0,%xmm8,%xmm8 # broadcast $pwr
946 movdqa %xmm1,%xmm7
947 movdqa %xmm1,%xmm2
948 ___
949 ########################################################################
950 # calculate mask by comparing 0..15 to $power
951 #
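# Each of %xmm0..%xmm7 ends up holding a pair of 32-bit-replicated indices
# {2*i, 2*i+1}; pcmpeqd against the broadcast power leaves all-ones in
# exactly one 64-bit half of one register and zeroes everywhere else.  The
# gather loops below then AND every 16-byte column of a table row with its
# mask and OR the results together, so all 16 table entries are touched on
# every iteration and the memory access pattern does not depend on the
# (secret) power index.
#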
952 for($i=0;$i<4;$i++) {
953 $code.=<<___;
954 paddd %xmm`$i`,%xmm`$i+1`
955 pcmpeqd %xmm8,%xmm`$i`
956 movdqa %xmm7,%xmm`$i+3`
957 ___
958 }
959 for(;$i<7;$i++) {
960 $code.=<<___;
961 paddd %xmm`$i`,%xmm`$i+1`
962 pcmpeqd %xmm8,%xmm`$i`
963 ___
964 }
965 $code.=<<___;
966 pcmpeqd %xmm8,%xmm7
967
968 movdqa 16*0($bp),%xmm8
969 movdqa 16*1($bp),%xmm9
970 movdqa 16*2($bp),%xmm10
971 movdqa 16*3($bp),%xmm11
972 pand %xmm0,%xmm8
973 movdqa 16*4($bp),%xmm12
974 pand %xmm1,%xmm9
975 movdqa 16*5($bp),%xmm13
976 pand %xmm2,%xmm10
977 movdqa 16*6($bp),%xmm14
978 pand %xmm3,%xmm11
979 movdqa 16*7($bp),%xmm15
980 leaq 128($bp), %rbp
981 pand %xmm4,%xmm12
982 pand %xmm5,%xmm13
983 pand %xmm6,%xmm14
984 pand %xmm7,%xmm15
985 por %xmm10,%xmm8
986 por %xmm11,%xmm9
987 por %xmm12,%xmm8
988 por %xmm13,%xmm9
989 por %xmm14,%xmm8
990 por %xmm15,%xmm9
991
992 por %xmm9,%xmm8
993 pshufd \$0x4e,%xmm8,%xmm9
994 por %xmm9,%xmm8
995 ___
996 $code.=<<___ if ($addx);
997 movl \$0x80100,%r11d
998 andl OPENSSL_ia32cap_P+8(%rip),%r11d
999 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1000 je .Lmulx_gather
1001 ___
1002 $code.=<<___;
1003 movq %xmm8,%rbx
1004
1005 movq $n0, 128(%rsp) # off-load arguments
1006 movq $out, 128+8(%rsp)
1007 movq $mod, 128+16(%rsp)
1008
1009 movq ($ap), %rax
1010 movq 8($ap), %rcx
1011 mulq %rbx # 0 iteration
1012 movq %rax, (%rsp)
1013 movq %rcx, %rax
1014 movq %rdx, %r8
1015
1016 mulq %rbx
1017 addq %rax, %r8
1018 movq 16($ap), %rax
1019 movq %rdx, %r9
1020 adcq \$0, %r9
1021
1022 mulq %rbx
1023 addq %rax, %r9
1024 movq 24($ap), %rax
1025 movq %rdx, %r10
1026 adcq \$0, %r10
1027
1028 mulq %rbx
1029 addq %rax, %r10
1030 movq 32($ap), %rax
1031 movq %rdx, %r11
1032 adcq \$0, %r11
1033
1034 mulq %rbx
1035 addq %rax, %r11
1036 movq 40($ap), %rax
1037 movq %rdx, %r12
1038 adcq \$0, %r12
1039
1040 mulq %rbx
1041 addq %rax, %r12
1042 movq 48($ap), %rax
1043 movq %rdx, %r13
1044 adcq \$0, %r13
1045
1046 mulq %rbx
1047 addq %rax, %r13
1048 movq 56($ap), %rax
1049 movq %rdx, %r14
1050 adcq \$0, %r14
1051
1052 mulq %rbx
1053 addq %rax, %r14
1054 movq ($ap), %rax
1055 movq %rdx, %r15
1056 adcq \$0, %r15
1057
1058 leaq 8(%rsp), %rdi
1059 movl \$7, %ecx
1060 jmp .Loop_mul_gather
1061
1062 .align 32
1063 .Loop_mul_gather:
1064 movdqa 16*0(%rbp),%xmm8
1065 movdqa 16*1(%rbp),%xmm9
1066 movdqa 16*2(%rbp),%xmm10
1067 movdqa 16*3(%rbp),%xmm11
1068 pand %xmm0,%xmm8
1069 movdqa 16*4(%rbp),%xmm12
1070 pand %xmm1,%xmm9
1071 movdqa 16*5(%rbp),%xmm13
1072 pand %xmm2,%xmm10
1073 movdqa 16*6(%rbp),%xmm14
1074 pand %xmm3,%xmm11
1075 movdqa 16*7(%rbp),%xmm15
1076 leaq 128(%rbp), %rbp
1077 pand %xmm4,%xmm12
1078 pand %xmm5,%xmm13
1079 pand %xmm6,%xmm14
1080 pand %xmm7,%xmm15
1081 por %xmm10,%xmm8
1082 por %xmm11,%xmm9
1083 por %xmm12,%xmm8
1084 por %xmm13,%xmm9
1085 por %xmm14,%xmm8
1086 por %xmm15,%xmm9
1087
1088 por %xmm9,%xmm8
1089 pshufd \$0x4e,%xmm8,%xmm9
1090 por %xmm9,%xmm8
1091 movq %xmm8,%rbx
1092
1093 mulq %rbx
1094 addq %rax, %r8
1095 movq 8($ap), %rax
1096 movq %r8, (%rdi)
1097 movq %rdx, %r8
1098 adcq \$0, %r8
1099
1100 mulq %rbx
1101 addq %rax, %r9
1102 movq 16($ap), %rax
1103 adcq \$0, %rdx
1104 addq %r9, %r8
1105 movq %rdx, %r9
1106 adcq \$0, %r9
1107
1108 mulq %rbx
1109 addq %rax, %r10
1110 movq 24($ap), %rax
1111 adcq \$0, %rdx
1112 addq %r10, %r9
1113 movq %rdx, %r10
1114 adcq \$0, %r10
1115
1116 mulq %rbx
1117 addq %rax, %r11
1118 movq 32($ap), %rax
1119 adcq \$0, %rdx
1120 addq %r11, %r10
1121 movq %rdx, %r11
1122 adcq \$0, %r11
1123
1124 mulq %rbx
1125 addq %rax, %r12
1126 movq 40($ap), %rax
1127 adcq \$0, %rdx
1128 addq %r12, %r11
1129 movq %rdx, %r12
1130 adcq \$0, %r12
1131
1132 mulq %rbx
1133 addq %rax, %r13
1134 movq 48($ap), %rax
1135 adcq \$0, %rdx
1136 addq %r13, %r12
1137 movq %rdx, %r13
1138 adcq \$0, %r13
1139
1140 mulq %rbx
1141 addq %rax, %r14
1142 movq 56($ap), %rax
1143 adcq \$0, %rdx
1144 addq %r14, %r13
1145 movq %rdx, %r14
1146 adcq \$0, %r14
1147
1148 mulq %rbx
1149 addq %rax, %r15
1150 movq ($ap), %rax
1151 adcq \$0, %rdx
1152 addq %r15, %r14
1153 movq %rdx, %r15
1154 adcq \$0, %r15
1155
1156 leaq 8(%rdi), %rdi
1157
1158 decl %ecx
1159 jnz .Loop_mul_gather
1160
1161 movq %r8, (%rdi)
1162 movq %r9, 8(%rdi)
1163 movq %r10, 16(%rdi)
1164 movq %r11, 24(%rdi)
1165 movq %r12, 32(%rdi)
1166 movq %r13, 40(%rdi)
1167 movq %r14, 48(%rdi)
1168 movq %r15, 56(%rdi)
1169
1170 movq 128+8(%rsp), $out
1171 movq 128+16(%rsp), %rbp
1172
1173 movq (%rsp), %r8
1174 movq 8(%rsp), %r9
1175 movq 16(%rsp), %r10
1176 movq 24(%rsp), %r11
1177 movq 32(%rsp), %r12
1178 movq 40(%rsp), %r13
1179 movq 48(%rsp), %r14
1180 movq 56(%rsp), %r15
1181
1182 call __rsaz_512_reduce
1183 ___
1184 $code.=<<___ if ($addx);
1185 jmp .Lmul_gather_tail
1186
1187 .align 32
1188 .Lmulx_gather:
1189 movq %xmm8,%rdx
1190
1191 mov $n0, 128(%rsp) # off-load arguments
1192 mov $out, 128+8(%rsp)
1193 mov $mod, 128+16(%rsp)
1194
1195 mulx ($ap), %rbx, %r8 # 0 iteration
1196 mov %rbx, (%rsp)
1197 xor %edi, %edi # cf=0, of=0
1198
1199 mulx 8($ap), %rax, %r9
1200
1201 mulx 16($ap), %rbx, %r10
1202 adcx %rax, %r8
1203
1204 mulx 24($ap), %rax, %r11
1205 adcx %rbx, %r9
1206
1207 mulx 32($ap), %rbx, %r12
1208 adcx %rax, %r10
1209
1210 mulx 40($ap), %rax, %r13
1211 adcx %rbx, %r11
1212
1213 mulx 48($ap), %rbx, %r14
1214 adcx %rax, %r12
1215
1216 mulx 56($ap), %rax, %r15
1217 adcx %rbx, %r13
1218 adcx %rax, %r14
1219 .byte 0x67
1220 mov %r8, %rbx
1221 adcx %rdi, %r15 # %rdi is 0
1222
1223 mov \$-7, %rcx
1224 jmp .Loop_mulx_gather
1225
1226 .align 32
1227 .Loop_mulx_gather:
1228 movdqa 16*0(%rbp),%xmm8
1229 movdqa 16*1(%rbp),%xmm9
1230 movdqa 16*2(%rbp),%xmm10
1231 movdqa 16*3(%rbp),%xmm11
1232 pand %xmm0,%xmm8
1233 movdqa 16*4(%rbp),%xmm12
1234 pand %xmm1,%xmm9
1235 movdqa 16*5(%rbp),%xmm13
1236 pand %xmm2,%xmm10
1237 movdqa 16*6(%rbp),%xmm14
1238 pand %xmm3,%xmm11
1239 movdqa 16*7(%rbp),%xmm15
1240 leaq 128(%rbp), %rbp
1241 pand %xmm4,%xmm12
1242 pand %xmm5,%xmm13
1243 pand %xmm6,%xmm14
1244 pand %xmm7,%xmm15
1245 por %xmm10,%xmm8
1246 por %xmm11,%xmm9
1247 por %xmm12,%xmm8
1248 por %xmm13,%xmm9
1249 por %xmm14,%xmm8
1250 por %xmm15,%xmm9
1251
1252 por %xmm9,%xmm8
1253 pshufd \$0x4e,%xmm8,%xmm9
1254 por %xmm9,%xmm8
1255 movq %xmm8,%rdx
1256
1257 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1258 adcx %rax, %rbx
1259 adox %r9, %r8
1260
1261 mulx 8($ap), %rax, %r9
1262 adcx %rax, %r8
1263 adox %r10, %r9
1264
1265 mulx 16($ap), %rax, %r10
1266 adcx %rax, %r9
1267 adox %r11, %r10
1268
1269 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1270 adcx %rax, %r10
1271 adox %r12, %r11
1272
1273 mulx 32($ap), %rax, %r12
1274 adcx %rax, %r11
1275 adox %r13, %r12
1276
1277 mulx 40($ap), %rax, %r13
1278 adcx %rax, %r12
1279 adox %r14, %r13
1280
1281 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1282 adcx %rax, %r13
1283 .byte 0x67
1284 adox %r15, %r14
1285
1286 mulx 56($ap), %rax, %r15
1287 mov %rbx, 64(%rsp,%rcx,8)
1288 adcx %rax, %r14
1289 adox %rdi, %r15
1290 mov %r8, %rbx
1291 adcx %rdi, %r15 # cf=0
1292
1293 inc %rcx # of=0
1294 jnz .Loop_mulx_gather
1295
1296 mov %r8, 64(%rsp)
1297 mov %r9, 64+8(%rsp)
1298 mov %r10, 64+16(%rsp)
1299 mov %r11, 64+24(%rsp)
1300 mov %r12, 64+32(%rsp)
1301 mov %r13, 64+40(%rsp)
1302 mov %r14, 64+48(%rsp)
1303 mov %r15, 64+56(%rsp)
1304
1305 mov 128(%rsp), %rdx # pull arguments
1306 mov 128+8(%rsp), $out
1307 mov 128+16(%rsp), %rbp
1308
1309 mov (%rsp), %r8
1310 mov 8(%rsp), %r9
1311 mov 16(%rsp), %r10
1312 mov 24(%rsp), %r11
1313 mov 32(%rsp), %r12
1314 mov 40(%rsp), %r13
1315 mov 48(%rsp), %r14
1316 mov 56(%rsp), %r15
1317
1318 call __rsaz_512_reducex
1319
1320 .Lmul_gather_tail:
1321 ___
1322 $code.=<<___;
1323 addq 64(%rsp), %r8
1324 adcq 72(%rsp), %r9
1325 adcq 80(%rsp), %r10
1326 adcq 88(%rsp), %r11
1327 adcq 96(%rsp), %r12
1328 adcq 104(%rsp), %r13
1329 adcq 112(%rsp), %r14
1330 adcq 120(%rsp), %r15
1331 sbbq %rcx, %rcx
1332
1333 call __rsaz_512_subtract
1334
1335 leaq 128+24+48(%rsp), %rax
1336 ___
1337 $code.=<<___ if ($win64);
1338 movaps 0xa0-0xc8(%rax),%xmm6
1339 movaps 0xb0-0xc8(%rax),%xmm7
1340 movaps 0xc0-0xc8(%rax),%xmm8
1341 movaps 0xd0-0xc8(%rax),%xmm9
1342 movaps 0xe0-0xc8(%rax),%xmm10
1343 movaps 0xf0-0xc8(%rax),%xmm11
1344 movaps 0x100-0xc8(%rax),%xmm12
1345 movaps 0x110-0xc8(%rax),%xmm13
1346 movaps 0x120-0xc8(%rax),%xmm14
1347 movaps 0x130-0xc8(%rax),%xmm15
1348 lea 0xb0(%rax),%rax
1349 ___
1350 $code.=<<___;
1351 movq -48(%rax), %r15
1352 movq -40(%rax), %r14
1353 movq -32(%rax), %r13
1354 movq -24(%rax), %r12
1355 movq -16(%rax), %rbp
1356 movq -8(%rax), %rbx
1357 leaq (%rax), %rsp
1358 .Lmul_gather4_epilogue:
1359 ret
1360 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1361 ___
1362 }
1363 {
1364 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1365 $code.=<<___;
1366 .globl rsaz_512_mul_scatter4
1367 .type rsaz_512_mul_scatter4,\@function,6
1368 .align 32
1369 rsaz_512_mul_scatter4:
1370 push %rbx
1371 push %rbp
1372 push %r12
1373 push %r13
1374 push %r14
1375 push %r15
1376
1377 mov $pwr, $pwr
1378 subq \$128+24, %rsp
1379 .Lmul_scatter4_body:
1380 leaq ($tbl,$pwr,8), $tbl
1381 movq $out, %xmm0 # off-load arguments
1382 movq $mod, %xmm1
1383 movq $tbl, %xmm2
1384 movq $n0, 128(%rsp)
1385
1386 movq $out, %rbp
1387 ___
1388 $code.=<<___ if ($addx);
1389 movl \$0x80100,%r11d
1390 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1391 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1392 je .Lmulx_scatter
1393 ___
1394 $code.=<<___;
1395 movq ($out),%rbx # pass b[0]
1396 call __rsaz_512_mul
1397
1398 movq %xmm0, $out
1399 movq %xmm1, %rbp
1400
1401 movq (%rsp), %r8
1402 movq 8(%rsp), %r9
1403 movq 16(%rsp), %r10
1404 movq 24(%rsp), %r11
1405 movq 32(%rsp), %r12
1406 movq 40(%rsp), %r13
1407 movq 48(%rsp), %r14
1408 movq 56(%rsp), %r15
1409
1410 call __rsaz_512_reduce
1411 ___
1412 $code.=<<___ if ($addx);
1413 jmp .Lmul_scatter_tail
1414
1415 .align 32
1416 .Lmulx_scatter:
1417 movq ($out), %rdx # pass b[0]
1418 call __rsaz_512_mulx
1419
1420 movq %xmm0, $out
1421 movq %xmm1, %rbp
1422
1423 movq 128(%rsp), %rdx # pull $n0
1424 movq (%rsp), %r8
1425 movq 8(%rsp), %r9
1426 movq 16(%rsp), %r10
1427 movq 24(%rsp), %r11
1428 movq 32(%rsp), %r12
1429 movq 40(%rsp), %r13
1430 movq 48(%rsp), %r14
1431 movq 56(%rsp), %r15
1432
1433 call __rsaz_512_reducex
1434
1435 .Lmul_scatter_tail:
1436 ___
1437 $code.=<<___;
1438 addq 64(%rsp), %r8
1439 adcq 72(%rsp), %r9
1440 adcq 80(%rsp), %r10
1441 adcq 88(%rsp), %r11
1442 adcq 96(%rsp), %r12
1443 adcq 104(%rsp), %r13
1444 adcq 112(%rsp), %r14
1445 adcq 120(%rsp), %r15
1446 movq %xmm2, $inp
1447 sbbq %rcx, %rcx
1448
1449 call __rsaz_512_subtract
1450
1451 movq %r8, 128*0($inp) # scatter
1452 movq %r9, 128*1($inp)
1453 movq %r10, 128*2($inp)
1454 movq %r11, 128*3($inp)
1455 movq %r12, 128*4($inp)
1456 movq %r13, 128*5($inp)
1457 movq %r14, 128*6($inp)
1458 movq %r15, 128*7($inp)
1459
1460 leaq 128+24+48(%rsp), %rax
1461 movq -48(%rax), %r15
1462 movq -40(%rax), %r14
1463 movq -32(%rax), %r13
1464 movq -24(%rax), %r12
1465 movq -16(%rax), %rbp
1466 movq -8(%rax), %rbx
1467 leaq (%rax), %rsp
1468 .Lmul_scatter4_epilogue:
1469 ret
1470 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1471 ___
1472 }
1473 {
1474 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
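# rsaz_512_mul_by_one amounts to a bare Montgomery reduction of the input
# (i.e. multiplication by 1 in the Montgomery domain), which the C glue
# uses to convert a result back out of Montgomery representation.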
1475 $code.=<<___;
1476 .globl rsaz_512_mul_by_one
1477 .type rsaz_512_mul_by_one,\@function,4
1478 .align 32
1479 rsaz_512_mul_by_one:
1480 push %rbx
1481 push %rbp
1482 push %r12
1483 push %r13
1484 push %r14
1485 push %r15
1486
1487 subq \$128+24, %rsp
1488 .Lmul_by_one_body:
1489 ___
1490 $code.=<<___ if ($addx);
1491 movl OPENSSL_ia32cap_P+8(%rip),%eax
1492 ___
1493 $code.=<<___;
1494 movq $mod, %rbp # reassign argument
1495 movq $n0, 128(%rsp)
1496
1497 movq ($inp), %r8
1498 pxor %xmm0, %xmm0
1499 movq 8($inp), %r9
1500 movq 16($inp), %r10
1501 movq 24($inp), %r11
1502 movq 32($inp), %r12
1503 movq 40($inp), %r13
1504 movq 48($inp), %r14
1505 movq 56($inp), %r15
1506
1507 movdqa %xmm0, (%rsp)
1508 movdqa %xmm0, 16(%rsp)
1509 movdqa %xmm0, 32(%rsp)
1510 movdqa %xmm0, 48(%rsp)
1511 movdqa %xmm0, 64(%rsp)
1512 movdqa %xmm0, 80(%rsp)
1513 movdqa %xmm0, 96(%rsp)
1514 ___
1515 $code.=<<___ if ($addx);
1516 andl \$0x80100,%eax
1517 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1518 je .Lby_one_callx
1519 ___
1520 $code.=<<___;
1521 call __rsaz_512_reduce
1522 ___
1523 $code.=<<___ if ($addx);
1524 jmp .Lby_one_tail
1525 .align 32
1526 .Lby_one_callx:
1527 movq 128(%rsp), %rdx # pull $n0
1528 call __rsaz_512_reducex
1529 .Lby_one_tail:
1530 ___
1531 $code.=<<___;
1532 movq %r8, ($out)
1533 movq %r9, 8($out)
1534 movq %r10, 16($out)
1535 movq %r11, 24($out)
1536 movq %r12, 32($out)
1537 movq %r13, 40($out)
1538 movq %r14, 48($out)
1539 movq %r15, 56($out)
1540
1541 leaq 128+24+48(%rsp), %rax
1542 movq -48(%rax), %r15
1543 movq -40(%rax), %r14
1544 movq -32(%rax), %r13
1545 movq -24(%rax), %r12
1546 movq -16(%rax), %rbp
1547 movq -8(%rax), %rbx
1548 leaq (%rax), %rsp
1549 .Lmul_by_one_epilogue:
1550 ret
1551 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1552 ___
1553 }
1554 { # __rsaz_512_reduce
1555 #
1556 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1557 # output: %r8-%r15
1558 # clobbers: everything except %rbp and %rdi
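#
# This is word-by-word Montgomery reduction: each of the eight passes
# computes m = t[0]*n0 mod 2^64, adds m*mod to the accumulator (which
# clears the lowest limb by construction of n0) and shifts the whole
# thing down by one limb.  As an illustrative reference model only:
#
#	for (i = 0; i < 8; i++) {
#		m = (t[0] * n0) mod 2^64;
#		t = (t + m*mod) >> 64;
#	}
#
# The negq on %r8 below is a shortcut for adding the low half of m*mod[0],
# which is known to be -t[0] mod 2^64, so only its carry-out matters.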
1559 $code.=<<___;
1560 .type __rsaz_512_reduce,\@abi-omnipotent
1561 .align 32
1562 __rsaz_512_reduce:
1563 movq %r8, %rbx
1564 imulq 128+8(%rsp), %rbx
1565 movq 0(%rbp), %rax
1566 movl \$8, %ecx
1567 jmp .Lreduction_loop
1568
1569 .align 32
1570 .Lreduction_loop:
1571 mulq %rbx
1572 movq 8(%rbp), %rax
1573 negq %r8
1574 movq %rdx, %r8
1575 adcq \$0, %r8
1576
1577 mulq %rbx
1578 addq %rax, %r9
1579 movq 16(%rbp), %rax
1580 adcq \$0, %rdx
1581 addq %r9, %r8
1582 movq %rdx, %r9
1583 adcq \$0, %r9
1584
1585 mulq %rbx
1586 addq %rax, %r10
1587 movq 24(%rbp), %rax
1588 adcq \$0, %rdx
1589 addq %r10, %r9
1590 movq %rdx, %r10
1591 adcq \$0, %r10
1592
1593 mulq %rbx
1594 addq %rax, %r11
1595 movq 32(%rbp), %rax
1596 adcq \$0, %rdx
1597 addq %r11, %r10
1598 movq 128+8(%rsp), %rsi
1599 #movq %rdx, %r11
1600 #adcq \$0, %r11
1601 adcq \$0, %rdx
1602 movq %rdx, %r11
1603
1604 mulq %rbx
1605 addq %rax, %r12
1606 movq 40(%rbp), %rax
1607 adcq \$0, %rdx
1608 imulq %r8, %rsi
1609 addq %r12, %r11
1610 movq %rdx, %r12
1611 adcq \$0, %r12
1612
1613 mulq %rbx
1614 addq %rax, %r13
1615 movq 48(%rbp), %rax
1616 adcq \$0, %rdx
1617 addq %r13, %r12
1618 movq %rdx, %r13
1619 adcq \$0, %r13
1620
1621 mulq %rbx
1622 addq %rax, %r14
1623 movq 56(%rbp), %rax
1624 adcq \$0, %rdx
1625 addq %r14, %r13
1626 movq %rdx, %r14
1627 adcq \$0, %r14
1628
1629 mulq %rbx
1630 movq %rsi, %rbx
1631 addq %rax, %r15
1632 movq 0(%rbp), %rax
1633 adcq \$0, %rdx
1634 addq %r15, %r14
1635 movq %rdx, %r15
1636 adcq \$0, %r15
1637
1638 decl %ecx
1639 jne .Lreduction_loop
1640
1641 ret
1642 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1643 ___
1644 }
1645 if ($addx) {
1646 # __rsaz_512_reducex
1647 #
1648 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1649 # output: %r8-%r15
1650 # clobbers: everything except %rbp and %rdi
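#
# The "x" flavour relies on the ADX/BMI2 extensions: MULX leaves the flags
# untouched, ADCX uses and sets only CF, and ADOX only OF, so the partial
# products and the running sum can be carried in two independent chains
# that are interleaved without saving or restoring flags.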
1651 $code.=<<___;
1652 .type __rsaz_512_reducex,\@abi-omnipotent
1653 .align 32
1654 __rsaz_512_reducex:
1655 #movq 128+8(%rsp), %rdx # pull $n0
1656 imulq %r8, %rdx
1657 xorq %rsi, %rsi # cf=0,of=0
1658 movl \$8, %ecx
1659 jmp .Lreduction_loopx
1660
1661 .align 32
1662 .Lreduction_loopx:
1663 mov %r8, %rbx
1664 mulx 0(%rbp), %rax, %r8
1665 adcx %rbx, %rax
1666 adox %r9, %r8
1667
1668 mulx 8(%rbp), %rax, %r9
1669 adcx %rax, %r8
1670 adox %r10, %r9
1671
1672 mulx 16(%rbp), %rbx, %r10
1673 adcx %rbx, %r9
1674 adox %r11, %r10
1675
1676 mulx 24(%rbp), %rbx, %r11
1677 adcx %rbx, %r10
1678 adox %r12, %r11
1679
1680 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1681 mov %rdx, %rax
1682 mov %r8, %rdx
1683 adcx %rbx, %r11
1684 adox %r13, %r12
1685
1686 mulx 128+8(%rsp), %rbx, %rdx
1687 mov %rax, %rdx
1688
1689 mulx 40(%rbp), %rax, %r13
1690 adcx %rax, %r12
1691 adox %r14, %r13
1692
1693 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1694 adcx %rax, %r13
1695 adox %r15, %r14
1696
1697 mulx 56(%rbp), %rax, %r15
1698 mov %rbx, %rdx
1699 adcx %rax, %r14
1700 adox %rsi, %r15 # %rsi is 0
1701 adcx %rsi, %r15 # cf=0
1702
1703 decl %ecx # of=0
1704 jne .Lreduction_loopx
1705
1706 ret
1707 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1708 ___
1709 }
1710 { # __rsaz_512_subtract
1711 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1712 # output:
1713 # clobbers: everything but %rdi, %rsi and %rbp
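#
# The mask in %rcx is either 0 or all-ones (set via sbbq by the callers).
# The negq/notq/andq sequence below builds (2^512 - mod) & mask - the
# two's-complement shortcut works because the modulus is odd, so its
# lowest limb is non-zero - and adding it to the result subtracts the
# modulus exactly when the mask is set, without a data-dependent branch.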
1714 $code.=<<___;
1715 .type __rsaz_512_subtract,\@abi-omnipotent
1716 .align 32
1717 __rsaz_512_subtract:
1718 movq %r8, ($out)
1719 movq %r9, 8($out)
1720 movq %r10, 16($out)
1721 movq %r11, 24($out)
1722 movq %r12, 32($out)
1723 movq %r13, 40($out)
1724 movq %r14, 48($out)
1725 movq %r15, 56($out)
1726
1727 movq 0($mod), %r8
1728 movq 8($mod), %r9
1729 negq %r8
1730 notq %r9
1731 andq %rcx, %r8
1732 movq 16($mod), %r10
1733 andq %rcx, %r9
1734 notq %r10
1735 movq 24($mod), %r11
1736 andq %rcx, %r10
1737 notq %r11
1738 movq 32($mod), %r12
1739 andq %rcx, %r11
1740 notq %r12
1741 movq 40($mod), %r13
1742 andq %rcx, %r12
1743 notq %r13
1744 movq 48($mod), %r14
1745 andq %rcx, %r13
1746 notq %r14
1747 movq 56($mod), %r15
1748 andq %rcx, %r14
1749 notq %r15
1750 andq %rcx, %r15
1751
1752 addq ($out), %r8
1753 adcq 8($out), %r9
1754 adcq 16($out), %r10
1755 adcq 24($out), %r11
1756 adcq 32($out), %r12
1757 adcq 40($out), %r13
1758 adcq 48($out), %r14
1759 adcq 56($out), %r15
1760
1761 movq %r8, ($out)
1762 movq %r9, 8($out)
1763 movq %r10, 16($out)
1764 movq %r11, 24($out)
1765 movq %r12, 32($out)
1766 movq %r13, 40($out)
1767 movq %r14, 48($out)
1768 movq %r15, 56($out)
1769
1770 ret
1771 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1772 ___
1773 }
1774 { # __rsaz_512_mul
1775 #
1776 # input: %rsi - ap, %rbp - bp
1777 # output:
1778 # clobbers: everything
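#
# Plain schoolbook multiplication: the first pass multiplies all eight
# limbs of ap by b[0]; each subsequent .Loop_mul pass multiplies by the
# next limb of bp, adds into the running window and stores the limb that
# has become final, producing a 16-limb result on the stack.  As an
# illustrative reference model only:
#
#	for (i = 0; i < 8; i++)
#		for (j = 0; j < 8; j++)
#			acc[i+j] += a[j]*b[i];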
1779 my ($ap,$bp) = ("%rsi","%rbp");
1780 $code.=<<___;
1781 .type __rsaz_512_mul,\@abi-omnipotent
1782 .align 32
1783 __rsaz_512_mul:
1784 leaq 8(%rsp), %rdi
1785
1786 movq ($ap), %rax
1787 mulq %rbx
1788 movq %rax, (%rdi)
1789 movq 8($ap), %rax
1790 movq %rdx, %r8
1791
1792 mulq %rbx
1793 addq %rax, %r8
1794 movq 16($ap), %rax
1795 movq %rdx, %r9
1796 adcq \$0, %r9
1797
1798 mulq %rbx
1799 addq %rax, %r9
1800 movq 24($ap), %rax
1801 movq %rdx, %r10
1802 adcq \$0, %r10
1803
1804 mulq %rbx
1805 addq %rax, %r10
1806 movq 32($ap), %rax
1807 movq %rdx, %r11
1808 adcq \$0, %r11
1809
1810 mulq %rbx
1811 addq %rax, %r11
1812 movq 40($ap), %rax
1813 movq %rdx, %r12
1814 adcq \$0, %r12
1815
1816 mulq %rbx
1817 addq %rax, %r12
1818 movq 48($ap), %rax
1819 movq %rdx, %r13
1820 adcq \$0, %r13
1821
1822 mulq %rbx
1823 addq %rax, %r13
1824 movq 56($ap), %rax
1825 movq %rdx, %r14
1826 adcq \$0, %r14
1827
1828 mulq %rbx
1829 addq %rax, %r14
1830 movq ($ap), %rax
1831 movq %rdx, %r15
1832 adcq \$0, %r15
1833
1834 leaq 8($bp), $bp
1835 leaq 8(%rdi), %rdi
1836
1837 movl \$7, %ecx
1838 jmp .Loop_mul
1839
1840 .align 32
1841 .Loop_mul:
1842 movq ($bp), %rbx
1843 mulq %rbx
1844 addq %rax, %r8
1845 movq 8($ap), %rax
1846 movq %r8, (%rdi)
1847 movq %rdx, %r8
1848 adcq \$0, %r8
1849
1850 mulq %rbx
1851 addq %rax, %r9
1852 movq 16($ap), %rax
1853 adcq \$0, %rdx
1854 addq %r9, %r8
1855 movq %rdx, %r9
1856 adcq \$0, %r9
1857
1858 mulq %rbx
1859 addq %rax, %r10
1860 movq 24($ap), %rax
1861 adcq \$0, %rdx
1862 addq %r10, %r9
1863 movq %rdx, %r10
1864 adcq \$0, %r10
1865
1866 mulq %rbx
1867 addq %rax, %r11
1868 movq 32($ap), %rax
1869 adcq \$0, %rdx
1870 addq %r11, %r10
1871 movq %rdx, %r11
1872 adcq \$0, %r11
1873
1874 mulq %rbx
1875 addq %rax, %r12
1876 movq 40($ap), %rax
1877 adcq \$0, %rdx
1878 addq %r12, %r11
1879 movq %rdx, %r12
1880 adcq \$0, %r12
1881
1882 mulq %rbx
1883 addq %rax, %r13
1884 movq 48($ap), %rax
1885 adcq \$0, %rdx
1886 addq %r13, %r12
1887 movq %rdx, %r13
1888 adcq \$0, %r13
1889
1890 mulq %rbx
1891 addq %rax, %r14
1892 movq 56($ap), %rax
1893 adcq \$0, %rdx
1894 addq %r14, %r13
1895 movq %rdx, %r14
1896 leaq 8($bp), $bp
1897 adcq \$0, %r14
1898
1899 mulq %rbx
1900 addq %rax, %r15
1901 movq ($ap), %rax
1902 adcq \$0, %rdx
1903 addq %r15, %r14
1904 movq %rdx, %r15
1905 adcq \$0, %r15
1906
1907 leaq 8(%rdi), %rdi
1908
1909 decl %ecx
1910 jnz .Loop_mul
1911
1912 movq %r8, (%rdi)
1913 movq %r9, 8(%rdi)
1914 movq %r10, 16(%rdi)
1915 movq %r11, 24(%rdi)
1916 movq %r12, 32(%rdi)
1917 movq %r13, 40(%rdi)
1918 movq %r14, 48(%rdi)
1919 movq %r15, 56(%rdi)
1920
1921 ret
1922 .size __rsaz_512_mul,.-__rsaz_512_mul
1923 ___
1924 }
1925 if ($addx) {
1926 # __rsaz_512_mulx
1927 #
1928 # input: %rsi - ap, %rbp - bp
1929 # output:
1930 # clobbers: everything
1931 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1932 $code.=<<___;
1933 .type __rsaz_512_mulx,\@abi-omnipotent
1934 .align 32
1935 __rsaz_512_mulx:
1936 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1937 mov \$-6, %rcx
1938
1939 mulx 8($ap), %rax, %r9
1940 movq %rbx, 8(%rsp)
1941
1942 mulx 16($ap), %rbx, %r10
1943 adc %rax, %r8
1944
1945 mulx 24($ap), %rax, %r11
1946 adc %rbx, %r9
1947
1948 mulx 32($ap), %rbx, %r12
1949 adc %rax, %r10
1950
1951 mulx 40($ap), %rax, %r13
1952 adc %rbx, %r11
1953
1954 mulx 48($ap), %rbx, %r14
1955 adc %rax, %r12
1956
1957 mulx 56($ap), %rax, %r15
1958 mov 8($bp), %rdx
1959 adc %rbx, %r13
1960 adc %rax, %r14
1961 adc \$0, %r15
1962
1963 xor $zero, $zero # cf=0,of=0
1964 jmp .Loop_mulx
1965
1966 .align 32
1967 .Loop_mulx:
1968 movq %r8, %rbx
1969 mulx ($ap), %rax, %r8
1970 adcx %rax, %rbx
1971 adox %r9, %r8
1972
1973 mulx 8($ap), %rax, %r9
1974 adcx %rax, %r8
1975 adox %r10, %r9
1976
1977 mulx 16($ap), %rax, %r10
1978 adcx %rax, %r9
1979 adox %r11, %r10
1980
1981 mulx 24($ap), %rax, %r11
1982 adcx %rax, %r10
1983 adox %r12, %r11
1984
1985 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1986 adcx %rax, %r11
1987 adox %r13, %r12
1988
1989 mulx 40($ap), %rax, %r13
1990 adcx %rax, %r12
1991 adox %r14, %r13
1992
1993 mulx 48($ap), %rax, %r14
1994 adcx %rax, %r13
1995 adox %r15, %r14
1996
1997 mulx 56($ap), %rax, %r15
1998 movq 64($bp,%rcx,8), %rdx
1999 movq %rbx, 8+64-8(%rsp,%rcx,8)
2000 adcx %rax, %r14
2001 adox $zero, %r15
2002 adcx $zero, %r15 # cf=0
2003
2004 inc %rcx # of=0
2005 jnz .Loop_mulx
2006
2007 movq %r8, %rbx
2008 mulx ($ap), %rax, %r8
2009 adcx %rax, %rbx
2010 adox %r9, %r8
2011
2012 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2013 adcx %rax, %r8
2014 adox %r10, %r9
2015
2016 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2017 adcx %rax, %r9
2018 adox %r11, %r10
2019
2020 mulx 24($ap), %rax, %r11
2021 adcx %rax, %r10
2022 adox %r12, %r11
2023
2024 mulx 32($ap), %rax, %r12
2025 adcx %rax, %r11
2026 adox %r13, %r12
2027
2028 mulx 40($ap), %rax, %r13
2029 adcx %rax, %r12
2030 adox %r14, %r13
2031
2032 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2033 adcx %rax, %r13
2034 adox %r15, %r14
2035
2036 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2037 adcx %rax, %r14
2038 adox $zero, %r15
2039 adcx $zero, %r15
2040
2041 mov %rbx, 8+64-8(%rsp)
2042 mov %r8, 8+64(%rsp)
2043 mov %r9, 8+64+8(%rsp)
2044 mov %r10, 8+64+16(%rsp)
2045 mov %r11, 8+64+24(%rsp)
2046 mov %r12, 8+64+32(%rsp)
2047 mov %r13, 8+64+40(%rsp)
2048 mov %r14, 8+64+48(%rsp)
2049 mov %r15, 8+64+56(%rsp)
2050
2051 ret
2052 .size __rsaz_512_mulx,.-__rsaz_512_mulx
2053 ___
2054 }
2055 {
2056 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
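# rsaz_512_scatter4 stores value number $power into a table laid out so
# that limb i of entry j lives at offset 128*i + 8*j: each 128-byte row
# holds the same limb of all 16 entries, which is what lets the gather
# routines read whole rows and select the wanted entry with masks.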
2057 $code.=<<___;
2058 .globl rsaz_512_scatter4
2059 .type rsaz_512_scatter4,\@abi-omnipotent
2060 .align 16
2061 rsaz_512_scatter4:
2062 leaq ($out,$power,8), $out
2063 movl \$8, %r9d
2064 jmp .Loop_scatter
2065 .align 16
2066 .Loop_scatter:
2067 movq ($inp), %rax
2068 leaq 8($inp), $inp
2069 movq %rax, ($out)
2070 leaq 128($out), $out
2071 decl %r9d
2072 jnz .Loop_scatter
2073 ret
2074 .size rsaz_512_scatter4,.-rsaz_512_scatter4
2075
2076 .globl rsaz_512_gather4
2077 .type rsaz_512_gather4,\@abi-omnipotent
2078 .align 16
2079 rsaz_512_gather4:
2080 ___
2081 $code.=<<___ if ($win64);
2082 .LSEH_begin_rsaz_512_gather4:
2083 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2084 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2085 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2086 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2087 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2088 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2089 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2090 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2091 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2092 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2093 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2094 ___
2095 $code.=<<___;
2096 movd $power,%xmm8
2097 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2098 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2099
2100 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2101 movdqa %xmm1,%xmm7
2102 movdqa %xmm1,%xmm2
2103 ___
2104 ########################################################################
2105 # calculate mask by comparing 0..15 to $power
2106 #
2107 for($i=0;$i<4;$i++) {
2108 $code.=<<___;
2109 paddd %xmm`$i`,%xmm`$i+1`
2110 pcmpeqd %xmm8,%xmm`$i`
2111 movdqa %xmm7,%xmm`$i+3`
2112 ___
2113 }
2114 for(;$i<7;$i++) {
2115 $code.=<<___;
2116 paddd %xmm`$i`,%xmm`$i+1`
2117 pcmpeqd %xmm8,%xmm`$i`
2118 ___
2119 }
2120 $code.=<<___;
2121 pcmpeqd %xmm8,%xmm7
2122 movl \$8, %r9d
2123 jmp .Loop_gather
2124 .align 16
2125 .Loop_gather:
2126 movdqa 16*0($inp),%xmm8
2127 movdqa 16*1($inp),%xmm9
2128 movdqa 16*2($inp),%xmm10
2129 movdqa 16*3($inp),%xmm11
2130 pand %xmm0,%xmm8
2131 movdqa 16*4($inp),%xmm12
2132 pand %xmm1,%xmm9
2133 movdqa 16*5($inp),%xmm13
2134 pand %xmm2,%xmm10
2135 movdqa 16*6($inp),%xmm14
2136 pand %xmm3,%xmm11
2137 movdqa 16*7($inp),%xmm15
2138 leaq 128($inp), $inp
2139 pand %xmm4,%xmm12
2140 pand %xmm5,%xmm13
2141 pand %xmm6,%xmm14
2142 pand %xmm7,%xmm15
2143 por %xmm10,%xmm8
2144 por %xmm11,%xmm9
2145 por %xmm12,%xmm8
2146 por %xmm13,%xmm9
2147 por %xmm14,%xmm8
2148 por %xmm15,%xmm9
2149
2150 por %xmm9,%xmm8
2151 pshufd \$0x4e,%xmm8,%xmm9
2152 por %xmm9,%xmm8
2153 movq %xmm8,($out)
2154 leaq 8($out), $out
2155 decl %r9d
2156 jnz .Loop_gather
2157 ___
2158 $code.=<<___ if ($win64);
2159 movaps 0x00(%rsp),%xmm6
2160 movaps 0x10(%rsp),%xmm7
2161 movaps 0x20(%rsp),%xmm8
2162 movaps 0x30(%rsp),%xmm9
2163 movaps 0x40(%rsp),%xmm10
2164 movaps 0x50(%rsp),%xmm11
2165 movaps 0x60(%rsp),%xmm12
2166 movaps 0x70(%rsp),%xmm13
2167 movaps 0x80(%rsp),%xmm14
2168 movaps 0x90(%rsp),%xmm15
2169 add \$0xa8,%rsp
2170 ___
2171 $code.=<<___;
2172 ret
2173 .LSEH_end_rsaz_512_gather4:
2174 .size rsaz_512_gather4,.-rsaz_512_gather4
2175
2176 .align 64
2177 .Linc:
2178 .long 0,0, 1,1
2179 .long 2,2, 2,2
2180 ___
2181 }
2182
2183 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2184 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2185 if ($win64) {
2186 $rec="%rcx";
2187 $frame="%rdx";
2188 $context="%r8";
2189 $disp="%r9";
2190
2191 $code.=<<___;
2192 .extern __imp_RtlVirtualUnwind
2193 .type se_handler,\@abi-omnipotent
2194 .align 16
2195 se_handler:
2196 push %rsi
2197 push %rdi
2198 push %rbx
2199 push %rbp
2200 push %r12
2201 push %r13
2202 push %r14
2203 push %r15
2204 pushfq
2205 sub \$64,%rsp
2206
2207 mov 120($context),%rax # pull context->Rax
2208 mov 248($context),%rbx # pull context->Rip
2209
2210 mov 8($disp),%rsi # disp->ImageBase
2211 mov 56($disp),%r11 # disp->HandlerData
2212
2213 mov 0(%r11),%r10d # HandlerData[0]
2214 lea (%rsi,%r10),%r10 # end of prologue label
2215 cmp %r10,%rbx # context->Rip<end of prologue label
2216 jb .Lcommon_seh_tail
2217
2218 mov 152($context),%rax # pull context->Rsp
2219
2220 mov 4(%r11),%r10d # HandlerData[1]
2221 lea (%rsi,%r10),%r10 # epilogue label
2222 cmp %r10,%rbx # context->Rip>=epilogue label
2223 jae .Lcommon_seh_tail
2224
2225 lea 128+24+48(%rax),%rax
2226
2227 lea .Lmul_gather4_epilogue(%rip),%rbx
2228 cmp %r10,%rbx
2229 jne .Lse_not_in_mul_gather4
2230
2231 lea 0xb0(%rax),%rax
2232
2233 lea -48-0xa8(%rax),%rsi
2234 lea 512($context),%rdi
2235 mov \$20,%ecx
2236 .long 0xa548f3fc # cld; rep movsq
2237
2238 .Lse_not_in_mul_gather4:
2239 mov -8(%rax),%rbx
2240 mov -16(%rax),%rbp
2241 mov -24(%rax),%r12
2242 mov -32(%rax),%r13
2243 mov -40(%rax),%r14
2244 mov -48(%rax),%r15
2245 mov %rbx,144($context) # restore context->Rbx
2246 mov %rbp,160($context) # restore context->Rbp
2247 mov %r12,216($context) # restore context->R12
2248 mov %r13,224($context) # restore context->R13
2249 mov %r14,232($context) # restore context->R14
2250 mov %r15,240($context) # restore context->R15
2251
2252 .Lcommon_seh_tail:
2253 mov 8(%rax),%rdi
2254 mov 16(%rax),%rsi
2255 mov %rax,152($context) # restore context->Rsp
2256 mov %rsi,168($context) # restore context->Rsi
2257 mov %rdi,176($context) # restore context->Rdi
2258
2259 mov 40($disp),%rdi # disp->ContextRecord
2260 mov $context,%rsi # context
2261 mov \$154,%ecx # sizeof(CONTEXT)
2262 .long 0xa548f3fc # cld; rep movsq
2263
2264 mov $disp,%rsi
2265 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2266 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2267 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2268 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2269 mov 40(%rsi),%r10 # disp->ContextRecord
2270 lea 56(%rsi),%r11 # &disp->HandlerData
2271 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2272 mov %r10,32(%rsp) # arg5
2273 mov %r11,40(%rsp) # arg6
2274 mov %r12,48(%rsp) # arg7
2275 mov %rcx,56(%rsp) # arg8, (NULL)
2276 call *__imp_RtlVirtualUnwind(%rip)
2277
2278 mov \$1,%eax # ExceptionContinueSearch
2279 add \$64,%rsp
2280 popfq
2281 pop %r15
2282 pop %r14
2283 pop %r13
2284 pop %r12
2285 pop %rbp
2286 pop %rbx
2287 pop %rdi
2288 pop %rsi
2289 ret
2290 .size se_handler,.-se_handler
2291
2292 .section .pdata
2293 .align 4
2294 .rva .LSEH_begin_rsaz_512_sqr
2295 .rva .LSEH_end_rsaz_512_sqr
2296 .rva .LSEH_info_rsaz_512_sqr
2297
2298 .rva .LSEH_begin_rsaz_512_mul
2299 .rva .LSEH_end_rsaz_512_mul
2300 .rva .LSEH_info_rsaz_512_mul
2301
2302 .rva .LSEH_begin_rsaz_512_mul_gather4
2303 .rva .LSEH_end_rsaz_512_mul_gather4
2304 .rva .LSEH_info_rsaz_512_mul_gather4
2305
2306 .rva .LSEH_begin_rsaz_512_mul_scatter4
2307 .rva .LSEH_end_rsaz_512_mul_scatter4
2308 .rva .LSEH_info_rsaz_512_mul_scatter4
2309
2310 .rva .LSEH_begin_rsaz_512_mul_by_one
2311 .rva .LSEH_end_rsaz_512_mul_by_one
2312 .rva .LSEH_info_rsaz_512_mul_by_one
2313
2314 .rva .LSEH_begin_rsaz_512_gather4
2315 .rva .LSEH_end_rsaz_512_gather4
2316 .rva .LSEH_info_rsaz_512_gather4
2317
2318 .section .xdata
2319 .align 8
2320 .LSEH_info_rsaz_512_sqr:
2321 .byte 9,0,0,0
2322 .rva se_handler
2323 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2324 .LSEH_info_rsaz_512_mul:
2325 .byte 9,0,0,0
2326 .rva se_handler
2327 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2328 .LSEH_info_rsaz_512_mul_gather4:
2329 .byte 9,0,0,0
2330 .rva se_handler
2331 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2332 .LSEH_info_rsaz_512_mul_scatter4:
2333 .byte 9,0,0,0
2334 .rva se_handler
2335 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2336 .LSEH_info_rsaz_512_mul_by_one:
2337 .byte 9,0,0,0
2338 .rva se_handler
2339 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2340 .LSEH_info_rsaz_512_gather4:
2341 .byte 0x01,0x46,0x16,0x00
2342 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2343 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2344 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2345 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2346 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2347 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2348 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2349 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2350 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2351 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2352 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
2353 ___
2354 }
2355
2356 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2357 print $code;
2358 close STDOUT;