1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4 #
5 # Licensed under the OpenSSL license (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
9 #
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
13 #
14 # References:
15 # [1] S. Gueron, "Efficient Software Implementations of Modular
16 # Exponentiation", http://eprint.iacr.org/2011/239
17 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18 # IEEE Proceedings of 9th International Conference on Information
19 # Technology: New Generations (ITNG 2012), 821-823 (2012).
20 # [3] S. Gueron, "Efficient Software Implementations of Modular Exponentiation",
21 # Journal of Cryptographic Engineering 2:31-43 (2012).
22 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23 # resistant 512-bit and 1024-bit modular exponentiation for optimizing
24 # RSA1024 and RSA2048 on x86_64 platforms",
25 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
26 #
27 # While the original submission covers 512- and 1024-bit exponentiation,
28 # this module is limited to the 512-bit version only (and as such
29 # accelerates RSA1024 signing). This is because the improvement for longer
30 # keys is not high enough to justify the effort; the highest measured
31 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
32 # at the time of this writing!] Nor does this module implement a
33 # "monolithic" all-in-one exponentiation jumbo-subroutine; it adheres
34 # to a more modular mixture of C and assembly. It is also optimized for
35 # processors other than the Intel Core family (see the table below for
36 # improvement coefficients).
37 # <appro@openssl.org>
38 #
39 # RSA1024 sign/sec      this/original   |this/rsax(*)   this/fips(*)
40 # --------------------------------------+------------------------------
41 # Opteron               +13%            |+5%            +20%
42 # Bulldozer             -0%             |-1%            +10%
43 # P4                    +11%            |+7%            +8%
44 # Westmere              +5%             |+14%           +17%
45 # Sandy Bridge          +2%             |+12%           +29%
46 # Ivy Bridge            +1%             |+11%           +35%
47 # Haswell(**)           -0%             |+12%           +39%
48 # Atom                  +13%            |+11%           +4%
49 # VIA Nano              +70%            |+9%            +25%
50 #
51 # (*)  rsax engine and fips numbers are presented for reference
52 #      purposes only;
53 # (**) MULX was attempted, but was found to give only marginal improvement.
54
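#
# The routines below are driven from the companion C code (rsaz_exp.c). As
# an illustrative sketch only -- the argument order is implied by the
# "my (...)" register assignments ahead of each routine, BN_ULONG is shown
# as uint64_t, and the authoritative declarations live in the C headers --
# the exported interface looks roughly like this:
#
#   /* all limb vectors are 8x64-bit, little-endian */
#   void rsaz_512_sqr         (uint64_t *out, const uint64_t *inp,
#                              const uint64_t *mod, uint64_t n0,
#                              int times);          /* repeated squarings  */
#   void rsaz_512_mul         (uint64_t *out, const uint64_t *ap,
#                              const uint64_t *bp, const uint64_t *mod,
#                              uint64_t n0);
#   void rsaz_512_mul_gather4 (uint64_t *out, const uint64_t *ap,
#                              const void *table, const uint64_t *mod,
#                              uint64_t n0, int power);
#   void rsaz_512_mul_scatter4(uint64_t *out, const uint64_t *ap,
#                              const uint64_t *mod, uint64_t n0,
#                              void *table, int power);
#   void rsaz_512_mul_by_one  (uint64_t *out, const uint64_t *inp,
#                              const uint64_t *mod, uint64_t n0);
#   void rsaz_512_scatter4    (void *table, const uint64_t *inp, int power);
#   void rsaz_512_gather4     (uint64_t *out, const void *table, int power);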
55 $flavour = shift;
56 $output = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
65
66 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67 *STDOUT=*OUT;
68
69 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
70 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
71 $addx = ($1>=2.23);
72 }
73
74 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
75 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
76 $addx = ($1>=2.10);
77 }
78
79 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
80 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
81 $addx = ($1>=12);
82 }
83
84 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
85 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
86 $addx = ($ver>=3.03);
87 }
88
89 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
90 {
91 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
92
93 $code.=<<___;
94 .text
95
96 .extern OPENSSL_ia32cap_P
97
98 .globl rsaz_512_sqr
99 .type rsaz_512_sqr,\@function,5
100 .align 32
101 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
102 .cfi_startproc
103 push %rbx
104 .cfi_push %rbx
105 push %rbp
106 .cfi_push %rbp
107 push %r12
108 .cfi_push %r12
109 push %r13
110 .cfi_push %r13
111 push %r14
112 .cfi_push %r14
113 push %r15
114 .cfi_push %r15
115
116 subq \$128+24, %rsp
117 .cfi_adjust_cfa_offset 128+24
118 .Lsqr_body:
119 movq $mod, %rbp # common argument
120 movq ($inp), %rdx
121 movq 8($inp), %rax
122 movq $n0, 128(%rsp)
123 ___
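# The run-time dispatch below (and its twins in the other entry points)
# requires both MULX and ADCX/ADOX. The third 32-bit word of
# OPENSSL_ia32cap_P mirrors CPUID(EAX=7,ECX=0).EBX, where bit 8 is BMI2
# (MULX) and bit 19 is ADX, hence the 0x80100 mask. A rough C equivalent
# (sketch only, the variable name is made up):
#
#   extern unsigned int OPENSSL_ia32cap_P[4];
#   int use_mulx_adx = (OPENSSL_ia32cap_P[2] & 0x80100) == 0x80100;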
124 $code.=<<___ if ($addx);
125 movl \$0x80100,%r11d
126 andl OPENSSL_ia32cap_P+8(%rip),%r11d
127 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
128 je .Loop_sqrx
129 ___
130 $code.=<<___;
131 jmp .Loop_sqr
132
133 .align 32
134 .Loop_sqr:
135 movl $times,128+8(%rsp)
136 #first iteration
137 movq %rdx, %rbx
138 mulq %rdx
139 movq %rax, %r8
140 movq 16($inp), %rax
141 movq %rdx, %r9
142
143 mulq %rbx
144 addq %rax, %r9
145 movq 24($inp), %rax
146 movq %rdx, %r10
147 adcq \$0, %r10
148
149 mulq %rbx
150 addq %rax, %r10
151 movq 32($inp), %rax
152 movq %rdx, %r11
153 adcq \$0, %r11
154
155 mulq %rbx
156 addq %rax, %r11
157 movq 40($inp), %rax
158 movq %rdx, %r12
159 adcq \$0, %r12
160
161 mulq %rbx
162 addq %rax, %r12
163 movq 48($inp), %rax
164 movq %rdx, %r13
165 adcq \$0, %r13
166
167 mulq %rbx
168 addq %rax, %r13
169 movq 56($inp), %rax
170 movq %rdx, %r14
171 adcq \$0, %r14
172
173 mulq %rbx
174 addq %rax, %r14
175 movq %rbx, %rax
176 movq %rdx, %r15
177 adcq \$0, %r15
178
179 addq %r8, %r8 #shlq \$1, %r8
180 movq %r9, %rcx
181 adcq %r9, %r9 #shld \$1, %r8, %r9
182
183 mulq %rax
184 movq %rax, (%rsp)
185 addq %rdx, %r8
186 adcq \$0, %r9
187
188 movq %r8, 8(%rsp)
189 shrq \$63, %rcx
190
191 #second iteration
192 movq 8($inp), %r8
193 movq 16($inp), %rax
194 mulq %r8
195 addq %rax, %r10
196 movq 24($inp), %rax
197 movq %rdx, %rbx
198 adcq \$0, %rbx
199
200 mulq %r8
201 addq %rax, %r11
202 movq 32($inp), %rax
203 adcq \$0, %rdx
204 addq %rbx, %r11
205 movq %rdx, %rbx
206 adcq \$0, %rbx
207
208 mulq %r8
209 addq %rax, %r12
210 movq 40($inp), %rax
211 adcq \$0, %rdx
212 addq %rbx, %r12
213 movq %rdx, %rbx
214 adcq \$0, %rbx
215
216 mulq %r8
217 addq %rax, %r13
218 movq 48($inp), %rax
219 adcq \$0, %rdx
220 addq %rbx, %r13
221 movq %rdx, %rbx
222 adcq \$0, %rbx
223
224 mulq %r8
225 addq %rax, %r14
226 movq 56($inp), %rax
227 adcq \$0, %rdx
228 addq %rbx, %r14
229 movq %rdx, %rbx
230 adcq \$0, %rbx
231
232 mulq %r8
233 addq %rax, %r15
234 movq %r8, %rax
235 adcq \$0, %rdx
236 addq %rbx, %r15
237 movq %rdx, %r8
238 movq %r10, %rdx
239 adcq \$0, %r8
240
241 add %rdx, %rdx
242 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
243 movq %r11, %rbx
244 adcq %r11, %r11 #shld \$1, %r10, %r11
245
246 mulq %rax
247 addq %rax, %r9
248 adcq %rdx, %r10
249 adcq \$0, %r11
250
251 movq %r9, 16(%rsp)
252 movq %r10, 24(%rsp)
253 shrq \$63, %rbx
254
255 #third iteration
256 movq 16($inp), %r9
257 movq 24($inp), %rax
258 mulq %r9
259 addq %rax, %r12
260 movq 32($inp), %rax
261 movq %rdx, %rcx
262 adcq \$0, %rcx
263
264 mulq %r9
265 addq %rax, %r13
266 movq 40($inp), %rax
267 adcq \$0, %rdx
268 addq %rcx, %r13
269 movq %rdx, %rcx
270 adcq \$0, %rcx
271
272 mulq %r9
273 addq %rax, %r14
274 movq 48($inp), %rax
275 adcq \$0, %rdx
276 addq %rcx, %r14
277 movq %rdx, %rcx
278 adcq \$0, %rcx
279
280 mulq %r9
281 movq %r12, %r10
282 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
283 addq %rax, %r15
284 movq 56($inp), %rax
285 adcq \$0, %rdx
286 addq %rcx, %r15
287 movq %rdx, %rcx
288 adcq \$0, %rcx
289
290 mulq %r9
291 shrq \$63, %r10
292 addq %rax, %r8
293 movq %r9, %rax
294 adcq \$0, %rdx
295 addq %rcx, %r8
296 movq %rdx, %r9
297 adcq \$0, %r9
298
299 movq %r13, %rcx
300 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
301
302 mulq %rax
303 addq %rax, %r11
304 adcq %rdx, %r12
305 adcq \$0, %r13
306
307 movq %r11, 32(%rsp)
308 movq %r12, 40(%rsp)
309 shrq \$63, %rcx
310
311 #fourth iteration
312 movq 24($inp), %r10
313 movq 32($inp), %rax
314 mulq %r10
315 addq %rax, %r14
316 movq 40($inp), %rax
317 movq %rdx, %rbx
318 adcq \$0, %rbx
319
320 mulq %r10
321 addq %rax, %r15
322 movq 48($inp), %rax
323 adcq \$0, %rdx
324 addq %rbx, %r15
325 movq %rdx, %rbx
326 adcq \$0, %rbx
327
328 mulq %r10
329 movq %r14, %r12
330 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
331 addq %rax, %r8
332 movq 56($inp), %rax
333 adcq \$0, %rdx
334 addq %rbx, %r8
335 movq %rdx, %rbx
336 adcq \$0, %rbx
337
338 mulq %r10
339 shrq \$63, %r12
340 addq %rax, %r9
341 movq %r10, %rax
342 adcq \$0, %rdx
343 addq %rbx, %r9
344 movq %rdx, %r10
345 adcq \$0, %r10
346
347 movq %r15, %rbx
348 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
349
350 mulq %rax
351 addq %rax, %r13
352 adcq %rdx, %r14
353 adcq \$0, %r15
354
355 movq %r13, 48(%rsp)
356 movq %r14, 56(%rsp)
357 shrq \$63, %rbx
358
359 #fifth iteration
360 movq 32($inp), %r11
361 movq 40($inp), %rax
362 mulq %r11
363 addq %rax, %r8
364 movq 48($inp), %rax
365 movq %rdx, %rcx
366 adcq \$0, %rcx
367
368 mulq %r11
369 addq %rax, %r9
370 movq 56($inp), %rax
371 adcq \$0, %rdx
372 movq %r8, %r12
373 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
374 addq %rcx, %r9
375 movq %rdx, %rcx
376 adcq \$0, %rcx
377
378 mulq %r11
379 shrq \$63, %r12
380 addq %rax, %r10
381 movq %r11, %rax
382 adcq \$0, %rdx
383 addq %rcx, %r10
384 movq %rdx, %r11
385 adcq \$0, %r11
386
387 movq %r9, %rcx
388 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
389
390 mulq %rax
391 addq %rax, %r15
392 adcq %rdx, %r8
393 adcq \$0, %r9
394
395 movq %r15, 64(%rsp)
396 movq %r8, 72(%rsp)
397 shrq \$63, %rcx
398
399 #sixth iteration
400 movq 40($inp), %r12
401 movq 48($inp), %rax
402 mulq %r12
403 addq %rax, %r10
404 movq 56($inp), %rax
405 movq %rdx, %rbx
406 adcq \$0, %rbx
407
408 mulq %r12
409 addq %rax, %r11
410 movq %r12, %rax
411 movq %r10, %r15
412 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
413 adcq \$0, %rdx
414 shrq \$63, %r15
415 addq %rbx, %r11
416 movq %rdx, %r12
417 adcq \$0, %r12
418
419 movq %r11, %rbx
420 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
421
422 mulq %rax
423 addq %rax, %r9
424 adcq %rdx, %r10
425 adcq \$0, %r11
426
427 movq %r9, 80(%rsp)
428 movq %r10, 88(%rsp)
429
430 #seventh iteration
431 movq 48($inp), %r13
432 movq 56($inp), %rax
433 mulq %r13
434 addq %rax, %r12
435 movq %r13, %rax
436 movq %rdx, %r13
437 adcq \$0, %r13
438
439 xorq %r14, %r14
440 shlq \$1, %rbx
441 adcq %r12, %r12 #shld \$1, %rbx, %r12
442 adcq %r13, %r13 #shld \$1, %r12, %r13
443 adcq %r14, %r14 #shld \$1, %r13, %r14
444
445 mulq %rax
446 addq %rax, %r11
447 adcq %rdx, %r12
448 adcq \$0, %r13
449
450 movq %r11, 96(%rsp)
451 movq %r12, 104(%rsp)
452
453 #eighth iteration
454 movq 56($inp), %rax
455 mulq %rax
456 addq %rax, %r13
457 adcq \$0, %rdx
458
459 addq %rdx, %r14
460
461 movq %r13, 112(%rsp)
462 movq %r14, 120(%rsp)
463
464 movq (%rsp), %r8
465 movq 8(%rsp), %r9
466 movq 16(%rsp), %r10
467 movq 24(%rsp), %r11
468 movq 32(%rsp), %r12
469 movq 40(%rsp), %r13
470 movq 48(%rsp), %r14
471 movq 56(%rsp), %r15
472
473 call __rsaz_512_reduce
474
475 addq 64(%rsp), %r8
476 adcq 72(%rsp), %r9
477 adcq 80(%rsp), %r10
478 adcq 88(%rsp), %r11
479 adcq 96(%rsp), %r12
480 adcq 104(%rsp), %r13
481 adcq 112(%rsp), %r14
482 adcq 120(%rsp), %r15
483 sbbq %rcx, %rcx
484
485 call __rsaz_512_subtract
486
487 movq %r8, %rdx
488 movq %r9, %rax
489 movl 128+8(%rsp), $times
490 movq $out, $inp
491
492 decl $times
493 jnz .Loop_sqr
494 ___
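# The scalar .Loop_sqr path above evaluates the square as "cross products
# a[i]*a[j] (i<j), doubled, plus the diagonal a[i]^2"; the doubling is folded
# into the iterations via the add/adc (shld-style) sequences. A hypothetical
# C reference of the same arithmetic (illustrative only, uses the GCC/Clang
# unsigned __int128 extension; the Montgomery reduction half is handled by
# __rsaz_512_reduce further down):
#
#   static void ref_sqr_512(uint64_t r[16], const uint64_t a[8])
#   {
#       unsigned __int128 t;
#       uint64_t c;
#       int i, j;
#
#       for (i = 0; i < 16; i++) r[i] = 0;
#       for (i = 0; i < 8; i++) {                /* off-diagonal products    */
#           for (c = 0, j = i + 1; j < 8; j++) {
#               t = (unsigned __int128)a[i] * a[j] + r[i + j] + c;
#               r[i + j] = (uint64_t)t;
#               c = (uint64_t)(t >> 64);
#           }
#           r[i + 8] = c;
#       }
#       for (c = 0, i = 0; i < 16; i++) {        /* double (shift left by 1) */
#           uint64_t msb = r[i] >> 63;
#           r[i] = (r[i] << 1) | c;
#           c = msb;
#       }
#       for (c = 0, i = 0; i < 8; i++) {         /* add the a[i]^2 diagonal  */
#           t = (unsigned __int128)a[i] * a[i] + r[2 * i] + c;
#           r[2 * i] = (uint64_t)t;
#           t = (unsigned __int128)r[2 * i + 1] + (uint64_t)(t >> 64);
#           r[2 * i + 1] = (uint64_t)t;
#           c = (uint64_t)(t >> 64);
#       }
#   }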
495 if ($addx) {
496 $code.=<<___;
497 jmp .Lsqr_tail
498
499 .align 32
500 .Loop_sqrx:
501 movl $times,128+8(%rsp)
502 movq $out, %xmm0 # off-load
503 movq %rbp, %xmm1 # off-load
504 #first iteration
505 mulx %rax, %r8, %r9
506
507 mulx 16($inp), %rcx, %r10
508 xor %rbp, %rbp # cf=0, of=0
509
510 mulx 24($inp), %rax, %r11
511 adcx %rcx, %r9
512
513 mulx 32($inp), %rcx, %r12
514 adcx %rax, %r10
515
516 mulx 40($inp), %rax, %r13
517 adcx %rcx, %r11
518
519 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
520 adcx %rax, %r12
521 adcx %rcx, %r13
522
523 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
524 adcx %rax, %r14
525 adcx %rbp, %r15 # %rbp is 0
526
527 mov %r9, %rcx
528 shld \$1, %r8, %r9
529 shl \$1, %r8
530
531 xor %ebp, %ebp
532 mulx %rdx, %rax, %rdx
533 adcx %rdx, %r8
534 mov 8($inp), %rdx
535 adcx %rbp, %r9
536
537 mov %rax, (%rsp)
538 mov %r8, 8(%rsp)
539
540 #second iteration
541 mulx 16($inp), %rax, %rbx
542 adox %rax, %r10
543 adcx %rbx, %r11
544
545 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
546 adox $out, %r11
547 adcx %r8, %r12
548
549 mulx 32($inp), %rax, %rbx
550 adox %rax, %r12
551 adcx %rbx, %r13
552
553 mulx 40($inp), $out, %r8
554 adox $out, %r13
555 adcx %r8, %r14
556
557 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
558 adox %rax, %r14
559 adcx %rbx, %r15
560
561 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
562 adox $out, %r15
563 adcx %rbp, %r8
564 adox %rbp, %r8
565
566 mov %r11, %rbx
567 shld \$1, %r10, %r11
568 shld \$1, %rcx, %r10
569
570 xor %ebp,%ebp
571 mulx %rdx, %rax, %rcx
572 mov 16($inp), %rdx
573 adcx %rax, %r9
574 adcx %rcx, %r10
575 adcx %rbp, %r11
576
577 mov %r9, 16(%rsp)
578 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
579
580 #third iteration
581 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
582 adox $out, %r12
583 adcx %r9, %r13
584
585 mulx 32($inp), %rax, %rcx
586 adox %rax, %r13
587 adcx %rcx, %r14
588
589 mulx 40($inp), $out, %r9
590 adox $out, %r14
591 adcx %r9, %r15
592
593 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
594 adox %rax, %r15
595 adcx %rcx, %r8
596
597 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
598 adox $out, %r8
599 adcx %rbp, %r9
600 adox %rbp, %r9
601
602 mov %r13, %rcx
603 shld \$1, %r12, %r13
604 shld \$1, %rbx, %r12
605
606 xor %ebp, %ebp
607 mulx %rdx, %rax, %rdx
608 adcx %rax, %r11
609 adcx %rdx, %r12
610 mov 24($inp), %rdx
611 adcx %rbp, %r13
612
613 mov %r11, 32(%rsp)
614 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
615
616 #fourth iteration
617 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
618 adox %rax, %r14
619 adcx %rbx, %r15
620
621 mulx 40($inp), $out, %r10
622 adox $out, %r15
623 adcx %r10, %r8
624
625 mulx 48($inp), %rax, %rbx
626 adox %rax, %r8
627 adcx %rbx, %r9
628
629 mulx 56($inp), $out, %r10
630 adox $out, %r9
631 adcx %rbp, %r10
632 adox %rbp, %r10
633
634 .byte 0x66
635 mov %r15, %rbx
636 shld \$1, %r14, %r15
637 shld \$1, %rcx, %r14
638
639 xor %ebp, %ebp
640 mulx %rdx, %rax, %rdx
641 adcx %rax, %r13
642 adcx %rdx, %r14
643 mov 32($inp), %rdx
644 adcx %rbp, %r15
645
646 mov %r13, 48(%rsp)
647 mov %r14, 56(%rsp)
648
649 #fifth iteration
650 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
651 adox $out, %r8
652 adcx %r11, %r9
653
654 mulx 48($inp), %rax, %rcx
655 adox %rax, %r9
656 adcx %rcx, %r10
657
658 mulx 56($inp), $out, %r11
659 adox $out, %r10
660 adcx %rbp, %r11
661 adox %rbp, %r11
662
663 mov %r9, %rcx
664 shld \$1, %r8, %r9
665 shld \$1, %rbx, %r8
666
667 xor %ebp, %ebp
668 mulx %rdx, %rax, %rdx
669 adcx %rax, %r15
670 adcx %rdx, %r8
671 mov 40($inp), %rdx
672 adcx %rbp, %r9
673
674 mov %r15, 64(%rsp)
675 mov %r8, 72(%rsp)
676
677 #sixth iteration
678 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
679 adox %rax, %r10
680 adcx %rbx, %r11
681
682 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
683 adox $out, %r11
684 adcx %rbp, %r12
685 adox %rbp, %r12
686
687 mov %r11, %rbx
688 shld \$1, %r10, %r11
689 shld \$1, %rcx, %r10
690
691 xor %ebp, %ebp
692 mulx %rdx, %rax, %rdx
693 adcx %rax, %r9
694 adcx %rdx, %r10
695 mov 48($inp), %rdx
696 adcx %rbp, %r11
697
698 mov %r9, 80(%rsp)
699 mov %r10, 88(%rsp)
700
701 #seventh iteration
702 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
703 adox %rax, %r12
704 adox %rbp, %r13
705
706 xor %r14, %r14
707 shld \$1, %r13, %r14
708 shld \$1, %r12, %r13
709 shld \$1, %rbx, %r12
710
711 xor %ebp, %ebp
712 mulx %rdx, %rax, %rdx
713 adcx %rax, %r11
714 adcx %rdx, %r12
715 mov 56($inp), %rdx
716 adcx %rbp, %r13
717
718 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
719 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
720
721 #eighth iteration
722 mulx %rdx, %rax, %rdx
723 adox %rax, %r13
724 adox %rbp, %rdx
725
726 .byte 0x66
727 add %rdx, %r14
728
729 movq %r13, 112(%rsp)
730 movq %r14, 120(%rsp)
731 movq %xmm0, $out
732 movq %xmm1, %rbp
733
734 movq 128(%rsp), %rdx # pull $n0
735 movq (%rsp), %r8
736 movq 8(%rsp), %r9
737 movq 16(%rsp), %r10
738 movq 24(%rsp), %r11
739 movq 32(%rsp), %r12
740 movq 40(%rsp), %r13
741 movq 48(%rsp), %r14
742 movq 56(%rsp), %r15
743
744 call __rsaz_512_reducex
745
746 addq 64(%rsp), %r8
747 adcq 72(%rsp), %r9
748 adcq 80(%rsp), %r10
749 adcq 88(%rsp), %r11
750 adcq 96(%rsp), %r12
751 adcq 104(%rsp), %r13
752 adcq 112(%rsp), %r14
753 adcq 120(%rsp), %r15
754 sbbq %rcx, %rcx
755
756 call __rsaz_512_subtract
757
758 movq %r8, %rdx
759 movq %r9, %rax
760 movl 128+8(%rsp), $times
761 movq $out, $inp
762
763 decl $times
764 jnz .Loop_sqrx
765
766 .Lsqr_tail:
767 ___
768 }
769 $code.=<<___;
770
771 leaq 128+24+48(%rsp), %rax
772 .cfi_def_cfa %rax,8
773 movq -48(%rax), %r15
774 .cfi_restore %r15
775 movq -40(%rax), %r14
776 .cfi_restore %r14
777 movq -32(%rax), %r13
778 .cfi_restore %r13
779 movq -24(%rax), %r12
780 .cfi_restore %r12
781 movq -16(%rax), %rbp
782 .cfi_restore %rbp
783 movq -8(%rax), %rbx
784 .cfi_restore %rbx
785 leaq (%rax), %rsp
786 .cfi_def_cfa_register %rsp
787 .Lsqr_epilogue:
788 ret
789 .cfi_endproc
790 .size rsaz_512_sqr,.-rsaz_512_sqr
791 ___
792 }
793 {
794 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
795 $code.=<<___;
796 .globl rsaz_512_mul
797 .type rsaz_512_mul,\@function,5
798 .align 32
799 rsaz_512_mul:
800 .cfi_startproc
801 push %rbx
802 .cfi_push %rbx
803 push %rbp
804 .cfi_push %rbp
805 push %r12
806 .cfi_push %r12
807 push %r13
808 .cfi_push %r13
809 push %r14
810 .cfi_push %r14
811 push %r15
812 .cfi_push %r15
813
814 subq \$128+24, %rsp
815 .cfi_adjust_cfa_offset 128+24
816 .Lmul_body:
817 movq $out, %xmm0 # off-load arguments
818 movq $mod, %xmm1
819 movq $n0, 128(%rsp)
820 ___
821 $code.=<<___ if ($addx);
822 movl \$0x80100,%r11d
823 andl OPENSSL_ia32cap_P+8(%rip),%r11d
824 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
825 je .Lmulx
826 ___
827 $code.=<<___;
828 movq ($bp), %rbx # pass b[0]
829 movq $bp, %rbp # pass argument
830 call __rsaz_512_mul
831
832 movq %xmm0, $out
833 movq %xmm1, %rbp
834
835 movq (%rsp), %r8
836 movq 8(%rsp), %r9
837 movq 16(%rsp), %r10
838 movq 24(%rsp), %r11
839 movq 32(%rsp), %r12
840 movq 40(%rsp), %r13
841 movq 48(%rsp), %r14
842 movq 56(%rsp), %r15
843
844 call __rsaz_512_reduce
845 ___
846 $code.=<<___ if ($addx);
847 jmp .Lmul_tail
848
849 .align 32
850 .Lmulx:
851 movq $bp, %rbp # pass argument
852 movq ($bp), %rdx # pass b[0]
853 call __rsaz_512_mulx
854
855 movq %xmm0, $out
856 movq %xmm1, %rbp
857
858 movq 128(%rsp), %rdx # pull $n0
859 movq (%rsp), %r8
860 movq 8(%rsp), %r9
861 movq 16(%rsp), %r10
862 movq 24(%rsp), %r11
863 movq 32(%rsp), %r12
864 movq 40(%rsp), %r13
865 movq 48(%rsp), %r14
866 movq 56(%rsp), %r15
867
868 call __rsaz_512_reducex
869 .Lmul_tail:
870 ___
871 $code.=<<___;
872 addq 64(%rsp), %r8
873 adcq 72(%rsp), %r9
874 adcq 80(%rsp), %r10
875 adcq 88(%rsp), %r11
876 adcq 96(%rsp), %r12
877 adcq 104(%rsp), %r13
878 adcq 112(%rsp), %r14
879 adcq 120(%rsp), %r15
880 sbbq %rcx, %rcx
881
882 call __rsaz_512_subtract
883
884 leaq 128+24+48(%rsp), %rax
885 .cfi_def_cfa %rax,8
886 movq -48(%rax), %r15
887 .cfi_restore %r15
888 movq -40(%rax), %r14
889 .cfi_restore %r14
890 movq -32(%rax), %r13
891 .cfi_restore %r13
892 movq -24(%rax), %r12
893 .cfi_restore %r12
894 movq -16(%rax), %rbp
895 .cfi_restore %rbp
896 movq -8(%rax), %rbx
897 .cfi_restore %rbx
898 leaq (%rax), %rsp
899 .cfi_def_cfa_register %rsp
900 .Lmul_epilogue:
901 ret
902 .cfi_endproc
903 .size rsaz_512_mul,.-rsaz_512_mul
904 ___
905 }
906 {
907 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
908 $code.=<<___;
909 .globl rsaz_512_mul_gather4
910 .type rsaz_512_mul_gather4,\@function,6
911 .align 32
912 rsaz_512_mul_gather4:
913 .cfi_startproc
914 push %rbx
915 .cfi_push %rbx
916 push %rbp
917 .cfi_push %rbp
918 push %r12
919 .cfi_push %r12
920 push %r13
921 .cfi_push %r13
922 push %r14
923 .cfi_push %r14
924 push %r15
925 .cfi_push %r15
926
927 subq \$`128+24+($win64?0xb0:0)`, %rsp
928 .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
929 ___
930 $code.=<<___ if ($win64);
931 movaps %xmm6,0xa0(%rsp)
932 movaps %xmm7,0xb0(%rsp)
933 movaps %xmm8,0xc0(%rsp)
934 movaps %xmm9,0xd0(%rsp)
935 movaps %xmm10,0xe0(%rsp)
936 movaps %xmm11,0xf0(%rsp)
937 movaps %xmm12,0x100(%rsp)
938 movaps %xmm13,0x110(%rsp)
939 movaps %xmm14,0x120(%rsp)
940 movaps %xmm15,0x130(%rsp)
941 ___
942 $code.=<<___;
943 .Lmul_gather4_body:
944 movd $pwr,%xmm8
945 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
946 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
947
948 pshufd \$0,%xmm8,%xmm8 # broadcast $power
949 movdqa %xmm1,%xmm7
950 movdqa %xmm1,%xmm2
951 ___
952 ########################################################################
953 # calculate mask by comparing 0..15 to $power
954 #
955 for($i=0;$i<4;$i++) {
956 $code.=<<___;
957 paddd %xmm`$i`,%xmm`$i+1`
958 pcmpeqd %xmm8,%xmm`$i`
959 movdqa %xmm7,%xmm`$i+3`
960 ___
961 }
962 for(;$i<7;$i++) {
963 $code.=<<___;
964 paddd %xmm`$i`,%xmm`$i+1`
965 pcmpeqd %xmm8,%xmm`$i`
966 ___
967 }
968 $code.=<<___;
969 pcmpeqd %xmm8,%xmm7
970
971 movdqa 16*0($bp),%xmm8
972 movdqa 16*1($bp),%xmm9
973 movdqa 16*2($bp),%xmm10
974 movdqa 16*3($bp),%xmm11
975 pand %xmm0,%xmm8
976 movdqa 16*4($bp),%xmm12
977 pand %xmm1,%xmm9
978 movdqa 16*5($bp),%xmm13
979 pand %xmm2,%xmm10
980 movdqa 16*6($bp),%xmm14
981 pand %xmm3,%xmm11
982 movdqa 16*7($bp),%xmm15
983 leaq 128($bp), %rbp
984 pand %xmm4,%xmm12
985 pand %xmm5,%xmm13
986 pand %xmm6,%xmm14
987 pand %xmm7,%xmm15
988 por %xmm10,%xmm8
989 por %xmm11,%xmm9
990 por %xmm12,%xmm8
991 por %xmm13,%xmm9
992 por %xmm14,%xmm8
993 por %xmm15,%xmm9
994
995 por %xmm9,%xmm8
996 pshufd \$0x4e,%xmm8,%xmm9
997 por %xmm9,%xmm8
998 ___
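# The xmm0-xmm7 masks built above implement a constant-time (cache-timing
# safe) gather: every one of the 16 table entries is read on every pass and
# the unwanted ones are squashed with pand/por, so the memory access pattern
# does not depend on $power. rsaz_512_gather4 near the end of this file uses
# the same scheme. A scalar C sketch of the selection (hypothetical helper;
# the table layout -- limb j of window value k at 64-bit index j*16+k --
# matches what rsaz_512_scatter4 writes):
#
#   static void ref_gather4(uint64_t out[8], const uint64_t tbl[8 * 16],
#                           unsigned int power)           /* 0 <= power < 16 */
#   {
#       for (int j = 0; j < 8; j++) {
#           uint64_t limb = 0;
#           for (unsigned int k = 0; k < 16; k++) {
#               uint64_t mask = (uint64_t)0 - (uint64_t)(k == power);
#               limb |= tbl[j * 16 + k] & mask;  /* all-ones only for k==power */
#           }
#           out[j] = limb;
#       }
#   }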
999 $code.=<<___ if ($addx);
1000 movl \$0x80100,%r11d
1001 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1002 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
1003 je .Lmulx_gather
1004 ___
1005 $code.=<<___;
1006 movq %xmm8,%rbx
1007
1008 movq $n0, 128(%rsp) # off-load arguments
1009 movq $out, 128+8(%rsp)
1010 movq $mod, 128+16(%rsp)
1011
1012 movq ($ap), %rax
1013 movq 8($ap), %rcx
1014 mulq %rbx # 0 iteration
1015 movq %rax, (%rsp)
1016 movq %rcx, %rax
1017 movq %rdx, %r8
1018
1019 mulq %rbx
1020 addq %rax, %r8
1021 movq 16($ap), %rax
1022 movq %rdx, %r9
1023 adcq \$0, %r9
1024
1025 mulq %rbx
1026 addq %rax, %r9
1027 movq 24($ap), %rax
1028 movq %rdx, %r10
1029 adcq \$0, %r10
1030
1031 mulq %rbx
1032 addq %rax, %r10
1033 movq 32($ap), %rax
1034 movq %rdx, %r11
1035 adcq \$0, %r11
1036
1037 mulq %rbx
1038 addq %rax, %r11
1039 movq 40($ap), %rax
1040 movq %rdx, %r12
1041 adcq \$0, %r12
1042
1043 mulq %rbx
1044 addq %rax, %r12
1045 movq 48($ap), %rax
1046 movq %rdx, %r13
1047 adcq \$0, %r13
1048
1049 mulq %rbx
1050 addq %rax, %r13
1051 movq 56($ap), %rax
1052 movq %rdx, %r14
1053 adcq \$0, %r14
1054
1055 mulq %rbx
1056 addq %rax, %r14
1057 movq ($ap), %rax
1058 movq %rdx, %r15
1059 adcq \$0, %r15
1060
1061 leaq 8(%rsp), %rdi
1062 movl \$7, %ecx
1063 jmp .Loop_mul_gather
1064
1065 .align 32
1066 .Loop_mul_gather:
1067 movdqa 16*0(%rbp),%xmm8
1068 movdqa 16*1(%rbp),%xmm9
1069 movdqa 16*2(%rbp),%xmm10
1070 movdqa 16*3(%rbp),%xmm11
1071 pand %xmm0,%xmm8
1072 movdqa 16*4(%rbp),%xmm12
1073 pand %xmm1,%xmm9
1074 movdqa 16*5(%rbp),%xmm13
1075 pand %xmm2,%xmm10
1076 movdqa 16*6(%rbp),%xmm14
1077 pand %xmm3,%xmm11
1078 movdqa 16*7(%rbp),%xmm15
1079 leaq 128(%rbp), %rbp
1080 pand %xmm4,%xmm12
1081 pand %xmm5,%xmm13
1082 pand %xmm6,%xmm14
1083 pand %xmm7,%xmm15
1084 por %xmm10,%xmm8
1085 por %xmm11,%xmm9
1086 por %xmm12,%xmm8
1087 por %xmm13,%xmm9
1088 por %xmm14,%xmm8
1089 por %xmm15,%xmm9
1090
1091 por %xmm9,%xmm8
1092 pshufd \$0x4e,%xmm8,%xmm9
1093 por %xmm9,%xmm8
1094 movq %xmm8,%rbx
1095
1096 mulq %rbx
1097 addq %rax, %r8
1098 movq 8($ap), %rax
1099 movq %r8, (%rdi)
1100 movq %rdx, %r8
1101 adcq \$0, %r8
1102
1103 mulq %rbx
1104 addq %rax, %r9
1105 movq 16($ap), %rax
1106 adcq \$0, %rdx
1107 addq %r9, %r8
1108 movq %rdx, %r9
1109 adcq \$0, %r9
1110
1111 mulq %rbx
1112 addq %rax, %r10
1113 movq 24($ap), %rax
1114 adcq \$0, %rdx
1115 addq %r10, %r9
1116 movq %rdx, %r10
1117 adcq \$0, %r10
1118
1119 mulq %rbx
1120 addq %rax, %r11
1121 movq 32($ap), %rax
1122 adcq \$0, %rdx
1123 addq %r11, %r10
1124 movq %rdx, %r11
1125 adcq \$0, %r11
1126
1127 mulq %rbx
1128 addq %rax, %r12
1129 movq 40($ap), %rax
1130 adcq \$0, %rdx
1131 addq %r12, %r11
1132 movq %rdx, %r12
1133 adcq \$0, %r12
1134
1135 mulq %rbx
1136 addq %rax, %r13
1137 movq 48($ap), %rax
1138 adcq \$0, %rdx
1139 addq %r13, %r12
1140 movq %rdx, %r13
1141 adcq \$0, %r13
1142
1143 mulq %rbx
1144 addq %rax, %r14
1145 movq 56($ap), %rax
1146 adcq \$0, %rdx
1147 addq %r14, %r13
1148 movq %rdx, %r14
1149 adcq \$0, %r14
1150
1151 mulq %rbx
1152 addq %rax, %r15
1153 movq ($ap), %rax
1154 adcq \$0, %rdx
1155 addq %r15, %r14
1156 movq %rdx, %r15
1157 adcq \$0, %r15
1158
1159 leaq 8(%rdi), %rdi
1160
1161 decl %ecx
1162 jnz .Loop_mul_gather
1163
1164 movq %r8, (%rdi)
1165 movq %r9, 8(%rdi)
1166 movq %r10, 16(%rdi)
1167 movq %r11, 24(%rdi)
1168 movq %r12, 32(%rdi)
1169 movq %r13, 40(%rdi)
1170 movq %r14, 48(%rdi)
1171 movq %r15, 56(%rdi)
1172
1173 movq 128+8(%rsp), $out
1174 movq 128+16(%rsp), %rbp
1175
1176 movq (%rsp), %r8
1177 movq 8(%rsp), %r9
1178 movq 16(%rsp), %r10
1179 movq 24(%rsp), %r11
1180 movq 32(%rsp), %r12
1181 movq 40(%rsp), %r13
1182 movq 48(%rsp), %r14
1183 movq 56(%rsp), %r15
1184
1185 call __rsaz_512_reduce
1186 ___
1187 $code.=<<___ if ($addx);
1188 jmp .Lmul_gather_tail
1189
1190 .align 32
1191 .Lmulx_gather:
1192 movq %xmm8,%rdx
1193
1194 mov $n0, 128(%rsp) # off-load arguments
1195 mov $out, 128+8(%rsp)
1196 mov $mod, 128+16(%rsp)
1197
1198 mulx ($ap), %rbx, %r8 # 0 iteration
1199 mov %rbx, (%rsp)
1200 xor %edi, %edi # cf=0, of=0
1201
1202 mulx 8($ap), %rax, %r9
1203
1204 mulx 16($ap), %rbx, %r10
1205 adcx %rax, %r8
1206
1207 mulx 24($ap), %rax, %r11
1208 adcx %rbx, %r9
1209
1210 mulx 32($ap), %rbx, %r12
1211 adcx %rax, %r10
1212
1213 mulx 40($ap), %rax, %r13
1214 adcx %rbx, %r11
1215
1216 mulx 48($ap), %rbx, %r14
1217 adcx %rax, %r12
1218
1219 mulx 56($ap), %rax, %r15
1220 adcx %rbx, %r13
1221 adcx %rax, %r14
1222 .byte 0x67
1223 mov %r8, %rbx
1224 adcx %rdi, %r15 # %rdi is 0
1225
1226 mov \$-7, %rcx
1227 jmp .Loop_mulx_gather
1228
1229 .align 32
1230 .Loop_mulx_gather:
1231 movdqa 16*0(%rbp),%xmm8
1232 movdqa 16*1(%rbp),%xmm9
1233 movdqa 16*2(%rbp),%xmm10
1234 movdqa 16*3(%rbp),%xmm11
1235 pand %xmm0,%xmm8
1236 movdqa 16*4(%rbp),%xmm12
1237 pand %xmm1,%xmm9
1238 movdqa 16*5(%rbp),%xmm13
1239 pand %xmm2,%xmm10
1240 movdqa 16*6(%rbp),%xmm14
1241 pand %xmm3,%xmm11
1242 movdqa 16*7(%rbp),%xmm15
1243 leaq 128(%rbp), %rbp
1244 pand %xmm4,%xmm12
1245 pand %xmm5,%xmm13
1246 pand %xmm6,%xmm14
1247 pand %xmm7,%xmm15
1248 por %xmm10,%xmm8
1249 por %xmm11,%xmm9
1250 por %xmm12,%xmm8
1251 por %xmm13,%xmm9
1252 por %xmm14,%xmm8
1253 por %xmm15,%xmm9
1254
1255 por %xmm9,%xmm8
1256 pshufd \$0x4e,%xmm8,%xmm9
1257 por %xmm9,%xmm8
1258 movq %xmm8,%rdx
1259
1260 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1261 adcx %rax, %rbx
1262 adox %r9, %r8
1263
1264 mulx 8($ap), %rax, %r9
1265 adcx %rax, %r8
1266 adox %r10, %r9
1267
1268 mulx 16($ap), %rax, %r10
1269 adcx %rax, %r9
1270 adox %r11, %r10
1271
1272 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1273 adcx %rax, %r10
1274 adox %r12, %r11
1275
1276 mulx 32($ap), %rax, %r12
1277 adcx %rax, %r11
1278 adox %r13, %r12
1279
1280 mulx 40($ap), %rax, %r13
1281 adcx %rax, %r12
1282 adox %r14, %r13
1283
1284 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1285 adcx %rax, %r13
1286 .byte 0x67
1287 adox %r15, %r14
1288
1289 mulx 56($ap), %rax, %r15
1290 mov %rbx, 64(%rsp,%rcx,8)
1291 adcx %rax, %r14
1292 adox %rdi, %r15
1293 mov %r8, %rbx
1294 adcx %rdi, %r15 # cf=0
1295
1296 inc %rcx # of=0
1297 jnz .Loop_mulx_gather
1298
1299 mov %r8, 64(%rsp)
1300 mov %r9, 64+8(%rsp)
1301 mov %r10, 64+16(%rsp)
1302 mov %r11, 64+24(%rsp)
1303 mov %r12, 64+32(%rsp)
1304 mov %r13, 64+40(%rsp)
1305 mov %r14, 64+48(%rsp)
1306 mov %r15, 64+56(%rsp)
1307
1308 mov 128(%rsp), %rdx # pull arguments
1309 mov 128+8(%rsp), $out
1310 mov 128+16(%rsp), %rbp
1311
1312 mov (%rsp), %r8
1313 mov 8(%rsp), %r9
1314 mov 16(%rsp), %r10
1315 mov 24(%rsp), %r11
1316 mov 32(%rsp), %r12
1317 mov 40(%rsp), %r13
1318 mov 48(%rsp), %r14
1319 mov 56(%rsp), %r15
1320
1321 call __rsaz_512_reducex
1322
1323 .Lmul_gather_tail:
1324 ___
1325 $code.=<<___;
1326 addq 64(%rsp), %r8
1327 adcq 72(%rsp), %r9
1328 adcq 80(%rsp), %r10
1329 adcq 88(%rsp), %r11
1330 adcq 96(%rsp), %r12
1331 adcq 104(%rsp), %r13
1332 adcq 112(%rsp), %r14
1333 adcq 120(%rsp), %r15
1334 sbbq %rcx, %rcx
1335
1336 call __rsaz_512_subtract
1337
1338 leaq 128+24+48(%rsp), %rax
1339 ___
1340 $code.=<<___ if ($win64);
1341 movaps 0xa0-0xc8(%rax),%xmm6
1342 movaps 0xb0-0xc8(%rax),%xmm7
1343 movaps 0xc0-0xc8(%rax),%xmm8
1344 movaps 0xd0-0xc8(%rax),%xmm9
1345 movaps 0xe0-0xc8(%rax),%xmm10
1346 movaps 0xf0-0xc8(%rax),%xmm11
1347 movaps 0x100-0xc8(%rax),%xmm12
1348 movaps 0x110-0xc8(%rax),%xmm13
1349 movaps 0x120-0xc8(%rax),%xmm14
1350 movaps 0x130-0xc8(%rax),%xmm15
1351 lea 0xb0(%rax),%rax
1352 ___
1353 $code.=<<___;
1354 .cfi_def_cfa %rax,8
1355 movq -48(%rax), %r15
1356 .cfi_restore %r15
1357 movq -40(%rax), %r14
1358 .cfi_restore %r14
1359 movq -32(%rax), %r13
1360 .cfi_restore %r13
1361 movq -24(%rax), %r12
1362 .cfi_restore %r12
1363 movq -16(%rax), %rbp
1364 .cfi_restore %rbp
1365 movq -8(%rax), %rbx
1366 .cfi_restore %rbx
1367 leaq (%rax), %rsp
1368 .cfi_def_cfa_register %rsp
1369 .Lmul_gather4_epilogue:
1370 ret
1371 .cfi_endproc
1372 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1373 ___
1374 }
1375 {
1376 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1377 $code.=<<___;
1378 .globl rsaz_512_mul_scatter4
1379 .type rsaz_512_mul_scatter4,\@function,6
1380 .align 32
1381 rsaz_512_mul_scatter4:
1382 .cfi_startproc
1383 push %rbx
1384 .cfi_push %rbx
1385 push %rbp
1386 .cfi_push %rbp
1387 push %r12
1388 .cfi_push %r12
1389 push %r13
1390 .cfi_push %r13
1391 push %r14
1392 .cfi_push %r14
1393 push %r15
1394 .cfi_push %r15
1395
1396 mov $pwr, $pwr
1397 subq \$128+24, %rsp
1398 .cfi_adjust_cfa_offset 128+24
1399 .Lmul_scatter4_body:
1400 leaq ($tbl,$pwr,8), $tbl
1401 movq $out, %xmm0 # off-load arguments
1402 movq $mod, %xmm1
1403 movq $tbl, %xmm2
1404 movq $n0, 128(%rsp)
1405
1406 movq $out, %rbp
1407 ___
1408 $code.=<<___ if ($addx);
1409 movl \$0x80100,%r11d
1410 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1411 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
1412 je .Lmulx_scatter
1413 ___
1414 $code.=<<___;
1415 movq ($out),%rbx # pass b[0]
1416 call __rsaz_512_mul
1417
1418 movq %xmm0, $out
1419 movq %xmm1, %rbp
1420
1421 movq (%rsp), %r8
1422 movq 8(%rsp), %r9
1423 movq 16(%rsp), %r10
1424 movq 24(%rsp), %r11
1425 movq 32(%rsp), %r12
1426 movq 40(%rsp), %r13
1427 movq 48(%rsp), %r14
1428 movq 56(%rsp), %r15
1429
1430 call __rsaz_512_reduce
1431 ___
1432 $code.=<<___ if ($addx);
1433 jmp .Lmul_scatter_tail
1434
1435 .align 32
1436 .Lmulx_scatter:
1437 movq ($out), %rdx # pass b[0]
1438 call __rsaz_512_mulx
1439
1440 movq %xmm0, $out
1441 movq %xmm1, %rbp
1442
1443 movq 128(%rsp), %rdx # pull $n0
1444 movq (%rsp), %r8
1445 movq 8(%rsp), %r9
1446 movq 16(%rsp), %r10
1447 movq 24(%rsp), %r11
1448 movq 32(%rsp), %r12
1449 movq 40(%rsp), %r13
1450 movq 48(%rsp), %r14
1451 movq 56(%rsp), %r15
1452
1453 call __rsaz_512_reducex
1454
1455 .Lmul_scatter_tail:
1456 ___
1457 $code.=<<___;
1458 addq 64(%rsp), %r8
1459 adcq 72(%rsp), %r9
1460 adcq 80(%rsp), %r10
1461 adcq 88(%rsp), %r11
1462 adcq 96(%rsp), %r12
1463 adcq 104(%rsp), %r13
1464 adcq 112(%rsp), %r14
1465 adcq 120(%rsp), %r15
1466 movq %xmm2, $inp
1467 sbbq %rcx, %rcx
1468
1469 call __rsaz_512_subtract
1470
1471 movq %r8, 128*0($inp) # scatter
1472 movq %r9, 128*1($inp)
1473 movq %r10, 128*2($inp)
1474 movq %r11, 128*3($inp)
1475 movq %r12, 128*4($inp)
1476 movq %r13, 128*5($inp)
1477 movq %r14, 128*6($inp)
1478 movq %r15, 128*7($inp)
1479
1480 leaq 128+24+48(%rsp), %rax
1481 .cfi_def_cfa %rax,8
1482 movq -48(%rax), %r15
1483 .cfi_restore %r15
1484 movq -40(%rax), %r14
1485 .cfi_restore %r14
1486 movq -32(%rax), %r13
1487 .cfi_restore %r13
1488 movq -24(%rax), %r12
1489 .cfi_restore %r12
1490 movq -16(%rax), %rbp
1491 .cfi_restore %rbp
1492 movq -8(%rax), %rbx
1493 .cfi_restore %rbx
1494 leaq (%rax), %rsp
1495 .cfi_def_cfa_register %rsp
1496 .Lmul_scatter4_epilogue:
1497 ret
1498 .cfi_endproc
1499 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1500 ___
1501 }
1502 {
1503 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
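# Strips the Montgomery factor from an 8-limb value: the input is loaded
# into %r8-%r15, the stack scratch area is cleared, and a single reduction
# pass leaves inp * R^-1 mod m (R = 2^512) in the registers, which are then
# stored to $out. The C code uses this to convert the final exponentiation
# result back out of the Montgomery domain.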
1504 $code.=<<___;
1505 .globl rsaz_512_mul_by_one
1506 .type rsaz_512_mul_by_one,\@function,4
1507 .align 32
1508 rsaz_512_mul_by_one:
1509 .cfi_startproc
1510 push %rbx
1511 .cfi_push %rbx
1512 push %rbp
1513 .cfi_push %rbp
1514 push %r12
1515 .cfi_push %r12
1516 push %r13
1517 .cfi_push %r13
1518 push %r14
1519 .cfi_push %r14
1520 push %r15
1521 .cfi_push %r15
1522
1523 subq \$128+24, %rsp
1524 .cfi_adjust_cfa_offset 128+24
1525 .Lmul_by_one_body:
1526 ___
1527 $code.=<<___ if ($addx);
1528 movl OPENSSL_ia32cap_P+8(%rip),%eax
1529 ___
1530 $code.=<<___;
1531 movq $mod, %rbp # reassign argument
1532 movq $n0, 128(%rsp)
1533
1534 movq ($inp), %r8
1535 pxor %xmm0, %xmm0
1536 movq 8($inp), %r9
1537 movq 16($inp), %r10
1538 movq 24($inp), %r11
1539 movq 32($inp), %r12
1540 movq 40($inp), %r13
1541 movq 48($inp), %r14
1542 movq 56($inp), %r15
1543
1544 movdqa %xmm0, (%rsp)
1545 movdqa %xmm0, 16(%rsp)
1546 movdqa %xmm0, 32(%rsp)
1547 movdqa %xmm0, 48(%rsp)
1548 movdqa %xmm0, 64(%rsp)
1549 movdqa %xmm0, 80(%rsp)
1550 movdqa %xmm0, 96(%rsp)
1551 ___
1552 $code.=<<___ if ($addx);
1553 andl \$0x80100,%eax
1554 cmpl \$0x80100,%eax # check for MULX and ADCX/ADOX
1555 je .Lby_one_callx
1556 ___
1557 $code.=<<___;
1558 call __rsaz_512_reduce
1559 ___
1560 $code.=<<___ if ($addx);
1561 jmp .Lby_one_tail
1562 .align 32
1563 .Lby_one_callx:
1564 movq 128(%rsp), %rdx # pull $n0
1565 call __rsaz_512_reducex
1566 .Lby_one_tail:
1567 ___
1568 $code.=<<___;
1569 movq %r8, ($out)
1570 movq %r9, 8($out)
1571 movq %r10, 16($out)
1572 movq %r11, 24($out)
1573 movq %r12, 32($out)
1574 movq %r13, 40($out)
1575 movq %r14, 48($out)
1576 movq %r15, 56($out)
1577
1578 leaq 128+24+48(%rsp), %rax
1579 .cfi_def_cfa %rax,8
1580 movq -48(%rax), %r15
1581 .cfi_restore %r15
1582 movq -40(%rax), %r14
1583 .cfi_restore %r14
1584 movq -32(%rax), %r13
1585 .cfi_restore %r13
1586 movq -24(%rax), %r12
1587 .cfi_restore %r12
1588 movq -16(%rax), %rbp
1589 .cfi_restore %rbp
1590 movq -8(%rax), %rbx
1591 .cfi_restore %rbx
1592 leaq (%rax), %rsp
1593 .cfi_def_cfa_register %rsp
1594 .Lmul_by_one_epilogue:
1595 ret
1596 .cfi_endproc
1597 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1598 ___
1599 }
1600 { # __rsaz_512_reduce
1601 #
1602 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1603 # output: %r8-%r15
1604 # clobbers: everything except %rbp and %rdi
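#
# Note that every caller stores n0 at its own 128(%rsp); the
# imulq 128+8(%rsp) below reads that same slot, the extra 8 accounting for
# the return address pushed by "call". A hypothetical C reference of the
# computation (illustrative only, not part of the generated code; uses the
# GCC/Clang unsigned __int128 extension):
#
#   /* t[0..15]: 1024-bit product, mod[0..7]: modulus, n0 = -mod[0]^-1 mod 2^64.
#    * The assembly keeps t[0..7] in %r8-%r15 and lets the caller add the
#    * upper limbs back afterwards; the end result is the same.             */
#   static uint64_t ref_reduce_512(uint64_t t[16], const uint64_t mod[8],
#                                  uint64_t n0)
#   {
#       uint64_t top = 0;                        /* carry out of limb 15      */
#       for (int i = 0; i < 8; i++) {
#           uint64_t m = t[i] * n0, c = 0;
#           for (int j = 0; j < 8; j++) {        /* t += (m*mod) << (64*i)    */
#               unsigned __int128 s = (unsigned __int128)m * mod[j] + t[i+j] + c;
#               t[i + j] = (uint64_t)s;          /* t[i] becomes zero         */
#               c = (uint64_t)(s >> 64);
#           }
#           for (int j = i + 8; c != 0; j++) {   /* propagate into upper half */
#               if (j == 16) { top += c; break; }
#               unsigned __int128 s = (unsigned __int128)t[j] + c;
#               t[j] = (uint64_t)s;
#               c = (uint64_t)(s >> 64);
#           }
#       }
#       return top;  /* callers subtract mod from t[8..15] when this is set  */
#   }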
1605 $code.=<<___;
1606 .type __rsaz_512_reduce,\@abi-omnipotent
1607 .align 32
1608 __rsaz_512_reduce:
1609 movq %r8, %rbx
1610 imulq 128+8(%rsp), %rbx
1611 movq 0(%rbp), %rax
1612 movl \$8, %ecx
1613 jmp .Lreduction_loop
1614
1615 .align 32
1616 .Lreduction_loop:
1617 mulq %rbx
1618 movq 8(%rbp), %rax
1619 negq %r8
1620 movq %rdx, %r8
1621 adcq \$0, %r8
1622
1623 mulq %rbx
1624 addq %rax, %r9
1625 movq 16(%rbp), %rax
1626 adcq \$0, %rdx
1627 addq %r9, %r8
1628 movq %rdx, %r9
1629 adcq \$0, %r9
1630
1631 mulq %rbx
1632 addq %rax, %r10
1633 movq 24(%rbp), %rax
1634 adcq \$0, %rdx
1635 addq %r10, %r9
1636 movq %rdx, %r10
1637 adcq \$0, %r10
1638
1639 mulq %rbx
1640 addq %rax, %r11
1641 movq 32(%rbp), %rax
1642 adcq \$0, %rdx
1643 addq %r11, %r10
1644 movq 128+8(%rsp), %rsi
1645 #movq %rdx, %r11
1646 #adcq \$0, %r11
1647 adcq \$0, %rdx
1648 movq %rdx, %r11
1649
1650 mulq %rbx
1651 addq %rax, %r12
1652 movq 40(%rbp), %rax
1653 adcq \$0, %rdx
1654 imulq %r8, %rsi
1655 addq %r12, %r11
1656 movq %rdx, %r12
1657 adcq \$0, %r12
1658
1659 mulq %rbx
1660 addq %rax, %r13
1661 movq 48(%rbp), %rax
1662 adcq \$0, %rdx
1663 addq %r13, %r12
1664 movq %rdx, %r13
1665 adcq \$0, %r13
1666
1667 mulq %rbx
1668 addq %rax, %r14
1669 movq 56(%rbp), %rax
1670 adcq \$0, %rdx
1671 addq %r14, %r13
1672 movq %rdx, %r14
1673 adcq \$0, %r14
1674
1675 mulq %rbx
1676 movq %rsi, %rbx
1677 addq %rax, %r15
1678 movq 0(%rbp), %rax
1679 adcq \$0, %rdx
1680 addq %r15, %r14
1681 movq %rdx, %r15
1682 adcq \$0, %r15
1683
1684 decl %ecx
1685 jne .Lreduction_loop
1686
1687 ret
1688 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1689 ___
1690 }
1691 if ($addx) {
1692 # __rsaz_512_reducex
1693 #
1694 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1695 # output: %r8-%r15
1696 # clobbers: everything except %rbp and %rdi
1697 $code.=<<___;
1698 .type __rsaz_512_reducex,\@abi-omnipotent
1699 .align 32
1700 __rsaz_512_reducex:
1701 #movq 128+8(%rsp), %rdx # pull $n0
1702 imulq %r8, %rdx
1703 xorq %rsi, %rsi # cf=0,of=0
1704 movl \$8, %ecx
1705 jmp .Lreduction_loopx
1706
1707 .align 32
1708 .Lreduction_loopx:
1709 mov %r8, %rbx
1710 mulx 0(%rbp), %rax, %r8
1711 adcx %rbx, %rax
1712 adox %r9, %r8
1713
1714 mulx 8(%rbp), %rax, %r9
1715 adcx %rax, %r8
1716 adox %r10, %r9
1717
1718 mulx 16(%rbp), %rbx, %r10
1719 adcx %rbx, %r9
1720 adox %r11, %r10
1721
1722 mulx 24(%rbp), %rbx, %r11
1723 adcx %rbx, %r10
1724 adox %r12, %r11
1725
1726 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1727 mov %rdx, %rax
1728 mov %r8, %rdx
1729 adcx %rbx, %r11
1730 adox %r13, %r12
1731
1732 mulx 128+8(%rsp), %rbx, %rdx
1733 mov %rax, %rdx
1734
1735 mulx 40(%rbp), %rax, %r13
1736 adcx %rax, %r12
1737 adox %r14, %r13
1738
1739 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1740 adcx %rax, %r13
1741 adox %r15, %r14
1742
1743 mulx 56(%rbp), %rax, %r15
1744 mov %rbx, %rdx
1745 adcx %rax, %r14
1746 adox %rsi, %r15 # %rsi is 0
1747 adcx %rsi, %r15 # cf=0
1748
1749 decl %ecx # of=0
1750 jne .Lreduction_loopx
1751
1752 ret
1753 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1754 ___
1755 }
1756 { # __rsaz_512_subtract
1757 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1758 # output:
1759 # clobbers: everything but %rdi, %rsi and %rbp
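#
# The mask in %rcx is either 0 or all-ones (it is produced by the callers'
# sbbq %rcx,%rcx). The routine stores %r8-%r15 to $out and then adds
# (-mod) & mask, i.e. it subtracts the modulus exactly when the mask is
# all-ones, without branching. A hypothetical C equivalent (assumes an odd
# modulus, as in RSA, so -mod[0] absorbs the +1 of the two's complement):
#
#   static void ref_cond_sub_512(uint64_t out[8], const uint64_t t[8],
#                                const uint64_t mod[8], uint64_t mask)
#   {
#       uint64_t c = 0;
#       for (int i = 0; i < 8; i++)
#           out[i] = t[i];
#       for (int i = 0; i < 8; i++) {
#           uint64_t w = (i == 0 ? (uint64_t)0 - mod[0] : ~mod[i]) & mask;
#           unsigned __int128 s = (unsigned __int128)out[i] + w + c;
#           out[i] = (uint64_t)s;
#           c = (uint64_t)(s >> 64);
#       }
#   }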
1760 $code.=<<___;
1761 .type __rsaz_512_subtract,\@abi-omnipotent
1762 .align 32
1763 __rsaz_512_subtract:
1764 movq %r8, ($out)
1765 movq %r9, 8($out)
1766 movq %r10, 16($out)
1767 movq %r11, 24($out)
1768 movq %r12, 32($out)
1769 movq %r13, 40($out)
1770 movq %r14, 48($out)
1771 movq %r15, 56($out)
1772
1773 movq 0($mod), %r8
1774 movq 8($mod), %r9
1775 negq %r8
1776 notq %r9
1777 andq %rcx, %r8
1778 movq 16($mod), %r10
1779 andq %rcx, %r9
1780 notq %r10
1781 movq 24($mod), %r11
1782 andq %rcx, %r10
1783 notq %r11
1784 movq 32($mod), %r12
1785 andq %rcx, %r11
1786 notq %r12
1787 movq 40($mod), %r13
1788 andq %rcx, %r12
1789 notq %r13
1790 movq 48($mod), %r14
1791 andq %rcx, %r13
1792 notq %r14
1793 movq 56($mod), %r15
1794 andq %rcx, %r14
1795 notq %r15
1796 andq %rcx, %r15
1797
1798 addq ($out), %r8
1799 adcq 8($out), %r9
1800 adcq 16($out), %r10
1801 adcq 24($out), %r11
1802 adcq 32($out), %r12
1803 adcq 40($out), %r13
1804 adcq 48($out), %r14
1805 adcq 56($out), %r15
1806
1807 movq %r8, ($out)
1808 movq %r9, 8($out)
1809 movq %r10, 16($out)
1810 movq %r11, 24($out)
1811 movq %r12, 32($out)
1812 movq %r13, 40($out)
1813 movq %r14, 48($out)
1814 movq %r15, 56($out)
1815
1816 ret
1817 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1818 ___
1819 }
1820 { # __rsaz_512_mul
1821 #
1822 # input: %rsi - ap, %rbp - bp
1823 # output:
1824 # clobbers: everything
1825 my ($ap,$bp) = ("%rsi","%rbp");
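# Plain 8x8-limb schoolbook multiplication (%rbx holds b[0] on entry, %rbp
# points at b); the 16-limb product goes to the caller's (%rsp)..120(%rsp)
# scratch area -- the leaq 8(%rsp),%rdi compensates for the return address
# pushed by "call". A hypothetical C reference of the same product
# (illustrative only, GCC/Clang unsigned __int128):
#
#   static void ref_mul_512(uint64_t r[16], const uint64_t a[8],
#                           const uint64_t b[8])
#   {
#       for (int k = 0; k < 16; k++) r[k] = 0;
#       for (int i = 0; i < 8; i++) {
#           uint64_t c = 0;
#           for (int j = 0; j < 8; j++) {
#               unsigned __int128 s = (unsigned __int128)a[j] * b[i] + r[i+j] + c;
#               r[i + j] = (uint64_t)s;
#               c = (uint64_t)(s >> 64);
#           }
#           r[i + 8] = c;
#       }
#   }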
1826 $code.=<<___;
1827 .type __rsaz_512_mul,\@abi-omnipotent
1828 .align 32
1829 __rsaz_512_mul:
1830 leaq 8(%rsp), %rdi
1831
1832 movq ($ap), %rax
1833 mulq %rbx
1834 movq %rax, (%rdi)
1835 movq 8($ap), %rax
1836 movq %rdx, %r8
1837
1838 mulq %rbx
1839 addq %rax, %r8
1840 movq 16($ap), %rax
1841 movq %rdx, %r9
1842 adcq \$0, %r9
1843
1844 mulq %rbx
1845 addq %rax, %r9
1846 movq 24($ap), %rax
1847 movq %rdx, %r10
1848 adcq \$0, %r10
1849
1850 mulq %rbx
1851 addq %rax, %r10
1852 movq 32($ap), %rax
1853 movq %rdx, %r11
1854 adcq \$0, %r11
1855
1856 mulq %rbx
1857 addq %rax, %r11
1858 movq 40($ap), %rax
1859 movq %rdx, %r12
1860 adcq \$0, %r12
1861
1862 mulq %rbx
1863 addq %rax, %r12
1864 movq 48($ap), %rax
1865 movq %rdx, %r13
1866 adcq \$0, %r13
1867
1868 mulq %rbx
1869 addq %rax, %r13
1870 movq 56($ap), %rax
1871 movq %rdx, %r14
1872 adcq \$0, %r14
1873
1874 mulq %rbx
1875 addq %rax, %r14
1876 movq ($ap), %rax
1877 movq %rdx, %r15
1878 adcq \$0, %r15
1879
1880 leaq 8($bp), $bp
1881 leaq 8(%rdi), %rdi
1882
1883 movl \$7, %ecx
1884 jmp .Loop_mul
1885
1886 .align 32
1887 .Loop_mul:
1888 movq ($bp), %rbx
1889 mulq %rbx
1890 addq %rax, %r8
1891 movq 8($ap), %rax
1892 movq %r8, (%rdi)
1893 movq %rdx, %r8
1894 adcq \$0, %r8
1895
1896 mulq %rbx
1897 addq %rax, %r9
1898 movq 16($ap), %rax
1899 adcq \$0, %rdx
1900 addq %r9, %r8
1901 movq %rdx, %r9
1902 adcq \$0, %r9
1903
1904 mulq %rbx
1905 addq %rax, %r10
1906 movq 24($ap), %rax
1907 adcq \$0, %rdx
1908 addq %r10, %r9
1909 movq %rdx, %r10
1910 adcq \$0, %r10
1911
1912 mulq %rbx
1913 addq %rax, %r11
1914 movq 32($ap), %rax
1915 adcq \$0, %rdx
1916 addq %r11, %r10
1917 movq %rdx, %r11
1918 adcq \$0, %r11
1919
1920 mulq %rbx
1921 addq %rax, %r12
1922 movq 40($ap), %rax
1923 adcq \$0, %rdx
1924 addq %r12, %r11
1925 movq %rdx, %r12
1926 adcq \$0, %r12
1927
1928 mulq %rbx
1929 addq %rax, %r13
1930 movq 48($ap), %rax
1931 adcq \$0, %rdx
1932 addq %r13, %r12
1933 movq %rdx, %r13
1934 adcq \$0, %r13
1935
1936 mulq %rbx
1937 addq %rax, %r14
1938 movq 56($ap), %rax
1939 adcq \$0, %rdx
1940 addq %r14, %r13
1941 movq %rdx, %r14
1942 leaq 8($bp), $bp
1943 adcq \$0, %r14
1944
1945 mulq %rbx
1946 addq %rax, %r15
1947 movq ($ap), %rax
1948 adcq \$0, %rdx
1949 addq %r15, %r14
1950 movq %rdx, %r15
1951 adcq \$0, %r15
1952
1953 leaq 8(%rdi), %rdi
1954
1955 decl %ecx
1956 jnz .Loop_mul
1957
1958 movq %r8, (%rdi)
1959 movq %r9, 8(%rdi)
1960 movq %r10, 16(%rdi)
1961 movq %r11, 24(%rdi)
1962 movq %r12, 32(%rdi)
1963 movq %r13, 40(%rdi)
1964 movq %r14, 48(%rdi)
1965 movq %r15, 56(%rdi)
1966
1967 ret
1968 .size __rsaz_512_mul,.-__rsaz_512_mul
1969 ___
1970 }
1971 if ($addx) {
1972 # __rsaz_512_mulx
1973 #
1974 # input: %rsi - ap, %rbp - bp
1975 # output:
1976 # clobbers: everything
1977 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
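# Same schoolbook product as __rsaz_512_mul, but driven by MULX (which does
# not touch the flags) with two independent carry chains: ADCX propagates
# through CF and ADOX through OF, so the "add low halves" and "add high
# halves" chains can be interleaved without spilling carries into extra
# registers. The output layout matches __rsaz_512_mul: 16 limbs at the
# caller's (%rsp)..120(%rsp).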
1978 $code.=<<___;
1979 .type __rsaz_512_mulx,\@abi-omnipotent
1980 .align 32
1981 __rsaz_512_mulx:
1982 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1983 mov \$-6, %rcx
1984
1985 mulx 8($ap), %rax, %r9
1986 movq %rbx, 8(%rsp)
1987
1988 mulx 16($ap), %rbx, %r10
1989 adc %rax, %r8
1990
1991 mulx 24($ap), %rax, %r11
1992 adc %rbx, %r9
1993
1994 mulx 32($ap), %rbx, %r12
1995 adc %rax, %r10
1996
1997 mulx 40($ap), %rax, %r13
1998 adc %rbx, %r11
1999
2000 mulx 48($ap), %rbx, %r14
2001 adc %rax, %r12
2002
2003 mulx 56($ap), %rax, %r15
2004 mov 8($bp), %rdx
2005 adc %rbx, %r13
2006 adc %rax, %r14
2007 adc \$0, %r15
2008
2009 xor $zero, $zero # cf=0,of=0
2010 jmp .Loop_mulx
2011
2012 .align 32
2013 .Loop_mulx:
2014 movq %r8, %rbx
2015 mulx ($ap), %rax, %r8
2016 adcx %rax, %rbx
2017 adox %r9, %r8
2018
2019 mulx 8($ap), %rax, %r9
2020 adcx %rax, %r8
2021 adox %r10, %r9
2022
2023 mulx 16($ap), %rax, %r10
2024 adcx %rax, %r9
2025 adox %r11, %r10
2026
2027 mulx 24($ap), %rax, %r11
2028 adcx %rax, %r10
2029 adox %r12, %r11
2030
2031 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2032 adcx %rax, %r11
2033 adox %r13, %r12
2034
2035 mulx 40($ap), %rax, %r13
2036 adcx %rax, %r12
2037 adox %r14, %r13
2038
2039 mulx 48($ap), %rax, %r14
2040 adcx %rax, %r13
2041 adox %r15, %r14
2042
2043 mulx 56($ap), %rax, %r15
2044 movq 64($bp,%rcx,8), %rdx
2045 movq %rbx, 8+64-8(%rsp,%rcx,8)
2046 adcx %rax, %r14
2047 adox $zero, %r15
2048 adcx $zero, %r15 # cf=0
2049
2050 inc %rcx # of=0
2051 jnz .Loop_mulx
2052
2053 movq %r8, %rbx
2054 mulx ($ap), %rax, %r8
2055 adcx %rax, %rbx
2056 adox %r9, %r8
2057
2058 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2059 adcx %rax, %r8
2060 adox %r10, %r9
2061
2062 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2063 adcx %rax, %r9
2064 adox %r11, %r10
2065
2066 mulx 24($ap), %rax, %r11
2067 adcx %rax, %r10
2068 adox %r12, %r11
2069
2070 mulx 32($ap), %rax, %r12
2071 adcx %rax, %r11
2072 adox %r13, %r12
2073
2074 mulx 40($ap), %rax, %r13
2075 adcx %rax, %r12
2076 adox %r14, %r13
2077
2078 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2079 adcx %rax, %r13
2080 adox %r15, %r14
2081
2082 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2083 adcx %rax, %r14
2084 adox $zero, %r15
2085 adcx $zero, %r15
2086
2087 mov %rbx, 8+64-8(%rsp)
2088 mov %r8, 8+64(%rsp)
2089 mov %r9, 8+64+8(%rsp)
2090 mov %r10, 8+64+16(%rsp)
2091 mov %r11, 8+64+24(%rsp)
2092 mov %r12, 8+64+32(%rsp)
2093 mov %r13, 8+64+40(%rsp)
2094 mov %r14, 8+64+48(%rsp)
2095 mov %r15, 8+64+56(%rsp)
2096
2097 ret
2098 .size __rsaz_512_mulx,.-__rsaz_512_mulx
2099 ___
2100 }
2101 {
2102 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2103 $code.=<<___;
2104 .globl rsaz_512_scatter4
2105 .type rsaz_512_scatter4,\@abi-omnipotent
2106 .align 16
2107 rsaz_512_scatter4:
2108 leaq ($out,$power,8), $out
2109 movl \$8, %r9d
2110 jmp .Loop_scatter
2111 .align 16
2112 .Loop_scatter:
2113 movq ($inp), %rax
2114 leaq 8($inp), $inp
2115 movq %rax, ($out)
2116 leaq 128($out), $out
2117 decl %r9d
2118 jnz .Loop_scatter
2119 ret
2120 .size rsaz_512_scatter4,.-rsaz_512_scatter4
2121
2122 .globl rsaz_512_gather4
2123 .type rsaz_512_gather4,\@abi-omnipotent
2124 .align 16
2125 rsaz_512_gather4:
2126 ___
2127 $code.=<<___ if ($win64);
2128 .LSEH_begin_rsaz_512_gather4:
2129 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2130 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2131 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2132 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2133 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2134 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2135 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2136 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2137 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2138 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2139 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2140 ___
2141 $code.=<<___;
2142 movd $power,%xmm8
2143 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2144 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2145
2146 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2147 movdqa %xmm1,%xmm7
2148 movdqa %xmm1,%xmm2
2149 ___
2150 ########################################################################
2151 # calculate mask by comparing 0..15 to $power
2152 #
2153 for($i=0;$i<4;$i++) {
2154 $code.=<<___;
2155 paddd %xmm`$i`,%xmm`$i+1`
2156 pcmpeqd %xmm8,%xmm`$i`
2157 movdqa %xmm7,%xmm`$i+3`
2158 ___
2159 }
2160 for(;$i<7;$i++) {
2161 $code.=<<___;
2162 paddd %xmm`$i`,%xmm`$i+1`
2163 pcmpeqd %xmm8,%xmm`$i`
2164 ___
2165 }
2166 $code.=<<___;
2167 pcmpeqd %xmm8,%xmm7
2168 movl \$8, %r9d
2169 jmp .Loop_gather
2170 .align 16
2171 .Loop_gather:
2172 movdqa 16*0($inp),%xmm8
2173 movdqa 16*1($inp),%xmm9
2174 movdqa 16*2($inp),%xmm10
2175 movdqa 16*3($inp),%xmm11
2176 pand %xmm0,%xmm8
2177 movdqa 16*4($inp),%xmm12
2178 pand %xmm1,%xmm9
2179 movdqa 16*5($inp),%xmm13
2180 pand %xmm2,%xmm10
2181 movdqa 16*6($inp),%xmm14
2182 pand %xmm3,%xmm11
2183 movdqa 16*7($inp),%xmm15
2184 leaq 128($inp), $inp
2185 pand %xmm4,%xmm12
2186 pand %xmm5,%xmm13
2187 pand %xmm6,%xmm14
2188 pand %xmm7,%xmm15
2189 por %xmm10,%xmm8
2190 por %xmm11,%xmm9
2191 por %xmm12,%xmm8
2192 por %xmm13,%xmm9
2193 por %xmm14,%xmm8
2194 por %xmm15,%xmm9
2195
2196 por %xmm9,%xmm8
2197 pshufd \$0x4e,%xmm8,%xmm9
2198 por %xmm9,%xmm8
2199 movq %xmm8,($out)
2200 leaq 8($out), $out
2201 decl %r9d
2202 jnz .Loop_gather
2203 ___
2204 $code.=<<___ if ($win64);
2205 movaps 0x00(%rsp),%xmm6
2206 movaps 0x10(%rsp),%xmm7
2207 movaps 0x20(%rsp),%xmm8
2208 movaps 0x30(%rsp),%xmm9
2209 movaps 0x40(%rsp),%xmm10
2210 movaps 0x50(%rsp),%xmm11
2211 movaps 0x60(%rsp),%xmm12
2212 movaps 0x70(%rsp),%xmm13
2213 movaps 0x80(%rsp),%xmm14
2214 movaps 0x90(%rsp),%xmm15
2215 add \$0xa8,%rsp
2216 ___
2217 $code.=<<___;
2218 ret
2219 .LSEH_end_rsaz_512_gather4:
2220 .size rsaz_512_gather4,.-rsaz_512_gather4
2221
2222 .align 64
2223 .Linc:
2224 .long 0,0, 1,1
2225 .long 2,2, 2,2
2226 ___
2227 }
2228
2229 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2230 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2231 if ($win64) {
2232 $rec="%rcx";
2233 $frame="%rdx";
2234 $context="%r8";
2235 $disp="%r9";
2236
2237 $code.=<<___;
2238 .extern __imp_RtlVirtualUnwind
2239 .type se_handler,\@abi-omnipotent
2240 .align 16
2241 se_handler:
2242 push %rsi
2243 push %rdi
2244 push %rbx
2245 push %rbp
2246 push %r12
2247 push %r13
2248 push %r14
2249 push %r15
2250 pushfq
2251 sub \$64,%rsp
2252
2253 mov 120($context),%rax # pull context->Rax
2254 mov 248($context),%rbx # pull context->Rip
2255
2256 mov 8($disp),%rsi # disp->ImageBase
2257 mov 56($disp),%r11 # disp->HandlerData
2258
2259 mov 0(%r11),%r10d # HandlerData[0]
2260 lea (%rsi,%r10),%r10 # end of prologue label
2261 cmp %r10,%rbx # context->Rip<end of prologue label
2262 jb .Lcommon_seh_tail
2263
2264 mov 152($context),%rax # pull context->Rsp
2265
2266 mov 4(%r11),%r10d # HandlerData[1]
2267 lea (%rsi,%r10),%r10 # epilogue label
2268 cmp %r10,%rbx # context->Rip>=epilogue label
2269 jae .Lcommon_seh_tail
2270
2271 lea 128+24+48(%rax),%rax
2272
2273 lea .Lmul_gather4_epilogue(%rip),%rbx
2274 cmp %r10,%rbx
2275 jne .Lse_not_in_mul_gather4
2276
2277 lea 0xb0(%rax),%rax
2278
2279 lea -48-0xa8(%rax),%rsi
2280 lea 512($context),%rdi
2281 mov \$20,%ecx
2282 .long 0xa548f3fc # cld; rep movsq
2283
2284 .Lse_not_in_mul_gather4:
2285 mov -8(%rax),%rbx
2286 mov -16(%rax),%rbp
2287 mov -24(%rax),%r12
2288 mov -32(%rax),%r13
2289 mov -40(%rax),%r14
2290 mov -48(%rax),%r15
2291 mov %rbx,144($context) # restore context->Rbx
2292 mov %rbp,160($context) # restore context->Rbp
2293 mov %r12,216($context) # restore context->R12
2294 mov %r13,224($context) # restore context->R13
2295 mov %r14,232($context) # restore context->R14
2296 mov %r15,240($context) # restore context->R15
2297
2298 .Lcommon_seh_tail:
2299 mov 8(%rax),%rdi
2300 mov 16(%rax),%rsi
2301 mov %rax,152($context) # restore context->Rsp
2302 mov %rsi,168($context) # restore context->Rsi
2303 mov %rdi,176($context) # restore context->Rdi
2304
2305 mov 40($disp),%rdi # disp->ContextRecord
2306 mov $context,%rsi # context
2307 mov \$154,%ecx # sizeof(CONTEXT)
2308 .long 0xa548f3fc # cld; rep movsq
2309
2310 mov $disp,%rsi
2311 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2312 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2313 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2314 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2315 mov 40(%rsi),%r10 # disp->ContextRecord
2316 lea 56(%rsi),%r11 # &disp->HandlerData
2317 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2318 mov %r10,32(%rsp) # arg5
2319 mov %r11,40(%rsp) # arg6
2320 mov %r12,48(%rsp) # arg7
2321 mov %rcx,56(%rsp) # arg8, (NULL)
2322 call *__imp_RtlVirtualUnwind(%rip)
2323
2324 mov \$1,%eax # ExceptionContinueSearch
2325 add \$64,%rsp
2326 popfq
2327 pop %r15
2328 pop %r14
2329 pop %r13
2330 pop %r12
2331 pop %rbp
2332 pop %rbx
2333 pop %rdi
2334 pop %rsi
2335 ret
2336 .size se_handler,.-se_handler
2337
2338 .section .pdata
2339 .align 4
2340 .rva .LSEH_begin_rsaz_512_sqr
2341 .rva .LSEH_end_rsaz_512_sqr
2342 .rva .LSEH_info_rsaz_512_sqr
2343
2344 .rva .LSEH_begin_rsaz_512_mul
2345 .rva .LSEH_end_rsaz_512_mul
2346 .rva .LSEH_info_rsaz_512_mul
2347
2348 .rva .LSEH_begin_rsaz_512_mul_gather4
2349 .rva .LSEH_end_rsaz_512_mul_gather4
2350 .rva .LSEH_info_rsaz_512_mul_gather4
2351
2352 .rva .LSEH_begin_rsaz_512_mul_scatter4
2353 .rva .LSEH_end_rsaz_512_mul_scatter4
2354 .rva .LSEH_info_rsaz_512_mul_scatter4
2355
2356 .rva .LSEH_begin_rsaz_512_mul_by_one
2357 .rva .LSEH_end_rsaz_512_mul_by_one
2358 .rva .LSEH_info_rsaz_512_mul_by_one
2359
2360 .rva .LSEH_begin_rsaz_512_gather4
2361 .rva .LSEH_end_rsaz_512_gather4
2362 .rva .LSEH_info_rsaz_512_gather4
2363
2364 .section .xdata
2365 .align 8
2366 .LSEH_info_rsaz_512_sqr:
2367 .byte 9,0,0,0
2368 .rva se_handler
2369 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2370 .LSEH_info_rsaz_512_mul:
2371 .byte 9,0,0,0
2372 .rva se_handler
2373 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2374 .LSEH_info_rsaz_512_mul_gather4:
2375 .byte 9,0,0,0
2376 .rva se_handler
2377 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2378 .LSEH_info_rsaz_512_mul_scatter4:
2379 .byte 9,0,0,0
2380 .rva se_handler
2381 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2382 .LSEH_info_rsaz_512_mul_by_one:
2383 .byte 9,0,0,0
2384 .rva se_handler
2385 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2386 .LSEH_info_rsaz_512_gather4:
2387 .byte 0x01,0x46,0x16,0x00
2388 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2389 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2390 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2391 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2392 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2393 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2394 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2395 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2396 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2397 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2398 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
2399 ___
2400 }
2401
2402 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2403 print $code;
2404 close STDOUT;