1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4 #
5 # Licensed under the OpenSSL license (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
9 #
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
13 #
14 # References:
15 # [1] S. Gueron, "Efficient Software Implementations of Modular
16 # Exponentiation", http://eprint.iacr.org/2011/239
17 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18 # IEEE Proceedings of 9th International Conference on Information
19 # Technology: New Generations (ITNG 2012), 821-823 (2012).
20 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
21 # Journal of Cryptographic Engineering 2:31-43 (2012).
22 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23 # resistant 512-bit and 1024-bit modular exponentiation for optimizing
24 # RSA1024 and RSA2048 on x86_64 platforms",
25 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
26 #
27 # While the original submission covers 512- and 1024-bit exponentiation,
28 # this module is limited to the 512-bit version only (and as such
29 # accelerates RSA1024 sign). This is because the improvement for longer
30 # keys is not high enough to justify the effort; the highest measured
31 # gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
32 # at the moment of this writing!] Nor does this module implement a
33 # "monolithic" complete-exponentiation jumbo-subroutine; instead it adheres
34 # to a more modular mixture of C and assembly. It is also tuned for
35 # processors other than the Intel Core family (see the table below for
36 # improvement coefficients).
37 # <appro@openssl.org>
38 #
39 # RSA1024 sign/sec    this/original   |this/rsax(*)   this/fips(*)
40 # --------------------+----------------------------------------------
41 # Opteron             +13%            |+5%            +20%
42 # Bulldozer           -0%             |-1%            +10%
43 # P4                  +11%            |+7%            +8%
44 # Westmere            +5%             |+14%           +17%
45 # Sandy Bridge        +2%             |+12%           +29%
46 # Ivy Bridge          +1%             |+11%           +35%
47 # Haswell(**)         -0%             |+12%           +39%
48 # Atom                +13%            |+11%           +4%
49 # VIA Nano            +70%            |+9%            +25%
50 #
51 # (*) rsax engine and fips numbers are presented for reference
52 # purposes;
53 # (**) MULX was attempted, but found to give only marginal improvement;
54
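# For orientation, the C driver in crypto/bn/rsaz_exp.c calls the routines
# emitted below roughly with the following shapes.  This is a sketch derived
# from the register assignments in this file, not a copy of the actual header;
# types are illustrative (the C side uses BN_ULONG), all operands are eight
# 64-bit little-endian limbs, and mul/sqr perform a full Montgomery
# multiply/square (multiply, reduce by 2^512, conditional subtract):
#
#   void rsaz_512_sqr         (uint64_t r[8], const uint64_t a[8], const uint64_t n[8], uint64_t n0, int iterations);
#   void rsaz_512_mul         (uint64_t r[8], const uint64_t a[8], const uint64_t b[8], const uint64_t n[8], uint64_t n0);
#   void rsaz_512_mul_gather4 (uint64_t r[8], const uint64_t a[8], const void *table, const uint64_t n[8], uint64_t n0, int power);
#   void rsaz_512_mul_scatter4(uint64_t r[8], const uint64_t a[8], const uint64_t n[8], uint64_t n0, void *table, int power);
#   void rsaz_512_mul_by_one  (uint64_t r[8], const uint64_t a[8], const uint64_t n[8], uint64_t n0);
#   void rsaz_512_scatter4    (void *table, const uint64_t val[8], int power);
#   void rsaz_512_gather4     (uint64_t val[8], const void *table, int power);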
55 $flavour = shift;
56 $output = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
65
66 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67 *STDOUT=*OUT;
68
69 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
70 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
71 $addx = ($1>=2.23);
72 }
73
74 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
75 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
76 $addx = ($1>=2.10);
77 }
78
79 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
80 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
81 $addx = ($1>=12);
82 }
83
84 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
85 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
86 $addx = ($ver>=3.03);
87 }
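# $addx is set only when the toolchain can encode ADCX/ADOX/MULX: GNU as 2.23+,
# nasm 2.10+, ml64 12+, or an LLVM-based assembler reporting 3.3+, per the
# probes above.  When it is set, a second code path using those instructions is
# emitted next to the generic one, and the two are selected at run time by
# testing bits 0x80100 (BMI2 and ADX) in the third dword of OPENSSL_ia32cap_P.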
88
89 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
90 {
91 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
92
93 $code.=<<___;
94 .text
95
96 .extern OPENSSL_ia32cap_P
97
98 .globl rsaz_512_sqr
99 .type rsaz_512_sqr,\@function,5
100 .align 32
101 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
102 .cfi_startproc
103 push %rbx
104 .cfi_push %rbx
105 push %rbp
106 .cfi_push %rbp
107 push %r12
108 .cfi_push %r12
109 push %r13
110 .cfi_push %r13
111 push %r14
112 .cfi_push %r14
113 push %r15
114 .cfi_push %r15
115
116 subq \$128+24, %rsp
117 .cfi_adjust_cfa_offset 128+24
118 .Lsqr_body:
119 movq $mod, %xmm1 # common off-load
120 movq ($inp), %rdx
121 movq 8($inp), %rax
122 movq $n0, 128(%rsp)
123 ___
124 $code.=<<___ if ($addx);
125 movl \$0x80100,%r11d
126 andl OPENSSL_ia32cap_P+8(%rip),%r11d
127 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
128 je .Loop_sqrx
129 ___
130 $code.=<<___;
131 jmp .Loop_sqr
132
133 .align 32
134 .Loop_sqr:
135 movl $times,128+8(%rsp)
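# The eight numbered iterations below square the 8-limb input using
#   (sum of a_i*2^(64*i))^2 = sum of (a_i)^2 * 2^(128*i)
#                           + 2 * sum over i<j of a_i*a_j*2^(64*(i+j))
# (cf. reference [2] in the header): iteration k multiplies limb k by the
# limbs above it, the freshly completed partial sums are doubled, and the
# diagonal square (a_k)^2 is folded in before two result limbs are spilled
# to the stack.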
136 #first iteration
137 movq %rdx, %rbx # 0($inp)
138 mov %rax, %rbp # 8($inp)
139 mulq %rdx
140 movq %rax, %r8
141 movq 16($inp), %rax
142 movq %rdx, %r9
143
144 mulq %rbx
145 addq %rax, %r9
146 movq 24($inp), %rax
147 movq %rdx, %r10
148 adcq \$0, %r10
149
150 mulq %rbx
151 addq %rax, %r10
152 movq 32($inp), %rax
153 movq %rdx, %r11
154 adcq \$0, %r11
155
156 mulq %rbx
157 addq %rax, %r11
158 movq 40($inp), %rax
159 movq %rdx, %r12
160 adcq \$0, %r12
161
162 mulq %rbx
163 addq %rax, %r12
164 movq 48($inp), %rax
165 movq %rdx, %r13
166 adcq \$0, %r13
167
168 mulq %rbx
169 addq %rax, %r13
170 movq 56($inp), %rax
171 movq %rdx, %r14
172 adcq \$0, %r14
173
174 mulq %rbx
175 addq %rax, %r14
176 movq %rbx, %rax
177 adcq \$0, %rdx
178
179 xorq %rcx,%rcx # rcx:r8 = r8 << 1
180 addq %r8, %r8
181 movq %rdx, %r15
182 adcq \$0, %rcx
183
184 mulq %rax
185 addq %r8, %rdx
186 adcq \$0, %rcx
187
188 movq %rax, (%rsp)
189 movq %rdx, 8(%rsp)
190
191 #second iteration
192 movq 16($inp), %rax
193 mulq %rbp
194 addq %rax, %r10
195 movq 24($inp), %rax
196 movq %rdx, %rbx
197 adcq \$0, %rbx
198
199 mulq %rbp
200 addq %rax, %r11
201 movq 32($inp), %rax
202 adcq \$0, %rdx
203 addq %rbx, %r11
204 movq %rdx, %rbx
205 adcq \$0, %rbx
206
207 mulq %rbp
208 addq %rax, %r12
209 movq 40($inp), %rax
210 adcq \$0, %rdx
211 addq %rbx, %r12
212 movq %rdx, %rbx
213 adcq \$0, %rbx
214
215 mulq %rbp
216 addq %rax, %r13
217 movq 48($inp), %rax
218 adcq \$0, %rdx
219 addq %rbx, %r13
220 movq %rdx, %rbx
221 adcq \$0, %rbx
222
223 mulq %rbp
224 addq %rax, %r14
225 movq 56($inp), %rax
226 adcq \$0, %rdx
227 addq %rbx, %r14
228 movq %rdx, %rbx
229 adcq \$0, %rbx
230
231 mulq %rbp
232 addq %rax, %r15
233 movq %rbp, %rax
234 adcq \$0, %rdx
235 addq %rbx, %r15
236 adcq \$0, %rdx
237
238 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
239 addq %r9, %r9
240 movq %rdx, %r8
241 adcq %r10, %r10
242 adcq \$0, %rbx
243
244 mulq %rax
245 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
246 addq %rcx, %rax
247 movq 16($inp), %rbp
248 addq %rax, %r9
249 movq 24($inp), %rax
250 adcq %rdx, %r10
251 adcq \$0, %rbx
252
253 movq %r9, 16(%rsp)
254 movq %r10, 24(%rsp)
255
256 #third iteration
257 mulq %rbp
258 addq %rax, %r12
259 movq 32($inp), %rax
260 movq %rdx, %rcx
261 adcq \$0, %rcx
262
263 mulq %rbp
264 addq %rax, %r13
265 movq 40($inp), %rax
266 adcq \$0, %rdx
267 addq %rcx, %r13
268 movq %rdx, %rcx
269 adcq \$0, %rcx
270
271 mulq %rbp
272 addq %rax, %r14
273 movq 48($inp), %rax
274 adcq \$0, %rdx
275 addq %rcx, %r14
276 movq %rdx, %rcx
277 adcq \$0, %rcx
278
279 mulq %rbp
280 addq %rax, %r15
281 movq 56($inp), %rax
282 adcq \$0, %rdx
283 addq %rcx, %r15
284 movq %rdx, %rcx
285 adcq \$0, %rcx
286
287 mulq %rbp
288 addq %rax, %r8
289 movq %rbp, %rax
290 adcq \$0, %rdx
291 addq %rcx, %r8
292 adcq \$0, %rdx
293
294 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
295 addq %r11, %r11
296 movq %rdx, %r9
297 adcq %r12, %r12
298 adcq \$0, %rcx
299
300 mulq %rax
301 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
302 addq %rbx, %rax
303 movq 24($inp), %r10
304 addq %rax, %r11
305 movq 32($inp), %rax
306 adcq %rdx, %r12
307 adcq \$0, %rcx
308
309 movq %r11, 32(%rsp)
310 movq %r12, 40(%rsp)
311
312 #fourth iteration
313 mov %rax, %r11 # 32($inp)
314 mulq %r10
315 addq %rax, %r14
316 movq 40($inp), %rax
317 movq %rdx, %rbx
318 adcq \$0, %rbx
319
320 mov %rax, %r12 # 40($inp)
321 mulq %r10
322 addq %rax, %r15
323 movq 48($inp), %rax
324 adcq \$0, %rdx
325 addq %rbx, %r15
326 movq %rdx, %rbx
327 adcq \$0, %rbx
328
329 mov %rax, %rbp # 48($inp)
330 mulq %r10
331 addq %rax, %r8
332 movq 56($inp), %rax
333 adcq \$0, %rdx
334 addq %rbx, %r8
335 movq %rdx, %rbx
336 adcq \$0, %rbx
337
338 mulq %r10
339 addq %rax, %r9
340 movq %r10, %rax
341 adcq \$0, %rdx
342 addq %rbx, %r9
343 adcq \$0, %rdx
344
345 xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
346 addq %r13, %r13
347 movq %rdx, %r10
348 adcq %r14, %r14
349 adcq \$0, %rbx
350
351 mulq %rax
352 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
353 addq %rcx, %rax
354 addq %rax, %r13
355 movq %r12, %rax # 40($inp)
356 adcq %rdx, %r14
357 adcq \$0, %rbx
358
359 movq %r13, 48(%rsp)
360 movq %r14, 56(%rsp)
361
362 #fifth iteration
363 mulq %r11
364 addq %rax, %r8
365 movq %rbp, %rax # 48($inp)
366 movq %rdx, %rcx
367 adcq \$0, %rcx
368
369 mulq %r11
370 addq %rax, %r9
371 movq 56($inp), %rax
372 adcq \$0, %rdx
373 addq %rcx, %r9
374 movq %rdx, %rcx
375 adcq \$0, %rcx
376
377 mov %rax, %r14 # 56($inp)
378 mulq %r11
379 addq %rax, %r10
380 movq %r11, %rax
381 adcq \$0, %rdx
382 addq %rcx, %r10
383 adcq \$0, %rdx
384
385 xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
386 addq %r15, %r15
387 movq %rdx, %r11
388 adcq %r8, %r8
389 adcq \$0, %rcx
390
391 mulq %rax
392 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
393 addq %rbx, %rax
394 addq %rax, %r15
395 movq %rbp, %rax # 48($inp)
396 adcq %rdx, %r8
397 adcq \$0, %rcx
398
399 movq %r15, 64(%rsp)
400 movq %r8, 72(%rsp)
401
402 #sixth iteration
403 mulq %r12
404 addq %rax, %r10
405 movq %r14, %rax # 56($inp)
406 movq %rdx, %rbx
407 adcq \$0, %rbx
408
409 mulq %r12
410 addq %rax, %r11
411 movq %r12, %rax
412 adcq \$0, %rdx
413 addq %rbx, %r11
414 adcq \$0, %rdx
415
416 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
417 addq %r9, %r9
418 movq %rdx, %r12
419 adcq %r10, %r10
420 adcq \$0, %rbx
421
422 mulq %rax
423 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
424 addq %rcx, %rax
425 addq %rax, %r9
426 movq %r14, %rax # 56($inp)
427 adcq %rdx, %r10
428 adcq \$0, %rbx
429
430 movq %r9, 80(%rsp)
431 movq %r10, 88(%rsp)
432
433 #seventh iteration
434 mulq %rbp
435 addq %rax, %r12
436 movq %rbp, %rax
437 adcq \$0, %rdx
438
439 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
440 addq %r11, %r11
441 movq %rdx, %r13
442 adcq %r12, %r12
443 adcq \$0, %rcx
444
445 mulq %rax
446 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
447 addq %rbx, %rax
448 addq %rax, %r11
449 movq %r14, %rax # 56($inp)
450 adcq %rdx, %r12
451 adcq \$0, %rcx
452
453 movq %r11, 96(%rsp)
454 movq %r12, 104(%rsp)
455
456 #eighth iteration
457 xorq %rbx, %rbx # rbx:r13 = r13 << 1
458 addq %r13, %r13
459 adcq \$0, %rbx
460
461 mulq %rax
462 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
463 addq %rcx, %rax
464 addq %r13, %rax
465 adcq %rbx, %rdx
466
467 movq (%rsp), %r8
468 movq 8(%rsp), %r9
469 movq 16(%rsp), %r10
470 movq 24(%rsp), %r11
471 movq 32(%rsp), %r12
472 movq 40(%rsp), %r13
473 movq 48(%rsp), %r14
474 movq 56(%rsp), %r15
475 movq %xmm1, %rbp
476
477 movq %rax, 112(%rsp)
478 movq %rdx, 120(%rsp)
479
480 call __rsaz_512_reduce
481
482 addq 64(%rsp), %r8
483 adcq 72(%rsp), %r9
484 adcq 80(%rsp), %r10
485 adcq 88(%rsp), %r11
486 adcq 96(%rsp), %r12
487 adcq 104(%rsp), %r13
488 adcq 112(%rsp), %r14
489 adcq 120(%rsp), %r15
490 sbbq %rcx, %rcx
491
492 call __rsaz_512_subtract
493
494 movq %r8, %rdx
495 movq %r9, %rax
496 movl 128+8(%rsp), $times
497 movq $out, $inp
498
499 decl $times
500 jnz .Loop_sqr
501 ___
502 if ($addx) {
503 $code.=<<___;
504 jmp .Lsqr_tail
505
506 .align 32
507 .Loop_sqrx:
508 movl $times,128+8(%rsp)
509 movq $out, %xmm0 # off-load
510 #first iteration
511 mulx %rax, %r8, %r9
512 mov %rax, %rbx
513
514 mulx 16($inp), %rcx, %r10
515 xor %rbp, %rbp # cf=0, of=0
516
517 mulx 24($inp), %rax, %r11
518 adcx %rcx, %r9
519
520 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
521 adcx %rax, %r10
522
523 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
524 adcx %rcx, %r11
525
526 mulx 48($inp), %rcx, %r14
527 adcx %rax, %r12
528 adcx %rcx, %r13
529
530 mulx 56($inp), %rax, %r15
531 adcx %rax, %r14
532 adcx %rbp, %r15 # %rbp is 0
533
534 mulx %rdx, %rax, $out
535 mov %rbx, %rdx # 8($inp)
536 xor %rcx, %rcx
537 adox %r8, %r8
538 adcx $out, %r8
539 adox %rbp, %rcx
540 adcx %rbp, %rcx
541
542 mov %rax, (%rsp)
543 mov %r8, 8(%rsp)
544
545 #second iteration
546 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
547 adox %rax, %r10
548 adcx %rbx, %r11
549
550 mulx 24($inp), $out, %r8
551 adox $out, %r11
552 .byte 0x66
553 adcx %r8, %r12
554
555 mulx 32($inp), %rax, %rbx
556 adox %rax, %r12
557 adcx %rbx, %r13
558
559 mulx 40($inp), $out, %r8
560 adox $out, %r13
561 adcx %r8, %r14
562
563 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
564 adox %rax, %r14
565 adcx %rbx, %r15
566
567 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
568 adox $out, %r15
569 adcx %rbp, %r8
570 mulx %rdx, %rax, $out
571 adox %rbp, %r8
572 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
573
574 xor %rbx, %rbx
575 adox %r9, %r9
576 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
577 adcx %rcx, %rax
578 adox %r10, %r10
579 adcx %rax, %r9
580 adox %rbp, %rbx
581 adcx $out, %r10
582 adcx %rbp, %rbx
583
584 mov %r9, 16(%rsp)
585 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
586
587 #third iteration
588 mulx 24($inp), $out, %r9
589 adox $out, %r12
590 adcx %r9, %r13
591
592 mulx 32($inp), %rax, %rcx
593 adox %rax, %r13
594 adcx %rcx, %r14
595
596 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
597 adox $out, %r14
598 adcx %r9, %r15
599
600 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
601 adox %rax, %r15
602 adcx %rcx, %r8
603
604 mulx 56($inp), $out, %r9
605 adox $out, %r8
606 adcx %rbp, %r9
607 mulx %rdx, %rax, $out
608 adox %rbp, %r9
609 mov 24($inp), %rdx
610
611 xor %rcx, %rcx
612 adox %r11, %r11
613 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
614 adcx %rbx, %rax
615 adox %r12, %r12
616 adcx %rax, %r11
617 adox %rbp, %rcx
618 adcx $out, %r12
619 adcx %rbp, %rcx
620
621 mov %r11, 32(%rsp)
622 mov %r12, 40(%rsp)
623
624 #fourth iteration
625 mulx 32($inp), %rax, %rbx
626 adox %rax, %r14
627 adcx %rbx, %r15
628
629 mulx 40($inp), $out, %r10
630 adox $out, %r15
631 adcx %r10, %r8
632
633 mulx 48($inp), %rax, %rbx
634 adox %rax, %r8
635 adcx %rbx, %r9
636
637 mulx 56($inp), $out, %r10
638 adox $out, %r9
639 adcx %rbp, %r10
640 mulx %rdx, %rax, $out
641 adox %rbp, %r10
642 mov 32($inp), %rdx
643
644 xor %rbx, %rbx
645 adox %r13, %r13
646 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
647 adcx %rcx, %rax
648 adox %r14, %r14
649 adcx %rax, %r13
650 adox %rbp, %rbx
651 adcx $out, %r14
652 adcx %rbp, %rbx
653
654 mov %r13, 48(%rsp)
655 mov %r14, 56(%rsp)
656
657 #fifth iteration
658 mulx 40($inp), $out, %r11
659 adox $out, %r8
660 adcx %r11, %r9
661
662 mulx 48($inp), %rax, %rcx
663 adox %rax, %r9
664 adcx %rcx, %r10
665
666 mulx 56($inp), $out, %r11
667 adox $out, %r10
668 adcx %rbp, %r11
669 mulx %rdx, %rax, $out
670 mov 40($inp), %rdx
671 adox %rbp, %r11
672
673 xor %rcx, %rcx
674 adox %r15, %r15
675 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
676 adcx %rbx, %rax
677 adox %r8, %r8
678 adcx %rax, %r15
679 adox %rbp, %rcx
680 adcx $out, %r8
681 adcx %rbp, %rcx
682
683 mov %r15, 64(%rsp)
684 mov %r8, 72(%rsp)
685
686 #sixth iteration
687 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
688 adox %rax, %r10
689 adcx %rbx, %r11
690
691 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
692 adox $out, %r11
693 adcx %rbp, %r12
694 mulx %rdx, %rax, $out
695 adox %rbp, %r12
696 mov 48($inp), %rdx
697
698 xor %rbx, %rbx
699 adox %r9, %r9
700 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
701 adcx %rcx, %rax
702 adox %r10, %r10
703 adcx %rax, %r9
704 adcx $out, %r10
705 adox %rbp, %rbx
706 adcx %rbp, %rbx
707
708 mov %r9, 80(%rsp)
709 mov %r10, 88(%rsp)
710
711 #seventh iteration
712 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
713 adox %rax, %r12
714 adox %rbp, %r13
715
716 mulx %rdx, %rax, $out
717 xor %rcx, %rcx
718 mov 56($inp), %rdx
719 adox %r11, %r11
720 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
721 adcx %rbx, %rax
722 adox %r12, %r12
723 adcx %rax, %r11
724 adox %rbp, %rcx
725 adcx $out, %r12
726 adcx %rbp, %rcx
727
728 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
729 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
730
731 #eighth iteration
732 mulx %rdx, %rax, %rdx
733 xor %rbx, %rbx
734 adox %r13, %r13
735 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
736 adcx %rcx, %rax
737 adox %rbp, %rbx
738 adcx %r13, %rax
739 adcx %rdx, %rbx
740
741 movq %xmm0, $out
742 movq %xmm1, %rbp
743
744 movq 128(%rsp), %rdx # pull $n0
745 movq (%rsp), %r8
746 movq 8(%rsp), %r9
747 movq 16(%rsp), %r10
748 movq 24(%rsp), %r11
749 movq 32(%rsp), %r12
750 movq 40(%rsp), %r13
751 movq 48(%rsp), %r14
752 movq 56(%rsp), %r15
753
754 movq %rax, 112(%rsp)
755 movq %rbx, 120(%rsp)
756
757 call __rsaz_512_reducex
758
759 addq 64(%rsp), %r8
760 adcq 72(%rsp), %r9
761 adcq 80(%rsp), %r10
762 adcq 88(%rsp), %r11
763 adcq 96(%rsp), %r12
764 adcq 104(%rsp), %r13
765 adcq 112(%rsp), %r14
766 adcq 120(%rsp), %r15
767 sbbq %rcx, %rcx
768
769 call __rsaz_512_subtract
770
771 movq %r8, %rdx
772 movq %r9, %rax
773 movl 128+8(%rsp), $times
774 movq $out, $inp
775
776 decl $times
777 jnz .Loop_sqrx
778
779 .Lsqr_tail:
780 ___
781 }
782 $code.=<<___;
783
784 leaq 128+24+48(%rsp), %rax
785 .cfi_def_cfa %rax,8
786 movq -48(%rax), %r15
787 .cfi_restore %r15
788 movq -40(%rax), %r14
789 .cfi_restore %r14
790 movq -32(%rax), %r13
791 .cfi_restore %r13
792 movq -24(%rax), %r12
793 .cfi_restore %r12
794 movq -16(%rax), %rbp
795 .cfi_restore %rbp
796 movq -8(%rax), %rbx
797 .cfi_restore %rbx
798 leaq (%rax), %rsp
799 .cfi_def_cfa_register %rsp
800 .Lsqr_epilogue:
801 ret
802 .cfi_endproc
803 .size rsaz_512_sqr,.-rsaz_512_sqr
804 ___
805 }
806 {
807 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
808 $code.=<<___;
809 .globl rsaz_512_mul
810 .type rsaz_512_mul,\@function,5
811 .align 32
812 rsaz_512_mul:
813 .cfi_startproc
814 push %rbx
815 .cfi_push %rbx
816 push %rbp
817 .cfi_push %rbp
818 push %r12
819 .cfi_push %r12
820 push %r13
821 .cfi_push %r13
822 push %r14
823 .cfi_push %r14
824 push %r15
825 .cfi_push %r15
826
827 subq \$128+24, %rsp
828 .cfi_adjust_cfa_offset 128+24
829 .Lmul_body:
830 movq $out, %xmm0 # off-load arguments
831 movq $mod, %xmm1
832 movq $n0, 128(%rsp)
833 ___
834 $code.=<<___ if ($addx);
835 movl \$0x80100,%r11d
836 andl OPENSSL_ia32cap_P+8(%rip),%r11d
837 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
838 je .Lmulx
839 ___
840 $code.=<<___;
841 movq ($bp), %rbx # pass b[0]
842 movq $bp, %rbp # pass argument
843 call __rsaz_512_mul
844
845 movq %xmm0, $out
846 movq %xmm1, %rbp
847
848 movq (%rsp), %r8
849 movq 8(%rsp), %r9
850 movq 16(%rsp), %r10
851 movq 24(%rsp), %r11
852 movq 32(%rsp), %r12
853 movq 40(%rsp), %r13
854 movq 48(%rsp), %r14
855 movq 56(%rsp), %r15
856
857 call __rsaz_512_reduce
858 ___
859 $code.=<<___ if ($addx);
860 jmp .Lmul_tail
861
862 .align 32
863 .Lmulx:
864 movq $bp, %rbp # pass argument
865 movq ($bp), %rdx # pass b[0]
866 call __rsaz_512_mulx
867
868 movq %xmm0, $out
869 movq %xmm1, %rbp
870
871 movq 128(%rsp), %rdx # pull $n0
872 movq (%rsp), %r8
873 movq 8(%rsp), %r9
874 movq 16(%rsp), %r10
875 movq 24(%rsp), %r11
876 movq 32(%rsp), %r12
877 movq 40(%rsp), %r13
878 movq 48(%rsp), %r14
879 movq 56(%rsp), %r15
880
881 call __rsaz_512_reducex
882 .Lmul_tail:
883 ___
884 $code.=<<___;
885 addq 64(%rsp), %r8
886 adcq 72(%rsp), %r9
887 adcq 80(%rsp), %r10
888 adcq 88(%rsp), %r11
889 adcq 96(%rsp), %r12
890 adcq 104(%rsp), %r13
891 adcq 112(%rsp), %r14
892 adcq 120(%rsp), %r15
893 sbbq %rcx, %rcx
894
895 call __rsaz_512_subtract
896
897 leaq 128+24+48(%rsp), %rax
898 .cfi_def_cfa %rax,8
899 movq -48(%rax), %r15
900 .cfi_restore %r15
901 movq -40(%rax), %r14
902 .cfi_restore %r14
903 movq -32(%rax), %r13
904 .cfi_restore %r13
905 movq -24(%rax), %r12
906 .cfi_restore %r12
907 movq -16(%rax), %rbp
908 .cfi_restore %rbp
909 movq -8(%rax), %rbx
910 .cfi_restore %rbx
911 leaq (%rax), %rsp
912 .cfi_def_cfa_register %rsp
913 .Lmul_epilogue:
914 ret
915 .cfi_endproc
916 .size rsaz_512_mul,.-rsaz_512_mul
917 ___
918 }
919 {
920 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
921 $code.=<<___;
922 .globl rsaz_512_mul_gather4
923 .type rsaz_512_mul_gather4,\@function,6
924 .align 32
925 rsaz_512_mul_gather4:
926 .cfi_startproc
927 push %rbx
928 .cfi_push %rbx
929 push %rbp
930 .cfi_push %rbp
931 push %r12
932 .cfi_push %r12
933 push %r13
934 .cfi_push %r13
935 push %r14
936 .cfi_push %r14
937 push %r15
938 .cfi_push %r15
939
940 subq \$`128+24+($win64?0xb0:0)`, %rsp
941 .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
942 ___
943 $code.=<<___ if ($win64);
944 movaps %xmm6,0xa0(%rsp)
945 movaps %xmm7,0xb0(%rsp)
946 movaps %xmm8,0xc0(%rsp)
947 movaps %xmm9,0xd0(%rsp)
948 movaps %xmm10,0xe0(%rsp)
949 movaps %xmm11,0xf0(%rsp)
950 movaps %xmm12,0x100(%rsp)
951 movaps %xmm13,0x110(%rsp)
952 movaps %xmm14,0x120(%rsp)
953 movaps %xmm15,0x130(%rsp)
954 ___
955 $code.=<<___;
956 .Lmul_gather4_body:
957 movd $pwr,%xmm8
958 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
959 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
960
961 pshufd \$0,%xmm8,%xmm8 # broadcast $power
962 movdqa %xmm1,%xmm7
963 movdqa %xmm1,%xmm2
964 ___
965 ########################################################################
966 # calculate mask by comparing 0..15 to $power
967 #
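# The result is a set of byte masks in xmm0..xmm7: register xmm i covers table
# indices 2*i and 2*i+1, and the 64-bit half whose index equals $power becomes
# all-ones while everything else is zero.  Each gather step below then loads
# the current limb of all sixteen table entries (8 x 16 bytes, one 128-byte
# stride) and ANDs/ORs the lot down to the selected entry, so the addresses
# touched never depend on the secret exponent window value.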
968 for($i=0;$i<4;$i++) {
969 $code.=<<___;
970 paddd %xmm`$i`,%xmm`$i+1`
971 pcmpeqd %xmm8,%xmm`$i`
972 movdqa %xmm7,%xmm`$i+3`
973 ___
974 }
975 for(;$i<7;$i++) {
976 $code.=<<___;
977 paddd %xmm`$i`,%xmm`$i+1`
978 pcmpeqd %xmm8,%xmm`$i`
979 ___
980 }
981 $code.=<<___;
982 pcmpeqd %xmm8,%xmm7
983
984 movdqa 16*0($bp),%xmm8
985 movdqa 16*1($bp),%xmm9
986 movdqa 16*2($bp),%xmm10
987 movdqa 16*3($bp),%xmm11
988 pand %xmm0,%xmm8
989 movdqa 16*4($bp),%xmm12
990 pand %xmm1,%xmm9
991 movdqa 16*5($bp),%xmm13
992 pand %xmm2,%xmm10
993 movdqa 16*6($bp),%xmm14
994 pand %xmm3,%xmm11
995 movdqa 16*7($bp),%xmm15
996 leaq 128($bp), %rbp
997 pand %xmm4,%xmm12
998 pand %xmm5,%xmm13
999 pand %xmm6,%xmm14
1000 pand %xmm7,%xmm15
1001 por %xmm10,%xmm8
1002 por %xmm11,%xmm9
1003 por %xmm12,%xmm8
1004 por %xmm13,%xmm9
1005 por %xmm14,%xmm8
1006 por %xmm15,%xmm9
1007
1008 por %xmm9,%xmm8
1009 pshufd \$0x4e,%xmm8,%xmm9
1010 por %xmm9,%xmm8
1011 ___
1012 $code.=<<___ if ($addx);
1013 movl \$0x80100,%r11d
1014 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1015 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1016 je .Lmulx_gather
1017 ___
1018 $code.=<<___;
1019 movq %xmm8,%rbx
1020
1021 movq $n0, 128(%rsp) # off-load arguments
1022 movq $out, 128+8(%rsp)
1023 movq $mod, 128+16(%rsp)
1024
1025 movq ($ap), %rax
1026 movq 8($ap), %rcx
1027 mulq %rbx # 0 iteration
1028 movq %rax, (%rsp)
1029 movq %rcx, %rax
1030 movq %rdx, %r8
1031
1032 mulq %rbx
1033 addq %rax, %r8
1034 movq 16($ap), %rax
1035 movq %rdx, %r9
1036 adcq \$0, %r9
1037
1038 mulq %rbx
1039 addq %rax, %r9
1040 movq 24($ap), %rax
1041 movq %rdx, %r10
1042 adcq \$0, %r10
1043
1044 mulq %rbx
1045 addq %rax, %r10
1046 movq 32($ap), %rax
1047 movq %rdx, %r11
1048 adcq \$0, %r11
1049
1050 mulq %rbx
1051 addq %rax, %r11
1052 movq 40($ap), %rax
1053 movq %rdx, %r12
1054 adcq \$0, %r12
1055
1056 mulq %rbx
1057 addq %rax, %r12
1058 movq 48($ap), %rax
1059 movq %rdx, %r13
1060 adcq \$0, %r13
1061
1062 mulq %rbx
1063 addq %rax, %r13
1064 movq 56($ap), %rax
1065 movq %rdx, %r14
1066 adcq \$0, %r14
1067
1068 mulq %rbx
1069 addq %rax, %r14
1070 movq ($ap), %rax
1071 movq %rdx, %r15
1072 adcq \$0, %r15
1073
1074 leaq 8(%rsp), %rdi
1075 movl \$7, %ecx
1076 jmp .Loop_mul_gather
1077
1078 .align 32
1079 .Loop_mul_gather:
1080 movdqa 16*0(%rbp),%xmm8
1081 movdqa 16*1(%rbp),%xmm9
1082 movdqa 16*2(%rbp),%xmm10
1083 movdqa 16*3(%rbp),%xmm11
1084 pand %xmm0,%xmm8
1085 movdqa 16*4(%rbp),%xmm12
1086 pand %xmm1,%xmm9
1087 movdqa 16*5(%rbp),%xmm13
1088 pand %xmm2,%xmm10
1089 movdqa 16*6(%rbp),%xmm14
1090 pand %xmm3,%xmm11
1091 movdqa 16*7(%rbp),%xmm15
1092 leaq 128(%rbp), %rbp
1093 pand %xmm4,%xmm12
1094 pand %xmm5,%xmm13
1095 pand %xmm6,%xmm14
1096 pand %xmm7,%xmm15
1097 por %xmm10,%xmm8
1098 por %xmm11,%xmm9
1099 por %xmm12,%xmm8
1100 por %xmm13,%xmm9
1101 por %xmm14,%xmm8
1102 por %xmm15,%xmm9
1103
1104 por %xmm9,%xmm8
1105 pshufd \$0x4e,%xmm8,%xmm9
1106 por %xmm9,%xmm8
1107 movq %xmm8,%rbx
1108
1109 mulq %rbx
1110 addq %rax, %r8
1111 movq 8($ap), %rax
1112 movq %r8, (%rdi)
1113 movq %rdx, %r8
1114 adcq \$0, %r8
1115
1116 mulq %rbx
1117 addq %rax, %r9
1118 movq 16($ap), %rax
1119 adcq \$0, %rdx
1120 addq %r9, %r8
1121 movq %rdx, %r9
1122 adcq \$0, %r9
1123
1124 mulq %rbx
1125 addq %rax, %r10
1126 movq 24($ap), %rax
1127 adcq \$0, %rdx
1128 addq %r10, %r9
1129 movq %rdx, %r10
1130 adcq \$0, %r10
1131
1132 mulq %rbx
1133 addq %rax, %r11
1134 movq 32($ap), %rax
1135 adcq \$0, %rdx
1136 addq %r11, %r10
1137 movq %rdx, %r11
1138 adcq \$0, %r11
1139
1140 mulq %rbx
1141 addq %rax, %r12
1142 movq 40($ap), %rax
1143 adcq \$0, %rdx
1144 addq %r12, %r11
1145 movq %rdx, %r12
1146 adcq \$0, %r12
1147
1148 mulq %rbx
1149 addq %rax, %r13
1150 movq 48($ap), %rax
1151 adcq \$0, %rdx
1152 addq %r13, %r12
1153 movq %rdx, %r13
1154 adcq \$0, %r13
1155
1156 mulq %rbx
1157 addq %rax, %r14
1158 movq 56($ap), %rax
1159 adcq \$0, %rdx
1160 addq %r14, %r13
1161 movq %rdx, %r14
1162 adcq \$0, %r14
1163
1164 mulq %rbx
1165 addq %rax, %r15
1166 movq ($ap), %rax
1167 adcq \$0, %rdx
1168 addq %r15, %r14
1169 movq %rdx, %r15
1170 adcq \$0, %r15
1171
1172 leaq 8(%rdi), %rdi
1173
1174 decl %ecx
1175 jnz .Loop_mul_gather
1176
1177 movq %r8, (%rdi)
1178 movq %r9, 8(%rdi)
1179 movq %r10, 16(%rdi)
1180 movq %r11, 24(%rdi)
1181 movq %r12, 32(%rdi)
1182 movq %r13, 40(%rdi)
1183 movq %r14, 48(%rdi)
1184 movq %r15, 56(%rdi)
1185
1186 movq 128+8(%rsp), $out
1187 movq 128+16(%rsp), %rbp
1188
1189 movq (%rsp), %r8
1190 movq 8(%rsp), %r9
1191 movq 16(%rsp), %r10
1192 movq 24(%rsp), %r11
1193 movq 32(%rsp), %r12
1194 movq 40(%rsp), %r13
1195 movq 48(%rsp), %r14
1196 movq 56(%rsp), %r15
1197
1198 call __rsaz_512_reduce
1199 ___
1200 $code.=<<___ if ($addx);
1201 jmp .Lmul_gather_tail
1202
1203 .align 32
1204 .Lmulx_gather:
1205 movq %xmm8,%rdx
1206
1207 mov $n0, 128(%rsp) # off-load arguments
1208 mov $out, 128+8(%rsp)
1209 mov $mod, 128+16(%rsp)
1210
1211 mulx ($ap), %rbx, %r8 # 0 iteration
1212 mov %rbx, (%rsp)
1213 xor %edi, %edi # cf=0, of=0
1214
1215 mulx 8($ap), %rax, %r9
1216
1217 mulx 16($ap), %rbx, %r10
1218 adcx %rax, %r8
1219
1220 mulx 24($ap), %rax, %r11
1221 adcx %rbx, %r9
1222
1223 mulx 32($ap), %rbx, %r12
1224 adcx %rax, %r10
1225
1226 mulx 40($ap), %rax, %r13
1227 adcx %rbx, %r11
1228
1229 mulx 48($ap), %rbx, %r14
1230 adcx %rax, %r12
1231
1232 mulx 56($ap), %rax, %r15
1233 adcx %rbx, %r13
1234 adcx %rax, %r14
1235 .byte 0x67
1236 mov %r8, %rbx
1237 adcx %rdi, %r15 # %rdi is 0
1238
1239 mov \$-7, %rcx
1240 jmp .Loop_mulx_gather
1241
1242 .align 32
1243 .Loop_mulx_gather:
1244 movdqa 16*0(%rbp),%xmm8
1245 movdqa 16*1(%rbp),%xmm9
1246 movdqa 16*2(%rbp),%xmm10
1247 movdqa 16*3(%rbp),%xmm11
1248 pand %xmm0,%xmm8
1249 movdqa 16*4(%rbp),%xmm12
1250 pand %xmm1,%xmm9
1251 movdqa 16*5(%rbp),%xmm13
1252 pand %xmm2,%xmm10
1253 movdqa 16*6(%rbp),%xmm14
1254 pand %xmm3,%xmm11
1255 movdqa 16*7(%rbp),%xmm15
1256 leaq 128(%rbp), %rbp
1257 pand %xmm4,%xmm12
1258 pand %xmm5,%xmm13
1259 pand %xmm6,%xmm14
1260 pand %xmm7,%xmm15
1261 por %xmm10,%xmm8
1262 por %xmm11,%xmm9
1263 por %xmm12,%xmm8
1264 por %xmm13,%xmm9
1265 por %xmm14,%xmm8
1266 por %xmm15,%xmm9
1267
1268 por %xmm9,%xmm8
1269 pshufd \$0x4e,%xmm8,%xmm9
1270 por %xmm9,%xmm8
1271 movq %xmm8,%rdx
1272
1273 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1274 adcx %rax, %rbx
1275 adox %r9, %r8
1276
1277 mulx 8($ap), %rax, %r9
1278 adcx %rax, %r8
1279 adox %r10, %r9
1280
1281 mulx 16($ap), %rax, %r10
1282 adcx %rax, %r9
1283 adox %r11, %r10
1284
1285 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1286 adcx %rax, %r10
1287 adox %r12, %r11
1288
1289 mulx 32($ap), %rax, %r12
1290 adcx %rax, %r11
1291 adox %r13, %r12
1292
1293 mulx 40($ap), %rax, %r13
1294 adcx %rax, %r12
1295 adox %r14, %r13
1296
1297 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1298 adcx %rax, %r13
1299 .byte 0x67
1300 adox %r15, %r14
1301
1302 mulx 56($ap), %rax, %r15
1303 mov %rbx, 64(%rsp,%rcx,8)
1304 adcx %rax, %r14
1305 adox %rdi, %r15
1306 mov %r8, %rbx
1307 adcx %rdi, %r15 # cf=0
1308
1309 inc %rcx # of=0
1310 jnz .Loop_mulx_gather
1311
1312 mov %r8, 64(%rsp)
1313 mov %r9, 64+8(%rsp)
1314 mov %r10, 64+16(%rsp)
1315 mov %r11, 64+24(%rsp)
1316 mov %r12, 64+32(%rsp)
1317 mov %r13, 64+40(%rsp)
1318 mov %r14, 64+48(%rsp)
1319 mov %r15, 64+56(%rsp)
1320
1321 mov 128(%rsp), %rdx # pull arguments
1322 mov 128+8(%rsp), $out
1323 mov 128+16(%rsp), %rbp
1324
1325 mov (%rsp), %r8
1326 mov 8(%rsp), %r9
1327 mov 16(%rsp), %r10
1328 mov 24(%rsp), %r11
1329 mov 32(%rsp), %r12
1330 mov 40(%rsp), %r13
1331 mov 48(%rsp), %r14
1332 mov 56(%rsp), %r15
1333
1334 call __rsaz_512_reducex
1335
1336 .Lmul_gather_tail:
1337 ___
1338 $code.=<<___;
1339 addq 64(%rsp), %r8
1340 adcq 72(%rsp), %r9
1341 adcq 80(%rsp), %r10
1342 adcq 88(%rsp), %r11
1343 adcq 96(%rsp), %r12
1344 adcq 104(%rsp), %r13
1345 adcq 112(%rsp), %r14
1346 adcq 120(%rsp), %r15
1347 sbbq %rcx, %rcx
1348
1349 call __rsaz_512_subtract
1350
1351 leaq 128+24+48(%rsp), %rax
1352 ___
1353 $code.=<<___ if ($win64);
1354 movaps 0xa0-0xc8(%rax),%xmm6
1355 movaps 0xb0-0xc8(%rax),%xmm7
1356 movaps 0xc0-0xc8(%rax),%xmm8
1357 movaps 0xd0-0xc8(%rax),%xmm9
1358 movaps 0xe0-0xc8(%rax),%xmm10
1359 movaps 0xf0-0xc8(%rax),%xmm11
1360 movaps 0x100-0xc8(%rax),%xmm12
1361 movaps 0x110-0xc8(%rax),%xmm13
1362 movaps 0x120-0xc8(%rax),%xmm14
1363 movaps 0x130-0xc8(%rax),%xmm15
1364 lea 0xb0(%rax),%rax
1365 ___
1366 $code.=<<___;
1367 .cfi_def_cfa %rax,8
1368 movq -48(%rax), %r15
1369 .cfi_restore %r15
1370 movq -40(%rax), %r14
1371 .cfi_restore %r14
1372 movq -32(%rax), %r13
1373 .cfi_restore %r13
1374 movq -24(%rax), %r12
1375 .cfi_restore %r12
1376 movq -16(%rax), %rbp
1377 .cfi_restore %rbp
1378 movq -8(%rax), %rbx
1379 .cfi_restore %rbx
1380 leaq (%rax), %rsp
1381 .cfi_def_cfa_register %rsp
1382 .Lmul_gather4_epilogue:
1383 ret
1384 .cfi_endproc
1385 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1386 ___
1387 }
1388 {
1389 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1390 $code.=<<___;
1391 .globl rsaz_512_mul_scatter4
1392 .type rsaz_512_mul_scatter4,\@function,6
1393 .align 32
1394 rsaz_512_mul_scatter4:
1395 .cfi_startproc
1396 push %rbx
1397 .cfi_push %rbx
1398 push %rbp
1399 .cfi_push %rbp
1400 push %r12
1401 .cfi_push %r12
1402 push %r13
1403 .cfi_push %r13
1404 push %r14
1405 .cfi_push %r14
1406 push %r15
1407 .cfi_push %r15
1408
1409 mov $pwr, $pwr
1410 subq \$128+24, %rsp
1411 .cfi_adjust_cfa_offset 128+24
1412 .Lmul_scatter4_body:
1413 leaq ($tbl,$pwr,8), $tbl
1414 movq $out, %xmm0 # off-load arguments
1415 movq $mod, %xmm1
1416 movq $tbl, %xmm2
1417 movq $n0, 128(%rsp)
1418
1419 movq $out, %rbp
1420 ___
1421 $code.=<<___ if ($addx);
1422 movl \$0x80100,%r11d
1423 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1424 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1425 je .Lmulx_scatter
1426 ___
1427 $code.=<<___;
1428 movq ($out),%rbx # pass b[0]
1429 call __rsaz_512_mul
1430
1431 movq %xmm0, $out
1432 movq %xmm1, %rbp
1433
1434 movq (%rsp), %r8
1435 movq 8(%rsp), %r9
1436 movq 16(%rsp), %r10
1437 movq 24(%rsp), %r11
1438 movq 32(%rsp), %r12
1439 movq 40(%rsp), %r13
1440 movq 48(%rsp), %r14
1441 movq 56(%rsp), %r15
1442
1443 call __rsaz_512_reduce
1444 ___
1445 $code.=<<___ if ($addx);
1446 jmp .Lmul_scatter_tail
1447
1448 .align 32
1449 .Lmulx_scatter:
1450 movq ($out), %rdx # pass b[0]
1451 call __rsaz_512_mulx
1452
1453 movq %xmm0, $out
1454 movq %xmm1, %rbp
1455
1456 movq 128(%rsp), %rdx # pull $n0
1457 movq (%rsp), %r8
1458 movq 8(%rsp), %r9
1459 movq 16(%rsp), %r10
1460 movq 24(%rsp), %r11
1461 movq 32(%rsp), %r12
1462 movq 40(%rsp), %r13
1463 movq 48(%rsp), %r14
1464 movq 56(%rsp), %r15
1465
1466 call __rsaz_512_reducex
1467
1468 .Lmul_scatter_tail:
1469 ___
1470 $code.=<<___;
1471 addq 64(%rsp), %r8
1472 adcq 72(%rsp), %r9
1473 adcq 80(%rsp), %r10
1474 adcq 88(%rsp), %r11
1475 adcq 96(%rsp), %r12
1476 adcq 104(%rsp), %r13
1477 adcq 112(%rsp), %r14
1478 adcq 120(%rsp), %r15
1479 movq %xmm2, $inp
1480 sbbq %rcx, %rcx
1481
1482 call __rsaz_512_subtract
1483
1484 movq %r8, 128*0($inp) # scatter
1485 movq %r9, 128*1($inp)
1486 movq %r10, 128*2($inp)
1487 movq %r11, 128*3($inp)
1488 movq %r12, 128*4($inp)
1489 movq %r13, 128*5($inp)
1490 movq %r14, 128*6($inp)
1491 movq %r15, 128*7($inp)
1492
1493 leaq 128+24+48(%rsp), %rax
1494 .cfi_def_cfa %rax,8
1495 movq -48(%rax), %r15
1496 .cfi_restore %r15
1497 movq -40(%rax), %r14
1498 .cfi_restore %r14
1499 movq -32(%rax), %r13
1500 .cfi_restore %r13
1501 movq -24(%rax), %r12
1502 .cfi_restore %r12
1503 movq -16(%rax), %rbp
1504 .cfi_restore %rbp
1505 movq -8(%rax), %rbx
1506 .cfi_restore %rbx
1507 leaq (%rax), %rsp
1508 .cfi_def_cfa_register %rsp
1509 .Lmul_scatter4_epilogue:
1510 ret
1511 .cfi_endproc
1512 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1513 ___
1514 }
1515 {
1516 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1517 $code.=<<___;
1518 .globl rsaz_512_mul_by_one
1519 .type rsaz_512_mul_by_one,\@function,4
1520 .align 32
1521 rsaz_512_mul_by_one:
1522 .cfi_startproc
1523 push %rbx
1524 .cfi_push %rbx
1525 push %rbp
1526 .cfi_push %rbp
1527 push %r12
1528 .cfi_push %r12
1529 push %r13
1530 .cfi_push %r13
1531 push %r14
1532 .cfi_push %r14
1533 push %r15
1534 .cfi_push %r15
1535
1536 subq \$128+24, %rsp
1537 .cfi_adjust_cfa_offset 128+24
1538 .Lmul_by_one_body:
1539 ___
1540 $code.=<<___ if ($addx);
1541 movl OPENSSL_ia32cap_P+8(%rip),%eax
1542 ___
1543 $code.=<<___;
1544 movq $mod, %rbp # reassign argument
1545 movq $n0, 128(%rsp)
1546
1547 movq ($inp), %r8
1548 pxor %xmm0, %xmm0
1549 movq 8($inp), %r9
1550 movq 16($inp), %r10
1551 movq 24($inp), %r11
1552 movq 32($inp), %r12
1553 movq 40($inp), %r13
1554 movq 48($inp), %r14
1555 movq 56($inp), %r15
1556
1557 movdqa %xmm0, (%rsp)
1558 movdqa %xmm0, 16(%rsp)
1559 movdqa %xmm0, 32(%rsp)
1560 movdqa %xmm0, 48(%rsp)
1561 movdqa %xmm0, 64(%rsp)
1562 movdqa %xmm0, 80(%rsp)
1563 movdqa %xmm0, 96(%rsp)
1564 ___
1565 $code.=<<___ if ($addx);
1566 andl \$0x80100,%eax
1567 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1568 je .Lby_one_callx
1569 ___
1570 $code.=<<___;
1571 call __rsaz_512_reduce
1572 ___
1573 $code.=<<___ if ($addx);
1574 jmp .Lby_one_tail
1575 .align 32
1576 .Lby_one_callx:
1577 movq 128(%rsp), %rdx # pull $n0
1578 call __rsaz_512_reducex
1579 .Lby_one_tail:
1580 ___
1581 $code.=<<___;
1582 movq %r8, ($out)
1583 movq %r9, 8($out)
1584 movq %r10, 16($out)
1585 movq %r11, 24($out)
1586 movq %r12, 32($out)
1587 movq %r13, 40($out)
1588 movq %r14, 48($out)
1589 movq %r15, 56($out)
1590
1591 leaq 128+24+48(%rsp), %rax
1592 .cfi_def_cfa %rax,8
1593 movq -48(%rax), %r15
1594 .cfi_restore %r15
1595 movq -40(%rax), %r14
1596 .cfi_restore %r14
1597 movq -32(%rax), %r13
1598 .cfi_restore %r13
1599 movq -24(%rax), %r12
1600 .cfi_restore %r12
1601 movq -16(%rax), %rbp
1602 .cfi_restore %rbp
1603 movq -8(%rax), %rbx
1604 .cfi_restore %rbx
1605 leaq (%rax), %rsp
1606 .cfi_def_cfa_register %rsp
1607 .Lmul_by_one_epilogue:
1608 ret
1609 .cfi_endproc
1610 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1611 ___
1612 }
1613 { # __rsaz_512_reduce
1614 #
1615 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1616 # output: %r8-%r15
1617 # clobbers: everything except %rbp and %rdi
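# Each pass of .Lreduction_loop performs one word-level Montgomery step on the
# 8-limb window held in %r8-%r15 (T = low half of the product, N = modulus at
# (%rbp), n0 = -N^(-1) mod 2^64):
#
#	m  = T[0] * n0 mod 2^64		# imulq
#	T += m * N			# zeroes the bottom limb
#	T  = T >> 64			# discard the now-zero bottom limb
#
# After eight passes the caller adds the upper eight product limbs (stored at
# 64(%rsp)..120(%rsp)) and conditionally subtracts N.  Note that n0 is read
# from 128+8(%rsp) rather than 128(%rsp): the call into this routine pushed a
# return address, shifting every caller-frame offset by 8.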
1618 $code.=<<___;
1619 .type __rsaz_512_reduce,\@abi-omnipotent
1620 .align 32
1621 __rsaz_512_reduce:
1622 .cfi_startproc
1623 movq %r8, %rbx
1624 imulq 128+8(%rsp), %rbx
1625 movq 0(%rbp), %rax
1626 movl \$8, %ecx
1627 jmp .Lreduction_loop
1628
1629 .align 32
1630 .Lreduction_loop:
1631 mulq %rbx
1632 movq 8(%rbp), %rax
1633 negq %r8
1634 movq %rdx, %r8
1635 adcq \$0, %r8
1636
1637 mulq %rbx
1638 addq %rax, %r9
1639 movq 16(%rbp), %rax
1640 adcq \$0, %rdx
1641 addq %r9, %r8
1642 movq %rdx, %r9
1643 adcq \$0, %r9
1644
1645 mulq %rbx
1646 addq %rax, %r10
1647 movq 24(%rbp), %rax
1648 adcq \$0, %rdx
1649 addq %r10, %r9
1650 movq %rdx, %r10
1651 adcq \$0, %r10
1652
1653 mulq %rbx
1654 addq %rax, %r11
1655 movq 32(%rbp), %rax
1656 adcq \$0, %rdx
1657 addq %r11, %r10
1658 movq 128+8(%rsp), %rsi
1659 #movq %rdx, %r11
1660 #adcq \$0, %r11
1661 adcq \$0, %rdx
1662 movq %rdx, %r11
1663
1664 mulq %rbx
1665 addq %rax, %r12
1666 movq 40(%rbp), %rax
1667 adcq \$0, %rdx
1668 imulq %r8, %rsi
1669 addq %r12, %r11
1670 movq %rdx, %r12
1671 adcq \$0, %r12
1672
1673 mulq %rbx
1674 addq %rax, %r13
1675 movq 48(%rbp), %rax
1676 adcq \$0, %rdx
1677 addq %r13, %r12
1678 movq %rdx, %r13
1679 adcq \$0, %r13
1680
1681 mulq %rbx
1682 addq %rax, %r14
1683 movq 56(%rbp), %rax
1684 adcq \$0, %rdx
1685 addq %r14, %r13
1686 movq %rdx, %r14
1687 adcq \$0, %r14
1688
1689 mulq %rbx
1690 movq %rsi, %rbx
1691 addq %rax, %r15
1692 movq 0(%rbp), %rax
1693 adcq \$0, %rdx
1694 addq %r15, %r14
1695 movq %rdx, %r15
1696 adcq \$0, %r15
1697
1698 decl %ecx
1699 jne .Lreduction_loop
1700
1701 ret
1702 .cfi_endproc
1703 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1704 ___
1705 }
1706 if ($addx) {
1707 # __rsaz_512_reducex
1708 #
1709 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1710 # output: %r8-%r15
1711 # clobbers: everything except %rbp and %rdi
1712 $code.=<<___;
1713 .type __rsaz_512_reducex,\@abi-omnipotent
1714 .align 32
1715 __rsaz_512_reducex:
1716 .cfi_startproc
1717 #movq 128+8(%rsp), %rdx # pull $n0
1718 imulq %r8, %rdx
1719 xorq %rsi, %rsi # cf=0,of=0
1720 movl \$8, %ecx
1721 jmp .Lreduction_loopx
1722
1723 .align 32
1724 .Lreduction_loopx:
1725 mov %r8, %rbx
1726 mulx 0(%rbp), %rax, %r8
1727 adcx %rbx, %rax
1728 adox %r9, %r8
1729
1730 mulx 8(%rbp), %rax, %r9
1731 adcx %rax, %r8
1732 adox %r10, %r9
1733
1734 mulx 16(%rbp), %rbx, %r10
1735 adcx %rbx, %r9
1736 adox %r11, %r10
1737
1738 mulx 24(%rbp), %rbx, %r11
1739 adcx %rbx, %r10
1740 adox %r12, %r11
1741
1742 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1743 mov %rdx, %rax
1744 mov %r8, %rdx
1745 adcx %rbx, %r11
1746 adox %r13, %r12
1747
1748 mulx 128+8(%rsp), %rbx, %rdx
1749 mov %rax, %rdx
1750
1751 mulx 40(%rbp), %rax, %r13
1752 adcx %rax, %r12
1753 adox %r14, %r13
1754
1755 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1756 adcx %rax, %r13
1757 adox %r15, %r14
1758
1759 mulx 56(%rbp), %rax, %r15
1760 mov %rbx, %rdx
1761 adcx %rax, %r14
1762 adox %rsi, %r15 # %rsi is 0
1763 adcx %rsi, %r15 # cf=0
1764
1765 decl %ecx # of=0
1766 jne .Lreduction_loopx
1767
1768 ret
1769 .cfi_endproc
1770 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1771 ___
1772 }
1773 { # __rsaz_512_subtract
1774 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1775 # output:
1776 # clobbers: everything but %rdi, %rsi and %rbp
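# %rcx arrives as 0 or all-ones (the callers derive it from the carry of the
# preceding 512-bit addition with "sbbq %rcx,%rcx").  Rather than branching,
# the routine stores the tentative result and then adds (-mod) AND mask:
# negq/notq build the two's complement of the modulus limb by limb (valid
# because the modulus is odd), the andq with %rcx keeps or zeroes it, and the
# final add chain applies it.  The reduction therefore contains no
# secret-dependent branch.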
1777 $code.=<<___;
1778 .type __rsaz_512_subtract,\@abi-omnipotent
1779 .align 32
1780 __rsaz_512_subtract:
1781 .cfi_startproc
1782 movq %r8, ($out)
1783 movq %r9, 8($out)
1784 movq %r10, 16($out)
1785 movq %r11, 24($out)
1786 movq %r12, 32($out)
1787 movq %r13, 40($out)
1788 movq %r14, 48($out)
1789 movq %r15, 56($out)
1790
1791 movq 0($mod), %r8
1792 movq 8($mod), %r9
1793 negq %r8
1794 notq %r9
1795 andq %rcx, %r8
1796 movq 16($mod), %r10
1797 andq %rcx, %r9
1798 notq %r10
1799 movq 24($mod), %r11
1800 andq %rcx, %r10
1801 notq %r11
1802 movq 32($mod), %r12
1803 andq %rcx, %r11
1804 notq %r12
1805 movq 40($mod), %r13
1806 andq %rcx, %r12
1807 notq %r13
1808 movq 48($mod), %r14
1809 andq %rcx, %r13
1810 notq %r14
1811 movq 56($mod), %r15
1812 andq %rcx, %r14
1813 notq %r15
1814 andq %rcx, %r15
1815
1816 addq ($out), %r8
1817 adcq 8($out), %r9
1818 adcq 16($out), %r10
1819 adcq 24($out), %r11
1820 adcq 32($out), %r12
1821 adcq 40($out), %r13
1822 adcq 48($out), %r14
1823 adcq 56($out), %r15
1824
1825 movq %r8, ($out)
1826 movq %r9, 8($out)
1827 movq %r10, 16($out)
1828 movq %r11, 24($out)
1829 movq %r12, 32($out)
1830 movq %r13, 40($out)
1831 movq %r14, 48($out)
1832 movq %r15, 56($out)
1833
1834 ret
1835 .cfi_endproc
1836 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1837 ___
1838 }
1839 { # __rsaz_512_mul
1840 #
1841 # input: %rsi - ap, %rbp - bp
1842 # output:
1843 # clobbers: everything
1844 my ($ap,$bp) = ("%rsi","%rbp");
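# Plain schoolbook multiplication: the straight-line code computes a[]*b[0]
# and spills the lowest limb, then each of the seven passes of .Loop_mul adds
# a[]*b[j] one limb higher and spills one more low limb.  The 16-limb product
# ends up at 8(%rsp)..128(%rsp) inside this routine, i.e. at
# (%rsp)..120(%rsp) in the caller's frame (the call shifted %rsp by 8).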
1845 $code.=<<___;
1846 .type __rsaz_512_mul,\@abi-omnipotent
1847 .align 32
1848 __rsaz_512_mul:
1849 .cfi_startproc
1850 leaq 8(%rsp), %rdi
1851
1852 movq ($ap), %rax
1853 mulq %rbx
1854 movq %rax, (%rdi)
1855 movq 8($ap), %rax
1856 movq %rdx, %r8
1857
1858 mulq %rbx
1859 addq %rax, %r8
1860 movq 16($ap), %rax
1861 movq %rdx, %r9
1862 adcq \$0, %r9
1863
1864 mulq %rbx
1865 addq %rax, %r9
1866 movq 24($ap), %rax
1867 movq %rdx, %r10
1868 adcq \$0, %r10
1869
1870 mulq %rbx
1871 addq %rax, %r10
1872 movq 32($ap), %rax
1873 movq %rdx, %r11
1874 adcq \$0, %r11
1875
1876 mulq %rbx
1877 addq %rax, %r11
1878 movq 40($ap), %rax
1879 movq %rdx, %r12
1880 adcq \$0, %r12
1881
1882 mulq %rbx
1883 addq %rax, %r12
1884 movq 48($ap), %rax
1885 movq %rdx, %r13
1886 adcq \$0, %r13
1887
1888 mulq %rbx
1889 addq %rax, %r13
1890 movq 56($ap), %rax
1891 movq %rdx, %r14
1892 adcq \$0, %r14
1893
1894 mulq %rbx
1895 addq %rax, %r14
1896 movq ($ap), %rax
1897 movq %rdx, %r15
1898 adcq \$0, %r15
1899
1900 leaq 8($bp), $bp
1901 leaq 8(%rdi), %rdi
1902
1903 movl \$7, %ecx
1904 jmp .Loop_mul
1905
1906 .align 32
1907 .Loop_mul:
1908 movq ($bp), %rbx
1909 mulq %rbx
1910 addq %rax, %r8
1911 movq 8($ap), %rax
1912 movq %r8, (%rdi)
1913 movq %rdx, %r8
1914 adcq \$0, %r8
1915
1916 mulq %rbx
1917 addq %rax, %r9
1918 movq 16($ap), %rax
1919 adcq \$0, %rdx
1920 addq %r9, %r8
1921 movq %rdx, %r9
1922 adcq \$0, %r9
1923
1924 mulq %rbx
1925 addq %rax, %r10
1926 movq 24($ap), %rax
1927 adcq \$0, %rdx
1928 addq %r10, %r9
1929 movq %rdx, %r10
1930 adcq \$0, %r10
1931
1932 mulq %rbx
1933 addq %rax, %r11
1934 movq 32($ap), %rax
1935 adcq \$0, %rdx
1936 addq %r11, %r10
1937 movq %rdx, %r11
1938 adcq \$0, %r11
1939
1940 mulq %rbx
1941 addq %rax, %r12
1942 movq 40($ap), %rax
1943 adcq \$0, %rdx
1944 addq %r12, %r11
1945 movq %rdx, %r12
1946 adcq \$0, %r12
1947
1948 mulq %rbx
1949 addq %rax, %r13
1950 movq 48($ap), %rax
1951 adcq \$0, %rdx
1952 addq %r13, %r12
1953 movq %rdx, %r13
1954 adcq \$0, %r13
1955
1956 mulq %rbx
1957 addq %rax, %r14
1958 movq 56($ap), %rax
1959 adcq \$0, %rdx
1960 addq %r14, %r13
1961 movq %rdx, %r14
1962 leaq 8($bp), $bp
1963 adcq \$0, %r14
1964
1965 mulq %rbx
1966 addq %rax, %r15
1967 movq ($ap), %rax
1968 adcq \$0, %rdx
1969 addq %r15, %r14
1970 movq %rdx, %r15
1971 adcq \$0, %r15
1972
1973 leaq 8(%rdi), %rdi
1974
1975 decl %ecx
1976 jnz .Loop_mul
1977
1978 movq %r8, (%rdi)
1979 movq %r9, 8(%rdi)
1980 movq %r10, 16(%rdi)
1981 movq %r11, 24(%rdi)
1982 movq %r12, 32(%rdi)
1983 movq %r13, 40(%rdi)
1984 movq %r14, 48(%rdi)
1985 movq %r15, 56(%rdi)
1986
1987 ret
1988 .cfi_endproc
1989 .size __rsaz_512_mul,.-__rsaz_512_mul
1990 ___
1991 }
1992 if ($addx) {
1993 # __rsaz_512_mulx
1994 #
1995 # input: %rsi - ap, %rbp - bp
1996 # output:
1997 # clobbers: everything
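# mulx produces a full 128-bit product without touching the flags, while adcx
# and adox add with carry through CF and OF respectively.  Each pass of
# .Loop_mulx therefore keeps two independent carry chains in flight, roughly:
#
#	lo:hi     = a[i] * b[j]		# mulx, flags preserved
#	acc[i]   += lo			# adcx, CF chain
#	acc[i+1] += hi			# adox, OF chain
#
# which removes the serializing flag dependency of the mul/adc code path in
# __rsaz_512_mul above.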
1998 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1999 $code.=<<___;
2000 .type __rsaz_512_mulx,\@abi-omnipotent
2001 .align 32
2002 __rsaz_512_mulx:
2003 .cfi_startproc
2004 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
2005 mov \$-6, %rcx
2006
2007 mulx 8($ap), %rax, %r9
2008 movq %rbx, 8(%rsp)
2009
2010 mulx 16($ap), %rbx, %r10
2011 adc %rax, %r8
2012
2013 mulx 24($ap), %rax, %r11
2014 adc %rbx, %r9
2015
2016 mulx 32($ap), %rbx, %r12
2017 adc %rax, %r10
2018
2019 mulx 40($ap), %rax, %r13
2020 adc %rbx, %r11
2021
2022 mulx 48($ap), %rbx, %r14
2023 adc %rax, %r12
2024
2025 mulx 56($ap), %rax, %r15
2026 mov 8($bp), %rdx
2027 adc %rbx, %r13
2028 adc %rax, %r14
2029 adc \$0, %r15
2030
2031 xor $zero, $zero # cf=0,of=0
2032 jmp .Loop_mulx
2033
2034 .align 32
2035 .Loop_mulx:
2036 movq %r8, %rbx
2037 mulx ($ap), %rax, %r8
2038 adcx %rax, %rbx
2039 adox %r9, %r8
2040
2041 mulx 8($ap), %rax, %r9
2042 adcx %rax, %r8
2043 adox %r10, %r9
2044
2045 mulx 16($ap), %rax, %r10
2046 adcx %rax, %r9
2047 adox %r11, %r10
2048
2049 mulx 24($ap), %rax, %r11
2050 adcx %rax, %r10
2051 adox %r12, %r11
2052
2053 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2054 adcx %rax, %r11
2055 adox %r13, %r12
2056
2057 mulx 40($ap), %rax, %r13
2058 adcx %rax, %r12
2059 adox %r14, %r13
2060
2061 mulx 48($ap), %rax, %r14
2062 adcx %rax, %r13
2063 adox %r15, %r14
2064
2065 mulx 56($ap), %rax, %r15
2066 movq 64($bp,%rcx,8), %rdx
2067 movq %rbx, 8+64-8(%rsp,%rcx,8)
2068 adcx %rax, %r14
2069 adox $zero, %r15
2070 adcx $zero, %r15 # cf=0
2071
2072 inc %rcx # of=0
2073 jnz .Loop_mulx
2074
2075 movq %r8, %rbx
2076 mulx ($ap), %rax, %r8
2077 adcx %rax, %rbx
2078 adox %r9, %r8
2079
2080 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2081 adcx %rax, %r8
2082 adox %r10, %r9
2083
2084 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2085 adcx %rax, %r9
2086 adox %r11, %r10
2087
2088 mulx 24($ap), %rax, %r11
2089 adcx %rax, %r10
2090 adox %r12, %r11
2091
2092 mulx 32($ap), %rax, %r12
2093 adcx %rax, %r11
2094 adox %r13, %r12
2095
2096 mulx 40($ap), %rax, %r13
2097 adcx %rax, %r12
2098 adox %r14, %r13
2099
2100 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2101 adcx %rax, %r13
2102 adox %r15, %r14
2103
2104 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2105 adcx %rax, %r14
2106 adox $zero, %r15
2107 adcx $zero, %r15
2108
2109 mov %rbx, 8+64-8(%rsp)
2110 mov %r8, 8+64(%rsp)
2111 mov %r9, 8+64+8(%rsp)
2112 mov %r10, 8+64+16(%rsp)
2113 mov %r11, 8+64+24(%rsp)
2114 mov %r12, 8+64+32(%rsp)
2115 mov %r13, 8+64+40(%rsp)
2116 mov %r14, 8+64+48(%rsp)
2117 mov %r15, 8+64+56(%rsp)
2118
2119 ret
2120 .cfi_endproc
2121 .size __rsaz_512_mulx,.-__rsaz_512_mulx
2122 ___
2123 }
2124 {
2125 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
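# Table layout used by scatter4/gather4: entry "power" (0..15) keeps its k-th
# limb at table + power*8 + k*128, so limb k of all sixteen entries shares one
# 128-byte block.  rsaz_512_scatter4 writes with that stride; rsaz_512_gather4
# (and the gather in rsaz_512_mul_gather4 above) reads every block in full and
# masks out all but the wanted entry, keeping the access pattern independent
# of the secret "power".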
2126 $code.=<<___;
2127 .globl rsaz_512_scatter4
2128 .type rsaz_512_scatter4,\@abi-omnipotent
2129 .align 16
2130 rsaz_512_scatter4:
2131 .cfi_startproc
2132 leaq ($out,$power,8), $out
2133 movl \$8, %r9d
2134 jmp .Loop_scatter
2135 .align 16
2136 .Loop_scatter:
2137 movq ($inp), %rax
2138 leaq 8($inp), $inp
2139 movq %rax, ($out)
2140 leaq 128($out), $out
2141 decl %r9d
2142 jnz .Loop_scatter
2143 ret
2144 .cfi_endproc
2145 .size rsaz_512_scatter4,.-rsaz_512_scatter4
2146
2147 .globl rsaz_512_gather4
2148 .type rsaz_512_gather4,\@abi-omnipotent
2149 .align 16
2150 rsaz_512_gather4:
2151 .cfi_startproc
2152 ___
2153 $code.=<<___ if ($win64);
2154 .LSEH_begin_rsaz_512_gather4:
2155 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2156 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2157 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2158 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2159 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2160 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2161 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2162 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2163 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2164 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2165 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2166 ___
2167 $code.=<<___;
2168 movd $power,%xmm8
2169 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2170 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2171
2172 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2173 movdqa %xmm1,%xmm7
2174 movdqa %xmm1,%xmm2
2175 ___
2176 ########################################################################
2177 # calculate mask by comparing 0..15 to $power
2178 #
2179 for($i=0;$i<4;$i++) {
2180 $code.=<<___;
2181 paddd %xmm`$i`,%xmm`$i+1`
2182 pcmpeqd %xmm8,%xmm`$i`
2183 movdqa %xmm7,%xmm`$i+3`
2184 ___
2185 }
2186 for(;$i<7;$i++) {
2187 $code.=<<___;
2188 paddd %xmm`$i`,%xmm`$i+1`
2189 pcmpeqd %xmm8,%xmm`$i`
2190 ___
2191 }
2192 $code.=<<___;
2193 pcmpeqd %xmm8,%xmm7
2194 movl \$8, %r9d
2195 jmp .Loop_gather
2196 .align 16
2197 .Loop_gather:
2198 movdqa 16*0($inp),%xmm8
2199 movdqa 16*1($inp),%xmm9
2200 movdqa 16*2($inp),%xmm10
2201 movdqa 16*3($inp),%xmm11
2202 pand %xmm0,%xmm8
2203 movdqa 16*4($inp),%xmm12
2204 pand %xmm1,%xmm9
2205 movdqa 16*5($inp),%xmm13
2206 pand %xmm2,%xmm10
2207 movdqa 16*6($inp),%xmm14
2208 pand %xmm3,%xmm11
2209 movdqa 16*7($inp),%xmm15
2210 leaq 128($inp), $inp
2211 pand %xmm4,%xmm12
2212 pand %xmm5,%xmm13
2213 pand %xmm6,%xmm14
2214 pand %xmm7,%xmm15
2215 por %xmm10,%xmm8
2216 por %xmm11,%xmm9
2217 por %xmm12,%xmm8
2218 por %xmm13,%xmm9
2219 por %xmm14,%xmm8
2220 por %xmm15,%xmm9
2221
2222 por %xmm9,%xmm8
2223 pshufd \$0x4e,%xmm8,%xmm9
2224 por %xmm9,%xmm8
2225 movq %xmm8,($out)
2226 leaq 8($out), $out
2227 decl %r9d
2228 jnz .Loop_gather
2229 ___
2230 $code.=<<___ if ($win64);
2231 movaps 0x00(%rsp),%xmm6
2232 movaps 0x10(%rsp),%xmm7
2233 movaps 0x20(%rsp),%xmm8
2234 movaps 0x30(%rsp),%xmm9
2235 movaps 0x40(%rsp),%xmm10
2236 movaps 0x50(%rsp),%xmm11
2237 movaps 0x60(%rsp),%xmm12
2238 movaps 0x70(%rsp),%xmm13
2239 movaps 0x80(%rsp),%xmm14
2240 movaps 0x90(%rsp),%xmm15
2241 add \$0xa8,%rsp
2242 ___
2243 $code.=<<___;
2244 ret
2245 .LSEH_end_rsaz_512_gather4:
2246 .cfi_endproc
2247 .size rsaz_512_gather4,.-rsaz_512_gather4
2248
2249 .align 64
2250 .Linc:
2251 .long 0,0, 1,1
2252 .long 2,2, 2,2
2253 ___
2254 }
2255
2256 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2257 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2258 if ($win64) {
2259 $rec="%rcx";
2260 $frame="%rdx";
2261 $context="%r8";
2262 $disp="%r9";
2263
2264 $code.=<<___;
2265 .extern __imp_RtlVirtualUnwind
2266 .type se_handler,\@abi-omnipotent
2267 .align 16
2268 se_handler:
2269 push %rsi
2270 push %rdi
2271 push %rbx
2272 push %rbp
2273 push %r12
2274 push %r13
2275 push %r14
2276 push %r15
2277 pushfq
2278 sub \$64,%rsp
2279
2280 mov 120($context),%rax # pull context->Rax
2281 mov 248($context),%rbx # pull context->Rip
2282
2283 mov 8($disp),%rsi # disp->ImageBase
2284 mov 56($disp),%r11 # disp->HandlerData
2285
2286 mov 0(%r11),%r10d # HandlerData[0]
2287 lea (%rsi,%r10),%r10 # end of prologue label
2288 cmp %r10,%rbx # context->Rip<end of prologue label
2289 jb .Lcommon_seh_tail
2290
2291 mov 152($context),%rax # pull context->Rsp
2292
2293 mov 4(%r11),%r10d # HandlerData[1]
2294 lea (%rsi,%r10),%r10 # epilogue label
2295 cmp %r10,%rbx # context->Rip>=epilogue label
2296 jae .Lcommon_seh_tail
2297
2298 lea 128+24+48(%rax),%rax
2299
2300 lea .Lmul_gather4_epilogue(%rip),%rbx
2301 cmp %r10,%rbx
2302 jne .Lse_not_in_mul_gather4
2303
2304 lea 0xb0(%rax),%rax
2305
2306 lea -48-0xa8(%rax),%rsi
2307 lea 512($context),%rdi
2308 mov \$20,%ecx
2309 .long 0xa548f3fc # cld; rep movsq
2310
2311 .Lse_not_in_mul_gather4:
2312 mov -8(%rax),%rbx
2313 mov -16(%rax),%rbp
2314 mov -24(%rax),%r12
2315 mov -32(%rax),%r13
2316 mov -40(%rax),%r14
2317 mov -48(%rax),%r15
2318 mov %rbx,144($context) # restore context->Rbx
2319 mov %rbp,160($context) # restore context->Rbp
2320 mov %r12,216($context) # restore context->R12
2321 mov %r13,224($context) # restore context->R13
2322 mov %r14,232($context) # restore context->R14
2323 mov %r15,240($context) # restore context->R15
2324
2325 .Lcommon_seh_tail:
2326 mov 8(%rax),%rdi
2327 mov 16(%rax),%rsi
2328 mov %rax,152($context) # restore context->Rsp
2329 mov %rsi,168($context) # restore context->Rsi
2330 mov %rdi,176($context) # restore context->Rdi
2331
2332 mov 40($disp),%rdi # disp->ContextRecord
2333 mov $context,%rsi # context
2334 mov \$154,%ecx # sizeof(CONTEXT)
2335 .long 0xa548f3fc # cld; rep movsq
2336
2337 mov $disp,%rsi
2338 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2339 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2340 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2341 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2342 mov 40(%rsi),%r10 # disp->ContextRecord
2343 lea 56(%rsi),%r11 # &disp->HandlerData
2344 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2345 mov %r10,32(%rsp) # arg5
2346 mov %r11,40(%rsp) # arg6
2347 mov %r12,48(%rsp) # arg7
2348 mov %rcx,56(%rsp) # arg8, (NULL)
2349 call *__imp_RtlVirtualUnwind(%rip)
2350
2351 mov \$1,%eax # ExceptionContinueSearch
2352 add \$64,%rsp
2353 popfq
2354 pop %r15
2355 pop %r14
2356 pop %r13
2357 pop %r12
2358 pop %rbp
2359 pop %rbx
2360 pop %rdi
2361 pop %rsi
2362 ret
2363 .size se_handler,.-se_handler
2364
2365 .section .pdata
2366 .align 4
2367 .rva .LSEH_begin_rsaz_512_sqr
2368 .rva .LSEH_end_rsaz_512_sqr
2369 .rva .LSEH_info_rsaz_512_sqr
2370
2371 .rva .LSEH_begin_rsaz_512_mul
2372 .rva .LSEH_end_rsaz_512_mul
2373 .rva .LSEH_info_rsaz_512_mul
2374
2375 .rva .LSEH_begin_rsaz_512_mul_gather4
2376 .rva .LSEH_end_rsaz_512_mul_gather4
2377 .rva .LSEH_info_rsaz_512_mul_gather4
2378
2379 .rva .LSEH_begin_rsaz_512_mul_scatter4
2380 .rva .LSEH_end_rsaz_512_mul_scatter4
2381 .rva .LSEH_info_rsaz_512_mul_scatter4
2382
2383 .rva .LSEH_begin_rsaz_512_mul_by_one
2384 .rva .LSEH_end_rsaz_512_mul_by_one
2385 .rva .LSEH_info_rsaz_512_mul_by_one
2386
2387 .rva .LSEH_begin_rsaz_512_gather4
2388 .rva .LSEH_end_rsaz_512_gather4
2389 .rva .LSEH_info_rsaz_512_gather4
2390
2391 .section .xdata
2392 .align 8
2393 .LSEH_info_rsaz_512_sqr:
2394 .byte 9,0,0,0
2395 .rva se_handler
2396 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2397 .LSEH_info_rsaz_512_mul:
2398 .byte 9,0,0,0
2399 .rva se_handler
2400 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2401 .LSEH_info_rsaz_512_mul_gather4:
2402 .byte 9,0,0,0
2403 .rva se_handler
2404 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2405 .LSEH_info_rsaz_512_mul_scatter4:
2406 .byte 9,0,0,0
2407 .rva se_handler
2408 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2409 .LSEH_info_rsaz_512_mul_by_one:
2410 .byte 9,0,0,0
2411 .rva se_handler
2412 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2413 .LSEH_info_rsaz_512_gather4:
2414 .byte 0x01,0x46,0x16,0x00
2415 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2416 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2417 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2418 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2419 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2420 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2421 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2422 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2423 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2424 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2425 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
2426 ___
2427 }
2428
2429 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2430 print $code;
2431 close STDOUT;