#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY         #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,     #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis   #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing  #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

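# The C driver (see crypto/bn/rsaz_exp.c) composes these primitives into
# a fixed-window Montgomery exponentiation.  A hedged sketch of the shape
# of that composition (illustrative pseudo-Perl, not the actual driver):
#
#	rsaz_512_mul($a, $a0, $rr, $m, $n0);	# enter Montgomery domain
#	rsaz_512_scatter4($tbl, $a, 1);		# build $tbl[i] = a^i in
#	...					# Montgomery form, i=0..15
#	for each 4-bit window $w of the exponent {
#	    rsaz_512_sqr($r, $r, $m, $n0, 4);	# r <- r^16
#	    rsaz_512_mul_gather4($r, $r, $tbl, $m, $n0, $w); # r <- r*tbl[w]
#	}
#	rsaz_512_mul_by_one($r, $r, $m, $n0);	# leave Montgomery domain
#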
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
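
# $addx gates the MULX/ADCX/ADOX code paths below; the probes above only
# set it when the assembler (or nasm/ml64 on Windows) is recent enough
# to encode those instructions.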

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %rbp		# common argument
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
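# The eight iterations below implement schoolbook squaring: with
# a = \sum a_i*2^(64i),
#   a^2 = \sum_i a_i^2*2^(128i) + 2*\sum_{i<j} a_i*a_j*2^(64(i+j)).
# Iteration i forms the cross products a[i]*a[i+1..7] with mul/adc,
# doubles the finished column (the add/adc and lea pairs emulate shld),
# and folds in the square term a[i]^2.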
	movq	%rdx, %rbx
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	addq	%r8, %r8		#shlq	\$1, %r8
	movq	%r9, %rcx
	adcq	%r9, %r9		#shld	\$1, %r8, %r9

	mulq	%rax
	movq	%rax, (%rsp)
	addq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r8, 8(%rsp)
	shrq	\$63, %rcx

#second iteration
	movq	8($inp), %r8
	movq	16($inp), %rax
	mulq	%r8
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r15
	movq	%r8, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %r8
	movq	%r10, %rdx
	adcq	\$0, %r8

	add	%rdx, %rdx
	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	movq	%r11, %rbx
	adcq	%r11, %r11		#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)
	shrq	\$63, %rbx

#third iteration
	movq	16($inp), %r9
	movq	24($inp), %rax
	mulq	%r9
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	movq	%r12, %r10
	lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	shrq	\$63, %r10
	addq	%rax, %r8
	movq	%r9, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	movq	%r13, %rcx
	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)
	shrq	\$63, %rcx

#fourth iteration
	movq	24($inp), %r10
	movq	32($inp), %rax
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	movq	%r14, %r12
	leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	shrq	\$63, %r12
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	movq	%r15, %rbx
	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15

	mulq	%rax
	addq	%rax, %r13
	adcq	%rdx, %r14
	adcq	\$0, %r15

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
	shrq	\$63, %rbx

#fifth iteration
	movq	32($inp), %r11
	movq	40($inp), %rax
	mulq	%r11
	addq	%rax, %r8
	movq	48($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	movq	%r8, %r12
	leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	shrq	\$63, %r12
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	movq	%r9, %rcx
	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9

	mulq	%rax
	addq	%rax, %r15
	adcq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)
	shrq	\$63, %rcx

#sixth iteration
	movq	40($inp), %r12
	movq	48($inp), %rax
	mulq	%r12
	addq	%rax, %r10
	movq	56($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	movq	%r10, %r15
	leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	adcq	\$0, %rdx
	shrq	\$63, %r15
	addq	%rbx, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	movq	%r11, %rbx
	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	movq	48($inp), %r13
	movq	56($inp), %rax
	mulq	%r13
	addq	%rax, %r12
	movq	%r13, %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	xorq	%r14, %r14
	shlq	\$1, %rbx
	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
	adcq	%r13, %r13		#shld	\$1, %r12, %r13
	adcq	%r14, %r14		#shld	\$1, %r13, %r14

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	movq	56($inp), %rax
	mulq	%rax
	addq	%rax, %r13
	adcq	\$0, %rdx

	addq	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
	movq	%rbp, %xmm1		# off-load
#first iteration
	mulx	%rax, %r8, %r9

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shl	\$1, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rdx, %r8
	mov	8($inp), %rdx
	adcx	%rbp, %r9

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
	adox	$out, %r11
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	adox	%rbp, %r8

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp,%ebp
	mulx	%rdx, %rax, %rcx
	mov	16($inp), %rdx
	adcx	%rax, %r9
	adcx	%rcx, %r10
	adcx	%rbp, %r11

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	adox	%rbp, %r9

	mov	%r13, %rcx
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	mov	24($inp), %rdx
	adcx	%rbp, %r13

	mov	%r11, 32(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)

#fourth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	adox	%rbp, %r10

	.byte	0x66
	mov	%r15, %rbx
	shld	\$1, %r14, %r15
	shld	\$1, %rcx, %r14

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r13
	adcx	%rdx, %r14
	mov	32($inp), %rdx
	adcx	%rbp, %r15

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	adox	%rbp, %r11

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shld	\$1, %rbx, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r15
	adcx	%rdx, %r8
	mov	40($inp), %rdx
	adcx	%rbp, %r9

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	adox	%rbp, %r12

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r9
	adcx	%rdx, %r10
	mov	48($inp), %rdx
	adcx	%rbp, %r11

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	xor	%r14, %r14
	shld	\$1, %r13, %r14
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	mov	56($inp), %rdx
	adcx	%rbp, %r13

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	adox	%rax, %r13
	adox	%rbp, %rdx

	.byte	0x66
	add	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)
	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
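# Each of xmm0..xmm7 ends up holding an index pair {2i,2i+1}; pcmpeqd
# against the broadcast $power turns the matching 64-bit lane into
# all-ones and every other lane into zero.  The gather code then pand's
# all 16 table lines with these masks and por's them together, so every
# line is read whatever power is selected -- a cache-timing side-channel
# countermeasure.
#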
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_gather4_epilogue:
	ret
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_scatter4_epilogue:
	ret
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
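	#
	# Each pass of .Lreduction_loop multiplies the running value by
	# n0 = -mod^{-1} mod 2^64, adds that multiple of mod (zeroing the
	# low limb) and drops the limb; eight passes make a word-by-word
	# Montgomery reduction by 2^512.  A reference model in Perl (a
	# hedged sketch kept as a comment; mont_reduce512 is an
	# illustrative name, not part of this module):
	#
	#	use Math::BigInt;
	#	sub mont_reduce512 {
	#	    my ($t, $mod, $n0) = @_;	# $n0 == -$mod^{-1} mod 2^64
	#	    my $w = Math::BigInt->new(2)->bpow(64);
	#	    for (1..8) {
	#		my $m = ($t % $w) * $n0 % $w;	# multiplier, %rbx below
	#		$t = ($t + $m * $mod) / $w;	# low limb now 0; drop it
	#	    }
	#	    return $t;	# == t*2^{-512} mod $mod, up to one extra $mod
	#	}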
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
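	#
	# ADX flavour of __rsaz_512_reduce: mathematically the same
	# word-by-word Montgomery reduction, but each row is built from
	# MULX with the two independent ADCX/ADOX carry chains, and the
	# next multiplier n0*r8 mod 2^64 is computed mid-row with a spare
	# mulx against 128+8(%rsp).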
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input:	%r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers:	everything but %rdi, %rsi and %rbp
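	#
	# The mask in %rcx is 0 or all-ones, produced by `sbbq %rcx,%rcx`
	# in the callers from the carry of the preceding 512-bit addition.
	# The code stores the sum, builds (-mod) & mask limb by limb --
	# negq on the low limb plus notq on the rest is exact because the
	# modulus is odd -- and adds it back, i.e. it subtracts the modulus
	# precisely when the mask is set, with no data-dependent branch.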
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
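	#
	# Plain schoolbook multiply-accumulate: the prologue computes the
	# a[]*b[0] row and .Loop_mul folds in one a[]*b[i] row per pass
	# (7 passes).  The 16-limb product is written starting at 8(%rsp),
	# which is the caller's (%rsp) once the return address pushed by
	# `call` is accounted for.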
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
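	#
	# MULX leaves the flags untouched, and ADCX/ADOX drive CF and OF
	# as two independent carry chains, so .Loop_mulx can interleave
	# the fresh-row carries (adcx) with the accumulate-previous-row
	# carries (adox) instead of serializing on a single carry flag.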
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
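# (the same constant-time mask construction as in rsaz_512_mul_gather4
# above: only the lane whose index equals $power survives the pcmpeqd)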
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;