#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                         #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY           #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,     #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While the original submission covers both 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign, which via the CRT amounts to a pair of
# 512-bit modular exponentiations). This is because the improvement for
# longer keys is not high enough to justify the effort; the highest
# measured was ~5% on Westmere. [This is relative to OpenSSL 1.0.2,
# upcoming at the time of this writing!] Nor does this module implement
# a "monolithic" complete-exponentiation jumbo-subroutine; it adheres
# to a more modular mixture of C and assembly. And it's optimized even
# for processors other than the Intel Core family (see the table below
# for improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

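# This file is a perlasm generator rather than ready-made assembly: it
# writes flavour-specific output through x86_64-xlate.pl.  Typical
# invocations look like the following (illustrative only; the real
# arguments are supplied by the OpenSSL build system):
#
#	perl rsaz-x86_64.pl elf  rsaz-x86_64.s		# Unix, ELF flavour
#	perl rsaz-x86_64.pl nasm rsaz-x86_64.asm	# Win64, NASM flavour
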
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %rbp		# common argument
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	addq	%r8, %r8		#shlq	\$1, %r8
	movq	%r9, %rcx
	adcq	%r9, %r9		#shld	\$1, %r8, %r9

	mulq	%rax
	movq	%rax, (%rsp)
	addq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r8, 8(%rsp)
	shrq	\$63, %rcx

#second iteration
	movq	8($inp), %r8
	movq	16($inp), %rax
	mulq	%r8
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r15
	movq	%r8, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %r8
	movq	%r10, %rdx
	adcq	\$0, %r8

	add	%rdx, %rdx
	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	movq	%r11, %rbx
	adcq	%r11, %r11		#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)
	shrq	\$63, %rbx

#third iteration
	movq	16($inp), %r9
	movq	24($inp), %rax
	mulq	%r9
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	movq	%r12, %r10
	lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	shrq	\$63, %r10
	addq	%rax, %r8
	movq	%r9, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	movq	%r13, %rcx
	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)
	shrq	\$63, %rcx

#fourth iteration
	movq	24($inp), %r10
	movq	32($inp), %rax
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	movq	%r14, %r12
	leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	shrq	\$63, %r12
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	movq	%r15, %rbx
	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15

	mulq	%rax
	addq	%rax, %r13
	adcq	%rdx, %r14
	adcq	\$0, %r15

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
	shrq	\$63, %rbx

#fifth iteration
	movq	32($inp), %r11
	movq	40($inp), %rax
	mulq	%r11
	addq	%rax, %r8
	movq	48($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	movq	%r8, %r12
	leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	shrq	\$63, %r12
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	movq	%r9, %rcx
	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9

	mulq	%rax
	addq	%rax, %r15
	adcq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)
	shrq	\$63, %rcx

#sixth iteration
	movq	40($inp), %r12
	movq	48($inp), %rax
	mulq	%r12
	addq	%rax, %r10
	movq	56($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	movq	%r10, %r15
	leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	adcq	\$0, %rdx
	shrq	\$63, %r15
	addq	%rbx, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	movq	%r11, %rbx
	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	movq	48($inp), %r13
	movq	56($inp), %rax
	mulq	%r13
	addq	%rax, %r12
	movq	%r13, %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	xorq	%r14, %r14
	shlq	\$1, %rbx
	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
	adcq	%r13, %r13		#shld	\$1, %r12, %r13
	adcq	%r14, %r14		#shld	\$1, %r13, %r14

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	movq	56($inp), %rax
	mulq	%rax
	addq	%rax, %r13
	adcq	\$0, %rdx

	addq	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
	movq	%rbp, %xmm1		# off-load
#first iteration
	mulx	%rax, %r8, %r9

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shl	\$1, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rdx, %r8
	mov	8($inp), %rdx
	adcx	%rbp, %r9

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
	adox	$out, %r11
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	adox	%rbp, %r8

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp,%ebp
	mulx	%rdx, %rax, %rcx
	mov	16($inp), %rdx
	adcx	%rax, %r9
	adcx	%rcx, %r10
	adcx	%rbp, %r11

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	adox	%rbp, %r9

	mov	%r13, %rcx
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	mov	24($inp), %rdx
	adcx	%rbp, %r13

	mov	%r11, 32(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)

#fourth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	adox	%rbp, %r10

	.byte	0x66
	mov	%r15, %rbx
	shld	\$1, %r14, %r15
	shld	\$1, %rcx, %r14

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r13
	adcx	%rdx, %r14
	mov	32($inp), %rdx
	adcx	%rbp, %r15

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	adox	%rbp, %r11

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shld	\$1, %rbx, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r15
	adcx	%rdx, %r8
	mov	40($inp), %rdx
	adcx	%rbp, %r9

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	adox	%rbp, %r12

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r9
	adcx	%rdx, %r10
	mov	48($inp), %rdx
	adcx	%rbp, %r11

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	xor	%r14, %r14
	shld	\$1, %r13, %r14
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	mov	56($inp), %rdx
	adcx	%rbp, %r13

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	adox	%rax, %r13
	adox	%rbp, %rdx

	.byte	0x66
	add	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)
	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_gather4_body:
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movl	64($bp,$pwr,4), %eax
	movq	$out, %xmm0		# off-load arguments
	movl	($bp,$pwr,4), %ebx
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)

	shlq	\$32, %rax
	or	%rax, %rbx
	movq	($ap), %rax
	movq	8($ap), %rcx
	leaq	128($bp,$pwr,4), %rbp
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	movd	(%rbp), %xmm4
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	movd	64(%rbp), %xmm5
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	pslldq	\$4, %xmm5
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	por	%xmm5, %xmm4
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	leaq	128(%rbp), %rbp
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%xmm4, %rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	movd	(%rbp), %xmm4
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	movd	64(%rbp), %xmm5
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	pslldq	\$4, %xmm5
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	por	%xmm5, %xmm4
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%xmm4, %rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	128(%rbp), %rbp
	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	mov	64($bp,$pwr,4), %eax
	movq	$out, %xmm0		# off-load arguments
	lea	128($bp,$pwr,4), %rbp
	mov	($bp,$pwr,4), %edx
	movq	$mod, %xmm1
	mov	$n0, 128(%rsp)

	shl	\$32, %rax
	or	%rax, %rdx
	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9
	movd	(%rbp), %xmm4

	mulx	16($ap), %rbx, %r10
	movd	64(%rbp), %xmm5
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	pslldq	\$4, %xmm5
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	por	%xmm5, %xmm4
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	lea	128(%rbp), %rbp
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	movq	%xmm4, %rdx
	adcx	%rbx, %r13
	adcx	%rax, %r14
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	movd	64(%rbp), %xmm5
	lea	128(%rbp), %rbp
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	pslldq	\$4, %xmm5
	por	%xmm5, %xmm4
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	%xmm4, %rdx
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	mov	128(%rsp), %rdx		# pull $n0
	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_gather4_epilogue:
	ret
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,4), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movl	%r8d, 64*0($inp)	# scatter
	shrq	\$32, %r8
	movl	%r9d, 64*2($inp)
	shrq	\$32, %r9
	movl	%r10d, 64*4($inp)
	shrq	\$32, %r10
	movl	%r11d, 64*6($inp)
	shrq	\$32, %r11
	movl	%r12d, 64*8($inp)
	shrq	\$32, %r12
	movl	%r13d, 64*10($inp)
	shrq	\$32, %r13
	movl	%r14d, 64*12($inp)
	shrq	\$32, %r14
	movl	%r15d, 64*14($inp)
	shrq	\$32, %r15
	movl	%r8d, 64*1($inp)
	movl	%r9d, 64*3($inp)
	movl	%r10d, 64*5($inp)
	movl	%r11d, 64*7($inp)
	movl	%r12d, 64*9($inp)
	movl	%r13d, 64*11($inp)
	movl	%r14d, 64*13($inp)
	movl	%r15d, 64*15($inp)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_scatter4_epilogue:
	ret
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp	# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
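	#
	# The 8-iteration loop below is word-serial Montgomery reduction.
	# One step of the reference computation it follows, sketched in
	# Perl with Math::BigInt (illustrative only, not generated code):
	#
	#	use Math::BigInt;
	#	my $w = Math::BigInt->new(1) << 64;	# word base, 2^64
	#	sub reduce_step {	# ($t, $mod, $n0) -> $t'
	#	    my ($t, $mod, $n0) = @_;
	#	    my $m = (($t % $w) * $n0) % $w;	# n0 = -mod^-1 mod 2^64
	#	    return ($t + $m * $mod) >> 64;	# low word cancels to zero
	#	}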
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input:	%r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers:	everything but %rdi, %rsi and %rbp
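	#
	# The mask in %rcx is 0 or all-ones (callers produce it with
	# sbbq %rcx,%rcx), and the neg/not/and ladder below adds the
	# two's complement of $mod under that mask, i.e. a branch-free
	# conditional subtraction.  The idea on 512-bit values, sketched
	# in Perl with Math::BigInt (illustrative only):
	#
	#	my $w    = Math::BigInt->new(1) << 512;
	#	my $mask = $borrow ? $w - 1 : Math::BigInt->new(0);
	#	$out = ($out + (($w - $mod) & $mask)) % $w;	# out -= mod iff mask set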
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp) = ("%rsi","%rbp");
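# The unrolled code below is plain schoolbook multiplication: the prologue
# multiplies a[0..7] by b[0], and each of the seven .Loop_mul iterations
# folds in a[0..7]*b[i] one limb higher.  The reference computation,
# sketched in Perl over 8 64-bit limbs with Math::BigInt (illustrative
# only, not generated code):
#
#	sub mul_512 {			# (\@a, \@b) -> 16 limbs, LSW first
#	    my ($a, $b) = @_;
#	    my @t = (Math::BigInt->new(0)) x 16;
#	    for my $i (0 .. 7) {
#	        my $carry = Math::BigInt->new(0);
#	        for my $j (0 .. 7) {
#	            my $p = $a->[$j] * $b->[$i] + $t[$i + $j] + $carry;
#	            $t[$i + $j] = $p % (Math::BigInt->new(1) << 64);
#	            $carry      = $p >> 64;
#	        }
#	        $t[$i + 8] = $carry;	# top limb of this row
#	    }
#	    return @t;
#	}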
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
87954638
AP
1822if ($addx) {
1823 # __rsaz_512_mulx
1824 #
1825 # input: %rsi - ap, %rbp - bp
0d4fb843 1826 # output:
87954638
AP
1827 # clobbers: everything
1828my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
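# The MULX path can run two carry chains at once: MULX leaves the flags
# alone, ADCX propagates through CF only and ADOX through OF only, so the
# low half of a[j]*b[i] and the high half of a[j-1]*b[i] are accumulated
# in parallel.  One .Loop_mulx column, schematically (pseudo operands,
# illustrative only):
#
#	# %rdx = b[i]
#	mulx	8*j($ap), %rax, hi	# flags untouched
#	adcx	%rax, t[j]		# carry chain 1 (CF)
#	adox	hi, t[j+1]		# carry chain 2 (OF)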
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
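# Table layout note: each 512-bit entry is stored as sixteen 32-bit words
# strided 64 bytes apart, so a gather walks the same sequence of cache
# lines whatever $power is.  Sketch of the addressing the two routines
# below implement (illustrative): 32-bit word k of entry p lives at byte
# offset
#
#	4*p + 64*k		# k = 0..15, matching leaq ($out,$power,4)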
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,4), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movl	%eax, ($out)
	shrq	\$32, %rax
	movl	%eax, 64($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
	leaq	($inp,$power,4), $inp
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movl	($inp), %eax
	movl	64($inp), %r8d
	leaq	128($inp), $inp
	shlq	\$32, %r8
	or	%r8, %rax
	movq	%rax, ($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
	ret
.size	rsaz_512_gather4,.-rsaz_512_gather4
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;