]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
0b4bb91d | 9 | |
31ed9a21 AP |
10 | ############################################################################## |
11 | # # | |
12 | # Copyright (c) 2012, Intel Corporation # | |
13 | # # | |
14 | # All rights reserved. # | |
15 | # # | |
16 | # Redistribution and use in source and binary forms, with or without # | |
17 | # modification, are permitted provided that the following conditions are # | |
18 | # met: # | |
19 | # # | |
20 | # * Redistributions of source code must retain the above copyright # | |
21 | # notice, this list of conditions and the following disclaimer. # | |
22 | # # | |
23 | # * Redistributions in binary form must reproduce the above copyright # | |
24 | # notice, this list of conditions and the following disclaimer in the # | |
25 | # documentation and/or other materials provided with the # | |
26 | # distribution. # | |
27 | # # | |
28 | # * Neither the name of the Intel Corporation nor the names of its # | |
29 | # contributors may be used to endorse or promote products derived from # | |
30 | # this software without specific prior written permission. # | |
31 | # # | |
32 | # # | |
33 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # | |
34 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # | |
35 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # | |
36 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # | |
37 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # | |
38 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # | |
39 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # | |
40 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # | |
41 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # | |
42 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # | |
43 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # | |
44 | # # | |
45 | ############################################################################## | |
46 | # Developers and authors: # | |
47 | # Shay Gueron (1, 2), and Vlad Krasnov (1) # | |
48 | # (1) Intel Architecture Group, Microprocessor and Chipset Development, # | |
49 | # Israel Development Center, Haifa, Israel # | |
50 | # (2) University of Haifa # | |
51 | ############################################################################## | |
52 | # Reference: # | |
53 | # [1] S. Gueron, "Efficient Software Implementations of Modular # | |
54 | # Exponentiation", http://eprint.iacr.org/2011/239 # | |
55 | # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # | |
56 | # IEEE Proceedings of 9th International Conference on Information # | |
57 | # Technology: New Generations (ITNG 2012), 821-823 (2012). # | |
58 | # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation# | |
59 | # Journal of Cryptographic Engineering 2:31-43 (2012). # | |
60 | # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # | |
61 | # resistant 512-bit and 1024-bit modular exponentiation for optimizing # | |
62 | # RSA1024 and RSA2048 on x86_64 platforms", # | |
63 | # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest# | |
64 | ############################################################################## | |
0b4bb91d AP |
65 | |
66 | # While original submission covers 512- and 1024-bit exponentiation, | |
67 | # this module is limited to 512-bit version only (and as such | |
68 | # accelerates RSA1024 sign). This is because improvement for longer | |
69 | # keys is not high enough to justify the effort, highest measured | |
70 | # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming | |
71 | # for the moment of this writing!] Nor does this module implement | |
72 | # "monolithic" complete exponentiation jumbo-subroutine, but adheres | |
73 | # to more modular mixture of C and assembly. And it's optimized even | |
74 | # for processors other than Intel Core family (see table below for | |
75 | # improvement coefficients). | |
76 | # <appro@openssl.org> | |
77 | # | |
78 | # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) | |
79 | # ----------------+--------------------------- | |
80 | # Opteron +13% |+5% +20% | |
81 | # Bulldozer -0% |-1% +10% | |
82 | # P4 +11% |+7% +8% | |
83 | # Westmere +5% |+14% +17% | |
84 | # Sandy Bridge +2% |+12% +29% | |
85 | # Ivy Bridge +1% |+11% +35% | |
86 | # Haswell(**) -0% |+12% +39% | |
87 | # Atom +13% |+11% +4% | |
88 | # VIA Nano +70% |+9% +25% | |
89 | # | |
90 | # (*) rsax engine and fips numbers are presented for reference | |
91 | # purposes; | |
87954638 | 92 | # (**) MULX was attempted, but found to give only marginal improvement; |
0b4bb91d AP |
93 | |
94 | $flavour = shift; | |
95 | $output = shift; | |
96 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
97 | ||
98 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
99 | ||
100 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
101 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
102 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
103 | die "can't locate x86_64-xlate.pl"; | |
104 | ||
cfe1d992 | 105 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
0b4bb91d AP |
106 | *STDOUT=*OUT; |
107 | ||
87954638 AP |
108 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
109 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
30b9c234 | 110 | $addx = ($1>=2.23); |
87954638 AP |
111 | } |
112 | ||
113 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
114 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
115 | $addx = ($1>=2.10); | |
116 | } | |
117 | ||
118 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
119 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
1b0fe79f | 120 | $addx = ($1>=12); |
87954638 AP |
121 | } |
122 | ||
b9749432 | 123 | if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { |
a356e488 AP |
124 | my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 |
125 | $addx = ($ver>=3.03); | |
126 | } | |
127 | ||
0b4bb91d AP |
128 | ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API |
129 | { | |
130 | my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); | |
131 | ||
132 | $code.=<<___; | |
133 | .text | |
134 | ||
87954638 AP |
135 | .extern OPENSSL_ia32cap_P |
136 | ||
0b4bb91d | 137 | .globl rsaz_512_sqr |
6efef384 | 138 | .type rsaz_512_sqr,\@function,5 |
0b4bb91d AP |
139 | .align 32 |
140 | rsaz_512_sqr: # 25-29% faster than rsaz_512_mul | |
141 | push %rbx | |
142 | push %rbp | |
143 | push %r12 | |
144 | push %r13 | |
145 | push %r14 | |
146 | push %r15 | |
147 | ||
148 | subq \$128+24, %rsp | |
149 | .Lsqr_body: | |
150 | movq $mod, %rbp # common argument | |
151 | movq ($inp), %rdx | |
152 | movq 8($inp), %rax | |
153 | movq $n0, 128(%rsp) | |
87954638 AP |
154 | ___ |
155 | $code.=<<___ if ($addx); | |
156 | movl \$0x80100,%r11d | |
157 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
158 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
159 | je .Loop_sqrx | |
160 | ___ | |
161 | $code.=<<___; | |
0b4bb91d AP |
162 | jmp .Loop_sqr |
163 | ||
164 | .align 32 | |
165 | .Loop_sqr: | |
166 | movl $times,128+8(%rsp) | |
0b4bb91d AP |
167 | #first iteration |
168 | movq %rdx, %rbx | |
169 | mulq %rdx | |
170 | movq %rax, %r8 | |
171 | movq 16($inp), %rax | |
172 | movq %rdx, %r9 | |
173 | ||
174 | mulq %rbx | |
175 | addq %rax, %r9 | |
176 | movq 24($inp), %rax | |
177 | movq %rdx, %r10 | |
178 | adcq \$0, %r10 | |
179 | ||
180 | mulq %rbx | |
181 | addq %rax, %r10 | |
182 | movq 32($inp), %rax | |
183 | movq %rdx, %r11 | |
184 | adcq \$0, %r11 | |
185 | ||
186 | mulq %rbx | |
187 | addq %rax, %r11 | |
188 | movq 40($inp), %rax | |
189 | movq %rdx, %r12 | |
190 | adcq \$0, %r12 | |
191 | ||
192 | mulq %rbx | |
193 | addq %rax, %r12 | |
194 | movq 48($inp), %rax | |
195 | movq %rdx, %r13 | |
196 | adcq \$0, %r13 | |
197 | ||
198 | mulq %rbx | |
199 | addq %rax, %r13 | |
200 | movq 56($inp), %rax | |
201 | movq %rdx, %r14 | |
202 | adcq \$0, %r14 | |
203 | ||
204 | mulq %rbx | |
205 | addq %rax, %r14 | |
206 | movq %rbx, %rax | |
207 | movq %rdx, %r15 | |
208 | adcq \$0, %r15 | |
209 | ||
210 | addq %r8, %r8 #shlq \$1, %r8 | |
211 | movq %r9, %rcx | |
212 | adcq %r9, %r9 #shld \$1, %r8, %r9 | |
213 | ||
214 | mulq %rax | |
215 | movq %rax, (%rsp) | |
216 | addq %rdx, %r8 | |
217 | adcq \$0, %r9 | |
218 | ||
219 | movq %r8, 8(%rsp) | |
220 | shrq \$63, %rcx | |
221 | ||
222 | #second iteration | |
223 | movq 8($inp), %r8 | |
224 | movq 16($inp), %rax | |
225 | mulq %r8 | |
226 | addq %rax, %r10 | |
227 | movq 24($inp), %rax | |
228 | movq %rdx, %rbx | |
229 | adcq \$0, %rbx | |
230 | ||
231 | mulq %r8 | |
232 | addq %rax, %r11 | |
233 | movq 32($inp), %rax | |
234 | adcq \$0, %rdx | |
235 | addq %rbx, %r11 | |
236 | movq %rdx, %rbx | |
237 | adcq \$0, %rbx | |
238 | ||
239 | mulq %r8 | |
240 | addq %rax, %r12 | |
241 | movq 40($inp), %rax | |
242 | adcq \$0, %rdx | |
243 | addq %rbx, %r12 | |
244 | movq %rdx, %rbx | |
245 | adcq \$0, %rbx | |
246 | ||
247 | mulq %r8 | |
248 | addq %rax, %r13 | |
249 | movq 48($inp), %rax | |
250 | adcq \$0, %rdx | |
251 | addq %rbx, %r13 | |
252 | movq %rdx, %rbx | |
253 | adcq \$0, %rbx | |
254 | ||
255 | mulq %r8 | |
256 | addq %rax, %r14 | |
257 | movq 56($inp), %rax | |
258 | adcq \$0, %rdx | |
259 | addq %rbx, %r14 | |
260 | movq %rdx, %rbx | |
261 | adcq \$0, %rbx | |
262 | ||
263 | mulq %r8 | |
264 | addq %rax, %r15 | |
265 | movq %r8, %rax | |
266 | adcq \$0, %rdx | |
267 | addq %rbx, %r15 | |
268 | movq %rdx, %r8 | |
269 | movq %r10, %rdx | |
270 | adcq \$0, %r8 | |
271 | ||
272 | add %rdx, %rdx | |
273 | lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
274 | movq %r11, %rbx | |
275 | adcq %r11, %r11 #shld \$1, %r10, %r11 | |
276 | ||
277 | mulq %rax | |
278 | addq %rax, %r9 | |
279 | adcq %rdx, %r10 | |
280 | adcq \$0, %r11 | |
281 | ||
282 | movq %r9, 16(%rsp) | |
283 | movq %r10, 24(%rsp) | |
284 | shrq \$63, %rbx | |
285 | ||
286 | #third iteration | |
287 | movq 16($inp), %r9 | |
288 | movq 24($inp), %rax | |
289 | mulq %r9 | |
290 | addq %rax, %r12 | |
291 | movq 32($inp), %rax | |
292 | movq %rdx, %rcx | |
293 | adcq \$0, %rcx | |
294 | ||
295 | mulq %r9 | |
296 | addq %rax, %r13 | |
297 | movq 40($inp), %rax | |
298 | adcq \$0, %rdx | |
299 | addq %rcx, %r13 | |
300 | movq %rdx, %rcx | |
301 | adcq \$0, %rcx | |
302 | ||
303 | mulq %r9 | |
304 | addq %rax, %r14 | |
305 | movq 48($inp), %rax | |
306 | adcq \$0, %rdx | |
307 | addq %rcx, %r14 | |
308 | movq %rdx, %rcx | |
309 | adcq \$0, %rcx | |
310 | ||
311 | mulq %r9 | |
312 | movq %r12, %r10 | |
313 | lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 | |
314 | addq %rax, %r15 | |
315 | movq 56($inp), %rax | |
316 | adcq \$0, %rdx | |
317 | addq %rcx, %r15 | |
318 | movq %rdx, %rcx | |
319 | adcq \$0, %rcx | |
320 | ||
321 | mulq %r9 | |
322 | shrq \$63, %r10 | |
323 | addq %rax, %r8 | |
324 | movq %r9, %rax | |
325 | adcq \$0, %rdx | |
326 | addq %rcx, %r8 | |
327 | movq %rdx, %r9 | |
328 | adcq \$0, %r9 | |
329 | ||
330 | movq %r13, %rcx | |
331 | leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 | |
332 | ||
333 | mulq %rax | |
334 | addq %rax, %r11 | |
335 | adcq %rdx, %r12 | |
336 | adcq \$0, %r13 | |
337 | ||
338 | movq %r11, 32(%rsp) | |
339 | movq %r12, 40(%rsp) | |
340 | shrq \$63, %rcx | |
341 | ||
342 | #fourth iteration | |
343 | movq 24($inp), %r10 | |
344 | movq 32($inp), %rax | |
345 | mulq %r10 | |
346 | addq %rax, %r14 | |
347 | movq 40($inp), %rax | |
348 | movq %rdx, %rbx | |
349 | adcq \$0, %rbx | |
350 | ||
351 | mulq %r10 | |
352 | addq %rax, %r15 | |
353 | movq 48($inp), %rax | |
354 | adcq \$0, %rdx | |
355 | addq %rbx, %r15 | |
356 | movq %rdx, %rbx | |
357 | adcq \$0, %rbx | |
358 | ||
359 | mulq %r10 | |
360 | movq %r14, %r12 | |
361 | leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 | |
362 | addq %rax, %r8 | |
363 | movq 56($inp), %rax | |
364 | adcq \$0, %rdx | |
365 | addq %rbx, %r8 | |
366 | movq %rdx, %rbx | |
367 | adcq \$0, %rbx | |
368 | ||
369 | mulq %r10 | |
370 | shrq \$63, %r12 | |
371 | addq %rax, %r9 | |
372 | movq %r10, %rax | |
373 | adcq \$0, %rdx | |
374 | addq %rbx, %r9 | |
375 | movq %rdx, %r10 | |
376 | adcq \$0, %r10 | |
377 | ||
378 | movq %r15, %rbx | |
379 | leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 | |
380 | ||
381 | mulq %rax | |
382 | addq %rax, %r13 | |
383 | adcq %rdx, %r14 | |
384 | adcq \$0, %r15 | |
385 | ||
386 | movq %r13, 48(%rsp) | |
387 | movq %r14, 56(%rsp) | |
388 | shrq \$63, %rbx | |
389 | ||
390 | #fifth iteration | |
391 | movq 32($inp), %r11 | |
392 | movq 40($inp), %rax | |
393 | mulq %r11 | |
394 | addq %rax, %r8 | |
395 | movq 48($inp), %rax | |
396 | movq %rdx, %rcx | |
397 | adcq \$0, %rcx | |
398 | ||
399 | mulq %r11 | |
400 | addq %rax, %r9 | |
401 | movq 56($inp), %rax | |
402 | adcq \$0, %rdx | |
403 | movq %r8, %r12 | |
404 | leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 | |
405 | addq %rcx, %r9 | |
406 | movq %rdx, %rcx | |
407 | adcq \$0, %rcx | |
408 | ||
409 | mulq %r11 | |
410 | shrq \$63, %r12 | |
411 | addq %rax, %r10 | |
412 | movq %r11, %rax | |
413 | adcq \$0, %rdx | |
414 | addq %rcx, %r10 | |
415 | movq %rdx, %r11 | |
416 | adcq \$0, %r11 | |
417 | ||
418 | movq %r9, %rcx | |
419 | leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 | |
420 | ||
421 | mulq %rax | |
422 | addq %rax, %r15 | |
423 | adcq %rdx, %r8 | |
424 | adcq \$0, %r9 | |
425 | ||
426 | movq %r15, 64(%rsp) | |
427 | movq %r8, 72(%rsp) | |
428 | shrq \$63, %rcx | |
429 | ||
430 | #sixth iteration | |
431 | movq 40($inp), %r12 | |
432 | movq 48($inp), %rax | |
433 | mulq %r12 | |
434 | addq %rax, %r10 | |
435 | movq 56($inp), %rax | |
436 | movq %rdx, %rbx | |
437 | adcq \$0, %rbx | |
438 | ||
439 | mulq %r12 | |
440 | addq %rax, %r11 | |
441 | movq %r12, %rax | |
442 | movq %r10, %r15 | |
443 | leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 | |
444 | adcq \$0, %rdx | |
445 | shrq \$63, %r15 | |
446 | addq %rbx, %r11 | |
447 | movq %rdx, %r12 | |
448 | adcq \$0, %r12 | |
449 | ||
450 | movq %r11, %rbx | |
451 | leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 | |
452 | ||
453 | mulq %rax | |
454 | addq %rax, %r9 | |
455 | adcq %rdx, %r10 | |
456 | adcq \$0, %r11 | |
457 | ||
458 | movq %r9, 80(%rsp) | |
459 | movq %r10, 88(%rsp) | |
460 | ||
461 | #seventh iteration | |
462 | movq 48($inp), %r13 | |
463 | movq 56($inp), %rax | |
464 | mulq %r13 | |
465 | addq %rax, %r12 | |
466 | movq %r13, %rax | |
467 | movq %rdx, %r13 | |
468 | adcq \$0, %r13 | |
469 | ||
470 | xorq %r14, %r14 | |
471 | shlq \$1, %rbx | |
472 | adcq %r12, %r12 #shld \$1, %rbx, %r12 | |
473 | adcq %r13, %r13 #shld \$1, %r12, %r13 | |
474 | adcq %r14, %r14 #shld \$1, %r13, %r14 | |
475 | ||
476 | mulq %rax | |
477 | addq %rax, %r11 | |
478 | adcq %rdx, %r12 | |
479 | adcq \$0, %r13 | |
480 | ||
481 | movq %r11, 96(%rsp) | |
482 | movq %r12, 104(%rsp) | |
483 | ||
484 | #eighth iteration | |
485 | movq 56($inp), %rax | |
486 | mulq %rax | |
487 | addq %rax, %r13 | |
488 | adcq \$0, %rdx | |
489 | ||
490 | addq %rdx, %r14 | |
491 | ||
492 | movq %r13, 112(%rsp) | |
493 | movq %r14, 120(%rsp) | |
87954638 AP |
494 | |
495 | movq (%rsp), %r8 | |
496 | movq 8(%rsp), %r9 | |
497 | movq 16(%rsp), %r10 | |
498 | movq 24(%rsp), %r11 | |
499 | movq 32(%rsp), %r12 | |
500 | movq 40(%rsp), %r13 | |
501 | movq 48(%rsp), %r14 | |
502 | movq 56(%rsp), %r15 | |
503 | ||
504 | call __rsaz_512_reduce | |
505 | ||
506 | addq 64(%rsp), %r8 | |
507 | adcq 72(%rsp), %r9 | |
508 | adcq 80(%rsp), %r10 | |
509 | adcq 88(%rsp), %r11 | |
510 | adcq 96(%rsp), %r12 | |
511 | adcq 104(%rsp), %r13 | |
512 | adcq 112(%rsp), %r14 | |
513 | adcq 120(%rsp), %r15 | |
514 | sbbq %rcx, %rcx | |
515 | ||
516 | call __rsaz_512_subtract | |
517 | ||
518 | movq %r8, %rdx | |
519 | movq %r9, %rax | |
520 | movl 128+8(%rsp), $times | |
521 | movq $out, $inp | |
522 | ||
523 | decl $times | |
524 | jnz .Loop_sqr | |
0b4bb91d | 525 | ___ |
87954638 | 526 | if ($addx) { |
0b4bb91d | 527 | $code.=<<___; |
87954638 AP |
528 | jmp .Lsqr_tail |
529 | ||
530 | .align 32 | |
531 | .Loop_sqrx: | |
532 | movl $times,128+8(%rsp) | |
0b4bb91d | 533 | movq $out, %xmm0 # off-load |
87954638 | 534 | movq %rbp, %xmm1 # off-load |
0b4bb91d AP |
535 | #first iteration |
536 | mulx %rax, %r8, %r9 | |
537 | ||
538 | mulx 16($inp), %rcx, %r10 | |
87954638 | 539 | xor %rbp, %rbp # cf=0, of=0 |
0b4bb91d AP |
540 | |
541 | mulx 24($inp), %rax, %r11 | |
87954638 | 542 | adcx %rcx, %r9 |
0b4bb91d AP |
543 | |
544 | mulx 32($inp), %rcx, %r12 | |
87954638 | 545 | adcx %rax, %r10 |
0b4bb91d AP |
546 | |
547 | mulx 40($inp), %rax, %r13 | |
87954638 | 548 | adcx %rcx, %r11 |
0b4bb91d | 549 | |
87954638 AP |
550 | .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 |
551 | adcx %rax, %r12 | |
552 | adcx %rcx, %r13 | |
0b4bb91d | 553 | |
87954638 AP |
554 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 |
555 | adcx %rax, %r14 | |
556 | adcx %rbp, %r15 # %rbp is 0 | |
0b4bb91d | 557 | |
87954638 | 558 | mov %r9, %rcx |
0b4bb91d AP |
559 | shld \$1, %r8, %r9 |
560 | shl \$1, %r8 | |
561 | ||
87954638 | 562 | xor %ebp, %ebp |
0b4bb91d | 563 | mulx %rdx, %rax, %rdx |
87954638 AP |
564 | adcx %rdx, %r8 |
565 | mov 8($inp), %rdx | |
566 | adcx %rbp, %r9 | |
0b4bb91d AP |
567 | |
568 | mov %rax, (%rsp) | |
569 | mov %r8, 8(%rsp) | |
570 | ||
571 | #second iteration | |
0b4bb91d | 572 | mulx 16($inp), %rax, %rbx |
87954638 AP |
573 | adox %rax, %r10 |
574 | adcx %rbx, %r11 | |
0b4bb91d | 575 | |
87954638 AP |
576 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 |
577 | adox $out, %r11 | |
578 | adcx %r8, %r12 | |
0b4bb91d AP |
579 | |
580 | mulx 32($inp), %rax, %rbx | |
87954638 AP |
581 | adox %rax, %r12 |
582 | adcx %rbx, %r13 | |
0b4bb91d AP |
583 | |
584 | mulx 40($inp), $out, %r8 | |
87954638 AP |
585 | adox $out, %r13 |
586 | adcx %r8, %r14 | |
0b4bb91d | 587 | |
87954638 AP |
588 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
589 | adox %rax, %r14 | |
590 | adcx %rbx, %r15 | |
0b4bb91d | 591 | |
87954638 AP |
592 | .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 |
593 | adox $out, %r15 | |
594 | adcx %rbp, %r8 | |
595 | adox %rbp, %r8 | |
0b4bb91d | 596 | |
87954638 | 597 | mov %r11, %rbx |
0b4bb91d AP |
598 | shld \$1, %r10, %r11 |
599 | shld \$1, %rcx, %r10 | |
600 | ||
87954638 | 601 | xor %ebp,%ebp |
0b4bb91d | 602 | mulx %rdx, %rax, %rcx |
87954638 AP |
603 | mov 16($inp), %rdx |
604 | adcx %rax, %r9 | |
605 | adcx %rcx, %r10 | |
606 | adcx %rbp, %r11 | |
0b4bb91d AP |
607 | |
608 | mov %r9, 16(%rsp) | |
87954638 | 609 | .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) |
0b4bb91d AP |
610 | |
611 | #third iteration | |
87954638 AP |
612 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 |
613 | adox $out, %r12 | |
614 | adcx %r9, %r13 | |
0b4bb91d AP |
615 | |
616 | mulx 32($inp), %rax, %rcx | |
87954638 AP |
617 | adox %rax, %r13 |
618 | adcx %rcx, %r14 | |
0b4bb91d AP |
619 | |
620 | mulx 40($inp), $out, %r9 | |
87954638 AP |
621 | adox $out, %r14 |
622 | adcx %r9, %r15 | |
0b4bb91d | 623 | |
87954638 AP |
624 | .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx |
625 | adox %rax, %r15 | |
626 | adcx %rcx, %r8 | |
0b4bb91d | 627 | |
87954638 AP |
628 | .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 |
629 | adox $out, %r8 | |
630 | adcx %rbp, %r9 | |
631 | adox %rbp, %r9 | |
0b4bb91d | 632 | |
87954638 | 633 | mov %r13, %rcx |
0b4bb91d AP |
634 | shld \$1, %r12, %r13 |
635 | shld \$1, %rbx, %r12 | |
636 | ||
87954638 | 637 | xor %ebp, %ebp |
0b4bb91d | 638 | mulx %rdx, %rax, %rdx |
87954638 AP |
639 | adcx %rax, %r11 |
640 | adcx %rdx, %r12 | |
641 | mov 24($inp), %rdx | |
642 | adcx %rbp, %r13 | |
0b4bb91d AP |
643 | |
644 | mov %r11, 32(%rsp) | |
87954638 | 645 | .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) |
0b4bb91d AP |
646 | |
647 | #fourth iteration | |
87954638 AP |
648 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx |
649 | adox %rax, %r14 | |
650 | adcx %rbx, %r15 | |
0b4bb91d AP |
651 | |
652 | mulx 40($inp), $out, %r10 | |
87954638 AP |
653 | adox $out, %r15 |
654 | adcx %r10, %r8 | |
0b4bb91d AP |
655 | |
656 | mulx 48($inp), %rax, %rbx | |
87954638 AP |
657 | adox %rax, %r8 |
658 | adcx %rbx, %r9 | |
0b4bb91d AP |
659 | |
660 | mulx 56($inp), $out, %r10 | |
87954638 AP |
661 | adox $out, %r9 |
662 | adcx %rbp, %r10 | |
663 | adox %rbp, %r10 | |
0b4bb91d | 664 | |
87954638 AP |
665 | .byte 0x66 |
666 | mov %r15, %rbx | |
0b4bb91d AP |
667 | shld \$1, %r14, %r15 |
668 | shld \$1, %rcx, %r14 | |
669 | ||
87954638 | 670 | xor %ebp, %ebp |
0b4bb91d | 671 | mulx %rdx, %rax, %rdx |
87954638 AP |
672 | adcx %rax, %r13 |
673 | adcx %rdx, %r14 | |
674 | mov 32($inp), %rdx | |
675 | adcx %rbp, %r15 | |
0b4bb91d AP |
676 | |
677 | mov %r13, 48(%rsp) | |
678 | mov %r14, 56(%rsp) | |
679 | ||
680 | #fifth iteration | |
87954638 AP |
681 | .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 |
682 | adox $out, %r8 | |
683 | adcx %r11, %r9 | |
0b4bb91d AP |
684 | |
685 | mulx 48($inp), %rax, %rcx | |
87954638 AP |
686 | adox %rax, %r9 |
687 | adcx %rcx, %r10 | |
0b4bb91d AP |
688 | |
689 | mulx 56($inp), $out, %r11 | |
87954638 AP |
690 | adox $out, %r10 |
691 | adcx %rbp, %r11 | |
692 | adox %rbp, %r11 | |
0b4bb91d AP |
693 | |
694 | mov %r9, %rcx | |
695 | shld \$1, %r8, %r9 | |
696 | shld \$1, %rbx, %r8 | |
697 | ||
87954638 | 698 | xor %ebp, %ebp |
0b4bb91d | 699 | mulx %rdx, %rax, %rdx |
87954638 AP |
700 | adcx %rax, %r15 |
701 | adcx %rdx, %r8 | |
702 | mov 40($inp), %rdx | |
703 | adcx %rbp, %r9 | |
0b4bb91d AP |
704 | |
705 | mov %r15, 64(%rsp) | |
706 | mov %r8, 72(%rsp) | |
707 | ||
708 | #sixth iteration | |
87954638 AP |
709 | .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx |
710 | adox %rax, %r10 | |
711 | adcx %rbx, %r11 | |
0b4bb91d | 712 | |
87954638 AP |
713 | .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 |
714 | adox $out, %r11 | |
715 | adcx %rbp, %r12 | |
716 | adox %rbp, %r12 | |
0b4bb91d AP |
717 | |
718 | mov %r11, %rbx | |
719 | shld \$1, %r10, %r11 | |
720 | shld \$1, %rcx, %r10 | |
721 | ||
87954638 | 722 | xor %ebp, %ebp |
0b4bb91d | 723 | mulx %rdx, %rax, %rdx |
87954638 AP |
724 | adcx %rax, %r9 |
725 | adcx %rdx, %r10 | |
726 | mov 48($inp), %rdx | |
727 | adcx %rbp, %r11 | |
0b4bb91d AP |
728 | |
729 | mov %r9, 80(%rsp) | |
730 | mov %r10, 88(%rsp) | |
731 | ||
732 | #seventh iteration | |
87954638 AP |
733 | .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 |
734 | adox %rax, %r12 | |
735 | adox %rbp, %r13 | |
0b4bb91d AP |
736 | |
737 | xor %r14, %r14 | |
738 | shld \$1, %r13, %r14 | |
739 | shld \$1, %r12, %r13 | |
740 | shld \$1, %rbx, %r12 | |
741 | ||
87954638 | 742 | xor %ebp, %ebp |
0b4bb91d | 743 | mulx %rdx, %rax, %rdx |
87954638 AP |
744 | adcx %rax, %r11 |
745 | adcx %rdx, %r12 | |
746 | mov 56($inp), %rdx | |
747 | adcx %rbp, %r13 | |
0b4bb91d | 748 | |
87954638 AP |
749 | .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) |
750 | .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) | |
0b4bb91d AP |
751 | |
752 | #eighth iteration | |
0b4bb91d | 753 | mulx %rdx, %rax, %rdx |
87954638 AP |
754 | adox %rax, %r13 |
755 | adox %rbp, %rdx | |
756 | ||
757 | .byte 0x66 | |
0b4bb91d AP |
758 | add %rdx, %r14 |
759 | ||
760 | movq %r13, 112(%rsp) | |
761 | movq %r14, 120(%rsp) | |
762 | movq %xmm0, $out | |
87954638 AP |
763 | movq %xmm1, %rbp |
764 | ||
765 | movq 128(%rsp), %rdx # pull $n0 | |
0b4bb91d AP |
766 | movq (%rsp), %r8 |
767 | movq 8(%rsp), %r9 | |
768 | movq 16(%rsp), %r10 | |
769 | movq 24(%rsp), %r11 | |
770 | movq 32(%rsp), %r12 | |
771 | movq 40(%rsp), %r13 | |
772 | movq 48(%rsp), %r14 | |
773 | movq 56(%rsp), %r15 | |
774 | ||
87954638 | 775 | call __rsaz_512_reducex |
0b4bb91d AP |
776 | |
777 | addq 64(%rsp), %r8 | |
778 | adcq 72(%rsp), %r9 | |
779 | adcq 80(%rsp), %r10 | |
780 | adcq 88(%rsp), %r11 | |
781 | adcq 96(%rsp), %r12 | |
782 | adcq 104(%rsp), %r13 | |
783 | adcq 112(%rsp), %r14 | |
784 | adcq 120(%rsp), %r15 | |
785 | sbbq %rcx, %rcx | |
786 | ||
87954638 | 787 | call __rsaz_512_subtract |
0b4bb91d AP |
788 | |
789 | movq %r8, %rdx | |
790 | movq %r9, %rax | |
791 | movl 128+8(%rsp), $times | |
792 | movq $out, $inp | |
793 | ||
794 | decl $times | |
87954638 AP |
795 | jnz .Loop_sqrx |
796 | ||
797 | .Lsqr_tail: | |
798 | ___ | |
799 | } | |
800 | $code.=<<___; | |
0b4bb91d AP |
801 | |
802 | leaq 128+24+48(%rsp), %rax | |
803 | movq -48(%rax), %r15 | |
804 | movq -40(%rax), %r14 | |
805 | movq -32(%rax), %r13 | |
806 | movq -24(%rax), %r12 | |
807 | movq -16(%rax), %rbp | |
808 | movq -8(%rax), %rbx | |
809 | leaq (%rax), %rsp | |
810 | .Lsqr_epilogue: | |
811 | ret | |
812 | .size rsaz_512_sqr,.-rsaz_512_sqr | |
813 | ___ | |
814 | } | |
815 | { | |
816 | my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); | |
817 | $code.=<<___; | |
fd8ad019 | 818 | .globl rsaz_512_mul |
0b4bb91d AP |
819 | .type rsaz_512_mul,\@function,5 |
820 | .align 32 | |
821 | rsaz_512_mul: | |
822 | push %rbx | |
823 | push %rbp | |
824 | push %r12 | |
825 | push %r13 | |
826 | push %r14 | |
827 | push %r15 | |
828 | ||
829 | subq \$128+24, %rsp | |
830 | .Lmul_body: | |
831 | movq $out, %xmm0 # off-load arguments | |
832 | movq $mod, %xmm1 | |
833 | movq $n0, 128(%rsp) | |
87954638 AP |
834 | ___ |
835 | $code.=<<___ if ($addx); | |
836 | movl \$0x80100,%r11d | |
837 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
838 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
839 | je .Lmulx | |
840 | ___ | |
841 | $code.=<<___; | |
842 | movq ($bp), %rbx # pass b[0] | |
0b4bb91d | 843 | movq $bp, %rbp # pass argument |
fd8ad019 | 844 | call __rsaz_512_mul |
0b4bb91d AP |
845 | |
846 | movq %xmm0, $out | |
847 | movq %xmm1, %rbp | |
848 | ||
849 | movq (%rsp), %r8 | |
850 | movq 8(%rsp), %r9 | |
851 | movq 16(%rsp), %r10 | |
852 | movq 24(%rsp), %r11 | |
853 | movq 32(%rsp), %r12 | |
854 | movq 40(%rsp), %r13 | |
855 | movq 48(%rsp), %r14 | |
856 | movq 56(%rsp), %r15 | |
857 | ||
87954638 AP |
858 | call __rsaz_512_reduce |
859 | ___ | |
860 | $code.=<<___ if ($addx); | |
861 | jmp .Lmul_tail | |
0b4bb91d | 862 | |
87954638 AP |
863 | .align 32 |
864 | .Lmulx: | |
865 | movq $bp, %rbp # pass argument | |
866 | movq ($bp), %rdx # pass b[0] | |
867 | call __rsaz_512_mulx | |
868 | ||
869 | movq %xmm0, $out | |
870 | movq %xmm1, %rbp | |
871 | ||
872 | movq 128(%rsp), %rdx # pull $n0 | |
873 | movq (%rsp), %r8 | |
874 | movq 8(%rsp), %r9 | |
875 | movq 16(%rsp), %r10 | |
876 | movq 24(%rsp), %r11 | |
877 | movq 32(%rsp), %r12 | |
878 | movq 40(%rsp), %r13 | |
879 | movq 48(%rsp), %r14 | |
880 | movq 56(%rsp), %r15 | |
881 | ||
882 | call __rsaz_512_reducex | |
883 | .Lmul_tail: | |
884 | ___ | |
885 | $code.=<<___; | |
0b4bb91d AP |
886 | addq 64(%rsp), %r8 |
887 | adcq 72(%rsp), %r9 | |
888 | adcq 80(%rsp), %r10 | |
889 | adcq 88(%rsp), %r11 | |
890 | adcq 96(%rsp), %r12 | |
891 | adcq 104(%rsp), %r13 | |
892 | adcq 112(%rsp), %r14 | |
893 | adcq 120(%rsp), %r15 | |
894 | sbbq %rcx, %rcx | |
895 | ||
87954638 | 896 | call __rsaz_512_subtract |
0b4bb91d AP |
897 | |
898 | leaq 128+24+48(%rsp), %rax | |
899 | movq -48(%rax), %r15 | |
900 | movq -40(%rax), %r14 | |
901 | movq -32(%rax), %r13 | |
902 | movq -24(%rax), %r12 | |
903 | movq -16(%rax), %rbp | |
904 | movq -8(%rax), %rbx | |
905 | leaq (%rax), %rsp | |
906 | .Lmul_epilogue: | |
907 | ret | |
908 | .size rsaz_512_mul,.-rsaz_512_mul | |
909 | ___ | |
910 | } | |
911 | { | |
912 | my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
913 | $code.=<<___; | |
fd8ad019 | 914 | .globl rsaz_512_mul_gather4 |
0b4bb91d AP |
915 | .type rsaz_512_mul_gather4,\@function,6 |
916 | .align 32 | |
917 | rsaz_512_mul_gather4: | |
918 | push %rbx | |
919 | push %rbp | |
920 | push %r12 | |
921 | push %r13 | |
922 | push %r14 | |
923 | push %r15 | |
924 | ||
5ea08bd2 AP |
925 | subq \$`128+24+($win64?0xb0:0)`, %rsp |
926 | ___ | |
927 | $code.=<<___ if ($win64); | |
928 | movaps %xmm6,0xa0(%rsp) | |
929 | movaps %xmm7,0xb0(%rsp) | |
930 | movaps %xmm8,0xc0(%rsp) | |
931 | movaps %xmm9,0xd0(%rsp) | |
932 | movaps %xmm10,0xe0(%rsp) | |
933 | movaps %xmm11,0xf0(%rsp) | |
934 | movaps %xmm12,0x100(%rsp) | |
935 | movaps %xmm13,0x110(%rsp) | |
936 | movaps %xmm14,0x120(%rsp) | |
937 | movaps %xmm15,0x130(%rsp) | |
938 | ___ | |
939 | $code.=<<___; | |
0b4bb91d | 940 | .Lmul_gather4_body: |
5ea08bd2 AP |
941 | movd $pwr,%xmm8 |
942 | movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 | |
943 | movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 | |
944 | ||
945 | pshufd \$0,%xmm8,%xmm8 # broadcast $power | |
946 | movdqa %xmm1,%xmm7 | |
947 | movdqa %xmm1,%xmm2 | |
948 | ___ | |
949 | ######################################################################## | |
950 | # calculate mask by comparing 0..15 to $power | |
951 | # | |
952 | for($i=0;$i<4;$i++) { | |
953 | $code.=<<___; | |
954 | paddd %xmm`$i`,%xmm`$i+1` | |
955 | pcmpeqd %xmm8,%xmm`$i` | |
956 | movdqa %xmm7,%xmm`$i+3` | |
957 | ___ | |
958 | } | |
959 | for(;$i<7;$i++) { | |
960 | $code.=<<___; | |
961 | paddd %xmm`$i`,%xmm`$i+1` | |
962 | pcmpeqd %xmm8,%xmm`$i` | |
963 | ___ | |
964 | } | |
965 | $code.=<<___; | |
966 | pcmpeqd %xmm8,%xmm7 | |
967 | ||
968 | movdqa 16*0($bp),%xmm8 | |
969 | movdqa 16*1($bp),%xmm9 | |
970 | movdqa 16*2($bp),%xmm10 | |
971 | movdqa 16*3($bp),%xmm11 | |
972 | pand %xmm0,%xmm8 | |
973 | movdqa 16*4($bp),%xmm12 | |
974 | pand %xmm1,%xmm9 | |
975 | movdqa 16*5($bp),%xmm13 | |
976 | pand %xmm2,%xmm10 | |
977 | movdqa 16*6($bp),%xmm14 | |
978 | pand %xmm3,%xmm11 | |
979 | movdqa 16*7($bp),%xmm15 | |
980 | leaq 128($bp), %rbp | |
981 | pand %xmm4,%xmm12 | |
982 | pand %xmm5,%xmm13 | |
983 | pand %xmm6,%xmm14 | |
984 | pand %xmm7,%xmm15 | |
985 | por %xmm10,%xmm8 | |
986 | por %xmm11,%xmm9 | |
987 | por %xmm12,%xmm8 | |
988 | por %xmm13,%xmm9 | |
989 | por %xmm14,%xmm8 | |
990 | por %xmm15,%xmm9 | |
991 | ||
992 | por %xmm9,%xmm8 | |
993 | pshufd \$0x4e,%xmm8,%xmm9 | |
994 | por %xmm9,%xmm8 | |
87954638 AP |
995 | ___ |
996 | $code.=<<___ if ($addx); | |
997 | movl \$0x80100,%r11d | |
998 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
999 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
1000 | je .Lmulx_gather | |
1001 | ___ | |
1002 | $code.=<<___; | |
5ea08bd2 AP |
1003 | movq %xmm8,%rbx |
1004 | ||
1005 | movq $n0, 128(%rsp) # off-load arguments | |
1006 | movq $out, 128+8(%rsp) | |
1007 | movq $mod, 128+16(%rsp) | |
0b4bb91d | 1008 | |
0b4bb91d AP |
1009 | movq ($ap), %rax |
1010 | movq 8($ap), %rcx | |
0b4bb91d AP |
1011 | mulq %rbx # 0 iteration |
1012 | movq %rax, (%rsp) | |
1013 | movq %rcx, %rax | |
1014 | movq %rdx, %r8 | |
1015 | ||
1016 | mulq %rbx | |
0b4bb91d AP |
1017 | addq %rax, %r8 |
1018 | movq 16($ap), %rax | |
1019 | movq %rdx, %r9 | |
1020 | adcq \$0, %r9 | |
1021 | ||
1022 | mulq %rbx | |
0b4bb91d AP |
1023 | addq %rax, %r9 |
1024 | movq 24($ap), %rax | |
1025 | movq %rdx, %r10 | |
1026 | adcq \$0, %r10 | |
1027 | ||
1028 | mulq %rbx | |
0b4bb91d AP |
1029 | addq %rax, %r10 |
1030 | movq 32($ap), %rax | |
1031 | movq %rdx, %r11 | |
1032 | adcq \$0, %r11 | |
1033 | ||
1034 | mulq %rbx | |
0b4bb91d AP |
1035 | addq %rax, %r11 |
1036 | movq 40($ap), %rax | |
1037 | movq %rdx, %r12 | |
1038 | adcq \$0, %r12 | |
1039 | ||
1040 | mulq %rbx | |
1041 | addq %rax, %r12 | |
1042 | movq 48($ap), %rax | |
1043 | movq %rdx, %r13 | |
1044 | adcq \$0, %r13 | |
1045 | ||
1046 | mulq %rbx | |
0b4bb91d AP |
1047 | addq %rax, %r13 |
1048 | movq 56($ap), %rax | |
1049 | movq %rdx, %r14 | |
1050 | adcq \$0, %r14 | |
1051 | ||
1052 | mulq %rbx | |
0b4bb91d AP |
1053 | addq %rax, %r14 |
1054 | movq ($ap), %rax | |
1055 | movq %rdx, %r15 | |
1056 | adcq \$0, %r15 | |
1057 | ||
1058 | leaq 8(%rsp), %rdi | |
1059 | movl \$7, %ecx | |
1060 | jmp .Loop_mul_gather | |
1061 | ||
1062 | .align 32 | |
1063 | .Loop_mul_gather: | |
5ea08bd2 AP |
1064 | movdqa 16*0(%rbp),%xmm8 |
1065 | movdqa 16*1(%rbp),%xmm9 | |
1066 | movdqa 16*2(%rbp),%xmm10 | |
1067 | movdqa 16*3(%rbp),%xmm11 | |
1068 | pand %xmm0,%xmm8 | |
1069 | movdqa 16*4(%rbp),%xmm12 | |
1070 | pand %xmm1,%xmm9 | |
1071 | movdqa 16*5(%rbp),%xmm13 | |
1072 | pand %xmm2,%xmm10 | |
1073 | movdqa 16*6(%rbp),%xmm14 | |
1074 | pand %xmm3,%xmm11 | |
1075 | movdqa 16*7(%rbp),%xmm15 | |
1076 | leaq 128(%rbp), %rbp | |
1077 | pand %xmm4,%xmm12 | |
1078 | pand %xmm5,%xmm13 | |
1079 | pand %xmm6,%xmm14 | |
1080 | pand %xmm7,%xmm15 | |
1081 | por %xmm10,%xmm8 | |
1082 | por %xmm11,%xmm9 | |
1083 | por %xmm12,%xmm8 | |
1084 | por %xmm13,%xmm9 | |
1085 | por %xmm14,%xmm8 | |
1086 | por %xmm15,%xmm9 | |
1087 | ||
1088 | por %xmm9,%xmm8 | |
1089 | pshufd \$0x4e,%xmm8,%xmm9 | |
1090 | por %xmm9,%xmm8 | |
1091 | movq %xmm8,%rbx | |
1092 | ||
0b4bb91d AP |
1093 | mulq %rbx |
1094 | addq %rax, %r8 | |
1095 | movq 8($ap), %rax | |
1096 | movq %r8, (%rdi) | |
1097 | movq %rdx, %r8 | |
1098 | adcq \$0, %r8 | |
1099 | ||
1100 | mulq %rbx | |
0b4bb91d AP |
1101 | addq %rax, %r9 |
1102 | movq 16($ap), %rax | |
1103 | adcq \$0, %rdx | |
1104 | addq %r9, %r8 | |
1105 | movq %rdx, %r9 | |
1106 | adcq \$0, %r9 | |
1107 | ||
1108 | mulq %rbx | |
0b4bb91d AP |
1109 | addq %rax, %r10 |
1110 | movq 24($ap), %rax | |
1111 | adcq \$0, %rdx | |
1112 | addq %r10, %r9 | |
1113 | movq %rdx, %r10 | |
1114 | adcq \$0, %r10 | |
1115 | ||
1116 | mulq %rbx | |
0b4bb91d AP |
1117 | addq %rax, %r11 |
1118 | movq 32($ap), %rax | |
1119 | adcq \$0, %rdx | |
1120 | addq %r11, %r10 | |
1121 | movq %rdx, %r11 | |
1122 | adcq \$0, %r11 | |
1123 | ||
1124 | mulq %rbx | |
0b4bb91d AP |
1125 | addq %rax, %r12 |
1126 | movq 40($ap), %rax | |
1127 | adcq \$0, %rdx | |
1128 | addq %r12, %r11 | |
1129 | movq %rdx, %r12 | |
1130 | adcq \$0, %r12 | |
1131 | ||
1132 | mulq %rbx | |
1133 | addq %rax, %r13 | |
1134 | movq 48($ap), %rax | |
1135 | adcq \$0, %rdx | |
1136 | addq %r13, %r12 | |
1137 | movq %rdx, %r13 | |
1138 | adcq \$0, %r13 | |
1139 | ||
1140 | mulq %rbx | |
1141 | addq %rax, %r14 | |
1142 | movq 56($ap), %rax | |
1143 | adcq \$0, %rdx | |
1144 | addq %r14, %r13 | |
1145 | movq %rdx, %r14 | |
1146 | adcq \$0, %r14 | |
1147 | ||
1148 | mulq %rbx | |
0b4bb91d AP |
1149 | addq %rax, %r15 |
1150 | movq ($ap), %rax | |
1151 | adcq \$0, %rdx | |
1152 | addq %r15, %r14 | |
1153 | movq %rdx, %r15 | |
1154 | adcq \$0, %r15 | |
1155 | ||
0b4bb91d AP |
1156 | leaq 8(%rdi), %rdi |
1157 | ||
1158 | decl %ecx | |
1159 | jnz .Loop_mul_gather | |
1160 | ||
1161 | movq %r8, (%rdi) | |
1162 | movq %r9, 8(%rdi) | |
1163 | movq %r10, 16(%rdi) | |
1164 | movq %r11, 24(%rdi) | |
1165 | movq %r12, 32(%rdi) | |
1166 | movq %r13, 40(%rdi) | |
1167 | movq %r14, 48(%rdi) | |
1168 | movq %r15, 56(%rdi) | |
1169 | ||
5ea08bd2 AP |
1170 | movq 128+8(%rsp), $out |
1171 | movq 128+16(%rsp), %rbp | |
0b4bb91d AP |
1172 | |
1173 | movq (%rsp), %r8 | |
1174 | movq 8(%rsp), %r9 | |
1175 | movq 16(%rsp), %r10 | |
1176 | movq 24(%rsp), %r11 | |
1177 | movq 32(%rsp), %r12 | |
1178 | movq 40(%rsp), %r13 | |
1179 | movq 48(%rsp), %r14 | |
1180 | movq 56(%rsp), %r15 | |
1181 | ||
87954638 AP |
1182 | call __rsaz_512_reduce |
1183 | ___ | |
1184 | $code.=<<___ if ($addx); | |
1185 | jmp .Lmul_gather_tail | |
1186 | ||
1187 | .align 32 | |
1188 | .Lmulx_gather: | |
5ea08bd2 AP |
1189 | movq %xmm8,%rdx |
1190 | ||
1191 | mov $n0, 128(%rsp) # off-load arguments | |
1192 | mov $out, 128+8(%rsp) | |
1193 | mov $mod, 128+16(%rsp) | |
87954638 | 1194 | |
87954638 AP |
1195 | mulx ($ap), %rbx, %r8 # 0 iteration |
1196 | mov %rbx, (%rsp) | |
1197 | xor %edi, %edi # cf=0, of=0 | |
1198 | ||
1199 | mulx 8($ap), %rax, %r9 | |
87954638 AP |
1200 | |
1201 | mulx 16($ap), %rbx, %r10 | |
87954638 AP |
1202 | adcx %rax, %r8 |
1203 | ||
1204 | mulx 24($ap), %rax, %r11 | |
87954638 AP |
1205 | adcx %rbx, %r9 |
1206 | ||
1207 | mulx 32($ap), %rbx, %r12 | |
87954638 AP |
1208 | adcx %rax, %r10 |
1209 | ||
1210 | mulx 40($ap), %rax, %r13 | |
1211 | adcx %rbx, %r11 | |
1212 | ||
1213 | mulx 48($ap), %rbx, %r14 | |
87954638 AP |
1214 | adcx %rax, %r12 |
1215 | ||
1216 | mulx 56($ap), %rax, %r15 | |
87954638 AP |
1217 | adcx %rbx, %r13 |
1218 | adcx %rax, %r14 | |
5ea08bd2 | 1219 | .byte 0x67 |
87954638 AP |
1220 | mov %r8, %rbx |
1221 | adcx %rdi, %r15 # %rdi is 0 | |
1222 | ||
1223 | mov \$-7, %rcx | |
1224 | jmp .Loop_mulx_gather | |
1225 | ||
1226 | .align 32 | |
1227 | .Loop_mulx_gather: | |
5ea08bd2 AP |
1228 | movdqa 16*0(%rbp),%xmm8 |
1229 | movdqa 16*1(%rbp),%xmm9 | |
1230 | movdqa 16*2(%rbp),%xmm10 | |
1231 | movdqa 16*3(%rbp),%xmm11 | |
1232 | pand %xmm0,%xmm8 | |
1233 | movdqa 16*4(%rbp),%xmm12 | |
1234 | pand %xmm1,%xmm9 | |
1235 | movdqa 16*5(%rbp),%xmm13 | |
1236 | pand %xmm2,%xmm10 | |
1237 | movdqa 16*6(%rbp),%xmm14 | |
1238 | pand %xmm3,%xmm11 | |
1239 | movdqa 16*7(%rbp),%xmm15 | |
1240 | leaq 128(%rbp), %rbp | |
1241 | pand %xmm4,%xmm12 | |
1242 | pand %xmm5,%xmm13 | |
1243 | pand %xmm6,%xmm14 | |
1244 | pand %xmm7,%xmm15 | |
1245 | por %xmm10,%xmm8 | |
1246 | por %xmm11,%xmm9 | |
1247 | por %xmm12,%xmm8 | |
1248 | por %xmm13,%xmm9 | |
1249 | por %xmm14,%xmm8 | |
1250 | por %xmm15,%xmm9 | |
1251 | ||
1252 | por %xmm9,%xmm8 | |
1253 | pshufd \$0x4e,%xmm8,%xmm9 | |
1254 | por %xmm9,%xmm8 | |
1255 | movq %xmm8,%rdx | |
1256 | ||
1257 | .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 | |
87954638 AP |
1258 | adcx %rax, %rbx |
1259 | adox %r9, %r8 | |
1260 | ||
1261 | mulx 8($ap), %rax, %r9 | |
87954638 AP |
1262 | adcx %rax, %r8 |
1263 | adox %r10, %r9 | |
0b4bb91d | 1264 | |
87954638 | 1265 | mulx 16($ap), %rax, %r10 |
87954638 AP |
1266 | adcx %rax, %r9 |
1267 | adox %r11, %r10 | |
1268 | ||
1269 | .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 | |
87954638 AP |
1270 | adcx %rax, %r10 |
1271 | adox %r12, %r11 | |
1272 | ||
1273 | mulx 32($ap), %rax, %r12 | |
1274 | adcx %rax, %r11 | |
1275 | adox %r13, %r12 | |
1276 | ||
1277 | mulx 40($ap), %rax, %r13 | |
1278 | adcx %rax, %r12 | |
1279 | adox %r14, %r13 | |
1280 | ||
1281 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
1282 | adcx %rax, %r13 | |
5ea08bd2 | 1283 | .byte 0x67 |
87954638 AP |
1284 | adox %r15, %r14 |
1285 | ||
1286 | mulx 56($ap), %rax, %r15 | |
87954638 AP |
1287 | mov %rbx, 64(%rsp,%rcx,8) |
1288 | adcx %rax, %r14 | |
1289 | adox %rdi, %r15 | |
1290 | mov %r8, %rbx | |
1291 | adcx %rdi, %r15 # cf=0 | |
1292 | ||
1293 | inc %rcx # of=0 | |
1294 | jnz .Loop_mulx_gather | |
1295 | ||
1296 | mov %r8, 64(%rsp) | |
1297 | mov %r9, 64+8(%rsp) | |
1298 | mov %r10, 64+16(%rsp) | |
1299 | mov %r11, 64+24(%rsp) | |
1300 | mov %r12, 64+32(%rsp) | |
1301 | mov %r13, 64+40(%rsp) | |
1302 | mov %r14, 64+48(%rsp) | |
1303 | mov %r15, 64+56(%rsp) | |
1304 | ||
5ea08bd2 AP |
1305 | mov 128(%rsp), %rdx # pull arguments |
1306 | mov 128+8(%rsp), $out | |
1307 | mov 128+16(%rsp), %rbp | |
87954638 | 1308 | |
87954638 AP |
1309 | mov (%rsp), %r8 |
1310 | mov 8(%rsp), %r9 | |
1311 | mov 16(%rsp), %r10 | |
1312 | mov 24(%rsp), %r11 | |
1313 | mov 32(%rsp), %r12 | |
1314 | mov 40(%rsp), %r13 | |
1315 | mov 48(%rsp), %r14 | |
1316 | mov 56(%rsp), %r15 | |
1317 | ||
1318 | call __rsaz_512_reducex | |
1319 | ||
1320 | .Lmul_gather_tail: | |
1321 | ___ | |
1322 | $code.=<<___; | |
0b4bb91d AP |
1323 | addq 64(%rsp), %r8 |
1324 | adcq 72(%rsp), %r9 | |
1325 | adcq 80(%rsp), %r10 | |
1326 | adcq 88(%rsp), %r11 | |
1327 | adcq 96(%rsp), %r12 | |
1328 | adcq 104(%rsp), %r13 | |
1329 | adcq 112(%rsp), %r14 | |
1330 | adcq 120(%rsp), %r15 | |
1331 | sbbq %rcx, %rcx | |
1332 | ||
87954638 | 1333 | call __rsaz_512_subtract |
0b4bb91d AP |
1334 | |
1335 | leaq 128+24+48(%rsp), %rax | |
5ea08bd2 AP |
1336 | ___ |
1337 | $code.=<<___ if ($win64); | |
1338 | movaps 0xa0-0xc8(%rax),%xmm6 | |
1339 | movaps 0xb0-0xc8(%rax),%xmm7 | |
1340 | movaps 0xc0-0xc8(%rax),%xmm8 | |
1341 | movaps 0xd0-0xc8(%rax),%xmm9 | |
1342 | movaps 0xe0-0xc8(%rax),%xmm10 | |
1343 | movaps 0xf0-0xc8(%rax),%xmm11 | |
1344 | movaps 0x100-0xc8(%rax),%xmm12 | |
1345 | movaps 0x110-0xc8(%rax),%xmm13 | |
1346 | movaps 0x120-0xc8(%rax),%xmm14 | |
1347 | movaps 0x130-0xc8(%rax),%xmm15 | |
1348 | lea 0xb0(%rax),%rax | |
1349 | ___ | |
1350 | $code.=<<___; | |
0b4bb91d AP |
1351 | movq -48(%rax), %r15 |
1352 | movq -40(%rax), %r14 | |
1353 | movq -32(%rax), %r13 | |
1354 | movq -24(%rax), %r12 | |
1355 | movq -16(%rax), %rbp | |
1356 | movq -8(%rax), %rbx | |
1357 | leaq (%rax), %rsp | |
1358 | .Lmul_gather4_epilogue: | |
1359 | ret | |
1360 | .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 | |
1361 | ___ | |
1362 | } | |
1363 | { | |
1364 | my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | |
1365 | $code.=<<___; | |
fd8ad019 | 1366 | .globl rsaz_512_mul_scatter4 |
0b4bb91d AP |
1367 | .type rsaz_512_mul_scatter4,\@function,6 |
1368 | .align 32 | |
1369 | rsaz_512_mul_scatter4: | |
1370 | push %rbx | |
1371 | push %rbp | |
1372 | push %r12 | |
1373 | push %r13 | |
1374 | push %r14 | |
1375 | push %r15 | |
1376 | ||
8bd7ca99 | 1377 | mov $pwr, $pwr |
0b4bb91d AP |
1378 | subq \$128+24, %rsp |
1379 | .Lmul_scatter4_body: | |
5ea08bd2 | 1380 | leaq ($tbl,$pwr,8), $tbl |
0b4bb91d AP |
1381 | movq $out, %xmm0 # off-load arguments |
1382 | movq $mod, %xmm1 | |
1383 | movq $tbl, %xmm2 | |
1384 | movq $n0, 128(%rsp) | |
1385 | ||
1386 | movq $out, %rbp | |
87954638 AP |
1387 | ___ |
1388 | $code.=<<___ if ($addx); | |
1389 | movl \$0x80100,%r11d | |
1390 | andl OPENSSL_ia32cap_P+8(%rip),%r11d | |
1391 | cmpl \$0x80100,%r11d # check for MULX and ADO/CX | |
1392 | je .Lmulx_scatter | |
1393 | ___ | |
1394 | $code.=<<___; | |
1395 | movq ($out),%rbx # pass b[0] | |
fd8ad019 | 1396 | call __rsaz_512_mul |
0b4bb91d AP |
1397 | |
1398 | movq %xmm0, $out | |
1399 | movq %xmm1, %rbp | |
1400 | ||
1401 | movq (%rsp), %r8 | |
1402 | movq 8(%rsp), %r9 | |
1403 | movq 16(%rsp), %r10 | |
1404 | movq 24(%rsp), %r11 | |
1405 | movq 32(%rsp), %r12 | |
1406 | movq 40(%rsp), %r13 | |
1407 | movq 48(%rsp), %r14 | |
1408 | movq 56(%rsp), %r15 | |
1409 | ||
87954638 AP |
1410 | call __rsaz_512_reduce |
1411 | ___ | |
1412 | $code.=<<___ if ($addx); | |
1413 | jmp .Lmul_scatter_tail | |
1414 | ||
1415 | .align 32 | |
1416 | .Lmulx_scatter: | |
1417 | movq ($out), %rdx # pass b[0] | |
1418 | call __rsaz_512_mulx | |
1419 | ||
1420 | movq %xmm0, $out | |
1421 | movq %xmm1, %rbp | |
1422 | ||
1423 | movq 128(%rsp), %rdx # pull $n0 | |
1424 | movq (%rsp), %r8 | |
1425 | movq 8(%rsp), %r9 | |
1426 | movq 16(%rsp), %r10 | |
1427 | movq 24(%rsp), %r11 | |
1428 | movq 32(%rsp), %r12 | |
1429 | movq 40(%rsp), %r13 | |
1430 | movq 48(%rsp), %r14 | |
1431 | movq 56(%rsp), %r15 | |
1432 | ||
1433 | call __rsaz_512_reducex | |
0b4bb91d | 1434 | |
87954638 AP |
1435 | .Lmul_scatter_tail: |
1436 | ___ | |
1437 | $code.=<<___; | |
0b4bb91d AP |
1438 | addq 64(%rsp), %r8 |
1439 | adcq 72(%rsp), %r9 | |
1440 | adcq 80(%rsp), %r10 | |
1441 | adcq 88(%rsp), %r11 | |
1442 | adcq 96(%rsp), %r12 | |
1443 | adcq 104(%rsp), %r13 | |
1444 | adcq 112(%rsp), %r14 | |
1445 | adcq 120(%rsp), %r15 | |
1446 | movq %xmm2, $inp | |
1447 | sbbq %rcx, %rcx | |
1448 | ||
87954638 | 1449 | call __rsaz_512_subtract |
0b4bb91d | 1450 | |
5ea08bd2 AP |
1451 | movq %r8, 128*0($inp) # scatter |
1452 | movq %r9, 128*1($inp) | |
1453 | movq %r10, 128*2($inp) | |
1454 | movq %r11, 128*3($inp) | |
1455 | movq %r12, 128*4($inp) | |
1456 | movq %r13, 128*5($inp) | |
1457 | movq %r14, 128*6($inp) | |
1458 | movq %r15, 128*7($inp) | |
0b4bb91d AP |
1459 | |
1460 | leaq 128+24+48(%rsp), %rax | |
1461 | movq -48(%rax), %r15 | |
1462 | movq -40(%rax), %r14 | |
1463 | movq -32(%rax), %r13 | |
1464 | movq -24(%rax), %r12 | |
1465 | movq -16(%rax), %rbp | |
1466 | movq -8(%rax), %rbx | |
1467 | leaq (%rax), %rsp | |
1468 | .Lmul_scatter4_epilogue: | |
1469 | ret | |
1470 | .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 | |
1471 | ___ | |
1472 | } | |
1473 | { | |
1474 | my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); | |
1475 | $code.=<<___; | |
1476 | .globl rsaz_512_mul_by_one | |
1477 | .type rsaz_512_mul_by_one,\@function,4 | |
1478 | .align 32 | |
1479 | rsaz_512_mul_by_one: | |
1480 | push %rbx | |
1481 | push %rbp | |
1482 | push %r12 | |
1483 | push %r13 | |
1484 | push %r14 | |
1485 | push %r15 | |
1486 | ||
1487 | subq \$128+24, %rsp | |
1488 | .Lmul_by_one_body: | |
87954638 AP |
1489 | ___ |
1490 | $code.=<<___ if ($addx); | |
1491 | movl OPENSSL_ia32cap_P+8(%rip),%eax | |
1492 | ___ | |
1493 | $code.=<<___; | |
0b4bb91d AP |
1494 | movq $mod, %rbp # reassign argument |
1495 | movq $n0, 128(%rsp) | |
1496 | ||
1497 | movq ($inp), %r8 | |
1498 | pxor %xmm0, %xmm0 | |
1499 | movq 8($inp), %r9 | |
1500 | movq 16($inp), %r10 | |
1501 | movq 24($inp), %r11 | |
1502 | movq 32($inp), %r12 | |
1503 | movq 40($inp), %r13 | |
1504 | movq 48($inp), %r14 | |
1505 | movq 56($inp), %r15 | |
1506 | ||
1507 | movdqa %xmm0, (%rsp) | |
1508 | movdqa %xmm0, 16(%rsp) | |
1509 | movdqa %xmm0, 32(%rsp) | |
1510 | movdqa %xmm0, 48(%rsp) | |
1511 | movdqa %xmm0, 64(%rsp) | |
1512 | movdqa %xmm0, 80(%rsp) | |
1513 | movdqa %xmm0, 96(%rsp) | |
87954638 AP |
1514 | ___ |
1515 | $code.=<<___ if ($addx); | |
1516 | andl \$0x80100,%eax | |
1517 | cmpl \$0x80100,%eax # check for MULX and ADO/CX | |
1518 | je .Lby_one_callx | |
1519 | ___ | |
1520 | $code.=<<___; | |
1521 | call __rsaz_512_reduce | |
1522 | ___ | |
1523 | $code.=<<___ if ($addx); | |
1524 | jmp .Lby_one_tail | |
1525 | .align 32 | |
1526 | .Lby_one_callx: | |
1527 | movq 128(%rsp), %rdx # pull $n0 | |
1528 | call __rsaz_512_reducex | |
1529 | .Lby_one_tail: | |
1530 | ___ | |
1531 | $code.=<<___; | |
0b4bb91d AP |
1532 | movq %r8, ($out) |
1533 | movq %r9, 8($out) | |
1534 | movq %r10, 16($out) | |
1535 | movq %r11, 24($out) | |
1536 | movq %r12, 32($out) | |
1537 | movq %r13, 40($out) | |
1538 | movq %r14, 48($out) | |
1539 | movq %r15, 56($out) | |
1540 | ||
1541 | leaq 128+24+48(%rsp), %rax | |
1542 | movq -48(%rax), %r15 | |
1543 | movq -40(%rax), %r14 | |
1544 | movq -32(%rax), %r13 | |
1545 | movq -24(%rax), %r12 | |
1546 | movq -16(%rax), %rbp | |
1547 | movq -8(%rax), %rbx | |
1548 | leaq (%rax), %rsp | |
1549 | .Lmul_by_one_epilogue: | |
1550 | ret | |
1551 | .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one | |
1552 | ___ | |
1553 | } | |
87954638 | 1554 | { # __rsaz_512_reduce |
0b4bb91d AP |
1555 | # |
1556 | # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 | |
1557 | # output: %r8-%r15 | |
1558 | # clobbers: everything except %rbp and %rdi | |
1559 | $code.=<<___; | |
87954638 | 1560 | .type __rsaz_512_reduce,\@abi-omnipotent |
0b4bb91d | 1561 | .align 32 |
87954638 | 1562 | __rsaz_512_reduce: |
0b4bb91d AP |
1563 | movq %r8, %rbx |
1564 | imulq 128+8(%rsp), %rbx | |
1565 | movq 0(%rbp), %rax | |
1566 | movl \$8, %ecx | |
1567 | jmp .Lreduction_loop | |
1568 | ||
1569 | .align 32 | |
1570 | .Lreduction_loop: | |
1571 | mulq %rbx | |
1572 | movq 8(%rbp), %rax | |
1573 | negq %r8 | |
1574 | movq %rdx, %r8 | |
1575 | adcq \$0, %r8 | |
1576 | ||
1577 | mulq %rbx | |
1578 | addq %rax, %r9 | |
1579 | movq 16(%rbp), %rax | |
1580 | adcq \$0, %rdx | |
1581 | addq %r9, %r8 | |
1582 | movq %rdx, %r9 | |
1583 | adcq \$0, %r9 | |
1584 | ||
1585 | mulq %rbx | |
1586 | addq %rax, %r10 | |
1587 | movq 24(%rbp), %rax | |
1588 | adcq \$0, %rdx | |
1589 | addq %r10, %r9 | |
1590 | movq %rdx, %r10 | |
1591 | adcq \$0, %r10 | |
1592 | ||
1593 | mulq %rbx | |
1594 | addq %rax, %r11 | |
1595 | movq 32(%rbp), %rax | |
1596 | adcq \$0, %rdx | |
1597 | addq %r11, %r10 | |
1598 | movq 128+8(%rsp), %rsi | |
87954638 AP |
1599 | #movq %rdx, %r11 |
1600 | #adcq \$0, %r11 | |
1601 | adcq \$0, %rdx | |
0b4bb91d | 1602 | movq %rdx, %r11 |
0b4bb91d AP |
1603 | |
1604 | mulq %rbx | |
1605 | addq %rax, %r12 | |
1606 | movq 40(%rbp), %rax | |
1607 | adcq \$0, %rdx | |
1608 | imulq %r8, %rsi | |
1609 | addq %r12, %r11 | |
1610 | movq %rdx, %r12 | |
1611 | adcq \$0, %r12 | |
1612 | ||
1613 | mulq %rbx | |
1614 | addq %rax, %r13 | |
1615 | movq 48(%rbp), %rax | |
1616 | adcq \$0, %rdx | |
1617 | addq %r13, %r12 | |
1618 | movq %rdx, %r13 | |
1619 | adcq \$0, %r13 | |
1620 | ||
1621 | mulq %rbx | |
1622 | addq %rax, %r14 | |
1623 | movq 56(%rbp), %rax | |
1624 | adcq \$0, %rdx | |
1625 | addq %r14, %r13 | |
1626 | movq %rdx, %r14 | |
1627 | adcq \$0, %r14 | |
1628 | ||
1629 | mulq %rbx | |
1630 | movq %rsi, %rbx | |
1631 | addq %rax, %r15 | |
1632 | movq 0(%rbp), %rax | |
1633 | adcq \$0, %rdx | |
1634 | addq %r15, %r14 | |
1635 | movq %rdx, %r15 | |
1636 | adcq \$0, %r15 | |
1637 | ||
1638 | decl %ecx | |
1639 | jne .Lreduction_loop | |
87954638 AP |
1640 | |
1641 | ret | |
1642 | .size __rsaz_512_reduce,.-__rsaz_512_reduce | |
0b4bb91d | 1643 | ___ |
87954638 AP |
1644 | } |
1645 | if ($addx) { | |
1646 | # __rsaz_512_reducex | |
1647 | # | |
1648 | # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 | |
1649 | # output: %r8-%r15 | |
1650 | # clobbers: everything except %rbp and %rdi | |
0b4bb91d | 1651 | $code.=<<___; |
87954638 AP |
1652 | .type __rsaz_512_reducex,\@abi-omnipotent |
1653 | .align 32 | |
1654 | __rsaz_512_reducex: | |
1655 | #movq 128+8(%rsp), %rdx # pull $n0 | |
0b4bb91d | 1656 | imulq %r8, %rdx |
87954638 | 1657 | xorq %rsi, %rsi # cf=0,of=0 |
0b4bb91d | 1658 | movl \$8, %ecx |
87954638 | 1659 | jmp .Lreduction_loopx |
0b4bb91d AP |
1660 | |
1661 | .align 32 | |
87954638 AP |
1662 | .Lreduction_loopx: |
1663 | mov %r8, %rbx | |
0b4bb91d | 1664 | mulx 0(%rbp), %rax, %r8 |
87954638 AP |
1665 | adcx %rbx, %rax |
1666 | adox %r9, %r8 | |
0b4bb91d AP |
1667 | |
1668 | mulx 8(%rbp), %rax, %r9 | |
87954638 AP |
1669 | adcx %rax, %r8 |
1670 | adox %r10, %r9 | |
1671 | ||
1672 | mulx 16(%rbp), %rbx, %r10 | |
1673 | adcx %rbx, %r9 | |
1674 | adox %r11, %r10 | |
1675 | ||
1676 | mulx 24(%rbp), %rbx, %r11 | |
1677 | adcx %rbx, %r10 | |
1678 | adox %r12, %r11 | |
1679 | ||
1680 | .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 | |
1681 | mov %rdx, %rax | |
1682 | mov %r8, %rdx | |
1683 | adcx %rbx, %r11 | |
1684 | adox %r13, %r12 | |
1685 | ||
1686 | mulx 128+8(%rsp), %rbx, %rdx | |
1687 | mov %rax, %rdx | |
0b4bb91d AP |
1688 | |
1689 | mulx 40(%rbp), %rax, %r13 | |
87954638 AP |
1690 | adcx %rax, %r12 |
1691 | adox %r14, %r13 | |
0b4bb91d | 1692 | |
87954638 AP |
1693 | .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 |
1694 | adcx %rax, %r13 | |
1695 | adox %r15, %r14 | |
0b4bb91d AP |
1696 | |
1697 | mulx 56(%rbp), %rax, %r15 | |
1698 | mov %rbx, %rdx | |
87954638 AP |
1699 | adcx %rax, %r14 |
1700 | adox %rsi, %r15 # %rsi is 0 | |
1701 | adcx %rsi, %r15 # cf=0 | |
1702 | ||
1703 | decl %ecx # of=0 | |
1704 | jne .Lreduction_loopx | |
0b4bb91d | 1705 | |
0b4bb91d | 1706 | ret |
87954638 | 1707 | .size __rsaz_512_reducex,.-__rsaz_512_reducex |
0b4bb91d AP |
1708 | ___ |
1709 | } | |
87954638 | 1710 | { # __rsaz_512_subtract |
0b4bb91d AP |
1711 | # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask |
1712 | # output: | |
1713 | # clobbers: everything but %rdi, %rsi and %rbp | |
1714 | $code.=<<___; | |
87954638 | 1715 | .type __rsaz_512_subtract,\@abi-omnipotent |
0b4bb91d | 1716 | .align 32 |
87954638 | 1717 | __rsaz_512_subtract: |
0b4bb91d AP |
1718 | movq %r8, ($out) |
1719 | movq %r9, 8($out) | |
1720 | movq %r10, 16($out) | |
1721 | movq %r11, 24($out) | |
1722 | movq %r12, 32($out) | |
1723 | movq %r13, 40($out) | |
1724 | movq %r14, 48($out) | |
1725 | movq %r15, 56($out) | |
1726 | ||
1727 | movq 0($mod), %r8 | |
1728 | movq 8($mod), %r9 | |
1729 | negq %r8 | |
1730 | notq %r9 | |
1731 | andq %rcx, %r8 | |
1732 | movq 16($mod), %r10 | |
1733 | andq %rcx, %r9 | |
1734 | notq %r10 | |
1735 | movq 24($mod), %r11 | |
1736 | andq %rcx, %r10 | |
1737 | notq %r11 | |
1738 | movq 32($mod), %r12 | |
1739 | andq %rcx, %r11 | |
1740 | notq %r12 | |
1741 | movq 40($mod), %r13 | |
1742 | andq %rcx, %r12 | |
1743 | notq %r13 | |
1744 | movq 48($mod), %r14 | |
1745 | andq %rcx, %r13 | |
1746 | notq %r14 | |
1747 | movq 56($mod), %r15 | |
1748 | andq %rcx, %r14 | |
1749 | notq %r15 | |
1750 | andq %rcx, %r15 | |
1751 | ||
1752 | addq ($out), %r8 | |
1753 | adcq 8($out), %r9 | |
1754 | adcq 16($out), %r10 | |
1755 | adcq 24($out), %r11 | |
1756 | adcq 32($out), %r12 | |
1757 | adcq 40($out), %r13 | |
1758 | adcq 48($out), %r14 | |
1759 | adcq 56($out), %r15 | |
1760 | ||
1761 | movq %r8, ($out) | |
1762 | movq %r9, 8($out) | |
1763 | movq %r10, 16($out) | |
1764 | movq %r11, 24($out) | |
1765 | movq %r12, 32($out) | |
1766 | movq %r13, 40($out) | |
1767 | movq %r14, 48($out) | |
1768 | movq %r15, 56($out) | |
1769 | ||
1770 | ret | |
87954638 | 1771 | .size __rsaz_512_subtract,.-__rsaz_512_subtract |
0b4bb91d AP |
1772 | ___ |
1773 | } | |
fd8ad019 | 1774 | { # __rsaz_512_mul |
0b4bb91d AP |
1775 | # |
1776 | # input: %rsi - ap, %rbp - bp | |
0d4fb843 | 1777 | # output: |
0b4bb91d AP |
1778 | # clobbers: everything |
1779 | my ($ap,$bp) = ("%rsi","%rbp"); | |
1780 | $code.=<<___; | |
fd8ad019 | 1781 | .type __rsaz_512_mul,\@abi-omnipotent |
0b4bb91d | 1782 | .align 32 |
fd8ad019 | 1783 | __rsaz_512_mul: |
0b4bb91d AP |
1784 | leaq 8(%rsp), %rdi |
1785 | ||
0b4bb91d AP |
1786 | movq ($ap), %rax |
1787 | mulq %rbx | |
1788 | movq %rax, (%rdi) | |
1789 | movq 8($ap), %rax | |
1790 | movq %rdx, %r8 | |
1791 | ||
1792 | mulq %rbx | |
1793 | addq %rax, %r8 | |
1794 | movq 16($ap), %rax | |
1795 | movq %rdx, %r9 | |
1796 | adcq \$0, %r9 | |
1797 | ||
1798 | mulq %rbx | |
1799 | addq %rax, %r9 | |
1800 | movq 24($ap), %rax | |
1801 | movq %rdx, %r10 | |
1802 | adcq \$0, %r10 | |
1803 | ||
1804 | mulq %rbx | |
1805 | addq %rax, %r10 | |
1806 | movq 32($ap), %rax | |
1807 | movq %rdx, %r11 | |
1808 | adcq \$0, %r11 | |
1809 | ||
1810 | mulq %rbx | |
1811 | addq %rax, %r11 | |
1812 | movq 40($ap), %rax | |
1813 | movq %rdx, %r12 | |
1814 | adcq \$0, %r12 | |
1815 | ||
1816 | mulq %rbx | |
1817 | addq %rax, %r12 | |
1818 | movq 48($ap), %rax | |
1819 | movq %rdx, %r13 | |
1820 | adcq \$0, %r13 | |
1821 | ||
1822 | mulq %rbx | |
1823 | addq %rax, %r13 | |
1824 | movq 56($ap), %rax | |
1825 | movq %rdx, %r14 | |
1826 | adcq \$0, %r14 | |
1827 | ||
1828 | mulq %rbx | |
1829 | addq %rax, %r14 | |
1830 | movq ($ap), %rax | |
1831 | movq %rdx, %r15 | |
1832 | adcq \$0, %r15 | |
1833 | ||
1834 | leaq 8($bp), $bp | |
1835 | leaq 8(%rdi), %rdi | |
1836 | ||
1837 | movl \$7, %ecx | |
1838 | jmp .Loop_mul | |
1839 | ||
1840 | .align 32 | |
1841 | .Loop_mul: | |
1842 | movq ($bp), %rbx | |
1843 | mulq %rbx | |
1844 | addq %rax, %r8 | |
1845 | movq 8($ap), %rax | |
1846 | movq %r8, (%rdi) | |
1847 | movq %rdx, %r8 | |
1848 | adcq \$0, %r8 | |
1849 | ||
1850 | mulq %rbx | |
1851 | addq %rax, %r9 | |
1852 | movq 16($ap), %rax | |
1853 | adcq \$0, %rdx | |
1854 | addq %r9, %r8 | |
1855 | movq %rdx, %r9 | |
1856 | adcq \$0, %r9 | |
1857 | ||
1858 | mulq %rbx | |
1859 | addq %rax, %r10 | |
1860 | movq 24($ap), %rax | |
1861 | adcq \$0, %rdx | |
1862 | addq %r10, %r9 | |
1863 | movq %rdx, %r10 | |
1864 | adcq \$0, %r10 | |
1865 | ||
1866 | mulq %rbx | |
1867 | addq %rax, %r11 | |
1868 | movq 32($ap), %rax | |
1869 | adcq \$0, %rdx | |
1870 | addq %r11, %r10 | |
1871 | movq %rdx, %r11 | |
1872 | adcq \$0, %r11 | |
1873 | ||
1874 | mulq %rbx | |
1875 | addq %rax, %r12 | |
1876 | movq 40($ap), %rax | |
1877 | adcq \$0, %rdx | |
1878 | addq %r12, %r11 | |
1879 | movq %rdx, %r12 | |
1880 | adcq \$0, %r12 | |
1881 | ||
1882 | mulq %rbx | |
1883 | addq %rax, %r13 | |
1884 | movq 48($ap), %rax | |
1885 | adcq \$0, %rdx | |
1886 | addq %r13, %r12 | |
1887 | movq %rdx, %r13 | |
1888 | adcq \$0, %r13 | |
1889 | ||
1890 | mulq %rbx | |
1891 | addq %rax, %r14 | |
1892 | movq 56($ap), %rax | |
1893 | adcq \$0, %rdx | |
1894 | addq %r14, %r13 | |
1895 | movq %rdx, %r14 | |
1896 | leaq 8($bp), $bp | |
1897 | adcq \$0, %r14 | |
1898 | ||
1899 | mulq %rbx | |
1900 | addq %rax, %r15 | |
1901 | movq ($ap), %rax | |
1902 | adcq \$0, %rdx | |
1903 | addq %r15, %r14 | |
1904 | movq %rdx, %r15 | |
1905 | adcq \$0, %r15 | |
1906 | ||
1907 | leaq 8(%rdi), %rdi | |
1908 | ||
1909 | decl %ecx | |
1910 | jnz .Loop_mul | |
1911 | ||
1912 | movq %r8, (%rdi) | |
1913 | movq %r9, 8(%rdi) | |
1914 | movq %r10, 16(%rdi) | |
1915 | movq %r11, 24(%rdi) | |
1916 | movq %r12, 32(%rdi) | |
1917 | movq %r13, 40(%rdi) | |
1918 | movq %r14, 48(%rdi) | |
1919 | movq %r15, 56(%rdi) | |
1920 | ||
1921 | ret | |
fd8ad019 | 1922 | .size __rsaz_512_mul,.-__rsaz_512_mul |
0b4bb91d AP |
1923 | ___ |
1924 | } | |
87954638 AP |
1925 | if ($addx) { |
1926 | # __rsaz_512_mulx | |
1927 | # | |
1928 | # input: %rsi - ap, %rbp - bp | |
0d4fb843 | 1929 | # output: |
87954638 AP |
1930 | # clobbers: everything |
1931 | my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); | |
1932 | $code.=<<___; | |
1933 | .type __rsaz_512_mulx,\@abi-omnipotent | |
1934 | .align 32 | |
1935 | __rsaz_512_mulx: | |
1936 | mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller | |
31ed9a21 | 1937 | mov \$-6, %rcx |
87954638 AP |
1938 | |
1939 | mulx 8($ap), %rax, %r9 | |
1940 | movq %rbx, 8(%rsp) | |
1941 | ||
1942 | mulx 16($ap), %rbx, %r10 | |
31ed9a21 | 1943 | adc %rax, %r8 |
87954638 AP |
1944 | |
1945 | mulx 24($ap), %rax, %r11 | |
31ed9a21 | 1946 | adc %rbx, %r9 |
87954638 | 1947 | |
31ed9a21 AP |
1948 | mulx 32($ap), %rbx, %r12 |
1949 | adc %rax, %r10 | |
87954638 AP |
1950 | |
1951 | mulx 40($ap), %rax, %r13 | |
31ed9a21 | 1952 | adc %rbx, %r11 |
87954638 AP |
1953 | |
1954 | mulx 48($ap), %rbx, %r14 | |
31ed9a21 | 1955 | adc %rax, %r12 |
87954638 AP |
1956 | |
1957 | mulx 56($ap), %rax, %r15 | |
1958 | mov 8($bp), %rdx | |
31ed9a21 AP |
1959 | adc %rbx, %r13 |
1960 | adc %rax, %r14 | |
1961 | adc \$0, %r15 | |
87954638 | 1962 | |
31ed9a21 | 1963 | xor $zero, $zero # cf=0,of=0 |
87954638 AP |
1964 | jmp .Loop_mulx |
1965 | ||
1966 | .align 32 | |
1967 | .Loop_mulx: | |
1968 | movq %r8, %rbx | |
1969 | mulx ($ap), %rax, %r8 | |
1970 | adcx %rax, %rbx | |
1971 | adox %r9, %r8 | |
1972 | ||
1973 | mulx 8($ap), %rax, %r9 | |
1974 | adcx %rax, %r8 | |
1975 | adox %r10, %r9 | |
1976 | ||
1977 | mulx 16($ap), %rax, %r10 | |
1978 | adcx %rax, %r9 | |
1979 | adox %r11, %r10 | |
1980 | ||
1981 | mulx 24($ap), %rax, %r11 | |
1982 | adcx %rax, %r10 | |
1983 | adox %r12, %r11 | |
1984 | ||
1985 | .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 | |
1986 | adcx %rax, %r11 | |
1987 | adox %r13, %r12 | |
1988 | ||
1989 | mulx 40($ap), %rax, %r13 | |
1990 | adcx %rax, %r12 | |
1991 | adox %r14, %r13 | |
1992 | ||
1993 | mulx 48($ap), %rax, %r14 | |
1994 | adcx %rax, %r13 | |
1995 | adox %r15, %r14 | |
1996 | ||
1997 | mulx 56($ap), %rax, %r15 | |
1998 | movq 64($bp,%rcx,8), %rdx | |
1999 | movq %rbx, 8+64-8(%rsp,%rcx,8) | |
2000 | adcx %rax, %r14 | |
2001 | adox $zero, %r15 | |
2002 | adcx $zero, %r15 # cf=0 | |
2003 | ||
2004 | inc %rcx # of=0 | |
2005 | jnz .Loop_mulx | |
2006 | ||
2007 | movq %r8, %rbx | |
2008 | mulx ($ap), %rax, %r8 | |
2009 | adcx %rax, %rbx | |
2010 | adox %r9, %r8 | |
2011 | ||
2012 | .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 | |
2013 | adcx %rax, %r8 | |
2014 | adox %r10, %r9 | |
2015 | ||
2016 | .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 | |
2017 | adcx %rax, %r9 | |
2018 | adox %r11, %r10 | |
2019 | ||
2020 | mulx 24($ap), %rax, %r11 | |
2021 | adcx %rax, %r10 | |
2022 | adox %r12, %r11 | |
2023 | ||
2024 | mulx 32($ap), %rax, %r12 | |
2025 | adcx %rax, %r11 | |
2026 | adox %r13, %r12 | |
2027 | ||
2028 | mulx 40($ap), %rax, %r13 | |
2029 | adcx %rax, %r12 | |
2030 | adox %r14, %r13 | |
2031 | ||
2032 | .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 | |
2033 | adcx %rax, %r13 | |
2034 | adox %r15, %r14 | |
2035 | ||
2036 | .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 | |
2037 | adcx %rax, %r14 | |
2038 | adox $zero, %r15 | |
2039 | adcx $zero, %r15 | |
2040 | ||
2041 | mov %rbx, 8+64-8(%rsp) | |
2042 | mov %r8, 8+64(%rsp) | |
2043 | mov %r9, 8+64+8(%rsp) | |
2044 | mov %r10, 8+64+16(%rsp) | |
2045 | mov %r11, 8+64+24(%rsp) | |
2046 | mov %r12, 8+64+32(%rsp) | |
2047 | mov %r13, 8+64+40(%rsp) | |
2048 | mov %r14, 8+64+48(%rsp) | |
2049 | mov %r15, 8+64+56(%rsp) | |
2050 | ||
2051 | ret | |
2052 | .size __rsaz_512_mulx,.-__rsaz_512_mulx | |
2053 | ___ | |
2054 | } | |
0b4bb91d AP |
2055 | { |
2056 | my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); | |
2057 | $code.=<<___; | |
2058 | .globl rsaz_512_scatter4 | |
2059 | .type rsaz_512_scatter4,\@abi-omnipotent | |
2060 | .align 16 | |
2061 | rsaz_512_scatter4: | |
5ea08bd2 | 2062 | leaq ($out,$power,8), $out |
0b4bb91d AP |
2063 | movl \$8, %r9d |
2064 | jmp .Loop_scatter | |
2065 | .align 16 | |
2066 | .Loop_scatter: | |
2067 | movq ($inp), %rax | |
2068 | leaq 8($inp), $inp | |
5ea08bd2 | 2069 | movq %rax, ($out) |
0b4bb91d AP |
2070 | leaq 128($out), $out |
2071 | decl %r9d | |
2072 | jnz .Loop_scatter | |
2073 | ret | |
2074 | .size rsaz_512_scatter4,.-rsaz_512_scatter4 | |
2075 | ||
2076 | .globl rsaz_512_gather4 | |
2077 | .type rsaz_512_gather4,\@abi-omnipotent | |
2078 | .align 16 | |
2079 | rsaz_512_gather4: | |
5ea08bd2 AP |
2080 | ___ |
2081 | $code.=<<___ if ($win64); | |
2082 | .LSEH_begin_rsaz_512_gather4: | |
2083 | .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp | |
2084 | .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) | |
2085 | .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) | |
2086 | .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) | |
2087 | .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) | |
2088 | .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) | |
2089 | .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) | |
2090 | .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) | |
2091 | .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) | |
2092 | .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) | |
2093 | .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) | |
2094 | ___ | |
2095 | $code.=<<___; | |
2096 | movd $power,%xmm8 | |
2097 | movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 | |
2098 | movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 | |
2099 | ||
2100 | pshufd \$0,%xmm8,%xmm8 # broadcast $power | |
2101 | movdqa %xmm1,%xmm7 | |
2102 | movdqa %xmm1,%xmm2 | |
2103 | ___ | |
########################################################################
# calculate mask by comparing 0..15 to $power
#
# Each iteration advances the running counter register and turns
# %xmm$reg into an all-ones/all-zeros mask against the broadcast index
# in %xmm8; the first four iterations additionally seed the next
# counter registers from %xmm7.  The backticked `$reg+N` forms are
# resolved to register numbers by the eval-substitution pass at the
# bottom of this script.
for my $reg (0..6) {
	$code.=<<___;
	paddd	%xmm`$reg`,%xmm`$reg+1`
	pcmpeqd	%xmm8,%xmm`$reg`
___
	$code.=<<___ if ($reg<4);
	movdqa	%xmm7,%xmm`$reg+3`
___
}
# Emit the gather loop proper.  Security-critical: every one of the 16
# table entries is loaded and ANDed with its (all-ones or all-zeros)
# mask on every pass, so memory access pattern is independent of
# $power — this is the cache-timing countermeasure.  The por tree then
# collapses the eight masked lanes, and pshufd \$0x4e swaps the two
# qwords so the final por leaves the selected 64-bit limb in the low
# half of %xmm8.  Eight iterations (%r9d) emit one 512-bit value.
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
# Win64 only: restore the callee-saved xmm registers spilled by the
# prologue and release the 0xa8-byte frame.  Must mirror the prologue
# above exactly, and the frame size must match the unwind descriptor in
# .LSEH_info_rsaz_512_gather4.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
# Close out rsaz_512_gather4 and emit the .Linc increment table used by
# the mask setup above: lanes {0,0,1,1} followed by the per-step
# increment {2,2,2,2}.  Aligned to 64 so the two movdqa loads from
# .Linc/.Linc+16 are naturally aligned.
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
2181 | } | |
2182 | ||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: emit the language-specific structured-exception handler
# shared by the rsaz_512_* routines, plus the .pdata/.xdata tables that
# register it.  The handler walks the interrupted function's frame and
# patches the saved nonvolatile registers back into the CONTEXT record
# so RtlVirtualUnwind can continue past the assembly frames.
if ($win64) {
$rec="%rcx";		# EXCEPTION_RECORD*  (arg1 per Win64 ABI)
$frame="%rdx";		# establisher frame  (arg2)
$context="%r8";		# CONTEXT*           (arg3)
$disp="%r9";		# DISPATCHER_CONTEXT* (arg4)

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}
2355 | ||
# Resolve the backticked register-arithmetic placeholders (e.g.
# %xmm`$i+1`) by evaluating each expression in place, then emit the
# generated assembly on stdout.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# close() must be checked: stdout is fully buffered when redirected to
# the .s file, so a write error (full disk, broken pipe) only surfaces
# here.  Silently ignoring it would let the build consume a truncated
# assembly file.
close STDOUT or die "error closing STDOUT: $!";