#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

##############################################################################
#
# Copyright (c) 2012, Intel Corporation
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# *  Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# *  Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the
#    distribution.
#
# *  Neither the name of the Intel Corporation nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
##############################################################################
# Developers and authors:
# Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
##############################################################################
# Reference:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation, Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp. 821-823 (2012)
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
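# Put differently (illustrative note): ceil(1024/29) = 36 digits of 29 bits
# each, packed four 64-bit lanes per ymm register, i.e. the nine accumulators
# $ACC0..$ACC8 below, with $ACC9 serving as an extra scratch/zero register.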
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";			# loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8;	# place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13";			# reassigned argument

$code.=<<___;
165 | .text |
166 | ||
167 | .globl rsaz_1024_sqr_avx2 |
168 | .type rsaz_1024_sqr_avx2,\@function,5 | |
169 | .align 64 | |
170 | rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 | |
171 | lea (%rsp), %rax | |
172 | push %rbx | |
173 | push %rbp | |
174 | push %r12 | |
175 | push %r13 | |
176 | push %r14 | |
177 | push %r15 | |
31ed9a21 | 178 | vzeroupper |
179 | ___ |
180 | $code.=<<___ if ($win64); | |
181 | lea -0xa8(%rsp),%rsp | |
182 | vmovaps %xmm6,-0xd8(%rax) |
183 | vmovaps %xmm7,-0xc8(%rax) | |
184 | vmovaps %xmm8,-0xb8(%rax) | |
185 | vmovaps %xmm9,-0xa8(%rax) | |
186 | vmovaps %xmm10,-0x98(%rax) | |
187 | vmovaps %xmm11,-0x88(%rax) | |
188 | vmovaps %xmm12,-0x78(%rax) | |
189 | vmovaps %xmm13,-0x68(%rax) | |
190 | vmovaps %xmm14,-0x58(%rax) | |
191 | vmovaps %xmm15,-0x48(%rax) | |
192 | .Lsqr_1024_body: |
193 | ___ | |
194 | $code.=<<___; | |
195 | mov %rax,%rbp | |
196 | mov %rdx, $np # reassigned argument |
197 | sub \$$FrameSize, %rsp | |
198 | mov $np, $tmp | |
199 | sub \$-128, $rp # size optimization | |
200 | sub \$-128, $ap | |
201 | sub \$-128, $np | |
202 | ||
203 | and \$4095, $tmp # see if $np crosses page | |
204 | add \$32*10, $tmp | |
205 | shr \$12, $tmp | |
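	# ($np&4095)+32*10 can only exceed the 4K page offset when loads from
	# the 32*10 bytes at $np may touch the next page, so a non-zero $tmp
	# after the shr flags a potential page-crossing access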
31ed9a21 | 206 | vpxor $ACC9,$ACC9,$ACC9 |
207 | jz .Lsqr_1024_no_n_copy |
208 | ||
209 | # unaligned 256-bit load that crosses page boundary can | |
210 | # cause >2x performance degradation here, so if $np does | |
211 | # cross page boundary, copy it to stack and make sure stack | |
212 | # frame doesn't... | |
213 | sub \$32*10,%rsp | |
214 | vmovdqu 32*0-128($np), $ACC0 | |
215 | and \$-2048, %rsp | |
216 | vmovdqu 32*1-128($np), $ACC1 | |
217 | vmovdqu 32*2-128($np), $ACC2 | |
218 | vmovdqu 32*3-128($np), $ACC3 | |
219 | vmovdqu 32*4-128($np), $ACC4 | |
220 | vmovdqu 32*5-128($np), $ACC5 | |
221 | vmovdqu 32*6-128($np), $ACC6 | |
222 | vmovdqu 32*7-128($np), $ACC7 | |
223 | vmovdqu 32*8-128($np), $ACC8 | |
224 | lea $FrameSize+128(%rsp),$np | |
225 | vmovdqu $ACC0, 32*0-128($np) | |
226 | vmovdqu $ACC1, 32*1-128($np) | |
227 | vmovdqu $ACC2, 32*2-128($np) | |
228 | vmovdqu $ACC3, 32*3-128($np) | |
229 | vmovdqu $ACC4, 32*4-128($np) | |
230 | vmovdqu $ACC5, 32*5-128($np) | |
231 | vmovdqu $ACC6, 32*6-128($np) | |
232 | vmovdqu $ACC7, 32*7-128($np) | |
233 | vmovdqu $ACC8, 32*8-128($np) | |
31ed9a21 | 234 | vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero |
235 | |
236 | .Lsqr_1024_no_n_copy: | |
237 | and \$-1024, %rsp | |
238 | ||
239 | vmovdqu 32*1-128($ap), $ACC1 | |
240 | vmovdqu 32*2-128($ap), $ACC2 | |
241 | vmovdqu 32*3-128($ap), $ACC3 | |
242 | vmovdqu 32*4-128($ap), $ACC4 | |
243 | vmovdqu 32*5-128($ap), $ACC5 | |
244 | vmovdqu 32*6-128($ap), $ACC6 | |
245 | vmovdqu 32*7-128($ap), $ACC7 | |
246 | vmovdqu 32*8-128($ap), $ACC8 | |
247 | ||
248 | lea 192(%rsp), $tp0 # 64+128=192 | |
249 | vpbroadcastq .Land_mask(%rip), $AND_MASK | |
250 | jmp .LOOP_GRANDE_SQR_1024 | |
251 | ||
252 | .align 32 | |
253 | .LOOP_GRANDE_SQR_1024: | |
254 | lea 32*18+128(%rsp), $aap # size optimization | |
255 | lea 448(%rsp), $tp1 # 64+128+256=448 | |
256 | ||
257 | # the squaring is performed as described in Variant B of | |
258 | # "Speeding up Big-Number Squaring", so start by calculating | |
259 | # the A*2=A+A vector | |
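	# i.e. A^2 = sum_i a_i^2*2^(58*i) + sum_{i<j} (2*a_i)*a_j*2^(29*(i+j)),
	# so the doubled copy in $aap feeds the off-diagonal products while
	# $ap itself feeds the diagonal squares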
260 | vpaddq $ACC1, $ACC1, $ACC1 | |
261 | vpbroadcastq 32*0-128($ap), $B1 | |
262 | vpaddq $ACC2, $ACC2, $ACC2 | |
263 | vmovdqa $ACC1, 32*0-128($aap) | |
264 | vpaddq $ACC3, $ACC3, $ACC3 | |
265 | vmovdqa $ACC2, 32*1-128($aap) | |
266 | vpaddq $ACC4, $ACC4, $ACC4 | |
267 | vmovdqa $ACC3, 32*2-128($aap) | |
268 | vpaddq $ACC5, $ACC5, $ACC5 | |
269 | vmovdqa $ACC4, 32*3-128($aap) | |
270 | vpaddq $ACC6, $ACC6, $ACC6 | |
271 | vmovdqa $ACC5, 32*4-128($aap) | |
272 | vpaddq $ACC7, $ACC7, $ACC7 | |
273 | vmovdqa $ACC6, 32*5-128($aap) | |
274 | vpaddq $ACC8, $ACC8, $ACC8 | |
275 | vmovdqa $ACC7, 32*6-128($aap) | |
276 | vpxor $ACC9, $ACC9, $ACC9 | |
277 | vmovdqa $ACC8, 32*7-128($aap) | |
278 | ||
279 | vpmuludq 32*0-128($ap), $B1, $ACC0 | |
280 | vpbroadcastq 32*1-128($ap), $B2 | |
281 | vmovdqu $ACC9, 32*9-192($tp0) # zero upper half | |
282 | vpmuludq $B1, $ACC1, $ACC1 | |
283 | vmovdqu $ACC9, 32*10-448($tp1) | |
284 | vpmuludq $B1, $ACC2, $ACC2 | |
285 | vmovdqu $ACC9, 32*11-448($tp1) | |
286 | vpmuludq $B1, $ACC3, $ACC3 | |
287 | vmovdqu $ACC9, 32*12-448($tp1) | |
288 | vpmuludq $B1, $ACC4, $ACC4 | |
289 | vmovdqu $ACC9, 32*13-448($tp1) | |
290 | vpmuludq $B1, $ACC5, $ACC5 | |
291 | vmovdqu $ACC9, 32*14-448($tp1) | |
292 | vpmuludq $B1, $ACC6, $ACC6 | |
293 | vmovdqu $ACC9, 32*15-448($tp1) | |
294 | vpmuludq $B1, $ACC7, $ACC7 | |
295 | vmovdqu $ACC9, 32*16-448($tp1) | |
296 | vpmuludq $B1, $ACC8, $ACC8 | |
297 | vpbroadcastq 32*2-128($ap), $B1 | |
298 | vmovdqu $ACC9, 32*17-448($tp1) | |
299 | ||
fa104be3 | 300 | mov $ap, $tpa |
0b4bb91d | 301 | mov \$4, $i |
5c57c69f | 302 | jmp .Lsqr_entry_1024 |
303 | ___ |
304 | $TEMP0=$Y1; | |
305 | $TEMP2=$Y2; | |
306 | $code.=<<___; | |
307 | .align 32 | |
308 | .LOOP_SQR_1024: | |
fa104be3 | 309 | vpbroadcastq 32*1-128($tpa), $B2 |
0b4bb91d | 310 | vpmuludq 32*0-128($ap), $B1, $ACC0 |
fa104be3 | 311 | vpaddq 32*0-192($tp0), $ACC0, $ACC0 |
0b4bb91d | 312 | vpmuludq 32*0-128($aap), $B1, $ACC1 |
fa104be3 | 313 | vpaddq 32*1-192($tp0), $ACC1, $ACC1 |
0b4bb91d | 314 | vpmuludq 32*1-128($aap), $B1, $ACC2 |
fa104be3 | 315 | vpaddq 32*2-192($tp0), $ACC2, $ACC2 |
0b4bb91d | 316 | vpmuludq 32*2-128($aap), $B1, $ACC3 |
fa104be3 | 317 | vpaddq 32*3-192($tp0), $ACC3, $ACC3 |
0b4bb91d | 318 | vpmuludq 32*3-128($aap), $B1, $ACC4 |
fa104be3 | 319 | vpaddq 32*4-192($tp0), $ACC4, $ACC4 |
0b4bb91d | 320 | vpmuludq 32*4-128($aap), $B1, $ACC5 |
fa104be3 | 321 | vpaddq 32*5-192($tp0), $ACC5, $ACC5 |
0b4bb91d | 322 | vpmuludq 32*5-128($aap), $B1, $ACC6 |
fa104be3 | 323 | vpaddq 32*6-192($tp0), $ACC6, $ACC6 |
0b4bb91d | 324 | vpmuludq 32*6-128($aap), $B1, $ACC7 |
fa104be3 | 325 | vpaddq 32*7-192($tp0), $ACC7, $ACC7 |
0b4bb91d | 326 | vpmuludq 32*7-128($aap), $B1, $ACC8 |
327 | vpbroadcastq 32*2-128($tpa), $B1 |
328 | vpaddq 32*8-192($tp0), $ACC8, $ACC8 | |
5c57c69f | 329 | .Lsqr_entry_1024: |
330 | vmovdqu $ACC0, 32*0-192($tp0) |
331 | vmovdqu $ACC1, 32*1-192($tp0) | |
332 | |
333 | vpmuludq 32*1-128($ap), $B2, $TEMP0 | |
334 | vpaddq $TEMP0, $ACC2, $ACC2 | |
335 | vpmuludq 32*1-128($aap), $B2, $TEMP1 | |
336 | vpaddq $TEMP1, $ACC3, $ACC3 | |
337 | vpmuludq 32*2-128($aap), $B2, $TEMP2 | |
338 | vpaddq $TEMP2, $ACC4, $ACC4 | |
339 | vpmuludq 32*3-128($aap), $B2, $TEMP0 | |
340 | vpaddq $TEMP0, $ACC5, $ACC5 | |
341 | vpmuludq 32*4-128($aap), $B2, $TEMP1 | |
342 | vpaddq $TEMP1, $ACC6, $ACC6 | |
343 | vpmuludq 32*5-128($aap), $B2, $TEMP2 | |
344 | vpaddq $TEMP2, $ACC7, $ACC7 |
345 | vpmuludq 32*6-128($aap), $B2, $TEMP0 | |
346 | vpaddq $TEMP0, $ACC8, $ACC8 | |
347 | vpmuludq 32*7-128($aap), $B2, $ACC0 | |
348 | vpbroadcastq 32*3-128($tpa), $B2 |
349 | vpaddq 32*9-192($tp0), $ACC0, $ACC0 | |
0b4bb91d | 350 | |
351 | vmovdqu $ACC2, 32*2-192($tp0) |
352 | vmovdqu $ACC3, 32*3-192($tp0) | |
353 | |
354 | vpmuludq 32*2-128($ap), $B1, $TEMP2 | |
355 | vpaddq $TEMP2, $ACC4, $ACC4 | |
356 | vpmuludq 32*2-128($aap), $B1, $TEMP0 | |
357 | vpaddq $TEMP0, $ACC5, $ACC5 | |
358 | vpmuludq 32*3-128($aap), $B1, $TEMP1 | |
359 | vpaddq $TEMP1, $ACC6, $ACC6 | |
360 | vpmuludq 32*4-128($aap), $B1, $TEMP2 | |
361 | vpaddq $TEMP2, $ACC7, $ACC7 | |
362 | vpmuludq 32*5-128($aap), $B1, $TEMP0 | |
363 | vpaddq $TEMP0, $ACC8, $ACC8 |
364 | vpmuludq 32*6-128($aap), $B1, $TEMP1 | |
365 | vpaddq $TEMP1, $ACC0, $ACC0 | |
366 | vpmuludq 32*7-128($aap), $B1, $ACC1 | |
367 | vpbroadcastq 32*4-128($tpa), $B1 |
368 | vpaddq 32*10-448($tp1), $ACC1, $ACC1 | |
0b4bb91d | 369 | |
370 | vmovdqu $ACC4, 32*4-192($tp0) |
371 | vmovdqu $ACC5, 32*5-192($tp0) | |
372 | |
373 | vpmuludq 32*3-128($ap), $B2, $TEMP0 | |
374 | vpaddq $TEMP0, $ACC6, $ACC6 | |
375 | vpmuludq 32*3-128($aap), $B2, $TEMP1 | |
376 | vpaddq $TEMP1, $ACC7, $ACC7 | |
377 | vpmuludq 32*4-128($aap), $B2, $TEMP2 | |
378 | vpaddq $TEMP2, $ACC8, $ACC8 | |
379 | vpmuludq 32*5-128($aap), $B2, $TEMP0 | |
380 | vpaddq $TEMP0, $ACC0, $ACC0 |
381 | vpmuludq 32*6-128($aap), $B2, $TEMP1 | |
382 | vpaddq $TEMP1, $ACC1, $ACC1 | |
383 | vpmuludq 32*7-128($aap), $B2, $ACC2 | |
384 | vpbroadcastq 32*5-128($tpa), $B2 |
385 | vpaddq 32*11-448($tp1), $ACC2, $ACC2 | |
0b4bb91d | 386 | |
387 | vmovdqu $ACC6, 32*6-192($tp0) |
388 | vmovdqu $ACC7, 32*7-192($tp0) | |
389 | |
390 | vpmuludq 32*4-128($ap), $B1, $TEMP0 | |
391 | vpaddq $TEMP0, $ACC8, $ACC8 | |
392 | vpmuludq 32*4-128($aap), $B1, $TEMP1 | |
393 | vpaddq $TEMP1, $ACC0, $ACC0 | |
394 | vpmuludq 32*5-128($aap), $B1, $TEMP2 | |
395 | vpaddq $TEMP2, $ACC1, $ACC1 |
396 | vpmuludq 32*6-128($aap), $B1, $TEMP0 | |
397 | vpaddq $TEMP0, $ACC2, $ACC2 | |
398 | vpmuludq 32*7-128($aap), $B1, $ACC3 | |
399 | vpbroadcastq 32*6-128($tpa), $B1 |
400 | vpaddq 32*12-448($tp1), $ACC3, $ACC3 | |
0b4bb91d | 401 | |
402 | vmovdqu $ACC8, 32*8-192($tp0) |
403 | vmovdqu $ACC0, 32*9-192($tp0) | |
404 | lea 8($tp0), $tp0 | |
405 | |
406 | vpmuludq 32*5-128($ap), $B2, $TEMP2 | |
407 | vpaddq $TEMP2, $ACC1, $ACC1 | |
408 | vpmuludq 32*5-128($aap), $B2, $TEMP0 | |
409 | vpaddq $TEMP0, $ACC2, $ACC2 |
410 | vpmuludq 32*6-128($aap), $B2, $TEMP1 | |
411 | vpaddq $TEMP1, $ACC3, $ACC3 | |
412 | vpmuludq 32*7-128($aap), $B2, $ACC4 | |
413 | vpbroadcastq 32*7-128($tpa), $B2 |
414 | vpaddq 32*13-448($tp1), $ACC4, $ACC4 | |
0b4bb91d | 415 | |
416 | vmovdqu $ACC1, 32*10-448($tp1) |
417 | vmovdqu $ACC2, 32*11-448($tp1) | |
418 | |
419 | vpmuludq 32*6-128($ap), $B1, $TEMP0 | |
420 | vpaddq $TEMP0, $ACC3, $ACC3 |
421 | vpmuludq 32*6-128($aap), $B1, $TEMP1 | |
fa104be3 | 422 | vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 |
423 | vpaddq $TEMP1, $ACC4, $ACC4 |
424 | vpmuludq 32*7-128($aap), $B1, $ACC5 | |
425 | vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration |
426 | vpaddq 32*14-448($tp1), $ACC5, $ACC5 | |
0b4bb91d | 427 | |
428 | vmovdqu $ACC3, 32*12-448($tp1) |
429 | vmovdqu $ACC4, 32*13-448($tp1) | |
430 | lea 8($tpa), $tpa | |
431 | |
432 | vpmuludq 32*7-128($ap), $B2, $TEMP0 | |
433 | vpaddq $TEMP0, $ACC5, $ACC5 |
434 | vpmuludq 32*7-128($aap), $B2, $ACC6 | |
fa104be3 | 435 | vpaddq 32*15-448($tp1), $ACC6, $ACC6 |
436 | |
437 | vpmuludq 32*8-128($ap), $ACC0, $ACC7 | |
438 | vmovdqu $ACC5, 32*14-448($tp1) |
439 | vpaddq 32*16-448($tp1), $ACC7, $ACC7 | |
440 | vmovdqu $ACC6, 32*15-448($tp1) | |
441 | vmovdqu $ACC7, 32*16-448($tp1) | |
442 | lea 8($tp1), $tp1 | |
0b4bb91d | 443 | |
444 | dec $i |
445 | jnz .LOOP_SQR_1024 | |
446 | ___ | |
447 | $ZERO = $ACC9; | |
448 | $TEMP0 = $B1; | |
449 | $TEMP2 = $B2; | |
450 | $TEMP3 = $Y1; | |
451 | $TEMP4 = $Y2; | |
452 | $code.=<<___; | |
d6d422e1 | 453 | # we need to fix indices 32-39 to avoid overflow |
454 | vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), |
455 | vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) | |
456 | vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) | |
457 | lea 192(%rsp), $tp0 # 64+128=192 | |
458 | |
459 | vpsrlq \$29, $ACC8, $TEMP1 | |
460 | vpand $AND_MASK, $ACC8, $ACC8 | |
461 | vpsrlq \$29, $ACC1, $TEMP2 | |
462 | vpand $AND_MASK, $ACC1, $ACC1 | |
463 | ||
464 | vpermq \$0x93, $TEMP1, $TEMP1 | |
465 | vpxor $ZERO, $ZERO, $ZERO | |
466 | vpermq \$0x93, $TEMP2, $TEMP2 | |
467 | ||
468 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
469 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
470 | vpaddq $TEMP0, $ACC8, $ACC8 | |
471 | vpblendd \$3, $TEMP2, $ZERO, $TEMP2 | |
472 | vpaddq $TEMP1, $ACC1, $ACC1 | |
473 | vpaddq $TEMP2, $ACC2, $ACC2 | |
474 | vmovdqu $ACC1, 32*9-192($tp0) | |
fa104be3 | 475 | vmovdqu $ACC2, 32*10-192($tp0) |
476 | |
477 | mov (%rsp), %rax | |
478 | mov 8(%rsp), $r1 | |
479 | mov 16(%rsp), $r2 | |
480 | mov 24(%rsp), $r3 | |
481 | vmovdqu 32*1(%rsp), $ACC1 | |
482 | vmovdqu 32*2-192($tp0), $ACC2 | |
483 | vmovdqu 32*3-192($tp0), $ACC3 | |
484 | vmovdqu 32*4-192($tp0), $ACC4 | |
485 | vmovdqu 32*5-192($tp0), $ACC5 | |
486 | vmovdqu 32*6-192($tp0), $ACC6 | |
487 | vmovdqu 32*7-192($tp0), $ACC7 | |
488 | ||
489 | mov %rax, $r0 | |
490 | imull $n0, %eax | |
491 | and \$0x1fffffff, %eax | |
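	# %eax now holds y = (t0*n0) mod 2^29, the per-digit Montgomery
	# quotient chosen so that the lowest 29-bit digit of t + y*N cancels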
492 | vmovd %eax, $Y1 | |
493 | ||
494 | mov %rax, %rdx | |
495 | imulq -128($np), %rax | |
496 | vpbroadcastq $Y1, $Y1 | |
497 | add %rax, $r0 | |
498 | mov %rdx, %rax | |
499 | imulq 8-128($np), %rax | |
500 | shr \$29, $r0 | |
501 | add %rax, $r1 | |
502 | mov %rdx, %rax | |
503 | imulq 16-128($np), %rax | |
504 | add $r0, $r1 | |
505 | add %rax, $r2 | |
506 | imulq 24-128($np), %rdx | |
507 | add %rdx, $r3 | |
508 | ||
509 | mov $r1, %rax | |
510 | imull $n0, %eax | |
511 | and \$0x1fffffff, %eax | |
512 | ||
513 | mov \$9, $i | |
514 | jmp .LOOP_REDUCE_1024 | |
515 | ||
516 | .align 32 | |
517 | .LOOP_REDUCE_1024: | |
518 | vmovd %eax, $Y2 | |
519 | vpbroadcastq $Y2, $Y2 | |
520 | ||
521 | vpmuludq 32*1-128($np), $Y1, $TEMP0 | |
522 | mov %rax, %rdx | |
523 | imulq -128($np), %rax | |
524 | vpaddq $TEMP0, $ACC1, $ACC1 | |
0b4bb91d | 525 | add %rax, $r1 |
fa104be3 | 526 | vpmuludq 32*2-128($np), $Y1, $TEMP1 |
527 | mov %rdx, %rax |
528 | imulq 8-128($np), %rax | |
529 | vpaddq $TEMP1, $ACC2, $ACC2 | |
530 | vpmuludq 32*3-128($np), $Y1, $TEMP2 | |
fa104be3 | 531 | .byte 0x67 |
0b4bb91d | 532 | add %rax, $r2 |
fa104be3 | 533 | .byte 0x67 |
534 | mov %rdx, %rax |
535 | imulq 16-128($np), %rax | |
536 | shr \$29, $r1 | |
537 | vpaddq $TEMP2, $ACC3, $ACC3 | |
538 | vpmuludq 32*4-128($np), $Y1, $TEMP0 | |
539 | add %rax, $r3 | |
540 | add $r1, $r2 | |
541 | vpaddq $TEMP0, $ACC4, $ACC4 | |
542 | vpmuludq 32*5-128($np), $Y1, $TEMP1 | |
543 | mov $r2, %rax | |
544 | imull $n0, %eax | |
545 | vpaddq $TEMP1, $ACC5, $ACC5 | |
546 | vpmuludq 32*6-128($np), $Y1, $TEMP2 | |
547 | and \$0x1fffffff, %eax | |
548 | vpaddq $TEMP2, $ACC6, $ACC6 | |
549 | vpmuludq 32*7-128($np), $Y1, $TEMP0 | |
550 | vpaddq $TEMP0, $ACC7, $ACC7 | |
551 | vpmuludq 32*8-128($np), $Y1, $TEMP1 | |
552 | vmovd %eax, $Y1 | |
fa104be3 | 553 | #vmovdqu 32*1-8-128($np), $TEMP2 # moved below |
0b4bb91d | 554 | vpaddq $TEMP1, $ACC8, $ACC8 |
fa104be3 | 555 | #vmovdqu 32*2-8-128($np), $TEMP0 # moved below |
556 | vpbroadcastq $Y1, $Y1 |
557 | ||
fa104be3 | 558 | vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above |
559 | vmovdqu 32*3-8-128($np), $TEMP1 |
560 | mov %rax, %rdx | |
561 | imulq -128($np), %rax | |
562 | vpaddq $TEMP2, $ACC1, $ACC1 | |
fa104be3 | 563 | vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above |
564 | vmovdqu 32*4-8-128($np), $TEMP2 |
565 | add %rax, $r2 | |
566 | mov %rdx, %rax | |
567 | imulq 8-128($np), %rax | |
568 | vpaddq $TEMP0, $ACC2, $ACC2 | |
569 | add $r3, %rax | |
570 | shr \$29, $r2 | |
571 | vpmuludq $Y2, $TEMP1, $TEMP1 | |
572 | vmovdqu 32*5-8-128($np), $TEMP0 | |
573 | add $r2, %rax | |
574 | vpaddq $TEMP1, $ACC3, $ACC3 | |
575 | vpmuludq $Y2, $TEMP2, $TEMP2 | |
576 | vmovdqu 32*6-8-128($np), $TEMP1 | |
fa104be3 | 577 | .byte 0x67 |
578 | mov %rax, $r3 |
579 | imull $n0, %eax | |
580 | vpaddq $TEMP2, $ACC4, $ACC4 | |
581 | vpmuludq $Y2, $TEMP0, $TEMP0 | |
fa104be3 | 582 | .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 |
583 | and \$0x1fffffff, %eax |
584 | vpaddq $TEMP0, $ACC5, $ACC5 | |
585 | vpmuludq $Y2, $TEMP1, $TEMP1 | |
586 | vmovdqu 32*8-8-128($np), $TEMP0 | |
587 | vpaddq $TEMP1, $ACC6, $ACC6 | |
588 | vpmuludq $Y2, $TEMP2, $TEMP2 | |
589 | vmovdqu 32*9-8-128($np), $ACC9 | |
590 | vmovd %eax, $ACC0 # borrow ACC0 for Y2 | |
591 | imulq -128($np), %rax | |
592 | vpaddq $TEMP2, $ACC7, $ACC7 | |
593 | vpmuludq $Y2, $TEMP0, $TEMP0 | |
594 | vmovdqu 32*1-16-128($np), $TEMP1 | |
595 | vpbroadcastq $ACC0, $ACC0 | |
596 | vpaddq $TEMP0, $ACC8, $ACC8 | |
597 | vpmuludq $Y2, $ACC9, $ACC9 | |
598 | vmovdqu 32*2-16-128($np), $TEMP2 | |
599 | add %rax, $r3 | |
600 | ||
601 | ___ | |
602 | ($ACC0,$Y2)=($Y2,$ACC0); | |
603 | $code.=<<___; | |
604 | vmovdqu 32*1-24-128($np), $ACC0 | |
605 | vpmuludq $Y1, $TEMP1, $TEMP1 | |
606 | vmovdqu 32*3-16-128($np), $TEMP0 | |
607 | vpaddq $TEMP1, $ACC1, $ACC1 | |
608 | vpmuludq $Y2, $ACC0, $ACC0 | |
609 | vpmuludq $Y1, $TEMP2, $TEMP2 | |
fa104be3 | 610 | .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 |
611 | vpaddq $ACC1, $ACC0, $ACC0 |
612 | vpaddq $TEMP2, $ACC2, $ACC2 | |
613 | vpmuludq $Y1, $TEMP0, $TEMP0 | |
614 | vmovdqu 32*5-16-128($np), $TEMP2 | |
fa104be3 | 615 | .byte 0x67 |
616 | vmovq $ACC0, %rax |
617 | vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 | |
618 | vpaddq $TEMP0, $ACC3, $ACC3 | |
619 | vpmuludq $Y1, $TEMP1, $TEMP1 | |
620 | vmovdqu 32*6-16-128($np), $TEMP0 | |
621 | vpaddq $TEMP1, $ACC4, $ACC4 | |
622 | vpmuludq $Y1, $TEMP2, $TEMP2 | |
623 | vmovdqu 32*7-16-128($np), $TEMP1 | |
624 | vpaddq $TEMP2, $ACC5, $ACC5 | |
625 | vpmuludq $Y1, $TEMP0, $TEMP0 | |
626 | vmovdqu 32*8-16-128($np), $TEMP2 | |
627 | vpaddq $TEMP0, $ACC6, $ACC6 | |
628 | vpmuludq $Y1, $TEMP1, $TEMP1 | |
0b4bb91d | 629 | shr \$29, $r3 |
630 | vmovdqu 32*9-16-128($np), $TEMP0 |
631 | add $r3, %rax | |
632 | vpaddq $TEMP1, $ACC7, $ACC7 |
633 | vpmuludq $Y1, $TEMP2, $TEMP2 | |
fa104be3 | 634 | #vmovdqu 32*2-24-128($np), $TEMP1 # moved below |
635 | mov %rax, $r0 |
636 | imull $n0, %eax | |
637 | vpaddq $TEMP2, $ACC8, $ACC8 | |
638 | vpmuludq $Y1, $TEMP0, $TEMP0 | |
639 | and \$0x1fffffff, %eax | |
640 | vmovd %eax, $Y1 | |
641 | vmovdqu 32*3-24-128($np), $TEMP2 | |
fa104be3 | 642 | .byte 0x67 |
643 | vpaddq $TEMP0, $ACC9, $ACC9 |
644 | vpbroadcastq $Y1, $Y1 | |
645 | ||
fa104be3 | 646 | vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above |
647 | vmovdqu 32*4-24-128($np), $TEMP0 |
648 | mov %rax, %rdx | |
649 | imulq -128($np), %rax | |
650 | mov 8(%rsp), $r1 | |
651 | vpaddq $TEMP1, $ACC2, $ACC1 | |
652 | vpmuludq $Y2, $TEMP2, $TEMP2 | |
653 | vmovdqu 32*5-24-128($np), $TEMP1 | |
654 | add %rax, $r0 | |
655 | mov %rdx, %rax | |
656 | imulq 8-128($np), %rax | |
fa104be3 | 657 | .byte 0x67 |
658 | shr \$29, $r0 |
659 | mov 16(%rsp), $r2 | |
660 | vpaddq $TEMP2, $ACC3, $ACC2 | |
661 | vpmuludq $Y2, $TEMP0, $TEMP0 | |
662 | vmovdqu 32*6-24-128($np), $TEMP2 | |
663 | add %rax, $r1 | |
664 | mov %rdx, %rax | |
665 | imulq 16-128($np), %rax | |
666 | vpaddq $TEMP0, $ACC4, $ACC3 | |
667 | vpmuludq $Y2, $TEMP1, $TEMP1 | |
668 | vmovdqu 32*7-24-128($np), $TEMP0 | |
669 | imulq 24-128($np), %rdx # future $r3 | |
670 | add %rax, $r2 | |
671 | lea ($r0,$r1), %rax | |
672 | vpaddq $TEMP1, $ACC5, $ACC4 | |
673 | vpmuludq $Y2, $TEMP2, $TEMP2 | |
674 | vmovdqu 32*8-24-128($np), $TEMP1 | |
675 | mov %rax, $r1 | |
676 | imull $n0, %eax | |
0b4bb91d | 677 | vpmuludq $Y2, $TEMP0, $TEMP0 |
fa104be3 | 678 | vpaddq $TEMP2, $ACC6, $ACC5 |
679 | vmovdqu 32*9-24-128($np), $TEMP2 |
680 | and \$0x1fffffff, %eax | |
681 | vpaddq $TEMP0, $ACC7, $ACC6 | |
682 | vpmuludq $Y2, $TEMP1, $TEMP1 | |
683 | add 24(%rsp), %rdx | |
684 | vpaddq $TEMP1, $ACC8, $ACC7 | |
685 | vpmuludq $Y2, $TEMP2, $TEMP2 | |
686 | vpaddq $TEMP2, $ACC9, $ACC8 | |
687 | vmovq $r3, $ACC9 | |
688 | mov %rdx, $r3 | |
689 | ||
690 | dec $i | |
691 | jnz .LOOP_REDUCE_1024 | |
692 | ___ | |
693 | ($ACC0,$Y2)=($Y2,$ACC0); | |
694 | $code.=<<___; | |
695 | lea 448(%rsp), $tp1 # size optimization | |
696 | vpaddq $ACC9, $Y2, $ACC0 | |
697 | vpxor $ZERO, $ZERO, $ZERO | |
698 | ||
699 | vpaddq 32*9-192($tp0), $ACC0, $ACC0 | |
700 | vpaddq 32*10-448($tp1), $ACC1, $ACC1 | |
701 | vpaddq 32*11-448($tp1), $ACC2, $ACC2 | |
702 | vpaddq 32*12-448($tp1), $ACC3, $ACC3 | |
703 | vpaddq 32*13-448($tp1), $ACC4, $ACC4 | |
704 | vpaddq 32*14-448($tp1), $ACC5, $ACC5 | |
705 | vpaddq 32*15-448($tp1), $ACC6, $ACC6 | |
706 | vpaddq 32*16-448($tp1), $ACC7, $ACC7 | |
707 | vpaddq 32*17-448($tp1), $ACC8, $ACC8 | |
708 | ||
709 | vpsrlq \$29, $ACC0, $TEMP1 | |
710 | vpand $AND_MASK, $ACC0, $ACC0 | |
711 | vpsrlq \$29, $ACC1, $TEMP2 | |
712 | vpand $AND_MASK, $ACC1, $ACC1 | |
713 | vpsrlq \$29, $ACC2, $TEMP3 | |
714 | vpermq \$0x93, $TEMP1, $TEMP1 | |
715 | vpand $AND_MASK, $ACC2, $ACC2 | |
716 | vpsrlq \$29, $ACC3, $TEMP4 | |
717 | vpermq \$0x93, $TEMP2, $TEMP2 | |
718 | vpand $AND_MASK, $ACC3, $ACC3 | |
719 | vpermq \$0x93, $TEMP3, $TEMP3 | |
720 | ||
721 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
722 | vpermq \$0x93, $TEMP4, $TEMP4 | |
723 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
724 | vpaddq $TEMP0, $ACC0, $ACC0 | |
725 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
726 | vpaddq $TEMP1, $ACC1, $ACC1 | |
727 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
728 | vpaddq $TEMP2, $ACC2, $ACC2 | |
729 | vpblendd \$3, $TEMP4, $ZERO, $TEMP4 | |
730 | vpaddq $TEMP3, $ACC3, $ACC3 | |
731 | vpaddq $TEMP4, $ACC4, $ACC4 | |
732 | ||
733 | vpsrlq \$29, $ACC0, $TEMP1 | |
734 | vpand $AND_MASK, $ACC0, $ACC0 | |
735 | vpsrlq \$29, $ACC1, $TEMP2 | |
736 | vpand $AND_MASK, $ACC1, $ACC1 | |
737 | vpsrlq \$29, $ACC2, $TEMP3 | |
738 | vpermq \$0x93, $TEMP1, $TEMP1 | |
739 | vpand $AND_MASK, $ACC2, $ACC2 | |
740 | vpsrlq \$29, $ACC3, $TEMP4 | |
741 | vpermq \$0x93, $TEMP2, $TEMP2 | |
742 | vpand $AND_MASK, $ACC3, $ACC3 | |
743 | vpermq \$0x93, $TEMP3, $TEMP3 | |
744 | ||
745 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
746 | vpermq \$0x93, $TEMP4, $TEMP4 | |
747 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
748 | vpaddq $TEMP0, $ACC0, $ACC0 | |
749 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
750 | vpaddq $TEMP1, $ACC1, $ACC1 | |
751 | vmovdqu $ACC0, 32*0-128($rp) | |
752 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
753 | vpaddq $TEMP2, $ACC2, $ACC2 | |
754 | vmovdqu $ACC1, 32*1-128($rp) | |
755 | vpblendd \$3, $TEMP4, $ZERO, $TEMP4 | |
756 | vpaddq $TEMP3, $ACC3, $ACC3 | |
757 | vmovdqu $ACC2, 32*2-128($rp) | |
758 | vpaddq $TEMP4, $ACC4, $ACC4 | |
759 | vmovdqu $ACC3, 32*3-128($rp) | |
760 | ___ | |
761 | $TEMP5=$ACC0; | |
762 | $code.=<<___; | |
763 | vpsrlq \$29, $ACC4, $TEMP1 | |
764 | vpand $AND_MASK, $ACC4, $ACC4 | |
765 | vpsrlq \$29, $ACC5, $TEMP2 | |
766 | vpand $AND_MASK, $ACC5, $ACC5 | |
767 | vpsrlq \$29, $ACC6, $TEMP3 | |
768 | vpermq \$0x93, $TEMP1, $TEMP1 | |
769 | vpand $AND_MASK, $ACC6, $ACC6 | |
770 | vpsrlq \$29, $ACC7, $TEMP4 | |
771 | vpermq \$0x93, $TEMP2, $TEMP2 | |
772 | vpand $AND_MASK, $ACC7, $ACC7 | |
773 | vpsrlq \$29, $ACC8, $TEMP5 | |
774 | vpermq \$0x93, $TEMP3, $TEMP3 | |
775 | vpand $AND_MASK, $ACC8, $ACC8 | |
776 | vpermq \$0x93, $TEMP4, $TEMP4 | |
777 | ||
778 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
779 | vpermq \$0x93, $TEMP5, $TEMP5 | |
780 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
781 | vpaddq $TEMP0, $ACC4, $ACC4 | |
782 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
783 | vpaddq $TEMP1, $ACC5, $ACC5 | |
784 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
785 | vpaddq $TEMP2, $ACC6, $ACC6 | |
786 | vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 | |
787 | vpaddq $TEMP3, $ACC7, $ACC7 | |
788 | vpaddq $TEMP4, $ACC8, $ACC8 | |
789 | ||
790 | vpsrlq \$29, $ACC4, $TEMP1 | |
791 | vpand $AND_MASK, $ACC4, $ACC4 | |
792 | vpsrlq \$29, $ACC5, $TEMP2 | |
793 | vpand $AND_MASK, $ACC5, $ACC5 | |
794 | vpsrlq \$29, $ACC6, $TEMP3 | |
795 | vpermq \$0x93, $TEMP1, $TEMP1 | |
796 | vpand $AND_MASK, $ACC6, $ACC6 | |
797 | vpsrlq \$29, $ACC7, $TEMP4 | |
798 | vpermq \$0x93, $TEMP2, $TEMP2 | |
799 | vpand $AND_MASK, $ACC7, $ACC7 | |
800 | vpsrlq \$29, $ACC8, $TEMP5 | |
801 | vpermq \$0x93, $TEMP3, $TEMP3 | |
802 | vpand $AND_MASK, $ACC8, $ACC8 | |
803 | vpermq \$0x93, $TEMP4, $TEMP4 | |
804 | ||
805 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
806 | vpermq \$0x93, $TEMP5, $TEMP5 | |
807 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
808 | vpaddq $TEMP0, $ACC4, $ACC4 | |
809 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
810 | vpaddq $TEMP1, $ACC5, $ACC5 | |
811 | vmovdqu $ACC4, 32*4-128($rp) | |
812 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
813 | vpaddq $TEMP2, $ACC6, $ACC6 | |
814 | vmovdqu $ACC5, 32*5-128($rp) | |
815 | vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 | |
816 | vpaddq $TEMP3, $ACC7, $ACC7 | |
817 | vmovdqu $ACC6, 32*6-128($rp) | |
818 | vpaddq $TEMP4, $ACC8, $ACC8 | |
819 | vmovdqu $ACC7, 32*7-128($rp) | |
820 | vmovdqu $ACC8, 32*8-128($rp) | |
821 | ||
822 | mov $rp, $ap | |
823 | dec $rep | |
824 | jne .LOOP_GRANDE_SQR_1024 | |
825 | ||
826 | vzeroall | |
827 | mov %rbp, %rax | |
828 | ___ | |
829 | $code.=<<___ if ($win64); | |
830 | movaps -0xd8(%rax),%xmm6 | |
831 | movaps -0xc8(%rax),%xmm7 | |
832 | movaps -0xb8(%rax),%xmm8 | |
833 | movaps -0xa8(%rax),%xmm9 | |
834 | movaps -0x98(%rax),%xmm10 | |
835 | movaps -0x88(%rax),%xmm11 | |
836 | movaps -0x78(%rax),%xmm12 | |
837 | movaps -0x68(%rax),%xmm13 | |
838 | movaps -0x58(%rax),%xmm14 | |
839 | movaps -0x48(%rax),%xmm15 | |
840 | ___ | |
841 | $code.=<<___; | |
842 | mov -48(%rax),%r15 | |
843 | mov -40(%rax),%r14 | |
844 | mov -32(%rax),%r13 | |
845 | mov -24(%rax),%r12 | |
846 | mov -16(%rax),%rbp | |
847 | mov -8(%rax),%rbx | |
848 | lea (%rax),%rsp # restore %rsp | |
849 | .Lsqr_1024_epilogue: | |
850 | ret | |
851 | .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 | |
852 | ___ | |
853 | } | |
854 | ||
{ # void AMM_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	# const BN_ULONG *np,
my $n0="%r8d";	# unsigned int n0);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";

# Registers that hold the broadcasted words of multiplier, currently used
my $Bi="%ymm10";
my $Yi="%ymm11";

# Helper registers
my $TEMP0=$ACC0;
my $TEMP1="%ymm12";
my $TEMP2="%ymm13";
my $ZERO="%ymm14";
my $AND_MASK="%ymm15";

# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";
my $tmp="%r15";

$bp="%r13";			# reassigned argument

898 | $code.=<<___; | |
899 | .globl rsaz_1024_mul_avx2 | |
900 | .type rsaz_1024_mul_avx2,\@function,5 | |
901 | .align 64 | |
902 | rsaz_1024_mul_avx2: | |
903 | lea (%rsp), %rax | |
904 | push %rbx | |
905 | push %rbp | |
906 | push %r12 | |
907 | push %r13 | |
908 | push %r14 | |
909 | push %r15 | |
910 | ___ | |
911 | $code.=<<___ if ($win64); | |
31ed9a21 | 912 | vzeroupper |
0b4bb91d | 913 | lea -0xa8(%rsp),%rsp |
914 | vmovaps %xmm6,-0xd8(%rax) |
915 | vmovaps %xmm7,-0xc8(%rax) | |
916 | vmovaps %xmm8,-0xb8(%rax) | |
917 | vmovaps %xmm9,-0xa8(%rax) | |
918 | vmovaps %xmm10,-0x98(%rax) | |
919 | vmovaps %xmm11,-0x88(%rax) | |
920 | vmovaps %xmm12,-0x78(%rax) | |
921 | vmovaps %xmm13,-0x68(%rax) | |
922 | vmovaps %xmm14,-0x58(%rax) | |
923 | vmovaps %xmm15,-0x48(%rax) | |
924 | .Lmul_1024_body: |
925 | ___ | |
926 | $code.=<<___; | |
927 | mov %rax,%rbp | |
928 | vzeroall | |
929 | mov %rdx, $bp # reassigned argument | |
930 | sub \$64,%rsp | |
931 | ||
932 | # unaligned 256-bit load that crosses page boundary can | |
933 | # cause severe performance degradation here, so if $ap does | |
934 | # cross page boundary, swap it with $bp [meaning that caller | |
935 | # is advised to lay down $ap and $bp next to each other, so | |
936 | # that only one can cross page boundary]. | |
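	# the shr below sets ZF when the 32*10 bytes at $ap stay within one
	# 4K page; the cmovnz pair swaps $ap and $bp only when they may not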
31ed9a21 | 937 | .byte 0x67,0x67 |
938 | mov $ap, $tmp |
939 | and \$4095, $tmp | |
940 | add \$32*10, $tmp | |
941 | shr \$12, $tmp | |
942 | mov $ap, $tmp | |
943 | cmovnz $bp, $ap | |
944 | cmovnz $tmp, $bp | |
945 | ||
946 | mov $np, $tmp | |
947 | sub \$-128,$ap # size optimization | |
948 | sub \$-128,$np | |
949 | sub \$-128,$rp | |
950 | ||
951 | and \$4095, $tmp # see if $np crosses page | |
952 | add \$32*10, $tmp | |
31ed9a21 | 953 | .byte 0x67,0x67 |
954 | shr \$12, $tmp |
955 | jz .Lmul_1024_no_n_copy | |
956 | ||
957 | # unaligned 256-bit load that crosses page boundary can | |
958 | # cause severe performance degradation here, so if $np does | |
959 | # cross page boundary, copy it to stack and make sure stack | |
960 | # frame doesn't... | |
961 | sub \$32*10,%rsp | |
962 | vmovdqu 32*0-128($np), $ACC0 | |
963 | and \$-512, %rsp | |
964 | vmovdqu 32*1-128($np), $ACC1 | |
965 | vmovdqu 32*2-128($np), $ACC2 | |
966 | vmovdqu 32*3-128($np), $ACC3 | |
967 | vmovdqu 32*4-128($np), $ACC4 | |
968 | vmovdqu 32*5-128($np), $ACC5 | |
969 | vmovdqu 32*6-128($np), $ACC6 | |
970 | vmovdqu 32*7-128($np), $ACC7 | |
971 | vmovdqu 32*8-128($np), $ACC8 | |
972 | lea 64+128(%rsp),$np | |
973 | vmovdqu $ACC0, 32*0-128($np) | |
974 | vpxor $ACC0, $ACC0, $ACC0 | |
975 | vmovdqu $ACC1, 32*1-128($np) | |
976 | vpxor $ACC1, $ACC1, $ACC1 | |
977 | vmovdqu $ACC2, 32*2-128($np) | |
978 | vpxor $ACC2, $ACC2, $ACC2 | |
979 | vmovdqu $ACC3, 32*3-128($np) | |
980 | vpxor $ACC3, $ACC3, $ACC3 | |
981 | vmovdqu $ACC4, 32*4-128($np) | |
982 | vpxor $ACC4, $ACC4, $ACC4 | |
983 | vmovdqu $ACC5, 32*5-128($np) | |
984 | vpxor $ACC5, $ACC5, $ACC5 | |
985 | vmovdqu $ACC6, 32*6-128($np) | |
986 | vpxor $ACC6, $ACC6, $ACC6 | |
987 | vmovdqu $ACC7, 32*7-128($np) | |
988 | vpxor $ACC7, $ACC7, $ACC7 | |
989 | vmovdqu $ACC8, 32*8-128($np) | |
990 | vmovdqa $ACC0, $ACC8 | |
991 | vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall | |
992 | .Lmul_1024_no_n_copy: | |
993 | and \$-64,%rsp | |
994 | ||
995 | mov ($bp), %rbx | |
996 | vpbroadcastq ($bp), $Bi | |
997 | vmovdqu $ACC0, (%rsp) # clear top of stack | |
998 | xor $r0, $r0 | |
31ed9a21 | 999 | .byte 0x67 |
1000 | xor $r1, $r1 |
1001 | xor $r2, $r2 | |
1002 | xor $r3, $r3 | |
1003 | ||
1004 | vmovdqu .Land_mask(%rip), $AND_MASK | |
1005 | mov \$9, $i | |
406d4af0 | 1006 | vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall |
1007 | jmp .Loop_mul_1024 |
1008 | ||
1009 | .align 32 | |
1010 | .Loop_mul_1024: | |
1011 | vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) | |
1012 | mov %rbx, %rax | |
1013 | imulq -128($ap), %rax | |
1014 | add $r0, %rax | |
1015 | mov %rbx, $r1 | |
1016 | imulq 8-128($ap), $r1 | |
1017 | add 8(%rsp), $r1 | |
1018 | ||
1019 | mov %rax, $r0 | |
1020 | imull $n0, %eax | |
1021 | and \$0x1fffffff, %eax | |
1022 | ||
1023 | mov %rbx, $r2 | |
1024 | imulq 16-128($ap), $r2 | |
1025 | add 16(%rsp), $r2 | |
1026 | ||
1027 | mov %rbx, $r3 | |
1028 | imulq 24-128($ap), $r3 | |
1029 | add 24(%rsp), $r3 | |
1030 | vpmuludq 32*1-128($ap),$Bi,$TEMP0 | |
1031 | vmovd %eax, $Yi | |
1032 | vpaddq $TEMP0,$ACC1,$ACC1 | |
1033 | vpmuludq 32*2-128($ap),$Bi,$TEMP1 | |
1034 | vpbroadcastq $Yi, $Yi | |
1035 | vpaddq $TEMP1,$ACC2,$ACC2 | |
1036 | vpmuludq 32*3-128($ap),$Bi,$TEMP2 | |
1037 | vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 | |
1038 | vpaddq $TEMP2,$ACC3,$ACC3 | |
1039 | vpmuludq 32*4-128($ap),$Bi,$TEMP0 | |
1040 | vpaddq $TEMP0,$ACC4,$ACC4 | |
1041 | vpmuludq 32*5-128($ap),$Bi,$TEMP1 | |
1042 | vpaddq $TEMP1,$ACC5,$ACC5 | |
1043 | vpmuludq 32*6-128($ap),$Bi,$TEMP2 | |
1044 | vpaddq $TEMP2,$ACC6,$ACC6 | |
1045 | vpmuludq 32*7-128($ap),$Bi,$TEMP0 | |
1046 | vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 | |
1047 | vpaddq $TEMP0,$ACC7,$ACC7 | |
1048 | vpmuludq 32*8-128($ap),$Bi,$TEMP1 | |
1049 | vpbroadcastq 8($bp), $Bi | |
1050 | vpaddq $TEMP1,$ACC8,$ACC8 | |
1051 | ||
1052 | mov %rax,%rdx | |
1053 | imulq -128($np),%rax | |
1054 | add %rax,$r0 | |
1055 | mov %rdx,%rax | |
1056 | imulq 8-128($np),%rax | |
1057 | add %rax,$r1 | |
1058 | mov %rdx,%rax | |
1059 | imulq 16-128($np),%rax | |
1060 | add %rax,$r2 | |
1061 | shr \$29, $r0 | |
1062 | imulq 24-128($np),%rdx | |
1063 | add %rdx,$r3 | |
1064 | add $r0, $r1 | |
1065 | ||
1066 | vpmuludq 32*1-128($np),$Yi,$TEMP2 | |
1067 | vmovq $Bi, %rbx | |
1068 | vpaddq $TEMP2,$ACC1,$ACC1 | |
1069 | vpmuludq 32*2-128($np),$Yi,$TEMP0 | |
1070 | vpaddq $TEMP0,$ACC2,$ACC2 | |
1071 | vpmuludq 32*3-128($np),$Yi,$TEMP1 | |
1072 | vpaddq $TEMP1,$ACC3,$ACC3 | |
1073 | vpmuludq 32*4-128($np),$Yi,$TEMP2 | |
1074 | vpaddq $TEMP2,$ACC4,$ACC4 | |
1075 | vpmuludq 32*5-128($np),$Yi,$TEMP0 | |
1076 | vpaddq $TEMP0,$ACC5,$ACC5 | |
1077 | vpmuludq 32*6-128($np),$Yi,$TEMP1 | |
1078 | vpaddq $TEMP1,$ACC6,$ACC6 | |
1079 | vpmuludq 32*7-128($np),$Yi,$TEMP2 | |
1080 | vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 | |
1081 | vpaddq $TEMP2,$ACC7,$ACC7 | |
1082 | vpmuludq 32*8-128($np),$Yi,$TEMP0 | |
1083 | vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 | |
1084 | vpaddq $TEMP0,$ACC8,$ACC8 | |
1085 | ||
1086 | mov %rbx, %rax | |
1087 | imulq -128($ap),%rax | |
1088 | add %rax,$r1 | |
1089 | vmovdqu -8+32*1-128($ap),$TEMP1 | |
1090 | mov %rbx, %rax | |
1091 | imulq 8-128($ap),%rax | |
1092 | add %rax,$r2 | |
1093 | vmovdqu -8+32*2-128($ap),$TEMP2 | |
1094 | ||
1095 | mov $r1, %rax | |
1096 | imull $n0, %eax | |
1097 | and \$0x1fffffff, %eax | |
1098 | ||
1099 | imulq 16-128($ap),%rbx | |
1100 | add %rbx,$r3 | |
1101 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1102 | vmovd %eax, $Yi | |
1103 | vmovdqu -8+32*3-128($ap),$TEMP0 | |
1104 | vpaddq $TEMP1,$ACC1,$ACC1 | |
1105 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1106 | vpbroadcastq $Yi, $Yi | |
1107 | vmovdqu -8+32*4-128($ap),$TEMP1 | |
1108 | vpaddq $TEMP2,$ACC2,$ACC2 | |
1109 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1110 | vmovdqu -8+32*5-128($ap),$TEMP2 | |
1111 | vpaddq $TEMP0,$ACC3,$ACC3 | |
1112 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1113 | vmovdqu -8+32*6-128($ap),$TEMP0 | |
1114 | vpaddq $TEMP1,$ACC4,$ACC4 | |
1115 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1116 | vmovdqu -8+32*7-128($ap),$TEMP1 | |
1117 | vpaddq $TEMP2,$ACC5,$ACC5 | |
1118 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1119 | vmovdqu -8+32*8-128($ap),$TEMP2 | |
1120 | vpaddq $TEMP0,$ACC6,$ACC6 | |
1121 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1122 | vmovdqu -8+32*9-128($ap),$ACC9 | |
1123 | vpaddq $TEMP1,$ACC7,$ACC7 | |
1124 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1125 | vpaddq $TEMP2,$ACC8,$ACC8 | |
1126 | vpmuludq $Bi,$ACC9,$ACC9 | |
1127 | vpbroadcastq 16($bp), $Bi | |
1128 | ||
1129 | mov %rax,%rdx | |
1130 | imulq -128($np),%rax | |
1131 | add %rax,$r1 | |
1132 | vmovdqu -8+32*1-128($np),$TEMP0 | |
1133 | mov %rdx,%rax | |
1134 | imulq 8-128($np),%rax | |
1135 | add %rax,$r2 | |
1136 | vmovdqu -8+32*2-128($np),$TEMP1 | |
1137 | shr \$29, $r1 | |
1138 | imulq 16-128($np),%rdx | |
1139 | add %rdx,$r3 | |
1140 | add $r1, $r2 | |
1141 | ||
1142 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1143 | vmovq $Bi, %rbx | |
1144 | vmovdqu -8+32*3-128($np),$TEMP2 | |
1145 | vpaddq $TEMP0,$ACC1,$ACC1 | |
1146 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1147 | vmovdqu -8+32*4-128($np),$TEMP0 | |
1148 | vpaddq $TEMP1,$ACC2,$ACC2 | |
1149 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1150 | vmovdqu -8+32*5-128($np),$TEMP1 | |
1151 | vpaddq $TEMP2,$ACC3,$ACC3 | |
1152 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1153 | vmovdqu -8+32*6-128($np),$TEMP2 | |
1154 | vpaddq $TEMP0,$ACC4,$ACC4 | |
1155 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1156 | vmovdqu -8+32*7-128($np),$TEMP0 | |
1157 | vpaddq $TEMP1,$ACC5,$ACC5 | |
1158 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1159 | vmovdqu -8+32*8-128($np),$TEMP1 | |
1160 | vpaddq $TEMP2,$ACC6,$ACC6 | |
1161 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1162 | vmovdqu -8+32*9-128($np),$TEMP2 | |
1163 | vpaddq $TEMP0,$ACC7,$ACC7 | |
1164 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1165 | vpaddq $TEMP1,$ACC8,$ACC8 | |
1166 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1167 | vpaddq $TEMP2,$ACC9,$ACC9 | |
1168 | ||
1169 | vmovdqu -16+32*1-128($ap),$TEMP0 | |
1170 | mov %rbx,%rax | |
1171 | imulq -128($ap),%rax | |
1172 | add $r2,%rax | |
1173 | ||
1174 | vmovdqu -16+32*2-128($ap),$TEMP1 | |
1175 | mov %rax,$r2 | |
1176 | imull $n0, %eax | |
1177 | and \$0x1fffffff, %eax | |
1178 | ||
1179 | imulq 8-128($ap),%rbx | |
1180 | add %rbx,$r3 | |
1181 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1182 | vmovd %eax, $Yi | |
1183 | vmovdqu -16+32*3-128($ap),$TEMP2 | |
1184 | vpaddq $TEMP0,$ACC1,$ACC1 | |
1185 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1186 | vpbroadcastq $Yi, $Yi | |
1187 | vmovdqu -16+32*4-128($ap),$TEMP0 | |
1188 | vpaddq $TEMP1,$ACC2,$ACC2 | |
1189 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1190 | vmovdqu -16+32*5-128($ap),$TEMP1 | |
1191 | vpaddq $TEMP2,$ACC3,$ACC3 | |
1192 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1193 | vmovdqu -16+32*6-128($ap),$TEMP2 | |
1194 | vpaddq $TEMP0,$ACC4,$ACC4 | |
1195 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1196 | vmovdqu -16+32*7-128($ap),$TEMP0 | |
1197 | vpaddq $TEMP1,$ACC5,$ACC5 | |
1198 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1199 | vmovdqu -16+32*8-128($ap),$TEMP1 | |
1200 | vpaddq $TEMP2,$ACC6,$ACC6 | |
1201 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1202 | vmovdqu -16+32*9-128($ap),$TEMP2 | |
1203 | vpaddq $TEMP0,$ACC7,$ACC7 | |
1204 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1205 | vpaddq $TEMP1,$ACC8,$ACC8 | |
1206 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1207 | vpbroadcastq 24($bp), $Bi | |
1208 | vpaddq $TEMP2,$ACC9,$ACC9 | |
1209 | ||
1210 | vmovdqu -16+32*1-128($np),$TEMP0 | |
1211 | mov %rax,%rdx | |
1212 | imulq -128($np),%rax | |
1213 | add %rax,$r2 | |
1214 | vmovdqu -16+32*2-128($np),$TEMP1 | |
1215 | imulq 8-128($np),%rdx | |
1216 | add %rdx,$r3 | |
1217 | shr \$29, $r2 | |
1218 | ||
1219 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1220 | vmovq $Bi, %rbx | |
1221 | vmovdqu -16+32*3-128($np),$TEMP2 | |
1222 | vpaddq $TEMP0,$ACC1,$ACC1 | |
1223 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1224 | vmovdqu -16+32*4-128($np),$TEMP0 | |
1225 | vpaddq $TEMP1,$ACC2,$ACC2 | |
1226 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1227 | vmovdqu -16+32*5-128($np),$TEMP1 | |
1228 | vpaddq $TEMP2,$ACC3,$ACC3 | |
1229 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1230 | vmovdqu -16+32*6-128($np),$TEMP2 | |
1231 | vpaddq $TEMP0,$ACC4,$ACC4 | |
1232 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1233 | vmovdqu -16+32*7-128($np),$TEMP0 | |
1234 | vpaddq $TEMP1,$ACC5,$ACC5 | |
1235 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1236 | vmovdqu -16+32*8-128($np),$TEMP1 | |
1237 | vpaddq $TEMP2,$ACC6,$ACC6 | |
1238 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1239 | vmovdqu -16+32*9-128($np),$TEMP2 | |
1240 | vpaddq $TEMP0,$ACC7,$ACC7 | |
1241 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1242 | vmovdqu -24+32*1-128($ap),$TEMP0 | |
1243 | vpaddq $TEMP1,$ACC8,$ACC8 | |
1244 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1245 | vmovdqu -24+32*2-128($ap),$TEMP1 | |
1246 | vpaddq $TEMP2,$ACC9,$ACC9 | |
1247 | ||
1248 | add $r2, $r3 | |
1249 | imulq -128($ap),%rbx | |
1250 | add %rbx,$r3 | |
1251 | ||
1252 | mov $r3, %rax | |
1253 | imull $n0, %eax | |
1254 | and \$0x1fffffff, %eax | |
1255 | ||
1256 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1257 | vmovd %eax, $Yi | |
1258 | vmovdqu -24+32*3-128($ap),$TEMP2 | |
1259 | vpaddq $TEMP0,$ACC1,$ACC1 | |
1260 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1261 | vpbroadcastq $Yi, $Yi | |
1262 | vmovdqu -24+32*4-128($ap),$TEMP0 | |
1263 | vpaddq $TEMP1,$ACC2,$ACC2 | |
1264 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1265 | vmovdqu -24+32*5-128($ap),$TEMP1 | |
1266 | vpaddq $TEMP2,$ACC3,$ACC3 | |
1267 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1268 | vmovdqu -24+32*6-128($ap),$TEMP2 | |
1269 | vpaddq $TEMP0,$ACC4,$ACC4 | |
1270 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1271 | vmovdqu -24+32*7-128($ap),$TEMP0 | |
1272 | vpaddq $TEMP1,$ACC5,$ACC5 | |
1273 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1274 | vmovdqu -24+32*8-128($ap),$TEMP1 | |
1275 | vpaddq $TEMP2,$ACC6,$ACC6 | |
1276 | vpmuludq $Bi,$TEMP0,$TEMP0 | |
1277 | vmovdqu -24+32*9-128($ap),$TEMP2 | |
1278 | vpaddq $TEMP0,$ACC7,$ACC7 | |
1279 | vpmuludq $Bi,$TEMP1,$TEMP1 | |
1280 | vpaddq $TEMP1,$ACC8,$ACC8 | |
1281 | vpmuludq $Bi,$TEMP2,$TEMP2 | |
1282 | vpbroadcastq 32($bp), $Bi | |
1283 | vpaddq $TEMP2,$ACC9,$ACC9 | |
1284 | add \$32, $bp # $bp++ | |
1285 | ||
1286 | vmovdqu -24+32*1-128($np),$TEMP0 | |
1287 | imulq -128($np),%rax | |
1288 | add %rax,$r3 | |
1289 | shr \$29, $r3 | |
1290 | ||
1291 | vmovdqu -24+32*2-128($np),$TEMP1 | |
1292 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1293 | vmovq $Bi, %rbx | |
1294 | vmovdqu -24+32*3-128($np),$TEMP2 | |
1295 | vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 | |
1296 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1297 | vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 | |
1298 | vpaddq $TEMP1,$ACC2,$ACC1 | |
1299 | vmovdqu -24+32*4-128($np),$TEMP0 | |
1300 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1301 | vmovdqu -24+32*5-128($np),$TEMP1 | |
1302 | vpaddq $TEMP2,$ACC3,$ACC2 | |
1303 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1304 | vmovdqu -24+32*6-128($np),$TEMP2 | |
1305 | vpaddq $TEMP0,$ACC4,$ACC3 | |
1306 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1307 | vmovdqu -24+32*7-128($np),$TEMP0 | |
1308 | vpaddq $TEMP1,$ACC5,$ACC4 | |
1309 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1310 | vmovdqu -24+32*8-128($np),$TEMP1 | |
1311 | vpaddq $TEMP2,$ACC6,$ACC5 | |
1312 | vpmuludq $Yi,$TEMP0,$TEMP0 | |
1313 | vmovdqu -24+32*9-128($np),$TEMP2 | |
1314 | mov $r3, $r0 | |
1315 | vpaddq $TEMP0,$ACC7,$ACC6 | |
1316 | vpmuludq $Yi,$TEMP1,$TEMP1 | |
1317 | add (%rsp), $r0 | |
1318 | vpaddq $TEMP1,$ACC8,$ACC7 | |
1319 | vpmuludq $Yi,$TEMP2,$TEMP2 | |
1320 | vmovq $r3, $TEMP1 | |
1321 | vpaddq $TEMP2,$ACC9,$ACC8 | |
1322 | ||
1323 | dec $i | |
1324 | jnz .Loop_mul_1024 | |
1325 | ___ | |
1326 | ||
# (*)	The original implementation corrected ACC1-ACC3 for overflow
#	only after 7 loop runs, i.e. after 28 iterations or 56 additions.
#	But as we underutilize resources, it's possible to correct in
#	each iteration with marginal performance loss; and since we
#	do it in each iteration, we can correct fewer digits and
#	avoid performance penalties completely. Also note that we
#	correct only three digits out of four. This works because the
#	most significant digit is subjected to fewer additions.

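# As a rough bound (illustrative note): every digit stays below 2^29 (plus a
# little carry slack), so each partial product is below ~2^58 and a 64-bit
# lane can absorb on the order of 2^6 such additions before overflowing;
# correcting three digits every iteration keeps well inside that margin.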
$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
1340 | vpermq \$0, $AND_MASK, $AND_MASK | |
1341 | vpaddq (%rsp), $TEMP1, $ACC0 | |
1342 | ||
1343 | vpsrlq \$29, $ACC0, $TEMP1 | |
1344 | vpand $AND_MASK, $ACC0, $ACC0 | |
1345 | vpsrlq \$29, $ACC1, $TEMP2 | |
1346 | vpand $AND_MASK, $ACC1, $ACC1 | |
1347 | vpsrlq \$29, $ACC2, $TEMP3 | |
1348 | vpermq \$0x93, $TEMP1, $TEMP1 | |
1349 | vpand $AND_MASK, $ACC2, $ACC2 | |
1350 | vpsrlq \$29, $ACC3, $TEMP4 | |
1351 | vpermq \$0x93, $TEMP2, $TEMP2 | |
1352 | vpand $AND_MASK, $ACC3, $ACC3 | |
1353 | ||
1354 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
1355 | vpermq \$0x93, $TEMP3, $TEMP3 | |
1356 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
1357 | vpermq \$0x93, $TEMP4, $TEMP4 | |
1358 | vpaddq $TEMP0, $ACC0, $ACC0 | |
1359 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
1360 | vpaddq $TEMP1, $ACC1, $ACC1 | |
1361 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
1362 | vpaddq $TEMP2, $ACC2, $ACC2 | |
1363 | vpblendd \$3, $TEMP4, $ZERO, $TEMP4 | |
1364 | vpaddq $TEMP3, $ACC3, $ACC3 | |
1365 | vpaddq $TEMP4, $ACC4, $ACC4 | |
1366 | ||
1367 | vpsrlq \$29, $ACC0, $TEMP1 | |
1368 | vpand $AND_MASK, $ACC0, $ACC0 | |
1369 | vpsrlq \$29, $ACC1, $TEMP2 | |
1370 | vpand $AND_MASK, $ACC1, $ACC1 | |
1371 | vpsrlq \$29, $ACC2, $TEMP3 | |
1372 | vpermq \$0x93, $TEMP1, $TEMP1 | |
1373 | vpand $AND_MASK, $ACC2, $ACC2 | |
1374 | vpsrlq \$29, $ACC3, $TEMP4 | |
1375 | vpermq \$0x93, $TEMP2, $TEMP2 | |
1376 | vpand $AND_MASK, $ACC3, $ACC3 | |
1377 | vpermq \$0x93, $TEMP3, $TEMP3 | |
1378 | ||
1379 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
1380 | vpermq \$0x93, $TEMP4, $TEMP4 | |
1381 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
1382 | vpaddq $TEMP0, $ACC0, $ACC0 | |
1383 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
1384 | vpaddq $TEMP1, $ACC1, $ACC1 | |
1385 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
1386 | vpaddq $TEMP2, $ACC2, $ACC2 | |
1387 | vpblendd \$3, $TEMP4, $ZERO, $TEMP4 | |
1388 | vpaddq $TEMP3, $ACC3, $ACC3 | |
1389 | vpaddq $TEMP4, $ACC4, $ACC4 | |
1390 | ||
1391 | vmovdqu $ACC0, 0-128($rp) | |
1392 | vmovdqu $ACC1, 32-128($rp) | |
1393 | vmovdqu $ACC2, 64-128($rp) | |
1394 | vmovdqu $ACC3, 96-128($rp) | |
1395 | ___ | |
1396 | ||
1397 | $TEMP5=$ACC0; | |
1398 | $code.=<<___; | |
1399 | vpsrlq \$29, $ACC4, $TEMP1 | |
1400 | vpand $AND_MASK, $ACC4, $ACC4 | |
1401 | vpsrlq \$29, $ACC5, $TEMP2 | |
1402 | vpand $AND_MASK, $ACC5, $ACC5 | |
1403 | vpsrlq \$29, $ACC6, $TEMP3 | |
1404 | vpermq \$0x93, $TEMP1, $TEMP1 | |
1405 | vpand $AND_MASK, $ACC6, $ACC6 | |
1406 | vpsrlq \$29, $ACC7, $TEMP4 | |
1407 | vpermq \$0x93, $TEMP2, $TEMP2 | |
1408 | vpand $AND_MASK, $ACC7, $ACC7 | |
1409 | vpsrlq \$29, $ACC8, $TEMP5 | |
1410 | vpermq \$0x93, $TEMP3, $TEMP3 | |
1411 | vpand $AND_MASK, $ACC8, $ACC8 | |
1412 | vpermq \$0x93, $TEMP4, $TEMP4 | |
1413 | ||
1414 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
1415 | vpermq \$0x93, $TEMP5, $TEMP5 | |
1416 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
1417 | vpaddq $TEMP0, $ACC4, $ACC4 | |
1418 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
1419 | vpaddq $TEMP1, $ACC5, $ACC5 | |
1420 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
1421 | vpaddq $TEMP2, $ACC6, $ACC6 | |
1422 | vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 | |
1423 | vpaddq $TEMP3, $ACC7, $ACC7 | |
1424 | vpaddq $TEMP4, $ACC8, $ACC8 | |
1425 | ||
1426 | vpsrlq \$29, $ACC4, $TEMP1 | |
1427 | vpand $AND_MASK, $ACC4, $ACC4 | |
1428 | vpsrlq \$29, $ACC5, $TEMP2 | |
1429 | vpand $AND_MASK, $ACC5, $ACC5 | |
1430 | vpsrlq \$29, $ACC6, $TEMP3 | |
1431 | vpermq \$0x93, $TEMP1, $TEMP1 | |
1432 | vpand $AND_MASK, $ACC6, $ACC6 | |
1433 | vpsrlq \$29, $ACC7, $TEMP4 | |
1434 | vpermq \$0x93, $TEMP2, $TEMP2 | |
1435 | vpand $AND_MASK, $ACC7, $ACC7 | |
1436 | vpsrlq \$29, $ACC8, $TEMP5 | |
1437 | vpermq \$0x93, $TEMP3, $TEMP3 | |
1438 | vpand $AND_MASK, $ACC8, $ACC8 | |
1439 | vpermq \$0x93, $TEMP4, $TEMP4 | |
1440 | ||
1441 | vpblendd \$3, $ZERO, $TEMP1, $TEMP0 | |
1442 | vpermq \$0x93, $TEMP5, $TEMP5 | |
1443 | vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 | |
1444 | vpaddq $TEMP0, $ACC4, $ACC4 | |
1445 | vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 | |
1446 | vpaddq $TEMP1, $ACC5, $ACC5 | |
1447 | vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 | |
1448 | vpaddq $TEMP2, $ACC6, $ACC6 | |
1449 | vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 | |
1450 | vpaddq $TEMP3, $ACC7, $ACC7 | |
1451 | vpaddq $TEMP4, $ACC8, $ACC8 | |
1452 | ||
1453 | vmovdqu $ACC4, 128-128($rp) | |
1454 | vmovdqu $ACC5, 160-128($rp) | |
1455 | vmovdqu $ACC6, 192-128($rp) | |
1456 | vmovdqu $ACC7, 224-128($rp) | |
1457 | vmovdqu $ACC8, 256-128($rp) | |
1458 | vzeroupper | |
1459 | ||
1460 | mov %rbp, %rax | |
1461 | ___ | |
1462 | $code.=<<___ if ($win64); | |
1463 | movaps -0xd8(%rax),%xmm6 | |
1464 | movaps -0xc8(%rax),%xmm7 | |
1465 | movaps -0xb8(%rax),%xmm8 | |
1466 | movaps -0xa8(%rax),%xmm9 | |
1467 | movaps -0x98(%rax),%xmm10 | |
1468 | movaps -0x88(%rax),%xmm11 | |
1469 | movaps -0x78(%rax),%xmm12 | |
1470 | movaps -0x68(%rax),%xmm13 | |
1471 | movaps -0x58(%rax),%xmm14 | |
1472 | movaps -0x48(%rax),%xmm15 | |
1473 | ___ | |
1474 | $code.=<<___; | |
1475 | mov -48(%rax),%r15 | |
1476 | mov -40(%rax),%r14 | |
1477 | mov -32(%rax),%r13 | |
1478 | mov -24(%rax),%r12 | |
1479 | mov -16(%rax),%rbp | |
1480 | mov -8(%rax),%rbx | |
1481 | lea (%rax),%rsp # restore %rsp | |
1482 | .Lmul_1024_epilogue: | |
1483 | ret | |
1484 | .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 | |
1485 | ___ | |
1486 | } | |
1487 | { | |
1488 | my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); | |
1489 | my @T = map("%r$_",(8..11)); | |
1490 | ||
1491 | $code.=<<___; | |
1492 | .globl rsaz_1024_red2norm_avx2 | |
1493 | .type rsaz_1024_red2norm_avx2,\@abi-omnipotent | |
1494 | .align 32 | |
1495 | rsaz_1024_red2norm_avx2: | |
1496 | sub \$-128,$inp # size optimization | |
1497 | xor %rax,%rax | |
1498 | ___ | |
1499 | ||
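# Worked example (illustrative note): for output word 0 the loop below loads
# digits d0, d1, d2, accumulates d0 + (d1<<29) + (d2<<58) in %rax, and the
# bits of d2 that do not fit into 64 bits are carried over into output word 1
# by the shl/shr pair and the final adc.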
1500 | for ($j=0,$i=0; $i<16; $i++) { | |
1501 | my $k=0; | |
1502 | while (29*$j<64*($i+1)) { # load data till boundary | |
1503 | $code.=" mov `8*$j-128`($inp), @T[0]\n"; | |
1504 | $j++; $k++; push(@T,shift(@T)); | |
1505 | } | |
1506 | $l=$k; | |
1507 | while ($k>1) { # shift loaded data but last value | |
1508 | $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; | |
1509 | $k--; | |
1510 | } | |
1511 | $code.=<<___; # shift last value | |
1512 | mov @T[-1], @T[0] | |
1513 | shl \$`29*($j-1)`, @T[-1] | |
1514 | shr \$`-29*($j-1)`, @T[0] | |
1515 | ___ | |
1516 | while ($l) { # accumulate all values | |
1517 | $code.=" add @T[-$l], %rax\n"; | |
1518 | $l--; | |
1519 | } | |
1520 | $code.=<<___; | |
1521 | adc \$0, @T[0] # consume eventual carry | |
1522 | mov %rax, 8*$i($out) | |
1523 | mov @T[0], %rax | |
1524 | ___ | |
1525 | push(@T,shift(@T)); | |
1526 | } | |
1527 | $code.=<<___; | |
1528 | ret | |
1529 | .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 | |
1530 | ||
1531 | .globl rsaz_1024_norm2red_avx2 | |
1532 | .type rsaz_1024_norm2red_avx2,\@abi-omnipotent | |
1533 | .align 32 | |
1534 | rsaz_1024_norm2red_avx2: | |
1535 | sub \$-128,$out # size optimization | |
1536 | mov ($inp),@T[0] | |
1537 | mov \$0x1fffffff,%eax | |
1538 | ___ | |
1539 | for ($j=0,$i=0; $i<16; $i++) { | |
1540 | $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); | |
1541 | $code.=" xor @T[1],@T[1]\n" if ($i==15); | |
1542 | my $k=1; | |
1543 | while (29*($j+1)<64*($i+1)) { | |
1544 | $code.=<<___; | |
1545 | mov @T[0],@T[-$k] | |
1546 | shr \$`29*$j`,@T[-$k] | |
1547 | and %rax,@T[-$k] # &0x1fffffff | |
1548 | mov @T[-$k],`8*$j-128`($out) | |
1549 | ___ | |
1550 | $j++; $k++; | |
1551 | } | |
1552 | $code.=<<___; | |
1553 | shrd \$`29*$j`,@T[1],@T[0] | |
1554 | and %rax,@T[0] | |
1555 | mov @T[0],`8*$j-128`($out) | |
1556 | ___ | |
1557 | $j++; | |
1558 | push(@T,shift(@T)); | |
1559 | } | |
1560 | $code.=<<___; | |
1561 | mov @T[0],`8*$j-128`($out) # @T[0] is zero here, pad the tail | |
1562 | mov @T[0],`8*($j+1)-128`($out) | |
1563 | mov @T[0],`8*($j+2)-128`($out) | |
1564 | mov @T[0],`8*($j+3)-128`($out) | |
1565 | ret | |
1566 | .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 | |
1567 | ___ | |
1568 | } | |
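# Editor's note: below is a minimal reference sketch of the conversion the
# two routines above implement.  It is not part of the generator and is
# never called; the helper name and the Math::BigInt plumbing are the
# editor's.  The AVX2 path keeps a 1024-bit operand in redundant form,
# 29-bit limbs stored one per 64-bit slot, so that limb products fit in
# 64 bits; norm2red splits sixteen 64-bit words into 36 such limbs (plus
# four zero slots of padding), and red2norm reassembles them while
# propagating whatever carries the redundant form has accumulated.
sub rsaz_norm2red_ref {			# illustration only, never called
	use Math::BigInt;
	my @norm = @_;			# 16 64-bit words as hex strings, least significant first
	my $n = Math::BigInt->bzero();
	$n->blsft(64)->badd(Math::BigInt->new($norm[$_])) for reverse(0..$#norm);
	# peel off 29-bit limbs, least significant first
	return map { my $limb = $n->copy()->band(0x1fffffff)->numify();
		     $n->brsft(29);
		     $limb } (0..35);
}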
1569 | { | |
1570 | my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); | |
1571 | ||
1572 | $code.=<<___; | |
1573 | .globl rsaz_1024_scatter5_avx2 | |
1574 | .type rsaz_1024_scatter5_avx2,\@abi-omnipotent | |
1575 | .align 32 | |
1576 | rsaz_1024_scatter5_avx2: | |
1577 | vzeroupper | |
1578 | vmovdqu .Lscatter_permd(%rip),%ymm5 | |
1579 | shl \$4,$power | |
1580 | lea ($out,$power),$out | |
1581 | mov \$9,%eax | |
1582 | jmp .Loop_scatter_1024 | |
1583 | ||
1584 | .align 32 | |
1585 | .Loop_scatter_1024: | |
1586 | vmovdqu ($inp),%ymm0 | |
1587 | lea 32($inp),$inp | |
1588 | vpermd %ymm0,%ymm5,%ymm0 | |
1589 | vmovdqu %xmm0,($out) | |
1590 | lea 16*32($out),$out | |
1591 | dec %eax | |
1592 | jnz .Loop_scatter_1024 | |
1593 | ||
1594 | vzeroupper | |
1595 | ret | |
1596 | .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 | |
1597 | ||
1598 | .globl rsaz_1024_gather5_avx2 | |
1599 | .type rsaz_1024_gather5_avx2,\@abi-omnipotent | |
1600 | .align 32 | |
1601 | rsaz_1024_gather5_avx2: | |
d6d422e1 AP |
1602 | vzeroupper |
1603 | mov %rsp,%r11 | |
0b4bb91d AP |
1604 | ___ |
1605 | $code.=<<___ if ($win64); | |
1606 | lea -0x88(%rsp),%rax | |
1607 | .LSEH_begin_rsaz_1024_gather5: | |
1608 | # can't trust the assembler to use this specific encoding, hence raw bytes:-( | |
d6d422e1 AP |
1609 | .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp |
1610 | .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) | |
1611 | .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) | |
1612 | .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) | |
1613 | .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) | |
1614 | .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) | |
1615 | .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) | |
1616 | .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) | |
1617 | .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) | |
1618 | .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) | |
1619 | .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) | |
0b4bb91d AP |
1620 | ___ |
1621 | $code.=<<___; | |
d6d422e1 AP |
1622 | lea -0x100(%rsp),%rsp |
1623 | and \$-32, %rsp | |
1624 | lea .Linc(%rip), %r10 | |
1625 | lea -128(%rsp),%rax # control u-op density | |
1626 | ||
1627 | vmovd $power, %xmm4 | |
1628 | vmovdqa (%r10),%ymm0 | |
1629 | vmovdqa 32(%r10),%ymm1 | |
1630 | vmovdqa 64(%r10),%ymm5 | |
1631 | vpbroadcastd %xmm4,%ymm4 | |
1632 | ||
1633 | vpaddd %ymm5, %ymm0, %ymm2 | |
1634 | vpcmpeqd %ymm4, %ymm0, %ymm0 | |
1635 | vpaddd %ymm5, %ymm1, %ymm3 | |
1636 | vpcmpeqd %ymm4, %ymm1, %ymm1 | |
1637 | vmovdqa %ymm0, 32*0+128(%rax) | |
1638 | vpaddd %ymm5, %ymm2, %ymm0 | |
1639 | vpcmpeqd %ymm4, %ymm2, %ymm2 | |
1640 | vmovdqa %ymm1, 32*1+128(%rax) | |
1641 | vpaddd %ymm5, %ymm3, %ymm1 | |
1642 | vpcmpeqd %ymm4, %ymm3, %ymm3 | |
1643 | vmovdqa %ymm2, 32*2+128(%rax) | |
1644 | vpaddd %ymm5, %ymm0, %ymm2 | |
1645 | vpcmpeqd %ymm4, %ymm0, %ymm0 | |
1646 | vmovdqa %ymm3, 32*3+128(%rax) | |
1647 | vpaddd %ymm5, %ymm1, %ymm3 | |
1648 | vpcmpeqd %ymm4, %ymm1, %ymm1 | |
1649 | vmovdqa %ymm0, 32*4+128(%rax) | |
1650 | vpaddd %ymm5, %ymm2, %ymm8 | |
1651 | vpcmpeqd %ymm4, %ymm2, %ymm2 | |
1652 | vmovdqa %ymm1, 32*5+128(%rax) | |
1653 | vpaddd %ymm5, %ymm3, %ymm9 | |
1654 | vpcmpeqd %ymm4, %ymm3, %ymm3 | |
1655 | vmovdqa %ymm2, 32*6+128(%rax) | |
1656 | vpaddd %ymm5, %ymm8, %ymm10 | |
1657 | vpcmpeqd %ymm4, %ymm8, %ymm8 | |
1658 | vmovdqa %ymm3, 32*7+128(%rax) | |
1659 | vpaddd %ymm5, %ymm9, %ymm11 | |
1660 | vpcmpeqd %ymm4, %ymm9, %ymm9 | |
1661 | vpaddd %ymm5, %ymm10, %ymm12 | |
1662 | vpcmpeqd %ymm4, %ymm10, %ymm10 | |
1663 | vpaddd %ymm5, %ymm11, %ymm13 | |
1664 | vpcmpeqd %ymm4, %ymm11, %ymm11 | |
1665 | vpaddd %ymm5, %ymm12, %ymm14 | |
1666 | vpcmpeqd %ymm4, %ymm12, %ymm12 | |
1667 | vpaddd %ymm5, %ymm13, %ymm15 | |
1668 | vpcmpeqd %ymm4, %ymm13, %ymm13 | |
1669 | vpcmpeqd %ymm4, %ymm14, %ymm14 | |
1670 | vpcmpeqd %ymm4, %ymm15, %ymm15 | |
1671 | ||
1672 | vmovdqa -32(%r10),%ymm7 # .Lgather_permd | |
1673 | lea 128($inp), $inp | |
1674 | mov \$9,$power | |
0b4bb91d | 1675 | |
0b4bb91d | 1676 | .Loop_gather_1024: |
d6d422e1 AP |
1677 | vmovdqa 32*0-128($inp), %ymm0 |
1678 | vmovdqa 32*1-128($inp), %ymm1 | |
1679 | vmovdqa 32*2-128($inp), %ymm2 | |
1680 | vmovdqa 32*3-128($inp), %ymm3 | |
1681 | vpand 32*0+128(%rax), %ymm0, %ymm0 | |
1682 | vpand 32*1+128(%rax), %ymm1, %ymm1 | |
1683 | vpand 32*2+128(%rax), %ymm2, %ymm2 | |
1684 | vpor %ymm0, %ymm1, %ymm4 | |
1685 | vpand 32*3+128(%rax), %ymm3, %ymm3 | |
1686 | vmovdqa 32*4-128($inp), %ymm0 | |
1687 | vmovdqa 32*5-128($inp), %ymm1 | |
1688 | vpor %ymm2, %ymm3, %ymm5 | |
1689 | vmovdqa 32*6-128($inp), %ymm2 | |
1690 | vmovdqa 32*7-128($inp), %ymm3 | |
1691 | vpand 32*4+128(%rax), %ymm0, %ymm0 | |
1692 | vpand 32*5+128(%rax), %ymm1, %ymm1 | |
1693 | vpand 32*6+128(%rax), %ymm2, %ymm2 | |
1694 | vpor %ymm0, %ymm4, %ymm4 | |
1695 | vpand 32*7+128(%rax), %ymm3, %ymm3 | |
1696 | vpand 32*8-128($inp), %ymm8, %ymm0 | |
1697 | vpor %ymm1, %ymm5, %ymm5 | |
1698 | vpand 32*9-128($inp), %ymm9, %ymm1 | |
1699 | vpor %ymm2, %ymm4, %ymm4 | |
1700 | vpand 32*10-128($inp),%ymm10, %ymm2 | |
1701 | vpor %ymm3, %ymm5, %ymm5 | |
1702 | vpand 32*11-128($inp),%ymm11, %ymm3 | |
1703 | vpor %ymm0, %ymm4, %ymm4 | |
1704 | vpand 32*12-128($inp),%ymm12, %ymm0 | |
1705 | vpor %ymm1, %ymm5, %ymm5 | |
1706 | vpand 32*13-128($inp),%ymm13, %ymm1 | |
1707 | vpor %ymm2, %ymm4, %ymm4 | |
1708 | vpand 32*14-128($inp),%ymm14, %ymm2 | |
1709 | vpor %ymm3, %ymm5, %ymm5 | |
1710 | vpand 32*15-128($inp),%ymm15, %ymm3 | |
1711 | lea 32*16($inp), $inp | |
1712 | vpor %ymm0, %ymm4, %ymm4 | |
1713 | vpor %ymm1, %ymm5, %ymm5 | |
1714 | vpor %ymm2, %ymm4, %ymm4 | |
1715 | vpor %ymm3, %ymm5, %ymm5 | |
1716 | ||
1717 | vpor %ymm5, %ymm4, %ymm4 | |
1718 | vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared | |
1719 | vpor %xmm4, %xmm5, %xmm5 | |
1720 | vpermd %ymm5,%ymm7,%ymm5 | |
1721 | vmovdqu %ymm5,($out) | |
0b4bb91d | 1722 | lea 32($out),$out |
d6d422e1 | 1723 | dec $power |
0b4bb91d AP |
1724 | jnz .Loop_gather_1024 |
1725 | ||
1726 | vpxor %ymm0,%ymm0,%ymm0 | |
1727 | vmovdqu %ymm0,($out) | |
1728 | vzeroupper | |
1729 | ___ | |
1730 | $code.=<<___ if ($win64); | |
d6d422e1 AP |
1731 | movaps -0xa8(%r11),%xmm6 |
1732 | movaps -0x98(%r11),%xmm7 | |
1733 | movaps -0x88(%r11),%xmm8 | |
1734 | movaps -0x78(%r11),%xmm9 | |
1735 | movaps -0x68(%r11),%xmm10 | |
1736 | movaps -0x58(%r11),%xmm11 | |
1737 | movaps -0x48(%r11),%xmm12 | |
1738 | movaps -0x38(%r11),%xmm13 | |
1739 | movaps -0x28(%r11),%xmm14 | |
1740 | movaps -0x18(%r11),%xmm15 | |
0b4bb91d AP |
1741 | .LSEH_end_rsaz_1024_gather5: |
1742 | ___ | |
1743 | $code.=<<___; | |
d6d422e1 | 1744 | lea (%r11),%rsp |
0b4bb91d AP |
1745 | ret |
1746 | .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 | |
1747 | ___ | |
1748 | } | |
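# Editor's note: rsaz_1024_scatter5_avx2 above interleaves the 16 window
# entries 16 bytes apart inside 512-byte rows, and rsaz_1024_gather5_avx2
# reads every row in full, selecting the wanted entry with masks built by
# comparing 0..15 against $power (vpcmpeqd) and accumulating with
# vpand/vpor, so the memory access pattern never depends on the secret
# window index.  The sketch below (not used by the generator; name and
# scalar stand-ins are the editor's) shows the same dataflow:
sub rsaz_gather_ref {			# illustration only, never called
	my ($power, @table) = @_;	# @table: 16 entries, small integers here
	my $acc = 0;
	for my $i (0 .. $#table) {
		my $mask = ($i == $power) ? ~0 : 0;	# vpcmpeqd in the real code
		$acc |= $table[$i] & $mask;		# vpand/vpor accumulation
	}
	return $acc;
}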
1749 | ||
1750 | $code.=<<___; | |
1751 | .extern OPENSSL_ia32cap_P | |
1752 | .globl rsaz_avx2_eligible | |
1753 | .type rsaz_avx2_eligible,\@abi-omnipotent | |
1754 | .align 32 | |
1755 | rsaz_avx2_eligible: | |
1756 | mov OPENSSL_ia32cap_P+8(%rip),%eax | |
f3f620e1 AP |
1757 | ___ |
1758 | $code.=<<___ if ($addx); | |
1759 | mov \$`1<<8|1<<19`,%ecx | |
1760 | mov \$0,%edx | |
1761 | and %eax,%ecx | |
1762 | cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X | |
1763 | cmove %edx,%eax | |
1764 | ___ | |
1765 | $code.=<<___; | |
0b4bb91d AP |
1766 | and \$`1<<5`,%eax |
1767 | shr \$5,%eax | |
1768 | ret | |
1769 | .size rsaz_avx2_eligible,.-rsaz_avx2_eligible | |
1770 | ||
1771 | .align 64 | |
1772 | .Land_mask: | |
1773 | .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 | |
1774 | .Lscatter_permd: | |
1775 | .long 0,2,4,6,7,7,7,7 | |
1776 | .Lgather_permd: | |
1777 | .long 0,7,1,7,2,7,3,7 | |
d6d422e1 AP |
1778 | .Linc: |
1779 | .long 0,0,0,0, 1,1,1,1 | |
1780 | .long 2,2,2,2, 3,3,3,3 | |
1781 | .long 4,4,4,4, 4,4,4,4 | |
0b4bb91d AP |
1782 | .align 64 |
1783 | ___ | |
1784 | ||
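# Editor's note: rsaz_avx2_eligible above returns bit 5 (AVX2) of the third
# dword of OPENSSL_ia32cap_P, i.e. the cached CPUID.(EAX=7):EBX; when the
# assembler supports ADCX/ADOX ($addx), CPUs that also advertise BMI2
# (bit 8) and ADX (bit 19) report "not eligible" so that the MULX/ADCX/ADOX
# code path is taken instead.  Of the constants: .Land_mask is the 29-bit
# limb mask (top lane deliberately left unmasked), .Lscatter_permd and
# .Lgather_permd are the vpermd patterns used by the scatter/gather
# routines, and .Linc supplies the index increments for building the
# vpcmpeqd comparands in rsaz_1024_gather5_avx2.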
1785 | if ($win64) { | |
1786 | $rec="%rcx"; | |
1787 | $frame="%rdx"; | |
1788 | $context="%r8"; | |
1789 | $disp="%r9"; | |
1790 | ||
1791 | $code.=<<___;
1792 | .extern __imp_RtlVirtualUnwind | |
1793 | .type rsaz_se_handler,\@abi-omnipotent | |
1794 | .align 16 | |
1795 | rsaz_se_handler: | |
1796 | push %rsi | |
1797 | push %rdi | |
1798 | push %rbx | |
1799 | push %rbp | |
1800 | push %r12 | |
1801 | push %r13 | |
1802 | push %r14 | |
1803 | push %r15 | |
1804 | pushfq | |
1805 | sub \$64,%rsp | |
1806 | ||
1807 | mov 120($context),%rax # pull context->Rax | |
1808 | mov 248($context),%rbx # pull context->Rip | |
1809 | ||
1810 | mov 8($disp),%rsi # disp->ImageBase | |
1811 | mov 56($disp),%r11 # disp->HandlerData | |
1812 | ||
1813 | mov 0(%r11),%r10d # HandlerData[0] | |
1814 | lea (%rsi,%r10),%r10 # prologue label | |
1815 | cmp %r10,%rbx # context->Rip<prologue label | |
1816 | jb .Lcommon_seh_tail | |
1817 | ||
1818 | mov 152($context),%rax # pull context->Rsp | |
1819 | ||
1820 | mov 4(%r11),%r10d # HandlerData[1] | |
1821 | lea (%rsi,%r10),%r10 # epilogue label | |
1822 | cmp %r10,%rbx # context->Rip>=epilogue label | |
1823 | jae .Lcommon_seh_tail | |
1824 | ||
1825 | mov 160($context),%rax # pull context->Rbp | |
1826 | ||
1827 | mov -48(%rax),%r15 | |
1828 | mov -40(%rax),%r14 | |
1829 | mov -32(%rax),%r13 | |
1830 | mov -24(%rax),%r12 | |
1831 | mov -16(%rax),%rbp | |
1832 | mov -8(%rax),%rbx | |
1833 | mov %r15,240($context) | |
1834 | mov %r14,232($context) | |
1835 | mov %r13,224($context) | |
1836 | mov %r12,216($context) | |
1837 | mov %rbp,160($context) | |
1838 | mov %rbx,144($context) | |
1839 | ||
1840 | lea -0xd8(%rax),%rsi # %xmm save area | |
1841 | lea 512($context),%rdi # & context.Xmm6 | |
1842 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |
1843 | .long 0xa548f3fc # cld; rep movsq | |
1844 | ||
1845 | .Lcommon_seh_tail: | |
1846 | mov 8(%rax),%rdi | |
1847 | mov 16(%rax),%rsi | |
1848 | mov %rax,152($context) # restore context->Rsp | |
1849 | mov %rsi,168($context) # restore context->Rsi | |
1850 | mov %rdi,176($context) # restore context->Rdi | |
1851 | ||
1852 | mov 40($disp),%rdi # disp->ContextRecord | |
1853 | mov $context,%rsi # context | |
1854 | mov \$154,%ecx # sizeof(CONTEXT) in quadwords | |
1855 | .long 0xa548f3fc # cld; rep movsq | |
1856 | ||
1857 | mov $disp,%rsi | |
1858 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
1859 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
1860 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
1861 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
1862 | mov 40(%rsi),%r10 # disp->ContextRecord | |
1863 | lea 56(%rsi),%r11 # &disp->HandlerData | |
1864 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
1865 | mov %r10,32(%rsp) # arg5 | |
1866 | mov %r11,40(%rsp) # arg6 | |
1867 | mov %r12,48(%rsp) # arg7 | |
1868 | mov %rcx,56(%rsp) # arg8, (NULL) | |
1869 | call *__imp_RtlVirtualUnwind(%rip) | |
1870 | ||
1871 | mov \$1,%eax # ExceptionContinueSearch | |
1872 | add \$64,%rsp | |
1873 | popfq | |
1874 | pop %r15 | |
1875 | pop %r14 | |
1876 | pop %r13 | |
1877 | pop %r12 | |
1878 | pop %rbp | |
1879 | pop %rbx | |
1880 | pop %rdi | |
1881 | pop %rsi | |
1882 | ret | |
1883 | .size rsaz_se_handler,.-rsaz_se_handler | |
1884 | ||
1885 | .section .pdata | |
1886 | .align 4 | |
1887 | .rva .LSEH_begin_rsaz_1024_sqr_avx2 | |
1888 | .rva .LSEH_end_rsaz_1024_sqr_avx2 | |
1889 | .rva .LSEH_info_rsaz_1024_sqr_avx2 | |
1890 | ||
1891 | .rva .LSEH_begin_rsaz_1024_mul_avx2 | |
1892 | .rva .LSEH_end_rsaz_1024_mul_avx2 | |
1893 | .rva .LSEH_info_rsaz_1024_mul_avx2 | |
1894 | ||
1895 | .rva .LSEH_begin_rsaz_1024_gather5 | |
1896 | .rva .LSEH_end_rsaz_1024_gather5 | |
1897 | .rva .LSEH_info_rsaz_1024_gather5 | |
1898 | .section .xdata | |
1899 | .align 8 | |
1900 | .LSEH_info_rsaz_1024_sqr_avx2: | |
1901 | .byte 9,0,0,0 | |
1902 | .rva rsaz_se_handler | |
1903 | .rva .Lsqr_1024_body,.Lsqr_1024_epilogue | |
1904 | .LSEH_info_rsaz_1024_mul_avx2: | |
1905 | .byte 9,0,0,0 | |
1906 | .rva rsaz_se_handler | |
1907 | .rva .Lmul_1024_body,.Lmul_1024_epilogue | |
1908 | .LSEH_info_rsaz_1024_gather5: | |
d6d422e1 AP |
1909 | .byte 0x01,0x36,0x17,0x0b |
1910 | .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 | |
1911 | .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 | |
1912 | .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 | |
1913 | .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 | |
1914 | .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 | |
1915 | .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 | |
1916 | .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 | |
1917 | .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 | |
1918 | .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 | |
1919 | .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 | |
1920 | .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 | |
1921 | .byte 0x00,0xb3,0x00,0x00 # set_frame r11 | |
0b4bb91d AP |
1922 | ___ |
1923 | } | |
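# Editor's note: unlike the sqr/mul entries, which name rsaz_se_handler,
# .LSEH_info_rsaz_1024_gather5 above is raw Win64 UNWIND_INFO: its byte
# pairs are unwind codes describing the hand-encoded prologue of
# rsaz_1024_gather5_avx2 (set_frame %r11, the 0xa8-byte stack allocation,
# then the ten %xmm6-%xmm15 spills), so no language-specific handler is
# required for that routine.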
1924 | ||
1925 | foreach (split("\n",$code)) { | |
1926 | s/\`([^\`]*)\`/eval($1)/ge; | |
1927 | ||
1928 | s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or | |
1929 | ||
1930 | s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or | |
1931 | s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or | |
1932 | s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or | |
1933 | s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or | |
1934 | s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; | |
1935 | print $_,"\n"; | |
1936 | } | |
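# Editor's note: the post-processing loop above evaluates `...` constant
# expressions, reduces shift immediates modulo 64 (the generator can emit
# negative counts such as -29*($j-1)), and rewrites vmovd/vmovq, vpinsrd/q,
# vpextrd/q and vpbroadcastd/q operands from %ymm to %xmm, since those
# instructions take xmm registers even in their AVX2 encodings.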
1937 | ||
1938 | }}} else {{{ | |
1939 | print <<___; # assembler is too old | |
1940 | .text | |
1941 | ||
1942 | .globl rsaz_avx2_eligible | |
1943 | .type rsaz_avx2_eligible,\@abi-omnipotent | |
852f837f | 1944 | rsaz_avx2_eligible: |
0b4bb91d AP |
1945 | xor %eax,%eax |
1946 | ret | |
1947 | .size rsaz_avx2_eligible,.-rsaz_avx2_eligible | |
1948 | ||
1949 | .globl rsaz_1024_sqr_avx2 | |
1950 | .globl rsaz_1024_mul_avx2 | |
1951 | .globl rsaz_1024_norm2red_avx2 | |
1952 | .globl rsaz_1024_red2norm_avx2 | |
1953 | .globl rsaz_1024_scatter5_avx2 | |
1954 | .globl rsaz_1024_gather5_avx2 | |
1955 | .type rsaz_1024_sqr_avx2,\@abi-omnipotent | |
1956 | rsaz_1024_sqr_avx2: | |
1957 | rsaz_1024_mul_avx2: | |
1958 | rsaz_1024_norm2red_avx2: | |
1959 | rsaz_1024_red2norm_avx2: | |
1960 | rsaz_1024_scatter5_avx2: | |
1961 | rsaz_1024_gather5_avx2: | |
1962 | .byte 0x0f,0x0b # ud2 | |
1963 | ret | |
1964 | .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 | |
1965 | ___ | |
1966 | }}} | |
1967 | ||
1968 | close STDOUT; |