Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. |
dcf6e50f | 3 | # Copyright (c) 2014, Intel Corporation. All Rights Reserved. |
6aa36e8e | 4 | # |
a7f182b7 | 5 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS | 6 | # this file except in compliance with the License. You can obtain a copy |
7 | # in the file LICENSE in the source distribution or at | |
8 | # https://www.openssl.org/source/license.html | |
dcf6e50f RS | 9 | # |
10 | # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) | |
11 | # (1) Intel Corporation, Israel Development Center, Haifa, Israel | |
12 | # (2) University of Haifa, Israel | |
13 | # | |
14 | # Reference: | |
15 | # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with | |
16 | # 256 Bit Primes" | |
4d3fa06f | 17 | |
1aa89a7a RL | 18 | # $output is the last argument if it looks like a file (it has an extension) |
19 | # $flavour is the first argument if it doesn't look like a file | |
20 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
21 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
4d3fa06f AP | 22 | |
23 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
24 | ||
25 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
26 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
27 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
28 | die "can't locate x86_64-xlate.pl"; | |
29 | ||
1aa89a7a RL | 30 | open OUT,"| \"$^X\" $xlate $flavour \"$output\"" |
31 | or die "can't call $xlate: $!"; | |
4d3fa06f AP | 32 | *STDOUT=*OUT; |
33 | ||
34 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
35 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
36 | $avx = ($1>=2.19) + ($1>=2.22); | |
37 | $addx = ($1>=2.23); | |
38 | } | |
39 | ||
40 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
41 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
42 | $avx = ($1>=2.09) + ($1>=2.10); | |
43 | $addx = ($1>=2.10); | |
44 | } | |
45 | ||
46 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
47 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
48 | $avx = ($1>=10) + ($1>=11); | |
49 | $addx = ($1>=12); | |
50 | } | |
51 | ||
9bb3e5fd | 52 | if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { |
4d3fa06f AP | 53 | my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 |
54 | $avx = ($ver>=3.0) + ($ver>=3.01); | |
55 | $addx = ($ver>=3.03); | |
56 | } | |
57 | ||
58 | if ($avx>=2) {{ | |
59 | $digit_size = "\$29"; | |
60 | $n_digits = "\$9"; | |
61 | ||
62 | $code.=<<___; | |
63 | .text | |
64 | ||
65 | .align 64 | |
66 | .LAVX2_AND_MASK: | |
67 | .LAVX2_POLY: | |
68 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
69 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
70 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
71 | .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff | |
72 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
73 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
74 | .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000 | |
75 | .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000 | |
76 | .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff | |
77 | ||
78 | .LAVX2_POLY_x2: | |
79 | .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC | |
80 | .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC | |
81 | .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC | |
82 | .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC | |
83 | .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE | |
84 | .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE | |
85 | .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE | |
86 | .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE | |
87 | .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC | |
88 | ||
89 | .LAVX2_POLY_x8: | |
90 | .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 | |
91 | .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 | |
92 | .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 | |
93 | .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8 | |
94 | .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC | |
95 | .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC | |
96 | .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC | |
97 | .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC | |
98 | .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8 | |
99 | ||
100 | .LONE: | |
101 | .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020 | |
102 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
103 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
104 | .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000 | |
105 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
106 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
107 | .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff | |
108 | .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff | |
109 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
110 | ||
111 | # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL | |
112 | # Montgomery form (*2^256) to our format (*2^261) | |
113 | ||
114 | .LTO_MONT_AVX2: | |
115 | .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400 | |
116 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
117 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
118 | .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000 | |
119 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
120 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
121 | .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff | |
122 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
123 | .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003 | |
124 | ||
125 | .LFROM_MONT_AVX2: | |
126 | .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001 | |
127 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
128 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
129 | .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00 | |
130 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
131 | .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff | |
132 | .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff | |
133 | .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff | |
134 | .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 | |
135 | ||
136 | .LIntOne: | |
137 | .long 1,1,1,1,1,1,1,1 | |
138 | ___ | |
139 | ||
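The comment above `.LTO_MONT_AVX2` is easy to check numerically. Below is a small check of my own (illustration only, plain core Math::BigInt, not part of the generated module): since the AVX2 code keeps field elements as nine 29-bit digits, its Montgomery radix is 2^261, and one Montgomery multiplication by RR = 2^266 mod p turns OpenSSL's native x*2^256 form into the x*2^261 form used here.

```perl
#!/usr/bin/env perl
# Illustration only: verify that RR = 2^266 mod p converts x*2^256 into x*2^261
# under a Montgomery multiplication that divides by 2^261 (9 digits * 29 bits).
use strict;
use warnings;
use Math::BigInt;

my $p  = Math::BigInt->new(2)->bpow(256) - Math::BigInt->new(2)->bpow(224)
       + Math::BigInt->new(2)->bpow(192) + Math::BigInt->new(2)->bpow(96) - 1;
my $RR = Math::BigInt->new(2)->bpow(266)->bmod($p);

my $x    = Math::BigInt->new("123456789123456789");
my $in   = ($x * Math::BigInt->new(2)->bpow(256))->bmod($p);  # native Montgomery form
my $inv  = Math::BigInt->new(2)->bpow(261)->bmodinv($p);      # Montgomery reduction = *2^-261
my $out  = ($in * $RR * $inv)->bmod($p);
my $want = ($x * Math::BigInt->new(2)->bpow(261))->bmod($p);

printf "%s\n", $out == $want ? "ok" : "mismatch";
```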
140 | { | |
0d4fb843 | 141 | # This function receives a pointer to an array of four affine points |
46f4e1be | 142 | # (X, Y, <1>) and rearranges the data for AVX2 execution, while |
4d3fa06f AP | 143 | # converting it to 2^29 radix redundant form |
144 | ||
145 | my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3, | |
146 | $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15)); | |
147 | ||
148 | $code.=<<___; | |
149 | .globl ecp_nistz256_avx2_transpose_convert | |
150 | .type ecp_nistz256_avx2_transpose_convert,\@function,2 | |
151 | .align 64 | |
152 | ecp_nistz256_avx2_transpose_convert: | |
153 | vzeroupper | |
154 | ___ | |
155 | $code.=<<___ if ($win64); | |
156 | lea -8-16*10(%rsp), %rsp | |
157 | vmovaps %xmm6, -8-16*10(%rax) | |
158 | vmovaps %xmm7, -8-16*9(%rax) | |
159 | vmovaps %xmm8, -8-16*8(%rax) | |
160 | vmovaps %xmm9, -8-16*7(%rax) | |
161 | vmovaps %xmm10, -8-16*6(%rax) | |
162 | vmovaps %xmm11, -8-16*5(%rax) | |
163 | vmovaps %xmm12, -8-16*4(%rax) | |
164 | vmovaps %xmm13, -8-16*3(%rax) | |
165 | vmovaps %xmm14, -8-16*2(%rax) | |
166 | vmovaps %xmm15, -8-16*1(%rax) | |
167 | ___ | |
168 | $code.=<<___; | |
169 | # Load the data | |
170 | vmovdqa 32*0(%rsi), $X0 | |
171 | lea 112(%rsi), %rax # size optimization | |
172 | vmovdqa 32*1(%rsi), $Y0 | |
173 | lea .LAVX2_AND_MASK(%rip), %rdx | |
174 | vmovdqa 32*2(%rsi), $X1 | |
175 | vmovdqa 32*3(%rsi), $Y1 | |
176 | vmovdqa 32*4-112(%rax), $X2 | |
177 | vmovdqa 32*5-112(%rax), $Y2 | |
178 | vmovdqa 32*6-112(%rax), $X3 | |
179 | vmovdqa 32*7-112(%rax), $Y3 | |
180 | ||
181 | # Transpose X and Y independently | |
182 | vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0] | |
183 | vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0] | |
184 | vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1] | |
185 | vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1] | |
186 | ||
187 | vpunpcklqdq $Y1, $Y0, $T4 | |
188 | vpunpcklqdq $Y3, $Y2, $T5 | |
189 | vpunpckhqdq $Y1, $Y0, $T6 | |
190 | vpunpckhqdq $Y3, $Y2, $T7 | |
191 | ||
192 | vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0] | |
193 | vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1] | |
194 | vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2] | |
195 | vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3] | |
196 | ||
197 | vperm2i128 \$0x20, $T5, $T4, $Y0 | |
198 | vperm2i128 \$0x20, $T7, $T6, $Y1 | |
199 | vperm2i128 \$0x31, $T5, $T4, $Y2 | |
200 | vperm2i128 \$0x31, $T7, $T6, $Y3 | |
201 | vmovdqa (%rdx), $T7 | |
202 | ||
203 | vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask; | |
204 | vpsrlq \$29, $X0, $X0 | |
205 | vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask; | |
206 | vpsrlq \$29, $X0, $X0 | |
207 | vpsllq \$6, $X1, $T2 | |
208 | vpxor $X0, $T2, $T2 | |
209 | vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; | |
210 | vpsrlq \$23, $X1, $X1 | |
211 | vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; | |
212 | vpsrlq \$29, $X1, $X1 | |
213 | vpsllq \$12, $X2, $T4 | |
214 | vpxor $X1, $T4, $T4 | |
215 | vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; | |
216 | vpsrlq \$17, $X2, $X2 | |
217 | vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; | |
218 | vpsrlq \$29, $X2, $X2 | |
219 | vpsllq \$18, $X3, $T6 | |
220 | vpxor $X2, $T6, $T6 | |
221 | vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; | |
222 | vpsrlq \$11, $X3, $X3 | |
223 | vmovdqa $T0, 32*0(%rdi) | |
224 | lea 112(%rdi), %rax # size optimization | |
225 | vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; | |
226 | vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; | |
227 | ||
228 | vmovdqa $T1, 32*1(%rdi) | |
229 | vmovdqa $T2, 32*2(%rdi) | |
230 | vmovdqa $T3, 32*3(%rdi) | |
231 | vmovdqa $T4, 32*4-112(%rax) | |
232 | vmovdqa $T5, 32*5-112(%rax) | |
233 | vmovdqa $T6, 32*6-112(%rax) | |
234 | vmovdqa $T0, 32*7-112(%rax) | |
235 | vmovdqa $X3, 32*8-112(%rax) | |
236 | lea 448(%rdi), %rax # size optimization | |
237 | ||
238 | vpand $T7, $Y0, $T0 # out[0] = in[0] & mask; | |
239 | vpsrlq \$29, $Y0, $Y0 | |
240 | vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask; | |
241 | vpsrlq \$29, $Y0, $Y0 | |
242 | vpsllq \$6, $Y1, $T2 | |
243 | vpxor $Y0, $T2, $T2 | |
244 | vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; | |
245 | vpsrlq \$23, $Y1, $Y1 | |
246 | vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; | |
247 | vpsrlq \$29, $Y1, $Y1 | |
248 | vpsllq \$12, $Y2, $T4 | |
249 | vpxor $Y1, $T4, $T4 | |
250 | vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; | |
251 | vpsrlq \$17, $Y2, $Y2 | |
252 | vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; | |
253 | vpsrlq \$29, $Y2, $Y2 | |
254 | vpsllq \$18, $Y3, $T6 | |
255 | vpxor $Y2, $T6, $T6 | |
256 | vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; | |
257 | vpsrlq \$11, $Y3, $Y3 | |
258 | vmovdqa $T0, 32*9-448(%rax) | |
259 | vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; | |
260 | vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; | |
261 | ||
262 | vmovdqa $T1, 32*10-448(%rax) | |
263 | vmovdqa $T2, 32*11-448(%rax) | |
264 | vmovdqa $T3, 32*12-448(%rax) | |
265 | vmovdqa $T4, 32*13-448(%rax) | |
266 | vmovdqa $T5, 32*14-448(%rax) | |
267 | vmovdqa $T6, 32*15-448(%rax) | |
268 | vmovdqa $T0, 32*16-448(%rax) | |
269 | vmovdqa $Y3, 32*17-448(%rax) | |
270 | ||
271 | vzeroupper | |
272 | ___ | |
273 | $code.=<<___ if ($win64); | |
274 | movaps 16*0(%rsp), %xmm6 | |
275 | movaps 16*1(%rsp), %xmm7 | |
276 | movaps 16*2(%rsp), %xmm8 | |
277 | movaps 16*3(%rsp), %xmm9 | |
278 | movaps 16*4(%rsp), %xmm10 | |
279 | movaps 16*5(%rsp), %xmm11 | |
280 | movaps 16*6(%rsp), %xmm12 | |
281 | movaps 16*7(%rsp), %xmm13 | |
282 | movaps 16*8(%rsp), %xmm14 | |
283 | movaps 16*9(%rsp), %xmm15 | |
284 | lea 8+16*10(%rsp), %rsp | |
285 | ___ | |
286 | $code.=<<___; | |
287 | ret | |
288 | .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert | |
289 | ___ | |
290 | } | |
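As a reading aid for the out[0..8] comments in the routine above, here is a single-value Perl model of the same digit split (mine, illustration only; the real code does this for four points at once in transposed ymm lanes).

```perl
# Illustration only: cut one 256-bit value, given as four 64-bit words, into
# nine 29-bit digits, matching the out[0..8] comments above.
use strict;
use warnings;
use Math::BigInt;

sub to_radix29 {
    my @in = @_;                                    # four 64-bit words, least significant first
    my $v  = Math::BigInt->bzero;
    $v += Math::BigInt->new($in[$_])->blsft(64 * $_) for 0 .. 3;
    my $mask = Math::BigInt->new(1)->blsft(29)->bsub(1);
    return map { ($v->copy->brsft(29 * $_) & $mask)->numify } 0 .. 8;
}

my @d = to_radix29("0x0123456789abcdef", "0xfedcba9876543210",
                   "0x1111111111111111", "0x2222222222222222");
printf "digit[%d] = 0x%08x\n", $_, $d[$_] for 0 .. 8;
```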
291 | { | |
292 | ################################################################################ | |
0d4fb843 | 293 | # This function receives a pointer to an array of four AVX2 formatted points |
46f4e1be | 294 | # (X, Y, Z), converts the data back to normal representation, and rearranges it |
4d3fa06f AP | 295 | |
296 | my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8)); | |
297 | my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15)); | |
298 | ||
299 | $code.=<<___; | |
300 | ||
301 | .globl ecp_nistz256_avx2_convert_transpose_back | |
302 | .type ecp_nistz256_avx2_convert_transpose_back,\@function,2 | |
303 | .align 32 | |
304 | ecp_nistz256_avx2_convert_transpose_back: | |
305 | vzeroupper | |
306 | ___ | |
307 | $code.=<<___ if ($win64); | |
308 | lea -8-16*10(%rsp), %rsp | |
309 | vmovaps %xmm6, -8-16*10(%rax) | |
310 | vmovaps %xmm7, -8-16*9(%rax) | |
311 | vmovaps %xmm8, -8-16*8(%rax) | |
312 | vmovaps %xmm9, -8-16*7(%rax) | |
313 | vmovaps %xmm10, -8-16*6(%rax) | |
314 | vmovaps %xmm11, -8-16*5(%rax) | |
315 | vmovaps %xmm12, -8-16*4(%rax) | |
316 | vmovaps %xmm13, -8-16*3(%rax) | |
317 | vmovaps %xmm14, -8-16*2(%rax) | |
318 | vmovaps %xmm15, -8-16*1(%rax) | |
319 | ___ | |
320 | $code.=<<___; | |
321 | mov \$3, %ecx | |
322 | ||
323 | .Lconv_loop: | |
324 | vmovdqa 32*0(%rsi), $D0 | |
325 | lea 160(%rsi), %rax # size optimization | |
326 | vmovdqa 32*1(%rsi), $D1 | |
327 | vmovdqa 32*2(%rsi), $D2 | |
328 | vmovdqa 32*3(%rsi), $D3 | |
329 | vmovdqa 32*4-160(%rax), $D4 | |
330 | vmovdqa 32*5-160(%rax), $D5 | |
331 | vmovdqa 32*6-160(%rax), $D6 | |
332 | vmovdqa 32*7-160(%rax), $D7 | |
333 | vmovdqa 32*8-160(%rax), $D8 | |
334 | ||
335 | vpsllq \$29, $D1, $D1 | |
336 | vpsllq \$58, $D2, $T0 | |
337 | vpaddq $D1, $D0, $D0 | |
338 | vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2); | |
339 | ||
340 | vpsrlq \$6, $D2, $D2 | |
341 | vpsllq \$23, $D3, $D3 | |
342 | vpsllq \$52, $D4, $T1 | |
343 | vpaddq $D2, $D3, $D3 | |
344 | vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64); | |
345 | ||
346 | vpsrlq \$12, $D4, $D4 | |
347 | vpsllq \$17, $D5, $D5 | |
348 | vpsllq \$46, $D6, $T2 | |
349 | vpaddq $D4, $D5, $D5 | |
350 | vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64); | |
351 | ||
352 | vpsrlq \$18, $D6, $D6 | |
353 | vpsllq \$11, $D7, $D7 | |
354 | vpsllq \$40, $D8, $T3 | |
355 | vpaddq $D6, $D7, $D7 | |
356 | vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64); | |
357 | ||
358 | vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0] | |
359 | vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0] | |
360 | vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1] | |
361 | vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1] | |
362 | ||
363 | vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0] | |
364 | vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1] | |
365 | vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2] | |
366 | vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3] | |
367 | ||
368 | vmovdqa $D0, 32*0(%rdi) | |
369 | vmovdqa $D1, 32*3(%rdi) | |
370 | vmovdqa $D2, 32*6(%rdi) | |
371 | vmovdqa $D3, 32*9(%rdi) | |
372 | ||
373 | lea 32*9(%rsi), %rsi | |
374 | lea 32*1(%rdi), %rdi | |
375 | ||
376 | dec %ecx | |
377 | jnz .Lconv_loop | |
378 | ||
379 | vzeroupper | |
380 | ___ | |
381 | $code.=<<___ if ($win64); | |
382 | movaps 16*0(%rsp), %xmm6 | |
383 | movaps 16*1(%rsp), %xmm7 | |
384 | movaps 16*2(%rsp), %xmm8 | |
385 | movaps 16*3(%rsp), %xmm9 | |
386 | movaps 16*4(%rsp), %xmm10 | |
387 | movaps 16*5(%rsp), %xmm11 | |
388 | movaps 16*6(%rsp), %xmm12 | |
389 | movaps 16*7(%rsp), %xmm13 | |
390 | movaps 16*8(%rsp), %xmm14 | |
391 | movaps 16*9(%rsp), %xmm15 | |
392 | lea 8+16*10(%rsp), %rsp | |
393 | ___ | |
394 | $code.=<<___; | |
395 | ret | |
396 | .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back | |
397 | ___ | |
398 | } | |
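And the scalar inverse of that split, matching the out[0..3] comments in ecp_nistz256_avx2_convert_transpose_back above (again an illustration of mine; the assembly uses vpaddq because the redundant digits may overlap by a few bits).

```perl
# Illustration only: pack nine 29-bit digits back into four 64-bit words.
use strict;
use warnings;
use Math::BigInt;

sub from_radix29 {
    my @d = @_;                                     # nine digits, least significant first
    my $v = Math::BigInt->bzero;
    $v += Math::BigInt->new($d[$_])->blsft(29 * $_) for 0 .. 8;
    my $w64 = Math::BigInt->new(1)->blsft(64)->bsub(1);
    return map { ($v->copy->brsft(64 * $_) & $w64)->as_hex } 0 .. 3;
}

printf "%s %s %s %s\n", from_radix29(5, 1, 0, 0, 0, 0, 0, 0, 0);   # => 0x20000005 0x0 0x0 0x0
```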
399 | { | |
400 | my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx"); | |
401 | my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8)); | |
402 | my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13)); | |
403 | ||
404 | sub NORMALIZE { | |
405 | my $ret=<<___; | |
406 | vpsrlq $digit_size, $ACC0, $T0 | |
407 | vpand $AND_MASK, $ACC0, $ACC0 | |
408 | vpaddq $T0, $ACC1, $ACC1 | |
409 | ||
410 | vpsrlq $digit_size, $ACC1, $T0 | |
411 | vpand $AND_MASK, $ACC1, $ACC1 | |
412 | vpaddq $T0, $ACC2, $ACC2 | |
413 | ||
414 | vpsrlq $digit_size, $ACC2, $T0 | |
415 | vpand $AND_MASK, $ACC2, $ACC2 | |
416 | vpaddq $T0, $ACC3, $ACC3 | |
417 | ||
418 | vpsrlq $digit_size, $ACC3, $T0 | |
419 | vpand $AND_MASK, $ACC3, $ACC3 | |
420 | vpaddq $T0, $ACC4, $ACC4 | |
421 | ||
422 | vpsrlq $digit_size, $ACC4, $T0 | |
423 | vpand $AND_MASK, $ACC4, $ACC4 | |
424 | vpaddq $T0, $ACC5, $ACC5 | |
425 | ||
426 | vpsrlq $digit_size, $ACC5, $T0 | |
427 | vpand $AND_MASK, $ACC5, $ACC5 | |
428 | vpaddq $T0, $ACC6, $ACC6 | |
429 | ||
430 | vpsrlq $digit_size, $ACC6, $T0 | |
431 | vpand $AND_MASK, $ACC6, $ACC6 | |
432 | vpaddq $T0, $ACC7, $ACC7 | |
433 | ||
434 | vpsrlq $digit_size, $ACC7, $T0 | |
435 | vpand $AND_MASK, $ACC7, $ACC7 | |
436 | vpaddq $T0, $ACC8, $ACC8 | |
437 | #vpand $AND_MASK, $ACC8, $ACC8 | |
438 | ___ | |
439 | $ret; | |
440 | } | |
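A scalar model of the carry chain above (emitted both by NORMALIZE here and as the avx2_normalize* functions below), illustration only: digits grow past 29 bits after additions and multiplications, and one pass pushes the excess upward, masking digits 0..7 while leaving the top digit un-masked, so the representation stays redundant rather than fully reduced mod p.

```perl
# Illustration only: one carry-propagation pass over nine 29-bit digits.
use strict;
use warnings;

sub normalize29 {
    my @acc  = @_;                          # nine digits as native integers
    my $mask = (1 << 29) - 1;
    for my $i (0 .. 7) {
        $acc[$i + 1] += $acc[$i] >> 29;     # carry into the next digit
        $acc[$i]     &= $mask;
    }
    return @acc;
}

my @acc = normalize29((1 << 32) + 5, 0, 0, 0, 0, 0, 0, 0, 0);
printf "%d %d\n", $acc[0], $acc[1];         # => 5 8
```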
441 | ||
442 | sub STORE { | |
443 | my $ret=<<___; | |
444 | vmovdqa $ACC0, 32*0(%rdi) | |
445 | lea 160(%rdi), %rax # size optimization | |
446 | vmovdqa $ACC1, 32*1(%rdi) | |
447 | vmovdqa $ACC2, 32*2(%rdi) | |
448 | vmovdqa $ACC3, 32*3(%rdi) | |
449 | vmovdqa $ACC4, 32*4-160(%rax) | |
450 | vmovdqa $ACC5, 32*5-160(%rax) | |
451 | vmovdqa $ACC6, 32*6-160(%rax) | |
452 | vmovdqa $ACC7, 32*7-160(%rax) | |
453 | vmovdqa $ACC8, 32*8-160(%rax) | |
454 | ___ | |
455 | $ret; | |
456 | } | |
457 | ||
458 | $code.=<<___; | |
459 | .type avx2_normalize,\@abi-omnipotent | |
460 | .align 32 | |
461 | avx2_normalize: | |
462 | vpsrlq $digit_size, $ACC0, $T0 | |
463 | vpand $AND_MASK, $ACC0, $ACC0 | |
464 | vpaddq $T0, $ACC1, $ACC1 | |
465 | ||
466 | vpsrlq $digit_size, $ACC1, $T0 | |
467 | vpand $AND_MASK, $ACC1, $ACC1 | |
468 | vpaddq $T0, $ACC2, $ACC2 | |
469 | ||
470 | vpsrlq $digit_size, $ACC2, $T0 | |
471 | vpand $AND_MASK, $ACC2, $ACC2 | |
472 | vpaddq $T0, $ACC3, $ACC3 | |
473 | ||
474 | vpsrlq $digit_size, $ACC3, $T0 | |
475 | vpand $AND_MASK, $ACC3, $ACC3 | |
476 | vpaddq $T0, $ACC4, $ACC4 | |
477 | ||
478 | vpsrlq $digit_size, $ACC4, $T0 | |
479 | vpand $AND_MASK, $ACC4, $ACC4 | |
480 | vpaddq $T0, $ACC5, $ACC5 | |
481 | ||
482 | vpsrlq $digit_size, $ACC5, $T0 | |
483 | vpand $AND_MASK, $ACC5, $ACC5 | |
484 | vpaddq $T0, $ACC6, $ACC6 | |
485 | ||
486 | vpsrlq $digit_size, $ACC6, $T0 | |
487 | vpand $AND_MASK, $ACC6, $ACC6 | |
488 | vpaddq $T0, $ACC7, $ACC7 | |
489 | ||
490 | vpsrlq $digit_size, $ACC7, $T0 | |
491 | vpand $AND_MASK, $ACC7, $ACC7 | |
492 | vpaddq $T0, $ACC8, $ACC8 | |
493 | #vpand $AND_MASK, $ACC8, $ACC8 | |
494 | ||
495 | ret | |
496 | .size avx2_normalize,.-avx2_normalize | |
497 | ||
498 | .type avx2_normalize_n_store,\@abi-omnipotent | |
499 | .align 32 | |
500 | avx2_normalize_n_store: | |
501 | vpsrlq $digit_size, $ACC0, $T0 | |
502 | vpand $AND_MASK, $ACC0, $ACC0 | |
503 | vpaddq $T0, $ACC1, $ACC1 | |
504 | ||
505 | vpsrlq $digit_size, $ACC1, $T0 | |
506 | vpand $AND_MASK, $ACC1, $ACC1 | |
507 | vmovdqa $ACC0, 32*0(%rdi) | |
508 | lea 160(%rdi), %rax # size optimization | |
509 | vpaddq $T0, $ACC2, $ACC2 | |
510 | ||
511 | vpsrlq $digit_size, $ACC2, $T0 | |
512 | vpand $AND_MASK, $ACC2, $ACC2 | |
513 | vmovdqa $ACC1, 32*1(%rdi) | |
514 | vpaddq $T0, $ACC3, $ACC3 | |
515 | ||
516 | vpsrlq $digit_size, $ACC3, $T0 | |
517 | vpand $AND_MASK, $ACC3, $ACC3 | |
518 | vmovdqa $ACC2, 32*2(%rdi) | |
519 | vpaddq $T0, $ACC4, $ACC4 | |
520 | ||
521 | vpsrlq $digit_size, $ACC4, $T0 | |
522 | vpand $AND_MASK, $ACC4, $ACC4 | |
523 | vmovdqa $ACC3, 32*3(%rdi) | |
524 | vpaddq $T0, $ACC5, $ACC5 | |
525 | ||
526 | vpsrlq $digit_size, $ACC5, $T0 | |
527 | vpand $AND_MASK, $ACC5, $ACC5 | |
528 | vmovdqa $ACC4, 32*4-160(%rax) | |
529 | vpaddq $T0, $ACC6, $ACC6 | |
530 | ||
531 | vpsrlq $digit_size, $ACC6, $T0 | |
532 | vpand $AND_MASK, $ACC6, $ACC6 | |
533 | vmovdqa $ACC5, 32*5-160(%rax) | |
534 | vpaddq $T0, $ACC7, $ACC7 | |
535 | ||
536 | vpsrlq $digit_size, $ACC7, $T0 | |
537 | vpand $AND_MASK, $ACC7, $ACC7 | |
538 | vmovdqa $ACC6, 32*6-160(%rax) | |
539 | vpaddq $T0, $ACC8, $ACC8 | |
540 | #vpand $AND_MASK, $ACC8, $ACC8 | |
541 | vmovdqa $ACC7, 32*7-160(%rax) | |
542 | vmovdqa $ACC8, 32*8-160(%rax) | |
543 | ||
544 | ret | |
545 | .size avx2_normalize_n_store,.-avx2_normalize_n_store | |
546 | ||
547 | ################################################################################ | |
548 | # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4); | |
549 | .type avx2_mul_x4,\@abi-omnipotent | |
550 | .align 32 | |
551 | avx2_mul_x4: | |
552 | lea .LAVX2_POLY(%rip), %rax | |
553 | ||
554 | vpxor $ACC0, $ACC0, $ACC0 | |
555 | vpxor $ACC1, $ACC1, $ACC1 | |
556 | vpxor $ACC2, $ACC2, $ACC2 | |
557 | vpxor $ACC3, $ACC3, $ACC3 | |
558 | vpxor $ACC4, $ACC4, $ACC4 | |
559 | vpxor $ACC5, $ACC5, $ACC5 | |
560 | vpxor $ACC6, $ACC6, $ACC6 | |
561 | vpxor $ACC7, $ACC7, $ACC7 | |
562 | ||
563 | vmovdqa 32*7(%rax), %ymm14 | |
564 | vmovdqa 32*8(%rax), %ymm15 | |
565 | ||
566 | mov $n_digits, $itr | |
567 | lea -512($a_ptr), $a_ptr # strategic bias to control u-op density | |
568 | jmp .Lavx2_mul_x4_loop | |
569 | ||
570 | .align 32 | |
571 | .Lavx2_mul_x4_loop: | |
572 | vmovdqa 32*0($b_ptr), $B | |
573 | lea 32*1($b_ptr), $b_ptr | |
574 | ||
575 | vpmuludq 32*0+512($a_ptr), $B, $T0 | |
576 | vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW | |
577 | vpaddq $T0, $ACC0, $ACC0 | |
578 | vpmuludq 32*2+512($a_ptr), $B, $T0 | |
579 | vpaddq $OVERFLOW, $ACC1, $ACC1 | |
580 | vpand $AND_MASK, $ACC0, $Y | |
581 | vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW | |
582 | vpaddq $T0, $ACC2, $ACC2 | |
583 | vpmuludq 32*4+512($a_ptr), $B, $T0 | |
584 | vpaddq $OVERFLOW, $ACC3, $ACC3 | |
585 | vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW | |
586 | vpaddq $T0, $ACC4, $ACC4 | |
587 | vpmuludq 32*6+512($a_ptr), $B, $T0 | |
588 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
589 | vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW | |
590 | vpaddq $T0, $ACC6, $ACC6 | |
591 | ||
592 | # Skip some multiplications, optimizing for the constant poly | |
593 | vpmuludq $AND_MASK, $Y, $T0 | |
594 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
595 | vpmuludq 32*8+512($a_ptr), $B, $ACC8 | |
596 | vpaddq $T0, $ACC0, $OVERFLOW | |
597 | vpaddq $T0, $ACC1, $ACC0 | |
598 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
599 | vpaddq $T0, $ACC2, $ACC1 | |
600 | vpmuludq 32*3(%rax), $Y, $T0 | |
601 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
602 | vpaddq $T0, $ACC3, $ACC2 | |
603 | .byte 0x67 | |
604 | vmovdqa $ACC4, $ACC3 | |
605 | vpsllq \$18, $Y, $OVERFLOW | |
606 | .byte 0x67 | |
607 | vmovdqa $ACC5, $ACC4 | |
608 | vpmuludq %ymm14, $Y, $T0 | |
609 | vpaddq $OVERFLOW, $ACC6, $ACC5 | |
610 | vpmuludq %ymm15, $Y, $OVERFLOW | |
611 | vpaddq $T0, $ACC7, $ACC6 | |
612 | vpaddq $OVERFLOW, $ACC8, $ACC7 | |
613 | ||
614 | dec $itr | |
615 | jnz .Lavx2_mul_x4_loop | |
616 | ||
617 | vpxor $ACC8, $ACC8, $ACC8 | |
618 | ||
619 | ret | |
620 | .size avx2_mul_x4,.-avx2_mul_x4 | |
621 | ||
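For orientation, here is a single-lane Perl model of the interleaved Montgomery multiplication that avx2_mul_x4 above runs on four lanes at once (my own sketch, illustration only). The low digit of the P-256 prime is 2^29-1, so p == -1 (mod 2^29), -p^-1 mod 2^29 is 1, and the digit to cancel each round is simply the accumulator's low digit; that is what the vpand producing $Y computes. The "Skip some multiplications" comment refers to the fixed digits of p: three digits equal 2^29-1 (one product reused three times), two are zero (skipped), and one is a power of two (a shift).

```perl
# Illustration only: radix-2^29 Montgomery multiplication, one lane.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(2)->bpow(256) - Math::BigInt->new(2)->bpow(224)
      + Math::BigInt->new(2)->bpow(192) + Math::BigInt->new(2)->bpow(96) - 1;
my $mask = Math::BigInt->new(1)->blsft(29)->bsub(1);

sub mont_mul_29 {                           # A*B * 2^-261 mod p
    my ($A, $B) = @_;
    my $acc = Math::BigInt->bzero;
    for my $i (0 .. 8) {
        my $bi = $B->copy->brsft(29 * $i) & $mask;      # i-th 29-bit digit of B
        $acc += $A * $bi;
        my $y = $acc->copy & $mask;                     # works since -p^-1 == 1 (mod 2^29)
        $acc += $y * $p;                                # low digit becomes zero ...
        $acc->brsft(29);                                # ... so this shift is exact
    }
    return $acc->bmod($p);
}

my ($a, $b) = map { Math::BigInt->new($_) } "0x123456789abcdef0", "0xfedcba9876543210";
my $want = ($a * $b * Math::BigInt->new(2)->bpow(261)->bmodinv($p))->bmod($p);
printf "%s\n", mont_mul_29($a, $b) == $want ? "ok" : "mismatch";
```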
622 | # Function optimized for the constant 1 | |
623 | ################################################################################ | |
624 | # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4); | |
625 | .type avx2_mul_by1_x4,\@abi-omnipotent | |
626 | .align 32 | |
627 | avx2_mul_by1_x4: | |
628 | lea .LAVX2_POLY(%rip), %rax | |
629 | ||
630 | vpxor $ACC0, $ACC0, $ACC0 | |
631 | vpxor $ACC1, $ACC1, $ACC1 | |
632 | vpxor $ACC2, $ACC2, $ACC2 | |
633 | vpxor $ACC3, $ACC3, $ACC3 | |
634 | vpxor $ACC4, $ACC4, $ACC4 | |
635 | vpxor $ACC5, $ACC5, $ACC5 | |
636 | vpxor $ACC6, $ACC6, $ACC6 | |
637 | vpxor $ACC7, $ACC7, $ACC7 | |
638 | vpxor $ACC8, $ACC8, $ACC8 | |
639 | ||
640 | vmovdqa 32*3+.LONE(%rip), %ymm14 | |
641 | vmovdqa 32*7+.LONE(%rip), %ymm15 | |
642 | ||
643 | mov $n_digits, $itr | |
644 | jmp .Lavx2_mul_by1_x4_loop | |
645 | ||
646 | .align 32 | |
647 | .Lavx2_mul_by1_x4_loop: | |
648 | vmovdqa 32*0($a_ptr), $B | |
649 | .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr | |
650 | ||
651 | vpsllq \$5, $B, $OVERFLOW | |
652 | vpmuludq %ymm14, $B, $T0 | |
653 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
654 | vpaddq $T0, $ACC3, $ACC3 | |
655 | .byte 0x67 | |
656 | vpmuludq $AND_MASK, $B, $T0 | |
657 | vpand $AND_MASK, $ACC0, $Y | |
658 | vpaddq $T0, $ACC4, $ACC4 | |
659 | vpaddq $T0, $ACC5, $ACC5 | |
660 | vpaddq $T0, $ACC6, $ACC6 | |
661 | vpsllq \$23, $B, $T0 | |
662 | ||
663 | .byte 0x67,0x67 | |
664 | vpmuludq %ymm15, $B, $OVERFLOW | |
665 | vpsubq $T0, $ACC6, $ACC6 | |
666 | ||
667 | vpmuludq $AND_MASK, $Y, $T0 | |
668 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
669 | vpaddq $T0, $ACC0, $OVERFLOW | |
670 | vpaddq $T0, $ACC1, $ACC0 | |
671 | .byte 0x67,0x67 | |
672 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
673 | vpaddq $T0, $ACC2, $ACC1 | |
674 | vpmuludq 32*3(%rax), $Y, $T0 | |
675 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
676 | vpaddq $T0, $ACC3, $ACC2 | |
677 | vmovdqa $ACC4, $ACC3 | |
678 | vpsllq \$18, $Y, $OVERFLOW | |
679 | vmovdqa $ACC5, $ACC4 | |
680 | vpmuludq 32*7(%rax), $Y, $T0 | |
681 | vpaddq $OVERFLOW, $ACC6, $ACC5 | |
682 | vpaddq $T0, $ACC7, $ACC6 | |
683 | vpmuludq 32*8(%rax), $Y, $ACC7 | |
684 | ||
685 | dec $itr | |
686 | jnz .Lavx2_mul_by1_x4_loop | |
687 | ||
688 | ret | |
689 | .size avx2_mul_by1_x4,.-avx2_mul_by1_x4 | |
690 | ||
691 | ################################################################################ | |
692 | # void avx2_sqr_x4(void* RESULTx4, void *Ax4); # %rcx - temporary vector | |
693 | .type avx2_sqr_x4,\@abi-omnipotent | |
694 | .align 32 | |
695 | avx2_sqr_x4: | |
696 | lea .LAVX2_POLY(%rip), %rax | |
697 | ||
698 | vmovdqa 32*7(%rax), %ymm14 | |
699 | vmovdqa 32*8(%rax), %ymm15 | |
700 | ||
701 | vmovdqa 32*0($a_ptr), $B | |
702 | vmovdqa 32*1($a_ptr), $ACC1 | |
703 | vmovdqa 32*2($a_ptr), $ACC2 | |
704 | vmovdqa 32*3($a_ptr), $ACC3 | |
705 | vmovdqa 32*4($a_ptr), $ACC4 | |
706 | vmovdqa 32*5($a_ptr), $ACC5 | |
707 | vmovdqa 32*6($a_ptr), $ACC6 | |
708 | vmovdqa 32*7($a_ptr), $ACC7 | |
709 | vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7 | |
710 | vmovdqa 32*8($a_ptr), $ACC8 | |
711 | vpaddq $ACC2, $ACC2, $ACC2 | |
712 | vmovdqa $ACC1, 32*0(%rcx) | |
713 | vpaddq $ACC3, $ACC3, $ACC3 | |
714 | vmovdqa $ACC2, 32*1(%rcx) | |
715 | vpaddq $ACC4, $ACC4, $ACC4 | |
716 | vmovdqa $ACC3, 32*2(%rcx) | |
717 | vpaddq $ACC5, $ACC5, $ACC5 | |
718 | vmovdqa $ACC4, 32*3(%rcx) | |
719 | vpaddq $ACC6, $ACC6, $ACC6 | |
720 | vmovdqa $ACC5, 32*4(%rcx) | |
721 | vpaddq $ACC7, $ACC7, $ACC7 | |
722 | vmovdqa $ACC6, 32*5(%rcx) | |
723 | vpaddq $ACC8, $ACC8, $ACC8 | |
724 | vmovdqa $ACC7, 32*6(%rcx) | |
725 | vmovdqa $ACC8, 32*7(%rcx) | |
726 | ||
727 | #itr 1 | |
728 | vpmuludq $B, $B, $ACC0 | |
729 | vpmuludq $B, $ACC1, $ACC1 | |
730 | vpand $AND_MASK, $ACC0, $Y | |
731 | vpmuludq $B, $ACC2, $ACC2 | |
732 | vpmuludq $B, $ACC3, $ACC3 | |
733 | vpmuludq $B, $ACC4, $ACC4 | |
734 | vpmuludq $B, $ACC5, $ACC5 | |
735 | vpmuludq $B, $ACC6, $ACC6 | |
736 | vpmuludq $AND_MASK, $Y, $T0 | |
737 | vpmuludq $B, $ACC7, $ACC7 | |
738 | vpmuludq $B, $ACC8, $ACC8 | |
739 | vmovdqa 32*1($a_ptr), $B | |
740 | ||
741 | vpaddq $T0, $ACC0, $OVERFLOW | |
742 | vpaddq $T0, $ACC1, $ACC0 | |
743 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
744 | vpaddq $T0, $ACC2, $ACC1 | |
745 | vpmuludq 32*3(%rax), $Y, $T0 | |
746 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
747 | vpaddq $T0, $ACC3, $ACC2 | |
748 | vmovdqa $ACC4, $ACC3 | |
749 | vpsllq \$18, $Y, $T0 | |
750 | vmovdqa $ACC5, $ACC4 | |
751 | vpmuludq %ymm14, $Y, $OVERFLOW | |
752 | vpaddq $T0, $ACC6, $ACC5 | |
753 | vpmuludq %ymm15, $Y, $T0 | |
754 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
755 | vpaddq $T0, $ACC8, $ACC7 | |
756 | ||
757 | #itr 2 | |
758 | vpmuludq $B, $B, $OVERFLOW | |
759 | vpand $AND_MASK, $ACC0, $Y | |
760 | vpmuludq 32*1(%rcx), $B, $T0 | |
761 | vpaddq $OVERFLOW, $ACC1, $ACC1 | |
762 | vpmuludq 32*2(%rcx), $B, $OVERFLOW | |
763 | vpaddq $T0, $ACC2, $ACC2 | |
764 | vpmuludq 32*3(%rcx), $B, $T0 | |
765 | vpaddq $OVERFLOW, $ACC3, $ACC3 | |
766 | vpmuludq 32*4(%rcx), $B, $OVERFLOW | |
767 | vpaddq $T0, $ACC4, $ACC4 | |
768 | vpmuludq 32*5(%rcx), $B, $T0 | |
769 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
770 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
771 | vpaddq $T0, $ACC6, $ACC6 | |
772 | ||
773 | vpmuludq $AND_MASK, $Y, $T0 | |
774 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
775 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
776 | vmovdqa 32*2($a_ptr), $B | |
777 | vpaddq $T0, $ACC0, $OVERFLOW | |
778 | vpaddq $T0, $ACC1, $ACC0 | |
779 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
780 | vpaddq $T0, $ACC2, $ACC1 | |
781 | vpmuludq 32*3(%rax), $Y, $T0 | |
782 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
783 | vpaddq $T0, $ACC3, $ACC2 | |
784 | vmovdqa $ACC4, $ACC3 | |
785 | vpsllq \$18, $Y, $T0 | |
786 | vmovdqa $ACC5, $ACC4 | |
787 | vpmuludq %ymm14, $Y, $OVERFLOW | |
788 | vpaddq $T0, $ACC6, $ACC5 | |
789 | vpmuludq %ymm15, $Y, $T0 | |
790 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
791 | vpaddq $T0, $ACC8, $ACC7 | |
792 | ||
793 | #itr 3 | |
794 | vpmuludq $B, $B, $T0 | |
795 | vpand $AND_MASK, $ACC0, $Y | |
796 | vpmuludq 32*2(%rcx), $B, $OVERFLOW | |
797 | vpaddq $T0, $ACC2, $ACC2 | |
798 | vpmuludq 32*3(%rcx), $B, $T0 | |
799 | vpaddq $OVERFLOW, $ACC3, $ACC3 | |
800 | vpmuludq 32*4(%rcx), $B, $OVERFLOW | |
801 | vpaddq $T0, $ACC4, $ACC4 | |
802 | vpmuludq 32*5(%rcx), $B, $T0 | |
803 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
804 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
805 | vpaddq $T0, $ACC6, $ACC6 | |
806 | ||
807 | vpmuludq $AND_MASK, $Y, $T0 | |
808 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
809 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
810 | vmovdqa 32*3($a_ptr), $B | |
811 | vpaddq $T0, $ACC0, $OVERFLOW | |
812 | vpaddq $T0, $ACC1, $ACC0 | |
813 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
814 | vpaddq $T0, $ACC2, $ACC1 | |
815 | vpmuludq 32*3(%rax), $Y, $T0 | |
816 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
817 | vpaddq $T0, $ACC3, $ACC2 | |
818 | vmovdqa $ACC4, $ACC3 | |
819 | vpsllq \$18, $Y, $T0 | |
820 | vmovdqa $ACC5, $ACC4 | |
821 | vpmuludq %ymm14, $Y, $OVERFLOW | |
822 | vpaddq $T0, $ACC6, $ACC5 | |
823 | vpmuludq %ymm15, $Y, $T0 | |
824 | vpand $AND_MASK, $ACC0, $Y | |
825 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
826 | vpaddq $T0, $ACC8, $ACC7 | |
827 | ||
828 | #itr 4 | |
829 | vpmuludq $B, $B, $OVERFLOW | |
830 | vpmuludq 32*3(%rcx), $B, $T0 | |
831 | vpaddq $OVERFLOW, $ACC3, $ACC3 | |
832 | vpmuludq 32*4(%rcx), $B, $OVERFLOW | |
833 | vpaddq $T0, $ACC4, $ACC4 | |
834 | vpmuludq 32*5(%rcx), $B, $T0 | |
835 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
836 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
837 | vpaddq $T0, $ACC6, $ACC6 | |
838 | ||
839 | vpmuludq $AND_MASK, $Y, $T0 | |
840 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
841 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
842 | vmovdqa 32*4($a_ptr), $B | |
843 | vpaddq $T0, $ACC0, $OVERFLOW | |
844 | vpaddq $T0, $ACC1, $ACC0 | |
845 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
846 | vpaddq $T0, $ACC2, $ACC1 | |
847 | vpmuludq 32*3(%rax), $Y, $T0 | |
848 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
849 | vpaddq $T0, $ACC3, $ACC2 | |
850 | vmovdqa $ACC4, $ACC3 | |
851 | vpsllq \$18, $Y, $T0 | |
852 | vmovdqa $ACC5, $ACC4 | |
853 | vpmuludq %ymm14, $Y, $OVERFLOW | |
854 | vpaddq $T0, $ACC6, $ACC5 | |
855 | vpmuludq %ymm15, $Y, $T0 | |
856 | vpand $AND_MASK, $ACC0, $Y | |
857 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
858 | vpaddq $T0, $ACC8, $ACC7 | |
859 | ||
860 | #itr 5 | |
861 | vpmuludq $B, $B, $T0 | |
862 | vpmuludq 32*4(%rcx), $B, $OVERFLOW | |
863 | vpaddq $T0, $ACC4, $ACC4 | |
864 | vpmuludq 32*5(%rcx), $B, $T0 | |
865 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
866 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
867 | vpaddq $T0, $ACC6, $ACC6 | |
868 | ||
869 | vpmuludq $AND_MASK, $Y, $T0 | |
870 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
871 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
872 | vmovdqa 32*5($a_ptr), $B | |
873 | vpaddq $T0, $ACC0, $OVERFLOW | |
874 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
875 | vpaddq $T0, $ACC1, $ACC0 | |
876 | vpaddq $T0, $ACC2, $ACC1 | |
877 | vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0 | |
878 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
879 | vpaddq $T0, $ACC3, $ACC2 | |
880 | vmovdqa $ACC4, $ACC3 | |
881 | vpsllq \$18, $Y, $T0 | |
882 | vmovdqa $ACC5, $ACC4 | |
883 | vpmuludq %ymm14, $Y, $OVERFLOW | |
884 | vpaddq $T0, $ACC6, $ACC5 | |
885 | vpmuludq %ymm15, $Y, $T0 | |
886 | vpand $AND_MASK, $ACC0, $Y | |
887 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
888 | vpaddq $T0, $ACC8, $ACC7 | |
889 | ||
890 | #itr 6 | |
891 | vpmuludq $B, $B, $OVERFLOW | |
892 | vpmuludq 32*5(%rcx), $B, $T0 | |
893 | vpaddq $OVERFLOW, $ACC5, $ACC5 | |
894 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
895 | vpaddq $T0, $ACC6, $ACC6 | |
896 | ||
897 | vpmuludq $AND_MASK, $Y, $T0 | |
898 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
899 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
900 | vmovdqa 32*6($a_ptr), $B | |
901 | vpaddq $T0, $ACC0, $OVERFLOW | |
902 | vpaddq $T0, $ACC1, $ACC0 | |
903 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
904 | vpaddq $T0, $ACC2, $ACC1 | |
905 | vpmuludq 32*3(%rax), $Y, $T0 | |
906 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
907 | vpaddq $T0, $ACC3, $ACC2 | |
908 | vmovdqa $ACC4, $ACC3 | |
909 | vpsllq \$18, $Y, $T0 | |
910 | vmovdqa $ACC5, $ACC4 | |
911 | vpmuludq %ymm14, $Y, $OVERFLOW | |
912 | vpaddq $T0, $ACC6, $ACC5 | |
913 | vpmuludq %ymm15, $Y, $T0 | |
914 | vpand $AND_MASK, $ACC0, $Y | |
915 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
916 | vpaddq $T0, $ACC8, $ACC7 | |
917 | ||
918 | #itr 7 | |
919 | vpmuludq $B, $B, $T0 | |
920 | vpmuludq 32*6(%rcx), $B, $OVERFLOW | |
921 | vpaddq $T0, $ACC6, $ACC6 | |
922 | ||
923 | vpmuludq $AND_MASK, $Y, $T0 | |
924 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
925 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
926 | vmovdqa 32*7($a_ptr), $B | |
927 | vpaddq $T0, $ACC0, $OVERFLOW | |
928 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
929 | vpaddq $T0, $ACC1, $ACC0 | |
930 | vpaddq $T0, $ACC2, $ACC1 | |
931 | vpmuludq 32*3(%rax), $Y, $T0 | |
932 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
933 | vpaddq $T0, $ACC3, $ACC2 | |
934 | vmovdqa $ACC4, $ACC3 | |
935 | vpsllq \$18, $Y, $T0 | |
936 | vmovdqa $ACC5, $ACC4 | |
937 | vpmuludq %ymm14, $Y, $OVERFLOW | |
938 | vpaddq $T0, $ACC6, $ACC5 | |
939 | vpmuludq %ymm15, $Y, $T0 | |
940 | vpand $AND_MASK, $ACC0, $Y | |
941 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
942 | vpaddq $T0, $ACC8, $ACC7 | |
943 | ||
944 | #itr 8 | |
945 | vpmuludq $B, $B, $OVERFLOW | |
946 | ||
947 | vpmuludq $AND_MASK, $Y, $T0 | |
948 | vpaddq $OVERFLOW, $ACC7, $ACC7 | |
949 | vpmuludq 32*7(%rcx), $B, $ACC8 | |
950 | vmovdqa 32*8($a_ptr), $B | |
951 | vpaddq $T0, $ACC0, $OVERFLOW | |
952 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
953 | vpaddq $T0, $ACC1, $ACC0 | |
954 | vpaddq $T0, $ACC2, $ACC1 | |
955 | vpmuludq 32*3(%rax), $Y, $T0 | |
956 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
957 | vpaddq $T0, $ACC3, $ACC2 | |
958 | vmovdqa $ACC4, $ACC3 | |
959 | vpsllq \$18, $Y, $T0 | |
960 | vmovdqa $ACC5, $ACC4 | |
961 | vpmuludq %ymm14, $Y, $OVERFLOW | |
962 | vpaddq $T0, $ACC6, $ACC5 | |
963 | vpmuludq %ymm15, $Y, $T0 | |
964 | vpand $AND_MASK, $ACC0, $Y | |
965 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
966 | vpaddq $T0, $ACC8, $ACC7 | |
967 | ||
968 | #itr 9 | |
969 | vpmuludq $B, $B, $ACC8 | |
970 | ||
971 | vpmuludq $AND_MASK, $Y, $T0 | |
972 | vpaddq $T0, $ACC0, $OVERFLOW | |
973 | vpsrlq $digit_size, $OVERFLOW, $OVERFLOW | |
974 | vpaddq $T0, $ACC1, $ACC0 | |
975 | vpaddq $T0, $ACC2, $ACC1 | |
976 | vpmuludq 32*3(%rax), $Y, $T0 | |
977 | vpaddq $OVERFLOW, $ACC0, $ACC0 | |
978 | vpaddq $T0, $ACC3, $ACC2 | |
979 | vmovdqa $ACC4, $ACC3 | |
980 | vpsllq \$18, $Y, $T0 | |
981 | vmovdqa $ACC5, $ACC4 | |
982 | vpmuludq %ymm14, $Y, $OVERFLOW | |
983 | vpaddq $T0, $ACC6, $ACC5 | |
984 | vpmuludq %ymm15, $Y, $T0 | |
985 | vpaddq $OVERFLOW, $ACC7, $ACC6 | |
986 | vpaddq $T0, $ACC8, $ACC7 | |
987 | ||
988 | vpxor $ACC8, $ACC8, $ACC8 | |
989 | ||
990 | ret | |
991 | .size avx2_sqr_x4,.-avx2_sqr_x4 | |
992 | ||
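The identity behind the doubled digits that avx2_sqr_x4 writes to the %rcx scratch area first (the callers label it a "temporary vector"): with d2[j] = 2*a[j], iteration i only needs a[i]*a[i] plus a[i]*d2[j] for j > i, which roughly halves the number of cross-product multiplications compared with the general multiply. A check of mine, illustration only:

```perl
# Illustration only: full product vs. squares plus doubled upper triangle.
use strict;
use warnings;
use Math::BigInt;

my @a = map { Math::BigInt->new(int rand(1 << 29)) } 0 .. 8;
my $x = Math::BigInt->new(1)->blsft(29);                 # digit weight 2^29

my $full = Math::BigInt->bzero;                          # sum over all i,j
my $half = Math::BigInt->bzero;                          # squares + doubled j > i terms
for my $i (0 .. 8) {
    $half += $a[$i] * $a[$i] * $x->copy->bpow(2 * $i);
    for my $j (0 .. 8) {
        $full += $a[$i] * $a[$j] * $x->copy->bpow($i + $j);
        $half += 2 * $a[$i] * $a[$j] * $x->copy->bpow($i + $j) if $j > $i;
    }
}
printf "%s\n", $full == $half ? "ok" : "mismatch";
```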
993 | ################################################################################ | |
994 | # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4); | |
995 | .type avx2_sub_x4,\@abi-omnipotent | |
996 | .align 32 | |
997 | avx2_sub_x4: | |
998 | vmovdqa 32*0($a_ptr), $ACC0 | |
999 | lea 160($a_ptr), $a_ptr | |
1000 | lea .LAVX2_POLY_x8+128(%rip), %rax | |
1001 | lea 128($b_ptr), $b_ptr | |
1002 | vmovdqa 32*1-160($a_ptr), $ACC1 | |
1003 | vmovdqa 32*2-160($a_ptr), $ACC2 | |
1004 | vmovdqa 32*3-160($a_ptr), $ACC3 | |
1005 | vmovdqa 32*4-160($a_ptr), $ACC4 | |
1006 | vmovdqa 32*5-160($a_ptr), $ACC5 | |
1007 | vmovdqa 32*6-160($a_ptr), $ACC6 | |
1008 | vmovdqa 32*7-160($a_ptr), $ACC7 | |
1009 | vmovdqa 32*8-160($a_ptr), $ACC8 | |
1010 | ||
1011 | vpaddq 32*0-128(%rax), $ACC0, $ACC0 | |
1012 | vpaddq 32*1-128(%rax), $ACC1, $ACC1 | |
1013 | vpaddq 32*2-128(%rax), $ACC2, $ACC2 | |
1014 | vpaddq 32*3-128(%rax), $ACC3, $ACC3 | |
1015 | vpaddq 32*4-128(%rax), $ACC4, $ACC4 | |
1016 | vpaddq 32*5-128(%rax), $ACC5, $ACC5 | |
1017 | vpaddq 32*6-128(%rax), $ACC6, $ACC6 | |
1018 | vpaddq 32*7-128(%rax), $ACC7, $ACC7 | |
1019 | vpaddq 32*8-128(%rax), $ACC8, $ACC8 | |
1020 | ||
1021 | vpsubq 32*0-128($b_ptr), $ACC0, $ACC0 | |
1022 | vpsubq 32*1-128($b_ptr), $ACC1, $ACC1 | |
1023 | vpsubq 32*2-128($b_ptr), $ACC2, $ACC2 | |
1024 | vpsubq 32*3-128($b_ptr), $ACC3, $ACC3 | |
1025 | vpsubq 32*4-128($b_ptr), $ACC4, $ACC4 | |
1026 | vpsubq 32*5-128($b_ptr), $ACC5, $ACC5 | |
1027 | vpsubq 32*6-128($b_ptr), $ACC6, $ACC6 | |
1028 | vpsubq 32*7-128($b_ptr), $ACC7, $ACC7 | |
1029 | vpsubq 32*8-128($b_ptr), $ACC8, $ACC8 | |
1030 | ||
1031 | ret | |
1032 | .size avx2_sub_x4,.-avx2_sub_x4 | |
1033 | ||
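Why avx2_sub_x4 adds the .LAVX2_POLY_x8 table before the vpsubq: operand digits in the redundant form can be wider than 29 bits, so a plain digit-wise subtraction could go negative; adding 8*p digit-wise first keeps every 64-bit lane non-negative for the operand ranges used here while leaving the value unchanged mod p. A tiny check of mine, illustration only:

```perl
# Illustration only: a + 8*p - b is non-negative and congruent to a - b mod p.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(2)->bpow(256) - Math::BigInt->new(2)->bpow(224)
      + Math::BigInt->new(2)->bpow(192) + Math::BigInt->new(2)->bpow(96) - 1;

my ($a, $b) = (Math::BigInt->new(7), Math::BigInt->new(11));   # a < b would underflow
my $diff = $a + 8 * $p - $b;                                   # still non-negative
printf "%s\n", $diff->bmod($p) == ($a - $b)->bmod($p) ? "ok" : "mismatch";
```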
1034 | .type avx2_select_n_store,\@abi-omnipotent | |
1035 | .align 32 | |
1036 | avx2_select_n_store: | |
1037 | vmovdqa `8+32*9*8`(%rsp), $Y | |
1038 | vpor `8+32*9*8+32`(%rsp), $Y, $Y | |
1039 | ||
1040 | vpandn $ACC0, $Y, $ACC0 | |
1041 | vpandn $ACC1, $Y, $ACC1 | |
1042 | vpandn $ACC2, $Y, $ACC2 | |
1043 | vpandn $ACC3, $Y, $ACC3 | |
1044 | vpandn $ACC4, $Y, $ACC4 | |
1045 | vpandn $ACC5, $Y, $ACC5 | |
1046 | vpandn $ACC6, $Y, $ACC6 | |
1047 | vmovdqa `8+32*9*8+32`(%rsp), $B | |
1048 | vpandn $ACC7, $Y, $ACC7 | |
1049 | vpandn `8+32*9*8`(%rsp), $B, $B | |
1050 | vpandn $ACC8, $Y, $ACC8 | |
1051 | ||
1052 | vpand 32*0(%rsi), $B, $T0 | |
1053 | lea 160(%rsi), %rax | |
1054 | vpand 32*1(%rsi), $B, $Y | |
1055 | vpxor $T0, $ACC0, $ACC0 | |
1056 | vpand 32*2(%rsi), $B, $T0 | |
1057 | vpxor $Y, $ACC1, $ACC1 | |
1058 | vpand 32*3(%rsi), $B, $Y | |
1059 | vpxor $T0, $ACC2, $ACC2 | |
1060 | vpand 32*4-160(%rax), $B, $T0 | |
1061 | vpxor $Y, $ACC3, $ACC3 | |
1062 | vpand 32*5-160(%rax), $B, $Y | |
1063 | vpxor $T0, $ACC4, $ACC4 | |
1064 | vpand 32*6-160(%rax), $B, $T0 | |
1065 | vpxor $Y, $ACC5, $ACC5 | |
1066 | vpand 32*7-160(%rax), $B, $Y | |
1067 | vpxor $T0, $ACC6, $ACC6 | |
1068 | vpand 32*8-160(%rax), $B, $T0 | |
1069 | vmovdqa `8+32*9*8+32`(%rsp), $B | |
1070 | vpxor $Y, $ACC7, $ACC7 | |
1071 | ||
1072 | vpand 32*0(%rdx), $B, $Y | |
1073 | lea 160(%rdx), %rax | |
1074 | vpxor $T0, $ACC8, $ACC8 | |
1075 | vpand 32*1(%rdx), $B, $T0 | |
1076 | vpxor $Y, $ACC0, $ACC0 | |
1077 | vpand 32*2(%rdx), $B, $Y | |
1078 | vpxor $T0, $ACC1, $ACC1 | |
1079 | vpand 32*3(%rdx), $B, $T0 | |
1080 | vpxor $Y, $ACC2, $ACC2 | |
1081 | vpand 32*4-160(%rax), $B, $Y | |
1082 | vpxor $T0, $ACC3, $ACC3 | |
1083 | vpand 32*5-160(%rax), $B, $T0 | |
1084 | vpxor $Y, $ACC4, $ACC4 | |
1085 | vpand 32*6-160(%rax), $B, $Y | |
1086 | vpxor $T0, $ACC5, $ACC5 | |
1087 | vpand 32*7-160(%rax), $B, $T0 | |
1088 | vpxor $Y, $ACC6, $ACC6 | |
1089 | vpand 32*8-160(%rax), $B, $Y | |
1090 | vpxor $T0, $ACC7, $ACC7 | |
1091 | vpxor $Y, $ACC8, $ACC8 | |
1092 | `&STORE` | |
1093 | ||
1094 | ret | |
1095 | .size avx2_select_n_store,.-avx2_select_n_store | |
1096 | ___ | |
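A scalar model of the branchless selection done by avx2_select_n_store above, based on my reading of the vpandn/vpand/vpxor chain (illustration only): the two masks the callers save on the stack (frame offsets `32*9*8` and `+32`) are all-ones when input A, respectively B, was the point at infinity; the computed result survives only when neither mask is set, otherwise the value from %rsi (A infinite) or %rdx (B infinite) is merged in.

```perl
# Illustration only: constant-time three-way select with masks.
use strict;
use warnings;

sub select_ct {
    my ($computed, $if_a_inf, $if_b_inf, $mask_a, $mask_b) = @_;   # masks: 0 or ~0
    my $r = $computed & ~($mask_a | $mask_b);
    $r ^= $if_a_inf & ($mask_a & ~$mask_b);
    $r ^= $if_b_inf & $mask_b;
    return $r;
}

printf "0x%x\n", select_ct(0x1111, 0x2222, 0x3333,  0,  0);    # => 0x1111
printf "0x%x\n", select_ct(0x1111, 0x2222, 0x3333, ~0,  0);    # => 0x2222
printf "0x%x\n", select_ct(0x1111, 0x2222, 0x3333,  0, ~0);    # => 0x3333
```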
1097 | $code.=<<___ if (0); # inlined | |
1098 | ################################################################################ | |
1099 | # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4); | |
1100 | .type avx2_mul_by2_x4,\@abi-omnipotent | |
1101 | .align 32 | |
1102 | avx2_mul_by2_x4: | |
1103 | vmovdqa 32*0($a_ptr), $ACC0 | |
1104 | lea 160($a_ptr), %rax | |
1105 | vmovdqa 32*1($a_ptr), $ACC1 | |
1106 | vmovdqa 32*2($a_ptr), $ACC2 | |
1107 | vmovdqa 32*3($a_ptr), $ACC3 | |
1108 | vmovdqa 32*4-160(%rax), $ACC4 | |
1109 | vmovdqa 32*5-160(%rax), $ACC5 | |
1110 | vmovdqa 32*6-160(%rax), $ACC6 | |
1111 | vmovdqa 32*7-160(%rax), $ACC7 | |
1112 | vmovdqa 32*8-160(%rax), $ACC8 | |
1113 | ||
1114 | vpaddq $ACC0, $ACC0, $ACC0 | |
1115 | vpaddq $ACC1, $ACC1, $ACC1 | |
1116 | vpaddq $ACC2, $ACC2, $ACC2 | |
1117 | vpaddq $ACC3, $ACC3, $ACC3 | |
1118 | vpaddq $ACC4, $ACC4, $ACC4 | |
1119 | vpaddq $ACC5, $ACC5, $ACC5 | |
1120 | vpaddq $ACC6, $ACC6, $ACC6 | |
1121 | vpaddq $ACC7, $ACC7, $ACC7 | |
1122 | vpaddq $ACC8, $ACC8, $ACC8 | |
1123 | ||
1124 | ret | |
1125 | .size avx2_mul_by2_x4,.-avx2_mul_by2_x4 | |
1126 | ___ | |
1127 | my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx"); | |
1128 | my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10"); | |
1129 | ||
1130 | $code.=<<___; | |
1131 | ################################################################################ | |
1132 | # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4); | |
1133 | .globl ecp_nistz256_avx2_point_add_affine_x4 | |
1134 | .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3 | |
1135 | .align 32 | |
1136 | ecp_nistz256_avx2_point_add_affine_x4: | |
1137 | mov %rsp, %rax | |
1138 | push %rbp | |
1139 | vzeroupper | |
1140 | ___ | |
1141 | $code.=<<___ if ($win64); | |
1142 | lea -16*10(%rsp), %rsp | |
1143 | vmovaps %xmm6, -8-16*10(%rax) | |
1144 | vmovaps %xmm7, -8-16*9(%rax) | |
1145 | vmovaps %xmm8, -8-16*8(%rax) | |
1146 | vmovaps %xmm9, -8-16*7(%rax) | |
1147 | vmovaps %xmm10, -8-16*6(%rax) | |
1148 | vmovaps %xmm11, -8-16*5(%rax) | |
1149 | vmovaps %xmm12, -8-16*4(%rax) | |
1150 | vmovaps %xmm13, -8-16*3(%rax) | |
1151 | vmovaps %xmm14, -8-16*2(%rax) | |
1152 | vmovaps %xmm15, -8-16*1(%rax) | |
1153 | ___ | |
1154 | $code.=<<___; | |
1155 | lea -8(%rax), %rbp | |
1156 | ||
1157 | # Result + 32*0 = Result.X | |
1158 | # Result + 32*9 = Result.Y | |
1159 | # Result + 32*18 = Result.Z | |
1160 | ||
1161 | # A + 32*0 = A.X | |
1162 | # A + 32*9 = A.Y | |
1163 | # A + 32*18 = A.Z | |
1164 | ||
1165 | # B + 32*0 = B.X | |
1166 | # B + 32*9 = B.Y | |
1167 | ||
1168 | sub \$`32*9*8+32*2+32*8`, %rsp | |
1169 | and \$-64, %rsp | |
1170 | ||
1171 | mov $r_ptr_in, $r_ptr | |
1172 | mov $a_ptr_in, $a_ptr | |
1173 | mov $b_ptr_in, $b_ptr | |
1174 | ||
1175 | vmovdqa 32*0($a_ptr_in), %ymm0 | |
1176 | vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK | |
1177 | vpxor %ymm1, %ymm1, %ymm1 | |
1178 | lea 256($a_ptr_in), %rax # size optimization | |
1179 | vpor 32*1($a_ptr_in), %ymm0, %ymm0 | |
1180 | vpor 32*2($a_ptr_in), %ymm0, %ymm0 | |
1181 | vpor 32*3($a_ptr_in), %ymm0, %ymm0 | |
1182 | vpor 32*4-256(%rax), %ymm0, %ymm0 | |
1183 | lea 256(%rax), %rcx # size optimization | |
1184 | vpor 32*5-256(%rax), %ymm0, %ymm0 | |
1185 | vpor 32*6-256(%rax), %ymm0, %ymm0 | |
1186 | vpor 32*7-256(%rax), %ymm0, %ymm0 | |
1187 | vpor 32*8-256(%rax), %ymm0, %ymm0 | |
1188 | vpor 32*9-256(%rax), %ymm0, %ymm0 | |
1189 | vpor 32*10-256(%rax), %ymm0, %ymm0 | |
1190 | vpor 32*11-256(%rax), %ymm0, %ymm0 | |
1191 | vpor 32*12-512(%rcx), %ymm0, %ymm0 | |
1192 | vpor 32*13-512(%rcx), %ymm0, %ymm0 | |
1193 | vpor 32*14-512(%rcx), %ymm0, %ymm0 | |
1194 | vpor 32*15-512(%rcx), %ymm0, %ymm0 | |
1195 | vpor 32*16-512(%rcx), %ymm0, %ymm0 | |
1196 | vpor 32*17-512(%rcx), %ymm0, %ymm0 | |
1197 | vpcmpeqq %ymm1, %ymm0, %ymm0 | |
1198 | vmovdqa %ymm0, `32*9*8`(%rsp) | |
1199 | ||
1200 | vpxor %ymm1, %ymm1, %ymm1 | |
1201 | vmovdqa 32*0($b_ptr), %ymm0 | |
1202 | lea 256($b_ptr), %rax # size optimization | |
1203 | vpor 32*1($b_ptr), %ymm0, %ymm0 | |
1204 | vpor 32*2($b_ptr), %ymm0, %ymm0 | |
1205 | vpor 32*3($b_ptr), %ymm0, %ymm0 | |
1206 | vpor 32*4-256(%rax), %ymm0, %ymm0 | |
1207 | lea 256(%rax), %rcx # size optimization | |
1208 | vpor 32*5-256(%rax), %ymm0, %ymm0 | |
1209 | vpor 32*6-256(%rax), %ymm0, %ymm0 | |
1210 | vpor 32*7-256(%rax), %ymm0, %ymm0 | |
1211 | vpor 32*8-256(%rax), %ymm0, %ymm0 | |
1212 | vpor 32*9-256(%rax), %ymm0, %ymm0 | |
1213 | vpor 32*10-256(%rax), %ymm0, %ymm0 | |
1214 | vpor 32*11-256(%rax), %ymm0, %ymm0 | |
1215 | vpor 32*12-512(%rcx), %ymm0, %ymm0 | |
1216 | vpor 32*13-512(%rcx), %ymm0, %ymm0 | |
1217 | vpor 32*14-512(%rcx), %ymm0, %ymm0 | |
1218 | vpor 32*15-512(%rcx), %ymm0, %ymm0 | |
1219 | vpor 32*16-512(%rcx), %ymm0, %ymm0 | |
1220 | vpor 32*17-512(%rcx), %ymm0, %ymm0 | |
1221 | vpcmpeqq %ymm1, %ymm0, %ymm0 | |
1222 | vmovdqa %ymm0, `32*9*8+32`(%rsp) | |
1223 | ||
1224 | # Z1^2 = Z1*Z1 | |
1225 | lea `32*9*2`($a_ptr), %rsi | |
1226 | lea `32*9*2`(%rsp), %rdi | |
1227 | lea `32*9*8+32*2`(%rsp), %rcx # temporary vector | |
1228 | call avx2_sqr_x4 | |
1229 | call avx2_normalize_n_store | |
1230 | ||
1231 | # U2 = X2*Z1^2 | |
1232 | lea `32*9*0`($b_ptr), %rsi | |
1233 | lea `32*9*2`(%rsp), %rdx | |
1234 | lea `32*9*0`(%rsp), %rdi | |
1235 | call avx2_mul_x4 | |
1236 | #call avx2_normalize | |
1237 | `&STORE` | |
1238 | ||
1239 | # S2 = Z1*Z1^2 = Z1^3 | |
1240 | lea `32*9*2`($a_ptr), %rsi | |
1241 | lea `32*9*2`(%rsp), %rdx | |
1242 | lea `32*9*1`(%rsp), %rdi | |
1243 | call avx2_mul_x4 | |
1244 | call avx2_normalize_n_store | |
1245 | ||
1246 | # S2 = S2*Y2 = Y2*Z1^3 | |
1247 | lea `32*9*1`($b_ptr), %rsi | |
1248 | lea `32*9*1`(%rsp), %rdx | |
1249 | lea `32*9*1`(%rsp), %rdi | |
1250 | call avx2_mul_x4 | |
1251 | call avx2_normalize_n_store | |
1252 | ||
1253 | # H = U2 - U1 = U2 - X1 | |
1254 | lea `32*9*0`(%rsp), %rsi | |
1255 | lea `32*9*0`($a_ptr), %rdx | |
1256 | lea `32*9*3`(%rsp), %rdi | |
1257 | call avx2_sub_x4 | |
1258 | call avx2_normalize_n_store | |
1259 | ||
1260 | # R = S2 - S1 = S2 - Y1 | |
1261 | lea `32*9*1`(%rsp), %rsi | |
1262 | lea `32*9*1`($a_ptr), %rdx | |
1263 | lea `32*9*4`(%rsp), %rdi | |
1264 | call avx2_sub_x4 | |
1265 | call avx2_normalize_n_store | |
1266 | ||
1267 | # Z3 = H*Z1*Z2 | |
1268 | lea `32*9*3`(%rsp), %rsi | |
1269 | lea `32*9*2`($a_ptr), %rdx | |
1270 | lea `32*9*2`($r_ptr), %rdi | |
1271 | call avx2_mul_x4 | |
1272 | call avx2_normalize | |
1273 | ||
1274 | lea .LONE(%rip), %rsi | |
1275 | lea `32*9*2`($a_ptr), %rdx | |
1276 | call avx2_select_n_store | |
1277 | ||
1278 | # R^2 = R^2 | |
1279 | lea `32*9*4`(%rsp), %rsi | |
1280 | lea `32*9*6`(%rsp), %rdi | |
1281 | lea `32*9*8+32*2`(%rsp), %rcx # temporary vector | |
1282 | call avx2_sqr_x4 | |
1283 | call avx2_normalize_n_store | |
1284 | ||
1285 | # H^2 = H^2 | |
1286 | lea `32*9*3`(%rsp), %rsi | |
1287 | lea `32*9*5`(%rsp), %rdi | |
1288 | call avx2_sqr_x4 | |
1289 | call avx2_normalize_n_store | |
1290 | ||
1291 | # H^3 = H^2*H | |
1292 | lea `32*9*3`(%rsp), %rsi | |
1293 | lea `32*9*5`(%rsp), %rdx | |
1294 | lea `32*9*7`(%rsp), %rdi | |
1295 | call avx2_mul_x4 | |
1296 | call avx2_normalize_n_store | |
1297 | ||
1298 | # U2 = U1*H^2 | |
1299 | lea `32*9*0`($a_ptr), %rsi | |
1300 | lea `32*9*5`(%rsp), %rdx | |
1301 | lea `32*9*0`(%rsp), %rdi | |
1302 | call avx2_mul_x4 | |
1303 | #call avx2_normalize | |
1304 | `&STORE` | |
1305 | ||
1306 | # Hsqr = U2*2 | |
1307 | #lea 32*9*0(%rsp), %rsi | |
1308 | #lea 32*9*5(%rsp), %rdi | |
1309 | #call avx2_mul_by2_x4 | |
1310 | ||
1311 | vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 | |
1312 | lea `32*9*5`(%rsp), %rdi | |
1313 | vpaddq $ACC1, $ACC1, $ACC1 | |
1314 | vpaddq $ACC2, $ACC2, $ACC2 | |
1315 | vpaddq $ACC3, $ACC3, $ACC3 | |
1316 | vpaddq $ACC4, $ACC4, $ACC4 | |
1317 | vpaddq $ACC5, $ACC5, $ACC5 | |
1318 | vpaddq $ACC6, $ACC6, $ACC6 | |
1319 | vpaddq $ACC7, $ACC7, $ACC7 | |
1320 | vpaddq $ACC8, $ACC8, $ACC8 | |
1321 | call avx2_normalize_n_store | |
1322 | ||
1323 | # X3 = R^2 - H^3 | |
1324 | #lea 32*9*6(%rsp), %rsi | |
1325 | #lea 32*9*7(%rsp), %rdx | |
1326 | #lea 32*9*5(%rsp), %rcx | |
1327 | #lea 32*9*0($r_ptr), %rdi | |
1328 | #call avx2_sub_x4 | |
1329 | #NORMALIZE | |
1330 | #STORE | |
1331 | ||
1332 | # X3 = X3 - U2*2 | |
1333 | #lea 32*9*0($r_ptr), %rsi | |
1334 | #lea 32*9*0($r_ptr), %rdi | |
1335 | #call avx2_sub_x4 | |
1336 | #NORMALIZE | |
1337 | #STORE | |
1338 | ||
1339 | lea `32*9*6+128`(%rsp), %rsi | |
1340 | lea .LAVX2_POLY_x2+128(%rip), %rax | |
1341 | lea `32*9*7+128`(%rsp), %rdx | |
1342 | lea `32*9*5+128`(%rsp), %rcx | |
1343 | lea `32*9*0`($r_ptr), %rdi | |
1344 | ||
1345 | vmovdqa 32*0-128(%rsi), $ACC0 | |
1346 | vmovdqa 32*1-128(%rsi), $ACC1 | |
1347 | vmovdqa 32*2-128(%rsi), $ACC2 | |
1348 | vmovdqa 32*3-128(%rsi), $ACC3 | |
1349 | vmovdqa 32*4-128(%rsi), $ACC4 | |
1350 | vmovdqa 32*5-128(%rsi), $ACC5 | |
1351 | vmovdqa 32*6-128(%rsi), $ACC6 | |
1352 | vmovdqa 32*7-128(%rsi), $ACC7 | |
1353 | vmovdqa 32*8-128(%rsi), $ACC8 | |
1354 | ||
1355 | vpaddq 32*0-128(%rax), $ACC0, $ACC0 | |
1356 | vpaddq 32*1-128(%rax), $ACC1, $ACC1 | |
1357 | vpaddq 32*2-128(%rax), $ACC2, $ACC2 | |
1358 | vpaddq 32*3-128(%rax), $ACC3, $ACC3 | |
1359 | vpaddq 32*4-128(%rax), $ACC4, $ACC4 | |
1360 | vpaddq 32*5-128(%rax), $ACC5, $ACC5 | |
1361 | vpaddq 32*6-128(%rax), $ACC6, $ACC6 | |
1362 | vpaddq 32*7-128(%rax), $ACC7, $ACC7 | |
1363 | vpaddq 32*8-128(%rax), $ACC8, $ACC8 | |
1364 | ||
1365 | vpsubq 32*0-128(%rdx), $ACC0, $ACC0 | |
1366 | vpsubq 32*1-128(%rdx), $ACC1, $ACC1 | |
1367 | vpsubq 32*2-128(%rdx), $ACC2, $ACC2 | |
1368 | vpsubq 32*3-128(%rdx), $ACC3, $ACC3 | |
1369 | vpsubq 32*4-128(%rdx), $ACC4, $ACC4 | |
1370 | vpsubq 32*5-128(%rdx), $ACC5, $ACC5 | |
1371 | vpsubq 32*6-128(%rdx), $ACC6, $ACC6 | |
1372 | vpsubq 32*7-128(%rdx), $ACC7, $ACC7 | |
1373 | vpsubq 32*8-128(%rdx), $ACC8, $ACC8 | |
1374 | ||
1375 | vpsubq 32*0-128(%rcx), $ACC0, $ACC0 | |
1376 | vpsubq 32*1-128(%rcx), $ACC1, $ACC1 | |
1377 | vpsubq 32*2-128(%rcx), $ACC2, $ACC2 | |
1378 | vpsubq 32*3-128(%rcx), $ACC3, $ACC3 | |
1379 | vpsubq 32*4-128(%rcx), $ACC4, $ACC4 | |
1380 | vpsubq 32*5-128(%rcx), $ACC5, $ACC5 | |
1381 | vpsubq 32*6-128(%rcx), $ACC6, $ACC6 | |
1382 | vpsubq 32*7-128(%rcx), $ACC7, $ACC7 | |
1383 | vpsubq 32*8-128(%rcx), $ACC8, $ACC8 | |
1384 | call avx2_normalize | |
1385 | ||
1386 | lea 32*0($b_ptr), %rsi | |
1387 | lea 32*0($a_ptr), %rdx | |
1388 | call avx2_select_n_store | |
1389 | ||
1390 | # H = U2 - X3 | |
1391 | lea `32*9*0`(%rsp), %rsi | |
1392 | lea `32*9*0`($r_ptr), %rdx | |
1393 | lea `32*9*3`(%rsp), %rdi | |
1394 | call avx2_sub_x4 | |
1395 | call avx2_normalize_n_store | |
1396 | ||
1397 | # (U2 - X3)*R | |
1398 | lea `32*9*3`(%rsp), %rsi | |
1399 | lea `32*9*4`(%rsp), %rdx | |
1400 | lea `32*9*3`(%rsp), %rdi | |
1401 | call avx2_mul_x4 | |
1402 | call avx2_normalize_n_store | |
1403 | ||
1404 | # S1*H^3 = Y1*H^3 | |
1405 | lea `32*9*7`(%rsp), %rsi | |
1406 | lea `32*9*1`($a_ptr), %rdx | |
1407 | lea `32*9*1`(%rsp), %rdi | |
1408 | call avx2_mul_x4 | |
1409 | call avx2_normalize_n_store | |
1410 | ||
1411 | # Y3 = (U2 - X3)*R - S1*H^3 | |
1412 | lea `32*9*3`(%rsp), %rsi | |
1413 | lea `32*9*1`(%rsp), %rdx | |
1414 | lea `32*9*1`($r_ptr), %rdi | |
1415 | call avx2_sub_x4 | |
1416 | call avx2_normalize | |
1417 | ||
1418 | lea 32*9($b_ptr), %rsi | |
1419 | lea 32*9($a_ptr), %rdx | |
1420 | call avx2_select_n_store | |
1421 | ||
1422 | #lea 32*9*0($r_ptr), %rsi | |
1423 | #lea 32*9*0($r_ptr), %rdi | |
1424 | #call avx2_mul_by1_x4 | |
1425 | #NORMALIZE | |
1426 | #STORE | |
1427 | ||
1428 | lea `32*9*1`($r_ptr), %rsi | |
1429 | lea `32*9*1`($r_ptr), %rdi | |
1430 | call avx2_mul_by1_x4 | |
1431 | call avx2_normalize_n_store | |
1432 | ||
1433 | vzeroupper | |
1434 | ___ | |
1435 | $code.=<<___ if ($win64); | |
1436 | movaps %xmm6, -16*10(%rbp) | |
1437 | movaps %xmm7, -16*9(%rbp) | |
1438 | movaps %xmm8, -16*8(%rbp) | |
1439 | movaps %xmm9, -16*7(%rbp) | |
1440 | movaps %xmm10, -16*6(%rbp) | |
1441 | movaps %xmm11, -16*5(%rbp) | |
1442 | movaps %xmm12, -16*4(%rbp) | |
1443 | movaps %xmm13, -16*3(%rbp) | |
1444 | movaps %xmm14, -16*2(%rbp) | |
1445 | movaps %xmm15, -16*1(%rbp) | |
1446 | ___ | |
1447 | $code.=<<___; | |
1448 | mov %rbp, %rsp | |
1449 | pop %rbp | |
1450 | ret | |
1451 | .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4 | |
1452 | ||
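For orientation, the call sequence above is the standard mixed Jacobian-plus-affine addition (the B points are affine, i.e. Z2 = 1), performed for four point pairs per invocation. The summary below is pieced together from the inline comments and the scratch-buffer usage, not text from the original file:

```
Z1Z1 = Z1^2          U2 = X2*Z1Z1         S2 = Y2*Z1*Z1Z1
H    = U2 - X1       R  = S2 - Y1         Z3 = H*Z1
X3   = R^2 - H^3 - 2*X1*H^2
Y3   = R*(X1*H^2 - X3) - Y1*H^3
```

The avx2_select_n_store calls branchlessly substitute the untouched other operand whenever A or B is the point at infinity.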
1453 | ################################################################################ | |
1454 | # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4); | |
1455 | .globl ecp_nistz256_avx2_point_add_affines_x4 | |
1456 | .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3 | |
1457 | .align 32 | |
1458 | ecp_nistz256_avx2_point_add_affines_x4: | |
1459 | mov %rsp, %rax | |
1460 | push %rbp | |
1461 | vzeroupper | |
1462 | ___ | |
1463 | $code.=<<___ if ($win64); | |
1464 | lea -16*10(%rsp), %rsp | |
1465 | vmovaps %xmm6, -8-16*10(%rax) | |
1466 | vmovaps %xmm7, -8-16*9(%rax) | |
1467 | vmovaps %xmm8, -8-16*8(%rax) | |
1468 | vmovaps %xmm9, -8-16*7(%rax) | |
1469 | vmovaps %xmm10, -8-16*6(%rax) | |
1470 | vmovaps %xmm11, -8-16*5(%rax) | |
1471 | vmovaps %xmm12, -8-16*4(%rax) | |
1472 | vmovaps %xmm13, -8-16*3(%rax) | |
1473 | vmovaps %xmm14, -8-16*2(%rax) | |
1474 | vmovaps %xmm15, -8-16*1(%rax) | |
1475 | ___ | |
1476 | $code.=<<___; | |
1477 | lea -8(%rax), %rbp | |
1478 | ||
1479 | # Result + 32*0 = Result.X | |
1480 | # Result + 32*9 = Result.Y | |
1481 | # Result + 32*18 = Result.Z | |
1482 | ||
1483 | # A + 32*0 = A.X | |
1484 | # A + 32*9 = A.Y | |
1485 | ||
1486 | # B + 32*0 = B.X | |
1487 | # B + 32*9 = B.Y | |
1488 | ||
1489 | sub \$`32*9*8+32*2+32*8`, %rsp | |
1490 | and \$-64, %rsp | |
1491 | ||
1492 | mov $r_ptr_in, $r_ptr | |
1493 | mov $a_ptr_in, $a_ptr | |
1494 | mov $b_ptr_in, $b_ptr | |
1495 | ||
1496 | vmovdqa 32*0($a_ptr_in), %ymm0 | |
1497 | vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK | |
1498 | vpxor %ymm1, %ymm1, %ymm1 | |
1499 | lea 256($a_ptr_in), %rax # size optimization | |
1500 | vpor 32*1($a_ptr_in), %ymm0, %ymm0 | |
1501 | vpor 32*2($a_ptr_in), %ymm0, %ymm0 | |
1502 | vpor 32*3($a_ptr_in), %ymm0, %ymm0 | |
1503 | vpor 32*4-256(%rax), %ymm0, %ymm0 | |
1504 | lea 256(%rax), %rcx # size optimization | |
1505 | vpor 32*5-256(%rax), %ymm0, %ymm0 | |
1506 | vpor 32*6-256(%rax), %ymm0, %ymm0 | |
1507 | vpor 32*7-256(%rax), %ymm0, %ymm0 | |
1508 | vpor 32*8-256(%rax), %ymm0, %ymm0 | |
1509 | vpor 32*9-256(%rax), %ymm0, %ymm0 | |
1510 | vpor 32*10-256(%rax), %ymm0, %ymm0 | |
1511 | vpor 32*11-256(%rax), %ymm0, %ymm0 | |
1512 | vpor 32*12-512(%rcx), %ymm0, %ymm0 | |
1513 | vpor 32*13-512(%rcx), %ymm0, %ymm0 | |
1514 | vpor 32*14-512(%rcx), %ymm0, %ymm0 | |
1515 | vpor 32*15-512(%rcx), %ymm0, %ymm0 | |
1516 | vpor 32*16-512(%rcx), %ymm0, %ymm0 | |
1517 | vpor 32*17-512(%rcx), %ymm0, %ymm0 | |
1518 | vpcmpeqq %ymm1, %ymm0, %ymm0 | |
1519 | vmovdqa %ymm0, `32*9*8`(%rsp) | |
1520 | ||
1521 | vpxor %ymm1, %ymm1, %ymm1 | |
1522 | vmovdqa 32*0($b_ptr), %ymm0 | |
1523 | lea 256($b_ptr), %rax # size optimization | |
1524 | vpor 32*1($b_ptr), %ymm0, %ymm0 | |
1525 | vpor 32*2($b_ptr), %ymm0, %ymm0 | |
1526 | vpor 32*3($b_ptr), %ymm0, %ymm0 | |
1527 | vpor 32*4-256(%rax), %ymm0, %ymm0 | |
1528 | lea 256(%rax), %rcx # size optimization | |
1529 | vpor 32*5-256(%rax), %ymm0, %ymm0 | |
1530 | vpor 32*6-256(%rax), %ymm0, %ymm0 | |
1531 | vpor 32*7-256(%rax), %ymm0, %ymm0 | |
1532 | vpor 32*8-256(%rax), %ymm0, %ymm0 | |
1533 | vpor 32*9-256(%rax), %ymm0, %ymm0 | |
1534 | vpor 32*10-256(%rax), %ymm0, %ymm0 | |
1535 | vpor 32*11-256(%rax), %ymm0, %ymm0 | |
1536 | vpor 32*12-512(%rcx), %ymm0, %ymm0 | |
1537 | vpor 32*13-512(%rcx), %ymm0, %ymm0 | |
1538 | vpor 32*14-512(%rcx), %ymm0, %ymm0 | |
1539 | vpor 32*15-512(%rcx), %ymm0, %ymm0 | |
1540 | vpor 32*16-512(%rcx), %ymm0, %ymm0 | |
1541 | vpor 32*17-512(%rcx), %ymm0, %ymm0 | |
1542 | vpcmpeqq %ymm1, %ymm0, %ymm0 | |
1543 | vmovdqa %ymm0, `32*9*8+32`(%rsp) | |
1544 | ||
1545 | # H = U2 - U1 = X2 - X1 | |
1546 | lea `32*9*0`($b_ptr), %rsi | |
1547 | lea `32*9*0`($a_ptr), %rdx | |
1548 | lea `32*9*3`(%rsp), %rdi | |
1549 | call avx2_sub_x4 | |
1550 | call avx2_normalize_n_store | |
1551 | ||
1552 | # R = S2 - S1 = Y2 - Y1 | |
1553 | lea `32*9*1`($b_ptr), %rsi | |
1554 | lea `32*9*1`($a_ptr), %rdx | |
1555 | lea `32*9*4`(%rsp), %rdi | |
1556 | call avx2_sub_x4 | |
1557 | call avx2_normalize_n_store | |
1558 | ||
1559 | # Z3 = H*Z1*Z2 = H | |
1560 | lea `32*9*3`(%rsp), %rsi | |
1561 | lea `32*9*2`($r_ptr), %rdi | |
1562 | call avx2_mul_by1_x4 | |
1563 | call avx2_normalize | |
1564 | ||
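	# If either input was the point at infinity, force Z3 = 1 (taken from
	# .LONE) using the masks computed above, so the selected result below
	# is the surviving point itself.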
1565 | vmovdqa `32*9*8`(%rsp), $B | |
1566 | vpor `32*9*8+32`(%rsp), $B, $B | |
1567 | ||
1568 | vpandn $ACC0, $B, $ACC0 | |
1569 | lea .LONE+128(%rip), %rax | |
1570 | vpandn $ACC1, $B, $ACC1 | |
1571 | vpandn $ACC2, $B, $ACC2 | |
1572 | vpandn $ACC3, $B, $ACC3 | |
1573 | vpandn $ACC4, $B, $ACC4 | |
1574 | vpandn $ACC5, $B, $ACC5 | |
1575 | vpandn $ACC6, $B, $ACC6 | |
1576 | vpandn $ACC7, $B, $ACC7 | |
1577 | ||
1578 | vpand 32*0-128(%rax), $B, $T0 | |
1579 | vpandn $ACC8, $B, $ACC8 | |
1580 | vpand 32*1-128(%rax), $B, $Y | |
1581 | vpxor $T0, $ACC0, $ACC0 | |
1582 | vpand 32*2-128(%rax), $B, $T0 | |
1583 | vpxor $Y, $ACC1, $ACC1 | |
1584 | vpand 32*3-128(%rax), $B, $Y | |
1585 | vpxor $T0, $ACC2, $ACC2 | |
1586 | vpand 32*4-128(%rax), $B, $T0 | |
1587 | vpxor $Y, $ACC3, $ACC3 | |
1588 | vpand 32*5-128(%rax), $B, $Y | |
1589 | vpxor $T0, $ACC4, $ACC4 | |
1590 | vpand 32*6-128(%rax), $B, $T0 | |
1591 | vpxor $Y, $ACC5, $ACC5 | |
1592 | vpand 32*7-128(%rax), $B, $Y | |
1593 | vpxor $T0, $ACC6, $ACC6 | |
1594 | vpand 32*8-128(%rax), $B, $T0 | |
1595 | vpxor $Y, $ACC7, $ACC7 | |
1596 | vpxor $T0, $ACC8, $ACC8 | |
1597 | `&STORE` | |
1598 | ||
1599 | # R^2 = R^2 | |
1600 | lea `32*9*4`(%rsp), %rsi | |
1601 | lea `32*9*6`(%rsp), %rdi | |
1602 | lea `32*9*8+32*2`(%rsp), %rcx # temporary vector | |
1603 | call avx2_sqr_x4 | |
1604 | call avx2_normalize_n_store | |
1605 | ||
1606 | # H^2 = H^2 | |
1607 | lea `32*9*3`(%rsp), %rsi | |
1608 | lea `32*9*5`(%rsp), %rdi | |
1609 | call avx2_sqr_x4 | |
1610 | call avx2_normalize_n_store | |
1611 | ||
1612 | # H^3 = H^2*H | |
1613 | lea `32*9*3`(%rsp), %rsi | |
1614 | lea `32*9*5`(%rsp), %rdx | |
1615 | lea `32*9*7`(%rsp), %rdi | |
1616 | call avx2_mul_x4 | |
1617 | call avx2_normalize_n_store | |
1618 | ||
1619 | # U2 = U1*H^2 | |
1620 | lea `32*9*0`($a_ptr), %rsi | |
1621 | lea `32*9*5`(%rsp), %rdx | |
1622 | lea `32*9*0`(%rsp), %rdi | |
1623 | call avx2_mul_x4 | |
1624 | #call avx2_normalize | |
1625 | `&STORE` | |
1626 | ||
1627 | # Hsqr = U2*2 | |
1628 | #lea 32*9*0(%rsp), %rsi | |
1629 | #lea 32*9*5(%rsp), %rdi | |
1630 | #call avx2_mul_by2_x4 | |
1631 | ||
1632 | vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 | |
1633 | lea `32*9*5`(%rsp), %rdi | |
1634 | vpaddq $ACC1, $ACC1, $ACC1 | |
1635 | vpaddq $ACC2, $ACC2, $ACC2 | |
1636 | vpaddq $ACC3, $ACC3, $ACC3 | |
1637 | vpaddq $ACC4, $ACC4, $ACC4 | |
1638 | vpaddq $ACC5, $ACC5, $ACC5 | |
1639 | vpaddq $ACC6, $ACC6, $ACC6 | |
1640 | vpaddq $ACC7, $ACC7, $ACC7 | |
1641 | vpaddq $ACC8, $ACC8, $ACC8 | |
1642 | call avx2_normalize_n_store | |
1643 | ||
1644 | # X3 = R^2 - H^3 | |
1645 | #lea 32*9*6(%rsp), %rsi | |
1646 | #lea 32*9*7(%rsp), %rdx | |
1647 | #lea 32*9*5(%rsp), %rcx | |
1648 | #lea 32*9*0($r_ptr), %rdi | |
1649 | #call avx2_sub_x4 | |
1650 | #NORMALIZE | |
1651 | #STORE | |
1652 | ||
1653 | # X3 = X3 - U2*2 | |
1654 | #lea 32*9*0($r_ptr), %rsi | |
1655 | #lea 32*9*0($r_ptr), %rdi | |
1656 | #call avx2_sub_x4 | |
1657 | #NORMALIZE | |
1658 | #STORE | |
1659 | ||
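	# Fused X3 = R^2 + 2*p - H^3 - 2*U2, with 2*p taken from
	# .LAVX2_POLY_x2 so that no digit can go negative before the
	# subtractions are normalized.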
1660 | lea `32*9*6+128`(%rsp), %rsi | |
1661 | lea .LAVX2_POLY_x2+128(%rip), %rax | |
1662 | lea `32*9*7+128`(%rsp), %rdx | |
1663 | lea `32*9*5+128`(%rsp), %rcx | |
1664 | lea `32*9*0`($r_ptr), %rdi | |
1665 | ||
1666 | vmovdqa 32*0-128(%rsi), $ACC0 | |
1667 | vmovdqa 32*1-128(%rsi), $ACC1 | |
1668 | vmovdqa 32*2-128(%rsi), $ACC2 | |
1669 | vmovdqa 32*3-128(%rsi), $ACC3 | |
1670 | vmovdqa 32*4-128(%rsi), $ACC4 | |
1671 | vmovdqa 32*5-128(%rsi), $ACC5 | |
1672 | vmovdqa 32*6-128(%rsi), $ACC6 | |
1673 | vmovdqa 32*7-128(%rsi), $ACC7 | |
1674 | vmovdqa 32*8-128(%rsi), $ACC8 | |
1675 | ||
1676 | vpaddq 32*0-128(%rax), $ACC0, $ACC0 | |
1677 | vpaddq 32*1-128(%rax), $ACC1, $ACC1 | |
1678 | vpaddq 32*2-128(%rax), $ACC2, $ACC2 | |
1679 | vpaddq 32*3-128(%rax), $ACC3, $ACC3 | |
1680 | vpaddq 32*4-128(%rax), $ACC4, $ACC4 | |
1681 | vpaddq 32*5-128(%rax), $ACC5, $ACC5 | |
1682 | vpaddq 32*6-128(%rax), $ACC6, $ACC6 | |
1683 | vpaddq 32*7-128(%rax), $ACC7, $ACC7 | |
1684 | vpaddq 32*8-128(%rax), $ACC8, $ACC8 | |
1685 | ||
1686 | vpsubq 32*0-128(%rdx), $ACC0, $ACC0 | |
1687 | vpsubq 32*1-128(%rdx), $ACC1, $ACC1 | |
1688 | vpsubq 32*2-128(%rdx), $ACC2, $ACC2 | |
1689 | vpsubq 32*3-128(%rdx), $ACC3, $ACC3 | |
1690 | vpsubq 32*4-128(%rdx), $ACC4, $ACC4 | |
1691 | vpsubq 32*5-128(%rdx), $ACC5, $ACC5 | |
1692 | vpsubq 32*6-128(%rdx), $ACC6, $ACC6 | |
1693 | vpsubq 32*7-128(%rdx), $ACC7, $ACC7 | |
1694 | vpsubq 32*8-128(%rdx), $ACC8, $ACC8 | |
1695 | ||
1696 | vpsubq 32*0-128(%rcx), $ACC0, $ACC0 | |
1697 | vpsubq 32*1-128(%rcx), $ACC1, $ACC1 | |
1698 | vpsubq 32*2-128(%rcx), $ACC2, $ACC2 | |
1699 | vpsubq 32*3-128(%rcx), $ACC3, $ACC3 | |
1700 | vpsubq 32*4-128(%rcx), $ACC4, $ACC4 | |
1701 | vpsubq 32*5-128(%rcx), $ACC5, $ACC5 | |
1702 | vpsubq 32*6-128(%rcx), $ACC6, $ACC6 | |
1703 | vpsubq 32*7-128(%rcx), $ACC7, $ACC7 | |
1704 | vpsubq 32*8-128(%rcx), $ACC8, $ACC8 | |
1705 | call avx2_normalize | |
1706 | ||
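	# If A (resp. B) was the point at infinity, substitute B.X (resp. A.X)
	# for the X3 just computed.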
1707 | lea 32*0($b_ptr), %rsi | |
1708 | lea 32*0($a_ptr), %rdx | |
1709 | call avx2_select_n_store | |
1710 | ||
1711 | # H = U2 - X3 | |
1712 | lea `32*9*0`(%rsp), %rsi | |
1713 | lea `32*9*0`($r_ptr), %rdx | |
1714 | lea `32*9*3`(%rsp), %rdi | |
1715 | call avx2_sub_x4 | |
1716 | call avx2_normalize_n_store | |
1717 | ||
1718 | # H = H*R | |
1719 | lea `32*9*3`(%rsp), %rsi | |
1720 | lea `32*9*4`(%rsp), %rdx | |
1721 | lea `32*9*3`(%rsp), %rdi | |
1722 | call avx2_mul_x4 | |
1723 | call avx2_normalize_n_store | |
1724 | ||
1725 | # S2 = S1 * H^3 | |
1726 | lea `32*9*7`(%rsp), %rsi | |
1727 | lea `32*9*1`($a_ptr), %rdx | |
1728 | lea `32*9*1`(%rsp), %rdi | |
1729 | call avx2_mul_x4 | |
1730 | call avx2_normalize_n_store | |
1731 | ||
	# Y3 = H - S2 = R*(U2 - X3) - S1*H^3
1733 | lea `32*9*3`(%rsp), %rsi | |
1734 | lea `32*9*1`(%rsp), %rdx | |
1735 | lea `32*9*1`($r_ptr), %rdi | |
1736 | call avx2_sub_x4 | |
1737 | call avx2_normalize | |
1738 | ||
1739 | lea 32*9($b_ptr), %rsi | |
1740 | lea 32*9($a_ptr), %rdx | |
1741 | call avx2_select_n_store | |
1742 | ||
1743 | #lea 32*9*0($r_ptr), %rsi | |
1744 | #lea 32*9*0($r_ptr), %rdi | |
1745 | #call avx2_mul_by1_x4 | |
1746 | #NORMALIZE | |
1747 | #STORE | |
1748 | ||
1749 | lea `32*9*1`($r_ptr), %rsi | |
1750 | lea `32*9*1`($r_ptr), %rdi | |
1751 | call avx2_mul_by1_x4 | |
1752 | call avx2_normalize_n_store | |
1753 | ||
1754 | vzeroupper | |
1755 | ___ | |
1756 | $code.=<<___ if ($win64); | |
	movaps	-16*10(%rbp), %xmm6		# restore non-volatile XMM registers
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
1767 | ___ | |
1768 | $code.=<<___; | |
1769 | mov %rbp, %rsp | |
1770 | pop %rbp | |
1771 | ret | |
1772 | .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4 | |
1773 | ||
1774 | ################################################################################ | |
1775 | # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4); | |
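# Multiplies each of the four field elements by the precomputed Montgomery
# constant at .LTO_MONT_AVX2, converting them to Montgomery form.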
1776 | .globl ecp_nistz256_avx2_to_mont | |
1777 | .type ecp_nistz256_avx2_to_mont,\@function,2 | |
1778 | .align 32 | |
1779 | ecp_nistz256_avx2_to_mont: | |
1780 | vzeroupper | |
1781 | ___ | |
1782 | $code.=<<___ if ($win64); | |
	mov	%rsp, %rax			# %rax anchors the XMM save area below
	lea	-8-16*10(%rsp), %rsp
1784 | vmovaps %xmm6, -8-16*10(%rax) | |
1785 | vmovaps %xmm7, -8-16*9(%rax) | |
1786 | vmovaps %xmm8, -8-16*8(%rax) | |
1787 | vmovaps %xmm9, -8-16*7(%rax) | |
1788 | vmovaps %xmm10, -8-16*6(%rax) | |
1789 | vmovaps %xmm11, -8-16*5(%rax) | |
1790 | vmovaps %xmm12, -8-16*4(%rax) | |
1791 | vmovaps %xmm13, -8-16*3(%rax) | |
1792 | vmovaps %xmm14, -8-16*2(%rax) | |
1793 | vmovaps %xmm15, -8-16*1(%rax) | |
1794 | ___ | |
1795 | $code.=<<___; | |
1796 | vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK | |
1797 | lea .LTO_MONT_AVX2(%rip), %rdx | |
1798 | call avx2_mul_x4 | |
1799 | call avx2_normalize_n_store | |
1800 | ||
1801 | vzeroupper | |
1802 | ___ | |
1803 | $code.=<<___ if ($win64); | |
1804 | movaps 16*0(%rsp), %xmm6 | |
1805 | movaps 16*1(%rsp), %xmm7 | |
1806 | movaps 16*2(%rsp), %xmm8 | |
1807 | movaps 16*3(%rsp), %xmm9 | |
1808 | movaps 16*4(%rsp), %xmm10 | |
1809 | movaps 16*5(%rsp), %xmm11 | |
1810 | movaps 16*6(%rsp), %xmm12 | |
1811 | movaps 16*7(%rsp), %xmm13 | |
1812 | movaps 16*8(%rsp), %xmm14 | |
1813 | movaps 16*9(%rsp), %xmm15 | |
1814 | lea 8+16*10(%rsp), %rsp | |
1815 | ___ | |
1816 | $code.=<<___; | |
1817 | ret | |
1818 | .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont | |
1819 | ||
1820 | ################################################################################ | |
1821 | # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4); | |
1822 | .globl ecp_nistz256_avx2_from_mont | |
1823 | .type ecp_nistz256_avx2_from_mont,\@function,2 | |
1824 | .align 32 | |
1825 | ecp_nistz256_avx2_from_mont: | |
1826 | vzeroupper | |
1827 | ___ | |
1828 | $code.=<<___ if ($win64); | |
	mov	%rsp, %rax			# %rax anchors the XMM save area below
	lea	-8-16*10(%rsp), %rsp
1830 | vmovaps %xmm6, -8-16*10(%rax) | |
1831 | vmovaps %xmm7, -8-16*9(%rax) | |
1832 | vmovaps %xmm8, -8-16*8(%rax) | |
1833 | vmovaps %xmm9, -8-16*7(%rax) | |
1834 | vmovaps %xmm10, -8-16*6(%rax) | |
1835 | vmovaps %xmm11, -8-16*5(%rax) | |
1836 | vmovaps %xmm12, -8-16*4(%rax) | |
1837 | vmovaps %xmm13, -8-16*3(%rax) | |
1838 | vmovaps %xmm14, -8-16*2(%rax) | |
1839 | vmovaps %xmm15, -8-16*1(%rax) | |
1840 | ___ | |
1841 | $code.=<<___; | |
1842 | vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK | |
1843 | lea .LFROM_MONT_AVX2(%rip), %rdx | |
1844 | call avx2_mul_x4 | |
1845 | call avx2_normalize_n_store | |
1846 | ||
1847 | vzeroupper | |
1848 | ___ | |
1849 | $code.=<<___ if ($win64); | |
1850 | movaps 16*0(%rsp), %xmm6 | |
1851 | movaps 16*1(%rsp), %xmm7 | |
1852 | movaps 16*2(%rsp), %xmm8 | |
1853 | movaps 16*3(%rsp), %xmm9 | |
1854 | movaps 16*4(%rsp), %xmm10 | |
1855 | movaps 16*5(%rsp), %xmm11 | |
1856 | movaps 16*6(%rsp), %xmm12 | |
1857 | movaps 16*7(%rsp), %xmm13 | |
1858 | movaps 16*8(%rsp), %xmm14 | |
1859 | movaps 16*9(%rsp), %xmm15 | |
1860 | lea 8+16*10(%rsp), %rsp | |
1861 | ___ | |
1862 | $code.=<<___; | |
1863 | ret | |
1864 | .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont | |
1865 | ||
1866 | ################################################################################ | |
1867 | # void ecp_nistz256_avx2_set1(void* RESULTx4); | |
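# Copies the constant at .LONE (the representation of 1) into all four
# lanes of RESULT.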
1868 | .globl ecp_nistz256_avx2_set1 | |
1869 | .type ecp_nistz256_avx2_set1,\@function,1 | |
1870 | .align 32 | |
1871 | ecp_nistz256_avx2_set1: | |
1872 | lea .LONE+128(%rip), %rax | |
1873 | lea 128(%rdi), %rdi | |
1874 | vzeroupper | |
1875 | vmovdqa 32*0-128(%rax), %ymm0 | |
1876 | vmovdqa 32*1-128(%rax), %ymm1 | |
1877 | vmovdqa 32*2-128(%rax), %ymm2 | |
1878 | vmovdqa 32*3-128(%rax), %ymm3 | |
1879 | vmovdqa 32*4-128(%rax), %ymm4 | |
1880 | vmovdqa 32*5-128(%rax), %ymm5 | |
1881 | vmovdqa %ymm0, 32*0-128(%rdi) | |
1882 | vmovdqa 32*6-128(%rax), %ymm0 | |
1883 | vmovdqa %ymm1, 32*1-128(%rdi) | |
1884 | vmovdqa 32*7-128(%rax), %ymm1 | |
1885 | vmovdqa %ymm2, 32*2-128(%rdi) | |
1886 | vmovdqa 32*8-128(%rax), %ymm2 | |
1887 | vmovdqa %ymm3, 32*3-128(%rdi) | |
1888 | vmovdqa %ymm4, 32*4-128(%rdi) | |
1889 | vmovdqa %ymm5, 32*5-128(%rdi) | |
1890 | vmovdqa %ymm0, 32*6-128(%rdi) | |
1891 | vmovdqa %ymm1, 32*7-128(%rdi) | |
1892 | vmovdqa %ymm2, 32*8-128(%rdi) | |
1893 | ||
1894 | vzeroupper | |
1895 | ret | |
1896 | .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1 | |
1897 | ___ | |
1898 | } | |
1899 | { | |
1900 | ################################################################################ | |
3ff08e1d | 1901 | # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, |
4d3fa06f AP | 1902 | # int index0, int index1, int index2, int index3); |
1903 | ################################################################################ | |
1904 | ||
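# Constant-time gather: all 64 table entries are read for each of the four
# indices; vpcmpeqd produces an all-ones mask only for the entry matching
# the index, which is then accumulated with vpand/vpxor.  The memory access
# pattern is therefore independent of the (possibly secret) indices, and
# index 0, which matches no entry, returns the point at infinity (0,0).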
1905 | my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d"); | |
1906 | my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3)); | |
1907 | my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); | |
1908 | my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); | |
1909 | ||
1910 | $code.=<<___; | |
3ff08e1d AP | 1911 | .globl ecp_nistz256_avx2_multi_gather_w7 |
1912 | .type ecp_nistz256_avx2_multi_gather_w7,\@function,6 | |
4d3fa06f | 1913 | .align 32 |
3ff08e1d | 1914 | ecp_nistz256_avx2_multi_gather_w7: |
4d3fa06f AP | 1915 | vzeroupper |
1916 | ___ | |
1917 | $code.=<<___ if ($win64); | |
	mov	%rsp, %rax			# %rax anchors the XMM save area below
	lea	-8-16*10(%rsp), %rsp
1919 | vmovaps %xmm6, -8-16*10(%rax) | |
1920 | vmovaps %xmm7, -8-16*9(%rax) | |
1921 | vmovaps %xmm8, -8-16*8(%rax) | |
1922 | vmovaps %xmm9, -8-16*7(%rax) | |
1923 | vmovaps %xmm10, -8-16*6(%rax) | |
1924 | vmovaps %xmm11, -8-16*5(%rax) | |
1925 | vmovaps %xmm12, -8-16*4(%rax) | |
1926 | vmovaps %xmm13, -8-16*3(%rax) | |
1927 | vmovaps %xmm14, -8-16*2(%rax) | |
1928 | vmovaps %xmm15, -8-16*1(%rax) | |
1929 | ___ | |
1930 | $code.=<<___; | |
1931 | lea .LIntOne(%rip), %rax | |
1932 | ||
1933 | vmovd $index0, %xmm0 | |
1934 | vmovd $index1, %xmm1 | |
1935 | vmovd $index2, %xmm2 | |
1936 | vmovd $index3, %xmm3 | |
1937 | ||
1938 | vpxor $R0a, $R0a, $R0a | |
1939 | vpxor $R0b, $R0b, $R0b | |
1940 | vpxor $R1a, $R1a, $R1a | |
1941 | vpxor $R1b, $R1b, $R1b | |
1942 | vpxor $R2a, $R2a, $R2a | |
1943 | vpxor $R2b, $R2b, $R2b | |
1944 | vpxor $R3a, $R3a, $R3a | |
1945 | vpxor $R3b, $R3b, $R3b | |
1946 | vmovdqa (%rax), $M0 | |
1947 | ||
1948 | vpermd $INDEX0, $R0a, $INDEX0 | |
1949 | vpermd $INDEX1, $R0a, $INDEX1 | |
1950 | vpermd $INDEX2, $R0a, $INDEX2 | |
1951 | vpermd $INDEX3, $R0a, $INDEX3 | |
1952 | ||
1953 | mov \$64, %ecx | |
1954 | lea 112($val), $val # size optimization | |
1955 | jmp .Lmulti_select_loop_avx2 | |
1956 | ||
# INDEX=0 corresponds to the point at infinity (0,0)
1958 | .align 32 | |
1959 | .Lmulti_select_loop_avx2: | |
1960 | vpcmpeqd $INDEX0, $M0, $TMP0 | |
1961 | ||
1962 | vmovdqa `32*0+32*64*2*0`($in_t), $T0 | |
1963 | vmovdqa `32*1+32*64*2*0`($in_t), $T1 | |
1964 | vpand $TMP0, $T0, $T0 | |
1965 | vpand $TMP0, $T1, $T1 | |
1966 | vpxor $T0, $R0a, $R0a | |
1967 | vpxor $T1, $R0b, $R0b | |
1968 | ||
1969 | vpcmpeqd $INDEX1, $M0, $TMP0 | |
1970 | ||
1971 | vmovdqa `32*0+32*64*2*1`($in_t), $T0 | |
1972 | vmovdqa `32*1+32*64*2*1`($in_t), $T1 | |
1973 | vpand $TMP0, $T0, $T0 | |
1974 | vpand $TMP0, $T1, $T1 | |
1975 | vpxor $T0, $R1a, $R1a | |
1976 | vpxor $T1, $R1b, $R1b | |
1977 | ||
1978 | vpcmpeqd $INDEX2, $M0, $TMP0 | |
1979 | ||
1980 | vmovdqa `32*0+32*64*2*2`($in_t), $T0 | |
1981 | vmovdqa `32*1+32*64*2*2`($in_t), $T1 | |
1982 | vpand $TMP0, $T0, $T0 | |
1983 | vpand $TMP0, $T1, $T1 | |
1984 | vpxor $T0, $R2a, $R2a | |
1985 | vpxor $T1, $R2b, $R2b | |
1986 | ||
1987 | vpcmpeqd $INDEX3, $M0, $TMP0 | |
1988 | ||
1989 | vmovdqa `32*0+32*64*2*3`($in_t), $T0 | |
1990 | vmovdqa `32*1+32*64*2*3`($in_t), $T1 | |
1991 | vpand $TMP0, $T0, $T0 | |
1992 | vpand $TMP0, $T1, $T1 | |
1993 | vpxor $T0, $R3a, $R3a | |
1994 | vpxor $T1, $R3b, $R3b | |
1995 | ||
1996 | vpaddd (%rax), $M0, $M0 # increment | |
1997 | lea 32*2($in_t), $in_t | |
1998 | ||
1999 | dec %ecx | |
2000 | jnz .Lmulti_select_loop_avx2 | |
2001 | ||
2002 | vmovdqu $R0a, 32*0-112($val) | |
2003 | vmovdqu $R0b, 32*1-112($val) | |
2004 | vmovdqu $R1a, 32*2-112($val) | |
2005 | vmovdqu $R1b, 32*3-112($val) | |
2006 | vmovdqu $R2a, 32*4-112($val) | |
2007 | vmovdqu $R2b, 32*5-112($val) | |
2008 | vmovdqu $R3a, 32*6-112($val) | |
2009 | vmovdqu $R3b, 32*7-112($val) | |
2010 | ||
2011 | vzeroupper | |
2012 | ___ | |
2013 | $code.=<<___ if ($win64); | |
2014 | movaps 16*0(%rsp), %xmm6 | |
2015 | movaps 16*1(%rsp), %xmm7 | |
2016 | movaps 16*2(%rsp), %xmm8 | |
2017 | movaps 16*3(%rsp), %xmm9 | |
2018 | movaps 16*4(%rsp), %xmm10 | |
2019 | movaps 16*5(%rsp), %xmm11 | |
2020 | movaps 16*6(%rsp), %xmm12 | |
2021 | movaps 16*7(%rsp), %xmm13 | |
2022 | movaps 16*8(%rsp), %xmm14 | |
2023 | movaps 16*9(%rsp), %xmm15 | |
2024 | lea 8+16*10(%rsp), %rsp | |
2025 | ___ | |
2026 | $code.=<<___; | |
2027 | ret | |
3ff08e1d | 2028 | .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 |
4d3fa06f AP | 2029 | |
2030 | .extern OPENSSL_ia32cap_P | |
2031 | .globl ecp_nistz_avx2_eligible | |
2032 | .type ecp_nistz_avx2_eligible,\@abi-omnipotent | |
2033 | .align 32 | |
2034 | ecp_nistz_avx2_eligible: | |
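	# AVX2 is bit 5 of the CPUID.7:EBX word cached at OPENSSL_ia32cap_P+8.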
2035 | mov OPENSSL_ia32cap_P+8(%rip),%eax | |
2036 | shr \$5,%eax | |
2037 | and \$1,%eax | |
2038 | ret | |
2039 | .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible | |
2040 | ___ | |
2041 | } | |
2042 | }} else {{ # assembler is too old | |
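# The assembler cannot encode AVX2: emit trapping stubs (ud2) for every
# entry point and have ecp_nistz_avx2_eligible return 0 so that none of
# them is ever called.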
2043 | $code.=<<___; | |
2044 | .text | |
2045 | ||
2046 | .globl ecp_nistz256_avx2_transpose_convert | |
2047 | .globl ecp_nistz256_avx2_convert_transpose_back | |
2048 | .globl ecp_nistz256_avx2_point_add_affine_x4 | |
2049 | .globl ecp_nistz256_avx2_point_add_affines_x4 | |
2050 | .globl ecp_nistz256_avx2_to_mont | |
2051 | .globl ecp_nistz256_avx2_from_mont | |
2052 | .globl ecp_nistz256_avx2_set1 | |
3ff08e1d AP | 2053 | .globl ecp_nistz256_avx2_multi_gather_w7 |
2054 | .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent | |
4d3fa06f AP | 2055 | ecp_nistz256_avx2_transpose_convert: |
2056 | ecp_nistz256_avx2_convert_transpose_back: | |
2057 | ecp_nistz256_avx2_point_add_affine_x4: | |
2058 | ecp_nistz256_avx2_point_add_affines_x4: | |
2059 | ecp_nistz256_avx2_to_mont: | |
2060 | ecp_nistz256_avx2_from_mont: | |
2061 | ecp_nistz256_avx2_set1: | |
3ff08e1d | 2062 | ecp_nistz256_avx2_multi_gather_w7: |
4d3fa06f AP | 2063 | .byte 0x0f,0x0b # ud2 |
2064 | ret | |
3ff08e1d | 2065 | .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 |
4d3fa06f AP | 2066 | |
2067 | .globl ecp_nistz_avx2_eligible | |
2068 | .type ecp_nistz_avx2_eligible,\@abi-omnipotent | |
2069 | ecp_nistz_avx2_eligible: | |
2070 | xor %eax,%eax | |
2071 | ret | |
2072 | .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible | |
2073 | ___ | |
2074 | }} | |
2075 | ||
2076 | foreach (split("\n",$code)) { | |
2077 | s/\`([^\`]*)\`/eval($1)/geo; | |
2078 | ||
2079 | print $_,"\n"; | |
2080 | } | |
2081 | ||
a21314db | 2082 | close STDOUT or die "error closing STDOUT: $!"; |