1 #! /usr/bin/env perl
2 # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4 #
5 # Licensed under the OpenSSL license (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
9 #
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
13 #
14 # Reference:
15 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
16 # 256 Bit Primes"
17
18 $flavour = shift;
19 $output = shift;
20 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
21
22 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
23
24 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
25 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
26 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
27 die "can't locate x86_64-xlate.pl";
28
29 open OUT,"| \"$^X\" $xlate $flavour $output";
30 *STDOUT=*OUT;
31
32 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
33 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
34 $avx = ($1>=2.19) + ($1>=2.22);
35 $addx = ($1>=2.23);
36 }
37
38 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
39 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
40 $avx = ($1>=2.09) + ($1>=2.10);
41 $addx = ($1>=2.10);
42 }
43
44 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
45 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
46 $avx = ($1>=10) + ($1>=11);
47 $addx = ($1>=12);
48 }
49
50 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
51 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
52 $avx = ($ver>=3.0) + ($ver>=3.01);
53 $addx = ($ver>=3.03);
54 }
55
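# An assembler new enough for AVX2 is signalled by $avx>=2; otherwise the
# stub versions at the bottom of this file are emitted instead.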
56 if ($avx>=2) {{
57 $digit_size = "\$29";
58 $n_digits = "\$9";
59
60 $code.=<<___;
61 .text
62
63 .align 64
64 .LAVX2_AND_MASK:
65 .LAVX2_POLY:
66 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
67 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
68 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
69 .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
70 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
71 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
72 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
73 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
74 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
75
76 .LAVX2_POLY_x2:
77 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
78 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
79 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
80 .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
81 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
82 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
83 .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
84 .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
85 .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
86
87 .LAVX2_POLY_x8:
88 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
89 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
90 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
91 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
92 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
93 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
94 .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
95 .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
96 .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
97
98 .LONE:
99 .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
100 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
101 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
102 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
103 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
104 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
105 .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
106 .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
107 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
108
109 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
110 # Montgomery form (*2^256) to our format (*2^261)
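# (A field element occupies nine 29-bit digits here, i.e. lives modulo 2^261,
# and avx2_mul_x4 below performs nine 29-bit reduction steps, dividing by
# 2^261 mod p.  Hence, loosely:
#     a*2^256 * RR * 2^-261 = a*2^256 * 2^266 * 2^-261 = a*2^261 (mod p).)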
111
112 .LTO_MONT_AVX2:
113 .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
114 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
115 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
116 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
117 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
118 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
119 .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
120 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
121 .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
122
123 .LFROM_MONT_AVX2:
124 .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
125 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
126 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
127 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
128 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
129 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
130 .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
131 .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
132 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
133
134 .LIntOne:
135 .long 1,1,1,1,1,1,1,1
136 ___
137
138 {
139 # This function receives a pointer to an array of four affine points
140 # (X, Y, <1>) and rearranges the data for AVX2 execution, while
141 # converting it to redundant radix-2^29 form
142
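# A minimal reference sketch of that split (a hypothetical helper: it is never
# called and nothing of it reaches the generated assembly).  Four 64-bit limbs,
# least significant first, become nine 29-bit digits, matching the out[i]
# formulas quoted in the comments of the instruction sequence below.  Assumes
# a 64-bit perl.
sub __radix29_split_reference {
    my @in   = @_;                      # in[0..3], 64-bit limbs
    my $mask = (1<<29)-1;
    my @out;
    for my $i (0..8) {
        my $bit  = 29*$i;               # starting bit of digit $i
        my $limb = $bit>>6;             # 64-bit limb it starts in
        my $off  = $bit&63;             # bit offset within that limb
        my $d    = $in[$limb]>>$off;
        $d |= $in[$limb+1]<<(64-$off) if ($off>35 && $limb<3);
        push @out, $d & $mask;
    }
    return @out;                        # value == sum(out[i]*2^(29*i))
}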
143 my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
144 $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
145
146 $code.=<<___;
147 .globl ecp_nistz256_avx2_transpose_convert
148 .type ecp_nistz256_avx2_transpose_convert,\@function,2
149 .align 64
150 ecp_nistz256_avx2_transpose_convert:
151 vzeroupper
152 ___
153 $code.=<<___ if ($win64);
154 lea -8-16*10(%rsp), %rsp
155 vmovaps %xmm6, -8-16*10(%rax)
156 vmovaps %xmm7, -8-16*9(%rax)
157 vmovaps %xmm8, -8-16*8(%rax)
158 vmovaps %xmm9, -8-16*7(%rax)
159 vmovaps %xmm10, -8-16*6(%rax)
160 vmovaps %xmm11, -8-16*5(%rax)
161 vmovaps %xmm12, -8-16*4(%rax)
162 vmovaps %xmm13, -8-16*3(%rax)
163 vmovaps %xmm14, -8-16*2(%rax)
164 vmovaps %xmm15, -8-16*1(%rax)
165 ___
166 $code.=<<___;
167 # Load the data
168 vmovdqa 32*0(%rsi), $X0
169 lea 112(%rsi), %rax # size optimization
170 vmovdqa 32*1(%rsi), $Y0
171 lea .LAVX2_AND_MASK(%rip), %rdx
172 vmovdqa 32*2(%rsi), $X1
173 vmovdqa 32*3(%rsi), $Y1
174 vmovdqa 32*4-112(%rax), $X2
175 vmovdqa 32*5-112(%rax), $Y2
176 vmovdqa 32*6-112(%rax), $X3
177 vmovdqa 32*7-112(%rax), $Y3
178
179 # Transpose X and Y independently
180 vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
181 vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
182 vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
183 vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
184
185 vpunpcklqdq $Y1, $Y0, $T4
186 vpunpcklqdq $Y3, $Y2, $T5
187 vpunpckhqdq $Y1, $Y0, $T6
188 vpunpckhqdq $Y3, $Y2, $T7
189
190 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
191 vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
192 vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
193 vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
194
195 vperm2i128 \$0x20, $T5, $T4, $Y0
196 vperm2i128 \$0x20, $T7, $T6, $Y1
197 vperm2i128 \$0x31, $T5, $T4, $Y2
198 vperm2i128 \$0x31, $T7, $T6, $Y3
199 vmovdqa (%rdx), $T7
200
201 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
202 vpsrlq \$29, $X0, $X0
203 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
204 vpsrlq \$29, $X0, $X0
205 vpsllq \$6, $X1, $T2
206 vpxor $X0, $T2, $T2
207 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
208 vpsrlq \$23, $X1, $X1
209 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
210 vpsrlq \$29, $X1, $X1
211 vpsllq \$12, $X2, $T4
212 vpxor $X1, $T4, $T4
213 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
214 vpsrlq \$17, $X2, $X2
215 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
216 vpsrlq \$29, $X2, $X2
217 vpsllq \$18, $X3, $T6
218 vpxor $X2, $T6, $T6
219 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
220 vpsrlq \$11, $X3, $X3
221 vmovdqa $T0, 32*0(%rdi)
222 lea 112(%rdi), %rax # size optimization
223 vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
224 vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
225
226 vmovdqa $T1, 32*1(%rdi)
227 vmovdqa $T2, 32*2(%rdi)
228 vmovdqa $T3, 32*3(%rdi)
229 vmovdqa $T4, 32*4-112(%rax)
230 vmovdqa $T5, 32*5-112(%rax)
231 vmovdqa $T6, 32*6-112(%rax)
232 vmovdqa $T0, 32*7-112(%rax)
233 vmovdqa $X3, 32*8-112(%rax)
234 lea 448(%rdi), %rax # size optimization
235
236 vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
237 vpsrlq \$29, $Y0, $Y0
238 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
239 vpsrlq \$29, $Y0, $Y0
240 vpsllq \$6, $Y1, $T2
241 vpxor $Y0, $T2, $T2
242 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
243 vpsrlq \$23, $Y1, $Y1
244 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
245 vpsrlq \$29, $Y1, $Y1
246 vpsllq \$12, $Y2, $T4
247 vpxor $Y1, $T4, $T4
248 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
249 vpsrlq \$17, $Y2, $Y2
250 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
251 vpsrlq \$29, $Y2, $Y2
252 vpsllq \$18, $Y3, $T6
253 vpxor $Y2, $T6, $T6
254 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
255 vpsrlq \$11, $Y3, $Y3
256 vmovdqa $T0, 32*9-448(%rax)
257 vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
258 vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
259
260 vmovdqa $T1, 32*10-448(%rax)
261 vmovdqa $T2, 32*11-448(%rax)
262 vmovdqa $T3, 32*12-448(%rax)
263 vmovdqa $T4, 32*13-448(%rax)
264 vmovdqa $T5, 32*14-448(%rax)
265 vmovdqa $T6, 32*15-448(%rax)
266 vmovdqa $T0, 32*16-448(%rax)
267 vmovdqa $Y3, 32*17-448(%rax)
268
269 vzeroupper
270 ___
271 $code.=<<___ if ($win64);
272 movaps 16*0(%rsp), %xmm6
273 movaps 16*1(%rsp), %xmm7
274 movaps 16*2(%rsp), %xmm8
275 movaps 16*3(%rsp), %xmm9
276 movaps 16*4(%rsp), %xmm10
277 movaps 16*5(%rsp), %xmm11
278 movaps 16*6(%rsp), %xmm12
279 movaps 16*7(%rsp), %xmm13
280 movaps 16*8(%rsp), %xmm14
281 movaps 16*9(%rsp), %xmm15
282 lea 8+16*10(%rsp), %rsp
283 ___
284 $code.=<<___;
285 ret
286 .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
287 ___
288 }
289 {
290 ################################################################################
291 # This function receives a pointer to an array of four AVX2 formatted points
292 # (X, Y, Z), converts the data to normal representation, and rearranges it
293
294 my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
295 my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
296
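# Packing note: with 29-bit digits d0..d8 the element value is
#     sum_{i=0..8} d_i * 2^(29*i)
# and the out[0..3] expressions in the comments below simply regroup that sum
# into four 64-bit limbs (this assumes the digits were normalized to 29 bits,
# so the vpaddq recombination cannot carry across a 64-bit lane).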
297 $code.=<<___;
298
299 .globl ecp_nistz256_avx2_convert_transpose_back
300 .type ecp_nistz256_avx2_convert_transpose_back,\@function,2
301 .align 32
302 ecp_nistz256_avx2_convert_transpose_back:
303 vzeroupper
304 ___
305 $code.=<<___ if ($win64);
306 lea -8-16*10(%rsp), %rsp
307 vmovaps %xmm6, -8-16*10(%rax)
308 vmovaps %xmm7, -8-16*9(%rax)
309 vmovaps %xmm8, -8-16*8(%rax)
310 vmovaps %xmm9, -8-16*7(%rax)
311 vmovaps %xmm10, -8-16*6(%rax)
312 vmovaps %xmm11, -8-16*5(%rax)
313 vmovaps %xmm12, -8-16*4(%rax)
314 vmovaps %xmm13, -8-16*3(%rax)
315 vmovaps %xmm14, -8-16*2(%rax)
316 vmovaps %xmm15, -8-16*1(%rax)
317 ___
318 $code.=<<___;
319 mov \$3, %ecx
320
321 .Lconv_loop:
322 vmovdqa 32*0(%rsi), $D0
323 lea 160(%rsi), %rax # size optimization
324 vmovdqa 32*1(%rsi), $D1
325 vmovdqa 32*2(%rsi), $D2
326 vmovdqa 32*3(%rsi), $D3
327 vmovdqa 32*4-160(%rax), $D4
328 vmovdqa 32*5-160(%rax), $D5
329 vmovdqa 32*6-160(%rax), $D6
330 vmovdqa 32*7-160(%rax), $D7
331 vmovdqa 32*8-160(%rax), $D8
332
333 vpsllq \$29, $D1, $D1
334 vpsllq \$58, $D2, $T0
335 vpaddq $D1, $D0, $D0
336 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
337
338 vpsrlq \$6, $D2, $D2
339 vpsllq \$23, $D3, $D3
340 vpsllq \$52, $D4, $T1
341 vpaddq $D2, $D3, $D3
342 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
343
344 vpsrlq \$12, $D4, $D4
345 vpsllq \$17, $D5, $D5
346 vpsllq \$46, $D6, $T2
347 vpaddq $D4, $D5, $D5
348 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
349
350 vpsrlq \$18, $D6, $D6
351 vpsllq \$11, $D7, $D7
352 vpsllq \$40, $D8, $T3
353 vpaddq $D6, $D7, $D7
354 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
355
356 vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
357 vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
358 vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
359 vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
360
361 vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
362 vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
363 vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
364 vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
365
366 vmovdqa $D0, 32*0(%rdi)
367 vmovdqa $D1, 32*3(%rdi)
368 vmovdqa $D2, 32*6(%rdi)
369 vmovdqa $D3, 32*9(%rdi)
370
371 lea 32*9(%rsi), %rsi
372 lea 32*1(%rdi), %rdi
373
374 dec %ecx
375 jnz .Lconv_loop
376
377 vzeroupper
378 ___
379 $code.=<<___ if ($win64);
380 movaps 16*0(%rsp), %xmm6
381 movaps 16*1(%rsp), %xmm7
382 movaps 16*2(%rsp), %xmm8
383 movaps 16*3(%rsp), %xmm9
384 movaps 16*4(%rsp), %xmm10
385 movaps 16*5(%rsp), %xmm11
386 movaps 16*6(%rsp), %xmm12
387 movaps 16*7(%rsp), %xmm13
388 movaps 16*8(%rsp), %xmm14
389 movaps 16*9(%rsp), %xmm15
390 lea 8+16*10(%rsp), %rsp
391 ___
392 $code.=<<___;
393 ret
394 .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
395 ___
396 }
397 {
398 my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
399 my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
400 my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
401
402 sub NORMALIZE {
403 my $ret=<<___;
404 vpsrlq $digit_size, $ACC0, $T0
405 vpand $AND_MASK, $ACC0, $ACC0
406 vpaddq $T0, $ACC1, $ACC1
407
408 vpsrlq $digit_size, $ACC1, $T0
409 vpand $AND_MASK, $ACC1, $ACC1
410 vpaddq $T0, $ACC2, $ACC2
411
412 vpsrlq $digit_size, $ACC2, $T0
413 vpand $AND_MASK, $ACC2, $ACC2
414 vpaddq $T0, $ACC3, $ACC3
415
416 vpsrlq $digit_size, $ACC3, $T0
417 vpand $AND_MASK, $ACC3, $ACC3
418 vpaddq $T0, $ACC4, $ACC4
419
420 vpsrlq $digit_size, $ACC4, $T0
421 vpand $AND_MASK, $ACC4, $ACC4
422 vpaddq $T0, $ACC5, $ACC5
423
424 vpsrlq $digit_size, $ACC5, $T0
425 vpand $AND_MASK, $ACC5, $ACC5
426 vpaddq $T0, $ACC6, $ACC6
427
428 vpsrlq $digit_size, $ACC6, $T0
429 vpand $AND_MASK, $ACC6, $ACC6
430 vpaddq $T0, $ACC7, $ACC7
431
432 vpsrlq $digit_size, $ACC7, $T0
433 vpand $AND_MASK, $ACC7, $ACC7
434 vpaddq $T0, $ACC8, $ACC8
435 #vpand $AND_MASK, $ACC8, $ACC8
436 ___
437 $ret;
438 }
439
440 sub STORE {
441 my $ret=<<___;
442 vmovdqa $ACC0, 32*0(%rdi)
443 lea 160(%rdi), %rax # size optimization
444 vmovdqa $ACC1, 32*1(%rdi)
445 vmovdqa $ACC2, 32*2(%rdi)
446 vmovdqa $ACC3, 32*3(%rdi)
447 vmovdqa $ACC4, 32*4-160(%rax)
448 vmovdqa $ACC5, 32*5-160(%rax)
449 vmovdqa $ACC6, 32*6-160(%rax)
450 vmovdqa $ACC7, 32*7-160(%rax)
451 vmovdqa $ACC8, 32*8-160(%rax)
452 ___
453 $ret;
454 }
455
456 $code.=<<___;
457 .type avx2_normalize,\@abi-omnipotent
458 .align 32
459 avx2_normalize:
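	# Carry propagation: each digit is trimmed to 29 bits and its overflow
	# is added into the next digit; the top digit is deliberately left
	# unmasked (see the commented-out vpand), keeping the redundant form.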
460 vpsrlq $digit_size, $ACC0, $T0
461 vpand $AND_MASK, $ACC0, $ACC0
462 vpaddq $T0, $ACC1, $ACC1
463
464 vpsrlq $digit_size, $ACC1, $T0
465 vpand $AND_MASK, $ACC1, $ACC1
466 vpaddq $T0, $ACC2, $ACC2
467
468 vpsrlq $digit_size, $ACC2, $T0
469 vpand $AND_MASK, $ACC2, $ACC2
470 vpaddq $T0, $ACC3, $ACC3
471
472 vpsrlq $digit_size, $ACC3, $T0
473 vpand $AND_MASK, $ACC3, $ACC3
474 vpaddq $T0, $ACC4, $ACC4
475
476 vpsrlq $digit_size, $ACC4, $T0
477 vpand $AND_MASK, $ACC4, $ACC4
478 vpaddq $T0, $ACC5, $ACC5
479
480 vpsrlq $digit_size, $ACC5, $T0
481 vpand $AND_MASK, $ACC5, $ACC5
482 vpaddq $T0, $ACC6, $ACC6
483
484 vpsrlq $digit_size, $ACC6, $T0
485 vpand $AND_MASK, $ACC6, $ACC6
486 vpaddq $T0, $ACC7, $ACC7
487
488 vpsrlq $digit_size, $ACC7, $T0
489 vpand $AND_MASK, $ACC7, $ACC7
490 vpaddq $T0, $ACC8, $ACC8
491 #vpand $AND_MASK, $ACC8, $ACC8
492
493 ret
494 .size avx2_normalize,.-avx2_normalize
495
496 .type avx2_normalize_n_store,\@abi-omnipotent
497 .align 32
498 avx2_normalize_n_store:
499 vpsrlq $digit_size, $ACC0, $T0
500 vpand $AND_MASK, $ACC0, $ACC0
501 vpaddq $T0, $ACC1, $ACC1
502
503 vpsrlq $digit_size, $ACC1, $T0
504 vpand $AND_MASK, $ACC1, $ACC1
505 vmovdqa $ACC0, 32*0(%rdi)
506 lea 160(%rdi), %rax # size optimization
507 vpaddq $T0, $ACC2, $ACC2
508
509 vpsrlq $digit_size, $ACC2, $T0
510 vpand $AND_MASK, $ACC2, $ACC2
511 vmovdqa $ACC1, 32*1(%rdi)
512 vpaddq $T0, $ACC3, $ACC3
513
514 vpsrlq $digit_size, $ACC3, $T0
515 vpand $AND_MASK, $ACC3, $ACC3
516 vmovdqa $ACC2, 32*2(%rdi)
517 vpaddq $T0, $ACC4, $ACC4
518
519 vpsrlq $digit_size, $ACC4, $T0
520 vpand $AND_MASK, $ACC4, $ACC4
521 vmovdqa $ACC3, 32*3(%rdi)
522 vpaddq $T0, $ACC5, $ACC5
523
524 vpsrlq $digit_size, $ACC5, $T0
525 vpand $AND_MASK, $ACC5, $ACC5
526 vmovdqa $ACC4, 32*4-160(%rax)
527 vpaddq $T0, $ACC6, $ACC6
528
529 vpsrlq $digit_size, $ACC6, $T0
530 vpand $AND_MASK, $ACC6, $ACC6
531 vmovdqa $ACC5, 32*5-160(%rax)
532 vpaddq $T0, $ACC7, $ACC7
533
534 vpsrlq $digit_size, $ACC7, $T0
535 vpand $AND_MASK, $ACC7, $ACC7
536 vmovdqa $ACC6, 32*6-160(%rax)
537 vpaddq $T0, $ACC8, $ACC8
538 #vpand $AND_MASK, $ACC8, $ACC8
539 vmovdqa $ACC7, 32*7-160(%rax)
540 vmovdqa $ACC8, 32*8-160(%rax)
541
542 ret
543 .size avx2_normalize_n_store,.-avx2_normalize_n_store
544
545 ################################################################################
546 # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
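# The loop below is, in effect, a digit-serial Montgomery multiplication in
# radix 2^29: each iteration multiplies A by one 29-bit digit of B, folds in
# the multiple of the prime that makes the lowest accumulator digit divisible
# by 2^29, and shifts the accumulators down by one digit.  Because the prime
# is -1 mod 2^29, that multiple is simply acc0 AND mask (the vpand producing
# the Y register below), and after nine iterations the result amounts to
# A*B*2^-261 mod p, still in redundant form.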
547 .type avx2_mul_x4,\@abi-omnipotent
548 .align 32
549 avx2_mul_x4:
550 lea .LAVX2_POLY(%rip), %rax
551
552 vpxor $ACC0, $ACC0, $ACC0
553 vpxor $ACC1, $ACC1, $ACC1
554 vpxor $ACC2, $ACC2, $ACC2
555 vpxor $ACC3, $ACC3, $ACC3
556 vpxor $ACC4, $ACC4, $ACC4
557 vpxor $ACC5, $ACC5, $ACC5
558 vpxor $ACC6, $ACC6, $ACC6
559 vpxor $ACC7, $ACC7, $ACC7
560
561 vmovdqa 32*7(%rax), %ymm14
562 vmovdqa 32*8(%rax), %ymm15
563
564 mov $n_digits, $itr
565 lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
566 jmp .Lavx2_mul_x4_loop
567
568 .align 32
569 .Lavx2_mul_x4_loop:
570 vmovdqa 32*0($b_ptr), $B
571 lea 32*1($b_ptr), $b_ptr
572
573 vpmuludq 32*0+512($a_ptr), $B, $T0
574 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
575 vpaddq $T0, $ACC0, $ACC0
576 vpmuludq 32*2+512($a_ptr), $B, $T0
577 vpaddq $OVERFLOW, $ACC1, $ACC1
578 vpand $AND_MASK, $ACC0, $Y
579 vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
580 vpaddq $T0, $ACC2, $ACC2
581 vpmuludq 32*4+512($a_ptr), $B, $T0
582 vpaddq $OVERFLOW, $ACC3, $ACC3
583 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
584 vpaddq $T0, $ACC4, $ACC4
585 vpmuludq 32*6+512($a_ptr), $B, $T0
586 vpaddq $OVERFLOW, $ACC5, $ACC5
587 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
588 vpaddq $T0, $ACC6, $ACC6
589
590 # Skip some multiplications, optimizing for the constant poly
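	# (.LAVX2_POLY has the 29-bit mask in digits 0-2, zeroes in digits 4-5
	# and 2^18 in digit 6, so those partial products collapse into a single
	# vpmuludq by the mask, plain register renames and a vpsllq by 18; only
	# digits 3, 7 and 8 - 32*3(%rax), %ymm14, %ymm15 - need real multiplies.)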
591 vpmuludq $AND_MASK, $Y, $T0
592 vpaddq $OVERFLOW, $ACC7, $ACC7
593 vpmuludq 32*8+512($a_ptr), $B, $ACC8
594 vpaddq $T0, $ACC0, $OVERFLOW
595 vpaddq $T0, $ACC1, $ACC0
596 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
597 vpaddq $T0, $ACC2, $ACC1
598 vpmuludq 32*3(%rax), $Y, $T0
599 vpaddq $OVERFLOW, $ACC0, $ACC0
600 vpaddq $T0, $ACC3, $ACC2
601 .byte 0x67
602 vmovdqa $ACC4, $ACC3
603 vpsllq \$18, $Y, $OVERFLOW
604 .byte 0x67
605 vmovdqa $ACC5, $ACC4
606 vpmuludq %ymm14, $Y, $T0
607 vpaddq $OVERFLOW, $ACC6, $ACC5
608 vpmuludq %ymm15, $Y, $OVERFLOW
609 vpaddq $T0, $ACC7, $ACC6
610 vpaddq $OVERFLOW, $ACC8, $ACC7
611
612 dec $itr
613 jnz .Lavx2_mul_x4_loop
614
615 vpxor $ACC8, $ACC8, $ACC8
616
617 ret
618 .size avx2_mul_x4,.-avx2_mul_x4
619
620 # Function optimized for the constant 1
621 ################################################################################
622 # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
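# ("Constant 1" means the vector .LONE: its only nonzero digits are 2^5 in
# digit 0, the values preloaded into %ymm14/%ymm15 (digits 3 and 7), and
# mask-like digits 4-6, digit 6 being mask minus 2^23 - hence the vpsubq of
# B shifted left by 23 - so the product reduces to shifts, adds and a few
# multiplies.)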
623 .type avx2_mul_by1_x4,\@abi-omnipotent
624 .align 32
625 avx2_mul_by1_x4:
626 lea .LAVX2_POLY(%rip), %rax
627
628 vpxor $ACC0, $ACC0, $ACC0
629 vpxor $ACC1, $ACC1, $ACC1
630 vpxor $ACC2, $ACC2, $ACC2
631 vpxor $ACC3, $ACC3, $ACC3
632 vpxor $ACC4, $ACC4, $ACC4
633 vpxor $ACC5, $ACC5, $ACC5
634 vpxor $ACC6, $ACC6, $ACC6
635 vpxor $ACC7, $ACC7, $ACC7
636 vpxor $ACC8, $ACC8, $ACC8
637
638 vmovdqa 32*3+.LONE(%rip), %ymm14
639 vmovdqa 32*7+.LONE(%rip), %ymm15
640
641 mov $n_digits, $itr
642 jmp .Lavx2_mul_by1_x4_loop
643
644 .align 32
645 .Lavx2_mul_by1_x4_loop:
646 vmovdqa 32*0($a_ptr), $B
647 .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
648
649 vpsllq \$5, $B, $OVERFLOW
650 vpmuludq %ymm14, $B, $T0
651 vpaddq $OVERFLOW, $ACC0, $ACC0
652 vpaddq $T0, $ACC3, $ACC3
653 .byte 0x67
654 vpmuludq $AND_MASK, $B, $T0
655 vpand $AND_MASK, $ACC0, $Y
656 vpaddq $T0, $ACC4, $ACC4
657 vpaddq $T0, $ACC5, $ACC5
658 vpaddq $T0, $ACC6, $ACC6
659 vpsllq \$23, $B, $T0
660
661 .byte 0x67,0x67
662 vpmuludq %ymm15, $B, $OVERFLOW
663 vpsubq $T0, $ACC6, $ACC6
664
665 vpmuludq $AND_MASK, $Y, $T0
666 vpaddq $OVERFLOW, $ACC7, $ACC7
667 vpaddq $T0, $ACC0, $OVERFLOW
668 vpaddq $T0, $ACC1, $ACC0
669 .byte 0x67,0x67
670 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
671 vpaddq $T0, $ACC2, $ACC1
672 vpmuludq 32*3(%rax), $Y, $T0
673 vpaddq $OVERFLOW, $ACC0, $ACC0
674 vpaddq $T0, $ACC3, $ACC2
675 vmovdqa $ACC4, $ACC3
676 vpsllq \$18, $Y, $OVERFLOW
677 vmovdqa $ACC5, $ACC4
678 vpmuludq 32*7(%rax), $Y, $T0
679 vpaddq $OVERFLOW, $ACC6, $ACC5
680 vpaddq $T0, $ACC7, $ACC6
681 vpmuludq 32*8(%rax), $Y, $ACC7
682
683 dec $itr
684 jnz .Lavx2_mul_by1_x4_loop
685
686 ret
687 .size avx2_mul_by1_x4,.-avx2_mul_by1_x4
688
689 ################################################################################
690 # void avx2_sqr_x4(void* RESULTx4, void *Ax4); # %rcx points at a temporary vector
691 .type avx2_sqr_x4,\@abi-omnipotent
692 .align 32
693 avx2_sqr_x4:
694 lea .LAVX2_POLY(%rip), %rax
695
696 vmovdqa 32*7(%rax), %ymm14
697 vmovdqa 32*8(%rax), %ymm15
698
699 vmovdqa 32*0($a_ptr), $B
700 vmovdqa 32*1($a_ptr), $ACC1
701 vmovdqa 32*2($a_ptr), $ACC2
702 vmovdqa 32*3($a_ptr), $ACC3
703 vmovdqa 32*4($a_ptr), $ACC4
704 vmovdqa 32*5($a_ptr), $ACC5
705 vmovdqa 32*6($a_ptr), $ACC6
706 vmovdqa 32*7($a_ptr), $ACC7
707 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
708 vmovdqa 32*8($a_ptr), $ACC8
709 vpaddq $ACC2, $ACC2, $ACC2
710 vmovdqa $ACC1, 32*0(%rcx)
711 vpaddq $ACC3, $ACC3, $ACC3
712 vmovdqa $ACC2, 32*1(%rcx)
713 vpaddq $ACC4, $ACC4, $ACC4
714 vmovdqa $ACC3, 32*2(%rcx)
715 vpaddq $ACC5, $ACC5, $ACC5
716 vmovdqa $ACC4, 32*3(%rcx)
717 vpaddq $ACC6, $ACC6, $ACC6
718 vmovdqa $ACC5, 32*4(%rcx)
719 vpaddq $ACC7, $ACC7, $ACC7
720 vmovdqa $ACC6, 32*5(%rcx)
721 vpaddq $ACC8, $ACC8, $ACC8
722 vmovdqa $ACC7, 32*6(%rcx)
723 vmovdqa $ACC8, 32*7(%rcx)
724
725 #itr 1
726 vpmuludq $B, $B, $ACC0
727 vpmuludq $B, $ACC1, $ACC1
728 vpand $AND_MASK, $ACC0, $Y
729 vpmuludq $B, $ACC2, $ACC2
730 vpmuludq $B, $ACC3, $ACC3
731 vpmuludq $B, $ACC4, $ACC4
732 vpmuludq $B, $ACC5, $ACC5
733 vpmuludq $B, $ACC6, $ACC6
734 vpmuludq $AND_MASK, $Y, $T0
735 vpmuludq $B, $ACC7, $ACC7
736 vpmuludq $B, $ACC8, $ACC8
737 vmovdqa 32*1($a_ptr), $B
738
739 vpaddq $T0, $ACC0, $OVERFLOW
740 vpaddq $T0, $ACC1, $ACC0
741 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
742 vpaddq $T0, $ACC2, $ACC1
743 vpmuludq 32*3(%rax), $Y, $T0
744 vpaddq $OVERFLOW, $ACC0, $ACC0
745 vpaddq $T0, $ACC3, $ACC2
746 vmovdqa $ACC4, $ACC3
747 vpsllq \$18, $Y, $T0
748 vmovdqa $ACC5, $ACC4
749 vpmuludq %ymm14, $Y, $OVERFLOW
750 vpaddq $T0, $ACC6, $ACC5
751 vpmuludq %ymm15, $Y, $T0
752 vpaddq $OVERFLOW, $ACC7, $ACC6
753 vpaddq $T0, $ACC8, $ACC7
754
755 #itr 2
756 vpmuludq $B, $B, $OVERFLOW
757 vpand $AND_MASK, $ACC0, $Y
758 vpmuludq 32*1(%rcx), $B, $T0
759 vpaddq $OVERFLOW, $ACC1, $ACC1
760 vpmuludq 32*2(%rcx), $B, $OVERFLOW
761 vpaddq $T0, $ACC2, $ACC2
762 vpmuludq 32*3(%rcx), $B, $T0
763 vpaddq $OVERFLOW, $ACC3, $ACC3
764 vpmuludq 32*4(%rcx), $B, $OVERFLOW
765 vpaddq $T0, $ACC4, $ACC4
766 vpmuludq 32*5(%rcx), $B, $T0
767 vpaddq $OVERFLOW, $ACC5, $ACC5
768 vpmuludq 32*6(%rcx), $B, $OVERFLOW
769 vpaddq $T0, $ACC6, $ACC6
770
771 vpmuludq $AND_MASK, $Y, $T0
772 vpaddq $OVERFLOW, $ACC7, $ACC7
773 vpmuludq 32*7(%rcx), $B, $ACC8
774 vmovdqa 32*2($a_ptr), $B
775 vpaddq $T0, $ACC0, $OVERFLOW
776 vpaddq $T0, $ACC1, $ACC0
777 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
778 vpaddq $T0, $ACC2, $ACC1
779 vpmuludq 32*3(%rax), $Y, $T0
780 vpaddq $OVERFLOW, $ACC0, $ACC0
781 vpaddq $T0, $ACC3, $ACC2
782 vmovdqa $ACC4, $ACC3
783 vpsllq \$18, $Y, $T0
784 vmovdqa $ACC5, $ACC4
785 vpmuludq %ymm14, $Y, $OVERFLOW
786 vpaddq $T0, $ACC6, $ACC5
787 vpmuludq %ymm15, $Y, $T0
788 vpaddq $OVERFLOW, $ACC7, $ACC6
789 vpaddq $T0, $ACC8, $ACC7
790
791 #itr 3
792 vpmuludq $B, $B, $T0
793 vpand $AND_MASK, $ACC0, $Y
794 vpmuludq 32*2(%rcx), $B, $OVERFLOW
795 vpaddq $T0, $ACC2, $ACC2
796 vpmuludq 32*3(%rcx), $B, $T0
797 vpaddq $OVERFLOW, $ACC3, $ACC3
798 vpmuludq 32*4(%rcx), $B, $OVERFLOW
799 vpaddq $T0, $ACC4, $ACC4
800 vpmuludq 32*5(%rcx), $B, $T0
801 vpaddq $OVERFLOW, $ACC5, $ACC5
802 vpmuludq 32*6(%rcx), $B, $OVERFLOW
803 vpaddq $T0, $ACC6, $ACC6
804
805 vpmuludq $AND_MASK, $Y, $T0
806 vpaddq $OVERFLOW, $ACC7, $ACC7
807 vpmuludq 32*7(%rcx), $B, $ACC8
808 vmovdqa 32*3($a_ptr), $B
809 vpaddq $T0, $ACC0, $OVERFLOW
810 vpaddq $T0, $ACC1, $ACC0
811 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
812 vpaddq $T0, $ACC2, $ACC1
813 vpmuludq 32*3(%rax), $Y, $T0
814 vpaddq $OVERFLOW, $ACC0, $ACC0
815 vpaddq $T0, $ACC3, $ACC2
816 vmovdqa $ACC4, $ACC3
817 vpsllq \$18, $Y, $T0
818 vmovdqa $ACC5, $ACC4
819 vpmuludq %ymm14, $Y, $OVERFLOW
820 vpaddq $T0, $ACC6, $ACC5
821 vpmuludq %ymm15, $Y, $T0
822 vpand $AND_MASK, $ACC0, $Y
823 vpaddq $OVERFLOW, $ACC7, $ACC6
824 vpaddq $T0, $ACC8, $ACC7
825
826 #itr 4
827 vpmuludq $B, $B, $OVERFLOW
828 vpmuludq 32*3(%rcx), $B, $T0
829 vpaddq $OVERFLOW, $ACC3, $ACC3
830 vpmuludq 32*4(%rcx), $B, $OVERFLOW
831 vpaddq $T0, $ACC4, $ACC4
832 vpmuludq 32*5(%rcx), $B, $T0
833 vpaddq $OVERFLOW, $ACC5, $ACC5
834 vpmuludq 32*6(%rcx), $B, $OVERFLOW
835 vpaddq $T0, $ACC6, $ACC6
836
837 vpmuludq $AND_MASK, $Y, $T0
838 vpaddq $OVERFLOW, $ACC7, $ACC7
839 vpmuludq 32*7(%rcx), $B, $ACC8
840 vmovdqa 32*4($a_ptr), $B
841 vpaddq $T0, $ACC0, $OVERFLOW
842 vpaddq $T0, $ACC1, $ACC0
843 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
844 vpaddq $T0, $ACC2, $ACC1
845 vpmuludq 32*3(%rax), $Y, $T0
846 vpaddq $OVERFLOW, $ACC0, $ACC0
847 vpaddq $T0, $ACC3, $ACC2
848 vmovdqa $ACC4, $ACC3
849 vpsllq \$18, $Y, $T0
850 vmovdqa $ACC5, $ACC4
851 vpmuludq %ymm14, $Y, $OVERFLOW
852 vpaddq $T0, $ACC6, $ACC5
853 vpmuludq %ymm15, $Y, $T0
854 vpand $AND_MASK, $ACC0, $Y
855 vpaddq $OVERFLOW, $ACC7, $ACC6
856 vpaddq $T0, $ACC8, $ACC7
857
858 #itr 5
859 vpmuludq $B, $B, $T0
860 vpmuludq 32*4(%rcx), $B, $OVERFLOW
861 vpaddq $T0, $ACC4, $ACC4
862 vpmuludq 32*5(%rcx), $B, $T0
863 vpaddq $OVERFLOW, $ACC5, $ACC5
864 vpmuludq 32*6(%rcx), $B, $OVERFLOW
865 vpaddq $T0, $ACC6, $ACC6
866
867 vpmuludq $AND_MASK, $Y, $T0
868 vpaddq $OVERFLOW, $ACC7, $ACC7
869 vpmuludq 32*7(%rcx), $B, $ACC8
870 vmovdqa 32*5($a_ptr), $B
871 vpaddq $T0, $ACC0, $OVERFLOW
872 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
873 vpaddq $T0, $ACC1, $ACC0
874 vpaddq $T0, $ACC2, $ACC1
875 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
876 vpaddq $OVERFLOW, $ACC0, $ACC0
877 vpaddq $T0, $ACC3, $ACC2
878 vmovdqa $ACC4, $ACC3
879 vpsllq \$18, $Y, $T0
880 vmovdqa $ACC5, $ACC4
881 vpmuludq %ymm14, $Y, $OVERFLOW
882 vpaddq $T0, $ACC6, $ACC5
883 vpmuludq %ymm15, $Y, $T0
884 vpand $AND_MASK, $ACC0, $Y
885 vpaddq $OVERFLOW, $ACC7, $ACC6
886 vpaddq $T0, $ACC8, $ACC7
887
888 #itr 6
889 vpmuludq $B, $B, $OVERFLOW
890 vpmuludq 32*5(%rcx), $B, $T0
891 vpaddq $OVERFLOW, $ACC5, $ACC5
892 vpmuludq 32*6(%rcx), $B, $OVERFLOW
893 vpaddq $T0, $ACC6, $ACC6
894
895 vpmuludq $AND_MASK, $Y, $T0
896 vpaddq $OVERFLOW, $ACC7, $ACC7
897 vpmuludq 32*7(%rcx), $B, $ACC8
898 vmovdqa 32*6($a_ptr), $B
899 vpaddq $T0, $ACC0, $OVERFLOW
900 vpaddq $T0, $ACC1, $ACC0
901 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
902 vpaddq $T0, $ACC2, $ACC1
903 vpmuludq 32*3(%rax), $Y, $T0
904 vpaddq $OVERFLOW, $ACC0, $ACC0
905 vpaddq $T0, $ACC3, $ACC2
906 vmovdqa $ACC4, $ACC3
907 vpsllq \$18, $Y, $T0
908 vmovdqa $ACC5, $ACC4
909 vpmuludq %ymm14, $Y, $OVERFLOW
910 vpaddq $T0, $ACC6, $ACC5
911 vpmuludq %ymm15, $Y, $T0
912 vpand $AND_MASK, $ACC0, $Y
913 vpaddq $OVERFLOW, $ACC7, $ACC6
914 vpaddq $T0, $ACC8, $ACC7
915
916 #itr 7
917 vpmuludq $B, $B, $T0
918 vpmuludq 32*6(%rcx), $B, $OVERFLOW
919 vpaddq $T0, $ACC6, $ACC6
920
921 vpmuludq $AND_MASK, $Y, $T0
922 vpaddq $OVERFLOW, $ACC7, $ACC7
923 vpmuludq 32*7(%rcx), $B, $ACC8
924 vmovdqa 32*7($a_ptr), $B
925 vpaddq $T0, $ACC0, $OVERFLOW
926 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
927 vpaddq $T0, $ACC1, $ACC0
928 vpaddq $T0, $ACC2, $ACC1
929 vpmuludq 32*3(%rax), $Y, $T0
930 vpaddq $OVERFLOW, $ACC0, $ACC0
931 vpaddq $T0, $ACC3, $ACC2
932 vmovdqa $ACC4, $ACC3
933 vpsllq \$18, $Y, $T0
934 vmovdqa $ACC5, $ACC4
935 vpmuludq %ymm14, $Y, $OVERFLOW
936 vpaddq $T0, $ACC6, $ACC5
937 vpmuludq %ymm15, $Y, $T0
938 vpand $AND_MASK, $ACC0, $Y
939 vpaddq $OVERFLOW, $ACC7, $ACC6
940 vpaddq $T0, $ACC8, $ACC7
941
942 #itr 8
943 vpmuludq $B, $B, $OVERFLOW
944
945 vpmuludq $AND_MASK, $Y, $T0
946 vpaddq $OVERFLOW, $ACC7, $ACC7
947 vpmuludq 32*7(%rcx), $B, $ACC8
948 vmovdqa 32*8($a_ptr), $B
949 vpaddq $T0, $ACC0, $OVERFLOW
950 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
951 vpaddq $T0, $ACC1, $ACC0
952 vpaddq $T0, $ACC2, $ACC1
953 vpmuludq 32*3(%rax), $Y, $T0
954 vpaddq $OVERFLOW, $ACC0, $ACC0
955 vpaddq $T0, $ACC3, $ACC2
956 vmovdqa $ACC4, $ACC3
957 vpsllq \$18, $Y, $T0
958 vmovdqa $ACC5, $ACC4
959 vpmuludq %ymm14, $Y, $OVERFLOW
960 vpaddq $T0, $ACC6, $ACC5
961 vpmuludq %ymm15, $Y, $T0
962 vpand $AND_MASK, $ACC0, $Y
963 vpaddq $OVERFLOW, $ACC7, $ACC6
964 vpaddq $T0, $ACC8, $ACC7
965
966 #itr 9
967 vpmuludq $B, $B, $ACC8
968
969 vpmuludq $AND_MASK, $Y, $T0
970 vpaddq $T0, $ACC0, $OVERFLOW
971 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
972 vpaddq $T0, $ACC1, $ACC0
973 vpaddq $T0, $ACC2, $ACC1
974 vpmuludq 32*3(%rax), $Y, $T0
975 vpaddq $OVERFLOW, $ACC0, $ACC0
976 vpaddq $T0, $ACC3, $ACC2
977 vmovdqa $ACC4, $ACC3
978 vpsllq \$18, $Y, $T0
979 vmovdqa $ACC5, $ACC4
980 vpmuludq %ymm14, $Y, $OVERFLOW
981 vpaddq $T0, $ACC6, $ACC5
982 vpmuludq %ymm15, $Y, $T0
983 vpaddq $OVERFLOW, $ACC7, $ACC6
984 vpaddq $T0, $ACC8, $ACC7
985
986 vpxor $ACC8, $ACC8, $ACC8
987
988 ret
989 .size avx2_sqr_x4,.-avx2_sqr_x4
990
991 ################################################################################
992 # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
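# A and B are in redundant form, so a plain per-digit subtraction could go
# negative; adding .LAVX2_POLY_x8 (a representation of 8*P with every digit
# large) first keeps each digit of the difference non-negative without
# changing the value mod p.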
993 .type avx2_sub_x4,\@abi-omnipotent
994 .align 32
995 avx2_sub_x4:
996 vmovdqa 32*0($a_ptr), $ACC0
997 lea 160($a_ptr), $a_ptr
998 lea .LAVX2_POLY_x8+128(%rip), %rax
999 lea 128($b_ptr), $b_ptr
1000 vmovdqa 32*1-160($a_ptr), $ACC1
1001 vmovdqa 32*2-160($a_ptr), $ACC2
1002 vmovdqa 32*3-160($a_ptr), $ACC3
1003 vmovdqa 32*4-160($a_ptr), $ACC4
1004 vmovdqa 32*5-160($a_ptr), $ACC5
1005 vmovdqa 32*6-160($a_ptr), $ACC6
1006 vmovdqa 32*7-160($a_ptr), $ACC7
1007 vmovdqa 32*8-160($a_ptr), $ACC8
1008
1009 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1010 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1011 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1012 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1013 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1014 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1015 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1016 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1017 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1018
1019 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
1020 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
1021 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
1022 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
1023 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
1024 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
1025 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
1026 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
1027 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
1028
1029 ret
1030 .size avx2_sub_x4,.-avx2_sub_x4
1031
1032 .type avx2_select_n_store,\@abi-omnipotent
1033 .align 32
1034 avx2_select_n_store:
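	# Constant-time select: the two point-at-infinity masks prepared by the
	# callers (is input A all zero, is input B all zero) decide whether the
	# value just computed in the accumulators is kept, or replaced by the
	# vector at %rsi or %rdx respectively, without any data-dependent branch.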
1035 vmovdqa `8+32*9*8`(%rsp), $Y
1036 vpor `8+32*9*8+32`(%rsp), $Y, $Y
1037
1038 vpandn $ACC0, $Y, $ACC0
1039 vpandn $ACC1, $Y, $ACC1
1040 vpandn $ACC2, $Y, $ACC2
1041 vpandn $ACC3, $Y, $ACC3
1042 vpandn $ACC4, $Y, $ACC4
1043 vpandn $ACC5, $Y, $ACC5
1044 vpandn $ACC6, $Y, $ACC6
1045 vmovdqa `8+32*9*8+32`(%rsp), $B
1046 vpandn $ACC7, $Y, $ACC7
1047 vpandn `8+32*9*8`(%rsp), $B, $B
1048 vpandn $ACC8, $Y, $ACC8
1049
1050 vpand 32*0(%rsi), $B, $T0
1051 lea 160(%rsi), %rax
1052 vpand 32*1(%rsi), $B, $Y
1053 vpxor $T0, $ACC0, $ACC0
1054 vpand 32*2(%rsi), $B, $T0
1055 vpxor $Y, $ACC1, $ACC1
1056 vpand 32*3(%rsi), $B, $Y
1057 vpxor $T0, $ACC2, $ACC2
1058 vpand 32*4-160(%rax), $B, $T0
1059 vpxor $Y, $ACC3, $ACC3
1060 vpand 32*5-160(%rax), $B, $Y
1061 vpxor $T0, $ACC4, $ACC4
1062 vpand 32*6-160(%rax), $B, $T0
1063 vpxor $Y, $ACC5, $ACC5
1064 vpand 32*7-160(%rax), $B, $Y
1065 vpxor $T0, $ACC6, $ACC6
1066 vpand 32*8-160(%rax), $B, $T0
1067 vmovdqa `8+32*9*8+32`(%rsp), $B
1068 vpxor $Y, $ACC7, $ACC7
1069
1070 vpand 32*0(%rdx), $B, $Y
1071 lea 160(%rdx), %rax
1072 vpxor $T0, $ACC8, $ACC8
1073 vpand 32*1(%rdx), $B, $T0
1074 vpxor $Y, $ACC0, $ACC0
1075 vpand 32*2(%rdx), $B, $Y
1076 vpxor $T0, $ACC1, $ACC1
1077 vpand 32*3(%rdx), $B, $T0
1078 vpxor $Y, $ACC2, $ACC2
1079 vpand 32*4-160(%rax), $B, $Y
1080 vpxor $T0, $ACC3, $ACC3
1081 vpand 32*5-160(%rax), $B, $T0
1082 vpxor $Y, $ACC4, $ACC4
1083 vpand 32*6-160(%rax), $B, $Y
1084 vpxor $T0, $ACC5, $ACC5
1085 vpand 32*7-160(%rax), $B, $T0
1086 vpxor $Y, $ACC6, $ACC6
1087 vpand 32*8-160(%rax), $B, $Y
1088 vpxor $T0, $ACC7, $ACC7
1089 vpxor $Y, $ACC8, $ACC8
1090 `&STORE`
1091
1092 ret
1093 .size avx2_select_n_store,.-avx2_select_n_store
1094 ___
1095 $code.=<<___ if (0); # inlined
1096 ################################################################################
1097 # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1098 .type avx2_mul_by2_x4,\@abi-omnipotent
1099 .align 32
1100 avx2_mul_by2_x4:
1101 vmovdqa 32*0($a_ptr), $ACC0
1102 lea 160($a_ptr), %rax
1103 vmovdqa 32*1($a_ptr), $ACC1
1104 vmovdqa 32*2($a_ptr), $ACC2
1105 vmovdqa 32*3($a_ptr), $ACC3
1106 vmovdqa 32*4-160(%rax), $ACC4
1107 vmovdqa 32*5-160(%rax), $ACC5
1108 vmovdqa 32*6-160(%rax), $ACC6
1109 vmovdqa 32*7-160(%rax), $ACC7
1110 vmovdqa 32*8-160(%rax), $ACC8
1111
1112 vpaddq $ACC0, $ACC0, $ACC0
1113 vpaddq $ACC1, $ACC1, $ACC1
1114 vpaddq $ACC2, $ACC2, $ACC2
1115 vpaddq $ACC3, $ACC3, $ACC3
1116 vpaddq $ACC4, $ACC4, $ACC4
1117 vpaddq $ACC5, $ACC5, $ACC5
1118 vpaddq $ACC6, $ACC6, $ACC6
1119 vpaddq $ACC7, $ACC7, $ACC7
1120 vpaddq $ACC8, $ACC8, $ACC8
1121
1122 ret
1123 .size avx2_mul_by2_x4,.-avx2_mul_by2_x4
1124 ___
1125 my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1126 my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1127
1128 $code.=<<___;
1129 ################################################################################
1130 # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
1131 .globl ecp_nistz256_avx2_point_add_affine_x4
1132 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1133 .align 32
1134 ecp_nistz256_avx2_point_add_affine_x4:
1135 mov %rsp, %rax
1136 push %rbp
1137 vzeroupper
1138 ___
1139 $code.=<<___ if ($win64);
1140 lea -16*10(%rsp), %rsp
1141 vmovaps %xmm6, -8-16*10(%rax)
1142 vmovaps %xmm7, -8-16*9(%rax)
1143 vmovaps %xmm8, -8-16*8(%rax)
1144 vmovaps %xmm9, -8-16*7(%rax)
1145 vmovaps %xmm10, -8-16*6(%rax)
1146 vmovaps %xmm11, -8-16*5(%rax)
1147 vmovaps %xmm12, -8-16*4(%rax)
1148 vmovaps %xmm13, -8-16*3(%rax)
1149 vmovaps %xmm14, -8-16*2(%rax)
1150 vmovaps %xmm15, -8-16*1(%rax)
1151 ___
1152 $code.=<<___;
1153 lea -8(%rax), %rbp
1154
1155 # Result + 32*0 = Result.X
1156 # Result + 32*9 = Result.Y
1157 # Result + 32*18 = Result.Z
1158
1159 # A + 32*0 = A.X
1160 # A + 32*9 = A.Y
1161 # A + 32*18 = A.Z
1162
1163 # B + 32*0 = B.X
1164 # B + 32*9 = B.Y
1165
1166 sub \$`32*9*8+32*2+32*8`, %rsp
1167 and \$-64, %rsp
1168
1169 mov $r_ptr_in, $r_ptr
1170 mov $a_ptr_in, $a_ptr
1171 mov $b_ptr_in, $b_ptr
1172
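	# Build the "input is the point at infinity" masks: OR together the 18
	# vectors holding each input's X and Y coordinates and compare with zero
	# (vpcmpeqq); the two masks are parked above the scratch area and later
	# consumed by avx2_select_n_store.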
1173 vmovdqa 32*0($a_ptr_in), %ymm0
1174 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1175 vpxor %ymm1, %ymm1, %ymm1
1176 lea 256($a_ptr_in), %rax # size optimization
1177 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1178 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1179 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1180 vpor 32*4-256(%rax), %ymm0, %ymm0
1181 lea 256(%rax), %rcx # size optimization
1182 vpor 32*5-256(%rax), %ymm0, %ymm0
1183 vpor 32*6-256(%rax), %ymm0, %ymm0
1184 vpor 32*7-256(%rax), %ymm0, %ymm0
1185 vpor 32*8-256(%rax), %ymm0, %ymm0
1186 vpor 32*9-256(%rax), %ymm0, %ymm0
1187 vpor 32*10-256(%rax), %ymm0, %ymm0
1188 vpor 32*11-256(%rax), %ymm0, %ymm0
1189 vpor 32*12-512(%rcx), %ymm0, %ymm0
1190 vpor 32*13-512(%rcx), %ymm0, %ymm0
1191 vpor 32*14-512(%rcx), %ymm0, %ymm0
1192 vpor 32*15-512(%rcx), %ymm0, %ymm0
1193 vpor 32*16-512(%rcx), %ymm0, %ymm0
1194 vpor 32*17-512(%rcx), %ymm0, %ymm0
1195 vpcmpeqq %ymm1, %ymm0, %ymm0
1196 vmovdqa %ymm0, `32*9*8`(%rsp)
1197
1198 vpxor %ymm1, %ymm1, %ymm1
1199 vmovdqa 32*0($b_ptr), %ymm0
1200 lea 256($b_ptr), %rax # size optimization
1201 vpor 32*1($b_ptr), %ymm0, %ymm0
1202 vpor 32*2($b_ptr), %ymm0, %ymm0
1203 vpor 32*3($b_ptr), %ymm0, %ymm0
1204 vpor 32*4-256(%rax), %ymm0, %ymm0
1205 lea 256(%rax), %rcx # size optimization
1206 vpor 32*5-256(%rax), %ymm0, %ymm0
1207 vpor 32*6-256(%rax), %ymm0, %ymm0
1208 vpor 32*7-256(%rax), %ymm0, %ymm0
1209 vpor 32*8-256(%rax), %ymm0, %ymm0
1210 vpor 32*9-256(%rax), %ymm0, %ymm0
1211 vpor 32*10-256(%rax), %ymm0, %ymm0
1212 vpor 32*11-256(%rax), %ymm0, %ymm0
1213 vpor 32*12-512(%rcx), %ymm0, %ymm0
1214 vpor 32*13-512(%rcx), %ymm0, %ymm0
1215 vpor 32*14-512(%rcx), %ymm0, %ymm0
1216 vpor 32*15-512(%rcx), %ymm0, %ymm0
1217 vpor 32*16-512(%rcx), %ymm0, %ymm0
1218 vpor 32*17-512(%rcx), %ymm0, %ymm0
1219 vpcmpeqq %ymm1, %ymm0, %ymm0
1220 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1221
1222 # Z1^2 = Z1*Z1
1223 lea `32*9*2`($a_ptr), %rsi
1224 lea `32*9*2`(%rsp), %rdi
1225 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1226 call avx2_sqr_x4
1227 call avx2_normalize_n_store
1228
1229 # U2 = X2*Z1^2
1230 lea `32*9*0`($b_ptr), %rsi
1231 lea `32*9*2`(%rsp), %rdx
1232 lea `32*9*0`(%rsp), %rdi
1233 call avx2_mul_x4
1234 #call avx2_normalize
1235 `&STORE`
1236
1237 # S2 = Z1*Z1^2 = Z1^3
1238 lea `32*9*2`($a_ptr), %rsi
1239 lea `32*9*2`(%rsp), %rdx
1240 lea `32*9*1`(%rsp), %rdi
1241 call avx2_mul_x4
1242 call avx2_normalize_n_store
1243
1244 # S2 = S2*Y2 = Y2*Z1^3
1245 lea `32*9*1`($b_ptr), %rsi
1246 lea `32*9*1`(%rsp), %rdx
1247 lea `32*9*1`(%rsp), %rdi
1248 call avx2_mul_x4
1249 call avx2_normalize_n_store
1250
1251 # H = U2 - U1 = U2 - X1
1252 lea `32*9*0`(%rsp), %rsi
1253 lea `32*9*0`($a_ptr), %rdx
1254 lea `32*9*3`(%rsp), %rdi
1255 call avx2_sub_x4
1256 call avx2_normalize_n_store
1257
1258 # R = S2 - S1 = S2 - Y1
1259 lea `32*9*1`(%rsp), %rsi
1260 lea `32*9*1`($a_ptr), %rdx
1261 lea `32*9*4`(%rsp), %rdi
1262 call avx2_sub_x4
1263 call avx2_normalize_n_store
1264
1265 # Z3 = H*Z1*Z2
1266 lea `32*9*3`(%rsp), %rsi
1267 lea `32*9*2`($a_ptr), %rdx
1268 lea `32*9*2`($r_ptr), %rdi
1269 call avx2_mul_x4
1270 call avx2_normalize
1271
1272 lea .LONE(%rip), %rsi
1273 lea `32*9*2`($a_ptr), %rdx
1274 call avx2_select_n_store
1275
1276 # R^2 = R^2
1277 lea `32*9*4`(%rsp), %rsi
1278 lea `32*9*6`(%rsp), %rdi
1279 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1280 call avx2_sqr_x4
1281 call avx2_normalize_n_store
1282
1283 # H^2 = H^2
1284 lea `32*9*3`(%rsp), %rsi
1285 lea `32*9*5`(%rsp), %rdi
1286 call avx2_sqr_x4
1287 call avx2_normalize_n_store
1288
1289 # H^3 = H^2*H
1290 lea `32*9*3`(%rsp), %rsi
1291 lea `32*9*5`(%rsp), %rdx
1292 lea `32*9*7`(%rsp), %rdi
1293 call avx2_mul_x4
1294 call avx2_normalize_n_store
1295
1296 # U2 = U1*H^2
1297 lea `32*9*0`($a_ptr), %rsi
1298 lea `32*9*5`(%rsp), %rdx
1299 lea `32*9*0`(%rsp), %rdi
1300 call avx2_mul_x4
1301 #call avx2_normalize
1302 `&STORE`
1303
1304 # Hsqr = U2*2
1305 #lea 32*9*0(%rsp), %rsi
1306 #lea 32*9*5(%rsp), %rdi
1307 #call avx2_mul_by2_x4
1308
1309 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1310 lea `32*9*5`(%rsp), %rdi
1311 vpaddq $ACC1, $ACC1, $ACC1
1312 vpaddq $ACC2, $ACC2, $ACC2
1313 vpaddq $ACC3, $ACC3, $ACC3
1314 vpaddq $ACC4, $ACC4, $ACC4
1315 vpaddq $ACC5, $ACC5, $ACC5
1316 vpaddq $ACC6, $ACC6, $ACC6
1317 vpaddq $ACC7, $ACC7, $ACC7
1318 vpaddq $ACC8, $ACC8, $ACC8
1319 call avx2_normalize_n_store
1320
1321 # X3 = R^2 - H^3
1322 #lea 32*9*6(%rsp), %rsi
1323 #lea 32*9*7(%rsp), %rdx
1324 #lea 32*9*5(%rsp), %rcx
1325 #lea 32*9*0($r_ptr), %rdi
1326 #call avx2_sub_x4
1327 #NORMALIZE
1328 #STORE
1329
1330 # X3 = X3 - U2*2
1331 #lea 32*9*0($r_ptr), %rsi
1332 #lea 32*9*0($r_ptr), %rdi
1333 #call avx2_sub_x4
1334 #NORMALIZE
1335 #STORE
1336
1337 lea `32*9*6+128`(%rsp), %rsi
1338 lea .LAVX2_POLY_x2+128(%rip), %rax
1339 lea `32*9*7+128`(%rsp), %rdx
1340 lea `32*9*5+128`(%rsp), %rcx
1341 lea `32*9*0`($r_ptr), %rdi
1342
1343 vmovdqa 32*0-128(%rsi), $ACC0
1344 vmovdqa 32*1-128(%rsi), $ACC1
1345 vmovdqa 32*2-128(%rsi), $ACC2
1346 vmovdqa 32*3-128(%rsi), $ACC3
1347 vmovdqa 32*4-128(%rsi), $ACC4
1348 vmovdqa 32*5-128(%rsi), $ACC5
1349 vmovdqa 32*6-128(%rsi), $ACC6
1350 vmovdqa 32*7-128(%rsi), $ACC7
1351 vmovdqa 32*8-128(%rsi), $ACC8
1352
1353 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1354 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1355 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1356 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1357 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1358 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1359 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1360 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1361 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1362
1363 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1364 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1365 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1366 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1367 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1368 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1369 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1370 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1371 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1372
1373 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1374 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1375 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1376 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1377 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1378 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1379 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1380 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1381 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1382 call avx2_normalize
1383
1384 lea 32*0($b_ptr), %rsi
1385 lea 32*0($a_ptr), %rdx
1386 call avx2_select_n_store
1387
1388 # H = U2 - X3
1389 lea `32*9*0`(%rsp), %rsi
1390 lea `32*9*0`($r_ptr), %rdx
1391 lea `32*9*3`(%rsp), %rdi
1392 call avx2_sub_x4
1393 call avx2_normalize_n_store
1394
1395 # H = H*R
1396 lea `32*9*3`(%rsp), %rsi
1397 lea `32*9*4`(%rsp), %rdx
1398 lea `32*9*3`(%rsp), %rdi
1399 call avx2_mul_x4
1400 call avx2_normalize_n_store
1401
1402 # S2 = S1 * H^3
1403 lea `32*9*7`(%rsp), %rsi
1404 lea `32*9*1`($a_ptr), %rdx
1405 lea `32*9*1`(%rsp), %rdi
1406 call avx2_mul_x4
1407 call avx2_normalize_n_store
1408
1409 # Y3 = H - S2
1410 lea `32*9*3`(%rsp), %rsi
1411 lea `32*9*1`(%rsp), %rdx
1412 lea `32*9*1`($r_ptr), %rdi
1413 call avx2_sub_x4
1414 call avx2_normalize
1415
1416 lea 32*9($b_ptr), %rsi
1417 lea 32*9($a_ptr), %rdx
1418 call avx2_select_n_store
1419
1420 #lea 32*9*0($r_ptr), %rsi
1421 #lea 32*9*0($r_ptr), %rdi
1422 #call avx2_mul_by1_x4
1423 #NORMALIZE
1424 #STORE
1425
1426 lea `32*9*1`($r_ptr), %rsi
1427 lea `32*9*1`($r_ptr), %rdi
1428 call avx2_mul_by1_x4
1429 call avx2_normalize_n_store
1430
1431 vzeroupper
1432 ___
1433 $code.=<<___ if ($win64);
1434 movaps %xmm6, -16*10(%rbp)
1435 movaps %xmm7, -16*9(%rbp)
1436 movaps %xmm8, -16*8(%rbp)
1437 movaps %xmm9, -16*7(%rbp)
1438 movaps %xmm10, -16*6(%rbp)
1439 movaps %xmm11, -16*5(%rbp)
1440 movaps %xmm12, -16*4(%rbp)
1441 movaps %xmm13, -16*3(%rbp)
1442 movaps %xmm14, -16*2(%rbp)
1443 movaps %xmm15, -16*1(%rbp)
1444 ___
1445 $code.=<<___;
1446 mov %rbp, %rsp
1447 pop %rbp
1448 ret
1449 .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1450
1451 ################################################################################
1452 # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
1453 .globl ecp_nistz256_avx2_point_add_affines_x4
1454 .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1455 .align 32
1456 ecp_nistz256_avx2_point_add_affines_x4:
1457 mov %rsp, %rax
1458 push %rbp
1459 vzeroupper
1460 ___
1461 $code.=<<___ if ($win64);
1462 lea -16*10(%rsp), %rsp
1463 vmovaps %xmm6, -8-16*10(%rax)
1464 vmovaps %xmm7, -8-16*9(%rax)
1465 vmovaps %xmm8, -8-16*8(%rax)
1466 vmovaps %xmm9, -8-16*7(%rax)
1467 vmovaps %xmm10, -8-16*6(%rax)
1468 vmovaps %xmm11, -8-16*5(%rax)
1469 vmovaps %xmm12, -8-16*4(%rax)
1470 vmovaps %xmm13, -8-16*3(%rax)
1471 vmovaps %xmm14, -8-16*2(%rax)
1472 vmovaps %xmm15, -8-16*1(%rax)
1473 ___
1474 $code.=<<___;
1475 lea -8(%rax), %rbp
1476
1477 # Result + 32*0 = Result.X
1478 # Result + 32*9 = Result.Y
1479 # Result + 32*18 = Result.Z
1480
1481 # A + 32*0 = A.X
1482 # A + 32*9 = A.Y
1483
1484 # B + 32*0 = B.X
1485 # B + 32*9 = B.Y
1486
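	# Both inputs are affine here (no Z component in the layout above), so
	# Z1 = Z2 = 1: U1 = X1, S1 = Y1, and Z3 = H, which is why
	# avx2_mul_by1_x4 is used for Z3 below.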
1487 sub \$`32*9*8+32*2+32*8`, %rsp
1488 and \$-64, %rsp
1489
1490 mov $r_ptr_in, $r_ptr
1491 mov $a_ptr_in, $a_ptr
1492 mov $b_ptr_in, $b_ptr
1493
1494 vmovdqa 32*0($a_ptr_in), %ymm0
1495 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1496 vpxor %ymm1, %ymm1, %ymm1
1497 lea 256($a_ptr_in), %rax # size optimization
1498 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1499 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1500 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1501 vpor 32*4-256(%rax), %ymm0, %ymm0
1502 lea 256(%rax), %rcx # size optimization
1503 vpor 32*5-256(%rax), %ymm0, %ymm0
1504 vpor 32*6-256(%rax), %ymm0, %ymm0
1505 vpor 32*7-256(%rax), %ymm0, %ymm0
1506 vpor 32*8-256(%rax), %ymm0, %ymm0
1507 vpor 32*9-256(%rax), %ymm0, %ymm0
1508 vpor 32*10-256(%rax), %ymm0, %ymm0
1509 vpor 32*11-256(%rax), %ymm0, %ymm0
1510 vpor 32*12-512(%rcx), %ymm0, %ymm0
1511 vpor 32*13-512(%rcx), %ymm0, %ymm0
1512 vpor 32*14-512(%rcx), %ymm0, %ymm0
1513 vpor 32*15-512(%rcx), %ymm0, %ymm0
1514 vpor 32*16-512(%rcx), %ymm0, %ymm0
1515 vpor 32*17-512(%rcx), %ymm0, %ymm0
1516 vpcmpeqq %ymm1, %ymm0, %ymm0
1517 vmovdqa %ymm0, `32*9*8`(%rsp)
1518
1519 vpxor %ymm1, %ymm1, %ymm1
1520 vmovdqa 32*0($b_ptr), %ymm0
1521 lea 256($b_ptr), %rax # size optimization
1522 vpor 32*1($b_ptr), %ymm0, %ymm0
1523 vpor 32*2($b_ptr), %ymm0, %ymm0
1524 vpor 32*3($b_ptr), %ymm0, %ymm0
1525 vpor 32*4-256(%rax), %ymm0, %ymm0
1526 lea 256(%rax), %rcx # size optimization
1527 vpor 32*5-256(%rax), %ymm0, %ymm0
1528 vpor 32*6-256(%rax), %ymm0, %ymm0
1529 vpor 32*7-256(%rax), %ymm0, %ymm0
1530 vpor 32*8-256(%rax), %ymm0, %ymm0
1531 vpor 32*9-256(%rax), %ymm0, %ymm0
1532 vpor 32*10-256(%rax), %ymm0, %ymm0
1533 vpor 32*11-256(%rax), %ymm0, %ymm0
1534 vpor 32*12-512(%rcx), %ymm0, %ymm0
1535 vpor 32*13-512(%rcx), %ymm0, %ymm0
1536 vpor 32*14-512(%rcx), %ymm0, %ymm0
1537 vpor 32*15-512(%rcx), %ymm0, %ymm0
1538 vpor 32*16-512(%rcx), %ymm0, %ymm0
1539 vpor 32*17-512(%rcx), %ymm0, %ymm0
1540 vpcmpeqq %ymm1, %ymm0, %ymm0
1541 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1542
1543 # H = U2 - U1 = X2 - X1
1544 lea `32*9*0`($b_ptr), %rsi
1545 lea `32*9*0`($a_ptr), %rdx
1546 lea `32*9*3`(%rsp), %rdi
1547 call avx2_sub_x4
1548 call avx2_normalize_n_store
1549
1550 # R = S2 - S1 = Y2 - Y1
1551 lea `32*9*1`($b_ptr), %rsi
1552 lea `32*9*1`($a_ptr), %rdx
1553 lea `32*9*4`(%rsp), %rdi
1554 call avx2_sub_x4
1555 call avx2_normalize_n_store
1556
1557 # Z3 = H*Z1*Z2 = H
1558 lea `32*9*3`(%rsp), %rsi
1559 lea `32*9*2`($r_ptr), %rdi
1560 call avx2_mul_by1_x4
1561 call avx2_normalize
1562
1563 vmovdqa `32*9*8`(%rsp), $B
1564 vpor `32*9*8+32`(%rsp), $B, $B
1565
1566 vpandn $ACC0, $B, $ACC0
1567 lea .LONE+128(%rip), %rax
1568 vpandn $ACC1, $B, $ACC1
1569 vpandn $ACC2, $B, $ACC2
1570 vpandn $ACC3, $B, $ACC3
1571 vpandn $ACC4, $B, $ACC4
1572 vpandn $ACC5, $B, $ACC5
1573 vpandn $ACC6, $B, $ACC6
1574 vpandn $ACC7, $B, $ACC7
1575
1576 vpand 32*0-128(%rax), $B, $T0
1577 vpandn $ACC8, $B, $ACC8
1578 vpand 32*1-128(%rax), $B, $Y
1579 vpxor $T0, $ACC0, $ACC0
1580 vpand 32*2-128(%rax), $B, $T0
1581 vpxor $Y, $ACC1, $ACC1
1582 vpand 32*3-128(%rax), $B, $Y
1583 vpxor $T0, $ACC2, $ACC2
1584 vpand 32*4-128(%rax), $B, $T0
1585 vpxor $Y, $ACC3, $ACC3
1586 vpand 32*5-128(%rax), $B, $Y
1587 vpxor $T0, $ACC4, $ACC4
1588 vpand 32*6-128(%rax), $B, $T0
1589 vpxor $Y, $ACC5, $ACC5
1590 vpand 32*7-128(%rax), $B, $Y
1591 vpxor $T0, $ACC6, $ACC6
1592 vpand 32*8-128(%rax), $B, $T0
1593 vpxor $Y, $ACC7, $ACC7
1594 vpxor $T0, $ACC8, $ACC8
1595 `&STORE`
1596
1597 # R^2 = R^2
1598 lea `32*9*4`(%rsp), %rsi
1599 lea `32*9*6`(%rsp), %rdi
1600 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1601 call avx2_sqr_x4
1602 call avx2_normalize_n_store
1603
1604 # H^2 = H^2
1605 lea `32*9*3`(%rsp), %rsi
1606 lea `32*9*5`(%rsp), %rdi
1607 call avx2_sqr_x4
1608 call avx2_normalize_n_store
1609
1610 # H^3 = H^2*H
1611 lea `32*9*3`(%rsp), %rsi
1612 lea `32*9*5`(%rsp), %rdx
1613 lea `32*9*7`(%rsp), %rdi
1614 call avx2_mul_x4
1615 call avx2_normalize_n_store
1616
1617 # U2 = U1*H^2
1618 lea `32*9*0`($a_ptr), %rsi
1619 lea `32*9*5`(%rsp), %rdx
1620 lea `32*9*0`(%rsp), %rdi
1621 call avx2_mul_x4
1622 #call avx2_normalize
1623 `&STORE`
1624
1625 # Hsqr = U2*2
1626 #lea 32*9*0(%rsp), %rsi
1627 #lea 32*9*5(%rsp), %rdi
1628 #call avx2_mul_by2_x4
1629
1630 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1631 lea `32*9*5`(%rsp), %rdi
1632 vpaddq $ACC1, $ACC1, $ACC1
1633 vpaddq $ACC2, $ACC2, $ACC2
1634 vpaddq $ACC3, $ACC3, $ACC3
1635 vpaddq $ACC4, $ACC4, $ACC4
1636 vpaddq $ACC5, $ACC5, $ACC5
1637 vpaddq $ACC6, $ACC6, $ACC6
1638 vpaddq $ACC7, $ACC7, $ACC7
1639 vpaddq $ACC8, $ACC8, $ACC8
1640 call avx2_normalize_n_store
1641
1642 # X3 = R^2 - H^3
1643 #lea 32*9*6(%rsp), %rsi
1644 #lea 32*9*7(%rsp), %rdx
1645 #lea 32*9*5(%rsp), %rcx
1646 #lea 32*9*0($r_ptr), %rdi
1647 #call avx2_sub_x4
1648 #NORMALIZE
1649 #STORE
1650
1651 # X3 = X3 - U2*2
1652 #lea 32*9*0($r_ptr), %rsi
1653 #lea 32*9*0($r_ptr), %rdi
1654 #call avx2_sub_x4
1655 #NORMALIZE
1656 #STORE
1657
1658 lea `32*9*6+128`(%rsp), %rsi
1659 lea .LAVX2_POLY_x2+128(%rip), %rax
1660 lea `32*9*7+128`(%rsp), %rdx
1661 lea `32*9*5+128`(%rsp), %rcx
1662 lea `32*9*0`($r_ptr), %rdi
1663
1664 vmovdqa 32*0-128(%rsi), $ACC0
1665 vmovdqa 32*1-128(%rsi), $ACC1
1666 vmovdqa 32*2-128(%rsi), $ACC2
1667 vmovdqa 32*3-128(%rsi), $ACC3
1668 vmovdqa 32*4-128(%rsi), $ACC4
1669 vmovdqa 32*5-128(%rsi), $ACC5
1670 vmovdqa 32*6-128(%rsi), $ACC6
1671 vmovdqa 32*7-128(%rsi), $ACC7
1672 vmovdqa 32*8-128(%rsi), $ACC8
1673
1674 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1675 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1676 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1677 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1678 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1679 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1680 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1681 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1682 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1683
1684 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1685 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1686 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1687 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1688 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1689 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1690 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1691 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1692 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1693
1694 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1695 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1696 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1697 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1698 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1699 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1700 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1701 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1702 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1703 call avx2_normalize
1704
1705 lea 32*0($b_ptr), %rsi
1706 lea 32*0($a_ptr), %rdx
1707 call avx2_select_n_store
1708
1709 # H = U2 - X3
1710 lea `32*9*0`(%rsp), %rsi
1711 lea `32*9*0`($r_ptr), %rdx
1712 lea `32*9*3`(%rsp), %rdi
1713 call avx2_sub_x4
1714 call avx2_normalize_n_store
1715
1716 # H = H*R
1717 lea `32*9*3`(%rsp), %rsi
1718 lea `32*9*4`(%rsp), %rdx
1719 lea `32*9*3`(%rsp), %rdi
1720 call avx2_mul_x4
1721 call avx2_normalize_n_store
1722
1723 # S2 = S1 * H^3
1724 lea `32*9*7`(%rsp), %rsi
1725 lea `32*9*1`($a_ptr), %rdx
1726 lea `32*9*1`(%rsp), %rdi
1727 call avx2_mul_x4
1728 call avx2_normalize_n_store
1729
1730 # Y3 = H - S2
1731 lea `32*9*3`(%rsp), %rsi
1732 lea `32*9*1`(%rsp), %rdx
1733 lea `32*9*1`($r_ptr), %rdi
1734 call avx2_sub_x4
1735 call avx2_normalize
1736
1737 lea 32*9($b_ptr), %rsi
1738 lea 32*9($a_ptr), %rdx
1739 call avx2_select_n_store
1740
1741 #lea 32*9*0($r_ptr), %rsi
1742 #lea 32*9*0($r_ptr), %rdi
1743 #call avx2_mul_by1_x4
1744 #NORMALIZE
1745 #STORE
1746
1747 lea `32*9*1`($r_ptr), %rsi
1748 lea `32*9*1`($r_ptr), %rdi
1749 call avx2_mul_by1_x4
1750 call avx2_normalize_n_store
1751
1752 vzeroupper
1753 ___
1754 $code.=<<___ if ($win64);
1755 movaps %xmm6, -16*10(%rbp)
1756 movaps %xmm7, -16*9(%rbp)
1757 movaps %xmm8, -16*8(%rbp)
1758 movaps %xmm9, -16*7(%rbp)
1759 movaps %xmm10, -16*6(%rbp)
1760 movaps %xmm11, -16*5(%rbp)
1761 movaps %xmm12, -16*4(%rbp)
1762 movaps %xmm13, -16*3(%rbp)
1763 movaps %xmm14, -16*2(%rbp)
1764 movaps %xmm15, -16*1(%rbp)
1765 ___
1766 $code.=<<___;
1767 mov %rbp, %rsp
1768 pop %rbp
1769 ret
1770 .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1771
1772 ################################################################################
1773 # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
1774 .globl ecp_nistz256_avx2_to_mont
1775 .type ecp_nistz256_avx2_to_mont,\@function,2
1776 .align 32
1777 ecp_nistz256_avx2_to_mont:
1778 vzeroupper
1779 ___
1780 $code.=<<___ if ($win64);
1781 lea -8-16*10(%rsp), %rsp
1782 vmovaps %xmm6, -8-16*10(%rax)
1783 vmovaps %xmm7, -8-16*9(%rax)
1784 vmovaps %xmm8, -8-16*8(%rax)
1785 vmovaps %xmm9, -8-16*7(%rax)
1786 vmovaps %xmm10, -8-16*6(%rax)
1787 vmovaps %xmm11, -8-16*5(%rax)
1788 vmovaps %xmm12, -8-16*4(%rax)
1789 vmovaps %xmm13, -8-16*3(%rax)
1790 vmovaps %xmm14, -8-16*2(%rax)
1791 vmovaps %xmm15, -8-16*1(%rax)
1792 ___
1793 $code.=<<___;
1794 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1795 lea .LTO_MONT_AVX2(%rip), %rdx
1796 call avx2_mul_x4
1797 call avx2_normalize_n_store
1798
1799 vzeroupper
1800 ___
1801 $code.=<<___ if ($win64);
1802 movaps 16*0(%rsp), %xmm6
1803 movaps 16*1(%rsp), %xmm7
1804 movaps 16*2(%rsp), %xmm8
1805 movaps 16*3(%rsp), %xmm9
1806 movaps 16*4(%rsp), %xmm10
1807 movaps 16*5(%rsp), %xmm11
1808 movaps 16*6(%rsp), %xmm12
1809 movaps 16*7(%rsp), %xmm13
1810 movaps 16*8(%rsp), %xmm14
1811 movaps 16*9(%rsp), %xmm15
1812 lea 8+16*10(%rsp), %rsp
1813 ___
1814 $code.=<<___;
1815 ret
1816 .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1817
1818 ################################################################################
1819 # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
1820 .globl ecp_nistz256_avx2_from_mont
1821 .type ecp_nistz256_avx2_from_mont,\@function,2
1822 .align 32
1823 ecp_nistz256_avx2_from_mont:
1824 vzeroupper
1825 ___
1826 $code.=<<___ if ($win64);
1827 lea -8-16*10(%rsp), %rsp
1828 vmovaps %xmm6, -8-16*10(%rax)
1829 vmovaps %xmm7, -8-16*9(%rax)
1830 vmovaps %xmm8, -8-16*8(%rax)
1831 vmovaps %xmm9, -8-16*7(%rax)
1832 vmovaps %xmm10, -8-16*6(%rax)
1833 vmovaps %xmm11, -8-16*5(%rax)
1834 vmovaps %xmm12, -8-16*4(%rax)
1835 vmovaps %xmm13, -8-16*3(%rax)
1836 vmovaps %xmm14, -8-16*2(%rax)
1837 vmovaps %xmm15, -8-16*1(%rax)
1838 ___
1839 $code.=<<___;
1840 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1841 lea .LFROM_MONT_AVX2(%rip), %rdx
1842 call avx2_mul_x4
1843 call avx2_normalize_n_store
1844
1845 vzeroupper
1846 ___
1847 $code.=<<___ if ($win64);
1848 movaps 16*0(%rsp), %xmm6
1849 movaps 16*1(%rsp), %xmm7
1850 movaps 16*2(%rsp), %xmm8
1851 movaps 16*3(%rsp), %xmm9
1852 movaps 16*4(%rsp), %xmm10
1853 movaps 16*5(%rsp), %xmm11
1854 movaps 16*6(%rsp), %xmm12
1855 movaps 16*7(%rsp), %xmm13
1856 movaps 16*8(%rsp), %xmm14
1857 movaps 16*9(%rsp), %xmm15
1858 lea 8+16*10(%rsp), %rsp
1859 ___
1860 $code.=<<___;
1861 ret
1862 .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
1863
1864 ################################################################################
1865 # void ecp_nistz256_avx2_set1(void* RESULTx4);
1866 .globl ecp_nistz256_avx2_set1
1867 .type ecp_nistz256_avx2_set1,\@function,1
1868 .align 32
1869 ecp_nistz256_avx2_set1:
1870 lea .LONE+128(%rip), %rax
1871 lea 128(%rdi), %rdi
1872 vzeroupper
1873 vmovdqa 32*0-128(%rax), %ymm0
1874 vmovdqa 32*1-128(%rax), %ymm1
1875 vmovdqa 32*2-128(%rax), %ymm2
1876 vmovdqa 32*3-128(%rax), %ymm3
1877 vmovdqa 32*4-128(%rax), %ymm4
1878 vmovdqa 32*5-128(%rax), %ymm5
1879 vmovdqa %ymm0, 32*0-128(%rdi)
1880 vmovdqa 32*6-128(%rax), %ymm0
1881 vmovdqa %ymm1, 32*1-128(%rdi)
1882 vmovdqa 32*7-128(%rax), %ymm1
1883 vmovdqa %ymm2, 32*2-128(%rdi)
1884 vmovdqa 32*8-128(%rax), %ymm2
1885 vmovdqa %ymm3, 32*3-128(%rdi)
1886 vmovdqa %ymm4, 32*4-128(%rdi)
1887 vmovdqa %ymm5, 32*5-128(%rdi)
1888 vmovdqa %ymm0, 32*6-128(%rdi)
1889 vmovdqa %ymm1, 32*7-128(%rdi)
1890 vmovdqa %ymm2, 32*8-128(%rdi)
1891
1892 vzeroupper
1893 ret
1894 .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
1895 ___
1896 }
1897 {
1898 ################################################################################
1899 # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
1900 # int index0, int index1, int index2, int index3);
1901 ################################################################################
1902
1903 my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
1904 my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
1905 my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
1906 my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
1907
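# Constant-time gather: each index is broadcast, then the loop walks all 64
# table entries, comparing a running counter (seeded and incremented from
# .LIntOne) against every index with vpcmpeqd and masking the loads with the
# result, so the memory access pattern does not depend on the secret indices;
# an index of 0 never matches and therefore returns (0,0), the point at
# infinity.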
1908 $code.=<<___;
1909 .globl ecp_nistz256_avx2_multi_gather_w7
1910 .type ecp_nistz256_avx2_multi_gather_w7,\@function,6
1911 .align 32
1912 ecp_nistz256_avx2_multi_gather_w7:
1913 vzeroupper
1914 ___
1915 $code.=<<___ if ($win64);
1916 lea -8-16*10(%rsp), %rsp
1917 vmovaps %xmm6, -8-16*10(%rax)
1918 vmovaps %xmm7, -8-16*9(%rax)
1919 vmovaps %xmm8, -8-16*8(%rax)
1920 vmovaps %xmm9, -8-16*7(%rax)
1921 vmovaps %xmm10, -8-16*6(%rax)
1922 vmovaps %xmm11, -8-16*5(%rax)
1923 vmovaps %xmm12, -8-16*4(%rax)
1924 vmovaps %xmm13, -8-16*3(%rax)
1925 vmovaps %xmm14, -8-16*2(%rax)
1926 vmovaps %xmm15, -8-16*1(%rax)
1927 ___
1928 $code.=<<___;
1929 lea .LIntOne(%rip), %rax
1930
1931 vmovd $index0, %xmm0
1932 vmovd $index1, %xmm1
1933 vmovd $index2, %xmm2
1934 vmovd $index3, %xmm3
1935
1936 vpxor $R0a, $R0a, $R0a
1937 vpxor $R0b, $R0b, $R0b
1938 vpxor $R1a, $R1a, $R1a
1939 vpxor $R1b, $R1b, $R1b
1940 vpxor $R2a, $R2a, $R2a
1941 vpxor $R2b, $R2b, $R2b
1942 vpxor $R3a, $R3a, $R3a
1943 vpxor $R3b, $R3b, $R3b
1944 vmovdqa (%rax), $M0
1945
1946 vpermd $INDEX0, $R0a, $INDEX0
1947 vpermd $INDEX1, $R0a, $INDEX1
1948 vpermd $INDEX2, $R0a, $INDEX2
1949 vpermd $INDEX3, $R0a, $INDEX3
1950
1951 mov \$64, %ecx
1952 lea 112($val), $val # size optimization
1953 jmp .Lmulti_select_loop_avx2
1954
1955 # INDEX=0, corresponds to the point at infty (0,0)
1956 .align 32
1957 .Lmulti_select_loop_avx2:
1958 vpcmpeqd $INDEX0, $M0, $TMP0
1959
1960 vmovdqa `32*0+32*64*2*0`($in_t), $T0
1961 vmovdqa `32*1+32*64*2*0`($in_t), $T1
1962 vpand $TMP0, $T0, $T0
1963 vpand $TMP0, $T1, $T1
1964 vpxor $T0, $R0a, $R0a
1965 vpxor $T1, $R0b, $R0b
1966
1967 vpcmpeqd $INDEX1, $M0, $TMP0
1968
1969 vmovdqa `32*0+32*64*2*1`($in_t), $T0
1970 vmovdqa `32*1+32*64*2*1`($in_t), $T1
1971 vpand $TMP0, $T0, $T0
1972 vpand $TMP0, $T1, $T1
1973 vpxor $T0, $R1a, $R1a
1974 vpxor $T1, $R1b, $R1b
1975
1976 vpcmpeqd $INDEX2, $M0, $TMP0
1977
1978 vmovdqa `32*0+32*64*2*2`($in_t), $T0
1979 vmovdqa `32*1+32*64*2*2`($in_t), $T1
1980 vpand $TMP0, $T0, $T0
1981 vpand $TMP0, $T1, $T1
1982 vpxor $T0, $R2a, $R2a
1983 vpxor $T1, $R2b, $R2b
1984
1985 vpcmpeqd $INDEX3, $M0, $TMP0
1986
1987 vmovdqa `32*0+32*64*2*3`($in_t), $T0
1988 vmovdqa `32*1+32*64*2*3`($in_t), $T1
1989 vpand $TMP0, $T0, $T0
1990 vpand $TMP0, $T1, $T1
1991 vpxor $T0, $R3a, $R3a
1992 vpxor $T1, $R3b, $R3b
1993
1994 vpaddd (%rax), $M0, $M0 # increment
1995 lea 32*2($in_t), $in_t
1996
1997 dec %ecx
1998 jnz .Lmulti_select_loop_avx2
1999
2000 vmovdqu $R0a, 32*0-112($val)
2001 vmovdqu $R0b, 32*1-112($val)
2002 vmovdqu $R1a, 32*2-112($val)
2003 vmovdqu $R1b, 32*3-112($val)
2004 vmovdqu $R2a, 32*4-112($val)
2005 vmovdqu $R2b, 32*5-112($val)
2006 vmovdqu $R3a, 32*6-112($val)
2007 vmovdqu $R3b, 32*7-112($val)
2008
2009 vzeroupper
2010 ___
2011 $code.=<<___ if ($win64);
2012 movaps 16*0(%rsp), %xmm6
2013 movaps 16*1(%rsp), %xmm7
2014 movaps 16*2(%rsp), %xmm8
2015 movaps 16*3(%rsp), %xmm9
2016 movaps 16*4(%rsp), %xmm10
2017 movaps 16*5(%rsp), %xmm11
2018 movaps 16*6(%rsp), %xmm12
2019 movaps 16*7(%rsp), %xmm13
2020 movaps 16*8(%rsp), %xmm14
2021 movaps 16*9(%rsp), %xmm15
2022 lea 8+16*10(%rsp), %rsp
2023 ___
2024 $code.=<<___;
2025 ret
2026 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2027
2028 .extern OPENSSL_ia32cap_P
2029 .globl ecp_nistz_avx2_eligible
2030 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2031 .align 32
2032 ecp_nistz_avx2_eligible:
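	# Bit 5 of the third OPENSSL_ia32cap_P word mirrors the AVX2 CPUID flag
	# (CPUID.(EAX=7,ECX=0):EBX bit 5).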
2033 mov OPENSSL_ia32cap_P+8(%rip),%eax
2034 shr \$5,%eax
2035 and \$1,%eax
2036 ret
2037 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2038 ___
2039 }
2040 }} else {{ # assembler is too old
2041 $code.=<<___;
2042 .text
2043
2044 .globl ecp_nistz256_avx2_transpose_convert
2045 .globl ecp_nistz256_avx2_convert_transpose_back
2046 .globl ecp_nistz256_avx2_point_add_affine_x4
2047 .globl ecp_nistz256_avx2_point_add_affines_x4
2048 .globl ecp_nistz256_avx2_to_mont
2049 .globl ecp_nistz256_avx2_from_mont
2050 .globl ecp_nistz256_avx2_set1
2051 .globl ecp_nistz256_avx2_multi_gather_w7
2052 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
2053 ecp_nistz256_avx2_transpose_convert:
2054 ecp_nistz256_avx2_convert_transpose_back:
2055 ecp_nistz256_avx2_point_add_affine_x4:
2056 ecp_nistz256_avx2_point_add_affines_x4:
2057 ecp_nistz256_avx2_to_mont:
2058 ecp_nistz256_avx2_from_mont:
2059 ecp_nistz256_avx2_set1:
2060 ecp_nistz256_avx2_multi_gather_w7:
2061 .byte 0x0f,0x0b # ud2
2062 ret
2063 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2064
2065 .globl ecp_nistz_avx2_eligible
2066 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2067 ecp_nistz_avx2_eligible:
2068 xor %eax,%eax
2069 ret
2070 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2071 ___
2072 }}
2073
2074 foreach (split("\n",$code)) {
2075 s/\`([^\`]*)\`/eval($1)/geo;
2076
2077 print $_,"\n";
2078 }
2079
2080 close STDOUT;