1 #! /usr/bin/env perl
2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4 #
5 # Licensed under the Apache License 2.0 (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
9 #
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
13 #
14 # Reference:
15 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
16 # 256 Bit Primes"
17
18 # $output is the last argument if it looks like a file (it has an extension)
19 # $flavour is the first argument if it doesn't look like a file
20 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
21 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
22
23 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
24
25 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
26 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
27 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
28 die "can't locate x86_64-xlate.pl";
29
30 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
31 or die "can't call $xlate: $!";
32 *STDOUT=*OUT;
33
34 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
35 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
36 $avx = ($1>=2.19) + ($1>=2.22);
37 $addx = ($1>=2.23);
38 }
39
40 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
41 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
42 $avx = ($1>=2.09) + ($1>=2.10);
43 $addx = ($1>=2.10);
44 }
45
46 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
47 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
48 $avx = ($1>=10) + ($1>=11);
49 $addx = ($1>=12);
50 }
51
52 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
53 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
54 $avx = ($ver>=3.0) + ($ver>=3.01);
55 $addx = ($ver>=3.03);
56 }
57
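# The probes above only affect code generation: $avx>=2 means the assembler
# is new enough to encode AVX2, so the real implementation below is emitted;
# with an older assembler only the ud2 stubs at the end of this file are
# generated.  $addx is probed for symmetry with the other ecp_nistz256
# modules but is not used in this file.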
58 if ($avx>=2) {{
59 $digit_size = "\$29";
60 $n_digits = "\$9";
61
62 $code.=<<___;
63 .text
64
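# All constants below share one layout: a 256-bit value is split into nine
# 29-bit digits and every digit is broadcast to all four 64-bit lanes of a
# 256-bit vector, so a single row serves the four parallel computations.
# .LAVX2_POLY is the P-256 prime 2^256 - 2^224 + 2^192 + 2^96 - 1 in this
# form; its first row doubles as the 29-bit AND mask.  .LAVX2_POLY_x2 and
# .LAVX2_POLY_x8 hold small multiples of the prime with carries redistributed
# so that the vectorized add-then-subtract sequences below can never drive an
# individual digit negative.  .LONE is the value 1 in this code's Montgomery
# representation (see .LTO_MONT_AVX2 below).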
65 .align 64
66 .LAVX2_AND_MASK:
67 .LAVX2_POLY:
68 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
69 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
70 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
71 .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
72 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
73 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
74 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
75 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
76 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
77
78 .LAVX2_POLY_x2:
79 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
80 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
81 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
82 .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
83 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
84 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
85 .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
86 .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
87 .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
88
89 .LAVX2_POLY_x8:
90 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
91 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
92 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
93 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
94 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
95 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
96 .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
97 .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
98 .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
99
100 .LONE:
101 .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
102 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
103 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
104 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
105 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
106 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
107 .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
108 .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
109 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
110
111 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
112 # Montgomery form (*2^256) to our format (*2^261)
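# (a Montgomery multiplication in this code divides by R = 2^261, nine
# reduction steps of 29 bits each, so taking a*2^256 to a*2^261 needs an
# extra factor of 2^(261+261-256) = 2^266 mod p)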
113
114 .LTO_MONT_AVX2:
115 .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
116 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
117 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
118 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
119 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
120 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
121 .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
122 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
123 .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
124
125 .LFROM_MONT_AVX2:
126 .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
127 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
128 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
129 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
130 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
131 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
132 .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
133 .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
134 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
135
136 .LIntOne:
137 .long 1,1,1,1,1,1,1,1
138 ___
139
140 {
141 # This function receives a pointer to an array of four affine points
142 # (X, Y, <1>) and rearranges the data for AVX2 execution, while
143 # converting it to 2^29 radix redundant form
144
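# A scalar Perl model of the digit split performed below, for illustration
# only: it is a sketch, it is not called by the generator, the helper name
# ref_split_radix29 is ours, and it assumes a 64-bit perl.  Four 64-bit
# limbs, least significant first, become nine 29-bit digits exactly as the
# out[0]..out[8] expressions in the assembly comments:
sub ref_split_radix29 {
    my @in   = @_;                          # in[0..3], 64-bit limbs
    my $mask = (1<<29) - 1;
    my @out;
    $out[0] =   $in[0]                          & $mask;
    $out[1] =  ($in[0] >> 29)                   & $mask;
    $out[2] = (($in[0] >> 58) | ($in[1] <<  6)) & $mask;
    $out[3] =  ($in[1] >> 23)                   & $mask;
    $out[4] = (($in[1] >> 52) | ($in[2] << 12)) & $mask;
    $out[5] =  ($in[2] >> 17)                   & $mask;
    $out[6] = (($in[2] >> 46) | ($in[3] << 18)) & $mask;
    $out[7] =  ($in[3] >> 11)                   & $mask;
    $out[8] =   $in[3] >> 40;                   # top digit, only 24 bits
    return @out;
}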
145 my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
146 $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
147
148 $code.=<<___;
149 .globl ecp_nistz256_avx2_transpose_convert
150 .type ecp_nistz256_avx2_transpose_convert,\@function,2
151 .align 64
152 ecp_nistz256_avx2_transpose_convert:
153 vzeroupper
154 ___
155 $code.=<<___ if ($win64);
156 lea -8-16*10(%rsp), %rsp
157 vmovaps %xmm6, -8-16*10(%rax)
158 vmovaps %xmm7, -8-16*9(%rax)
159 vmovaps %xmm8, -8-16*8(%rax)
160 vmovaps %xmm9, -8-16*7(%rax)
161 vmovaps %xmm10, -8-16*6(%rax)
162 vmovaps %xmm11, -8-16*5(%rax)
163 vmovaps %xmm12, -8-16*4(%rax)
164 vmovaps %xmm13, -8-16*3(%rax)
165 vmovaps %xmm14, -8-16*2(%rax)
166 vmovaps %xmm15, -8-16*1(%rax)
167 ___
168 $code.=<<___;
169 # Load the data
170 vmovdqa 32*0(%rsi), $X0
171 lea 112(%rsi), %rax # size optimization
172 vmovdqa 32*1(%rsi), $Y0
173 lea .LAVX2_AND_MASK(%rip), %rdx
174 vmovdqa 32*2(%rsi), $X1
175 vmovdqa 32*3(%rsi), $Y1
176 vmovdqa 32*4-112(%rax), $X2
177 vmovdqa 32*5-112(%rax), $Y2
178 vmovdqa 32*6-112(%rax), $X3
179 vmovdqa 32*7-112(%rax), $Y3
180
181 # Transpose X and Y independently
182 vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
183 vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
184 vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
185 vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
186
187 vpunpcklqdq $Y1, $Y0, $T4
188 vpunpcklqdq $Y3, $Y2, $T5
189 vpunpckhqdq $Y1, $Y0, $T6
190 vpunpckhqdq $Y3, $Y2, $T7
191
192 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
193 vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
194 vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
195 vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
196
197 vperm2i128 \$0x20, $T5, $T4, $Y0
198 vperm2i128 \$0x20, $T7, $T6, $Y1
199 vperm2i128 \$0x31, $T5, $T4, $Y2
200 vperm2i128 \$0x31, $T7, $T6, $Y3
201 vmovdqa (%rdx), $T7
202
203 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
204 vpsrlq \$29, $X0, $X0
205 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
206 vpsrlq \$29, $X0, $X0
207 vpsllq \$6, $X1, $T2
208 vpxor $X0, $T2, $T2
209 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
210 vpsrlq \$23, $X1, $X1
211 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
212 vpsrlq \$29, $X1, $X1
213 vpsllq \$12, $X2, $T4
214 vpxor $X1, $T4, $T4
215 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
216 vpsrlq \$17, $X2, $X2
217 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
218 vpsrlq \$29, $X2, $X2
219 vpsllq \$18, $X3, $T6
220 vpxor $X2, $T6, $T6
221 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
222 vpsrlq \$11, $X3, $X3
223 vmovdqa $T0, 32*0(%rdi)
224 lea 112(%rdi), %rax # size optimization
225 vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
226 vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
227
228 vmovdqa $T1, 32*1(%rdi)
229 vmovdqa $T2, 32*2(%rdi)
230 vmovdqa $T3, 32*3(%rdi)
231 vmovdqa $T4, 32*4-112(%rax)
232 vmovdqa $T5, 32*5-112(%rax)
233 vmovdqa $T6, 32*6-112(%rax)
234 vmovdqa $T0, 32*7-112(%rax)
235 vmovdqa $X3, 32*8-112(%rax)
236 lea 448(%rdi), %rax # size optimization
237
238 vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
239 vpsrlq \$29, $Y0, $Y0
240 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
241 vpsrlq \$29, $Y0, $Y0
242 vpsllq \$6, $Y1, $T2
243 vpxor $Y0, $T2, $T2
244 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
245 vpsrlq \$23, $Y1, $Y1
246 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
247 vpsrlq \$29, $Y1, $Y1
248 vpsllq \$12, $Y2, $T4
249 vpxor $Y1, $T4, $T4
250 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
251 vpsrlq \$17, $Y2, $Y2
252 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
253 vpsrlq \$29, $Y2, $Y2
254 vpsllq \$18, $Y3, $T6
255 vpxor $Y2, $T6, $T6
256 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
257 vpsrlq \$11, $Y3, $Y3
258 vmovdqa $T0, 32*9-448(%rax)
259 vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
260 vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
261
262 vmovdqa $T1, 32*10-448(%rax)
263 vmovdqa $T2, 32*11-448(%rax)
264 vmovdqa $T3, 32*12-448(%rax)
265 vmovdqa $T4, 32*13-448(%rax)
266 vmovdqa $T5, 32*14-448(%rax)
267 vmovdqa $T6, 32*15-448(%rax)
268 vmovdqa $T0, 32*16-448(%rax)
269 vmovdqa $Y3, 32*17-448(%rax)
270
271 vzeroupper
272 ___
273 $code.=<<___ if ($win64);
274 movaps 16*0(%rsp), %xmm6
275 movaps 16*1(%rsp), %xmm7
276 movaps 16*2(%rsp), %xmm8
277 movaps 16*3(%rsp), %xmm9
278 movaps 16*4(%rsp), %xmm10
279 movaps 16*5(%rsp), %xmm11
280 movaps 16*6(%rsp), %xmm12
281 movaps 16*7(%rsp), %xmm13
282 movaps 16*8(%rsp), %xmm14
283 movaps 16*9(%rsp), %xmm15
284 lea 8+16*10(%rsp), %rsp
285 ___
286 $code.=<<___;
287 ret
288 .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
289 ___
290 }
291 {
292 ################################################################################
293 # This function receives a pointer to an array of four AVX2-formatted points
294 # (X, Y, Z), converts the data back to normal representation, and rearranges it
295
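# A scalar Perl model of the inverse packing, matching the out[0]..out[3]
# expressions in the assembly comments below (a sketch only, not called by
# the generator; the helper name ref_join_radix29 is ours and it assumes the
# nine digits are already normalized to 29 bits on a 64-bit perl, where the
# shifts and ORs wrap at 64 bits):
sub ref_join_radix29 {
    my @d = @_;                             # d[0..8], 29-bit digits
    my @out;
    $out[0] =  $d[0]        | ($d[1] << 29) | ($d[2] << 58);
    $out[1] = ($d[2] >>  6) | ($d[3] << 23) | ($d[4] << 52);
    $out[2] = ($d[4] >> 12) | ($d[5] << 17) | ($d[6] << 46);
    $out[3] = ($d[6] >> 18) | ($d[7] << 11) | ($d[8] << 40);
    return @out;                            # four 64-bit limbs
}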
296 my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
297 my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
298
299 $code.=<<___;
300
301 .globl ecp_nistz256_avx2_convert_transpose_back
302 .type ecp_nistz256_avx2_convert_transpose_back,\@function,2
303 .align 32
304 ecp_nistz256_avx2_convert_transpose_back:
305 vzeroupper
306 ___
307 $code.=<<___ if ($win64);
308 lea -8-16*10(%rsp), %rsp
309 vmovaps %xmm6, -8-16*10(%rax)
310 vmovaps %xmm7, -8-16*9(%rax)
311 vmovaps %xmm8, -8-16*8(%rax)
312 vmovaps %xmm9, -8-16*7(%rax)
313 vmovaps %xmm10, -8-16*6(%rax)
314 vmovaps %xmm11, -8-16*5(%rax)
315 vmovaps %xmm12, -8-16*4(%rax)
316 vmovaps %xmm13, -8-16*3(%rax)
317 vmovaps %xmm14, -8-16*2(%rax)
318 vmovaps %xmm15, -8-16*1(%rax)
319 ___
320 $code.=<<___;
321 mov \$3, %ecx
322
323 .Lconv_loop:
324 vmovdqa 32*0(%rsi), $D0
325 lea 160(%rsi), %rax # size optimization
326 vmovdqa 32*1(%rsi), $D1
327 vmovdqa 32*2(%rsi), $D2
328 vmovdqa 32*3(%rsi), $D3
329 vmovdqa 32*4-160(%rax), $D4
330 vmovdqa 32*5-160(%rax), $D5
331 vmovdqa 32*6-160(%rax), $D6
332 vmovdqa 32*7-160(%rax), $D7
333 vmovdqa 32*8-160(%rax), $D8
334
335 vpsllq \$29, $D1, $D1
336 vpsllq \$58, $D2, $T0
337 vpaddq $D1, $D0, $D0
338 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
339
340 vpsrlq \$6, $D2, $D2
341 vpsllq \$23, $D3, $D3
342 vpsllq \$52, $D4, $T1
343 vpaddq $D2, $D3, $D3
344 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
345
346 vpsrlq \$12, $D4, $D4
347 vpsllq \$17, $D5, $D5
348 vpsllq \$46, $D6, $T2
349 vpaddq $D4, $D5, $D5
350 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
351
352 vpsrlq \$18, $D6, $D6
353 vpsllq \$11, $D7, $D7
354 vpsllq \$40, $D8, $T3
355 vpaddq $D6, $D7, $D7
356 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
357
358 vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
359 vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
360 vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
361 vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
362
363 vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
364 vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
365 vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
366 vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
367
368 vmovdqa $D0, 32*0(%rdi)
369 vmovdqa $D1, 32*3(%rdi)
370 vmovdqa $D2, 32*6(%rdi)
371 vmovdqa $D3, 32*9(%rdi)
372
373 lea 32*9(%rsi), %rsi
374 lea 32*1(%rdi), %rdi
375
376 dec %ecx
377 jnz .Lconv_loop
378
379 vzeroupper
380 ___
381 $code.=<<___ if ($win64);
382 movaps 16*0(%rsp), %xmm6
383 movaps 16*1(%rsp), %xmm7
384 movaps 16*2(%rsp), %xmm8
385 movaps 16*3(%rsp), %xmm9
386 movaps 16*4(%rsp), %xmm10
387 movaps 16*5(%rsp), %xmm11
388 movaps 16*6(%rsp), %xmm12
389 movaps 16*7(%rsp), %xmm13
390 movaps 16*8(%rsp), %xmm14
391 movaps 16*9(%rsp), %xmm15
392 lea 8+16*10(%rsp), %rsp
393 ___
394 $code.=<<___;
395 ret
396 .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
397 ___
398 }
399 {
400 my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
401 my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
402 my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
403
404 sub NORMALIZE {
405 my $ret=<<___;
406 vpsrlq $digit_size, $ACC0, $T0
407 vpand $AND_MASK, $ACC0, $ACC0
408 vpaddq $T0, $ACC1, $ACC1
409
410 vpsrlq $digit_size, $ACC1, $T0
411 vpand $AND_MASK, $ACC1, $ACC1
412 vpaddq $T0, $ACC2, $ACC2
413
414 vpsrlq $digit_size, $ACC2, $T0
415 vpand $AND_MASK, $ACC2, $ACC2
416 vpaddq $T0, $ACC3, $ACC3
417
418 vpsrlq $digit_size, $ACC3, $T0
419 vpand $AND_MASK, $ACC3, $ACC3
420 vpaddq $T0, $ACC4, $ACC4
421
422 vpsrlq $digit_size, $ACC4, $T0
423 vpand $AND_MASK, $ACC4, $ACC4
424 vpaddq $T0, $ACC5, $ACC5
425
426 vpsrlq $digit_size, $ACC5, $T0
427 vpand $AND_MASK, $ACC5, $ACC5
428 vpaddq $T0, $ACC6, $ACC6
429
430 vpsrlq $digit_size, $ACC6, $T0
431 vpand $AND_MASK, $ACC6, $ACC6
432 vpaddq $T0, $ACC7, $ACC7
433
434 vpsrlq $digit_size, $ACC7, $T0
435 vpand $AND_MASK, $ACC7, $ACC7
436 vpaddq $T0, $ACC8, $ACC8
437 #vpand $AND_MASK, $ACC8, $ACC8
438 ___
439 $ret;
440 }
441
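# The NORMALIZE snippet above, like the avx2_normalize* routines emitted
# further down, is a plain carry propagation over the nine 29-bit digits.
# In scalar terms (a sketch only, not used by the generator):
#
#     for my $i (0 .. 7) {
#         $d[$i+1] += $d[$i] >> 29;       # push the overflow up
#         $d[$i]   &= (1<<29) - 1;        # keep 29 bits in this digit
#     }
#
# The top digit is deliberately left unmasked, as in the code.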
442 sub STORE {
443 my $ret=<<___;
444 vmovdqa $ACC0, 32*0(%rdi)
445 lea 160(%rdi), %rax # size optimization
446 vmovdqa $ACC1, 32*1(%rdi)
447 vmovdqa $ACC2, 32*2(%rdi)
448 vmovdqa $ACC3, 32*3(%rdi)
449 vmovdqa $ACC4, 32*4-160(%rax)
450 vmovdqa $ACC5, 32*5-160(%rax)
451 vmovdqa $ACC6, 32*6-160(%rax)
452 vmovdqa $ACC7, 32*7-160(%rax)
453 vmovdqa $ACC8, 32*8-160(%rax)
454 ___
455 $ret;
456 }
457
458 $code.=<<___;
459 .type avx2_normalize,\@abi-omnipotent
460 .align 32
461 avx2_normalize:
462 vpsrlq $digit_size, $ACC0, $T0
463 vpand $AND_MASK, $ACC0, $ACC0
464 vpaddq $T0, $ACC1, $ACC1
465
466 vpsrlq $digit_size, $ACC1, $T0
467 vpand $AND_MASK, $ACC1, $ACC1
468 vpaddq $T0, $ACC2, $ACC2
469
470 vpsrlq $digit_size, $ACC2, $T0
471 vpand $AND_MASK, $ACC2, $ACC2
472 vpaddq $T0, $ACC3, $ACC3
473
474 vpsrlq $digit_size, $ACC3, $T0
475 vpand $AND_MASK, $ACC3, $ACC3
476 vpaddq $T0, $ACC4, $ACC4
477
478 vpsrlq $digit_size, $ACC4, $T0
479 vpand $AND_MASK, $ACC4, $ACC4
480 vpaddq $T0, $ACC5, $ACC5
481
482 vpsrlq $digit_size, $ACC5, $T0
483 vpand $AND_MASK, $ACC5, $ACC5
484 vpaddq $T0, $ACC6, $ACC6
485
486 vpsrlq $digit_size, $ACC6, $T0
487 vpand $AND_MASK, $ACC6, $ACC6
488 vpaddq $T0, $ACC7, $ACC7
489
490 vpsrlq $digit_size, $ACC7, $T0
491 vpand $AND_MASK, $ACC7, $ACC7
492 vpaddq $T0, $ACC8, $ACC8
493 #vpand $AND_MASK, $ACC8, $ACC8
494
495 ret
496 .size avx2_normalize,.-avx2_normalize
497
498 .type avx2_normalize_n_store,\@abi-omnipotent
499 .align 32
500 avx2_normalize_n_store:
501 vpsrlq $digit_size, $ACC0, $T0
502 vpand $AND_MASK, $ACC0, $ACC0
503 vpaddq $T0, $ACC1, $ACC1
504
505 vpsrlq $digit_size, $ACC1, $T0
506 vpand $AND_MASK, $ACC1, $ACC1
507 vmovdqa $ACC0, 32*0(%rdi)
508 lea 160(%rdi), %rax # size optimization
509 vpaddq $T0, $ACC2, $ACC2
510
511 vpsrlq $digit_size, $ACC2, $T0
512 vpand $AND_MASK, $ACC2, $ACC2
513 vmovdqa $ACC1, 32*1(%rdi)
514 vpaddq $T0, $ACC3, $ACC3
515
516 vpsrlq $digit_size, $ACC3, $T0
517 vpand $AND_MASK, $ACC3, $ACC3
518 vmovdqa $ACC2, 32*2(%rdi)
519 vpaddq $T0, $ACC4, $ACC4
520
521 vpsrlq $digit_size, $ACC4, $T0
522 vpand $AND_MASK, $ACC4, $ACC4
523 vmovdqa $ACC3, 32*3(%rdi)
524 vpaddq $T0, $ACC5, $ACC5
525
526 vpsrlq $digit_size, $ACC5, $T0
527 vpand $AND_MASK, $ACC5, $ACC5
528 vmovdqa $ACC4, 32*4-160(%rax)
529 vpaddq $T0, $ACC6, $ACC6
530
531 vpsrlq $digit_size, $ACC6, $T0
532 vpand $AND_MASK, $ACC6, $ACC6
533 vmovdqa $ACC5, 32*5-160(%rax)
534 vpaddq $T0, $ACC7, $ACC7
535
536 vpsrlq $digit_size, $ACC7, $T0
537 vpand $AND_MASK, $ACC7, $ACC7
538 vmovdqa $ACC6, 32*6-160(%rax)
539 vpaddq $T0, $ACC8, $ACC8
540 #vpand $AND_MASK, $ACC8, $ACC8
541 vmovdqa $ACC7, 32*7-160(%rax)
542 vmovdqa $ACC8, 32*8-160(%rax)
543
544 ret
545 .size avx2_normalize_n_store,.-avx2_normalize_n_store
546
547 ################################################################################
548 # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
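# Works on four independent field elements at once, one per 64-bit lane,
# each held as nine 29-bit digits.  Every loop iteration multiplies A by one
# digit of B and then folds the lowest accumulator digit away Montgomery-
# style: the prime is congruent to -1 modulo 2^29, so the per-digit reduction
# factor is just the low 29 bits of the accumulator, and the additions of
# that factor times the prime exploit the prime's digit pattern directly
# (digits 0-2 equal the 29-bit mask, digits 4 and 5 are zero, digit 6 is
# 2^18 and becomes a shift), which is what the skipped multiplications below
# refer to.  Nine iterations divide the result by 2^261 overall.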
549 .type avx2_mul_x4,\@abi-omnipotent
550 .align 32
551 avx2_mul_x4:
552 lea .LAVX2_POLY(%rip), %rax
553
554 vpxor $ACC0, $ACC0, $ACC0
555 vpxor $ACC1, $ACC1, $ACC1
556 vpxor $ACC2, $ACC2, $ACC2
557 vpxor $ACC3, $ACC3, $ACC3
558 vpxor $ACC4, $ACC4, $ACC4
559 vpxor $ACC5, $ACC5, $ACC5
560 vpxor $ACC6, $ACC6, $ACC6
561 vpxor $ACC7, $ACC7, $ACC7
562
563 vmovdqa 32*7(%rax), %ymm14
564 vmovdqa 32*8(%rax), %ymm15
565
566 mov $n_digits, $itr
567 lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
568 jmp .Lavx2_mul_x4_loop
569
570 .align 32
571 .Lavx2_mul_x4_loop:
572 vmovdqa 32*0($b_ptr), $B
573 lea 32*1($b_ptr), $b_ptr
574
575 vpmuludq 32*0+512($a_ptr), $B, $T0
576 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
577 vpaddq $T0, $ACC0, $ACC0
578 vpmuludq 32*2+512($a_ptr), $B, $T0
579 vpaddq $OVERFLOW, $ACC1, $ACC1
580 vpand $AND_MASK, $ACC0, $Y
581 vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
582 vpaddq $T0, $ACC2, $ACC2
583 vpmuludq 32*4+512($a_ptr), $B, $T0
584 vpaddq $OVERFLOW, $ACC3, $ACC3
585 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
586 vpaddq $T0, $ACC4, $ACC4
587 vpmuludq 32*6+512($a_ptr), $B, $T0
588 vpaddq $OVERFLOW, $ACC5, $ACC5
589 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
590 vpaddq $T0, $ACC6, $ACC6
591
592 # Skip some multiplications, optimizing for the constant poly
593 vpmuludq $AND_MASK, $Y, $T0
594 vpaddq $OVERFLOW, $ACC7, $ACC7
595 vpmuludq 32*8+512($a_ptr), $B, $ACC8
596 vpaddq $T0, $ACC0, $OVERFLOW
597 vpaddq $T0, $ACC1, $ACC0
598 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
599 vpaddq $T0, $ACC2, $ACC1
600 vpmuludq 32*3(%rax), $Y, $T0
601 vpaddq $OVERFLOW, $ACC0, $ACC0
602 vpaddq $T0, $ACC3, $ACC2
603 .byte 0x67
604 vmovdqa $ACC4, $ACC3
605 vpsllq \$18, $Y, $OVERFLOW
606 .byte 0x67
607 vmovdqa $ACC5, $ACC4
608 vpmuludq %ymm14, $Y, $T0
609 vpaddq $OVERFLOW, $ACC6, $ACC5
610 vpmuludq %ymm15, $Y, $OVERFLOW
611 vpaddq $T0, $ACC7, $ACC6
612 vpaddq $OVERFLOW, $ACC8, $ACC7
613
614 dec $itr
615 jnz .Lavx2_mul_x4_loop
616
617 vpxor $ACC8, $ACC8, $ACC8
618
619 ret
620 .size avx2_mul_x4,.-avx2_mul_x4
621
622 # Function optimized for the constant 1
623 ################################################################################
624 # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
625 .type avx2_mul_by1_x4,\@abi-omnipotent
626 .align 32
627 avx2_mul_by1_x4:
628 lea .LAVX2_POLY(%rip), %rax
629
630 vpxor $ACC0, $ACC0, $ACC0
631 vpxor $ACC1, $ACC1, $ACC1
632 vpxor $ACC2, $ACC2, $ACC2
633 vpxor $ACC3, $ACC3, $ACC3
634 vpxor $ACC4, $ACC4, $ACC4
635 vpxor $ACC5, $ACC5, $ACC5
636 vpxor $ACC6, $ACC6, $ACC6
637 vpxor $ACC7, $ACC7, $ACC7
638 vpxor $ACC8, $ACC8, $ACC8
639
640 vmovdqa 32*3+.LONE(%rip), %ymm14
641 vmovdqa 32*7+.LONE(%rip), %ymm15
642
643 mov $n_digits, $itr
644 jmp .Lavx2_mul_by1_x4_loop
645
646 .align 32
647 .Lavx2_mul_by1_x4_loop:
648 vmovdqa 32*0($a_ptr), $B
649 .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
650
651 vpsllq \$5, $B, $OVERFLOW
652 vpmuludq %ymm14, $B, $T0
653 vpaddq $OVERFLOW, $ACC0, $ACC0
654 vpaddq $T0, $ACC3, $ACC3
655 .byte 0x67
656 vpmuludq $AND_MASK, $B, $T0
657 vpand $AND_MASK, $ACC0, $Y
658 vpaddq $T0, $ACC4, $ACC4
659 vpaddq $T0, $ACC5, $ACC5
660 vpaddq $T0, $ACC6, $ACC6
661 vpsllq \$23, $B, $T0
662
663 .byte 0x67,0x67
664 vpmuludq %ymm15, $B, $OVERFLOW
665 vpsubq $T0, $ACC6, $ACC6
666
667 vpmuludq $AND_MASK, $Y, $T0
668 vpaddq $OVERFLOW, $ACC7, $ACC7
669 vpaddq $T0, $ACC0, $OVERFLOW
670 vpaddq $T0, $ACC1, $ACC0
671 .byte 0x67,0x67
672 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
673 vpaddq $T0, $ACC2, $ACC1
674 vpmuludq 32*3(%rax), $Y, $T0
675 vpaddq $OVERFLOW, $ACC0, $ACC0
676 vpaddq $T0, $ACC3, $ACC2
677 vmovdqa $ACC4, $ACC3
678 vpsllq \$18, $Y, $OVERFLOW
679 vmovdqa $ACC5, $ACC4
680 vpmuludq 32*7(%rax), $Y, $T0
681 vpaddq $OVERFLOW, $ACC6, $ACC5
682 vpaddq $T0, $ACC7, $ACC6
683 vpmuludq 32*8(%rax), $Y, $ACC7
684
685 dec $itr
686 jnz .Lavx2_mul_by1_x4_loop
687
688 ret
689 .size avx2_mul_by1_x4,.-avx2_mul_by1_x4
690
691 ################################################################################
692 # void avx2_sqr_x4(void* RESULTx4, void *Ax4);   # %rcx points at a caller-provided scratch vector
693 .type avx2_sqr_x4,\@abi-omnipotent
694 .align 32
695 avx2_sqr_x4:
696 lea .LAVX2_POLY(%rip), %rax
697
698 vmovdqa 32*7(%rax), %ymm14
699 vmovdqa 32*8(%rax), %ymm15
700
701 vmovdqa 32*0($a_ptr), $B
702 vmovdqa 32*1($a_ptr), $ACC1
703 vmovdqa 32*2($a_ptr), $ACC2
704 vmovdqa 32*3($a_ptr), $ACC3
705 vmovdqa 32*4($a_ptr), $ACC4
706 vmovdqa 32*5($a_ptr), $ACC5
707 vmovdqa 32*6($a_ptr), $ACC6
708 vmovdqa 32*7($a_ptr), $ACC7
709 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
710 vmovdqa 32*8($a_ptr), $ACC8
711 vpaddq $ACC2, $ACC2, $ACC2
712 vmovdqa $ACC1, 32*0(%rcx)
713 vpaddq $ACC3, $ACC3, $ACC3
714 vmovdqa $ACC2, 32*1(%rcx)
715 vpaddq $ACC4, $ACC4, $ACC4
716 vmovdqa $ACC3, 32*2(%rcx)
717 vpaddq $ACC5, $ACC5, $ACC5
718 vmovdqa $ACC4, 32*3(%rcx)
719 vpaddq $ACC6, $ACC6, $ACC6
720 vmovdqa $ACC5, 32*4(%rcx)
721 vpaddq $ACC7, $ACC7, $ACC7
722 vmovdqa $ACC6, 32*5(%rcx)
723 vpaddq $ACC8, $ACC8, $ACC8
724 vmovdqa $ACC7, 32*6(%rcx)
725 vmovdqa $ACC8, 32*7(%rcx)
726
727 #itr 1
728 vpmuludq $B, $B, $ACC0
729 vpmuludq $B, $ACC1, $ACC1
730 vpand $AND_MASK, $ACC0, $Y
731 vpmuludq $B, $ACC2, $ACC2
732 vpmuludq $B, $ACC3, $ACC3
733 vpmuludq $B, $ACC4, $ACC4
734 vpmuludq $B, $ACC5, $ACC5
735 vpmuludq $B, $ACC6, $ACC6
736 vpmuludq $AND_MASK, $Y, $T0
737 vpmuludq $B, $ACC7, $ACC7
738 vpmuludq $B, $ACC8, $ACC8
739 vmovdqa 32*1($a_ptr), $B
740
741 vpaddq $T0, $ACC0, $OVERFLOW
742 vpaddq $T0, $ACC1, $ACC0
743 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
744 vpaddq $T0, $ACC2, $ACC1
745 vpmuludq 32*3(%rax), $Y, $T0
746 vpaddq $OVERFLOW, $ACC0, $ACC0
747 vpaddq $T0, $ACC3, $ACC2
748 vmovdqa $ACC4, $ACC3
749 vpsllq \$18, $Y, $T0
750 vmovdqa $ACC5, $ACC4
751 vpmuludq %ymm14, $Y, $OVERFLOW
752 vpaddq $T0, $ACC6, $ACC5
753 vpmuludq %ymm15, $Y, $T0
754 vpaddq $OVERFLOW, $ACC7, $ACC6
755 vpaddq $T0, $ACC8, $ACC7
756
757 #itr 2
758 vpmuludq $B, $B, $OVERFLOW
759 vpand $AND_MASK, $ACC0, $Y
760 vpmuludq 32*1(%rcx), $B, $T0
761 vpaddq $OVERFLOW, $ACC1, $ACC1
762 vpmuludq 32*2(%rcx), $B, $OVERFLOW
763 vpaddq $T0, $ACC2, $ACC2
764 vpmuludq 32*3(%rcx), $B, $T0
765 vpaddq $OVERFLOW, $ACC3, $ACC3
766 vpmuludq 32*4(%rcx), $B, $OVERFLOW
767 vpaddq $T0, $ACC4, $ACC4
768 vpmuludq 32*5(%rcx), $B, $T0
769 vpaddq $OVERFLOW, $ACC5, $ACC5
770 vpmuludq 32*6(%rcx), $B, $OVERFLOW
771 vpaddq $T0, $ACC6, $ACC6
772
773 vpmuludq $AND_MASK, $Y, $T0
774 vpaddq $OVERFLOW, $ACC7, $ACC7
775 vpmuludq 32*7(%rcx), $B, $ACC8
776 vmovdqa 32*2($a_ptr), $B
777 vpaddq $T0, $ACC0, $OVERFLOW
778 vpaddq $T0, $ACC1, $ACC0
779 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
780 vpaddq $T0, $ACC2, $ACC1
781 vpmuludq 32*3(%rax), $Y, $T0
782 vpaddq $OVERFLOW, $ACC0, $ACC0
783 vpaddq $T0, $ACC3, $ACC2
784 vmovdqa $ACC4, $ACC3
785 vpsllq \$18, $Y, $T0
786 vmovdqa $ACC5, $ACC4
787 vpmuludq %ymm14, $Y, $OVERFLOW
788 vpaddq $T0, $ACC6, $ACC5
789 vpmuludq %ymm15, $Y, $T0
790 vpaddq $OVERFLOW, $ACC7, $ACC6
791 vpaddq $T0, $ACC8, $ACC7
792
793 #itr 3
794 vpmuludq $B, $B, $T0
795 vpand $AND_MASK, $ACC0, $Y
796 vpmuludq 32*2(%rcx), $B, $OVERFLOW
797 vpaddq $T0, $ACC2, $ACC2
798 vpmuludq 32*3(%rcx), $B, $T0
799 vpaddq $OVERFLOW, $ACC3, $ACC3
800 vpmuludq 32*4(%rcx), $B, $OVERFLOW
801 vpaddq $T0, $ACC4, $ACC4
802 vpmuludq 32*5(%rcx), $B, $T0
803 vpaddq $OVERFLOW, $ACC5, $ACC5
804 vpmuludq 32*6(%rcx), $B, $OVERFLOW
805 vpaddq $T0, $ACC6, $ACC6
806
807 vpmuludq $AND_MASK, $Y, $T0
808 vpaddq $OVERFLOW, $ACC7, $ACC7
809 vpmuludq 32*7(%rcx), $B, $ACC8
810 vmovdqa 32*3($a_ptr), $B
811 vpaddq $T0, $ACC0, $OVERFLOW
812 vpaddq $T0, $ACC1, $ACC0
813 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
814 vpaddq $T0, $ACC2, $ACC1
815 vpmuludq 32*3(%rax), $Y, $T0
816 vpaddq $OVERFLOW, $ACC0, $ACC0
817 vpaddq $T0, $ACC3, $ACC2
818 vmovdqa $ACC4, $ACC3
819 vpsllq \$18, $Y, $T0
820 vmovdqa $ACC5, $ACC4
821 vpmuludq %ymm14, $Y, $OVERFLOW
822 vpaddq $T0, $ACC6, $ACC5
823 vpmuludq %ymm15, $Y, $T0
824 vpand $AND_MASK, $ACC0, $Y
825 vpaddq $OVERFLOW, $ACC7, $ACC6
826 vpaddq $T0, $ACC8, $ACC7
827
828 #itr 4
829 vpmuludq $B, $B, $OVERFLOW
830 vpmuludq 32*3(%rcx), $B, $T0
831 vpaddq $OVERFLOW, $ACC3, $ACC3
832 vpmuludq 32*4(%rcx), $B, $OVERFLOW
833 vpaddq $T0, $ACC4, $ACC4
834 vpmuludq 32*5(%rcx), $B, $T0
835 vpaddq $OVERFLOW, $ACC5, $ACC5
836 vpmuludq 32*6(%rcx), $B, $OVERFLOW
837 vpaddq $T0, $ACC6, $ACC6
838
839 vpmuludq $AND_MASK, $Y, $T0
840 vpaddq $OVERFLOW, $ACC7, $ACC7
841 vpmuludq 32*7(%rcx), $B, $ACC8
842 vmovdqa 32*4($a_ptr), $B
843 vpaddq $T0, $ACC0, $OVERFLOW
844 vpaddq $T0, $ACC1, $ACC0
845 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
846 vpaddq $T0, $ACC2, $ACC1
847 vpmuludq 32*3(%rax), $Y, $T0
848 vpaddq $OVERFLOW, $ACC0, $ACC0
849 vpaddq $T0, $ACC3, $ACC2
850 vmovdqa $ACC4, $ACC3
851 vpsllq \$18, $Y, $T0
852 vmovdqa $ACC5, $ACC4
853 vpmuludq %ymm14, $Y, $OVERFLOW
854 vpaddq $T0, $ACC6, $ACC5
855 vpmuludq %ymm15, $Y, $T0
856 vpand $AND_MASK, $ACC0, $Y
857 vpaddq $OVERFLOW, $ACC7, $ACC6
858 vpaddq $T0, $ACC8, $ACC7
859
860 #itr 5
861 vpmuludq $B, $B, $T0
862 vpmuludq 32*4(%rcx), $B, $OVERFLOW
863 vpaddq $T0, $ACC4, $ACC4
864 vpmuludq 32*5(%rcx), $B, $T0
865 vpaddq $OVERFLOW, $ACC5, $ACC5
866 vpmuludq 32*6(%rcx), $B, $OVERFLOW
867 vpaddq $T0, $ACC6, $ACC6
868
869 vpmuludq $AND_MASK, $Y, $T0
870 vpaddq $OVERFLOW, $ACC7, $ACC7
871 vpmuludq 32*7(%rcx), $B, $ACC8
872 vmovdqa 32*5($a_ptr), $B
873 vpaddq $T0, $ACC0, $OVERFLOW
874 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
875 vpaddq $T0, $ACC1, $ACC0
876 vpaddq $T0, $ACC2, $ACC1
877 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
878 vpaddq $OVERFLOW, $ACC0, $ACC0
879 vpaddq $T0, $ACC3, $ACC2
880 vmovdqa $ACC4, $ACC3
881 vpsllq \$18, $Y, $T0
882 vmovdqa $ACC5, $ACC4
883 vpmuludq %ymm14, $Y, $OVERFLOW
884 vpaddq $T0, $ACC6, $ACC5
885 vpmuludq %ymm15, $Y, $T0
886 vpand $AND_MASK, $ACC0, $Y
887 vpaddq $OVERFLOW, $ACC7, $ACC6
888 vpaddq $T0, $ACC8, $ACC7
889
890 #itr 6
891 vpmuludq $B, $B, $OVERFLOW
892 vpmuludq 32*5(%rcx), $B, $T0
893 vpaddq $OVERFLOW, $ACC5, $ACC5
894 vpmuludq 32*6(%rcx), $B, $OVERFLOW
895 vpaddq $T0, $ACC6, $ACC6
896
897 vpmuludq $AND_MASK, $Y, $T0
898 vpaddq $OVERFLOW, $ACC7, $ACC7
899 vpmuludq 32*7(%rcx), $B, $ACC8
900 vmovdqa 32*6($a_ptr), $B
901 vpaddq $T0, $ACC0, $OVERFLOW
902 vpaddq $T0, $ACC1, $ACC0
903 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
904 vpaddq $T0, $ACC2, $ACC1
905 vpmuludq 32*3(%rax), $Y, $T0
906 vpaddq $OVERFLOW, $ACC0, $ACC0
907 vpaddq $T0, $ACC3, $ACC2
908 vmovdqa $ACC4, $ACC3
909 vpsllq \$18, $Y, $T0
910 vmovdqa $ACC5, $ACC4
911 vpmuludq %ymm14, $Y, $OVERFLOW
912 vpaddq $T0, $ACC6, $ACC5
913 vpmuludq %ymm15, $Y, $T0
914 vpand $AND_MASK, $ACC0, $Y
915 vpaddq $OVERFLOW, $ACC7, $ACC6
916 vpaddq $T0, $ACC8, $ACC7
917
918 #itr 7
919 vpmuludq $B, $B, $T0
920 vpmuludq 32*6(%rcx), $B, $OVERFLOW
921 vpaddq $T0, $ACC6, $ACC6
922
923 vpmuludq $AND_MASK, $Y, $T0
924 vpaddq $OVERFLOW, $ACC7, $ACC7
925 vpmuludq 32*7(%rcx), $B, $ACC8
926 vmovdqa 32*7($a_ptr), $B
927 vpaddq $T0, $ACC0, $OVERFLOW
928 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
929 vpaddq $T0, $ACC1, $ACC0
930 vpaddq $T0, $ACC2, $ACC1
931 vpmuludq 32*3(%rax), $Y, $T0
932 vpaddq $OVERFLOW, $ACC0, $ACC0
933 vpaddq $T0, $ACC3, $ACC2
934 vmovdqa $ACC4, $ACC3
935 vpsllq \$18, $Y, $T0
936 vmovdqa $ACC5, $ACC4
937 vpmuludq %ymm14, $Y, $OVERFLOW
938 vpaddq $T0, $ACC6, $ACC5
939 vpmuludq %ymm15, $Y, $T0
940 vpand $AND_MASK, $ACC0, $Y
941 vpaddq $OVERFLOW, $ACC7, $ACC6
942 vpaddq $T0, $ACC8, $ACC7
943
944 #itr 8
945 vpmuludq $B, $B, $OVERFLOW
946
947 vpmuludq $AND_MASK, $Y, $T0
948 vpaddq $OVERFLOW, $ACC7, $ACC7
949 vpmuludq 32*7(%rcx), $B, $ACC8
950 vmovdqa 32*8($a_ptr), $B
951 vpaddq $T0, $ACC0, $OVERFLOW
952 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
953 vpaddq $T0, $ACC1, $ACC0
954 vpaddq $T0, $ACC2, $ACC1
955 vpmuludq 32*3(%rax), $Y, $T0
956 vpaddq $OVERFLOW, $ACC0, $ACC0
957 vpaddq $T0, $ACC3, $ACC2
958 vmovdqa $ACC4, $ACC3
959 vpsllq \$18, $Y, $T0
960 vmovdqa $ACC5, $ACC4
961 vpmuludq %ymm14, $Y, $OVERFLOW
962 vpaddq $T0, $ACC6, $ACC5
963 vpmuludq %ymm15, $Y, $T0
964 vpand $AND_MASK, $ACC0, $Y
965 vpaddq $OVERFLOW, $ACC7, $ACC6
966 vpaddq $T0, $ACC8, $ACC7
967
968 #itr 9
969 vpmuludq $B, $B, $ACC8
970
971 vpmuludq $AND_MASK, $Y, $T0
972 vpaddq $T0, $ACC0, $OVERFLOW
973 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
974 vpaddq $T0, $ACC1, $ACC0
975 vpaddq $T0, $ACC2, $ACC1
976 vpmuludq 32*3(%rax), $Y, $T0
977 vpaddq $OVERFLOW, $ACC0, $ACC0
978 vpaddq $T0, $ACC3, $ACC2
979 vmovdqa $ACC4, $ACC3
980 vpsllq \$18, $Y, $T0
981 vmovdqa $ACC5, $ACC4
982 vpmuludq %ymm14, $Y, $OVERFLOW
983 vpaddq $T0, $ACC6, $ACC5
984 vpmuludq %ymm15, $Y, $T0
985 vpaddq $OVERFLOW, $ACC7, $ACC6
986 vpaddq $T0, $ACC8, $ACC7
987
988 vpxor $ACC8, $ACC8, $ACC8
989
990 ret
991 .size avx2_sqr_x4,.-avx2_sqr_x4
992
993 ################################################################################
994 # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
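# Digit-wise A - B with the .LAVX2_POLY_x8 bias (a multiple of the prime)
# added first, so no digit can go negative; the result is congruent to
# A - B mod p and still needs a normalization pass.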
995 .type avx2_sub_x4,\@abi-omnipotent
996 .align 32
997 avx2_sub_x4:
998 vmovdqa 32*0($a_ptr), $ACC0
999 lea 160($a_ptr), $a_ptr
1000 lea .LAVX2_POLY_x8+128(%rip), %rax
1001 lea 128($b_ptr), $b_ptr
1002 vmovdqa 32*1-160($a_ptr), $ACC1
1003 vmovdqa 32*2-160($a_ptr), $ACC2
1004 vmovdqa 32*3-160($a_ptr), $ACC3
1005 vmovdqa 32*4-160($a_ptr), $ACC4
1006 vmovdqa 32*5-160($a_ptr), $ACC5
1007 vmovdqa 32*6-160($a_ptr), $ACC6
1008 vmovdqa 32*7-160($a_ptr), $ACC7
1009 vmovdqa 32*8-160($a_ptr), $ACC8
1010
1011 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1012 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1013 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1014 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1015 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1016 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1017 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1018 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1019 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1020
1021 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
1022 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
1023 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
1024 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
1025 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
1026 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
1027 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
1028 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
1029 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
1030
1031 ret
1032 .size avx2_sub_x4,.-avx2_sub_x4
1033
1034 .type avx2_select_n_store,\@abi-omnipotent
1035 .align 32
1036 avx2_select_n_store:
1037 vmovdqa `8+32*9*8`(%rsp), $Y
1038 vpor `8+32*9*8+32`(%rsp), $Y, $Y
1039
1040 vpandn $ACC0, $Y, $ACC0
1041 vpandn $ACC1, $Y, $ACC1
1042 vpandn $ACC2, $Y, $ACC2
1043 vpandn $ACC3, $Y, $ACC3
1044 vpandn $ACC4, $Y, $ACC4
1045 vpandn $ACC5, $Y, $ACC5
1046 vpandn $ACC6, $Y, $ACC6
1047 vmovdqa `8+32*9*8+32`(%rsp), $B
1048 vpandn $ACC7, $Y, $ACC7
1049 vpandn `8+32*9*8`(%rsp), $B, $B
1050 vpandn $ACC8, $Y, $ACC8
1051
1052 vpand 32*0(%rsi), $B, $T0
1053 lea 160(%rsi), %rax
1054 vpand 32*1(%rsi), $B, $Y
1055 vpxor $T0, $ACC0, $ACC0
1056 vpand 32*2(%rsi), $B, $T0
1057 vpxor $Y, $ACC1, $ACC1
1058 vpand 32*3(%rsi), $B, $Y
1059 vpxor $T0, $ACC2, $ACC2
1060 vpand 32*4-160(%rax), $B, $T0
1061 vpxor $Y, $ACC3, $ACC3
1062 vpand 32*5-160(%rax), $B, $Y
1063 vpxor $T0, $ACC4, $ACC4
1064 vpand 32*6-160(%rax), $B, $T0
1065 vpxor $Y, $ACC5, $ACC5
1066 vpand 32*7-160(%rax), $B, $Y
1067 vpxor $T0, $ACC6, $ACC6
1068 vpand 32*8-160(%rax), $B, $T0
1069 vmovdqa `8+32*9*8+32`(%rsp), $B
1070 vpxor $Y, $ACC7, $ACC7
1071
1072 vpand 32*0(%rdx), $B, $Y
1073 lea 160(%rdx), %rax
1074 vpxor $T0, $ACC8, $ACC8
1075 vpand 32*1(%rdx), $B, $T0
1076 vpxor $Y, $ACC0, $ACC0
1077 vpand 32*2(%rdx), $B, $Y
1078 vpxor $T0, $ACC1, $ACC1
1079 vpand 32*3(%rdx), $B, $T0
1080 vpxor $Y, $ACC2, $ACC2
1081 vpand 32*4-160(%rax), $B, $Y
1082 vpxor $T0, $ACC3, $ACC3
1083 vpand 32*5-160(%rax), $B, $T0
1084 vpxor $Y, $ACC4, $ACC4
1085 vpand 32*6-160(%rax), $B, $Y
1086 vpxor $T0, $ACC5, $ACC5
1087 vpand 32*7-160(%rax), $B, $T0
1088 vpxor $Y, $ACC6, $ACC6
1089 vpand 32*8-160(%rax), $B, $Y
1090 vpxor $T0, $ACC7, $ACC7
1091 vpxor $Y, $ACC8, $ACC8
1092 `&STORE`
1093
1094 ret
1095 .size avx2_select_n_store,.-avx2_select_n_store
1096 ___
1097 $code.=<<___ if (0); # inlined
1098 ################################################################################
1099 # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1100 .type avx2_mul_by2_x4,\@abi-omnipotent
1101 .align 32
1102 avx2_mul_by2_x4:
1103 vmovdqa 32*0($a_ptr), $ACC0
1104 lea 160($a_ptr), %rax
1105 vmovdqa 32*1($a_ptr), $ACC1
1106 vmovdqa 32*2($a_ptr), $ACC2
1107 vmovdqa 32*3($a_ptr), $ACC3
1108 vmovdqa 32*4-160(%rax), $ACC4
1109 vmovdqa 32*5-160(%rax), $ACC5
1110 vmovdqa 32*6-160(%rax), $ACC6
1111 vmovdqa 32*7-160(%rax), $ACC7
1112 vmovdqa 32*8-160(%rax), $ACC8
1113
1114 vpaddq $ACC0, $ACC0, $ACC0
1115 vpaddq $ACC1, $ACC1, $ACC1
1116 vpaddq $ACC2, $ACC2, $ACC2
1117 vpaddq $ACC3, $ACC3, $ACC3
1118 vpaddq $ACC4, $ACC4, $ACC4
1119 vpaddq $ACC5, $ACC5, $ACC5
1120 vpaddq $ACC6, $ACC6, $ACC6
1121 vpaddq $ACC7, $ACC7, $ACC7
1122 vpaddq $ACC8, $ACC8, $ACC8
1123
1124 ret
1125 .size avx2_mul_by2_x4,.-avx2_mul_by2_x4
1126 ___
1127 my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1128 my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1129
1130 $code.=<<___;
1131 ################################################################################
1132 # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
1133 .globl ecp_nistz256_avx2_point_add_affine_x4
1134 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1135 .align 32
1136 ecp_nistz256_avx2_point_add_affine_x4:
1137 mov %rsp, %rax
1138 push %rbp
1139 vzeroupper
1140 ___
1141 $code.=<<___ if ($win64);
1142 lea -16*10(%rsp), %rsp
1143 vmovaps %xmm6, -8-16*10(%rax)
1144 vmovaps %xmm7, -8-16*9(%rax)
1145 vmovaps %xmm8, -8-16*8(%rax)
1146 vmovaps %xmm9, -8-16*7(%rax)
1147 vmovaps %xmm10, -8-16*6(%rax)
1148 vmovaps %xmm11, -8-16*5(%rax)
1149 vmovaps %xmm12, -8-16*4(%rax)
1150 vmovaps %xmm13, -8-16*3(%rax)
1151 vmovaps %xmm14, -8-16*2(%rax)
1152 vmovaps %xmm15, -8-16*1(%rax)
1153 ___
1154 $code.=<<___;
1155 lea -8(%rax), %rbp
1156
1157 # Result + 32*0 = Result.X
1158 # Result + 32*9 = Result.Y
1159 # Result + 32*18 = Result.Z
1160
1161 # A + 32*0 = A.X
1162 # A + 32*9 = A.Y
1163 # A + 32*18 = A.Z
1164
1165 # B + 32*0 = B.X
1166 # B + 32*9 = B.Y
1167
1168 sub \$`32*9*8+32*2+32*8`, %rsp
1169 and \$-64, %rsp
1170
1171 mov $r_ptr_in, $r_ptr
1172 mov $a_ptr_in, $a_ptr
1173 mov $b_ptr_in, $b_ptr
1174
1175 vmovdqa 32*0($a_ptr_in), %ymm0
1176 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1177 vpxor %ymm1, %ymm1, %ymm1
1178 lea 256($a_ptr_in), %rax # size optimization
1179 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1180 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1181 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1182 vpor 32*4-256(%rax), %ymm0, %ymm0
1183 lea 256(%rax), %rcx # size optimization
1184 vpor 32*5-256(%rax), %ymm0, %ymm0
1185 vpor 32*6-256(%rax), %ymm0, %ymm0
1186 vpor 32*7-256(%rax), %ymm0, %ymm0
1187 vpor 32*8-256(%rax), %ymm0, %ymm0
1188 vpor 32*9-256(%rax), %ymm0, %ymm0
1189 vpor 32*10-256(%rax), %ymm0, %ymm0
1190 vpor 32*11-256(%rax), %ymm0, %ymm0
1191 vpor 32*12-512(%rcx), %ymm0, %ymm0
1192 vpor 32*13-512(%rcx), %ymm0, %ymm0
1193 vpor 32*14-512(%rcx), %ymm0, %ymm0
1194 vpor 32*15-512(%rcx), %ymm0, %ymm0
1195 vpor 32*16-512(%rcx), %ymm0, %ymm0
1196 vpor 32*17-512(%rcx), %ymm0, %ymm0
1197 vpcmpeqq %ymm1, %ymm0, %ymm0
1198 vmovdqa %ymm0, `32*9*8`(%rsp)
1199
1200 vpxor %ymm1, %ymm1, %ymm1
1201 vmovdqa 32*0($b_ptr), %ymm0
1202 lea 256($b_ptr), %rax # size optimization
1203 vpor 32*1($b_ptr), %ymm0, %ymm0
1204 vpor 32*2($b_ptr), %ymm0, %ymm0
1205 vpor 32*3($b_ptr), %ymm0, %ymm0
1206 vpor 32*4-256(%rax), %ymm0, %ymm0
1207 lea 256(%rax), %rcx # size optimization
1208 vpor 32*5-256(%rax), %ymm0, %ymm0
1209 vpor 32*6-256(%rax), %ymm0, %ymm0
1210 vpor 32*7-256(%rax), %ymm0, %ymm0
1211 vpor 32*8-256(%rax), %ymm0, %ymm0
1212 vpor 32*9-256(%rax), %ymm0, %ymm0
1213 vpor 32*10-256(%rax), %ymm0, %ymm0
1214 vpor 32*11-256(%rax), %ymm0, %ymm0
1215 vpor 32*12-512(%rcx), %ymm0, %ymm0
1216 vpor 32*13-512(%rcx), %ymm0, %ymm0
1217 vpor 32*14-512(%rcx), %ymm0, %ymm0
1218 vpor 32*15-512(%rcx), %ymm0, %ymm0
1219 vpor 32*16-512(%rcx), %ymm0, %ymm0
1220 vpor 32*17-512(%rcx), %ymm0, %ymm0
1221 vpcmpeqq %ymm1, %ymm0, %ymm0
1222 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1223
1224 # Z1^2 = Z1*Z1
1225 lea `32*9*2`($a_ptr), %rsi
1226 lea `32*9*2`(%rsp), %rdi
1227 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1228 call avx2_sqr_x4
1229 call avx2_normalize_n_store
1230
1231 # U2 = X2*Z1^2
1232 lea `32*9*0`($b_ptr), %rsi
1233 lea `32*9*2`(%rsp), %rdx
1234 lea `32*9*0`(%rsp), %rdi
1235 call avx2_mul_x4
1236 #call avx2_normalize
1237 `&STORE`
1238
1239 # S2 = Z1*Z1^2 = Z1^3
1240 lea `32*9*2`($a_ptr), %rsi
1241 lea `32*9*2`(%rsp), %rdx
1242 lea `32*9*1`(%rsp), %rdi
1243 call avx2_mul_x4
1244 call avx2_normalize_n_store
1245
1246 # S2 = S2*Y2 = Y2*Z1^3
1247 lea `32*9*1`($b_ptr), %rsi
1248 lea `32*9*1`(%rsp), %rdx
1249 lea `32*9*1`(%rsp), %rdi
1250 call avx2_mul_x4
1251 call avx2_normalize_n_store
1252
1253 # H = U2 - U1 = U2 - X1
1254 lea `32*9*0`(%rsp), %rsi
1255 lea `32*9*0`($a_ptr), %rdx
1256 lea `32*9*3`(%rsp), %rdi
1257 call avx2_sub_x4
1258 call avx2_normalize_n_store
1259
1260 # R = S2 - S1 = S2 - Y1
1261 lea `32*9*1`(%rsp), %rsi
1262 lea `32*9*1`($a_ptr), %rdx
1263 lea `32*9*4`(%rsp), %rdi
1264 call avx2_sub_x4
1265 call avx2_normalize_n_store
1266
1267 # Z3 = H*Z1*Z2
1268 lea `32*9*3`(%rsp), %rsi
1269 lea `32*9*2`($a_ptr), %rdx
1270 lea `32*9*2`($r_ptr), %rdi
1271 call avx2_mul_x4
1272 call avx2_normalize
1273
1274 lea .LONE(%rip), %rsi
1275 lea `32*9*2`($a_ptr), %rdx
1276 call avx2_select_n_store
1277
1278 # R^2 = R^2
1279 lea `32*9*4`(%rsp), %rsi
1280 lea `32*9*6`(%rsp), %rdi
1281 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1282 call avx2_sqr_x4
1283 call avx2_normalize_n_store
1284
1285 # H^2 = H^2
1286 lea `32*9*3`(%rsp), %rsi
1287 lea `32*9*5`(%rsp), %rdi
1288 call avx2_sqr_x4
1289 call avx2_normalize_n_store
1290
1291 # H^3 = H^2*H
1292 lea `32*9*3`(%rsp), %rsi
1293 lea `32*9*5`(%rsp), %rdx
1294 lea `32*9*7`(%rsp), %rdi
1295 call avx2_mul_x4
1296 call avx2_normalize_n_store
1297
1298 # U2 = U1*H^2
1299 lea `32*9*0`($a_ptr), %rsi
1300 lea `32*9*5`(%rsp), %rdx
1301 lea `32*9*0`(%rsp), %rdi
1302 call avx2_mul_x4
1303 #call avx2_normalize
1304 `&STORE`
1305
1306 # Hsqr = U2*2
1307 #lea 32*9*0(%rsp), %rsi
1308 #lea 32*9*5(%rsp), %rdi
1309 #call avx2_mul_by2_x4
1310
1311 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1312 lea `32*9*5`(%rsp), %rdi
1313 vpaddq $ACC1, $ACC1, $ACC1
1314 vpaddq $ACC2, $ACC2, $ACC2
1315 vpaddq $ACC3, $ACC3, $ACC3
1316 vpaddq $ACC4, $ACC4, $ACC4
1317 vpaddq $ACC5, $ACC5, $ACC5
1318 vpaddq $ACC6, $ACC6, $ACC6
1319 vpaddq $ACC7, $ACC7, $ACC7
1320 vpaddq $ACC8, $ACC8, $ACC8
1321 call avx2_normalize_n_store
1322
1323 # X3 = R^2 - H^3
1324 #lea 32*9*6(%rsp), %rsi
1325 #lea 32*9*7(%rsp), %rdx
1326 #lea 32*9*5(%rsp), %rcx
1327 #lea 32*9*0($r_ptr), %rdi
1328 #call avx2_sub_x4
1329 #NORMALIZE
1330 #STORE
1331
1332 # X3 = X3 - U2*2
1333 #lea 32*9*0($r_ptr), %rsi
1334 #lea 32*9*0($r_ptr), %rdi
1335 #call avx2_sub_x4
1336 #NORMALIZE
1337 #STORE
1338
1339 lea `32*9*6+128`(%rsp), %rsi
1340 lea .LAVX2_POLY_x2+128(%rip), %rax
1341 lea `32*9*7+128`(%rsp), %rdx
1342 lea `32*9*5+128`(%rsp), %rcx
1343 lea `32*9*0`($r_ptr), %rdi
1344
1345 vmovdqa 32*0-128(%rsi), $ACC0
1346 vmovdqa 32*1-128(%rsi), $ACC1
1347 vmovdqa 32*2-128(%rsi), $ACC2
1348 vmovdqa 32*3-128(%rsi), $ACC3
1349 vmovdqa 32*4-128(%rsi), $ACC4
1350 vmovdqa 32*5-128(%rsi), $ACC5
1351 vmovdqa 32*6-128(%rsi), $ACC6
1352 vmovdqa 32*7-128(%rsi), $ACC7
1353 vmovdqa 32*8-128(%rsi), $ACC8
1354
1355 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1356 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1357 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1358 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1359 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1360 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1361 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1362 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1363 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1364
1365 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1366 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1367 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1368 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1369 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1370 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1371 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1372 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1373 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1374
1375 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1376 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1377 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1378 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1379 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1380 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1381 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1382 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1383 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1384 call avx2_normalize
1385
1386 lea 32*0($b_ptr), %rsi
1387 lea 32*0($a_ptr), %rdx
1388 call avx2_select_n_store
1389
1390 # H = U2 - X3
1391 lea `32*9*0`(%rsp), %rsi
1392 lea `32*9*0`($r_ptr), %rdx
1393 lea `32*9*3`(%rsp), %rdi
1394 call avx2_sub_x4
1395 call avx2_normalize_n_store
1396
1397 #
1398 lea `32*9*3`(%rsp), %rsi
1399 lea `32*9*4`(%rsp), %rdx
1400 lea `32*9*3`(%rsp), %rdi
1401 call avx2_mul_x4
1402 call avx2_normalize_n_store
1403
1404 #
1405 lea `32*9*7`(%rsp), %rsi
1406 lea `32*9*1`($a_ptr), %rdx
1407 lea `32*9*1`(%rsp), %rdi
1408 call avx2_mul_x4
1409 call avx2_normalize_n_store
1410
1411 #
1412 lea `32*9*3`(%rsp), %rsi
1413 lea `32*9*1`(%rsp), %rdx
1414 lea `32*9*1`($r_ptr), %rdi
1415 call avx2_sub_x4
1416 call avx2_normalize
1417
1418 lea 32*9($b_ptr), %rsi
1419 lea 32*9($a_ptr), %rdx
1420 call avx2_select_n_store
1421
1422 #lea 32*9*0($r_ptr), %rsi
1423 #lea 32*9*0($r_ptr), %rdi
1424 #call avx2_mul_by1_x4
1425 #NORMALIZE
1426 #STORE
1427
1428 lea `32*9*1`($r_ptr), %rsi
1429 lea `32*9*1`($r_ptr), %rdi
1430 call avx2_mul_by1_x4
1431 call avx2_normalize_n_store
1432
1433 vzeroupper
1434 ___
1435 $code.=<<___ if ($win64);
1436 movaps %xmm6, -16*10(%rbp)
1437 movaps %xmm7, -16*9(%rbp)
1438 movaps %xmm8, -16*8(%rbp)
1439 movaps %xmm9, -16*7(%rbp)
1440 movaps %xmm10, -16*6(%rbp)
1441 movaps %xmm11, -16*5(%rbp)
1442 movaps %xmm12, -16*4(%rbp)
1443 movaps %xmm13, -16*3(%rbp)
1444 movaps %xmm14, -16*2(%rbp)
1445 movaps %xmm15, -16*1(%rbp)
1446 ___
1447 $code.=<<___;
1448 mov %rbp, %rsp
1449 pop %rbp
1450 ret
1451 .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1452
1453 ################################################################################
1454 # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
1455 .globl ecp_nistz256_avx2_point_add_affines_x4
1456 .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1457 .align 32
1458 ecp_nistz256_avx2_point_add_affines_x4:
1459 mov %rsp, %rax
1460 push %rbp
1461 vzeroupper
1462 ___
1463 $code.=<<___ if ($win64);
1464 lea -16*10(%rsp), %rsp
1465 vmovaps %xmm6, -8-16*10(%rax)
1466 vmovaps %xmm7, -8-16*9(%rax)
1467 vmovaps %xmm8, -8-16*8(%rax)
1468 vmovaps %xmm9, -8-16*7(%rax)
1469 vmovaps %xmm10, -8-16*6(%rax)
1470 vmovaps %xmm11, -8-16*5(%rax)
1471 vmovaps %xmm12, -8-16*4(%rax)
1472 vmovaps %xmm13, -8-16*3(%rax)
1473 vmovaps %xmm14, -8-16*2(%rax)
1474 vmovaps %xmm15, -8-16*1(%rax)
1475 ___
1476 $code.=<<___;
1477 lea -8(%rax), %rbp
1478
1479 # Result + 32*0 = Result.X
1480 # Result + 32*9 = Result.Y
1481 # Result + 32*18 = Result.Z
1482
1483 # A + 32*0 = A.X
1484 # A + 32*9 = A.Y
1485
1486 # B + 32*0 = B.X
1487 # B + 32*9 = B.Y
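#
# Same flow as ecp_nistz256_avx2_point_add_affine_x4 above, except that both
# inputs are affine (Z1 = Z2 = 1 implicitly), so H = X2 - X1, R = Y2 - Y1
# and Z3 = H.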
1488
1489 sub \$`32*9*8+32*2+32*8`, %rsp
1490 and \$-64, %rsp
1491
1492 mov $r_ptr_in, $r_ptr
1493 mov $a_ptr_in, $a_ptr
1494 mov $b_ptr_in, $b_ptr
1495
1496 vmovdqa 32*0($a_ptr_in), %ymm0
1497 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1498 vpxor %ymm1, %ymm1, %ymm1
1499 lea 256($a_ptr_in), %rax # size optimization
1500 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1501 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1502 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1503 vpor 32*4-256(%rax), %ymm0, %ymm0
1504 lea 256(%rax), %rcx # size optimization
1505 vpor 32*5-256(%rax), %ymm0, %ymm0
1506 vpor 32*6-256(%rax), %ymm0, %ymm0
1507 vpor 32*7-256(%rax), %ymm0, %ymm0
1508 vpor 32*8-256(%rax), %ymm0, %ymm0
1509 vpor 32*9-256(%rax), %ymm0, %ymm0
1510 vpor 32*10-256(%rax), %ymm0, %ymm0
1511 vpor 32*11-256(%rax), %ymm0, %ymm0
1512 vpor 32*12-512(%rcx), %ymm0, %ymm0
1513 vpor 32*13-512(%rcx), %ymm0, %ymm0
1514 vpor 32*14-512(%rcx), %ymm0, %ymm0
1515 vpor 32*15-512(%rcx), %ymm0, %ymm0
1516 vpor 32*16-512(%rcx), %ymm0, %ymm0
1517 vpor 32*17-512(%rcx), %ymm0, %ymm0
1518 vpcmpeqq %ymm1, %ymm0, %ymm0
1519 vmovdqa %ymm0, `32*9*8`(%rsp)
1520
1521 vpxor %ymm1, %ymm1, %ymm1
1522 vmovdqa 32*0($b_ptr), %ymm0
1523 lea 256($b_ptr), %rax # size optimization
1524 vpor 32*1($b_ptr), %ymm0, %ymm0
1525 vpor 32*2($b_ptr), %ymm0, %ymm0
1526 vpor 32*3($b_ptr), %ymm0, %ymm0
1527 vpor 32*4-256(%rax), %ymm0, %ymm0
1528 lea 256(%rax), %rcx # size optimization
1529 vpor 32*5-256(%rax), %ymm0, %ymm0
1530 vpor 32*6-256(%rax), %ymm0, %ymm0
1531 vpor 32*7-256(%rax), %ymm0, %ymm0
1532 vpor 32*8-256(%rax), %ymm0, %ymm0
1533 vpor 32*9-256(%rax), %ymm0, %ymm0
1534 vpor 32*10-256(%rax), %ymm0, %ymm0
1535 vpor 32*11-256(%rax), %ymm0, %ymm0
1536 vpor 32*12-512(%rcx), %ymm0, %ymm0
1537 vpor 32*13-512(%rcx), %ymm0, %ymm0
1538 vpor 32*14-512(%rcx), %ymm0, %ymm0
1539 vpor 32*15-512(%rcx), %ymm0, %ymm0
1540 vpor 32*16-512(%rcx), %ymm0, %ymm0
1541 vpor 32*17-512(%rcx), %ymm0, %ymm0
1542 vpcmpeqq %ymm1, %ymm0, %ymm0
1543 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1544
1545 # H = U2 - U1 = X2 - X1
1546 lea `32*9*0`($b_ptr), %rsi
1547 lea `32*9*0`($a_ptr), %rdx
1548 lea `32*9*3`(%rsp), %rdi
1549 call avx2_sub_x4
1550 call avx2_normalize_n_store
1551
1552 # R = S2 - S1 = Y2 - Y1
1553 lea `32*9*1`($b_ptr), %rsi
1554 lea `32*9*1`($a_ptr), %rdx
1555 lea `32*9*4`(%rsp), %rdi
1556 call avx2_sub_x4
1557 call avx2_normalize_n_store
1558
1559 # Z3 = H*Z1*Z2 = H
1560 lea `32*9*3`(%rsp), %rsi
1561 lea `32*9*2`($r_ptr), %rdi
1562 call avx2_mul_by1_x4
1563 call avx2_normalize
1564
1565 vmovdqa `32*9*8`(%rsp), $B
1566 vpor `32*9*8+32`(%rsp), $B, $B
1567
1568 vpandn $ACC0, $B, $ACC0
1569 lea .LONE+128(%rip), %rax
1570 vpandn $ACC1, $B, $ACC1
1571 vpandn $ACC2, $B, $ACC2
1572 vpandn $ACC3, $B, $ACC3
1573 vpandn $ACC4, $B, $ACC4
1574 vpandn $ACC5, $B, $ACC5
1575 vpandn $ACC6, $B, $ACC6
1576 vpandn $ACC7, $B, $ACC7
1577
1578 vpand 32*0-128(%rax), $B, $T0
1579 vpandn $ACC8, $B, $ACC8
1580 vpand 32*1-128(%rax), $B, $Y
1581 vpxor $T0, $ACC0, $ACC0
1582 vpand 32*2-128(%rax), $B, $T0
1583 vpxor $Y, $ACC1, $ACC1
1584 vpand 32*3-128(%rax), $B, $Y
1585 vpxor $T0, $ACC2, $ACC2
1586 vpand 32*4-128(%rax), $B, $T0
1587 vpxor $Y, $ACC3, $ACC3
1588 vpand 32*5-128(%rax), $B, $Y
1589 vpxor $T0, $ACC4, $ACC4
1590 vpand 32*6-128(%rax), $B, $T0
1591 vpxor $Y, $ACC5, $ACC5
1592 vpand 32*7-128(%rax), $B, $Y
1593 vpxor $T0, $ACC6, $ACC6
1594 vpand 32*8-128(%rax), $B, $T0
1595 vpxor $Y, $ACC7, $ACC7
1596 vpxor $T0, $ACC8, $ACC8
1597 `&STORE`
1598
1599 # R^2 = R^2
1600 lea `32*9*4`(%rsp), %rsi
1601 lea `32*9*6`(%rsp), %rdi
1602 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1603 call avx2_sqr_x4
1604 call avx2_normalize_n_store
1605
1606 # H^2 = H^2
1607 lea `32*9*3`(%rsp), %rsi
1608 lea `32*9*5`(%rsp), %rdi
1609 call avx2_sqr_x4
1610 call avx2_normalize_n_store
1611
1612 # H^3 = H^2*H
1613 lea `32*9*3`(%rsp), %rsi
1614 lea `32*9*5`(%rsp), %rdx
1615 lea `32*9*7`(%rsp), %rdi
1616 call avx2_mul_x4
1617 call avx2_normalize_n_store
1618
1619 # U2 = U1*H^2
1620 lea `32*9*0`($a_ptr), %rsi
1621 lea `32*9*5`(%rsp), %rdx
1622 lea `32*9*0`(%rsp), %rdi
1623 call avx2_mul_x4
1624 #call avx2_normalize
1625 `&STORE`
1626
1627 # Hsqr = U2*2
1628 #lea 32*9*0(%rsp), %rsi
1629 #lea 32*9*5(%rsp), %rdi
1630 #call avx2_mul_by2_x4
1631
1632 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1633 lea `32*9*5`(%rsp), %rdi
1634 vpaddq $ACC1, $ACC1, $ACC1
1635 vpaddq $ACC2, $ACC2, $ACC2
1636 vpaddq $ACC3, $ACC3, $ACC3
1637 vpaddq $ACC4, $ACC4, $ACC4
1638 vpaddq $ACC5, $ACC5, $ACC5
1639 vpaddq $ACC6, $ACC6, $ACC6
1640 vpaddq $ACC7, $ACC7, $ACC7
1641 vpaddq $ACC8, $ACC8, $ACC8
1642 call avx2_normalize_n_store
1643
1644 # X3 = R^2 - H^3
1645 #lea 32*9*6(%rsp), %rsi
1646 #lea 32*9*7(%rsp), %rdx
1647 #lea 32*9*5(%rsp), %rcx
1648 #lea 32*9*0($r_ptr), %rdi
1649 #call avx2_sub_x4
1650 #NORMALIZE
1651 #STORE
1652
1653 # X3 = X3 - U2*2
1654 #lea 32*9*0($r_ptr), %rsi
1655 #lea 32*9*0($r_ptr), %rdi
1656 #call avx2_sub_x4
1657 #NORMALIZE
1658 #STORE
1659
1660 lea `32*9*6+128`(%rsp), %rsi
1661 lea .LAVX2_POLY_x2+128(%rip), %rax
1662 lea `32*9*7+128`(%rsp), %rdx
1663 lea `32*9*5+128`(%rsp), %rcx
1664 lea `32*9*0`($r_ptr), %rdi
1665
1666 vmovdqa 32*0-128(%rsi), $ACC0
1667 vmovdqa 32*1-128(%rsi), $ACC1
1668 vmovdqa 32*2-128(%rsi), $ACC2
1669 vmovdqa 32*3-128(%rsi), $ACC3
1670 vmovdqa 32*4-128(%rsi), $ACC4
1671 vmovdqa 32*5-128(%rsi), $ACC5
1672 vmovdqa 32*6-128(%rsi), $ACC6
1673 vmovdqa 32*7-128(%rsi), $ACC7
1674 vmovdqa 32*8-128(%rsi), $ACC8
1675
1676 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1677 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1678 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1679 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1680 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1681 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1682 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1683 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1684 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1685
1686 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1687 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1688 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1689 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1690 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1691 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1692 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1693 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1694 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1695
1696 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1697 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1698 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1699 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1700 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1701 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1702 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1703 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1704 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1705 call avx2_normalize
1706
1707 lea 32*0($b_ptr), %rsi
1708 lea 32*0($a_ptr), %rdx
1709 call avx2_select_n_store
1710
1711 # H = U2 - X3
1712 lea `32*9*0`(%rsp), %rsi
1713 lea `32*9*0`($r_ptr), %rdx
1714 lea `32*9*3`(%rsp), %rdi
1715 call avx2_sub_x4
1716 call avx2_normalize_n_store
1717
1718 # H = H*R
1719 lea `32*9*3`(%rsp), %rsi
1720 lea `32*9*4`(%rsp), %rdx
1721 lea `32*9*3`(%rsp), %rdi
1722 call avx2_mul_x4
1723 call avx2_normalize_n_store
1724
1725 # S2 = S1 * H^3
1726 lea `32*9*7`(%rsp), %rsi
1727 lea `32*9*1`($a_ptr), %rdx
1728 lea `32*9*1`(%rsp), %rdi
1729 call avx2_mul_x4
1730 call avx2_normalize_n_store
1731
1732 #
1733 lea `32*9*3`(%rsp), %rsi
1734 lea `32*9*1`(%rsp), %rdx
1735 lea `32*9*1`($r_ptr), %rdi
1736 call avx2_sub_x4
1737 call avx2_normalize
1738
1739 lea 32*9($b_ptr), %rsi
1740 lea 32*9($a_ptr), %rdx
1741 call avx2_select_n_store
1742
1743 #lea 32*9*0($r_ptr), %rsi
1744 #lea 32*9*0($r_ptr), %rdi
1745 #call avx2_mul_by1_x4
1746 #NORMALIZE
1747 #STORE
1748
1749 lea `32*9*1`($r_ptr), %rsi
1750 lea `32*9*1`($r_ptr), %rdi
1751 call avx2_mul_by1_x4
1752 call avx2_normalize_n_store
1753
1754 vzeroupper
1755 ___
1756 $code.=<<___ if ($win64);
1757 movaps %xmm6, -16*10(%rbp)
1758 movaps %xmm7, -16*9(%rbp)
1759 movaps %xmm8, -16*8(%rbp)
1760 movaps %xmm9, -16*7(%rbp)
1761 movaps %xmm10, -16*6(%rbp)
1762 movaps %xmm11, -16*5(%rbp)
1763 movaps %xmm12, -16*4(%rbp)
1764 movaps %xmm13, -16*3(%rbp)
1765 movaps %xmm14, -16*2(%rbp)
1766 movaps %xmm15, -16*1(%rbp)
1767 ___
1768 $code.=<<___;
1769 mov %rbp, %rsp
1770 pop %rbp
1771 ret
1772 .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1773
1774 ################################################################################
1775 # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
1776 .globl ecp_nistz256_avx2_to_mont
1777 .type ecp_nistz256_avx2_to_mont,\@function,2
1778 .align 32
1779 ecp_nistz256_avx2_to_mont:
1780 vzeroupper
1781 ___
1782 $code.=<<___ if ($win64);
1783 lea -8-16*10(%rsp), %rsp
1784 vmovaps %xmm6, -8-16*10(%rax)
1785 vmovaps %xmm7, -8-16*9(%rax)
1786 vmovaps %xmm8, -8-16*8(%rax)
1787 vmovaps %xmm9, -8-16*7(%rax)
1788 vmovaps %xmm10, -8-16*6(%rax)
1789 vmovaps %xmm11, -8-16*5(%rax)
1790 vmovaps %xmm12, -8-16*4(%rax)
1791 vmovaps %xmm13, -8-16*3(%rax)
1792 vmovaps %xmm14, -8-16*2(%rax)
1793 vmovaps %xmm15, -8-16*1(%rax)
1794 ___
1795 $code.=<<___;
1796 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1797 lea .LTO_MONT_AVX2(%rip), %rdx
1798 call avx2_mul_x4
1799 call avx2_normalize_n_store
1800
1801 vzeroupper
1802 ___
1803 $code.=<<___ if ($win64);
1804 movaps 16*0(%rsp), %xmm6
1805 movaps 16*1(%rsp), %xmm7
1806 movaps 16*2(%rsp), %xmm8
1807 movaps 16*3(%rsp), %xmm9
1808 movaps 16*4(%rsp), %xmm10
1809 movaps 16*5(%rsp), %xmm11
1810 movaps 16*6(%rsp), %xmm12
1811 movaps 16*7(%rsp), %xmm13
1812 movaps 16*8(%rsp), %xmm14
1813 movaps 16*9(%rsp), %xmm15
1814 lea 8+16*10(%rsp), %rsp
1815 ___
1816 $code.=<<___;
1817 ret
1818 .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1819
1820 ################################################################################
1821 # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
1822 .globl ecp_nistz256_avx2_from_mont
1823 .type ecp_nistz256_avx2_from_mont,\@function,2
1824 .align 32
1825 ecp_nistz256_avx2_from_mont:
1826 vzeroupper
1827 ___
1828 $code.=<<___ if ($win64);
1829 lea -8-16*10(%rsp), %rsp
1830 vmovaps %xmm6, -8-16*10(%rax)
1831 vmovaps %xmm7, -8-16*9(%rax)
1832 vmovaps %xmm8, -8-16*8(%rax)
1833 vmovaps %xmm9, -8-16*7(%rax)
1834 vmovaps %xmm10, -8-16*6(%rax)
1835 vmovaps %xmm11, -8-16*5(%rax)
1836 vmovaps %xmm12, -8-16*4(%rax)
1837 vmovaps %xmm13, -8-16*3(%rax)
1838 vmovaps %xmm14, -8-16*2(%rax)
1839 vmovaps %xmm15, -8-16*1(%rax)
1840 ___
1841 $code.=<<___;
1842 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1843 lea .LFROM_MONT_AVX2(%rip), %rdx
1844 call avx2_mul_x4
1845 call avx2_normalize_n_store
1846
1847 vzeroupper
1848 ___
1849 $code.=<<___ if ($win64);
1850 movaps 16*0(%rsp), %xmm6
1851 movaps 16*1(%rsp), %xmm7
1852 movaps 16*2(%rsp), %xmm8
1853 movaps 16*3(%rsp), %xmm9
1854 movaps 16*4(%rsp), %xmm10
1855 movaps 16*5(%rsp), %xmm11
1856 movaps 16*6(%rsp), %xmm12
1857 movaps 16*7(%rsp), %xmm13
1858 movaps 16*8(%rsp), %xmm14
1859 movaps 16*9(%rsp), %xmm15
1860 lea 8+16*10(%rsp), %rsp
1861 ___
1862 $code.=<<___;
1863 ret
1864 .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
1865
1866 ################################################################################
1867 # void ecp_nistz256_avx2_set1(void* RESULTx4);
1868 .globl ecp_nistz256_avx2_set1
1869 .type ecp_nistz256_avx2_set1,\@function,1
1870 .align 32
1871 ecp_nistz256_avx2_set1:
1872 lea .LONE+128(%rip), %rax
1873 lea 128(%rdi), %rdi
1874 vzeroupper
1875 vmovdqa 32*0-128(%rax), %ymm0
1876 vmovdqa 32*1-128(%rax), %ymm1
1877 vmovdqa 32*2-128(%rax), %ymm2
1878 vmovdqa 32*3-128(%rax), %ymm3
1879 vmovdqa 32*4-128(%rax), %ymm4
1880 vmovdqa 32*5-128(%rax), %ymm5
1881 vmovdqa %ymm0, 32*0-128(%rdi)
1882 vmovdqa 32*6-128(%rax), %ymm0
1883 vmovdqa %ymm1, 32*1-128(%rdi)
1884 vmovdqa 32*7-128(%rax), %ymm1
1885 vmovdqa %ymm2, 32*2-128(%rdi)
1886 vmovdqa 32*8-128(%rax), %ymm2
1887 vmovdqa %ymm3, 32*3-128(%rdi)
1888 vmovdqa %ymm4, 32*4-128(%rdi)
1889 vmovdqa %ymm5, 32*5-128(%rdi)
1890 vmovdqa %ymm0, 32*6-128(%rdi)
1891 vmovdqa %ymm1, 32*7-128(%rdi)
1892 vmovdqa %ymm2, 32*8-128(%rdi)
1893
1894 vzeroupper
1895 ret
1896 .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
1897 ___
1898 }
1899 {
1900 ################################################################################
1901 # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
1902 # int index0, int index1, int index2, int index3);
1903 ################################################################################
1904
1905 my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
1906 my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
1907 my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
1908 my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
1909
1910 $code.=<<___;
1911 .globl ecp_nistz256_avx2_multi_gather_w7
1912 .type ecp_nistz256_avx2_multi_gather_w7,\@function,6
1913 .align 32
1914 ecp_nistz256_avx2_multi_gather_w7:
1915 vzeroupper
1916 ___
1917 $code.=<<___ if ($win64);
1918 lea -8-16*10(%rsp), %rsp
1919 vmovaps %xmm6, -8-16*10(%rax)
1920 vmovaps %xmm7, -8-16*9(%rax)
1921 vmovaps %xmm8, -8-16*8(%rax)
1922 vmovaps %xmm9, -8-16*7(%rax)
1923 vmovaps %xmm10, -8-16*6(%rax)
1924 vmovaps %xmm11, -8-16*5(%rax)
1925 vmovaps %xmm12, -8-16*4(%rax)
1926 vmovaps %xmm13, -8-16*3(%rax)
1927 vmovaps %xmm14, -8-16*2(%rax)
1928 vmovaps %xmm15, -8-16*1(%rax)
1929 ___
1930 $code.=<<___;
1931 lea .LIntOne(%rip), %rax
1932
1933 vmovd $index0, %xmm0
1934 vmovd $index1, %xmm1
1935 vmovd $index2, %xmm2
1936 vmovd $index3, %xmm3
1937
1938 vpxor $R0a, $R0a, $R0a
1939 vpxor $R0b, $R0b, $R0b
1940 vpxor $R1a, $R1a, $R1a
1941 vpxor $R1b, $R1b, $R1b
1942 vpxor $R2a, $R2a, $R2a
1943 vpxor $R2b, $R2b, $R2b
1944 vpxor $R3a, $R3a, $R3a
1945 vpxor $R3b, $R3b, $R3b
1946 vmovdqa (%rax), $M0
1947
1948 vpermd $INDEX0, $R0a, $INDEX0
1949 vpermd $INDEX1, $R0a, $INDEX1
1950 vpermd $INDEX2, $R0a, $INDEX2
1951 vpermd $INDEX3, $R0a, $INDEX3
1952
1953 mov \$64, %ecx
1954 lea 112($val), $val # size optimization
1955 jmp .Lmulti_select_loop_avx2
1956
1957 # INDEX=0, corresponds to the point at infty (0,0)
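# The scan below is constant-time: M0 runs through 1..64 (incremented by
# .LIntOne every pass), vpcmpeqd turns it into an all-ones mask only for the
# entry whose position matches the requested index, and that mask gates which
# table row gets XORed into the accumulators.  All 64 entries are read
# regardless of the index values.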
1958 .align 32
1959 .Lmulti_select_loop_avx2:
1960 vpcmpeqd $INDEX0, $M0, $TMP0
1961
1962 vmovdqa `32*0+32*64*2*0`($in_t), $T0
1963 vmovdqa `32*1+32*64*2*0`($in_t), $T1
1964 vpand $TMP0, $T0, $T0
1965 vpand $TMP0, $T1, $T1
1966 vpxor $T0, $R0a, $R0a
1967 vpxor $T1, $R0b, $R0b
1968
1969 vpcmpeqd $INDEX1, $M0, $TMP0
1970
1971 vmovdqa `32*0+32*64*2*1`($in_t), $T0
1972 vmovdqa `32*1+32*64*2*1`($in_t), $T1
1973 vpand $TMP0, $T0, $T0
1974 vpand $TMP0, $T1, $T1
1975 vpxor $T0, $R1a, $R1a
1976 vpxor $T1, $R1b, $R1b
1977
1978 vpcmpeqd $INDEX2, $M0, $TMP0
1979
1980 vmovdqa `32*0+32*64*2*2`($in_t), $T0
1981 vmovdqa `32*1+32*64*2*2`($in_t), $T1
1982 vpand $TMP0, $T0, $T0
1983 vpand $TMP0, $T1, $T1
1984 vpxor $T0, $R2a, $R2a
1985 vpxor $T1, $R2b, $R2b
1986
1987 vpcmpeqd $INDEX3, $M0, $TMP0
1988
1989 vmovdqa `32*0+32*64*2*3`($in_t), $T0
1990 vmovdqa `32*1+32*64*2*3`($in_t), $T1
1991 vpand $TMP0, $T0, $T0
1992 vpand $TMP0, $T1, $T1
1993 vpxor $T0, $R3a, $R3a
1994 vpxor $T1, $R3b, $R3b
1995
1996 vpaddd (%rax), $M0, $M0 # increment
1997 lea 32*2($in_t), $in_t
1998
1999 dec %ecx
2000 jnz .Lmulti_select_loop_avx2
2001
2002 vmovdqu $R0a, 32*0-112($val)
2003 vmovdqu $R0b, 32*1-112($val)
2004 vmovdqu $R1a, 32*2-112($val)
2005 vmovdqu $R1b, 32*3-112($val)
2006 vmovdqu $R2a, 32*4-112($val)
2007 vmovdqu $R2b, 32*5-112($val)
2008 vmovdqu $R3a, 32*6-112($val)
2009 vmovdqu $R3b, 32*7-112($val)
2010
2011 vzeroupper
2012 ___
2013 $code.=<<___ if ($win64);
2014 movaps 16*0(%rsp), %xmm6
2015 movaps 16*1(%rsp), %xmm7
2016 movaps 16*2(%rsp), %xmm8
2017 movaps 16*3(%rsp), %xmm9
2018 movaps 16*4(%rsp), %xmm10
2019 movaps 16*5(%rsp), %xmm11
2020 movaps 16*6(%rsp), %xmm12
2021 movaps 16*7(%rsp), %xmm13
2022 movaps 16*8(%rsp), %xmm14
2023 movaps 16*9(%rsp), %xmm15
2024 lea 8+16*10(%rsp), %rsp
2025 ___
2026 $code.=<<___;
2027 ret
2028 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2029
2030 .extern OPENSSL_ia32cap_P
2031 .globl ecp_nistz_avx2_eligible
2032 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2033 .align 32
2034 ecp_nistz_avx2_eligible:
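# Returns 1 if the AVX2 bit is set (bit 5 of the third OPENSSL_ia32cap_P
# dword, which mirrors CPUID.(EAX=7,ECX=0):EBX), 0 otherwise.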
2035 mov OPENSSL_ia32cap_P+8(%rip),%eax
2036 shr \$5,%eax
2037 and \$1,%eax
2038 ret
2039 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2040 ___
2041 }
2042 }} else {{ # assembler is too old
2043 $code.=<<___;
2044 .text
2045
2046 .globl ecp_nistz256_avx2_transpose_convert
2047 .globl ecp_nistz256_avx2_convert_transpose_back
2048 .globl ecp_nistz256_avx2_point_add_affine_x4
2049 .globl ecp_nistz256_avx2_point_add_affines_x4
2050 .globl ecp_nistz256_avx2_to_mont
2051 .globl ecp_nistz256_avx2_from_mont
2052 .globl ecp_nistz256_avx2_set1
2053 .globl ecp_nistz256_avx2_multi_gather_w7
2054 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
2055 ecp_nistz256_avx2_transpose_convert:
2056 ecp_nistz256_avx2_convert_transpose_back:
2057 ecp_nistz256_avx2_point_add_affine_x4:
2058 ecp_nistz256_avx2_point_add_affines_x4:
2059 ecp_nistz256_avx2_to_mont:
2060 ecp_nistz256_avx2_from_mont:
2061 ecp_nistz256_avx2_set1:
2062 ecp_nistz256_avx2_multi_gather_w7:
2063 .byte 0x0f,0x0b # ud2
2064 ret
2065 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2066
2067 .globl ecp_nistz_avx2_eligible
2068 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2069 ecp_nistz_avx2_eligible:
2070 xor %eax,%eax
2071 ret
2072 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2073 ___
2074 }}
2075
2076 foreach (split("\n",$code)) {
2077 s/\`([^\`]*)\`/eval($1)/geo;
2078
2079 print $_,"\n";
2080 }
2081
2082 close STDOUT or die "error closing STDOUT: $!";