]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. |
dcf6e50f | 3 | # Copyright (c) 2014, Intel Corporation. All Rights Reserved. |
eb791696 | 4 | # Copyright (c) 2015 CloudFlare, Inc. |
6aa36e8e | 5 | # |
a7f182b7 | 6 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
7 | # this file except in compliance with the License. You can obtain a copy |
8 | # in the file LICENSE in the source distribution or at | |
9 | # https://www.openssl.org/source/license.html | |
dcf6e50f | 10 | # |
eb791696 | 11 | # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) |
dcf6e50f RS |
12 | # (1) Intel Corporation, Israel Development Center, Haifa, Israel |
13 | # (2) University of Haifa, Israel | |
eb791696 | 14 | # (3) CloudFlare, Inc. |
dcf6e50f RS |
15 | # |
16 | # Reference: | |
17 | # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with | |
18 | # 256 Bit Primes" | |
4d3fa06f AP |
19 | |
20 | # Further optimization by <appro@openssl.org>: | |
21 | # | |
f0e6871d | 22 | # this/original with/without -DECP_NISTZ256_ASM(*) |
eb791696 AP |
23 | # Opteron +15-49% +150-195% |
24 | # Bulldozer +18-45% +175-240% | |
25 | # P4 +24-46% +100-150% | |
26 | # Westmere +18-34% +87-160% | |
27 | # Sandy Bridge +14-35% +120-185% | |
28 | # Ivy Bridge +11-35% +125-180% | |
29 | # Haswell +10-37% +160-200% | |
30 | # Broadwell +24-58% +210-270% | |
31 | # Atom +20-50% +180-240% | |
32 | # VIA Nano +50-160% +480-480% | |
f0e6871d AP |
33 | # |
34 | # (*) "without -DECP_NISTZ256_ASM" refers to build with | |
35 | # "enable-ec_nistp_64_gcc_128"; | |
4d3fa06f AP |
36 | # |
37 | # Ranges denote minimum and maximum improvement coefficients depending | |
eb791696 AP |
38 | # on benchmark. In the "this/original" column the lower coefficient is |
39 | # for ECDSA sign; in "with/without" the lower one is for ECDH key | |
40 | # agreement and the higher one for ECDSA sign, relatively the fastest | |
41 | # server-side operation. Keep in mind that +100% means 2x improvement. | |
4d3fa06f | 42 | |
1aa89a7a RL |
43 | # $output is the last argument if it looks like a file (it has an extension) |
44 | # $flavour is the first argument if it doesn't look like a file | |
# Command-line split: `pop` removes the output file from the end of @ARGV
# when the last argument ends in ".<word chars>"; `shift` removes the flavour
# from the front when the first argument contains no dot. Each variable is
# left undef when its test fails.
45 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
46 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
4d3fa06f AP |
47 | |
# $win64 is set when the flavour mentions nasm/masm/mingw64 or the output
# file ends in ".asm". Below it gates the NASM and MASM version probes.
48 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
49 | ||
# $dir is the directory part of $0, up to and including the last "/" or "\".
# NOTE(review): when $0 contains no separator the match fails and $1 is not
# refreshed, so $dir is presumably empty and the first candidate below is
# resolved against the current directory -- confirm.
# The translator is looked up next to this script first, then under
# ../../perlasm/ relative to it; the script dies if neither file exists.
50 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
51 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
52 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
53 | die "can't locate x86_64-xlate.pl"; | |
54 | ||
# Start x86_64-xlate.pl under the same perl binary ($^X), passing it the
# flavour and the output file name, and alias STDOUT to the pipe so that
# everything this script prints to STDOUT is fed to the translator.
1aa89a7a RL |
55 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" |
56 | or die "can't call $xlate: $!"; | |
4d3fa06f AP |
57 | *STDOUT=*OUT; |
58 | ||
# Toolchain probes. Each probe that recognises its tool sets $avx to the sum
# of two version tests (0, 1 or 2) and $addx to a boolean. Every probe after
# the first runs only while $addx is still false. $addx controls whether the
# "*_montx" dispatch prologues are emitted in the ord_* routines below.
# NOTE(review): $avx is presumably consumed further down the file -- confirm.
#
# GNU assembler, queried through the compiler driver named by $ENV{CC}:
# 2.19 and 2.22 each add one to $avx, 2.23 turns on $addx. The captured
# "major.minor" text is compared as a decimal number.
# NOTE(review): a single-digit minor such as "2.9" therefore compares above
# 2.19 -- confirm this is acceptable for the supported toolchains.
59 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
60 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
61 | $avx = ($1>=2.19) + ($1>=2.22); | |
62 | $addx = ($1>=2.23); | |
63 | } | |
64 | ||
# NASM, only for win64 builds that select nasm through the flavour or
# $ENV{ASM}: 2.09 and 2.10 each add one to $avx, 2.10 turns on $addx.
65 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
66 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
67 | $avx = ($1>=2.09) + ($1>=2.10); | |
68 | $addx = ($1>=2.10); | |
69 | } | |
70 | ||
# MASM (ml64), only for win64 builds that select it through the flavour or
# $ENV{ASM}: major versions 10 and 11 each add one to $avx, 12 turns on $addx.
71 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
72 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
73 | $avx = ($1>=10) + ($1>=11); | |
74 | $addx = ($1>=12); | |
75 | } | |
76 | ||
# clang / LLVM-based $ENV{CC}: the version is folded into major + minor/100
# so that minor 1 and minor 10 order correctly. 3.00 and 3.01 each add one to
# $avx, 3.03 turns on $addx.
9bb3e5fd | 77 | if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { |
4d3fa06f AP |
78 | my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 |
79 | $avx = ($ver>=3.0) + ($ver>=3.01); | |
80 | $addx = ($ver>=3.03); | |
81 | } | |
82 | ||
# Data section appended to $code ahead of the routines. Every 256-bit value
# is stored as four 64-bit limbs, least-significant limb first (the routines
# pair limb 0 with sub/add and limbs 1..3 with sbb/adc).
#
#   .Lpoly      modulus used by the add/sub/neg/mul_by_N/div_by_2 routines.
#               mul_by_3 reads only limbs 1 and 3 from memory and substitutes
#               the immediates -1 and 0 for limbs 0 and 2.
#   .LONE_mont  equals 2^256 - .Lpoly (adding the two limb by limb gives
#               2^256); the name suggests the value 1 in Montgomery form.
#   .Lord       modulus used by the ecp_nistz256_ord_* routines.
#   .LordK      64-bit factor that the ord_* routines multiply (imulq) into
#               the low accumulator limb before each reduction step.
#
# Layout dependency: .LordK must stay immediately after the four quadwords of
# .Lord, because ecp_nistz256_ord_sqr_mont addresses it as 8*4 bytes past the
# .Lord pointer rather than by label.
#
# OPENSSL_ia32cap_P is only declared here; the ord_* routines test bits
# 0x80100 of the word at offset 8 when $addx is set.
83 | $code.=<<___; | |
84 | .text | |
85 | .extern OPENSSL_ia32cap_P | |
86 | ||
87 | # The polynomial | |
88 | .align 64 | |
89 | .Lpoly: | |
90 | .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 | |
91 | ||
92 | # 2^512 mod P precomputed for NIST P256 polynomial | |
93 | .LRR: | |
94 | .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd | |
95 | ||
96 | .LOne: | |
97 | .long 1,1,1,1,1,1,1,1 | |
98 | .LTwo: | |
99 | .long 2,2,2,2,2,2,2,2 | |
100 | .LThree: | |
101 | .long 3,3,3,3,3,3,3,3 | |
102 | .LONE_mont: | |
103 | .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe | |
eb791696 AP |
104 | |
105 | # Constants for computations modulo ord(p256) | |
106 | .Lord: | |
107 | .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 | |
108 | .LordK: | |
109 | .quad 0xccd1c8aaee00bc4f | |
4d3fa06f AP |
110 | ___ |
111 | ||
112 | { | |
113 | ################################################################################ | |
114 | # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); | |
115 | ||
# Register aliases shared by the routines of this block (mul_by_2, div_by_2,
# mul_by_3, add, sub, neg).
#
# $a0..$a3 are %r8..%r11 and hold the four 64-bit limbs being computed.
116 | my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); | |
# $t0..$t3 keep a copy of the limbs taken before the conditional correction
# by .Lpoly, so that cmov can pick either version. $t4 collects the carry or
# borrow out of the top limb. $t3 and $t4 are %r12 and %r13, which every
# routine pushes on entry and reloads before returning.
117 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); | |
# Pointer arguments. $b_ptr and $t1 are both %rdx: add and sub write $t1 only
# after their last load through $b_ptr, so that ordering must be preserved.
118 | my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); | |
119 | ||
120 | $code.=<<___; | |
121 | ||
122 | .globl ecp_nistz256_mul_by_2 | |
123 | .type ecp_nistz256_mul_by_2,\@function,2 | |
124 | .align 64 | |
125 | ecp_nistz256_mul_by_2: | |
86e11278 | 126 | .cfi_startproc |
4d3fa06f | 127 | push %r12 |
86e11278 | 128 | .cfi_push %r12 |
4d3fa06f | 129 | push %r13 |
86e11278 | 130 | .cfi_push %r13 |
384e6de4 | 131 | .Lmul_by_2_body: |
4d3fa06f AP |
132 | |
133 | mov 8*0($a_ptr), $a0 | |
b62b2454 | 134 | xor $t4,$t4 |
4d3fa06f AP |
135 | mov 8*1($a_ptr), $a1 |
136 | add $a0, $a0 # a0:a3+a0:a3 | |
137 | mov 8*2($a_ptr), $a2 | |
138 | adc $a1, $a1 | |
139 | mov 8*3($a_ptr), $a3 | |
140 | lea .Lpoly(%rip), $a_ptr | |
141 | mov $a0, $t0 | |
142 | adc $a2, $a2 | |
143 | adc $a3, $a3 | |
144 | mov $a1, $t1 | |
b62b2454 | 145 | adc \$0, $t4 |
4d3fa06f AP |
146 | |
147 | sub 8*0($a_ptr), $a0 | |
148 | mov $a2, $t2 | |
149 | sbb 8*1($a_ptr), $a1 | |
150 | sbb 8*2($a_ptr), $a2 | |
151 | mov $a3, $t3 | |
152 | sbb 8*3($a_ptr), $a3 | |
b62b2454 | 153 | sbb \$0, $t4 |
4d3fa06f | 154 | |
d3034d31 AP |
155 | cmovc $t0, $a0 |
156 | cmovc $t1, $a1 | |
4d3fa06f | 157 | mov $a0, 8*0($r_ptr) |
d3034d31 | 158 | cmovc $t2, $a2 |
4d3fa06f | 159 | mov $a1, 8*1($r_ptr) |
d3034d31 | 160 | cmovc $t3, $a3 |
4d3fa06f AP |
161 | mov $a2, 8*2($r_ptr) |
162 | mov $a3, 8*3($r_ptr) | |
163 | ||
384e6de4 | 164 | mov 0(%rsp),%r13 |
86e11278 | 165 | .cfi_restore %r13 |
384e6de4 | 166 | mov 8(%rsp),%r12 |
86e11278 | 167 | .cfi_restore %r12 |
384e6de4 | 168 | lea 16(%rsp),%rsp |
86e11278 | 169 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 170 | .Lmul_by_2_epilogue: |
4d3fa06f | 171 | ret |
86e11278 | 172 | .cfi_endproc |
4d3fa06f AP |
173 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 |
174 | ||
175 | ################################################################################ | |
176 | # void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]); | |
177 | .globl ecp_nistz256_div_by_2 | |
178 | .type ecp_nistz256_div_by_2,\@function,2 | |
179 | .align 32 | |
180 | ecp_nistz256_div_by_2: | |
86e11278 | 181 | .cfi_startproc |
4d3fa06f | 182 | push %r12 |
86e11278 | 183 | .cfi_push %r12 |
4d3fa06f | 184 | push %r13 |
86e11278 | 185 | .cfi_push %r13 |
384e6de4 | 186 | .Ldiv_by_2_body: |
4d3fa06f AP |
187 | |
188 | mov 8*0($a_ptr), $a0 | |
189 | mov 8*1($a_ptr), $a1 | |
190 | mov 8*2($a_ptr), $a2 | |
191 | mov $a0, $t0 | |
192 | mov 8*3($a_ptr), $a3 | |
193 | lea .Lpoly(%rip), $a_ptr | |
194 | ||
195 | mov $a1, $t1 | |
196 | xor $t4, $t4 | |
197 | add 8*0($a_ptr), $a0 | |
198 | mov $a2, $t2 | |
199 | adc 8*1($a_ptr), $a1 | |
200 | adc 8*2($a_ptr), $a2 | |
201 | mov $a3, $t3 | |
202 | adc 8*3($a_ptr), $a3 | |
203 | adc \$0, $t4 | |
204 | xor $a_ptr, $a_ptr # borrow $a_ptr | |
205 | test \$1, $t0 | |
206 | ||
207 | cmovz $t0, $a0 | |
208 | cmovz $t1, $a1 | |
209 | cmovz $t2, $a2 | |
210 | cmovz $t3, $a3 | |
211 | cmovz $a_ptr, $t4 | |
212 | ||
213 | mov $a1, $t0 # a0:a3>>1 | |
214 | shr \$1, $a0 | |
215 | shl \$63, $t0 | |
216 | mov $a2, $t1 | |
217 | shr \$1, $a1 | |
218 | or $t0, $a0 | |
219 | shl \$63, $t1 | |
220 | mov $a3, $t2 | |
221 | shr \$1, $a2 | |
222 | or $t1, $a1 | |
223 | shl \$63, $t2 | |
224 | shr \$1, $a3 | |
225 | shl \$63, $t4 | |
226 | or $t2, $a2 | |
227 | or $t4, $a3 | |
228 | ||
229 | mov $a0, 8*0($r_ptr) | |
230 | mov $a1, 8*1($r_ptr) | |
231 | mov $a2, 8*2($r_ptr) | |
232 | mov $a3, 8*3($r_ptr) | |
233 | ||
384e6de4 | 234 | mov 0(%rsp),%r13 |
86e11278 | 235 | .cfi_restore %r13 |
384e6de4 | 236 | mov 8(%rsp),%r12 |
86e11278 | 237 | .cfi_restore %r12 |
384e6de4 | 238 | lea 16(%rsp),%rsp |
86e11278 | 239 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 240 | .Ldiv_by_2_epilogue: |
4d3fa06f | 241 | ret |
86e11278 | 242 | .cfi_endproc |
4d3fa06f AP |
243 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 |
244 | ||
245 | ################################################################################ | |
246 | # void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]); | |
247 | .globl ecp_nistz256_mul_by_3 | |
248 | .type ecp_nistz256_mul_by_3,\@function,2 | |
249 | .align 32 | |
250 | ecp_nistz256_mul_by_3: | |
86e11278 | 251 | .cfi_startproc |
4d3fa06f | 252 | push %r12 |
86e11278 | 253 | .cfi_push %r12 |
4d3fa06f | 254 | push %r13 |
86e11278 | 255 | .cfi_push %r13 |
384e6de4 | 256 | .Lmul_by_3_body: |
4d3fa06f AP |
257 | |
258 | mov 8*0($a_ptr), $a0 | |
259 | xor $t4, $t4 | |
260 | mov 8*1($a_ptr), $a1 | |
261 | add $a0, $a0 # a0:a3+a0:a3 | |
262 | mov 8*2($a_ptr), $a2 | |
263 | adc $a1, $a1 | |
264 | mov 8*3($a_ptr), $a3 | |
265 | mov $a0, $t0 | |
266 | adc $a2, $a2 | |
267 | adc $a3, $a3 | |
268 | mov $a1, $t1 | |
269 | adc \$0, $t4 | |
270 | ||
271 | sub \$-1, $a0 | |
272 | mov $a2, $t2 | |
273 | sbb .Lpoly+8*1(%rip), $a1 | |
274 | sbb \$0, $a2 | |
275 | mov $a3, $t3 | |
276 | sbb .Lpoly+8*3(%rip), $a3 | |
b62b2454 | 277 | sbb \$0, $t4 |
4d3fa06f | 278 | |
d3034d31 AP |
279 | cmovc $t0, $a0 |
280 | cmovc $t1, $a1 | |
281 | cmovc $t2, $a2 | |
282 | cmovc $t3, $a3 | |
4d3fa06f AP |
283 | |
284 | xor $t4, $t4 | |
285 | add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3] | |
286 | adc 8*1($a_ptr), $a1 | |
287 | mov $a0, $t0 | |
288 | adc 8*2($a_ptr), $a2 | |
289 | adc 8*3($a_ptr), $a3 | |
290 | mov $a1, $t1 | |
291 | adc \$0, $t4 | |
292 | ||
293 | sub \$-1, $a0 | |
294 | mov $a2, $t2 | |
295 | sbb .Lpoly+8*1(%rip), $a1 | |
296 | sbb \$0, $a2 | |
297 | mov $a3, $t3 | |
298 | sbb .Lpoly+8*3(%rip), $a3 | |
b62b2454 | 299 | sbb \$0, $t4 |
4d3fa06f | 300 | |
d3034d31 AP |
301 | cmovc $t0, $a0 |
302 | cmovc $t1, $a1 | |
4d3fa06f | 303 | mov $a0, 8*0($r_ptr) |
d3034d31 | 304 | cmovc $t2, $a2 |
4d3fa06f | 305 | mov $a1, 8*1($r_ptr) |
d3034d31 | 306 | cmovc $t3, $a3 |
4d3fa06f AP |
307 | mov $a2, 8*2($r_ptr) |
308 | mov $a3, 8*3($r_ptr) | |
309 | ||
384e6de4 | 310 | mov 0(%rsp),%r13 |
86e11278 | 311 | .cfi_restore %r13 |
384e6de4 | 312 | mov 8(%rsp),%r12 |
86e11278 | 313 | .cfi_restore %r12 |
384e6de4 | 314 | lea 16(%rsp),%rsp |
86e11278 | 315 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 316 | .Lmul_by_3_epilogue: |
4d3fa06f | 317 | ret |
86e11278 | 318 | .cfi_endproc |
4d3fa06f AP |
319 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 |
320 | ||
321 | ################################################################################ | |
322 | # void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]); | |
323 | .globl ecp_nistz256_add | |
324 | .type ecp_nistz256_add,\@function,3 | |
325 | .align 32 | |
326 | ecp_nistz256_add: | |
86e11278 | 327 | .cfi_startproc |
4d3fa06f | 328 | push %r12 |
86e11278 | 329 | .cfi_push %r12 |
4d3fa06f | 330 | push %r13 |
86e11278 | 331 | .cfi_push %r13 |
384e6de4 | 332 | .Ladd_body: |
4d3fa06f AP |
333 | |
334 | mov 8*0($a_ptr), $a0 | |
335 | xor $t4, $t4 | |
336 | mov 8*1($a_ptr), $a1 | |
337 | mov 8*2($a_ptr), $a2 | |
338 | mov 8*3($a_ptr), $a3 | |
339 | lea .Lpoly(%rip), $a_ptr | |
340 | ||
341 | add 8*0($b_ptr), $a0 | |
342 | adc 8*1($b_ptr), $a1 | |
343 | mov $a0, $t0 | |
344 | adc 8*2($b_ptr), $a2 | |
345 | adc 8*3($b_ptr), $a3 | |
346 | mov $a1, $t1 | |
347 | adc \$0, $t4 | |
348 | ||
349 | sub 8*0($a_ptr), $a0 | |
350 | mov $a2, $t2 | |
351 | sbb 8*1($a_ptr), $a1 | |
352 | sbb 8*2($a_ptr), $a2 | |
353 | mov $a3, $t3 | |
354 | sbb 8*3($a_ptr), $a3 | |
b62b2454 | 355 | sbb \$0, $t4 |
4d3fa06f | 356 | |
d3034d31 AP |
357 | cmovc $t0, $a0 |
358 | cmovc $t1, $a1 | |
4d3fa06f | 359 | mov $a0, 8*0($r_ptr) |
d3034d31 | 360 | cmovc $t2, $a2 |
4d3fa06f | 361 | mov $a1, 8*1($r_ptr) |
d3034d31 | 362 | cmovc $t3, $a3 |
4d3fa06f AP |
363 | mov $a2, 8*2($r_ptr) |
364 | mov $a3, 8*3($r_ptr) | |
365 | ||
384e6de4 | 366 | mov 0(%rsp),%r13 |
86e11278 | 367 | .cfi_restore %r13 |
384e6de4 | 368 | mov 8(%rsp),%r12 |
86e11278 | 369 | .cfi_restore %r12 |
384e6de4 | 370 | lea 16(%rsp),%rsp |
86e11278 | 371 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 372 | .Ladd_epilogue: |
4d3fa06f | 373 | ret |
86e11278 | 374 | .cfi_endproc |
4d3fa06f AP |
375 | .size ecp_nistz256_add,.-ecp_nistz256_add |
376 | ||
377 | ################################################################################ | |
378 | # void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]); | |
379 | .globl ecp_nistz256_sub | |
380 | .type ecp_nistz256_sub,\@function,3 | |
381 | .align 32 | |
382 | ecp_nistz256_sub: | |
86e11278 | 383 | .cfi_startproc |
4d3fa06f | 384 | push %r12 |
86e11278 | 385 | .cfi_push %r12 |
4d3fa06f | 386 | push %r13 |
86e11278 | 387 | .cfi_push %r13 |
384e6de4 | 388 | .Lsub_body: |
4d3fa06f AP |
389 | |
390 | mov 8*0($a_ptr), $a0 | |
391 | xor $t4, $t4 | |
392 | mov 8*1($a_ptr), $a1 | |
393 | mov 8*2($a_ptr), $a2 | |
394 | mov 8*3($a_ptr), $a3 | |
395 | lea .Lpoly(%rip), $a_ptr | |
396 | ||
397 | sub 8*0($b_ptr), $a0 | |
398 | sbb 8*1($b_ptr), $a1 | |
399 | mov $a0, $t0 | |
400 | sbb 8*2($b_ptr), $a2 | |
401 | sbb 8*3($b_ptr), $a3 | |
402 | mov $a1, $t1 | |
403 | sbb \$0, $t4 | |
404 | ||
405 | add 8*0($a_ptr), $a0 | |
406 | mov $a2, $t2 | |
407 | adc 8*1($a_ptr), $a1 | |
408 | adc 8*2($a_ptr), $a2 | |
409 | mov $a3, $t3 | |
410 | adc 8*3($a_ptr), $a3 | |
411 | test $t4, $t4 | |
412 | ||
413 | cmovz $t0, $a0 | |
414 | cmovz $t1, $a1 | |
415 | mov $a0, 8*0($r_ptr) | |
416 | cmovz $t2, $a2 | |
417 | mov $a1, 8*1($r_ptr) | |
418 | cmovz $t3, $a3 | |
419 | mov $a2, 8*2($r_ptr) | |
420 | mov $a3, 8*3($r_ptr) | |
421 | ||
384e6de4 | 422 | mov 0(%rsp),%r13 |
86e11278 | 423 | .cfi_restore %r13 |
384e6de4 | 424 | mov 8(%rsp),%r12 |
86e11278 | 425 | .cfi_restore %r12 |
384e6de4 | 426 | lea 16(%rsp),%rsp |
86e11278 | 427 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 428 | .Lsub_epilogue: |
4d3fa06f | 429 | ret |
86e11278 | 430 | .cfi_endproc |
4d3fa06f AP |
431 | .size ecp_nistz256_sub,.-ecp_nistz256_sub |
432 | ||
433 | ################################################################################ | |
434 | # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); | |
435 | .globl ecp_nistz256_neg | |
436 | .type ecp_nistz256_neg,\@function,2 | |
437 | .align 32 | |
438 | ecp_nistz256_neg: | |
86e11278 | 439 | .cfi_startproc |
4d3fa06f | 440 | push %r12 |
86e11278 | 441 | .cfi_push %r12 |
4d3fa06f | 442 | push %r13 |
86e11278 | 443 | .cfi_push %r13 |
384e6de4 | 444 | .Lneg_body: |
4d3fa06f AP |
445 | |
446 | xor $a0, $a0 | |
447 | xor $a1, $a1 | |
448 | xor $a2, $a2 | |
449 | xor $a3, $a3 | |
450 | xor $t4, $t4 | |
451 | ||
452 | sub 8*0($a_ptr), $a0 | |
453 | sbb 8*1($a_ptr), $a1 | |
454 | sbb 8*2($a_ptr), $a2 | |
455 | mov $a0, $t0 | |
456 | sbb 8*3($a_ptr), $a3 | |
457 | lea .Lpoly(%rip), $a_ptr | |
458 | mov $a1, $t1 | |
459 | sbb \$0, $t4 | |
460 | ||
461 | add 8*0($a_ptr), $a0 | |
462 | mov $a2, $t2 | |
463 | adc 8*1($a_ptr), $a1 | |
464 | adc 8*2($a_ptr), $a2 | |
465 | mov $a3, $t3 | |
466 | adc 8*3($a_ptr), $a3 | |
467 | test $t4, $t4 | |
468 | ||
469 | cmovz $t0, $a0 | |
470 | cmovz $t1, $a1 | |
471 | mov $a0, 8*0($r_ptr) | |
472 | cmovz $t2, $a2 | |
473 | mov $a1, 8*1($r_ptr) | |
474 | cmovz $t3, $a3 | |
475 | mov $a2, 8*2($r_ptr) | |
476 | mov $a3, 8*3($r_ptr) | |
477 | ||
384e6de4 | 478 | mov 0(%rsp),%r13 |
86e11278 | 479 | .cfi_restore %r13 |
384e6de4 | 480 | mov 8(%rsp),%r12 |
86e11278 | 481 | .cfi_restore %r12 |
384e6de4 | 482 | lea 16(%rsp),%rsp |
86e11278 | 483 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 484 | .Lneg_epilogue: |
4d3fa06f | 485 | ret |
86e11278 | 486 | .cfi_endproc |
4d3fa06f AP |
487 | .size ecp_nistz256_neg,.-ecp_nistz256_neg |
488 | ___ | |
489 | } | |
490 | { | |
# Register aliases for the ecp_nistz256_ord_* routines.
#
# $b_org is the third argument as it arrives in %rdx. Both ord_mul_mont and
# ord_sqr_mont copy it into $b_ptr (%rbx) at the start, since the mul/mulq
# instructions that follow leave the high half of each product in %rdx.
491 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | |
# Accumulator limbs in %r8..%r15. ecp_nistz256_ord_mul_mont also refers to
# %r14 and %r15 by literal name (pointer to .Lord and the value of .LordK),
# so $acc6/$acc7 are not free for other use inside that routine.
492 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | |
# Scratch registers. Note the overlaps: $t2 is the same register as $b_ptr
# (%rbx), $t3 the same as $b_org (%rdx), and $t4 is %rax, the implicit
# operand of mul/mulq.
493 | my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); | |
# Alternative names for $acc6/$acc7.
# NOTE(review): presumably used by routines later in this block to hold
# limbs of the modulus -- confirm against their bodies.
494 | my ($poly1,$poly3)=($acc6,$acc7); | |
495 | ||
eb791696 AP |
496 | $code.=<<___; |
497 | ################################################################################ | |
498 | # void ecp_nistz256_ord_mul_mont( | |
499 | # uint64_t res[4], | |
500 | # uint64_t a[4], | |
501 | # uint64_t b[4]); | |
502 | ||
503 | .globl ecp_nistz256_ord_mul_mont | |
504 | .type ecp_nistz256_ord_mul_mont,\@function,3 | |
505 | .align 32 | |
506 | ecp_nistz256_ord_mul_mont: | |
d5e11843 | 507 | .cfi_startproc |
eb791696 AP |
508 | ___ |
509 | $code.=<<___ if ($addx); | |
510 | mov \$0x80100, %ecx | |
511 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
512 | cmp \$0x80100, %ecx | |
513 | je .Lecp_nistz256_ord_mul_montx | |
514 | ___ | |
515 | $code.=<<___; | |
516 | push %rbp | |
d5e11843 | 517 | .cfi_push %rbp |
eb791696 | 518 | push %rbx |
d5e11843 | 519 | .cfi_push %rbx |
eb791696 | 520 | push %r12 |
d5e11843 | 521 | .cfi_push %r12 |
eb791696 | 522 | push %r13 |
d5e11843 | 523 | .cfi_push %r13 |
eb791696 | 524 | push %r14 |
d5e11843 | 525 | .cfi_push %r14 |
eb791696 | 526 | push %r15 |
d5e11843 AP |
527 | .cfi_push %r15 |
528 | .Lord_mul_body: | |
eb791696 AP |
529 | |
530 | mov 8*0($b_org), %rax | |
531 | mov $b_org, $b_ptr | |
532 | lea .Lord(%rip), %r14 | |
533 | mov .LordK(%rip), %r15 | |
534 | ||
535 | ################################# * b[0] | |
536 | mov %rax, $t0 | |
537 | mulq 8*0($a_ptr) | |
538 | mov %rax, $acc0 | |
539 | mov $t0, %rax | |
540 | mov %rdx, $acc1 | |
541 | ||
542 | mulq 8*1($a_ptr) | |
543 | add %rax, $acc1 | |
544 | mov $t0, %rax | |
545 | adc \$0, %rdx | |
546 | mov %rdx, $acc2 | |
547 | ||
548 | mulq 8*2($a_ptr) | |
549 | add %rax, $acc2 | |
550 | mov $t0, %rax | |
551 | adc \$0, %rdx | |
552 | ||
553 | mov $acc0, $acc5 | |
554 | imulq %r15,$acc0 | |
555 | ||
556 | mov %rdx, $acc3 | |
557 | mulq 8*3($a_ptr) | |
558 | add %rax, $acc3 | |
559 | mov $acc0, %rax | |
560 | adc \$0, %rdx | |
561 | mov %rdx, $acc4 | |
562 | ||
563 | ################################# First reduction step | |
564 | mulq 8*0(%r14) | |
565 | mov $acc0, $t1 | |
566 | add %rax, $acc5 # guaranteed to be zero | |
567 | mov $acc0, %rax | |
568 | adc \$0, %rdx | |
569 | mov %rdx, $t0 | |
570 | ||
571 | sub $acc0, $acc2 | |
572 | sbb \$0, $acc0 # can't borrow | |
573 | ||
574 | mulq 8*1(%r14) | |
575 | add $t0, $acc1 | |
576 | adc \$0, %rdx | |
577 | add %rax, $acc1 | |
578 | mov $t1, %rax | |
579 | adc %rdx, $acc2 | |
580 | mov $t1, %rdx | |
581 | adc \$0, $acc0 # can't overflow | |
582 | ||
583 | shl \$32, %rax | |
584 | shr \$32, %rdx | |
585 | sub %rax, $acc3 | |
586 | mov 8*1($b_ptr), %rax | |
587 | sbb %rdx, $t1 # can't borrow | |
588 | ||
589 | add $acc0, $acc3 | |
590 | adc $t1, $acc4 | |
591 | adc \$0, $acc5 | |
592 | ||
593 | ################################# * b[1] | |
594 | mov %rax, $t0 | |
595 | mulq 8*0($a_ptr) | |
596 | add %rax, $acc1 | |
597 | mov $t0, %rax | |
598 | adc \$0, %rdx | |
599 | mov %rdx, $t1 | |
600 | ||
601 | mulq 8*1($a_ptr) | |
602 | add $t1, $acc2 | |
603 | adc \$0, %rdx | |
604 | add %rax, $acc2 | |
605 | mov $t0, %rax | |
606 | adc \$0, %rdx | |
607 | mov %rdx, $t1 | |
608 | ||
609 | mulq 8*2($a_ptr) | |
610 | add $t1, $acc3 | |
611 | adc \$0, %rdx | |
612 | add %rax, $acc3 | |
613 | mov $t0, %rax | |
614 | adc \$0, %rdx | |
615 | ||
616 | mov $acc1, $t0 | |
617 | imulq %r15, $acc1 | |
618 | ||
619 | mov %rdx, $t1 | |
620 | mulq 8*3($a_ptr) | |
621 | add $t1, $acc4 | |
622 | adc \$0, %rdx | |
623 | xor $acc0, $acc0 | |
624 | add %rax, $acc4 | |
625 | mov $acc1, %rax | |
626 | adc %rdx, $acc5 | |
627 | adc \$0, $acc0 | |
628 | ||
629 | ################################# Second reduction step | |
630 | mulq 8*0(%r14) | |
631 | mov $acc1, $t1 | |
632 | add %rax, $t0 # guaranteed to be zero | |
633 | mov $acc1, %rax | |
634 | adc %rdx, $t0 | |
635 | ||
636 | sub $acc1, $acc3 | |
637 | sbb \$0, $acc1 # can't borrow | |
638 | ||
639 | mulq 8*1(%r14) | |
640 | add $t0, $acc2 | |
641 | adc \$0, %rdx | |
642 | add %rax, $acc2 | |
643 | mov $t1, %rax | |
644 | adc %rdx, $acc3 | |
645 | mov $t1, %rdx | |
646 | adc \$0, $acc1 # can't overflow | |
647 | ||
648 | shl \$32, %rax | |
649 | shr \$32, %rdx | |
650 | sub %rax, $acc4 | |
651 | mov 8*2($b_ptr), %rax | |
652 | sbb %rdx, $t1 # can't borrow | |
653 | ||
654 | add $acc1, $acc4 | |
655 | adc $t1, $acc5 | |
656 | adc \$0, $acc0 | |
657 | ||
658 | ################################## * b[2] | |
659 | mov %rax, $t0 | |
660 | mulq 8*0($a_ptr) | |
661 | add %rax, $acc2 | |
662 | mov $t0, %rax | |
663 | adc \$0, %rdx | |
664 | mov %rdx, $t1 | |
665 | ||
666 | mulq 8*1($a_ptr) | |
667 | add $t1, $acc3 | |
668 | adc \$0, %rdx | |
669 | add %rax, $acc3 | |
670 | mov $t0, %rax | |
671 | adc \$0, %rdx | |
672 | mov %rdx, $t1 | |
673 | ||
674 | mulq 8*2($a_ptr) | |
675 | add $t1, $acc4 | |
676 | adc \$0, %rdx | |
677 | add %rax, $acc4 | |
678 | mov $t0, %rax | |
679 | adc \$0, %rdx | |
680 | ||
681 | mov $acc2, $t0 | |
682 | imulq %r15, $acc2 | |
683 | ||
684 | mov %rdx, $t1 | |
685 | mulq 8*3($a_ptr) | |
686 | add $t1, $acc5 | |
687 | adc \$0, %rdx | |
688 | xor $acc1, $acc1 | |
689 | add %rax, $acc5 | |
690 | mov $acc2, %rax | |
691 | adc %rdx, $acc0 | |
692 | adc \$0, $acc1 | |
693 | ||
694 | ################################# Third reduction step | |
695 | mulq 8*0(%r14) | |
696 | mov $acc2, $t1 | |
697 | add %rax, $t0 # guaranteed to be zero | |
698 | mov $acc2, %rax | |
699 | adc %rdx, $t0 | |
700 | ||
701 | sub $acc2, $acc4 | |
702 | sbb \$0, $acc2 # can't borrow | |
703 | ||
704 | mulq 8*1(%r14) | |
705 | add $t0, $acc3 | |
706 | adc \$0, %rdx | |
707 | add %rax, $acc3 | |
708 | mov $t1, %rax | |
709 | adc %rdx, $acc4 | |
710 | mov $t1, %rdx | |
711 | adc \$0, $acc2 # can't overflow | |
712 | ||
713 | shl \$32, %rax | |
714 | shr \$32, %rdx | |
715 | sub %rax, $acc5 | |
716 | mov 8*3($b_ptr), %rax | |
717 | sbb %rdx, $t1 # can't borrow | |
718 | ||
719 | add $acc2, $acc5 | |
720 | adc $t1, $acc0 | |
721 | adc \$0, $acc1 | |
722 | ||
723 | ################################# * b[3] | |
724 | mov %rax, $t0 | |
725 | mulq 8*0($a_ptr) | |
726 | add %rax, $acc3 | |
727 | mov $t0, %rax | |
728 | adc \$0, %rdx | |
729 | mov %rdx, $t1 | |
730 | ||
731 | mulq 8*1($a_ptr) | |
732 | add $t1, $acc4 | |
733 | adc \$0, %rdx | |
734 | add %rax, $acc4 | |
735 | mov $t0, %rax | |
736 | adc \$0, %rdx | |
737 | mov %rdx, $t1 | |
738 | ||
739 | mulq 8*2($a_ptr) | |
740 | add $t1, $acc5 | |
741 | adc \$0, %rdx | |
742 | add %rax, $acc5 | |
743 | mov $t0, %rax | |
744 | adc \$0, %rdx | |
745 | ||
746 | mov $acc3, $t0 | |
747 | imulq %r15, $acc3 | |
748 | ||
749 | mov %rdx, $t1 | |
750 | mulq 8*3($a_ptr) | |
751 | add $t1, $acc0 | |
752 | adc \$0, %rdx | |
753 | xor $acc2, $acc2 | |
754 | add %rax, $acc0 | |
755 | mov $acc3, %rax | |
756 | adc %rdx, $acc1 | |
757 | adc \$0, $acc2 | |
758 | ||
759 | ################################# Last reduction step | |
760 | mulq 8*0(%r14) | |
761 | mov $acc3, $t1 | |
762 | add %rax, $t0 # guaranteed to be zero | |
763 | mov $acc3, %rax | |
764 | adc %rdx, $t0 | |
765 | ||
766 | sub $acc3, $acc5 | |
767 | sbb \$0, $acc3 # can't borrow | |
768 | ||
769 | mulq 8*1(%r14) | |
770 | add $t0, $acc4 | |
771 | adc \$0, %rdx | |
772 | add %rax, $acc4 | |
773 | mov $t1, %rax | |
774 | adc %rdx, $acc5 | |
775 | mov $t1, %rdx | |
776 | adc \$0, $acc3 # can't overflow | |
777 | ||
778 | shl \$32, %rax | |
779 | shr \$32, %rdx | |
780 | sub %rax, $acc0 | |
781 | sbb %rdx, $t1 # can't borrow | |
782 | ||
783 | add $acc3, $acc0 | |
784 | adc $t1, $acc1 | |
785 | adc \$0, $acc2 | |
786 | ||
787 | ################################# Subtract ord | |
788 | mov $acc4, $a_ptr | |
789 | sub 8*0(%r14), $acc4 | |
790 | mov $acc5, $acc3 | |
791 | sbb 8*1(%r14), $acc5 | |
792 | mov $acc0, $t0 | |
793 | sbb 8*2(%r14), $acc0 | |
794 | mov $acc1, $t1 | |
795 | sbb 8*3(%r14), $acc1 | |
796 | sbb \$0, $acc2 | |
797 | ||
798 | cmovc $a_ptr, $acc4 | |
799 | cmovc $acc3, $acc5 | |
800 | cmovc $t0, $acc0 | |
801 | cmovc $t1, $acc1 | |
802 | ||
803 | mov $acc4, 8*0($r_ptr) | |
804 | mov $acc5, 8*1($r_ptr) | |
805 | mov $acc0, 8*2($r_ptr) | |
806 | mov $acc1, 8*3($r_ptr) | |
807 | ||
d5e11843 AP |
808 | mov 0(%rsp),%r15 |
809 | .cfi_restore %r15 | |
810 | mov 8(%rsp),%r14 | |
811 | .cfi_restore %r14 | |
812 | mov 16(%rsp),%r13 | |
813 | .cfi_restore %r13 | |
814 | mov 24(%rsp),%r12 | |
815 | .cfi_restore %r12 | |
816 | mov 32(%rsp),%rbx | |
817 | .cfi_restore %rbx | |
818 | mov 40(%rsp),%rbp | |
819 | .cfi_restore %rbp | |
820 | lea 48(%rsp),%rsp | |
821 | .cfi_adjust_cfa_offset -48 | |
822 | .Lord_mul_epilogue: | |
eb791696 | 823 | ret |
d5e11843 | 824 | .cfi_endproc |
eb791696 AP |
825 | .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont |
826 | ||
827 | ################################################################################ | |
828 | # void ecp_nistz256_ord_sqr_mont( | |
829 | # uint64_t res[4], | |
830 | # uint64_t a[4], | |
15972296 | 831 | # uint64_t rep); |
eb791696 AP |
832 | |
833 | .globl ecp_nistz256_ord_sqr_mont | |
834 | .type ecp_nistz256_ord_sqr_mont,\@function,3 | |
835 | .align 32 | |
836 | ecp_nistz256_ord_sqr_mont: | |
d5e11843 | 837 | .cfi_startproc |
eb791696 AP |
838 | ___ |
839 | $code.=<<___ if ($addx); | |
840 | mov \$0x80100, %ecx | |
841 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
842 | cmp \$0x80100, %ecx | |
843 | je .Lecp_nistz256_ord_sqr_montx | |
844 | ___ | |
845 | $code.=<<___; | |
846 | push %rbp | |
d5e11843 | 847 | .cfi_push %rbp |
eb791696 | 848 | push %rbx |
d5e11843 | 849 | .cfi_push %rbx |
eb791696 | 850 | push %r12 |
d5e11843 | 851 | .cfi_push %r12 |
eb791696 | 852 | push %r13 |
d5e11843 | 853 | .cfi_push %r13 |
eb791696 | 854 | push %r14 |
d5e11843 | 855 | .cfi_push %r14 |
eb791696 | 856 | push %r15 |
d5e11843 AP |
857 | .cfi_push %r15 |
858 | .Lord_sqr_body: | |
eb791696 AP |
859 | |
860 | mov 8*0($a_ptr), $acc0 | |
861 | mov 8*1($a_ptr), %rax | |
862 | mov 8*2($a_ptr), $acc6 | |
863 | mov 8*3($a_ptr), $acc7 | |
864 | lea .Lord(%rip), $a_ptr # pointer to modulus | |
865 | mov $b_org, $b_ptr | |
866 | jmp .Loop_ord_sqr | |
867 | ||
868 | .align 32 | |
869 | .Loop_ord_sqr: | |
870 | ################################# a[1:] * a[0] | |
871 | mov %rax, $t1 # put aside a[1] | |
872 | mul $acc0 # a[1] * a[0] | |
873 | mov %rax, $acc1 | |
874 | movq $t1, %xmm1 # offload a[1] | |
875 | mov $acc6, %rax | |
876 | mov %rdx, $acc2 | |
877 | ||
878 | mul $acc0 # a[2] * a[0] | |
879 | add %rax, $acc2 | |
880 | mov $acc7, %rax | |
881 | movq $acc6, %xmm2 # offload a[2] | |
882 | adc \$0, %rdx | |
883 | mov %rdx, $acc3 | |
884 | ||
885 | mul $acc0 # a[3] * a[0] | |
886 | add %rax, $acc3 | |
887 | mov $acc7, %rax | |
888 | movq $acc7, %xmm3 # offload a[3] | |
889 | adc \$0, %rdx | |
890 | mov %rdx, $acc4 | |
891 | ||
892 | ################################# a[3] * a[2] | |
893 | mul $acc6 # a[3] * a[2] | |
894 | mov %rax, $acc5 | |
895 | mov $acc6, %rax | |
896 | mov %rdx, $acc6 | |
897 | ||
898 | ################################# a[2:] * a[1] | |
899 | mul $t1 # a[2] * a[1] | |
900 | add %rax, $acc3 | |
901 | mov $acc7, %rax | |
902 | adc \$0, %rdx | |
903 | mov %rdx, $acc7 | |
904 | ||
905 | mul $t1 # a[3] * a[1] | |
906 | add %rax, $acc4 | |
907 | adc \$0, %rdx | |
908 | ||
909 | add $acc7, $acc4 | |
910 | adc %rdx, $acc5 | |
911 | adc \$0, $acc6 # can't overflow | |
912 | ||
913 | ################################# *2 | |
914 | xor $acc7, $acc7 | |
915 | mov $acc0, %rax | |
916 | add $acc1, $acc1 | |
917 | adc $acc2, $acc2 | |
918 | adc $acc3, $acc3 | |
919 | adc $acc4, $acc4 | |
920 | adc $acc5, $acc5 | |
921 | adc $acc6, $acc6 | |
922 | adc \$0, $acc7 | |
923 | ||
924 | ################################# Missing products | |
925 | mul %rax # a[0] * a[0] | |
926 | mov %rax, $acc0 | |
927 | movq %xmm1, %rax | |
928 | mov %rdx, $t1 | |
929 | ||
930 | mul %rax # a[1] * a[1] | |
931 | add $t1, $acc1 | |
932 | adc %rax, $acc2 | |
933 | movq %xmm2, %rax | |
934 | adc \$0, %rdx | |
935 | mov %rdx, $t1 | |
936 | ||
937 | mul %rax # a[2] * a[2] | |
938 | add $t1, $acc3 | |
939 | adc %rax, $acc4 | |
940 | movq %xmm3, %rax | |
941 | adc \$0, %rdx | |
942 | mov %rdx, $t1 | |
943 | ||
944 | mov $acc0, $t0 | |
945 | imulq 8*4($a_ptr), $acc0 # *= .LordK | |
946 | ||
947 | mul %rax # a[3] * a[3] | |
948 | add $t1, $acc5 | |
949 | adc %rax, $acc6 | |
950 | mov 8*0($a_ptr), %rax # modulus[0] | |
951 | adc %rdx, $acc7 # can't overflow | |
952 | ||
953 | ################################# First reduction step | |
954 | mul $acc0 | |
955 | mov $acc0, $t1 | |
956 | add %rax, $t0 # guaranteed to be zero | |
957 | mov 8*1($a_ptr), %rax # modulus[1] | |
958 | adc %rdx, $t0 | |
959 | ||
960 | sub $acc0, $acc2 | |
961 | sbb \$0, $t1 # can't borrow | |
962 | ||
963 | mul $acc0 | |
964 | add $t0, $acc1 | |
965 | adc \$0, %rdx | |
966 | add %rax, $acc1 | |
967 | mov $acc0, %rax | |
968 | adc %rdx, $acc2 | |
969 | mov $acc0, %rdx | |
970 | adc \$0, $t1 # can't overflow | |
971 | ||
972 | mov $acc1, $t0 | |
973 | imulq 8*4($a_ptr), $acc1 # *= .LordK | |
974 | ||
975 | shl \$32, %rax | |
976 | shr \$32, %rdx | |
977 | sub %rax, $acc3 | |
978 | mov 8*0($a_ptr), %rax | |
979 | sbb %rdx, $acc0 # can't borrow | |
980 | ||
981 | add $t1, $acc3 | |
982 | adc \$0, $acc0 # can't overflow | |
983 | ||
984 | ################################# Second reduction step | |
985 | mul $acc1 | |
986 | mov $acc1, $t1 | |
987 | add %rax, $t0 # guaranteed to be zero | |
988 | mov 8*1($a_ptr), %rax | |
989 | adc %rdx, $t0 | |
990 | ||
991 | sub $acc1, $acc3 | |
992 | sbb \$0, $t1 # can't borrow | |
993 | ||
994 | mul $acc1 | |
995 | add $t0, $acc2 | |
996 | adc \$0, %rdx | |
997 | add %rax, $acc2 | |
998 | mov $acc1, %rax | |
999 | adc %rdx, $acc3 | |
1000 | mov $acc1, %rdx | |
1001 | adc \$0, $t1 # can't overflow | |
1002 | ||
1003 | mov $acc2, $t0 | |
1004 | imulq 8*4($a_ptr), $acc2 # *= .LordK | |
1005 | ||
1006 | shl \$32, %rax | |
1007 | shr \$32, %rdx | |
1008 | sub %rax, $acc0 | |
1009 | mov 8*0($a_ptr), %rax | |
1010 | sbb %rdx, $acc1 # can't borrow | |
1011 | ||
1012 | add $t1, $acc0 | |
1013 | adc \$0, $acc1 # can't overflow | |
1014 | ||
1015 | ################################# Third reduction step | |
1016 | mul $acc2 | |
1017 | mov $acc2, $t1 | |
1018 | add %rax, $t0 # guaranteed to be zero | |
1019 | mov 8*1($a_ptr), %rax | |
1020 | adc %rdx, $t0 | |
1021 | ||
1022 | sub $acc2, $acc0 | |
1023 | sbb \$0, $t1 # can't borrow | |
1024 | ||
1025 | mul $acc2 | |
1026 | add $t0, $acc3 | |
1027 | adc \$0, %rdx | |
1028 | add %rax, $acc3 | |
1029 | mov $acc2, %rax | |
1030 | adc %rdx, $acc0 | |
1031 | mov $acc2, %rdx | |
1032 | adc \$0, $t1 # can't overflow | |
1033 | ||
1034 | mov $acc3, $t0 | |
1035 | imulq 8*4($a_ptr), $acc3 # *= .LordK | |
1036 | ||
1037 | shl \$32, %rax | |
1038 | shr \$32, %rdx | |
1039 | sub %rax, $acc1 | |
1040 | mov 8*0($a_ptr), %rax | |
1041 | sbb %rdx, $acc2 # can't borrow | |
1042 | ||
1043 | add $t1, $acc1 | |
1044 | adc \$0, $acc2 # can't overflow | |
1045 | ||
1046 | ################################# Last reduction step | |
1047 | mul $acc3 | |
1048 | mov $acc3, $t1 | |
1049 | add %rax, $t0 # guaranteed to be zero | |
1050 | mov 8*1($a_ptr), %rax | |
1051 | adc %rdx, $t0 | |
1052 | ||
1053 | sub $acc3, $acc1 | |
1054 | sbb \$0, $t1 # can't borrow | |
1055 | ||
1056 | mul $acc3 | |
1057 | add $t0, $acc0 | |
1058 | adc \$0, %rdx | |
1059 | add %rax, $acc0 | |
1060 | mov $acc3, %rax | |
1061 | adc %rdx, $acc1 | |
1062 | mov $acc3, %rdx | |
1063 | adc \$0, $t1 # can't overflow | |
1064 | ||
1065 | shl \$32, %rax | |
1066 | shr \$32, %rdx | |
1067 | sub %rax, $acc2 | |
1068 | sbb %rdx, $acc3 # can't borrow | |
1069 | ||
1070 | add $t1, $acc2 | |
1071 | adc \$0, $acc3 # can't overflow | |
1072 | ||
1073 | ################################# Add bits [511:256] of the sqr result | |
1074 | xor %rdx, %rdx | |
1075 | add $acc4, $acc0 | |
1076 | adc $acc5, $acc1 | |
1077 | mov $acc0, $acc4 | |
1078 | adc $acc6, $acc2 | |
1079 | adc $acc7, $acc3 | |
1080 | mov $acc1, %rax | |
1081 | adc \$0, %rdx | |
1082 | ||
1083 | ################################# Compare to modulus | |
1084 | sub 8*0($a_ptr), $acc0 | |
1085 | mov $acc2, $acc6 | |
1086 | sbb 8*1($a_ptr), $acc1 | |
1087 | sbb 8*2($a_ptr), $acc2 | |
1088 | mov $acc3, $acc7 | |
1089 | sbb 8*3($a_ptr), $acc3 | |
1090 | sbb \$0, %rdx | |
1091 | ||
1092 | cmovc $acc4, $acc0 | |
1093 | cmovnc $acc1, %rax | |
1094 | cmovnc $acc2, $acc6 | |
1095 | cmovnc $acc3, $acc7 | |
1096 | ||
1097 | dec $b_ptr | |
1098 | jnz .Loop_ord_sqr | |
1099 | ||
1100 | mov $acc0, 8*0($r_ptr) | |
1101 | mov %rax, 8*1($r_ptr) | |
1102 | pxor %xmm1, %xmm1 | |
1103 | mov $acc6, 8*2($r_ptr) | |
1104 | pxor %xmm2, %xmm2 | |
1105 | mov $acc7, 8*3($r_ptr) | |
1106 | pxor %xmm3, %xmm3 | |
1107 | ||
d5e11843 AP |
1108 | mov 0(%rsp),%r15 |
1109 | .cfi_restore %r15 | |
1110 | mov 8(%rsp),%r14 | |
1111 | .cfi_restore %r14 | |
1112 | mov 16(%rsp),%r13 | |
1113 | .cfi_restore %r13 | |
1114 | mov 24(%rsp),%r12 | |
1115 | .cfi_restore %r12 | |
1116 | mov 32(%rsp),%rbx | |
1117 | .cfi_restore %rbx | |
1118 | mov 40(%rsp),%rbp | |
1119 | .cfi_restore %rbp | |
1120 | lea 48(%rsp),%rsp | |
1121 | .cfi_adjust_cfa_offset -48 | |
1122 | .Lord_sqr_epilogue: | |
eb791696 | 1123 | ret |
d5e11843 | 1124 | .cfi_endproc |
eb791696 AP |
1125 | .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont |
1126 | ___ | |
1127 | ||
# The heredoc below is appended only when $addx is set; its code relies on
# the mulx, adcx and adox instructions.
1128 | $code.=<<___ if ($addx); | |
1129 | ################################################################################ | |
# ecp_nistz256_ord_mul_montx(res, a, b), three pointer arguments:
# Montgomery multiplication of two 4x64-bit operands modulo the constant
# stored at .Lord.  Four "multiply by b[i]" passes are interleaved with four
# word-wise reduction passes; adcx and adox drive two independent carry
# chains (CF and OF).  The four result words are stored at $r_ptr.
# NOTE(review): .Lord and .LordK are defined outside this excerpt; presumably
# the P-256 group order and its Montgomery constant - confirm in the data
# section.
1130 | .type ecp_nistz256_ord_mul_montx,\@function,3 | |
1131 | .align 32 | |
1132 | ecp_nistz256_ord_mul_montx: | |
d5e11843 | 1133 | .cfi_startproc |
eb791696 AP |
1134 | .Lecp_nistz256_ord_mul_montx: |
1135 | push %rbp | |
d5e11843 | 1136 | .cfi_push %rbp |
eb791696 | 1137 | push %rbx |
d5e11843 | 1138 | .cfi_push %rbx |
eb791696 | 1139 | push %r12 |
d5e11843 | 1140 | .cfi_push %r12 |
eb791696 | 1141 | push %r13 |
d5e11843 | 1142 | .cfi_push %r13 |
eb791696 | 1143 | push %r14 |
d5e11843 | 1144 | .cfi_push %r14 |
eb791696 | 1145 | push %r15 |
d5e11843 AP |
1146 | .cfi_push %r15 |
1147 | .Lord_mulx_body: | |
eb791696 AP |
1148 | |
# Keep the b pointer in $b_ptr, load b[0] into %rdx (the implicit mulx
# operand) and a[0..3] into $acc1..$acc4.
1149 | mov $b_org, $b_ptr | |
1150 | mov 8*0($b_org), %rdx | |
1151 | mov 8*0($a_ptr), $acc1 | |
1152 | mov 8*1($a_ptr), $acc2 | |
1153 | mov 8*2($a_ptr), $acc3 | |
1154 | mov 8*3($a_ptr), $acc4 | |
1155 | lea -128($a_ptr), $a_ptr # control u-op density | |
# %r14 is biased by -128 as well, so 8*i+128(%r14) below addresses word i of
# .Lord; %r15 holds the 64-bit word stored at .LordK.
1156 | lea .Lord-128(%rip), %r14 | |
1157 | mov .LordK(%rip), %r15 | |
1158 | ||
1159 | ################################# Multiply by b[0] | |
1160 | mulx $acc1, $acc0, $acc1 | |
1161 | mulx $acc2, $t0, $acc2 | |
1162 | mulx $acc3, $t1, $acc3 | |
1163 | add $t0, $acc1 | |
1164 | mulx $acc4, $t0, $acc4 | |
# %rdx = low 64 bits of acc0 * (word at .LordK): the multiplier for the
# reduction pass.  The high half goes to %rax and is never read.
1165 | mov $acc0, %rdx | |
1166 | mulx %r15, %rdx, %rax | |
1167 | adc $t1, $acc2 | |
1168 | adc $t0, $acc3 | |
1169 | adc \$0, $acc4 | |
1170 | ||
1171 | ################################# reduction | |
1172 | xor $acc5, $acc5 # $acc5=0, cf=0, of=0 | |
1173 | mulx 8*0+128(%r14), $t0, $t1 | |
1174 | adcx $t0, $acc0 # guaranteed to be zero | |
1175 | adox $t1, $acc1 | |
1176 | ||
1177 | mulx 8*1+128(%r14), $t0, $t1 | |
1178 | adcx $t0, $acc1 | |
1179 | adox $t1, $acc2 | |
1180 | ||
1181 | mulx 8*2+128(%r14), $t0, $t1 | |
1182 | adcx $t0, $acc2 | |
1183 | adox $t1, $acc3 | |
1184 | ||
1185 | mulx 8*3+128(%r14), $t0, $t1 | |
1186 | mov 8*1($b_ptr), %rdx | |
1187 | adcx $t0, $acc3 | |
1188 | adox $t1, $acc4 | |
1189 | adcx $acc0, $acc4 | |
1190 | adox $acc0, $acc5 | |
1191 | adc \$0, $acc5 # cf=0, of=0 | |
1192 | ||
1193 | ################################# Multiply by b[1] | |
1194 | mulx 8*0+128($a_ptr), $t0, $t1 | |
1195 | adcx $t0, $acc1 | |
1196 | adox $t1, $acc2 | |
1197 | ||
1198 | mulx 8*1+128($a_ptr), $t0, $t1 | |
1199 | adcx $t0, $acc2 | |
1200 | adox $t1, $acc3 | |
1201 | ||
1202 | mulx 8*2+128($a_ptr), $t0, $t1 | |
1203 | adcx $t0, $acc3 | |
1204 | adox $t1, $acc4 | |
1205 | ||
1206 | mulx 8*3+128($a_ptr), $t0, $t1 | |
# Reduction multiplier for the next pass, derived from the new lowest word.
1207 | mov $acc1, %rdx | |
1208 | mulx %r15, %rdx, %rax | |
1209 | adcx $t0, $acc4 | |
1210 | adox $t1, $acc5 | |
1211 | ||
1212 | adcx $acc0, $acc5 | |
1213 | adox $acc0, $acc0 | |
1214 | adc \$0, $acc0 # cf=0, of=0 | |
1215 | ||
1216 | ################################# reduction | |
1217 | mulx 8*0+128(%r14), $t0, $t1 | |
1218 | adcx $t0, $acc1 # guaranteed to be zero | |
1219 | adox $t1, $acc2 | |
1220 | ||
1221 | mulx 8*1+128(%r14), $t0, $t1 | |
1222 | adcx $t0, $acc2 | |
1223 | adox $t1, $acc3 | |
1224 | ||
1225 | mulx 8*2+128(%r14), $t0, $t1 | |
1226 | adcx $t0, $acc3 | |
1227 | adox $t1, $acc4 | |
1228 | ||
1229 | mulx 8*3+128(%r14), $t0, $t1 | |
1230 | mov 8*2($b_ptr), %rdx | |
1231 | adcx $t0, $acc4 | |
1232 | adox $t1, $acc5 | |
1233 | adcx $acc1, $acc5 | |
1234 | adox $acc1, $acc0 | |
1235 | adc \$0, $acc0 # cf=0, of=0 | |
1236 | ||
1237 | ################################# Multiply by b[2] | |
1238 | mulx 8*0+128($a_ptr), $t0, $t1 | |
1239 | adcx $t0, $acc2 | |
1240 | adox $t1, $acc3 | |
1241 | ||
1242 | mulx 8*1+128($a_ptr), $t0, $t1 | |
1243 | adcx $t0, $acc3 | |
1244 | adox $t1, $acc4 | |
1245 | ||
1246 | mulx 8*2+128($a_ptr), $t0, $t1 | |
1247 | adcx $t0, $acc4 | |
1248 | adox $t1, $acc5 | |
1249 | ||
1250 | mulx 8*3+128($a_ptr), $t0, $t1 | |
1251 | mov $acc2, %rdx | |
1252 | mulx %r15, %rdx, %rax | |
1253 | adcx $t0, $acc5 | |
1254 | adox $t1, $acc0 | |
1255 | ||
1256 | adcx $acc1, $acc0 | |
1257 | adox $acc1, $acc1 | |
1258 | adc \$0, $acc1 # cf=0, of=0 | |
1259 | ||
1260 | ################################# reduction | |
1261 | mulx 8*0+128(%r14), $t0, $t1 | |
1262 | adcx $t0, $acc2 # guaranteed to be zero | |
1263 | adox $t1, $acc3 | |
1264 | ||
1265 | mulx 8*1+128(%r14), $t0, $t1 | |
1266 | adcx $t0, $acc3 | |
1267 | adox $t1, $acc4 | |
1268 | ||
1269 | mulx 8*2+128(%r14), $t0, $t1 | |
1270 | adcx $t0, $acc4 | |
1271 | adox $t1, $acc5 | |
1272 | ||
1273 | mulx 8*3+128(%r14), $t0, $t1 | |
1274 | mov 8*3($b_ptr), %rdx | |
1275 | adcx $t0, $acc5 | |
1276 | adox $t1, $acc0 | |
1277 | adcx $acc2, $acc0 | |
1278 | adox $acc2, $acc1 | |
1279 | adc \$0, $acc1 # cf=0, of=0 | |
1280 | ||
1281 | ################################# Multiply by b[3] | |
1282 | mulx 8*0+128($a_ptr), $t0, $t1 | |
1283 | adcx $t0, $acc3 | |
1284 | adox $t1, $acc4 | |
1285 | ||
1286 | mulx 8*1+128($a_ptr), $t0, $t1 | |
1287 | adcx $t0, $acc4 | |
1288 | adox $t1, $acc5 | |
1289 | ||
1290 | mulx 8*2+128($a_ptr), $t0, $t1 | |
1291 | adcx $t0, $acc5 | |
1292 | adox $t1, $acc0 | |
1293 | ||
1294 | mulx 8*3+128($a_ptr), $t0, $t1 | |
1295 | mov $acc3, %rdx | |
1296 | mulx %r15, %rdx, %rax | |
1297 | adcx $t0, $acc0 | |
1298 | adox $t1, $acc1 | |
1299 | ||
1300 | adcx $acc2, $acc1 | |
1301 | adox $acc2, $acc2 | |
1302 | adc \$0, $acc2 # cf=0, of=0 | |
1303 | ||
1304 | ################################# reduction | |
1305 | mulx 8*0+128(%r14), $t0, $t1 | |
c2969ff6 | 1306 | adcx $t0, $acc3 # guaranteed to be zero |
eb791696 AP |
1307 | adox $t1, $acc4 |
1308 | ||
1309 | mulx 8*1+128(%r14), $t0, $t1 | |
1310 | adcx $t0, $acc4 | |
1311 | adox $t1, $acc5 | |
1312 | ||
1313 | mulx 8*2+128(%r14), $t0, $t1 | |
1314 | adcx $t0, $acc5 | |
1315 | adox $t1, $acc0 | |
1316 | ||
1317 | mulx 8*3+128(%r14), $t0, $t1 | |
# Undo the -128 bias: %r14 now points at .Lord itself for the subtraction
# below.  The mov instructions interleaved here save the unsubtracted words.
1318 | lea 128(%r14),%r14 | |
1319 | mov $acc4, $t2 | |
1320 | adcx $t0, $acc0 | |
1321 | adox $t1, $acc1 | |
1322 | mov $acc5, $t3 | |
1323 | adcx $acc3, $acc1 | |
1324 | adox $acc3, $acc2 | |
1325 | adc \$0, $acc2 | |
1326 | ||
1327 | ################################# | |
1328 | # Branch-less conditional subtraction of the modulus (.Lord) | |
1329 | mov $acc0, $t0 | |
1330 | sub 8*0(%r14), $acc4 | |
1331 | sbb 8*1(%r14), $acc5 | |
1332 | sbb 8*2(%r14), $acc0 | |
1333 | mov $acc1, $t1 | |
1334 | sbb 8*3(%r14), $acc1 | |
1335 | sbb \$0, $acc2 | |
1336 | ||
# CF set means the subtraction borrowed, so the saved (unsubtracted) copies
# are put back.
1337 | cmovc $t2, $acc4 | |
1338 | cmovc $t3, $acc5 | |
1339 | cmovc $t0, $acc0 | |
1340 | cmovc $t1, $acc1 | |
1341 | ||
1342 | mov $acc4, 8*0($r_ptr) | |
1343 | mov $acc5, 8*1($r_ptr) | |
1344 | mov $acc0, 8*2($r_ptr) | |
1345 | mov $acc1, 8*3($r_ptr) | |
1346 | ||
# Epilogue: reload the six registers pushed in the prologue straight from
# the stack, then release the 48 bytes they occupied.
d5e11843 AP |
1347 | mov 0(%rsp),%r15 |
1348 | .cfi_restore %r15 | |
1349 | mov 8(%rsp),%r14 | |
1350 | .cfi_restore %r14 | |
1351 | mov 16(%rsp),%r13 | |
1352 | .cfi_restore %r13 | |
1353 | mov 24(%rsp),%r12 | |
1354 | .cfi_restore %r12 | |
1355 | mov 32(%rsp),%rbx | |
1356 | .cfi_restore %rbx | |
1357 | mov 40(%rsp),%rbp | |
1358 | .cfi_restore %rbp | |
1359 | lea 48(%rsp),%rsp | |
1360 | .cfi_adjust_cfa_offset -48 | |
1361 | .Lord_mulx_epilogue: | |
eb791696 | 1362 | ret |
d5e11843 | 1363 | .cfi_endproc |
eb791696 AP |
1364 | .size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx |
1365 | ||
# ecp_nistz256_ord_sqr_montx(res, a, rep), three arguments:
# repeated Montgomery squaring modulo the constant stored at .Lord, written
# with mulx/adcx/adox.  The third argument is copied to $b_ptr and serves as
# the counter of .Loop_ord_sqrx; the value is squared that many times and the
# final four words are stored at $r_ptr.
# The counter is decremented and tested only at the bottom of the loop, so a
# count of zero is not treated specially.
1366 | .type ecp_nistz256_ord_sqr_montx,\@function,3 | |
1367 | .align 32 | |
1368 | ecp_nistz256_ord_sqr_montx: | |
d5e11843 | 1369 | .cfi_startproc |
eb791696 AP |
1370 | .Lecp_nistz256_ord_sqr_montx: |
1371 | push %rbp | |
d5e11843 | 1372 | .cfi_push %rbp |
eb791696 | 1373 | push %rbx |
d5e11843 | 1374 | .cfi_push %rbx |
eb791696 | 1375 | push %r12 |
d5e11843 | 1376 | .cfi_push %r12 |
eb791696 | 1377 | push %r13 |
d5e11843 | 1378 | .cfi_push %r13 |
eb791696 | 1379 | push %r14 |
d5e11843 | 1380 | .cfi_push %r14 |
eb791696 | 1381 | push %r15 |
d5e11843 AP |
1382 | .cfi_push %r15 |
1383 | .Lord_sqrx_body: | |
eb791696 AP |
1384 | |
# Loop input convention: a[0] in %rdx, a[1] in $acc6, a[2] in $acc7, a[3] in
# $acc0.  Each iteration leaves its result in the same four registers.
1385 | mov $b_org, $b_ptr | |
1386 | mov 8*0($a_ptr), %rdx | |
1387 | mov 8*1($a_ptr), $acc6 | |
1388 | mov 8*2($a_ptr), $acc7 | |
1389 | mov 8*3($a_ptr), $acc0 | |
# The input pointer is no longer needed: $a_ptr now addresses .Lord.  Offsets
# 8*0..8*3 are the modulus words; the word at 8*4 is used as the reduction
# multiplier (presumably .LordK, stored right after .Lord - confirm).
1390 | lea .Lord(%rip), $a_ptr | |
1391 | jmp .Loop_ord_sqrx | |
1392 | ||
1393 | .align 32 | |
1394 | .Loop_ord_sqrx: | |
# Off-diagonal products a[i]*a[j], i<j.  The operand words are parked in
# %rax and %xmm1-%xmm3 so the a[i]*a[i] step can fetch them again.
1395 | mulx $acc6, $acc1, $acc2 # a[0]*a[1] | |
1396 | mulx $acc7, $t0, $acc3 # a[0]*a[2] | |
1397 | mov %rdx, %rax # offload a[0] | |
1398 | movq $acc6, %xmm1 # offload a[1] | |
1399 | mulx $acc0, $t1, $acc4 # a[0]*a[3] | |
1400 | mov $acc6, %rdx | |
1401 | add $t0, $acc2 | |
1402 | movq $acc7, %xmm2 # offload a[2] | |
1403 | adc $t1, $acc3 | |
1404 | adc \$0, $acc4 | |
1405 | xor $acc5, $acc5 # $acc5=0,cf=0,of=0 | |
1406 | ################################# | |
1407 | mulx $acc7, $t0, $t1 # a[1]*a[2] | |
1408 | adcx $t0, $acc3 | |
1409 | adox $t1, $acc4 | |
1410 | ||
1411 | mulx $acc0, $t0, $t1 # a[1]*a[3] | |
1412 | mov $acc7, %rdx | |
1413 | adcx $t0, $acc4 | |
1414 | adox $t1, $acc5 | |
1415 | adc \$0, $acc5 | |
1416 | ################################# | |
1417 | mulx $acc0, $t0, $acc6 # a[2]*a[3] | |
1418 | mov %rax, %rdx | |
1419 | movq $acc0, %xmm3 # offload a[3] | |
# From here the CF chain (adcx) doubles the off-diagonal sum while the OF
# chain (adox) adds the a[i]*a[i] terms.
1420 | xor $acc7, $acc7 # $acc7=0,cf=0,of=0 | |
1421 | adcx $acc1, $acc1 # acc1:6<<1 | |
1422 | adox $t0, $acc5 | |
1423 | adcx $acc2, $acc2 | |
1424 | adox $acc7, $acc6 # of=0 | |
1425 | ||
1426 | ################################# a[i]*a[i] | |
1427 | mulx %rdx, $acc0, $t1 | |
1428 | movq %xmm1, %rdx | |
1429 | adcx $acc3, $acc3 | |
1430 | adox $t1, $acc1 | |
1431 | adcx $acc4, $acc4 | |
1432 | mulx %rdx, $t0, $t4 | |
1433 | movq %xmm2, %rdx | |
1434 | adcx $acc5, $acc5 | |
1435 | adox $t0, $acc2 | |
1436 | adcx $acc6, $acc6 | |
1437 | mulx %rdx, $t0, $t1 | |
# NOTE(review): 0x67 is the address-size override prefix byte, emitted in
# front of the next instruction; presumably instruction padding - confirm.
1438 | .byte 0x67 | |
1439 | movq %xmm3, %rdx | |
1440 | adox $t4, $acc3 | |
1441 | adcx $acc7, $acc7 | |
1442 | adox $t0, $acc4 | |
1443 | adox $t1, $acc5 | |
1444 | mulx %rdx, $t0, $t4 | |
1445 | adox $t0, $acc6 | |
1446 | adox $t4, $acc7 | |
1447 | ||
1448 | ################################# reduction | |
# Four reduction passes follow, one per low word ($acc0..$acc3).  Each starts
# by forming %rdx = low 64 bits of (low word * word at 8*4); the high half
# written to $t0 is overwritten by the next mulx before it is read.
# Successive passes swap which carry chain takes the low and high product
# halves.
1449 | mov $acc0, %rdx | |
1450 | mulx 8*4($a_ptr), %rdx, $t0 | |
1451 | ||
1452 | xor %rax, %rax # cf=0, of=0 | |
1453 | mulx 8*0($a_ptr), $t0, $t1 | |
1454 | adcx $t0, $acc0 # guaranteed to be zero | |
1455 | adox $t1, $acc1 | |
1456 | mulx 8*1($a_ptr), $t0, $t1 | |
1457 | adcx $t0, $acc1 | |
1458 | adox $t1, $acc2 | |
1459 | mulx 8*2($a_ptr), $t0, $t1 | |
1460 | adcx $t0, $acc2 | |
1461 | adox $t1, $acc3 | |
1462 | mulx 8*3($a_ptr), $t0, $t1 | |
1463 | adcx $t0, $acc3 | |
1464 | adox $t1, $acc0 # of=0 | |
1465 | adcx %rax, $acc0 # cf=0 | |
1466 | ||
1467 | ################################# | |
1468 | mov $acc1, %rdx | |
1469 | mulx 8*4($a_ptr), %rdx, $t0 | |
1470 | ||
1471 | mulx 8*0($a_ptr), $t0, $t1 | |
1472 | adox $t0, $acc1 # guaranteed to be zero | |
1473 | adcx $t1, $acc2 | |
1474 | mulx 8*1($a_ptr), $t0, $t1 | |
1475 | adox $t0, $acc2 | |
1476 | adcx $t1, $acc3 | |
1477 | mulx 8*2($a_ptr), $t0, $t1 | |
1478 | adox $t0, $acc3 | |
1479 | adcx $t1, $acc0 | |
1480 | mulx 8*3($a_ptr), $t0, $t1 | |
1481 | adox $t0, $acc0 | |
1482 | adcx $t1, $acc1 # cf=0 | |
1483 | adox %rax, $acc1 # of=0 | |
1484 | ||
1485 | ################################# | |
1486 | mov $acc2, %rdx | |
1487 | mulx 8*4($a_ptr), %rdx, $t0 | |
1488 | ||
1489 | mulx 8*0($a_ptr), $t0, $t1 | |
1490 | adcx $t0, $acc2 # guaranteed to be zero | |
1491 | adox $t1, $acc3 | |
1492 | mulx 8*1($a_ptr), $t0, $t1 | |
1493 | adcx $t0, $acc3 | |
1494 | adox $t1, $acc0 | |
1495 | mulx 8*2($a_ptr), $t0, $t1 | |
1496 | adcx $t0, $acc0 | |
1497 | adox $t1, $acc1 | |
1498 | mulx 8*3($a_ptr), $t0, $t1 | |
1499 | adcx $t0, $acc1 | |
1500 | adox $t1, $acc2 # of=0 | |
1501 | adcx %rax, $acc2 # cf=0 | |
1502 | ||
1503 | ################################# | |
1504 | mov $acc3, %rdx | |
1505 | mulx 8*4($a_ptr), %rdx, $t0 | |
1506 | ||
1507 | mulx 8*0($a_ptr), $t0, $t1 | |
1508 | adox $t0, $acc3 # guaranteed to be zero | |
1509 | adcx $t1, $acc0 | |
1510 | mulx 8*1($a_ptr), $t0, $t1 | |
1511 | adox $t0, $acc0 | |
1512 | adcx $t1, $acc1 | |
1513 | mulx 8*2($a_ptr), $t0, $t1 | |
1514 | adox $t0, $acc1 | |
1515 | adcx $t1, $acc2 | |
1516 | mulx 8*3($a_ptr), $t0, $t1 | |
1517 | adox $t0, $acc2 | |
1518 | adcx $t1, $acc3 | |
1519 | adox %rax, $acc3 | |
1520 | ||
1521 | ################################# accumulate upper half | |
# The sum lands in $acc4,$acc1,$acc2,$acc3 with the carry-out in %rax (zero
# before the final adc).  The interleaved mov instructions keep unsubtracted
# copies in %rdx,$acc6,$acc7,$acc0.
1522 | add $acc0, $acc4 # add $acc4, $acc0 | |
1523 | adc $acc5, $acc1 | |
1524 | mov $acc4, %rdx | |
1525 | adc $acc6, $acc2 | |
1526 | adc $acc7, $acc3 | |
1527 | mov $acc1, $acc6 | |
1528 | adc \$0, %rax | |
1529 | ||
1530 | ################################# compare to modulus | |
1531 | sub 8*0($a_ptr), $acc4 | |
1532 | mov $acc2, $acc7 | |
1533 | sbb 8*1($a_ptr), $acc1 | |
1534 | sbb 8*2($a_ptr), $acc2 | |
1535 | mov $acc3, $acc0 | |
1536 | sbb 8*3($a_ptr), $acc3 | |
1537 | sbb \$0, %rax | |
1538 | ||
# CF clear means no borrow: replace the saved copies with the subtracted
# words.  Otherwise the unsubtracted copies stay in place.
1539 | cmovnc $acc4, %rdx | |
1540 | cmovnc $acc1, $acc6 | |
1541 | cmovnc $acc2, $acc7 | |
1542 | cmovnc $acc3, $acc0 | |
1543 | ||
1544 | dec $b_ptr | |
1545 | jnz .Loop_ord_sqrx | |
1546 | ||
# Store the result; the interleaved pxor instructions zero %xmm1-%xmm3, which
# held copies of operand words during the loop.
1547 | mov %rdx, 8*0($r_ptr) | |
1548 | mov $acc6, 8*1($r_ptr) | |
1549 | pxor %xmm1, %xmm1 | |
1550 | mov $acc7, 8*2($r_ptr) | |
1551 | pxor %xmm2, %xmm2 | |
1552 | mov $acc0, 8*3($r_ptr) | |
1553 | pxor %xmm3, %xmm3 | |
1554 | ||
# Epilogue: reload the six registers pushed in the prologue straight from
# the stack, then release the 48 bytes they occupied.
d5e11843 AP |
1555 | mov 0(%rsp),%r15 |
1556 | .cfi_restore %r15 | |
1557 | mov 8(%rsp),%r14 | |
1558 | .cfi_restore %r14 | |
1559 | mov 16(%rsp),%r13 | |
1560 | .cfi_restore %r13 | |
1561 | mov 24(%rsp),%r12 | |
1562 | .cfi_restore %r12 | |
1563 | mov 32(%rsp),%rbx | |
1564 | .cfi_restore %rbx | |
1565 | mov 40(%rsp),%rbp | |
1566 | .cfi_restore %rbp | |
1567 | lea 48(%rsp),%rsp | |
1568 | .cfi_adjust_cfa_offset -48 | |
1569 | .Lord_sqrx_epilogue: | |
eb791696 | 1570 | ret |
d5e11843 | 1571 | .cfi_endproc |
eb791696 AP |
1572 | .size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx |
1573 | ___ | |
1574 | ||
4d3fa06f AP |
1575 | $code.=<<___; |
1576 | ################################################################################ | |
1577 | # void ecp_nistz256_to_mont( | |
1578 | # uint64_t res[4], | |
1579 | # uint64_t in[4]); | |
# Implemented as a Montgomery multiplication of in[] by the constant stored
# at .LRR: the address of .LRR is placed in $b_org (the register that
# ecp_nistz256_mul_mont reads its third argument from) and control jumps to
# .Lmul_mont inside that function, whose epilogue returns to our caller.
# NOTE(review): .LRR is defined outside this excerpt; presumably 2^512 mod p
# - confirm in the data section.
1580 | .globl ecp_nistz256_to_mont | |
1581 | .type ecp_nistz256_to_mont,\@function,2 | |
1582 | .align 32 | |
1583 | ecp_nistz256_to_mont: | |
eff5076a | 1584 | .cfi_startproc |
4d3fa06f AP |
1585 | ___ |
1586 | $code.=<<___ if ($addx); | |
# The jump target .Lmul_mont lies after the capability probe in
# ecp_nistz256_mul_mont, so the same probe is repeated here.  %ecx keeps
# bits 8 and 19 (mask 0x80100) of the dword at OPENSSL_ia32cap_P+8, which
# OpenSSL documents as the BMI2 and ADX feature bits.
1587 | mov \$0x80100, %ecx | |
1588 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
1589 | ___ | |
1590 | $code.=<<___; | |
1591 | lea .LRR(%rip), $b_org | |
1592 | jmp .Lmul_mont | |
eff5076a | 1593 | .cfi_endproc |
4d3fa06f AP |
1594 | .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont |
1595 | ||
1596 | ################################################################################ | |
1597 | # void ecp_nistz256_mul_mont( | |
1598 | # uint64_t res[4], | |
1599 | # uint64_t a[4], | |
1600 | # uint64_t b[4]); | |
# Public entry point and dispatcher.  It saves six callee-saved registers,
# preloads the operands the way the selected helper expects, calls either
# __ecp_nistz256_mul_montq (mulq code) or, when $addx is set and both probed
# capability bits are present, __ecp_nistz256_mul_montx, then restores the
# registers.  The helper stores the result at $r_ptr.
1601 | ||
1602 | .globl ecp_nistz256_mul_mont | |
1603 | .type ecp_nistz256_mul_mont,\@function,3 | |
1604 | .align 32 | |
1605 | ecp_nistz256_mul_mont: | |
86e11278 | 1606 | .cfi_startproc |
4d3fa06f AP |
1607 | ___ |
1608 | $code.=<<___ if ($addx); | |
# %ecx = bits 8 and 19 (mask 0x80100) of the dword at OPENSSL_ia32cap_P+8,
# documented by OpenSSL as the BMI2 and ADX feature bits.
1609 | mov \$0x80100, %ecx | |
1610 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
1611 | ___ | |
1612 | $code.=<<___; | |
# .Lmul_mont is also the jump target of ecp_nistz256_to_mont, which arrives
# with %ecx and $b_org already set up.
1613 | .Lmul_mont: | |
1614 | push %rbp | |
86e11278 | 1615 | .cfi_push %rbp |
4d3fa06f | 1616 | push %rbx |
86e11278 | 1617 | .cfi_push %rbx |
4d3fa06f | 1618 | push %r12 |
86e11278 | 1619 | .cfi_push %r12 |
4d3fa06f | 1620 | push %r13 |
86e11278 | 1621 | .cfi_push %r13 |
4d3fa06f | 1622 | push %r14 |
86e11278 | 1623 | .cfi_push %r14 |
4d3fa06f | 1624 | push %r15 |
86e11278 | 1625 | .cfi_push %r15 |
384e6de4 | 1626 | .Lmul_body: |
4d3fa06f AP |
1627 | ___ |
1628 | $code.=<<___ if ($addx); | |
# Take the mulx/adcx/adox path only if both probed bits are set.
1629 | cmp \$0x80100, %ecx | |
1630 | je .Lmul_montx | |
1631 | ___ | |
1632 | $code.=<<___; | |
# Generic path: b pointer kept in $b_ptr, b[0] in %rax, a[0..3] in
# $acc1..$acc4.
1633 | mov $b_org, $b_ptr | |
1634 | mov 8*0($b_org), %rax | |
1635 | mov 8*0($a_ptr), $acc1 | |
1636 | mov 8*1($a_ptr), $acc2 | |
1637 | mov 8*2($a_ptr), $acc3 | |
1638 | mov 8*3($a_ptr), $acc4 | |
1639 | ||
1640 | call __ecp_nistz256_mul_montq | |
1641 | ___ | |
1642 | $code.=<<___ if ($addx); | |
1643 | jmp .Lmul_mont_done | |
1644 | ||
1645 | .align 32 | |
1646 | .Lmul_montx: | |
# Same preload, except that b[0] goes to %rdx (the implicit mulx operand) and
# $a_ptr is biased by -128.
# NOTE(review): the bias is presumably compensated by +128 displacements
# inside __ecp_nistz256_mul_montx, whose body is outside this excerpt.
1647 | mov $b_org, $b_ptr | |
1648 | mov 8*0($b_org), %rdx | |
1649 | mov 8*0($a_ptr), $acc1 | |
1650 | mov 8*1($a_ptr), $acc2 | |
1651 | mov 8*2($a_ptr), $acc3 | |
1652 | mov 8*3($a_ptr), $acc4 | |
1653 | lea -128($a_ptr), $a_ptr # control u-op density | |
1654 | ||
1655 | call __ecp_nistz256_mul_montx | |
1656 | ___ | |
1657 | $code.=<<___; | |
# Epilogue: reload the six saved registers from the stack and release the
# 48 bytes they occupied.
1658 | .Lmul_mont_done: | |
384e6de4 | 1659 | mov 0(%rsp),%r15 |
86e11278 | 1660 | .cfi_restore %r15 |
384e6de4 | 1661 | mov 8(%rsp),%r14 |
86e11278 | 1662 | .cfi_restore %r14 |
384e6de4 | 1663 | mov 16(%rsp),%r13 |
86e11278 | 1664 | .cfi_restore %r13 |
384e6de4 | 1665 | mov 24(%rsp),%r12 |
86e11278 | 1666 | .cfi_restore %r12 |
384e6de4 | 1667 | mov 32(%rsp),%rbx |
86e11278 | 1668 | .cfi_restore %rbx |
384e6de4 | 1669 | mov 40(%rsp),%rbp |
86e11278 | 1670 | .cfi_restore %rbp |
384e6de4 | 1671 | lea 48(%rsp),%rsp |
86e11278 | 1672 | .cfi_adjust_cfa_offset -48 |
384e6de4 | 1673 | .Lmul_epilogue: |
4d3fa06f | 1674 | ret |
86e11278 | 1675 | .cfi_endproc |
4d3fa06f AP |
1676 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont |
1677 | ||
# __ecp_nistz256_mul_montq: Montgomery multiplication modulo .Lpoly, written
# with plain mulq.
# In:  %rax = b[0], $acc1..$acc4 = a[0..3], $a_ptr and $b_ptr pointing at the
#      operands (re-read from memory for b[1..3]), $r_ptr pointing at the
#      output.
# Out: four result words stored at $r_ptr.
# No registers are saved here; the visible caller ecp_nistz256_mul_mont
# pushes %rbp, %rbx and %r12-%r15 before the call.
# The accumulator rotates: after each reduction step the word that was just
# cancelled is zeroed and reused as the new top carry word.
1678 | .type __ecp_nistz256_mul_montq,\@abi-omnipotent | |
1679 | .align 32 | |
1680 | __ecp_nistz256_mul_montq: | |
c0e8e500 | 1681 | .cfi_startproc |
4d3fa06f AP |
1682 | ######################################################################## |
1683 | # Multiply a by b[0] | |
1684 | mov %rax, $t1 | |
1685 | mulq $acc1 | |
1686 | mov .Lpoly+8*1(%rip),$poly1 | |
1687 | mov %rax, $acc0 | |
1688 | mov $t1, %rax | |
1689 | mov %rdx, $acc1 | |
1690 | ||
1691 | mulq $acc2 | |
1692 | mov .Lpoly+8*3(%rip),$poly3 | |
1693 | add %rax, $acc1 | |
1694 | mov $t1, %rax | |
1695 | adc \$0, %rdx | |
1696 | mov %rdx, $acc2 | |
1697 | ||
1698 | mulq $acc3 | |
1699 | add %rax, $acc2 | |
1700 | mov $t1, %rax | |
1701 | adc \$0, %rdx | |
1702 | mov %rdx, $acc3 | |
1703 | ||
1704 | mulq $acc4 | |
1705 | add %rax, $acc3 | |
1706 | mov $acc0, %rax | |
1707 | adc \$0, %rdx | |
1708 | xor $acc5, $acc5 | |
1709 | mov %rdx, $acc4 | |
1710 | ||
1711 | ######################################################################## | |
1712 | # First reduction step | |
1713 | # Basically now we want to multiply acc[0] by p256, | |
1714 | # and add the result to the acc. | |
1715 | # Due to the special form of p256 we do some optimizations | |
1716 | # | |
9e557ab2 AP |
1717 | # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] |
1718 | # then we add acc[0] and get acc[0] x 2^96 | |
4d3fa06f | 1719 | |
# acc[0] x 2^96 is added as (acc[0] << 32) into the next word and
# (acc[0] >> 32) into the one after; mulq leaves acc[0] x .Lpoly[3] in
# %rdx:%rax for the two words above that.  The load of b[1] is interleaved.
9e557ab2 AP |
1720 | mov $acc0, $t1 |
1721 | shl \$32, $acc0 | |
4d3fa06f | 1722 | mulq $poly3 |
9e557ab2 AP |
1723 | shr \$32, $t1 |
1724 | add $acc0, $acc1 # +=acc[0]<<96 | |
1725 | adc $t1, $acc2 | |
1726 | adc %rax, $acc3 | |
4d3fa06f AP |
1727 | mov 8*1($b_ptr), %rax |
1728 | adc %rdx, $acc4 | |
1729 | adc \$0, $acc5 | |
9e557ab2 | 1730 | xor $acc0, $acc0 |
4d3fa06f AP |
1731 | |
1732 | ######################################################################## | |
1733 | # Multiply by b[1] | |
1734 | mov %rax, $t1 | |
1735 | mulq 8*0($a_ptr) | |
1736 | add %rax, $acc1 | |
1737 | mov $t1, %rax | |
1738 | adc \$0, %rdx | |
1739 | mov %rdx, $t0 | |
1740 | ||
1741 | mulq 8*1($a_ptr) | |
1742 | add $t0, $acc2 | |
1743 | adc \$0, %rdx | |
1744 | add %rax, $acc2 | |
1745 | mov $t1, %rax | |
1746 | adc \$0, %rdx | |
1747 | mov %rdx, $t0 | |
1748 | ||
1749 | mulq 8*2($a_ptr) | |
1750 | add $t0, $acc3 | |
1751 | adc \$0, %rdx | |
1752 | add %rax, $acc3 | |
1753 | mov $t1, %rax | |
1754 | adc \$0, %rdx | |
1755 | mov %rdx, $t0 | |
1756 | ||
1757 | mulq 8*3($a_ptr) | |
1758 | add $t0, $acc4 | |
1759 | adc \$0, %rdx | |
1760 | add %rax, $acc4 | |
1761 | mov $acc1, %rax | |
1762 | adc %rdx, $acc5 | |
1763 | adc \$0, $acc0 | |
1764 | ||
1765 | ######################################################################## | |
609b0852 | 1766 | # Second reduction step |
9e557ab2 AP |
1767 | mov $acc1, $t1 |
1768 | shl \$32, $acc1 | |
4d3fa06f | 1769 | mulq $poly3 |
9e557ab2 AP |
1770 | shr \$32, $t1 |
1771 | add $acc1, $acc2 | |
1772 | adc $t1, $acc3 | |
1773 | adc %rax, $acc4 | |
4d3fa06f AP |
1774 | mov 8*2($b_ptr), %rax |
1775 | adc %rdx, $acc5 | |
1776 | adc \$0, $acc0 | |
9e557ab2 | 1777 | xor $acc1, $acc1 |
4d3fa06f AP |
1778 | |
1779 | ######################################################################## | |
1780 | # Multiply by b[2] | |
1781 | mov %rax, $t1 | |
1782 | mulq 8*0($a_ptr) | |
1783 | add %rax, $acc2 | |
1784 | mov $t1, %rax | |
1785 | adc \$0, %rdx | |
1786 | mov %rdx, $t0 | |
1787 | ||
1788 | mulq 8*1($a_ptr) | |
1789 | add $t0, $acc3 | |
1790 | adc \$0, %rdx | |
1791 | add %rax, $acc3 | |
1792 | mov $t1, %rax | |
1793 | adc \$0, %rdx | |
1794 | mov %rdx, $t0 | |
1795 | ||
1796 | mulq 8*2($a_ptr) | |
1797 | add $t0, $acc4 | |
1798 | adc \$0, %rdx | |
1799 | add %rax, $acc4 | |
1800 | mov $t1, %rax | |
1801 | adc \$0, %rdx | |
1802 | mov %rdx, $t0 | |
1803 | ||
1804 | mulq 8*3($a_ptr) | |
1805 | add $t0, $acc5 | |
1806 | adc \$0, %rdx | |
1807 | add %rax, $acc5 | |
1808 | mov $acc2, %rax | |
1809 | adc %rdx, $acc0 | |
1810 | adc \$0, $acc1 | |
1811 | ||
1812 | ######################################################################## | |
609b0852 | 1813 | # Third reduction step |
9e557ab2 AP |
1814 | mov $acc2, $t1 |
1815 | shl \$32, $acc2 | |
4d3fa06f | 1816 | mulq $poly3 |
9e557ab2 AP |
1817 | shr \$32, $t1 |
1818 | add $acc2, $acc3 | |
1819 | adc $t1, $acc4 | |
1820 | adc %rax, $acc5 | |
4d3fa06f AP |
1821 | mov 8*3($b_ptr), %rax |
1822 | adc %rdx, $acc0 | |
1823 | adc \$0, $acc1 | |
9e557ab2 | 1824 | xor $acc2, $acc2 |
4d3fa06f AP |
1825 | |
1826 | ######################################################################## | |
1827 | # Multiply by b[3] | |
1828 | mov %rax, $t1 | |
1829 | mulq 8*0($a_ptr) | |
1830 | add %rax, $acc3 | |
1831 | mov $t1, %rax | |
1832 | adc \$0, %rdx | |
1833 | mov %rdx, $t0 | |
1834 | ||
1835 | mulq 8*1($a_ptr) | |
1836 | add $t0, $acc4 | |
1837 | adc \$0, %rdx | |
1838 | add %rax, $acc4 | |
1839 | mov $t1, %rax | |
1840 | adc \$0, %rdx | |
1841 | mov %rdx, $t0 | |
1842 | ||
1843 | mulq 8*2($a_ptr) | |
1844 | add $t0, $acc5 | |
1845 | adc \$0, %rdx | |
1846 | add %rax, $acc5 | |
1847 | mov $t1, %rax | |
1848 | adc \$0, %rdx | |
1849 | mov %rdx, $t0 | |
1850 | ||
1851 | mulq 8*3($a_ptr) | |
1852 | add $t0, $acc0 | |
1853 | adc \$0, %rdx | |
1854 | add %rax, $acc0 | |
1855 | mov $acc3, %rax | |
1856 | adc %rdx, $acc1 | |
1857 | adc \$0, $acc2 | |
1858 | ||
1859 | ######################################################################## | |
609b0852 | 1860 | # Final reduction step |
# The mov instructions interleaved below start saving the unsubtracted
# result words for the conditional subtraction that follows.
9e557ab2 AP |
1861 | mov $acc3, $t1 |
1862 | shl \$32, $acc3 | |
4d3fa06f | 1863 | mulq $poly3 |
9e557ab2 AP |
1864 | shr \$32, $t1 |
1865 | add $acc3, $acc4 | |
1866 | adc $t1, $acc5 | |
4d3fa06f | 1867 | mov $acc4, $t0 |
9e557ab2 | 1868 | adc %rax, $acc0 |
4d3fa06f AP |
1869 | adc %rdx, $acc1 |
1870 | mov $acc5, $t1 | |
1871 | adc \$0, $acc2 | |
1872 | ||
609b0852 | 1873 | ######################################################################## |
4d3fa06f AP |
1874 | # Branch-less conditional subtraction of P |
1875 | sub \$-1, $acc4 # .Lpoly[0] | |
1876 | mov $acc0, $t2 | |
1877 | sbb $poly1, $acc5 # .Lpoly[1] | |
1878 | sbb \$0, $acc0 # .Lpoly[2] | |
1879 | mov $acc1, $t3 | |
1880 | sbb $poly3, $acc1 # .Lpoly[3] | |
9e557ab2 | 1881 | sbb \$0, $acc2 |
4d3fa06f | 1882 | |
# CF set means the subtraction borrowed: put back the saved unsubtracted
# words.  The stores to $r_ptr are interleaved with the cmovc instructions.
9e557ab2 AP |
1883 | cmovc $t0, $acc4 |
1884 | cmovc $t1, $acc5 | |
4d3fa06f | 1885 | mov $acc4, 8*0($r_ptr) |
9e557ab2 | 1886 | cmovc $t2, $acc0 |
4d3fa06f | 1887 | mov $acc5, 8*1($r_ptr) |
9e557ab2 | 1888 | cmovc $t3, $acc1 |
4d3fa06f AP |
1889 | mov $acc0, 8*2($r_ptr) |
1890 | mov $acc1, 8*3($r_ptr) | |
1891 | ||
1892 | ret | |
c0e8e500 | 1893 | .cfi_endproc |
4d3fa06f AP |
1894 | .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq |
1895 | ||
1896 | ################################################################################ | |
1897 | # void ecp_nistz256_sqr_mont( | |
1898 | # uint64_t res[4], | |
1899 | # uint64_t a[4]); | |
# Public entry point and dispatcher.  It saves six callee-saved registers,
# preloads a[0..3], calls either __ecp_nistz256_sqr_montq (mulq code) or,
# when $addx is set and both probed capability bits are present,
# __ecp_nistz256_sqr_montx, then restores the registers.  The helper stores
# the result at $r_ptr.
1900 | ||
1901 | # we optimize the square according to S.Gueron and V.Krasnov, | |
1902 | # "Speeding up Big-Number Squaring" | |
1903 | .globl ecp_nistz256_sqr_mont | |
1904 | .type ecp_nistz256_sqr_mont,\@function,2 | |
1905 | .align 32 | |
1906 | ecp_nistz256_sqr_mont: | |
86e11278 | 1907 | .cfi_startproc |
4d3fa06f AP |
1908 | ___ |
1909 | $code.=<<___ if ($addx); | |
# %ecx = bits 8 and 19 (mask 0x80100) of the dword at OPENSSL_ia32cap_P+8,
# documented by OpenSSL as the BMI2 and ADX feature bits.
1910 | mov \$0x80100, %ecx | |
1911 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
1912 | ___ | |
1913 | $code.=<<___; | |
1914 | push %rbp | |
86e11278 | 1915 | .cfi_push %rbp |
4d3fa06f | 1916 | push %rbx |
86e11278 | 1917 | .cfi_push %rbx |
4d3fa06f | 1918 | push %r12 |
86e11278 | 1919 | .cfi_push %r12 |
4d3fa06f | 1920 | push %r13 |
86e11278 | 1921 | .cfi_push %r13 |
4d3fa06f | 1922 | push %r14 |
86e11278 | 1923 | .cfi_push %r14 |
4d3fa06f | 1924 | push %r15 |
86e11278 | 1925 | .cfi_push %r15 |
384e6de4 | 1926 | .Lsqr_body: |
4d3fa06f AP |
1927 | ___ |
1928 | $code.=<<___ if ($addx); | |
# Take the mulx/adcx/adox path only if both probed bits are set.
1929 | cmp \$0x80100, %ecx | |
1930 | je .Lsqr_montx | |
1931 | ___ | |
1932 | $code.=<<___; | |
# Generic path: a[0] in %rax, a[1] in $acc6, a[2] in $acc7, a[3] in $acc0.
1933 | mov 8*0($a_ptr), %rax | |
1934 | mov 8*1($a_ptr), $acc6 | |
1935 | mov 8*2($a_ptr), $acc7 | |
1936 | mov 8*3($a_ptr), $acc0 | |
1937 | ||
1938 | call __ecp_nistz256_sqr_montq | |
1939 | ___ | |
1940 | $code.=<<___ if ($addx); | |
1941 | jmp .Lsqr_mont_done | |
1942 | ||
1943 | .align 32 | |
1944 | .Lsqr_montx: | |
# Same preload, except that a[0] goes to %rdx (the implicit mulx operand) and
# $a_ptr is biased by -128.
# NOTE(review): the bias is presumably compensated inside
# __ecp_nistz256_sqr_montx, which is outside this excerpt.
1945 | mov 8*0($a_ptr), %rdx | |
1946 | mov 8*1($a_ptr), $acc6 | |
1947 | mov 8*2($a_ptr), $acc7 | |
1948 | mov 8*3($a_ptr), $acc0 | |
1949 | lea -128($a_ptr), $a_ptr # control u-op density | |
1950 | ||
1951 | call __ecp_nistz256_sqr_montx | |
1952 | ___ | |
1953 | $code.=<<___; | |
# Epilogue: reload the six saved registers from the stack and release the
# 48 bytes they occupied.
1954 | .Lsqr_mont_done: | |
384e6de4 | 1955 | mov 0(%rsp),%r15 |
86e11278 | 1956 | .cfi_restore %r15 |
384e6de4 | 1957 | mov 8(%rsp),%r14 |
86e11278 | 1958 | .cfi_restore %r14 |
384e6de4 | 1959 | mov 16(%rsp),%r13 |
86e11278 | 1960 | .cfi_restore %r13 |
384e6de4 | 1961 | mov 24(%rsp),%r12 |
86e11278 | 1962 | .cfi_restore %r12 |
384e6de4 | 1963 | mov 32(%rsp),%rbx |
86e11278 | 1964 | .cfi_restore %rbx |
384e6de4 | 1965 | mov 40(%rsp),%rbp |
86e11278 | 1966 | .cfi_restore %rbp |
384e6de4 | 1967 | lea 48(%rsp),%rsp |
86e11278 | 1968 | .cfi_adjust_cfa_offset -48 |
384e6de4 | 1969 | .Lsqr_epilogue: |
4d3fa06f | 1970 | ret |
86e11278 | 1971 | .cfi_endproc |
4d3fa06f AP |
1972 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont |
1973 | ||
# __ecp_nistz256_sqr_montq: Montgomery squaring modulo .Lpoly, written with
# plain mulq.
# In:  %rax = a[0], $acc6 = a[1], $acc7 = a[2], $acc0 = a[3], $a_ptr pointing
#      at a[] (re-read for the a[i]*a[i] terms), $r_ptr pointing at the
#      output.
# Out: four result words stored at $r_ptr.
# Clobbers $a_ptr: after the last read of a[] it is reused to hold .Lpoly[1].
# No registers are saved here; the visible caller ecp_nistz256_sqr_mont
# pushes %rbp, %rbx and %r12-%r15 before the call.
# Flow: off-diagonal products, doubling, adding the squares, four reduction
# iterations, adding the upper half, then a conditional subtraction.
1974 | .type __ecp_nistz256_sqr_montq,\@abi-omnipotent | |
1975 | .align 32 | |
1976 | __ecp_nistz256_sqr_montq: | |
c0e8e500 | 1977 | .cfi_startproc |
4d3fa06f AP |
1978 | mov %rax, $acc5 |
1979 | mulq $acc6 # a[1]*a[0] | |
1980 | mov %rax, $acc1 | |
1981 | mov $acc7, %rax | |
1982 | mov %rdx, $acc2 | |
1983 | ||
1984 | mulq $acc5 # a[0]*a[2] | |
1985 | add %rax, $acc2 | |
1986 | mov $acc0, %rax | |
1987 | adc \$0, %rdx | |
1988 | mov %rdx, $acc3 | |
1989 | ||
1990 | mulq $acc5 # a[0]*a[3] | |
1991 | add %rax, $acc3 | |
1992 | mov $acc7, %rax | |
1993 | adc \$0, %rdx | |
1994 | mov %rdx, $acc4 | |
1995 | ||
1996 | ################################# | |
1997 | mulq $acc6 # a[1]*a[2] | |
1998 | add %rax, $acc3 | |
1999 | mov $acc0, %rax | |
2000 | adc \$0, %rdx | |
2001 | mov %rdx, $t1 | |
2002 | ||
2003 | mulq $acc6 # a[1]*a[3] | |
2004 | add %rax, $acc4 | |
2005 | mov $acc0, %rax | |
2006 | adc \$0, %rdx | |
2007 | add $t1, $acc4 | |
2008 | mov %rdx, $acc5 | |
2009 | adc \$0, $acc5 | |
2010 | ||
2011 | ################################# | |
2012 | mulq $acc7 # a[2]*a[3] | |
2013 | xor $acc7, $acc7 | |
2014 | add %rax, $acc5 | |
2015 | mov 8*0($a_ptr), %rax | |
2016 | mov %rdx, $acc6 | |
2017 | adc \$0, $acc6 | |
2018 | ||
# Double the off-diagonal sum; the bit shifted out of the top is collected in
# $acc7, which was zeroed above.
2019 | add $acc1, $acc1 # acc1:6<<1 | |
2020 | adc $acc2, $acc2 | |
2021 | adc $acc3, $acc3 | |
2022 | adc $acc4, $acc4 | |
2023 | adc $acc5, $acc5 | |
2024 | adc $acc6, $acc6 | |
2025 | adc \$0, $acc7 | |
2026 | ||
# Add the squares a[i]*a[i], with each a[i] reloaded from memory; the low
# word of a[0]*a[0] becomes $acc0.
2027 | mulq %rax | |
2028 | mov %rax, $acc0 | |
2029 | mov 8*1($a_ptr), %rax | |
2030 | mov %rdx, $t0 | |
2031 | ||
2032 | mulq %rax | |
2033 | add $t0, $acc1 | |
2034 | adc %rax, $acc2 | |
2035 | mov 8*2($a_ptr), %rax | |
2036 | adc \$0, %rdx | |
2037 | mov %rdx, $t0 | |
2038 | ||
2039 | mulq %rax | |
2040 | add $t0, $acc3 | |
2041 | adc %rax, $acc4 | |
2042 | mov 8*3($a_ptr), %rax | |
2043 | adc \$0, %rdx | |
2044 | mov %rdx, $t0 | |
2045 | ||
2046 | mulq %rax | |
2047 | add $t0, $acc5 | |
2048 | adc %rax, $acc6 | |
2049 | mov $acc0, %rax | |
2050 | adc %rdx, $acc7 | |
2051 | ||
# a[] is not read again: $a_ptr is reused for .Lpoly[1] and $t1 holds
# .Lpoly[3].
2052 | mov .Lpoly+8*1(%rip), $a_ptr | |
2053 | mov .Lpoly+8*3(%rip), $t1 | |
2054 | ||
2055 | ########################################## | |
2056 | # Now the reduction | |
2057 | # First iteration | |
# Each iteration adds (low word << 32) and (low word >> 32) into the next two
# words and low word x .Lpoly[3] (from mulq, in %rdx:%rax) above them.  The
# cancelled low word's register is then reused for the new top word.
9e557ab2 AP |
2058 | mov $acc0, $t0 |
2059 | shl \$32, $acc0 | |
4d3fa06f | 2060 | mulq $t1 |
9e557ab2 AP |
2061 | shr \$32, $t0 |
2062 | add $acc0, $acc1 # +=acc[0]<<96 | |
2063 | adc $t0, $acc2 | |
2064 | adc %rax, $acc3 | |
4d3fa06f | 2065 | mov $acc1, %rax |
9e557ab2 | 2066 | adc \$0, %rdx |
4d3fa06f AP |
2067 | |
2068 | ########################################## | |
2069 | # Second iteration | |
9e557ab2 AP |
2070 | mov $acc1, $t0 |
2071 | shl \$32, $acc1 | |
2072 | mov %rdx, $acc0 | |
4d3fa06f | 2073 | mulq $t1 |
9e557ab2 AP |
2074 | shr \$32, $t0 |
2075 | add $acc1, $acc2 | |
2076 | adc $t0, $acc3 | |
2077 | adc %rax, $acc0 | |
4d3fa06f | 2078 | mov $acc2, %rax |
9e557ab2 | 2079 | adc \$0, %rdx |
4d3fa06f AP |
2080 | |
2081 | ########################################## | |
2082 | # Third iteration | |
9e557ab2 AP |
2083 | mov $acc2, $t0 |
2084 | shl \$32, $acc2 | |
2085 | mov %rdx, $acc1 | |
4d3fa06f | 2086 | mulq $t1 |
9e557ab2 AP |
2087 | shr \$32, $t0 |
2088 | add $acc2, $acc3 | |
2089 | adc $t0, $acc0 | |
2090 | adc %rax, $acc1 | |
4d3fa06f | 2091 | mov $acc3, %rax |
9e557ab2 | 2092 | adc \$0, %rdx |
4d3fa06f AP |
2093 | |
2094 | ########################################### | |
2095 | # Last iteration | |
9e557ab2 AP |
2096 | mov $acc3, $t0 |
2097 | shl \$32, $acc3 | |
2098 | mov %rdx, $acc2 | |
4d3fa06f | 2099 | mulq $t1 |
9e557ab2 AP |
2100 | shr \$32, $t0 |
2101 | add $acc3, $acc0 | |
2102 | adc $t0, $acc1 | |
2103 | adc %rax, $acc2 | |
2104 | adc \$0, %rdx | |
4d3fa06f | 2105 | xor $acc3, $acc3 |
4d3fa06f AP |
2106 | |
2107 | ############################################ | |
2108 | # Add the rest of the acc | |
# The reduced low half ($acc0,$acc1,$acc2,%rdx) is added into the upper half
# ($acc4..$acc7); $acc3 collects the carry.  The interleaved mov instructions
# save unsubtracted copies for the conditional subtraction below.
9e557ab2 AP |
2109 | add $acc0, $acc4 |
2110 | adc $acc1, $acc5 | |
4d3fa06f | 2111 | mov $acc4, $acc0 |
9e557ab2 AP |
2112 | adc $acc2, $acc6 |
2113 | adc %rdx, $acc7 | |
4d3fa06f AP |
2114 | mov $acc5, $acc1 |
2115 | adc \$0, $acc3 | |
2116 | ||
2117 | sub \$-1, $acc4 # .Lpoly[0] | |
2118 | mov $acc6, $acc2 | |
2119 | sbb $a_ptr, $acc5 # .Lpoly[1] | |
2120 | sbb \$0, $acc6 # .Lpoly[2] | |
2121 | mov $acc7, $t0 | |
2122 | sbb $t1, $acc7 # .Lpoly[3] | |
9e557ab2 | 2123 | sbb \$0, $acc3 |
4d3fa06f | 2124 | |
# CF set means the subtraction borrowed: put back the saved unsubtracted
# words.  The stores to $r_ptr are interleaved with the cmovc instructions.
9e557ab2 AP |
2125 | cmovc $acc0, $acc4 |
2126 | cmovc $acc1, $acc5 | |
4d3fa06f | 2127 | mov $acc4, 8*0($r_ptr) |
9e557ab2 | 2128 | cmovc $acc2, $acc6 |
4d3fa06f | 2129 | mov $acc5, 8*1($r_ptr) |
9e557ab2 | 2130 | cmovc $t0, $acc7 |
4d3fa06f AP |
2131 | mov $acc6, 8*2($r_ptr) |
2132 | mov $acc7, 8*3($r_ptr) | |
2133 | ||
2134 | ret | |
c0e8e500 | 2135 | .cfi_endproc |
4d3fa06f AP |
2136 | .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq |
2137 | ___ | |
2138 | ||
2139 | if ($addx) { | |
2140 | $code.=<<___; | |
2141 | .type __ecp_nistz256_mul_montx,\@abi-omnipotent | |
2142 | .align 32 | |
2143 | __ecp_nistz256_mul_montx: | |
c0e8e500 | 2144 | .cfi_startproc |
4d3fa06f AP |
2145 | ######################################################################## |
2146 | # Multiply by b[0] | |
2147 | mulx $acc1, $acc0, $acc1 | |
2148 | mulx $acc2, $t0, $acc2 | |
2149 | mov \$32, $poly1 | |
2150 | xor $acc5, $acc5 # cf=0 | |
2151 | mulx $acc3, $t1, $acc3 | |
2152 | mov .Lpoly+8*3(%rip), $poly3 | |
2153 | adc $t0, $acc1 | |
2154 | mulx $acc4, $t0, $acc4 | |
2155 | mov $acc0, %rdx | |
2156 | adc $t1, $acc2 | |
2157 | shlx $poly1,$acc0,$t1 | |
2158 | adc $t0, $acc3 | |
2159 | shrx $poly1,$acc0,$t0 | |
2160 | adc \$0, $acc4 | |
2161 | ||
2162 | ######################################################################## | |
2163 | # First reduction step | |
9e557ab2 AP |
2164 | add $t1, $acc1 |
2165 | adc $t0, $acc2 | |
4d3fa06f AP |
2166 | |
2167 | mulx $poly3, $t0, $t1 | |
2168 | mov 8*1($b_ptr), %rdx | |
9e557ab2 AP |
2169 | adc $t0, $acc3 |
2170 | adc $t1, $acc4 | |
2171 | adc \$0, $acc5 | |
2172 | xor $acc0, $acc0 # $acc0=0,cf=0,of=0 | |
4d3fa06f AP |
2173 | |
2174 | ######################################################################## | |
2175 | # Multiply by b[1] | |
2176 | mulx 8*0+128($a_ptr), $t0, $t1 | |
2177 | adcx $t0, $acc1 | |
2178 | adox $t1, $acc2 | |
2179 | ||
2180 | mulx 8*1+128($a_ptr), $t0, $t1 | |
2181 | adcx $t0, $acc2 | |
2182 | adox $t1, $acc3 | |
2183 | ||
2184 | mulx 8*2+128($a_ptr), $t0, $t1 | |
2185 | adcx $t0, $acc3 | |
2186 | adox $t1, $acc4 | |
2187 | ||
2188 | mulx 8*3+128($a_ptr), $t0, $t1 | |
2189 | mov $acc1, %rdx | |
2190 | adcx $t0, $acc4 | |
2191 | shlx $poly1, $acc1, $t0 | |
2192 | adox $t1, $acc5 | |
2193 | shrx $poly1, $acc1, $t1 | |
2194 | ||
2195 | adcx $acc0, $acc5 | |
2196 | adox $acc0, $acc0 | |
2197 | adc \$0, $acc0 | |
2198 | ||
2199 | ######################################################################## | |
2200 | # Second reduction step | |
9e557ab2 AP |
2201 | add $t0, $acc2 |
2202 | adc $t1, $acc3 | |
4d3fa06f AP |
2203 | |
2204 | mulx $poly3, $t0, $t1 | |
2205 | mov 8*2($b_ptr), %rdx | |
9e557ab2 AP |
2206 | adc $t0, $acc4 |
2207 | adc $t1, $acc5 | |
2208 | adc \$0, $acc0 | |
2209 | xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 | |
4d3fa06f AP |
2210 | |
2211 | ######################################################################## | |
2212 | # Multiply by b[2] | |
2213 | mulx 8*0+128($a_ptr), $t0, $t1 | |
2214 | adcx $t0, $acc2 | |
2215 | adox $t1, $acc3 | |
2216 | ||
2217 | mulx 8*1+128($a_ptr), $t0, $t1 | |
2218 | adcx $t0, $acc3 | |
2219 | adox $t1, $acc4 | |
2220 | ||
2221 | mulx 8*2+128($a_ptr), $t0, $t1 | |
2222 | adcx $t0, $acc4 | |
2223 | adox $t1, $acc5 | |
2224 | ||
2225 | mulx 8*3+128($a_ptr), $t0, $t1 | |
2226 | mov $acc2, %rdx | |
2227 | adcx $t0, $acc5 | |
2228 | shlx $poly1, $acc2, $t0 | |
2229 | adox $t1, $acc0 | |
2230 | shrx $poly1, $acc2, $t1 | |
2231 | ||
2232 | adcx $acc1, $acc0 | |
2233 | adox $acc1, $acc1 | |
2234 | adc \$0, $acc1 | |
2235 | ||
2236 | ######################################################################## | |
2237 | # Third reduction step | |
9e557ab2 AP |
2238 | add $t0, $acc3 |
2239 | adc $t1, $acc4 | |
4d3fa06f AP |
2240 | |
2241 | mulx $poly3, $t0, $t1 | |
2242 | mov 8*3($b_ptr), %rdx | |
9e557ab2 AP |
2243 | adc $t0, $acc5 |
2244 | adc $t1, $acc0 | |
2245 | adc \$0, $acc1 | |
2246 | xor $acc2, $acc2 # $acc2=0,cf=0,of=0 | |
4d3fa06f AP |
2247 | |
2248 | ######################################################################## | |
2249 | # Multiply by b[3] | |
2250 | mulx 8*0+128($a_ptr), $t0, $t1 | |
2251 | adcx $t0, $acc3 | |
2252 | adox $t1, $acc4 | |
2253 | ||
2254 | mulx 8*1+128($a_ptr), $t0, $t1 | |
2255 | adcx $t0, $acc4 | |
2256 | adox $t1, $acc5 | |
2257 | ||
2258 | mulx 8*2+128($a_ptr), $t0, $t1 | |
2259 | adcx $t0, $acc5 | |
2260 | adox $t1, $acc0 | |
2261 | ||
2262 | mulx 8*3+128($a_ptr), $t0, $t1 | |
2263 | mov $acc3, %rdx | |
2264 | adcx $t0, $acc0 | |
2265 | shlx $poly1, $acc3, $t0 | |
2266 | adox $t1, $acc1 | |
2267 | shrx $poly1, $acc3, $t1 | |
2268 | ||
2269 | adcx $acc2, $acc1 | |
2270 | adox $acc2, $acc2 | |
2271 | adc \$0, $acc2 | |
2272 | ||
2273 | ######################################################################## | |
2274 | # Fourth reduction step | |
9e557ab2 AP |
2275 | add $t0, $acc4 |
2276 | adc $t1, $acc5 | |
4d3fa06f AP |
2277 | |
2278 | mulx $poly3, $t0, $t1 | |
2279 | mov $acc4, $t2 | |
2280 | mov .Lpoly+8*1(%rip), $poly1 | |
9e557ab2 | 2281 | adc $t0, $acc0 |
4d3fa06f | 2282 | mov $acc5, $t3 |
9e557ab2 | 2283 | adc $t1, $acc1 |
4d3fa06f | 2284 | adc \$0, $acc2 |
4d3fa06f AP |
2285 | |
2286 | ######################################################################## | |
2287 | # Branch-less conditional subtraction of P | |
2288 | xor %eax, %eax | |
9e557ab2 | 2289 | mov $acc0, $t0 |
4d3fa06f AP |
2290 | sbb \$-1, $acc4 # .Lpoly[0] |
2291 | sbb $poly1, $acc5 # .Lpoly[1] | |
2292 | sbb \$0, $acc0 # .Lpoly[2] | |
2293 | mov $acc1, $t1 | |
2294 | sbb $poly3, $acc1 # .Lpoly[3] | |
9e557ab2 | 2295 | sbb \$0, $acc2 |
4d3fa06f | 2296 | |
9e557ab2 AP |
2297 | cmovc $t2, $acc4 |
2298 | cmovc $t3, $acc5 | |
4d3fa06f | 2299 | mov $acc4, 8*0($r_ptr) |
9e557ab2 | 2300 | cmovc $t0, $acc0 |
4d3fa06f | 2301 | mov $acc5, 8*1($r_ptr) |
9e557ab2 | 2302 | cmovc $t1, $acc1 |
4d3fa06f AP |
2303 | mov $acc0, 8*2($r_ptr) |
2304 | mov $acc1, 8*3($r_ptr) | |
2305 | ||
2306 | ret | |
c0e8e500 | 2307 | .cfi_endproc |
4d3fa06f AP |
2308 | .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx |
2309 | ||
2310 | .type __ecp_nistz256_sqr_montx,\@abi-omnipotent | |
2311 | .align 32 | |
2312 | __ecp_nistz256_sqr_montx: | |
c0e8e500 | 2313 | .cfi_startproc |
4d3fa06f AP |
2314 | mulx $acc6, $acc1, $acc2 # a[0]*a[1] |
2315 | mulx $acc7, $t0, $acc3 # a[0]*a[2] | |
2316 | xor %eax, %eax | |
2317 | adc $t0, $acc2 | |
2318 | mulx $acc0, $t1, $acc4 # a[0]*a[3] | |
2319 | mov $acc6, %rdx | |
2320 | adc $t1, $acc3 | |
2321 | adc \$0, $acc4 | |
2322 | xor $acc5, $acc5 # $acc5=0,cf=0,of=0 | |
2323 | ||
2324 | ################################# | |
2325 | mulx $acc7, $t0, $t1 # a[1]*a[2] | |
2326 | adcx $t0, $acc3 | |
2327 | adox $t1, $acc4 | |
2328 | ||
2329 | mulx $acc0, $t0, $t1 # a[1]*a[3] | |
2330 | mov $acc7, %rdx | |
2331 | adcx $t0, $acc4 | |
2332 | adox $t1, $acc5 | |
2333 | adc \$0, $acc5 | |
2334 | ||
2335 | ################################# | |
2336 | mulx $acc0, $t0, $acc6 # a[2]*a[3] | |
2337 | mov 8*0+128($a_ptr), %rdx | |
2338 | xor $acc7, $acc7 # $acc7=0,cf=0,of=0 | |
2339 | adcx $acc1, $acc1 # acc1:6<<1 | |
2340 | adox $t0, $acc5 | |
2341 | adcx $acc2, $acc2 | |
2342 | adox $acc7, $acc6 # of=0 | |
2343 | ||
2344 | mulx %rdx, $acc0, $t1 | |
2345 | mov 8*1+128($a_ptr), %rdx | |
2346 | adcx $acc3, $acc3 | |
2347 | adox $t1, $acc1 | |
2348 | adcx $acc4, $acc4 | |
2349 | mulx %rdx, $t0, $t4 | |
2350 | mov 8*2+128($a_ptr), %rdx | |
2351 | adcx $acc5, $acc5 | |
2352 | adox $t0, $acc2 | |
2353 | adcx $acc6, $acc6 | |
2354 | .byte 0x67 | |
2355 | mulx %rdx, $t0, $t1 | |
2356 | mov 8*3+128($a_ptr), %rdx | |
2357 | adox $t4, $acc3 | |
2358 | adcx $acc7, $acc7 | |
2359 | adox $t0, $acc4 | |
2360 | mov \$32, $a_ptr | |
2361 | adox $t1, $acc5 | |
2362 | .byte 0x67,0x67 | |
2363 | mulx %rdx, $t0, $t4 | |
8fc063dc | 2364 | mov .Lpoly+8*3(%rip), %rdx |
4d3fa06f AP |
2365 | adox $t0, $acc6 |
2366 | shlx $a_ptr, $acc0, $t0 | |
2367 | adox $t4, $acc7 | |
2368 | shrx $a_ptr, $acc0, $t4 | |
8fc063dc | 2369 | mov %rdx,$t1 |
4d3fa06f AP |
2370 | |
2371 | # reduction step 1 | |
9e557ab2 AP |
2372 | add $t0, $acc1 |
2373 | adc $t4, $acc2 | |
4d3fa06f | 2374 | |
8fc063dc | 2375 | mulx $acc0, $t0, $acc0 |
9e557ab2 | 2376 | adc $t0, $acc3 |
4d3fa06f | 2377 | shlx $a_ptr, $acc1, $t0 |
4d3fa06f | 2378 | adc \$0, $acc0 |
9e557ab2 | 2379 | shrx $a_ptr, $acc1, $t4 |
4d3fa06f AP |
2380 | |
2381 | # reduction step 2 | |
9e557ab2 AP |
2382 | add $t0, $acc2 |
2383 | adc $t4, $acc3 | |
4d3fa06f | 2384 | |
8fc063dc | 2385 | mulx $acc1, $t0, $acc1 |
9e557ab2 | 2386 | adc $t0, $acc0 |
4d3fa06f | 2387 | shlx $a_ptr, $acc2, $t0 |
4d3fa06f | 2388 | adc \$0, $acc1 |
9e557ab2 | 2389 | shrx $a_ptr, $acc2, $t4 |
4d3fa06f AP |
2390 | |
2391 | # reduction step 3 | |
9e557ab2 AP |
2392 | add $t0, $acc3 |
2393 | adc $t4, $acc0 | |
4d3fa06f | 2394 | |
8fc063dc | 2395 | mulx $acc2, $t0, $acc2 |
9e557ab2 | 2396 | adc $t0, $acc1 |
4d3fa06f | 2397 | shlx $a_ptr, $acc3, $t0 |
4d3fa06f | 2398 | adc \$0, $acc2 |
9e557ab2 | 2399 | shrx $a_ptr, $acc3, $t4 |
4d3fa06f AP |
2400 | |
2401 | # reduction step 4 | |
9e557ab2 AP |
2402 | add $t0, $acc0 |
2403 | adc $t4, $acc1 | |
4d3fa06f | 2404 | |
8fc063dc | 2405 | mulx $acc3, $t0, $acc3 |
9e557ab2 | 2406 | adc $t0, $acc2 |
4d3fa06f AP |
2407 | adc \$0, $acc3 |
2408 | ||
8fc063dc AP |
2409 | xor $t3, $t3 |
2410 | add $acc0, $acc4 # accumulate upper half | |
4d3fa06f AP |
2411 | mov .Lpoly+8*1(%rip), $a_ptr |
2412 | adc $acc1, $acc5 | |
2413 | mov $acc4, $acc0 | |
2414 | adc $acc2, $acc6 | |
2415 | adc $acc3, $acc7 | |
2416 | mov $acc5, $acc1 | |
2417 | adc \$0, $t3 | |
2418 | ||
8fc063dc | 2419 | sub \$-1, $acc4 # .Lpoly[0] |
4d3fa06f AP |
2420 | mov $acc6, $acc2 |
2421 | sbb $a_ptr, $acc5 # .Lpoly[1] | |
2422 | sbb \$0, $acc6 # .Lpoly[2] | |
2423 | mov $acc7, $acc3 | |
2424 | sbb $t1, $acc7 # .Lpoly[3] | |
9e557ab2 | 2425 | sbb \$0, $t3 |
4d3fa06f | 2426 | |
9e557ab2 AP |
2427 | cmovc $acc0, $acc4 |
2428 | cmovc $acc1, $acc5 | |
4d3fa06f | 2429 | mov $acc4, 8*0($r_ptr) |
9e557ab2 | 2430 | cmovc $acc2, $acc6 |
4d3fa06f | 2431 | mov $acc5, 8*1($r_ptr) |
9e557ab2 | 2432 | cmovc $acc3, $acc7 |
4d3fa06f AP |
2433 | mov $acc6, 8*2($r_ptr) |
2434 | mov $acc7, 8*3($r_ptr) | |
2435 | ||
2436 | ret | |
c0e8e500 | 2437 | .cfi_endproc |
4d3fa06f AP |
2438 | .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx |
2439 | ___ | |
2440 | } | |
2441 | } | |
2442 | { | |
2443 | my ($r_ptr,$in_ptr)=("%rdi","%rsi"); | |
9e557ab2 AP |
2444 | my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); |
2445 | my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); | |
4d3fa06f AP |
2446 | |
2447 | $code.=<<___; | |
2448 | ################################################################################ | |
2449 | # void ecp_nistz256_from_mont( | |
2450 | # uint64_t res[4], | |
2451 | # uint64_t in[4]); | |
2452 | # This one performs Montgomery multiplication by 1, so we only need the reduction | |
2453 | ||
2454 | .globl ecp_nistz256_from_mont | |
2455 | .type ecp_nistz256_from_mont,\@function,2 | |
2456 | .align 32 | |
2457 | ecp_nistz256_from_mont: | |
86e11278 | 2458 | .cfi_startproc |
4d3fa06f | 2459 | push %r12 |
86e11278 | 2460 | .cfi_push %r12 |
4d3fa06f | 2461 | push %r13 |
86e11278 | 2462 | .cfi_push %r13 |
384e6de4 | 2463 | .Lfrom_body: |
4d3fa06f AP |
2464 | |
2465 | mov 8*0($in_ptr), %rax | |
9e557ab2 | 2466 | mov .Lpoly+8*3(%rip), $t2 |
4d3fa06f AP |
2467 | mov 8*1($in_ptr), $acc1 |
2468 | mov 8*2($in_ptr), $acc2 | |
2469 | mov 8*3($in_ptr), $acc3 | |
4d3fa06f | 2470 | mov %rax, $acc0 |
9e557ab2 | 2471 | mov .Lpoly+8*1(%rip), $t1 |
4d3fa06f AP |
2472 | |
2473 | ######################################### | |
2474 | # First iteration | |
9e557ab2 AP |
2475 | mov %rax, $t0 |
2476 | shl \$32, $acc0 | |
2477 | mulq $t2 | |
2478 | shr \$32, $t0 | |
4d3fa06f | 2479 | add $acc0, $acc1 |
9e557ab2 AP |
2480 | adc $t0, $acc2 |
2481 | adc %rax, $acc3 | |
4d3fa06f | 2482 | mov $acc1, %rax |
9e557ab2 | 2483 | adc \$0, %rdx |
4d3fa06f AP |
2484 | |
2485 | ######################################### | |
2486 | # Second iteration | |
9e557ab2 AP |
2487 | mov $acc1, $t0 |
2488 | shl \$32, $acc1 | |
2489 | mov %rdx, $acc0 | |
2490 | mulq $t2 | |
2491 | shr \$32, $t0 | |
4d3fa06f | 2492 | add $acc1, $acc2 |
9e557ab2 AP |
2493 | adc $t0, $acc3 |
2494 | adc %rax, $acc0 | |
4d3fa06f | 2495 | mov $acc2, %rax |
9e557ab2 | 2496 | adc \$0, %rdx |
4d3fa06f AP |
2497 | |
2498 | ########################################## | |
2499 | # Third iteration | |
9e557ab2 AP |
2500 | mov $acc2, $t0 |
2501 | shl \$32, $acc2 | |
2502 | mov %rdx, $acc1 | |
2503 | mulq $t2 | |
2504 | shr \$32, $t0 | |
4d3fa06f | 2505 | add $acc2, $acc3 |
9e557ab2 AP |
2506 | adc $t0, $acc0 |
2507 | adc %rax, $acc1 | |
4d3fa06f | 2508 | mov $acc3, %rax |
9e557ab2 | 2509 | adc \$0, %rdx |
4d3fa06f AP |
2510 | |
2511 | ########################################### | |
2512 | # Last iteration | |
9e557ab2 AP |
2513 | mov $acc3, $t0 |
2514 | shl \$32, $acc3 | |
2515 | mov %rdx, $acc2 | |
2516 | mulq $t2 | |
2517 | shr \$32, $t0 | |
2518 | add $acc3, $acc0 | |
2519 | adc $t0, $acc1 | |
2520 | mov $acc0, $t0 | |
2521 | adc %rax, $acc2 | |
2522 | mov $acc1, $in_ptr | |
4d3fa06f | 2523 | adc \$0, %rdx |
4d3fa06f | 2524 | |
9e557ab2 AP |
2525 | ########################################### |
2526 | # Branch-less conditional subtraction | |
2527 | sub \$-1, $acc0 | |
2528 | mov $acc2, %rax | |
2529 | sbb $t1, $acc1 | |
2530 | sbb \$0, $acc2 | |
2531 | mov %rdx, $acc3 | |
2532 | sbb $t2, %rdx | |
2533 | sbb $t2, $t2 | |
2534 | ||
2535 | cmovnz $t0, $acc0 | |
2536 | cmovnz $in_ptr, $acc1 | |
2537 | mov $acc0, 8*0($r_ptr) | |
2538 | cmovnz %rax, $acc2 | |
2539 | mov $acc1, 8*1($r_ptr) | |
2540 | cmovz %rdx, $acc3 | |
2541 | mov $acc2, 8*2($r_ptr) | |
2542 | mov $acc3, 8*3($r_ptr) | |
4d3fa06f | 2543 | |
384e6de4 | 2544 | mov 0(%rsp),%r13 |
86e11278 | 2545 | .cfi_restore %r13 |
384e6de4 | 2546 | mov 8(%rsp),%r12 |
86e11278 | 2547 | .cfi_restore %r12 |
384e6de4 | 2548 | lea 16(%rsp),%rsp |
86e11278 | 2549 | .cfi_adjust_cfa_offset -16 |
384e6de4 | 2550 | .Lfrom_epilogue: |
4d3fa06f | 2551 | ret |
86e11278 | 2552 | .cfi_endproc |
4d3fa06f AP |
2553 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont |
2554 | ___ | |
2555 | } | |
2556 | { | |
2557 | my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); # (val, in_t, index) argument registers, selected by $win64 | |
2558 | my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); # counter increment, broadcast index, result accumulators | |
2559 | my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); # running counter, table loads, compare mask | |
2560 | my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); # spare aliases of %xmm8-15, not referenced by the SSE code in this block; each name is declared exactly once | |
2561 | ||
2562 | $code.=<<___; | |
2563 | ################################################################################ | |
3ff08e1d AP |
2564 | # void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); |
2565 | .globl ecp_nistz256_scatter_w5 | |
2566 | .type ecp_nistz256_scatter_w5,\@abi-omnipotent | |
4d3fa06f | 2567 | .align 32 |
3ff08e1d | 2568 | ecp_nistz256_scatter_w5: |
eff5076a | 2569 | .cfi_startproc |
3ff08e1d AP |
2570 | lea -3($index,$index,2), $index |
2571 | movdqa 0x00($in_t), %xmm0 | |
2572 | shl \$5, $index | |
2573 | movdqa 0x10($in_t), %xmm1 | |
2574 | movdqa 0x20($in_t), %xmm2 | |
2575 | movdqa 0x30($in_t), %xmm3 | |
2576 | movdqa 0x40($in_t), %xmm4 | |
2577 | movdqa 0x50($in_t), %xmm5 | |
2578 | movdqa %xmm0, 0x00($val,$index) | |
2579 | movdqa %xmm1, 0x10($val,$index) | |
2580 | movdqa %xmm2, 0x20($val,$index) | |
2581 | movdqa %xmm3, 0x30($val,$index) | |
2582 | movdqa %xmm4, 0x40($val,$index) | |
2583 | movdqa %xmm5, 0x50($val,$index) | |
2584 | ||
2585 | ret | |
eff5076a | 2586 | .cfi_endproc |
3ff08e1d AP |
2587 | .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 |
2588 | ||
2589 | ################################################################################ | |
2590 | # void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); | |
2591 | .globl ecp_nistz256_gather_w5 | |
2592 | .type ecp_nistz256_gather_w5,\@abi-omnipotent | |
2593 | .align 32 | |
2594 | ecp_nistz256_gather_w5: | |
c0e8e500 | 2595 | .cfi_startproc |
4d3fa06f AP |
2596 | ___ |
2597 | $code.=<<___ if ($avx>1); | |
2598 | mov OPENSSL_ia32cap_P+8(%rip), %eax | |
2599 | test \$`1<<5`, %eax | |
3ff08e1d | 2600 | jnz .Lavx2_gather_w5 |
4d3fa06f AP |
2601 | ___ |
2602 | $code.=<<___ if ($win64); | |
2603 | lea -0x88(%rsp), %rax | |
3ff08e1d | 2604 | .LSEH_begin_ecp_nistz256_gather_w5: |
4d3fa06f AP |
2605 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp |
2606 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | |
2607 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | |
2608 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | |
2609 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | |
2610 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | |
2611 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | |
2612 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | |
2613 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | |
2614 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | |
2615 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | |
2616 | ___ | |
2617 | $code.=<<___; | |
2618 | movdqa .LOne(%rip), $ONE | |
2619 | movd $index, $INDEX | |
2620 | ||
2621 | pxor $Ra, $Ra | |
2622 | pxor $Rb, $Rb | |
2623 | pxor $Rc, $Rc | |
2624 | pxor $Rd, $Rd | |
2625 | pxor $Re, $Re | |
2626 | pxor $Rf, $Rf | |
2627 | ||
2628 | movdqa $ONE, $M0 | |
2629 | pshufd \$0, $INDEX, $INDEX | |
2630 | ||
2631 | mov \$16, %rax | |
2632 | .Lselect_loop_sse_w5: | |
2633 | ||
2634 | movdqa $M0, $TMP0 | |
2635 | paddd $ONE, $M0 | |
2636 | pcmpeqd $INDEX, $TMP0 | |
2637 | ||
2638 | movdqa 16*0($in_t), $T0a | |
2639 | movdqa 16*1($in_t), $T0b | |
2640 | movdqa 16*2($in_t), $T0c | |
2641 | movdqa 16*3($in_t), $T0d | |
2642 | movdqa 16*4($in_t), $T0e | |
2643 | movdqa 16*5($in_t), $T0f | |
2644 | lea 16*6($in_t), $in_t | |
2645 | ||
2646 | pand $TMP0, $T0a | |
2647 | pand $TMP0, $T0b | |
2648 | por $T0a, $Ra | |
2649 | pand $TMP0, $T0c | |
2650 | por $T0b, $Rb | |
2651 | pand $TMP0, $T0d | |
2652 | por $T0c, $Rc | |
2653 | pand $TMP0, $T0e | |
2654 | por $T0d, $Rd | |
2655 | pand $TMP0, $T0f | |
2656 | por $T0e, $Re | |
2657 | por $T0f, $Rf | |
2658 | ||
2659 | dec %rax | |
2660 | jnz .Lselect_loop_sse_w5 | |
2661 | ||
2662 | movdqu $Ra, 16*0($val) | |
2663 | movdqu $Rb, 16*1($val) | |
2664 | movdqu $Rc, 16*2($val) | |
2665 | movdqu $Rd, 16*3($val) | |
2666 | movdqu $Re, 16*4($val) | |
2667 | movdqu $Rf, 16*5($val) | |
2668 | ___ | |
2669 | $code.=<<___ if ($win64); | |
2670 | movaps (%rsp), %xmm6 | |
2671 | movaps 0x10(%rsp), %xmm7 | |
2672 | movaps 0x20(%rsp), %xmm8 | |
2673 | movaps 0x30(%rsp), %xmm9 | |
2674 | movaps 0x40(%rsp), %xmm10 | |
2675 | movaps 0x50(%rsp), %xmm11 | |
2676 | movaps 0x60(%rsp), %xmm12 | |
2677 | movaps 0x70(%rsp), %xmm13 | |
2678 | movaps 0x80(%rsp), %xmm14 | |
2679 | movaps 0x90(%rsp), %xmm15 | |
2680 | lea 0xa8(%rsp), %rsp | |
4d3fa06f AP |
2681 | ___ |
2682 | $code.=<<___; | |
2683 | ret | |
c0e8e500 | 2684 | .cfi_endproc |
384e6de4 | 2685 | .LSEH_end_ecp_nistz256_gather_w5: |
3ff08e1d | 2686 | .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 |
4d3fa06f AP |
2687 | |
2688 | ################################################################################ | |
3ff08e1d AP |
2689 | # void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); |
2690 | .globl ecp_nistz256_scatter_w7 | |
2691 | .type ecp_nistz256_scatter_w7,\@abi-omnipotent | |
4d3fa06f | 2692 | .align 32 |
3ff08e1d | 2693 | ecp_nistz256_scatter_w7: |
eff5076a | 2694 | .cfi_startproc |
3ff08e1d AP |
2695 | movdqu 0x00($in_t), %xmm0 |
2696 | shl \$6, $index | |
2697 | movdqu 0x10($in_t), %xmm1 | |
2698 | movdqu 0x20($in_t), %xmm2 | |
2699 | movdqu 0x30($in_t), %xmm3 | |
2700 | movdqa %xmm0, 0x00($val,$index) | |
2701 | movdqa %xmm1, 0x10($val,$index) | |
2702 | movdqa %xmm2, 0x20($val,$index) | |
2703 | movdqa %xmm3, 0x30($val,$index) | |
2704 | ||
2705 | ret | |
eff5076a | 2706 | .cfi_endproc |
3ff08e1d AP |
2707 | .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 |
2708 | ||
2709 | ################################################################################ | |
2710 | # void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); | |
2711 | .globl ecp_nistz256_gather_w7 | |
2712 | .type ecp_nistz256_gather_w7,\@abi-omnipotent | |
2713 | .align 32 | |
2714 | ecp_nistz256_gather_w7: | |
c0e8e500 | 2715 | .cfi_startproc |
4d3fa06f AP |
2716 | ___ |
2717 | $code.=<<___ if ($avx>1); | |
2718 | mov OPENSSL_ia32cap_P+8(%rip), %eax | |
2719 | test \$`1<<5`, %eax | |
3ff08e1d | 2720 | jnz .Lavx2_gather_w7 |
4d3fa06f AP |
2721 | ___ |
2722 | $code.=<<___ if ($win64); | |
2723 | lea -0x88(%rsp), %rax | |
3ff08e1d | 2724 | .LSEH_begin_ecp_nistz256_gather_w7: |
4d3fa06f AP |
2725 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp |
2726 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | |
2727 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | |
2728 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | |
2729 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | |
2730 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | |
2731 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | |
2732 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | |
2733 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | |
2734 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | |
2735 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | |
2736 | ___ | |
2737 | $code.=<<___; | |
2738 | movdqa .LOne(%rip), $M0 | |
2739 | movd $index, $INDEX | |
2740 | ||
2741 | pxor $Ra, $Ra | |
2742 | pxor $Rb, $Rb | |
2743 | pxor $Rc, $Rc | |
2744 | pxor $Rd, $Rd | |
2745 | ||
2746 | movdqa $M0, $ONE | |
2747 | pshufd \$0, $INDEX, $INDEX | |
2748 | mov \$64, %rax | |
2749 | ||
2750 | .Lselect_loop_sse_w7: | |
2751 | movdqa $M0, $TMP0 | |
2752 | paddd $ONE, $M0 | |
2753 | movdqa 16*0($in_t), $T0a | |
2754 | movdqa 16*1($in_t), $T0b | |
2755 | pcmpeqd $INDEX, $TMP0 | |
2756 | movdqa 16*2($in_t), $T0c | |
2757 | movdqa 16*3($in_t), $T0d | |
2758 | lea 16*4($in_t), $in_t | |
2759 | ||
2760 | pand $TMP0, $T0a | |
2761 | pand $TMP0, $T0b | |
2762 | por $T0a, $Ra | |
2763 | pand $TMP0, $T0c | |
2764 | por $T0b, $Rb | |
2765 | pand $TMP0, $T0d | |
2766 | por $T0c, $Rc | |
2767 | prefetcht0 255($in_t) | |
2768 | por $T0d, $Rd | |
2769 | ||
2770 | dec %rax | |
2771 | jnz .Lselect_loop_sse_w7 | |
2772 | ||
2773 | movdqu $Ra, 16*0($val) | |
2774 | movdqu $Rb, 16*1($val) | |
2775 | movdqu $Rc, 16*2($val) | |
2776 | movdqu $Rd, 16*3($val) | |
2777 | ___ | |
2778 | $code.=<<___ if ($win64); | |
2779 | movaps (%rsp), %xmm6 | |
2780 | movaps 0x10(%rsp), %xmm7 | |
2781 | movaps 0x20(%rsp), %xmm8 | |
2782 | movaps 0x30(%rsp), %xmm9 | |
2783 | movaps 0x40(%rsp), %xmm10 | |
2784 | movaps 0x50(%rsp), %xmm11 | |
2785 | movaps 0x60(%rsp), %xmm12 | |
2786 | movaps 0x70(%rsp), %xmm13 | |
2787 | movaps 0x80(%rsp), %xmm14 | |
2788 | movaps 0x90(%rsp), %xmm15 | |
2789 | lea 0xa8(%rsp), %rsp | |
4d3fa06f AP |
2790 | ___ |
2791 | $code.=<<___; | |
2792 | ret | |
c0e8e500 | 2793 | .cfi_endproc |
384e6de4 | 2794 | .LSEH_end_ecp_nistz256_gather_w7: |
3ff08e1d | 2795 | .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 |
4d3fa06f AP |
2796 | ___ |
2797 | } | |
2798 | if ($avx>1) { | |
2799 | my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); | |
2800 | my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); | |
2801 | my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); | |
2802 | my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); | |
2803 | ||
2804 | $code.=<<___; | |
2805 | ################################################################################ | |
3ff08e1d AP |
2806 | # void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); |
2807 | .type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent | |
4d3fa06f | 2808 | .align 32 |
3ff08e1d | 2809 | ecp_nistz256_avx2_gather_w5: |
c0e8e500 | 2810 | .cfi_startproc |
3ff08e1d | 2811 | .Lavx2_gather_w5: |
4d3fa06f AP |
2812 | vzeroupper |
2813 | ___ | |
2814 | $code.=<<___ if ($win64); | |
2815 | lea -0x88(%rsp), %rax | |
384e6de4 | 2816 | mov %rsp,%r11 |
3ff08e1d | 2817 | .LSEH_begin_ecp_nistz256_avx2_gather_w5: |
384e6de4 AP |
2818 | .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp |
2819 | .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) | |
2820 | .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) | |
2821 | .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax) | |
2822 | .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) | |
2823 | .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) | |
2824 | .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) | |
2825 | .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) | |
2826 | .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) | |
2827 | .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) | |
2828 | .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) | |
4d3fa06f AP |
2829 | ___ |
2830 | $code.=<<___; | |
2831 | vmovdqa .LTwo(%rip), $TWO | |
2832 | ||
2833 | vpxor $Ra, $Ra, $Ra | |
2834 | vpxor $Rb, $Rb, $Rb | |
2835 | vpxor $Rc, $Rc, $Rc | |
2836 | ||
2837 | vmovdqa .LOne(%rip), $M0 | |
2838 | vmovdqa .LTwo(%rip), $M1 | |
2839 | ||
2840 | vmovd $index, %xmm1 | |
2841 | vpermd $INDEX, $Ra, $INDEX | |
2842 | ||
2843 | mov \$8, %rax | |
2844 | .Lselect_loop_avx2_w5: | |
2845 | ||
2846 | vmovdqa 32*0($in_t), $T0a | |
2847 | vmovdqa 32*1($in_t), $T0b | |
2848 | vmovdqa 32*2($in_t), $T0c | |
2849 | ||
2850 | vmovdqa 32*3($in_t), $T1a | |
2851 | vmovdqa 32*4($in_t), $T1b | |
2852 | vmovdqa 32*5($in_t), $T1c | |
2853 | ||
2854 | vpcmpeqd $INDEX, $M0, $TMP0 | |
2855 | vpcmpeqd $INDEX, $M1, $TMP1 | |
2856 | ||
2857 | vpaddd $TWO, $M0, $M0 | |
2858 | vpaddd $TWO, $M1, $M1 | |
2859 | lea 32*6($in_t), $in_t | |
2860 | ||
2861 | vpand $TMP0, $T0a, $T0a | |
2862 | vpand $TMP0, $T0b, $T0b | |
2863 | vpand $TMP0, $T0c, $T0c | |
2864 | vpand $TMP1, $T1a, $T1a | |
2865 | vpand $TMP1, $T1b, $T1b | |
2866 | vpand $TMP1, $T1c, $T1c | |
2867 | ||
2868 | vpxor $T0a, $Ra, $Ra | |
2869 | vpxor $T0b, $Rb, $Rb | |
2870 | vpxor $T0c, $Rc, $Rc | |
2871 | vpxor $T1a, $Ra, $Ra | |
2872 | vpxor $T1b, $Rb, $Rb | |
2873 | vpxor $T1c, $Rc, $Rc | |
2874 | ||
2875 | dec %rax | |
2876 | jnz .Lselect_loop_avx2_w5 | |
2877 | ||
2878 | vmovdqu $Ra, 32*0($val) | |
2879 | vmovdqu $Rb, 32*1($val) | |
2880 | vmovdqu $Rc, 32*2($val) | |
2881 | vzeroupper | |
2882 | ___ | |
2883 | $code.=<<___ if ($win64); | |
2884 | movaps (%rsp), %xmm6 | |
2885 | movaps 0x10(%rsp), %xmm7 | |
2886 | movaps 0x20(%rsp), %xmm8 | |
2887 | movaps 0x30(%rsp), %xmm9 | |
2888 | movaps 0x40(%rsp), %xmm10 | |
2889 | movaps 0x50(%rsp), %xmm11 | |
2890 | movaps 0x60(%rsp), %xmm12 | |
2891 | movaps 0x70(%rsp), %xmm13 | |
2892 | movaps 0x80(%rsp), %xmm14 | |
2893 | movaps 0x90(%rsp), %xmm15 | |
384e6de4 | 2894 | lea (%r11), %rsp |
4d3fa06f AP |
2895 | ___ |
2896 | $code.=<<___; | |
2897 | ret | |
c0e8e500 | 2898 | .cfi_endproc |
384e6de4 | 2899 | .LSEH_end_ecp_nistz256_avx2_gather_w5: |
3ff08e1d | 2900 | .size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 |
4d3fa06f AP |
2901 | ___ |
2902 | } | |
2903 | if ($avx>1) { | |
2904 | my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); | |
2905 | my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); | |
2906 | my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); | |
2907 | my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); | |
2908 | my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); | |
2909 | ||
2910 | $code.=<<___; | |
2911 | ||
2912 | ################################################################################ | |
3ff08e1d AP |
2913 | # void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index); |
2914 | .globl ecp_nistz256_avx2_gather_w7 | |
2915 | .type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent | |
4d3fa06f | 2916 | .align 32 |
3ff08e1d | 2917 | ecp_nistz256_avx2_gather_w7: |
c0e8e500 | 2918 | .cfi_startproc |
3ff08e1d | 2919 | .Lavx2_gather_w7: |
4d3fa06f AP |
2920 | vzeroupper |
2921 | ___ | |
2922 | $code.=<<___ if ($win64); | |
384e6de4 | 2923 | mov %rsp,%r11 |
4d3fa06f | 2924 | lea -0x88(%rsp), %rax |
3ff08e1d | 2925 | .LSEH_begin_ecp_nistz256_avx2_gather_w7: |
384e6de4 AP |
2926 | .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp |
2927 | .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) | |
2928 | .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) | |
2929 | .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax) | |
2930 | .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) | |
2931 | .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) | |
2932 | .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) | |
2933 | .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) | |
2934 | .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) | |
2935 | .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) | |
2936 | .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) | |
4d3fa06f AP |
2937 | ___ |
2938 | $code.=<<___; | |
2939 | vmovdqa .LThree(%rip), $THREE | |
2940 | ||
2941 | vpxor $Ra, $Ra, $Ra | |
2942 | vpxor $Rb, $Rb, $Rb | |
2943 | ||
2944 | vmovdqa .LOne(%rip), $M0 | |
2945 | vmovdqa .LTwo(%rip), $M1 | |
2946 | vmovdqa .LThree(%rip), $M2 | |
2947 | ||
2948 | vmovd $index, %xmm1 | |
2949 | vpermd $INDEX, $Ra, $INDEX | |
2950 | # Skip index = 0, because it is implicitly the point at infinity | |
2951 | ||
2952 | mov \$21, %rax | |
2953 | .Lselect_loop_avx2_w7: | |
2954 | ||
2955 | vmovdqa 32*0($in_t), $T0a | |
2956 | vmovdqa 32*1($in_t), $T0b | |
2957 | ||
2958 | vmovdqa 32*2($in_t), $T1a | |
2959 | vmovdqa 32*3($in_t), $T1b | |
2960 | ||
2961 | vmovdqa 32*4($in_t), $T2a | |
2962 | vmovdqa 32*5($in_t), $T2b | |
2963 | ||
2964 | vpcmpeqd $INDEX, $M0, $TMP0 | |
2965 | vpcmpeqd $INDEX, $M1, $TMP1 | |
2966 | vpcmpeqd $INDEX, $M2, $TMP2 | |
2967 | ||
2968 | vpaddd $THREE, $M0, $M0 | |
2969 | vpaddd $THREE, $M1, $M1 | |
2970 | vpaddd $THREE, $M2, $M2 | |
2971 | lea 32*6($in_t), $in_t | |
2972 | ||
2973 | vpand $TMP0, $T0a, $T0a | |
2974 | vpand $TMP0, $T0b, $T0b | |
2975 | vpand $TMP1, $T1a, $T1a | |
2976 | vpand $TMP1, $T1b, $T1b | |
2977 | vpand $TMP2, $T2a, $T2a | |
2978 | vpand $TMP2, $T2b, $T2b | |
2979 | ||
2980 | vpxor $T0a, $Ra, $Ra | |
2981 | vpxor $T0b, $Rb, $Rb | |
2982 | vpxor $T1a, $Ra, $Ra | |
2983 | vpxor $T1b, $Rb, $Rb | |
2984 | vpxor $T2a, $Ra, $Ra | |
2985 | vpxor $T2b, $Rb, $Rb | |
2986 | ||
2987 | dec %rax | |
2988 | jnz .Lselect_loop_avx2_w7 | |
2989 | ||
2990 | ||
2991 | vmovdqa 32*0($in_t), $T0a | |
2992 | vmovdqa 32*1($in_t), $T0b | |
2993 | ||
2994 | vpcmpeqd $INDEX, $M0, $TMP0 | |
2995 | ||
2996 | vpand $TMP0, $T0a, $T0a | |
2997 | vpand $TMP0, $T0b, $T0b | |
2998 | ||
2999 | vpxor $T0a, $Ra, $Ra | |
3000 | vpxor $T0b, $Rb, $Rb | |
3001 | ||
3002 | vmovdqu $Ra, 32*0($val) | |
3003 | vmovdqu $Rb, 32*1($val) | |
3004 | vzeroupper | |
3005 | ___ | |
3006 | $code.=<<___ if ($win64); | |
3007 | movaps (%rsp), %xmm6 | |
3008 | movaps 0x10(%rsp), %xmm7 | |
3009 | movaps 0x20(%rsp), %xmm8 | |
3010 | movaps 0x30(%rsp), %xmm9 | |
3011 | movaps 0x40(%rsp), %xmm10 | |
3012 | movaps 0x50(%rsp), %xmm11 | |
3013 | movaps 0x60(%rsp), %xmm12 | |
3014 | movaps 0x70(%rsp), %xmm13 | |
3015 | movaps 0x80(%rsp), %xmm14 | |
3016 | movaps 0x90(%rsp), %xmm15 | |
384e6de4 | 3017 | lea (%r11), %rsp |
4d3fa06f AP |
3018 | ___ |
3019 | $code.=<<___; | |
3020 | ret | |
c0e8e500 | 3021 | .cfi_endproc |
384e6de4 | 3022 | .LSEH_end_ecp_nistz256_avx2_gather_w7: |
3ff08e1d | 3023 | .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 |
4d3fa06f AP |
3024 | ___ |
3025 | } else { | |
3026 | $code.=<<___; | |
3ff08e1d AP |
3027 | .globl ecp_nistz256_avx2_gather_w7 |
3028 | .type ecp_nistz256_avx2_gather_w7,\@function,3 | |
4d3fa06f | 3029 | .align 32 |
3ff08e1d | 3030 | ecp_nistz256_avx2_gather_w7: |
eff5076a | 3031 | .cfi_startproc |
4d3fa06f AP |
3032 | .byte 0x0f,0x0b # ud2 |
3033 | ret | |
eff5076a | 3034 | .cfi_endproc |
3ff08e1d | 3035 | .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 |
4d3fa06f AP |
3036 | ___ |
3037 | } | |
3038 | {{{ | |
3039 | ######################################################################## | |
3040 | # This block implements higher level point_double, point_add and | |
3041 | # point_add_affine. The key to performance in this case is to allow | |
3042 | # out-of-order execution logic to overlap computations from next step | |
3043 | # with tail processing from current step. By using tailored calling | |
3044 | # sequence we minimize inter-step overhead to give processor better | |
3045 | # shot at overlapping operations... | |
3046 | # | |
3047 | # You will notice that input data is copied to the stack. The trouble is | |
3048 | # that there are no registers to spare for holding the original pointers, | |
3049 | # and reloading those pointers would create undesired dependencies on the | |
3050 | # effective-address calculation paths. In other words, the copying too is | |
3051 | # done to favour out-of-order execution logic. | |
3052 | # <appro@openssl.org> | |
3053 | ||
# Register names shared by load_for_mul/load_for_sqr and the gen_* generators
# that follow.
#   $r_ptr, $a_ptr, $b_org : %rdi, %rsi, %rdx
#   $b_ptr                 : %rbx
3054 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | |
#   $acc0 .. $acc7         : %r8 .. %r15, in that order
3055 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | |
#   $t0, $t1, $t2          : %rax, %rbp, %rcx
# NOTE(review): $t3 and $t4 are both bound to $acc4 here.  Every use of
# $t3/$t4 visible in this part of the file sits inside a nested block that
# re-declares them first; confirm that holds for the rest of the file.
3056 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); | |
#   $poly1, $poly3         : aliases of $acc6 and $acc7; gen_double loads
#                            them from .Lpoly+8*1 and .Lpoly+8*3.
3057 | my ($poly1,$poly3)=($acc6,$acc7); | |
3058 | ||
# load_for_mul($a, $b, $src0) -- return (not append to $code) a fragment of
# assembly text that loads operands ahead of a __ecp_nistz256_mul_mont* call.
#
#   $a, $b : memory-operand strings such as "64(%rsp)".  They are pasted
#            directly after textual prefixes ("8*0+", "$bias+"), so they
#            presumably have to start with a displacement expression.
#   $src0  : register that receives the first 64-bit word found at $b.
#
# The fragment also puts the address of $b into $b_ptr, the four 64-bit
# words at $a into $acc1..$acc4, and the address of $a plus $bias into
# $a_ptr.
#
# The empty "()" prototype does not match the three arguments; the callers
# in this file use the &load_for_mul(...) form, which bypasses prototypes.
3059 | sub load_for_mul () { | |
3060 | my ($a,$b,$src0) = @_; | |
# $bias is 0 when $src0 is "%rax" and -128 for any other $src0.
3061 | my $bias = $src0 eq "%rax" ? 0 : -128; | |
3062 | ||
3063 | " mov $b, $src0 | |
3064 | lea $b, $b_ptr | |
3065 | mov 8*0+$a, $acc1 | |
3066 | mov 8*1+$a, $acc2 | |
3067 | lea $bias+$a, $a_ptr | |
3068 | mov 8*2+$a, $acc3 | |
3069 | mov 8*3+$a, $acc4" | |
3070 | } | |
3071 | ||
# load_for_sqr($a, $src0) -- return (not append to $code) a fragment of
# assembly text that loads the operand ahead of a __ecp_nistz256_sqr_mont*
# call.
#
#   $a    : memory-operand string such as "64(%rsp)"; it is pasted directly
#           after textual prefixes ("8*0+", "$bias+").
#   $src0 : register that receives the first 64-bit word found at $a.
#
# The remaining three words at $a go to $acc6, $acc7 and $acc0, and $a_ptr
# receives the address of $a plus $bias.
#
# The empty "()" prototype does not match the two arguments; the callers in
# this file use the &load_for_sqr(...) form, which bypasses prototypes.
3072 | sub load_for_sqr () { | |
3073 | my ($a,$src0) = @_; | |
# $bias is 0 when $src0 is "%rax" and -128 for any other $src0.
3074 | my $bias = $src0 eq "%rax" ? 0 : -128; | |
3075 | ||
3076 | " mov 8*0+$a, $src0 | |
3077 | mov 8*1+$a, $acc6 | |
3078 | lea $bias+$a, $a_ptr | |
3079 | mov 8*2+$a, $acc7 | |
3080 | mov 8*3+$a, $acc0" | |
3081 | } | |
3082 | ||
3083 | { | |
3084 | ######################################################################## | |
3085 | # operate in 4-5-0-1 "name space" that matches multiplication output | |
3086 | # | |
# This scoped block appends four internal helpers to $code.  They work on a
# 256-bit value held in $a0..$a3 (least significant word in $a0), which are
# $acc4, $acc5, $acc0, $acc1.  Within the block $t3 and $t4 are re-declared
# as $acc2 and $acc3; $t0..$t2 keep their file-scope meaning and are used
# as scratch.  The modulus is taken to have the words
# (-1, $poly1, 0, $poly3), so the callers must have $poly1/$poly3 loaded.
#
#   __ecp_nistz256_add_toq   : a += 256-bit value at ($b_ptr).  The modulus
#       is then subtracted; if that borrows (taking the carry-out word $t4
#       into account) the un-subtracted sum is kept.  Result is stored at
#       ($r_ptr) and also left in $a0..$a3.
#   __ecp_nistz256_sub_fromq : a -= 256-bit value at ($b_ptr).  $t4 becomes
#       all-ones on borrow; the modulus is added and that corrected value
#       is kept only when $t4 is non-zero.  Result is stored at ($r_ptr).
#   __ecp_nistz256_subq      : a = ($t0,$t1,$t2,$t3) - a, with the same
#       add-back-on-borrow correction.  Nothing is stored; the result is
#       left in $a0..$a3 only.
#   __ecp_nistz256_mul_by_2q : a += a, reduced the same way as add_toq.
#       Result is stored at ($r_ptr).
3087 | my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | |
3088 | ||
3089 | $code.=<<___; | |
3090 | .type __ecp_nistz256_add_toq,\@abi-omnipotent | |
3091 | .align 32 | |
3092 | __ecp_nistz256_add_toq: | |
c0e8e500 | 3093 | .cfi_startproc |
b62b2454 | 3094 | xor $t4,$t4 |
4d3fa06f AP |
3095 | add 8*0($b_ptr), $a0 |
3096 | adc 8*1($b_ptr), $a1 | |
3097 | mov $a0, $t0 | |
3098 | adc 8*2($b_ptr), $a2 | |
3099 | adc 8*3($b_ptr), $a3 | |
3100 | mov $a1, $t1 | |
b62b2454 | 3101 | adc \$0, $t4 |
4d3fa06f AP |
3102 | |
3103 | sub \$-1, $a0 | |
3104 | mov $a2, $t2 | |
3105 | sbb $poly1, $a1 | |
3106 | sbb \$0, $a2 | |
3107 | mov $a3, $t3 | |
3108 | sbb $poly3, $a3 | |
b62b2454 | 3109 | sbb \$0, $t4 |
4d3fa06f | 3110 | |
d3034d31 AP |
3111 | cmovc $t0, $a0 |
3112 | cmovc $t1, $a1 | |
4d3fa06f | 3113 | mov $a0, 8*0($r_ptr) |
d3034d31 | 3114 | cmovc $t2, $a2 |
4d3fa06f | 3115 | mov $a1, 8*1($r_ptr) |
d3034d31 | 3116 | cmovc $t3, $a3 |
4d3fa06f AP |
3117 | mov $a2, 8*2($r_ptr) |
3118 | mov $a3, 8*3($r_ptr) | |
3119 | ||
3120 | ret | |
c0e8e500 | 3121 | .cfi_endproc |
4d3fa06f AP |
3122 | .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq |
3123 | ||
3124 | .type __ecp_nistz256_sub_fromq,\@abi-omnipotent | |
3125 | .align 32 | |
3126 | __ecp_nistz256_sub_fromq: | |
c0e8e500 | 3127 | .cfi_startproc |
4d3fa06f AP |
3128 | sub 8*0($b_ptr), $a0 |
3129 | sbb 8*1($b_ptr), $a1 | |
3130 | mov $a0, $t0 | |
3131 | sbb 8*2($b_ptr), $a2 | |
3132 | sbb 8*3($b_ptr), $a3 | |
3133 | mov $a1, $t1 | |
3134 | sbb $t4, $t4 | |
3135 | ||
3136 | add \$-1, $a0 | |
3137 | mov $a2, $t2 | |
3138 | adc $poly1, $a1 | |
3139 | adc \$0, $a2 | |
3140 | mov $a3, $t3 | |
3141 | adc $poly3, $a3 | |
3142 | test $t4, $t4 | |
3143 | ||
3144 | cmovz $t0, $a0 | |
3145 | cmovz $t1, $a1 | |
3146 | mov $a0, 8*0($r_ptr) | |
3147 | cmovz $t2, $a2 | |
3148 | mov $a1, 8*1($r_ptr) | |
3149 | cmovz $t3, $a3 | |
3150 | mov $a2, 8*2($r_ptr) | |
3151 | mov $a3, 8*3($r_ptr) | |
3152 | ||
3153 | ret | |
c0e8e500 | 3154 | .cfi_endproc |
4d3fa06f AP |
3155 | .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq |
3156 | ||
3157 | .type __ecp_nistz256_subq,\@abi-omnipotent | |
3158 | .align 32 | |
3159 | __ecp_nistz256_subq: | |
c0e8e500 | 3160 | .cfi_startproc |
4d3fa06f AP |
3161 | sub $a0, $t0 |
3162 | sbb $a1, $t1 | |
3163 | mov $t0, $a0 | |
3164 | sbb $a2, $t2 | |
3165 | sbb $a3, $t3 | |
3166 | mov $t1, $a1 | |
3167 | sbb $t4, $t4 | |
3168 | ||
3169 | add \$-1, $t0 | |
3170 | mov $t2, $a2 | |
3171 | adc $poly1, $t1 | |
3172 | adc \$0, $t2 | |
3173 | mov $t3, $a3 | |
3174 | adc $poly3, $t3 | |
3175 | test $t4, $t4 | |
3176 | ||
3177 | cmovnz $t0, $a0 | |
3178 | cmovnz $t1, $a1 | |
3179 | cmovnz $t2, $a2 | |
3180 | cmovnz $t3, $a3 | |
3181 | ||
3182 | ret | |
c0e8e500 | 3183 | .cfi_endproc |
4d3fa06f AP |
3184 | .size __ecp_nistz256_subq,.-__ecp_nistz256_subq |
3185 | ||
3186 | .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent | |
3187 | .align 32 | |
3188 | __ecp_nistz256_mul_by_2q: | |
c0e8e500 | 3189 | .cfi_startproc |
b62b2454 | 3190 | xor $t4, $t4 |
4d3fa06f AP |
3191 | add $a0, $a0 # a0:a3+a0:a3 |
3192 | adc $a1, $a1 | |
3193 | mov $a0, $t0 | |
3194 | adc $a2, $a2 | |
3195 | adc $a3, $a3 | |
3196 | mov $a1, $t1 | |
b62b2454 | 3197 | adc \$0, $t4 |
4d3fa06f AP |
3198 | |
3199 | sub \$-1, $a0 | |
3200 | mov $a2, $t2 | |
3201 | sbb $poly1, $a1 | |
3202 | sbb \$0, $a2 | |
3203 | mov $a3, $t3 | |
3204 | sbb $poly3, $a3 | |
b62b2454 | 3205 | sbb \$0, $t4 |
4d3fa06f | 3206 | |
d3034d31 AP |
3207 | cmovc $t0, $a0 |
3208 | cmovc $t1, $a1 | |
4d3fa06f | 3209 | mov $a0, 8*0($r_ptr) |
d3034d31 | 3210 | cmovc $t2, $a2 |
4d3fa06f | 3211 | mov $a1, 8*1($r_ptr) |
d3034d31 | 3212 | cmovc $t3, $a3 |
4d3fa06f AP |
3213 | mov $a2, 8*2($r_ptr) |
3214 | mov $a3, 8*3($r_ptr) | |
3215 | ||
3216 | ret | |
c0e8e500 | 3217 | .cfi_endproc |
4d3fa06f AP |
3218 | .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q |
3219 | ___ | |
3220 | } | |
# gen_double($x) -- append the point-doubling routine to $code.
#
# $x selects the flavour and is pasted onto the names of the helpers that
# are called (__ecp_nistz256_mul_mont$x, __ecp_nistz256_sqr_mont$x, ...)
# and onto the local labels:
#   - $x ne "x" (the call after this sub passes "q"): emits the global
#     ecp_nistz256_point_double, with $src0 = %rax and $bias = 0;
#   - $x eq "x": emits ecp_nistz256_point_doublex, entered through
#     .Lpoint_doublex, with $src0 = %rdx and $bias = 128.
# $bias is subtracted from the address placed in $a_ptr ahead of the
# mul_mont/sqr_mont calls.
#
# Stack frame: 32*5+8 bytes.  $S, $M, $Zsqr, $in_x and $tmp0 are the
# offsets from %rsp of five 32-byte slots (32*0 .. 32*4).  The result
# pointer and the addresses 0x20 and 0x40 past it are parked in %xmm0,
# %xmm1 and %xmm2 and fetched back as res_x/res_y/res_z destinations.
#
# .Lpoint_double_shortcut$x is also the target of the jump taken from
# gen_add's .Ladd_double$x path.
#
# NOTE(review): two in-assembly comments below say "5-4-0-1" order, while
# the loads fill $acc4, $acc5, $acc0, $acc1 -- the "4-5-0-1" order named in
# the helper block's comment.  Looks like a wording slip; confirm.
3221 | sub gen_double () { | |
3222 | my $x = shift; | |
3223 | my ($src0,$sfx,$bias); | |
3224 | my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); | |
3225 | ||
3226 | if ($x ne "x") { | |
3227 | $src0 = "%rax"; | |
3228 | $sfx = ""; | |
3229 | $bias = 0; | |
3230 | ||
3231 | $code.=<<___; | |
3232 | .globl ecp_nistz256_point_double | |
3233 | .type ecp_nistz256_point_double,\@function,2 | |
3234 | .align 32 | |
3235 | ecp_nistz256_point_double: | |
86e11278 | 3236 | .cfi_startproc |
4d3fa06f AP |
3237 | ___ |
# Emitted only when $addx is set: test the 0x80100 bits of the word at
# OPENSSL_ia32cap_P+8 and branch to .Lpoint_doublex when both are set.
# NOTE(review): presumably these are the BMI2 and ADX capability bits --
# confirm against the OPENSSL_ia32cap documentation.
3238 | $code.=<<___ if ($addx); | |
3239 | mov \$0x80100, %ecx | |
3240 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
3241 | cmp \$0x80100, %ecx | |
3242 | je .Lpoint_doublex | |
3243 | ___ | |
3244 | } else { | |
3245 | $src0 = "%rdx"; | |
3246 | $sfx = "x"; | |
3247 | $bias = 128; | |
3248 | ||
3249 | $code.=<<___; | |
3250 | .type ecp_nistz256_point_doublex,\@function,2 | |
3251 | .align 32 | |
3252 | ecp_nistz256_point_doublex: | |
86e11278 | 3253 | .cfi_startproc |
4d3fa06f AP |
3254 | .Lpoint_doublex: |
3255 | ___ | |
3256 | } | |
3257 | $code.=<<___; | |
3258 | push %rbp | |
86e11278 | 3259 | .cfi_push %rbp |
4d3fa06f | 3260 | push %rbx |
86e11278 | 3261 | .cfi_push %rbx |
4d3fa06f | 3262 | push %r12 |
86e11278 | 3263 | .cfi_push %r12 |
4d3fa06f | 3264 | push %r13 |
86e11278 | 3265 | .cfi_push %r13 |
4d3fa06f | 3266 | push %r14 |
86e11278 | 3267 | .cfi_push %r14 |
4d3fa06f | 3268 | push %r15 |
86e11278 | 3269 | .cfi_push %r15 |
4d3fa06f | 3270 | sub \$32*5+8, %rsp |
86e11278 | 3271 | .cfi_adjust_cfa_offset 32*5+8 |
384e6de4 | 3272 | .Lpoint_double${x}_body: |
4d3fa06f | 3273 | |
d9375341 | 3274 | .Lpoint_double_shortcut$x: |
4d3fa06f AP |
3275 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x |
3276 | mov $a_ptr, $b_ptr # backup copy | |
3277 | movdqu 0x10($a_ptr), %xmm1 | |
3278 | mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order | |
3279 | mov 0x20+8*1($a_ptr), $acc5 | |
3280 | mov 0x20+8*2($a_ptr), $acc0 | |
3281 | mov 0x20+8*3($a_ptr), $acc1 | |
3282 | mov .Lpoly+8*1(%rip), $poly1 | |
3283 | mov .Lpoly+8*3(%rip), $poly3 | |
3284 | movdqa %xmm0, $in_x(%rsp) | |
3285 | movdqa %xmm1, $in_x+0x10(%rsp) | |
3286 | lea 0x20($r_ptr), $acc2 | |
3287 | lea 0x40($r_ptr), $acc3 | |
3288 | movq $r_ptr, %xmm0 | |
3289 | movq $acc2, %xmm1 | |
3290 | movq $acc3, %xmm2 | |
3291 | ||
3292 | lea $S(%rsp), $r_ptr | |
3293 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); | |
3294 | ||
3295 | mov 0x40+8*0($a_ptr), $src0 | |
3296 | mov 0x40+8*1($a_ptr), $acc6 | |
3297 | mov 0x40+8*2($a_ptr), $acc7 | |
3298 | mov 0x40+8*3($a_ptr), $acc0 | |
3299 | lea 0x40-$bias($a_ptr), $a_ptr | |
3300 | lea $Zsqr(%rsp), $r_ptr | |
3301 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); | |
3302 | ||
3303 | `&load_for_sqr("$S(%rsp)", "$src0")` | |
3304 | lea $S(%rsp), $r_ptr | |
3305 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); | |
3306 | ||
3307 | mov 0x20($b_ptr), $src0 # $b_ptr is still valid | |
3308 | mov 0x40+8*0($b_ptr), $acc1 | |
3309 | mov 0x40+8*1($b_ptr), $acc2 | |
3310 | mov 0x40+8*2($b_ptr), $acc3 | |
3311 | mov 0x40+8*3($b_ptr), $acc4 | |
3312 | lea 0x40-$bias($b_ptr), $a_ptr | |
3313 | lea 0x20($b_ptr), $b_ptr | |
3314 | movq %xmm2, $r_ptr | |
3315 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); | |
3316 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); | |
3317 | ||
3318 | mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order | |
3319 | mov $in_x+8*1(%rsp), $acc5 | |
3320 | lea $Zsqr(%rsp), $b_ptr | |
3321 | mov $in_x+8*2(%rsp), $acc0 | |
3322 | mov $in_x+8*3(%rsp), $acc1 | |
3323 | lea $M(%rsp), $r_ptr | |
3324 | call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); | |
3325 | ||
3326 | mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order | |
3327 | mov $in_x+8*1(%rsp), $acc5 | |
3328 | lea $Zsqr(%rsp), $b_ptr | |
3329 | mov $in_x+8*2(%rsp), $acc0 | |
3330 | mov $in_x+8*3(%rsp), $acc1 | |
3331 | lea $Zsqr(%rsp), $r_ptr | |
3332 | call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); | |
3333 | ||
3334 | `&load_for_sqr("$S(%rsp)", "$src0")` | |
3335 | movq %xmm1, $r_ptr | |
3336 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); | |
3337 | ___ | |
# The scoped block below open-codes the halving of res_y.  It re-declares
# $poly1/$poly3 as $a_ptr and the file-scope $t1 (%rbp), and then rebinds
# $t1, $t3 and $t4 to $acc2, $acc0 and $acc1.
# NOTE(review): this presumably relies on __ecp_nistz256_sqr_mont$x leaving
# the two modulus words in those registers; that routine is outside this
# part of the file -- confirm.
609b0852 | 3338 | { |
4d3fa06f AP |
3339 | ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## |
3340 | # operate in 4-5-6-7 "name space" that matches squaring output | |
3341 | # | |
3342 | my ($poly1,$poly3)=($a_ptr,$t1); | |
3343 | my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); | |
3344 | ||
3345 | $code.=<<___; | |
3346 | xor $t4, $t4 | |
3347 | mov $a0, $t0 | |
3348 | add \$-1, $a0 | |
3349 | mov $a1, $t1 | |
3350 | adc $poly1, $a1 | |
3351 | mov $a2, $t2 | |
3352 | adc \$0, $a2 | |
3353 | mov $a3, $t3 | |
3354 | adc $poly3, $a3 | |
3355 | adc \$0, $t4 | |
3356 | xor $a_ptr, $a_ptr # borrow $a_ptr | |
3357 | test \$1, $t0 | |
3358 | ||
3359 | cmovz $t0, $a0 | |
3360 | cmovz $t1, $a1 | |
3361 | cmovz $t2, $a2 | |
3362 | cmovz $t3, $a3 | |
3363 | cmovz $a_ptr, $t4 | |
3364 | ||
3365 | mov $a1, $t0 # a0:a3>>1 | |
3366 | shr \$1, $a0 | |
3367 | shl \$63, $t0 | |
3368 | mov $a2, $t1 | |
3369 | shr \$1, $a1 | |
3370 | or $t0, $a0 | |
3371 | shl \$63, $t1 | |
3372 | mov $a3, $t2 | |
3373 | shr \$1, $a2 | |
3374 | or $t1, $a1 | |
3375 | shl \$63, $t2 | |
3376 | mov $a0, 8*0($r_ptr) | |
3377 | shr \$1, $a3 | |
3378 | mov $a1, 8*1($r_ptr) | |
3379 | shl \$63, $t4 | |
3380 | or $t2, $a2 | |
3381 | or $t4, $a3 | |
3382 | mov $a2, 8*2($r_ptr) | |
3383 | mov $a3, 8*3($r_ptr) | |
3384 | ___ | |
3385 | } | |
3386 | $code.=<<___; | |
3387 | `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` | |
3388 | lea $M(%rsp), $r_ptr | |
3389 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); | |
3390 | ||
3391 | lea $tmp0(%rsp), $r_ptr | |
3392 | call __ecp_nistz256_mul_by_2$x | |
3393 | ||
3394 | lea $M(%rsp), $b_ptr | |
3395 | lea $M(%rsp), $r_ptr | |
3396 | call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); | |
3397 | ||
3398 | `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` | |
3399 | lea $S(%rsp), $r_ptr | |
3400 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); | |
3401 | ||
3402 | lea $tmp0(%rsp), $r_ptr | |
3403 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); | |
3404 | ||
3405 | `&load_for_sqr("$M(%rsp)", "$src0")` | |
3406 | movq %xmm0, $r_ptr | |
3407 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); | |
3408 | ||
3409 | lea $tmp0(%rsp), $b_ptr | |
3410 | mov $acc6, $acc0 # harmonize sqr output and sub input | |
3411 | mov $acc7, $acc1 | |
3412 | mov $a_ptr, $poly1 | |
3413 | mov $t1, $poly3 | |
3414 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); | |
3415 | ||
3416 | mov $S+8*0(%rsp), $t0 | |
3417 | mov $S+8*1(%rsp), $t1 | |
3418 | mov $S+8*2(%rsp), $t2 | |
3419 | mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order | |
3420 | lea $S(%rsp), $r_ptr | |
3421 | call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); | |
3422 | ||
3423 | mov $M(%rsp), $src0 | |
3424 | lea $M(%rsp), $b_ptr | |
3425 | mov $acc4, $acc6 # harmonize sub output and mul input | |
3426 | xor %ecx, %ecx | |
609b0852 | 3427 | mov $acc4, $S+8*0(%rsp) # have to save:-( |
4d3fa06f AP |
3428 | mov $acc5, $acc2 |
3429 | mov $acc5, $S+8*1(%rsp) | |
3430 | cmovz $acc0, $acc3 | |
3431 | mov $acc0, $S+8*2(%rsp) | |
3432 | lea $S-$bias(%rsp), $a_ptr | |
3433 | cmovz $acc1, $acc4 | |
3434 | mov $acc1, $S+8*3(%rsp) | |
3435 | mov $acc6, $acc1 | |
3436 | lea $S(%rsp), $r_ptr | |
3437 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); | |
3438 | ||
3439 | movq %xmm1, $b_ptr | |
3440 | movq %xmm1, $r_ptr | |
3441 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); | |
3442 | ||
384e6de4 | 3443 | lea 32*5+56(%rsp), %rsi |
86e11278 | 3444 | .cfi_def_cfa %rsi,8 |
384e6de4 | 3445 | mov -48(%rsi),%r15 |
86e11278 | 3446 | .cfi_restore %r15 |
384e6de4 | 3447 | mov -40(%rsi),%r14 |
86e11278 | 3448 | .cfi_restore %r14 |
384e6de4 | 3449 | mov -32(%rsi),%r13 |
86e11278 | 3450 | .cfi_restore %r13 |
384e6de4 | 3451 | mov -24(%rsi),%r12 |
86e11278 | 3452 | .cfi_restore %r12 |
384e6de4 | 3453 | mov -16(%rsi),%rbx |
86e11278 | 3454 | .cfi_restore %rbx |
384e6de4 | 3455 | mov -8(%rsi),%rbp |
86e11278 | 3456 | .cfi_restore %rbp |
384e6de4 | 3457 | lea (%rsi),%rsp |
86e11278 | 3458 | .cfi_def_cfa_register %rsp |
384e6de4 | 3459 | .Lpoint_double${x}_epilogue: |
4d3fa06f | 3460 | ret |
86e11278 | 3461 | .cfi_endproc |
4d3fa06f AP |
3462 | .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx |
3463 | ___ | |
3464 | } | |
# Emit the "q" flavour (global ecp_nistz256_point_double).
3465 | &gen_double("q"); | |
3466 | ||
# gen_add($x) -- append the general point-addition routine to $code.
#
# $x selects the flavour and is pasted onto the names of the helpers that
# are called and onto the local labels:
#   - $x ne "x" (the call after this sub passes "q"): emits the global
#     ecp_nistz256_point_add, with $src0 = %rax and $bias = 0;
#   - $x eq "x": emits ecp_nistz256_point_addx, entered through
#     .Lpoint_addx, with $src0 = %rdx and $bias = 128.
#
# Stack frame: 32*18+8 bytes, eighteen 32-byte slots whose offsets are the
# lexicals declared below.  $Z1sqr and $Z2sqr reuse the $Hsqr and $Rsqr
# slots.
#
# Outline of the emitted code, following its in-assembly comments:
#   Z2sqr = in2_z^2, Z1sqr = in1_z^2
#   S1 = in1_y*Z2^3, S2 = in2_y*Z1^3, R = S2 - S1
#   U1 = in1_x*Z2^2, U2 = in2_x*Z1^2, H = U2 - U1
#   If H, R and both infinity masks are all zero, the frame is shrunk by
#   32*(18-5) bytes and control jumps to .Lpoint_double_shortcut$x.
#   Otherwise:
#     res_z = H*in1_z*in2_z
#     res_x = R^2 - 2*U1*H^2 - H^3
#     res_y = R*(U1*H^2 - res_x) - S1*H^3
#   and each output coordinate is chosen with masks: in2_* where in1infty
#   is set, then in1_* where in2infty is set, otherwise res_*.
#
# XMM usage: %xmm5 holds the in1infty mask and %xmm4 the in2infty mask
# (all words of the respective z OR-ed together and compared with zero);
# %xmm0 keeps the caller's $r_ptr and %xmm1 the pointer to the first input
# for the doubling path.
3467 | sub gen_add () { | |
3468 | my $x = shift; | |
3469 | my ($src0,$sfx,$bias); | |
3470 | my ($H,$Hsqr,$R,$Rsqr,$Hcub, | |
3471 | $U1,$U2,$S1,$S2, | |
3472 | $res_x,$res_y,$res_z, | |
3473 | $in1_x,$in1_y,$in1_z, | |
3474 | $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); | |
3475 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | |
3476 | ||
3477 | if ($x ne "x") { | |
3478 | $src0 = "%rax"; | |
3479 | $sfx = ""; | |
3480 | $bias = 0; | |
3481 | ||
3482 | $code.=<<___; | |
3483 | .globl ecp_nistz256_point_add | |
3484 | .type ecp_nistz256_point_add,\@function,3 | |
3485 | .align 32 | |
3486 | ecp_nistz256_point_add: | |
86e11278 | 3487 | .cfi_startproc |
4d3fa06f AP |
3488 | ___ |
# Emitted only when $addx is set: test the 0x80100 bits of the word at
# OPENSSL_ia32cap_P+8 and branch to .Lpoint_addx when both are set.
# NOTE(review): presumably these are the BMI2 and ADX capability bits --
# confirm against the OPENSSL_ia32cap documentation.
3489 | $code.=<<___ if ($addx); | |
3490 | mov \$0x80100, %ecx | |
3491 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
3492 | cmp \$0x80100, %ecx | |
3493 | je .Lpoint_addx | |
3494 | ___ | |
3495 | } else { | |
3496 | $src0 = "%rdx"; | |
3497 | $sfx = "x"; | |
3498 | $bias = 128; | |
3499 | ||
3500 | $code.=<<___; | |
3501 | .type ecp_nistz256_point_addx,\@function,3 | |
3502 | .align 32 | |
3503 | ecp_nistz256_point_addx: | |
86e11278 | 3504 | .cfi_startproc |
4d3fa06f AP |
3505 | .Lpoint_addx: |
3506 | ___ | |
3507 | } | |
3508 | $code.=<<___; | |
3509 | push %rbp | |
86e11278 | 3510 | .cfi_push %rbp |
4d3fa06f | 3511 | push %rbx |
86e11278 | 3512 | .cfi_push %rbx |
4d3fa06f | 3513 | push %r12 |
86e11278 | 3514 | .cfi_push %r12 |
4d3fa06f | 3515 | push %r13 |
86e11278 | 3516 | .cfi_push %r13 |
4d3fa06f | 3517 | push %r14 |
86e11278 | 3518 | .cfi_push %r14 |
4d3fa06f | 3519 | push %r15 |
86e11278 | 3520 | .cfi_push %r15 |
4d3fa06f | 3521 | sub \$32*18+8, %rsp |
86e11278 | 3522 | .cfi_adjust_cfa_offset 32*18+8 |
384e6de4 | 3523 | .Lpoint_add${x}_body: |
4d3fa06f AP |
3524 | |
3525 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | |
3526 | movdqu 0x10($a_ptr), %xmm1 | |
3527 | movdqu 0x20($a_ptr), %xmm2 | |
3528 | movdqu 0x30($a_ptr), %xmm3 | |
3529 | movdqu 0x40($a_ptr), %xmm4 | |
3530 | movdqu 0x50($a_ptr), %xmm5 | |
3531 | mov $a_ptr, $b_ptr # reassign | |
3532 | mov $b_org, $a_ptr # reassign | |
3533 | movdqa %xmm0, $in1_x(%rsp) | |
3534 | movdqa %xmm1, $in1_x+0x10(%rsp) | |
4d3fa06f AP |
3535 | movdqa %xmm2, $in1_y(%rsp) |
3536 | movdqa %xmm3, $in1_y+0x10(%rsp) | |
4d3fa06f AP |
3537 | movdqa %xmm4, $in1_z(%rsp) |
3538 | movdqa %xmm5, $in1_z+0x10(%rsp) | |
e3057a57 | 3539 | por %xmm4, %xmm5 |
4d3fa06f AP |
3540 | |
3541 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr | |
e3057a57 | 3542 | pshufd \$0xb1, %xmm5, %xmm3 |
4d3fa06f AP |
3543 | movdqu 0x10($a_ptr), %xmm1 |
3544 | movdqu 0x20($a_ptr), %xmm2 | |
3545 | por %xmm3, %xmm5 | |
3546 | movdqu 0x30($a_ptr), %xmm3 | |
3547 | mov 0x40+8*0($a_ptr), $src0 # load original in2_z | |
3548 | mov 0x40+8*1($a_ptr), $acc6 | |
3549 | mov 0x40+8*2($a_ptr), $acc7 | |
3550 | mov 0x40+8*3($a_ptr), $acc0 | |
3551 | movdqa %xmm0, $in2_x(%rsp) | |
3552 | pshufd \$0x1e, %xmm5, %xmm4 | |
3553 | movdqa %xmm1, $in2_x+0x10(%rsp) | |
e3057a57 AP |
3554 | movdqu 0x40($a_ptr),%xmm0 # in2_z again |
3555 | movdqu 0x50($a_ptr),%xmm1 | |
4d3fa06f AP |
3556 | movdqa %xmm2, $in2_y(%rsp) |
3557 | movdqa %xmm3, $in2_y+0x10(%rsp) | |
4d3fa06f AP |
3558 | por %xmm4, %xmm5 |
3559 | pxor %xmm4, %xmm4 | |
e3057a57 AP |
3560 | por %xmm0, %xmm1 |
3561 | movq $r_ptr, %xmm0 # save $r_ptr | |
4d3fa06f AP |
3562 | |
3563 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | |
3564 | mov $src0, $in2_z+8*0(%rsp) # make in2_z copy | |
3565 | mov $acc6, $in2_z+8*1(%rsp) | |
3566 | mov $acc7, $in2_z+8*2(%rsp) | |
3567 | mov $acc0, $in2_z+8*3(%rsp) | |
3568 | lea $Z2sqr(%rsp), $r_ptr # Z2^2 | |
3569 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); | |
3570 | ||
3571 | pcmpeqd %xmm4, %xmm5 | |
e3057a57 AP |
3572 | pshufd \$0xb1, %xmm1, %xmm4 |
3573 | por %xmm1, %xmm4 | |
4d3fa06f AP |
3574 | pshufd \$0, %xmm5, %xmm5 # in1infty |
3575 | pshufd \$0x1e, %xmm4, %xmm3 | |
3576 | por %xmm3, %xmm4 | |
3577 | pxor %xmm3, %xmm3 | |
3578 | pcmpeqd %xmm3, %xmm4 | |
3579 | pshufd \$0, %xmm4, %xmm4 # in2infty | |
3580 | mov 0x40+8*0($b_ptr), $src0 # load original in1_z | |
3581 | mov 0x40+8*1($b_ptr), $acc6 | |
3582 | mov 0x40+8*2($b_ptr), $acc7 | |
3583 | mov 0x40+8*3($b_ptr), $acc0 | |
d9375341 | 3584 | movq $b_ptr, %xmm1 |
4d3fa06f AP |
3585 | |
3586 | lea 0x40-$bias($b_ptr), $a_ptr | |
3587 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | |
3588 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | |
3589 | ||
3590 | `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` | |
3591 | lea $S1(%rsp), $r_ptr # S1 = Z2^3 | |
3592 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); | |
3593 | ||
3594 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | |
3595 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | |
3596 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | |
3597 | ||
3598 | `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` | |
3599 | lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 | |
3600 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); | |
3601 | ||
3602 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | |
3603 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | |
3604 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | |
3605 | ||
3606 | lea $S1(%rsp), $b_ptr | |
3607 | lea $R(%rsp), $r_ptr # R = S2 - S1 | |
3608 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); | |
3609 | ||
3610 | or $acc5, $acc4 # see if result is zero | |
3611 | movdqa %xmm4, %xmm2 | |
3612 | or $acc0, $acc4 | |
3613 | or $acc1, $acc4 | |
3614 | por %xmm5, %xmm2 # in1infty || in2infty | |
3615 | movq $acc4, %xmm3 | |
3616 | ||
3617 | `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` | |
3618 | lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 | |
3619 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); | |
3620 | ||
3621 | `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` | |
3622 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | |
3623 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); | |
3624 | ||
3625 | lea $U1(%rsp), $b_ptr | |
3626 | lea $H(%rsp), $r_ptr # H = U2 - U1 | |
3627 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); | |
3628 | ||
3629 | or $acc5, $acc4 # see if result is zero | |
3630 | or $acc0, $acc4 | |
90b797f0 | 3631 | or $acc1, $acc4 # !is_equal(U1, U2) |
4d3fa06f | 3632 | |
5578ad1f BE |
3633 | movq %xmm2, $acc0 # in1infty | in2infty |
3634 | movq %xmm3, $acc1 # !is_equal(S1, S2) | |
90b797f0 DB |
3635 | |
3636 | or $acc0, $acc4 | |
5578ad1f | 3637 | or $acc1, $acc4 |
4d3fa06f | 3638 | |
5578ad1f BE |
3639 | # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2)) |
3640 | .byte 0x3e # predict taken | |
3641 | jnz .Ladd_proceed$x | |
4d3fa06f | 3642 | |
d9375341 AP |
3643 | .Ladd_double$x: |
3644 | movq %xmm1, $a_ptr # restore $a_ptr | |
3645 | movq %xmm0, $r_ptr # restore $r_ptr | |
3646 | add \$`32*(18-5)`, %rsp # difference in frame sizes | |
c0e8e500 | 3647 | .cfi_adjust_cfa_offset `-32*(18-5)` |
d9375341 | 3648 | jmp .Lpoint_double_shortcut$x |
c0e8e500 | 3649 | .cfi_adjust_cfa_offset `32*(18-5)` |
d9375341 | 3650 | |
4d3fa06f AP |
3651 | .align 32 |
3652 | .Ladd_proceed$x: | |
3653 | `&load_for_sqr("$R(%rsp)", "$src0")` | |
3654 | lea $Rsqr(%rsp), $r_ptr # R^2 | |
3655 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | |
3656 | ||
3657 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | |
3658 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | |
3659 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | |
3660 | ||
3661 | `&load_for_sqr("$H(%rsp)", "$src0")` | |
3662 | lea $Hsqr(%rsp), $r_ptr # H^2 | |
3663 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | |
3664 | ||
3665 | `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` | |
3666 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | |
3667 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); | |
3668 | ||
3669 | `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` | |
3670 | lea $Hcub(%rsp), $r_ptr # H^3 | |
3671 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | |
3672 | ||
3673 | `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` | |
3674 | lea $U2(%rsp), $r_ptr # U1*H^2 | |
3675 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); | |
3676 | ___ | |
# The scoped block below shadows $acc0..$acc3 with the outer $acc4, $acc5,
# $acc0, $acc1 (and $t3/$t4 with the outer $acc2/$acc3), so the open-coded
# doubling of the product just computed can be written with "natural"
# names.  The doubled value stays in registers and is consumed by the
# following __ecp_nistz256_sub$x call together with Rsqr loaded into
# $t0..$t3.
3677 | { | |
3678 | ####################################################################### | |
3679 | # operate in 4-5-0-1 "name space" that matches multiplication output | |
3680 | # | |
3681 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | |
3682 | my ($poly1, $poly3)=($acc6,$acc7); | |
3683 | ||
3684 | $code.=<<___; | |
3685 | #lea $U2(%rsp), $a_ptr | |
3686 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | |
3687 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | |
3688 | ||
b62b2454 | 3689 | xor $t4, $t4 |
4d3fa06f AP |
3690 | add $acc0, $acc0 # a0:a3+a0:a3 |
3691 | lea $Rsqr(%rsp), $a_ptr | |
3692 | adc $acc1, $acc1 | |
3693 | mov $acc0, $t0 | |
3694 | adc $acc2, $acc2 | |
3695 | adc $acc3, $acc3 | |
3696 | mov $acc1, $t1 | |
b62b2454 | 3697 | adc \$0, $t4 |
4d3fa06f AP |
3698 | |
3699 | sub \$-1, $acc0 | |
3700 | mov $acc2, $t2 | |
3701 | sbb $poly1, $acc1 | |
3702 | sbb \$0, $acc2 | |
3703 | mov $acc3, $t3 | |
3704 | sbb $poly3, $acc3 | |
b62b2454 | 3705 | sbb \$0, $t4 |
4d3fa06f | 3706 | |
d3034d31 | 3707 | cmovc $t0, $acc0 |
4d3fa06f | 3708 | mov 8*0($a_ptr), $t0 |
d3034d31 | 3709 | cmovc $t1, $acc1 |
4d3fa06f | 3710 | mov 8*1($a_ptr), $t1 |
d3034d31 | 3711 | cmovc $t2, $acc2 |
4d3fa06f | 3712 | mov 8*2($a_ptr), $t2 |
d3034d31 | 3713 | cmovc $t3, $acc3 |
4d3fa06f AP |
3714 | mov 8*3($a_ptr), $t3 |
3715 | ||
3716 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | |
3717 | ||
3718 | lea $Hcub(%rsp), $b_ptr | |
3719 | lea $res_x(%rsp), $r_ptr | |
3720 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | |
3721 | ||
3722 | mov $U2+8*0(%rsp), $t0 | |
3723 | mov $U2+8*1(%rsp), $t1 | |
3724 | mov $U2+8*2(%rsp), $t2 | |
3725 | mov $U2+8*3(%rsp), $t3 | |
3726 | lea $res_y(%rsp), $r_ptr | |
3727 | ||
3728 | call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); | |
3729 | ||
3730 | mov $acc0, 8*0($r_ptr) # save the result, as | |
3731 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't | |
3732 | mov $acc2, 8*2($r_ptr) | |
3733 | mov $acc3, 8*3($r_ptr) | |
3734 | ___ | |
3735 | } | |
3736 | $code.=<<___; | |
3737 | `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` | |
3738 | lea $S2(%rsp), $r_ptr | |
3739 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); | |
3740 | ||
3741 | `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` | |
3742 | lea $res_y(%rsp), $r_ptr | |
3743 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); | |
3744 | ||
3745 | lea $S2(%rsp), $b_ptr | |
3746 | lea $res_y(%rsp), $r_ptr | |
3747 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); | |
3748 | ||
3749 | movq %xmm0, $r_ptr # restore $r_ptr | |
3750 | ||
3751 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); | |
3752 | movdqa %xmm5, %xmm1 | |
3753 | pandn $res_z(%rsp), %xmm0 | |
3754 | movdqa %xmm5, %xmm2 | |
3755 | pandn $res_z+0x10(%rsp), %xmm1 | |
3756 | movdqa %xmm5, %xmm3 | |
3757 | pand $in2_z(%rsp), %xmm2 | |
3758 | pand $in2_z+0x10(%rsp), %xmm3 | |
3759 | por %xmm0, %xmm2 | |
3760 | por %xmm1, %xmm3 | |
3761 | ||
3762 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | |
3763 | movdqa %xmm4, %xmm1 | |
3764 | pandn %xmm2, %xmm0 | |
3765 | movdqa %xmm4, %xmm2 | |
3766 | pandn %xmm3, %xmm1 | |
3767 | movdqa %xmm4, %xmm3 | |
3768 | pand $in1_z(%rsp), %xmm2 | |
3769 | pand $in1_z+0x10(%rsp), %xmm3 | |
3770 | por %xmm0, %xmm2 | |
3771 | por %xmm1, %xmm3 | |
3772 | movdqu %xmm2, 0x40($r_ptr) | |
3773 | movdqu %xmm3, 0x50($r_ptr) | |
3774 | ||
3775 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | |
3776 | movdqa %xmm5, %xmm1 | |
3777 | pandn $res_x(%rsp), %xmm0 | |
3778 | movdqa %xmm5, %xmm2 | |
3779 | pandn $res_x+0x10(%rsp), %xmm1 | |
3780 | movdqa %xmm5, %xmm3 | |
3781 | pand $in2_x(%rsp), %xmm2 | |
3782 | pand $in2_x+0x10(%rsp), %xmm3 | |
3783 | por %xmm0, %xmm2 | |
3784 | por %xmm1, %xmm3 | |
3785 | ||
3786 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | |
3787 | movdqa %xmm4, %xmm1 | |
3788 | pandn %xmm2, %xmm0 | |
3789 | movdqa %xmm4, %xmm2 | |
3790 | pandn %xmm3, %xmm1 | |
3791 | movdqa %xmm4, %xmm3 | |
3792 | pand $in1_x(%rsp), %xmm2 | |
3793 | pand $in1_x+0x10(%rsp), %xmm3 | |
3794 | por %xmm0, %xmm2 | |
3795 | por %xmm1, %xmm3 | |
3796 | movdqu %xmm2, 0x00($r_ptr) | |
3797 | movdqu %xmm3, 0x10($r_ptr) | |
3798 | ||
3799 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | |
3800 | movdqa %xmm5, %xmm1 | |
3801 | pandn $res_y(%rsp), %xmm0 | |
3802 | movdqa %xmm5, %xmm2 | |
3803 | pandn $res_y+0x10(%rsp), %xmm1 | |
3804 | movdqa %xmm5, %xmm3 | |
3805 | pand $in2_y(%rsp), %xmm2 | |
3806 | pand $in2_y+0x10(%rsp), %xmm3 | |
3807 | por %xmm0, %xmm2 | |
3808 | por %xmm1, %xmm3 | |
3809 | ||
3810 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | |
3811 | movdqa %xmm4, %xmm1 | |
3812 | pandn %xmm2, %xmm0 | |
3813 | movdqa %xmm4, %xmm2 | |
3814 | pandn %xmm3, %xmm1 | |
3815 | movdqa %xmm4, %xmm3 | |
3816 | pand $in1_y(%rsp), %xmm2 | |
3817 | pand $in1_y+0x10(%rsp), %xmm3 | |
3818 | por %xmm0, %xmm2 | |
3819 | por %xmm1, %xmm3 | |
3820 | movdqu %xmm2, 0x20($r_ptr) | |
3821 | movdqu %xmm3, 0x30($r_ptr) | |
3822 | ||
3823 | .Ladd_done$x: | |
384e6de4 | 3824 | lea 32*18+56(%rsp), %rsi |
86e11278 | 3825 | .cfi_def_cfa %rsi,8 |
384e6de4 | 3826 | mov -48(%rsi),%r15 |
86e11278 | 3827 | .cfi_restore %r15 |
384e6de4 | 3828 | mov -40(%rsi),%r14 |
86e11278 | 3829 | .cfi_restore %r14 |
384e6de4 | 3830 | mov -32(%rsi),%r13 |
86e11278 | 3831 | .cfi_restore %r13 |
384e6de4 | 3832 | mov -24(%rsi),%r12 |
86e11278 | 3833 | .cfi_restore %r12 |
384e6de4 | 3834 | mov -16(%rsi),%rbx |
86e11278 | 3835 | .cfi_restore %rbx |
384e6de4 | 3836 | mov -8(%rsi),%rbp |
86e11278 | 3837 | .cfi_restore %rbp |
384e6de4 | 3838 | lea (%rsi),%rsp |
86e11278 | 3839 | .cfi_def_cfa_register %rsp |
384e6de4 | 3840 | .Lpoint_add${x}_epilogue: |
4d3fa06f | 3841 | ret |
86e11278 | 3842 | .cfi_endproc |
4d3fa06f AP |
3843 | .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx |
3844 | ___ | |
3845 | } | |
# Emit the "q" flavour (global ecp_nistz256_point_add).
3846 | &gen_add("q"); | |
3847 | ||
# gen_add_affine($x): append to $code the assembly text for one flavour of the
# point addition whose first operand is copied as a P256_POINT and whose second
# operand is copied as a P256_POINT_AFFINE (see the inline "copy" comments).
#   $x ne "x": emits the global ecp_nistz256_point_add_affine
#   $x eq "x": emits the non-global ecp_nistz256_point_add_affinex
# $x is also interpolated into the helper names that are called
# (__ecp_nistz256_*_mont$x, __ecp_nistz256_sub$x, ...) and into the
# .Ladd_affine${x}_body/_epilogue labels.
# NOTE(review): the empty prototype "()" does not match the one argument taken
# via shift; it is harmless only because the calls in this file use the
# &gen_add_affine(...) form, which bypasses prototypes.
3848 | sub gen_add_affine () { | |
3849 | my $x = shift; | |
3850 | my ($src0,$sfx,$bias); | |
# Fifteen 32-byte scratch slots, expressed as byte offsets from %rsp; this is
# the 32*15 part of the "sub $32*15+8, %rsp" frame allocation below.
3851 | my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, | |
3852 | $res_x,$res_y,$res_z, | |
3853 | $in1_x,$in1_y,$in1_z, | |
3854 | $in2_x,$in2_y)=map(32*$_,(0..14)); | |
# Z1sqr has no slot of its own: it shares the S2 slot.
3855 | my $Z1sqr = $S2; | |
3856 | |
3857 | if ($x ne "x") { | |
# Default flavour: exported symbol, no suffix on the .size directive, no bias
# applied to the pointers handed to the sqr/mul helpers.
3858 | $src0 = "%rax"; | |
3859 | $sfx = ""; | |
3860 | $bias = 0; | |
3861 | |
3862 | $code.=<<___; | |
3863 | .globl ecp_nistz256_point_add_affine | |
3864 | .type ecp_nistz256_point_add_affine,\@function,3 | |
3865 | .align 32 | |
3866 | ecp_nistz256_point_add_affine: | |
86e11278 | 3867 | .cfi_startproc |
4d3fa06f AP |
3868 | ___ |
# When $addx is set, the default entry point first tests OPENSSL_ia32cap_P+8
# against mask 0x80100 and jumps to .Lpoint_add_affinex if both bits are set.
# NOTE(review): presumably these are the BMI2 and ADX capability bits - confirm.
3869 | $code.=<<___ if ($addx); | |
3870 | mov \$0x80100, %ecx | |
3871 | and OPENSSL_ia32cap_P+8(%rip), %ecx | |
3872 | cmp \$0x80100, %ecx | |
3873 | je .Lpoint_add_affinex | |
3874 | ___ | |
3875 | } else { | |
# "x" flavour: operand register is %rdx, symbols carry an "x" suffix, and the
# a_ptr values passed to the sqr/mul helpers are lowered by 128 bytes.
3876 | $src0 = "%rdx"; | |
3877 | $sfx = "x"; | |
3878 | $bias = 128; | |
3879 | |
3880 | $code.=<<___; | |
3881 | .type ecp_nistz256_point_add_affinex,\@function,3 | |
3882 | .align 32 | |
3883 | ecp_nistz256_point_add_affinex: | |
86e11278 | 3884 | .cfi_startproc |
4d3fa06f AP |
3885 | .Lpoint_add_affinex: |
3886 | ___ | |
3887 | } | |
# Shared prologue and the first part of the computation (up to U1*H^2); the
# text is identical for both flavours apart from the interpolated variables.
3888 | $code.=<<___; | |
3889 | push %rbp | |
86e11278 | 3890 | .cfi_push %rbp |
4d3fa06f | 3891 | push %rbx |
86e11278 | 3892 | .cfi_push %rbx |
4d3fa06f | 3893 | push %r12 |
86e11278 | 3894 | .cfi_push %r12 |
4d3fa06f | 3895 | push %r13 |
86e11278 | 3896 | .cfi_push %r13 |
4d3fa06f | 3897 | push %r14 |
86e11278 | 3898 | .cfi_push %r14 |
4d3fa06f | 3899 | push %r15 |
86e11278 | 3900 | .cfi_push %r15 |
4d3fa06f | 3901 | sub \$32*15+8, %rsp |
86e11278 | 3902 | .cfi_adjust_cfa_offset 32*15+8 |
384e6de4 | 3903 | .Ladd_affine${x}_body: |
4d3fa06f AP |
3904 | |
3905 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | |
3906 | mov $b_org, $b_ptr # reassign | |
3907 | movdqu 0x10($a_ptr), %xmm1 | |
3908 | movdqu 0x20($a_ptr), %xmm2 | |
3909 | movdqu 0x30($a_ptr), %xmm3 | |
3910 | movdqu 0x40($a_ptr), %xmm4 | |
3911 | movdqu 0x50($a_ptr), %xmm5 | |
3912 | mov 0x40+8*0($a_ptr), $src0 # load original in1_z | |
3913 | mov 0x40+8*1($a_ptr), $acc6 | |
3914 | mov 0x40+8*2($a_ptr), $acc7 | |
3915 | mov 0x40+8*3($a_ptr), $acc0 | |
3916 | movdqa %xmm0, $in1_x(%rsp) | |
3917 | movdqa %xmm1, $in1_x+0x10(%rsp) | |
4d3fa06f AP |
3918 | movdqa %xmm2, $in1_y(%rsp) |
3919 | movdqa %xmm3, $in1_y+0x10(%rsp) | |
4d3fa06f AP |
3920 | movdqa %xmm4, $in1_z(%rsp) |
3921 | movdqa %xmm5, $in1_z+0x10(%rsp) | |
e3057a57 | 3922 | por %xmm4, %xmm5 |
4d3fa06f AP |
3923 | |
3924 | movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr | |
e3057a57 | 3925 | pshufd \$0xb1, %xmm5, %xmm3 |
4d3fa06f AP |
3926 | movdqu 0x10($b_ptr), %xmm1 |
3927 | movdqu 0x20($b_ptr), %xmm2 | |
3928 | por %xmm3, %xmm5 | |
3929 | movdqu 0x30($b_ptr), %xmm3 | |
3930 | movdqa %xmm0, $in2_x(%rsp) | |
3931 | pshufd \$0x1e, %xmm5, %xmm4 | |
3932 | movdqa %xmm1, $in2_x+0x10(%rsp) | |
3933 | por %xmm0, %xmm1 | |
3934 | movq $r_ptr, %xmm0 # save $r_ptr | |
3935 | movdqa %xmm2, $in2_y(%rsp) | |
3936 | movdqa %xmm3, $in2_y+0x10(%rsp) | |
3937 | por %xmm2, %xmm3 | |
3938 | por %xmm4, %xmm5 | |
3939 | pxor %xmm4, %xmm4 | |
3940 | por %xmm1, %xmm3 | |
3941 | |
3942 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | |
3943 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | |
3944 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | |
3945 | |
3946 | pcmpeqd %xmm4, %xmm5 | |
3947 | pshufd \$0xb1, %xmm3, %xmm4 | |
3948 | mov 0x00($b_ptr), $src0 # $b_ptr is still valid | |
3949 | #lea 0x00($b_ptr), $b_ptr | |
3950 | mov $acc4, $acc1 # harmonize sqr output and mul input | |
3951 | por %xmm3, %xmm4 | |
3952 | pshufd \$0, %xmm5, %xmm5 # in1infty | |
3953 | pshufd \$0x1e, %xmm4, %xmm3 | |
3954 | mov $acc5, $acc2 | |
3955 | por %xmm3, %xmm4 | |
3956 | pxor %xmm3, %xmm3 | |
3957 | mov $acc6, $acc3 | |
3958 | pcmpeqd %xmm3, %xmm4 | |
3959 | pshufd \$0, %xmm4, %xmm4 # in2infty | |
3960 | |
3961 | lea $Z1sqr-$bias(%rsp), $a_ptr | |
3962 | mov $acc7, $acc4 | |
3963 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | |
3964 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); | |
3965 | |
3966 | lea $in1_x(%rsp), $b_ptr | |
3967 | lea $H(%rsp), $r_ptr # H = U2 - U1 | |
3968 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); | |
3969 | |
3970 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | |
3971 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | |
3972 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | |
3973 | |
3974 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | |
3975 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | |
3976 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | |
3977 | |
3978 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | |
3979 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | |
3980 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | |
3981 | |
3982 | lea $in1_y(%rsp), $b_ptr | |
3983 | lea $R(%rsp), $r_ptr # R = S2 - S1 | |
3984 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); | |
3985 | |
3986 | `&load_for_sqr("$H(%rsp)", "$src0")` | |
3987 | lea $Hsqr(%rsp), $r_ptr # H^2 | |
3988 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | |
3989 | |
3990 | `&load_for_sqr("$R(%rsp)", "$src0")` | |
3991 | lea $Rsqr(%rsp), $r_ptr # R^2 | |
3992 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | |
3993 | |
3994 | `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` | |
3995 | lea $Hcub(%rsp), $r_ptr # H^3 | |
3996 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | |
3997 | |
3998 | `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` | |
3999 | lea $U2(%rsp), $r_ptr # U1*H^2 | |
4000 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); | |
4001 | ___ | |
4002 | { | |
4003 | ####################################################################### | |
4004 | # operate in 4-5-0-1 "name space" that matches multiplication output | |
4005 | # | |
# Inside this bare block $acc0..$acc3 are lexically re-bound to the outer
# $acc4,$acc5,$acc0,$acc1, and $t3/$t4 to the outer $acc2/$acc3; the right-hand
# side is evaluated with the outer names before the new "my" bindings apply.
# $poly1/$poly3 are taken from the outer $acc6/$acc7.
4006 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | |
4007 | my ($poly1, $poly3)=($acc6,$acc7); | |
4008 | |
4009 | $code.=<<___; | |
4010 | #lea $U2(%rsp), $a_ptr | |
4011 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | |
4012 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | |
4013 | |
b62b2454 | 4014 | xor $t4, $t4 |
4d3fa06f AP |
4015 | add $acc0, $acc0 # a0:a3+a0:a3 |
4016 | lea $Rsqr(%rsp), $a_ptr | |
4017 | adc $acc1, $acc1 | |
4018 | mov $acc0, $t0 | |
4019 | adc $acc2, $acc2 | |
4020 | adc $acc3, $acc3 | |
4021 | mov $acc1, $t1 | |
b62b2454 | 4022 | adc \$0, $t4 |
4d3fa06f AP |
4023 | |
4024 | sub \$-1, $acc0 | |
4025 | mov $acc2, $t2 | |
4026 | sbb $poly1, $acc1 | |
4027 | sbb \$0, $acc2 | |
4028 | mov $acc3, $t3 | |
4029 | sbb $poly3, $acc3 | |
b62b2454 | 4030 | sbb \$0, $t4 |
4d3fa06f | 4031 |
d3034d31 | 4032 | cmovc $t0, $acc0 |
4d3fa06f | 4033 | mov 8*0($a_ptr), $t0 |
d3034d31 | 4034 | cmovc $t1, $acc1 |
4d3fa06f | 4035 | mov 8*1($a_ptr), $t1 |
d3034d31 | 4036 | cmovc $t2, $acc2 |
4d3fa06f | 4037 | mov 8*2($a_ptr), $t2 |
d3034d31 | 4038 | cmovc $t3, $acc3 |
4d3fa06f AP |
4039 | mov 8*3($a_ptr), $t3 |
4040 | |
4041 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | |
4042 | |
4043 | lea $Hcub(%rsp), $b_ptr | |
4044 | lea $res_x(%rsp), $r_ptr | |
4045 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | |
4046 | |
4047 | mov $U2+8*0(%rsp), $t0 | |
4048 | mov $U2+8*1(%rsp), $t1 | |
4049 | mov $U2+8*2(%rsp), $t2 | |
4050 | mov $U2+8*3(%rsp), $t3 | |
4051 | lea $H(%rsp), $r_ptr | |
4052 | |
4053 | call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); | |
4054 | |
4055 | mov $acc0, 8*0($r_ptr) # save the result, as | |
4056 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't | |
4057 | mov $acc2, 8*2($r_ptr) | |
4058 | mov $acc3, 8*3($r_ptr) | |
4059 | ___ | |
4060 | } | |
# Remainder: res_y computation, the masked selection of the output coordinates
# (masks in1infty/in2infty are kept in %xmm5/%xmm4, see the pshufd comments
# above), the epilogue that reloads the six pushed registers, and the .size
# directive, which is the only place $sfx is used.
4061 | $code.=<<___; | |
4062 | `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` | |
4063 | lea $S2(%rsp), $r_ptr | |
4064 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); | |
4065 | |
4066 | `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` | |
4067 | lea $H(%rsp), $r_ptr | |
4068 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); | |
4069 | |
4070 | lea $S2(%rsp), $b_ptr | |
4071 | lea $res_y(%rsp), $r_ptr | |
4072 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); | |
4073 | |
4074 | movq %xmm0, $r_ptr # restore $r_ptr | |
4075 | |
4076 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); | |
4077 | movdqa %xmm5, %xmm1 | |
4078 | pandn $res_z(%rsp), %xmm0 | |
4079 | movdqa %xmm5, %xmm2 | |
4080 | pandn $res_z+0x10(%rsp), %xmm1 | |
4081 | movdqa %xmm5, %xmm3 | |
4082 | pand .LONE_mont(%rip), %xmm2 | |
4083 | pand .LONE_mont+0x10(%rip), %xmm3 | |
4084 | por %xmm0, %xmm2 | |
4085 | por %xmm1, %xmm3 | |
4086 | |
4087 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | |
4088 | movdqa %xmm4, %xmm1 | |
4089 | pandn %xmm2, %xmm0 | |
4090 | movdqa %xmm4, %xmm2 | |
4091 | pandn %xmm3, %xmm1 | |
4092 | movdqa %xmm4, %xmm3 | |
4093 | pand $in1_z(%rsp), %xmm2 | |
4094 | pand $in1_z+0x10(%rsp), %xmm3 | |
4095 | por %xmm0, %xmm2 | |
4096 | por %xmm1, %xmm3 | |
4097 | movdqu %xmm2, 0x40($r_ptr) | |
4098 | movdqu %xmm3, 0x50($r_ptr) | |
4099 | |
4100 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | |
4101 | movdqa %xmm5, %xmm1 | |
4102 | pandn $res_x(%rsp), %xmm0 | |
4103 | movdqa %xmm5, %xmm2 | |
4104 | pandn $res_x+0x10(%rsp), %xmm1 | |
4105 | movdqa %xmm5, %xmm3 | |
4106 | pand $in2_x(%rsp), %xmm2 | |
4107 | pand $in2_x+0x10(%rsp), %xmm3 | |
4108 | por %xmm0, %xmm2 | |
4109 | por %xmm1, %xmm3 | |
4110 | |
4111 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | |
4112 | movdqa %xmm4, %xmm1 | |
4113 | pandn %xmm2, %xmm0 | |
4114 | movdqa %xmm4, %xmm2 | |
4115 | pandn %xmm3, %xmm1 | |
4116 | movdqa %xmm4, %xmm3 | |
4117 | pand $in1_x(%rsp), %xmm2 | |
4118 | pand $in1_x+0x10(%rsp), %xmm3 | |
4119 | por %xmm0, %xmm2 | |
4120 | por %xmm1, %xmm3 | |
4121 | movdqu %xmm2, 0x00($r_ptr) | |
4122 | movdqu %xmm3, 0x10($r_ptr) | |
4123 | |
4124 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | |
4125 | movdqa %xmm5, %xmm1 | |
4126 | pandn $res_y(%rsp), %xmm0 | |
4127 | movdqa %xmm5, %xmm2 | |
4128 | pandn $res_y+0x10(%rsp), %xmm1 | |
4129 | movdqa %xmm5, %xmm3 | |
4130 | pand $in2_y(%rsp), %xmm2 | |
4131 | pand $in2_y+0x10(%rsp), %xmm3 | |
4132 | por %xmm0, %xmm2 | |
4133 | por %xmm1, %xmm3 | |
4134 | |
4135 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | |
4136 | movdqa %xmm4, %xmm1 | |
4137 | pandn %xmm2, %xmm0 | |
4138 | movdqa %xmm4, %xmm2 | |
4139 | pandn %xmm3, %xmm1 | |
4140 | movdqa %xmm4, %xmm3 | |
4141 | pand $in1_y(%rsp), %xmm2 | |
4142 | pand $in1_y+0x10(%rsp), %xmm3 | |
4143 | por %xmm0, %xmm2 | |
4144 | por %xmm1, %xmm3 | |
4145 | movdqu %xmm2, 0x20($r_ptr) | |
4146 | movdqu %xmm3, 0x30($r_ptr) | |
4147 | |
384e6de4 | 4148 | lea 32*15+56(%rsp), %rsi |
86e11278 | 4149 | .cfi_def_cfa %rsi,8 |
384e6de4 | 4150 | mov -48(%rsi),%r15 |
86e11278 | 4151 | .cfi_restore %r15 |
384e6de4 | 4152 | mov -40(%rsi),%r14 |
86e11278 | 4153 | .cfi_restore %r14 |
384e6de4 | 4154 | mov -32(%rsi),%r13 |
86e11278 | 4155 | .cfi_restore %r13 |
384e6de4 | 4156 | mov -24(%rsi),%r12 |
86e11278 | 4157 | .cfi_restore %r12 |
384e6de4 | 4158 | mov -16(%rsi),%rbx |
86e11278 | 4159 | .cfi_restore %rbx |
384e6de4 | 4160 | mov -8(%rsi),%rbp |
86e11278 | 4161 | .cfi_restore %rbp |
384e6de4 | 4162 | lea (%rsi),%rsp |
86e11278 | 4163 | .cfi_def_cfa_register %rsp |
384e6de4 | 4164 | .Ladd_affine${x}_epilogue: |
4d3fa06f | 4165 | ret |
86e11278 | 4166 | .cfi_endproc |
4d3fa06f AP |
4167 | .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx |
4168 | ___ | |
4169 | } | |
# Emit the default flavour; any argument other than "x" selects it.
4170 | &gen_add_affine("q"); | |
4171 | ||
4172 | ######################################################################## | |
4173 | # AD*X magic | |
4174 | # | |
# Emitted only when $addx is set.  The inner bare block appends four helper
# routines to $code (__ecp_nistz256_add_tox, __ecp_nistz256_sub_fromx,
# __ecp_nistz256_subx, __ecp_nistz256_mul_by_2x); after it, the point_double,
# point_add and point_add_affine generators are run again with "x".
# add_tox, sub_fromx and mul_by_2x store their four-word result through $r_ptr;
# subx leaves its result in registers only.
# Each routine opens with an xor, which clears the carry flag on x86, so the
# adc/sbb chain that follows starts from carry = 0.
4175 | if ($addx) { { | |
4176 | ######################################################################## | |
4177 | # operate in 4-5-0-1 "name space" that matches multiplication output | |
4178 | # | |
# $a0..$a3 name the outer $acc4,$acc5,$acc0,$acc1; $t3/$t4 name $acc2/$acc3.
# NOTE(review): $t0..$t2, $poly1, $poly3, $b_ptr and $r_ptr come from an
# enclosing scope that is not visible in this part of the file.
4179 | my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | |
4180 | |
4181 | $code.=<<___; | |
4182 | .type __ecp_nistz256_add_tox,\@abi-omnipotent | |
4183 | .align 32 | |
4184 | __ecp_nistz256_add_tox: | |
c0e8e500 | 4185 | .cfi_startproc |
4d3fa06f AP |
4186 | xor $t4, $t4 |
4187 | adc 8*0($b_ptr), $a0 | |
4188 | adc 8*1($b_ptr), $a1 | |
4189 | mov $a0, $t0 | |
4190 | adc 8*2($b_ptr), $a2 | |
4191 | adc 8*3($b_ptr), $a3 | |
4192 | mov $a1, $t1 | |
4193 | adc \$0, $t4 | |
4194 | |
4195 | xor $t3, $t3 | |
4196 | sbb \$-1, $a0 | |
4197 | mov $a2, $t2 | |
4198 | sbb $poly1, $a1 | |
4199 | sbb \$0, $a2 | |
4200 | mov $a3, $t3 | |
4201 | sbb $poly3, $a3 | |
b62b2454 | 4202 | sbb \$0, $t4 |
4d3fa06f | 4203 |
d3034d31 AP |
4204 | cmovc $t0, $a0 |
4205 | cmovc $t1, $a1 | |
4d3fa06f | 4206 | mov $a0, 8*0($r_ptr) |
d3034d31 | 4207 | cmovc $t2, $a2 |
4d3fa06f | 4208 | mov $a1, 8*1($r_ptr) |
d3034d31 | 4209 | cmovc $t3, $a3 |
4d3fa06f AP |
4210 | mov $a2, 8*2($r_ptr) |
4211 | mov $a3, 8*3($r_ptr) | |
4212 | |
4213 | ret | |
c0e8e500 | 4214 | .cfi_endproc |
4d3fa06f AP |
4215 | .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox |
4216 | |
4217 | .type __ecp_nistz256_sub_fromx,\@abi-omnipotent | |
4218 | .align 32 | |
4219 | __ecp_nistz256_sub_fromx: | |
c0e8e500 | 4220 | .cfi_startproc |
4d3fa06f AP |
4221 | xor $t4, $t4 |
4222 | sbb 8*0($b_ptr), $a0 | |
4223 | sbb 8*1($b_ptr), $a1 | |
4224 | mov $a0, $t0 | |
4225 | sbb 8*2($b_ptr), $a2 | |
4226 | sbb 8*3($b_ptr), $a3 | |
4227 | mov $a1, $t1 | |
4228 | sbb \$0, $t4 | |
4229 | |
4230 | xor $t3, $t3 | |
4231 | adc \$-1, $a0 | |
4232 | mov $a2, $t2 | |
4233 | adc $poly1, $a1 | |
4234 | adc \$0, $a2 | |
4235 | mov $a3, $t3 | |
4236 | adc $poly3, $a3 | |
4237 | |
4238 | bt \$0, $t4 | |
4239 | cmovnc $t0, $a0 | |
4240 | cmovnc $t1, $a1 | |
4241 | mov $a0, 8*0($r_ptr) | |
4242 | cmovnc $t2, $a2 | |
4243 | mov $a1, 8*1($r_ptr) | |
4244 | cmovnc $t3, $a3 | |
4245 | mov $a2, 8*2($r_ptr) | |
4246 | mov $a3, 8*3($r_ptr) | |
4247 | |
4248 | ret | |
c0e8e500 | 4249 | .cfi_endproc |
4d3fa06f AP |
4250 | .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx |
4251 | |
4252 | .type __ecp_nistz256_subx,\@abi-omnipotent | |
4253 | .align 32 | |
4254 | __ecp_nistz256_subx: | |
c0e8e500 | 4255 | .cfi_startproc |
4d3fa06f AP |
4256 | xor $t4, $t4 |
4257 | sbb $a0, $t0 | |
4258 | sbb $a1, $t1 | |
4259 | mov $t0, $a0 | |
4260 | sbb $a2, $t2 | |
4261 | sbb $a3, $t3 | |
4262 | mov $t1, $a1 | |
4263 | sbb \$0, $t4 | |
4264 | |
4265 | xor $a3 ,$a3 | |
4266 | adc \$-1, $t0 | |
4267 | mov $t2, $a2 | |
4268 | adc $poly1, $t1 | |
4269 | adc \$0, $t2 | |
4270 | mov $t3, $a3 | |
4271 | adc $poly3, $t3 | |
4272 | |
4273 | bt \$0, $t4 | |
4274 | cmovc $t0, $a0 | |
4275 | cmovc $t1, $a1 | |
4276 | cmovc $t2, $a2 | |
4277 | cmovc $t3, $a3 | |
4278 | |
4279 | ret | |
c0e8e500 | 4280 | .cfi_endproc |
4d3fa06f AP |
4281 | .size __ecp_nistz256_subx,.-__ecp_nistz256_subx |
4282 | |
4283 | .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent | |
4284 | .align 32 | |
4285 | __ecp_nistz256_mul_by_2x: | |
c0e8e500 | 4286 | .cfi_startproc |
4d3fa06f AP |
4287 | xor $t4, $t4 |
4288 | adc $a0, $a0 # a0:a3+a0:a3 | |
4289 | adc $a1, $a1 | |
4290 | mov $a0, $t0 | |
4291 | adc $a2, $a2 | |
4292 | adc $a3, $a3 | |
4293 | mov $a1, $t1 | |
4294 | adc \$0, $t4 | |
4295 | |
4296 | xor $t3, $t3 | |
4297 | sbb \$-1, $a0 | |
4298 | mov $a2, $t2 | |
4299 | sbb $poly1, $a1 | |
4300 | sbb \$0, $a2 | |
4301 | mov $a3, $t3 | |
4302 | sbb $poly3, $a3 | |
b62b2454 | 4303 | sbb \$0, $t4 |
4d3fa06f | 4304 |
d3034d31 AP |
4305 | cmovc $t0, $a0 |
4306 | cmovc $t1, $a1 | |
4d3fa06f | 4307 | mov $a0, 8*0($r_ptr) |
d3034d31 | 4308 | cmovc $t2, $a2 |
4d3fa06f | 4309 | mov $a1, 8*1($r_ptr) |
d3034d31 | 4310 | cmovc $t3, $a3 |
4d3fa06f AP |
4311 | mov $a2, 8*2($r_ptr) |
4312 | mov $a3, 8*3($r_ptr) | |
4313 | |
4314 | ret | |
c0e8e500 | 4315 | .cfi_endproc |
4d3fa06f AP |
4316 | .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x |
4317 | ___ | |
4318 | } | |
# Re-run the three generators with "x" so that they emit the *x entry points,
# which call the "x"-suffixed helpers.
4319 | &gen_double("x"); | |
4320 | &gen_add("x"); | |
4321 | &gen_add_affine("x"); | |
4322 | } | |
4323 | }}} | |
4324 | ||
384e6de4 AP |
4325 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
4326 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
# Win64 structured-exception-handling support, appended to $code only when
# $win64 is set: two handlers (short_handler, full_handler) that share
# .Lcommon_seh_tail, the .pdata table of begin/end/info triples, and the
# .xdata descriptors those triples point at.
4327 | if ($win64) { | |
# Registers that carry the four handler arguments named in the prototype
# comment above.
4328 | $rec="%rcx"; | |
4329 | $frame="%rdx"; | |
4330 | $context="%r8"; | |
4331 | $disp="%r9"; | |
4332 | |
# Handlers and the unconditional head of .pdata.
# short_handler: between the body and epilogue labels it restores only
# context->R12/R13, read from just above context->Rsp.
# full_handler: adds HandlerData[2] to context->Rsp and restores
# Rbx, Rbp and R12-R15 from the six slots below that address.
4333 | $code.=<<___; | |
4334 | .extern __imp_RtlVirtualUnwind | |
4335 | |
4336 | .type short_handler,\@abi-omnipotent | |
4337 | .align 16 | |
4338 | short_handler: | |
4339 | push %rsi | |
4340 | push %rdi | |
4341 | push %rbx | |
4342 | push %rbp | |
4343 | push %r12 | |
4344 | push %r13 | |
4345 | push %r14 | |
4346 | push %r15 | |
4347 | pushfq | |
4348 | sub \$64,%rsp | |
4349 | |
4350 | mov 120($context),%rax # pull context->Rax | |
4351 | mov 248($context),%rbx # pull context->Rip | |
4352 | |
4353 | mov 8($disp),%rsi # disp->ImageBase | |
4354 | mov 56($disp),%r11 # disp->HandlerData | |
4355 | |
4356 | mov 0(%r11),%r10d # HandlerData[0] | |
4357 | lea (%rsi,%r10),%r10 # end of prologue label | |
4358 | cmp %r10,%rbx # context->Rip<end of prologue label | |
4359 | jb .Lcommon_seh_tail | |
4360 | |
4361 | mov 152($context),%rax # pull context->Rsp | |
4362 | |
4363 | mov 4(%r11),%r10d # HandlerData[1] | |
4364 | lea (%rsi,%r10),%r10 # epilogue label | |
4365 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4366 | jae .Lcommon_seh_tail | |
4367 | |
4368 | lea 16(%rax),%rax | |
4369 | |
4370 | mov -8(%rax),%r12 | |
4371 | mov -16(%rax),%r13 | |
4372 | mov %r12,216($context) # restore context->R12 | |
4373 | mov %r13,224($context) # restore context->R13 | |
4374 | |
4375 | jmp .Lcommon_seh_tail | |
4376 | .size short_handler,.-short_handler | |
4377 | |
4378 | .type full_handler,\@abi-omnipotent | |
4379 | .align 16 | |
4380 | full_handler: | |
4381 | push %rsi | |
4382 | push %rdi | |
4383 | push %rbx | |
4384 | push %rbp | |
4385 | push %r12 | |
4386 | push %r13 | |
4387 | push %r14 | |
4388 | push %r15 | |
4389 | pushfq | |
4390 | sub \$64,%rsp | |
4391 | |
4392 | mov 120($context),%rax # pull context->Rax | |
4393 | mov 248($context),%rbx # pull context->Rip | |
4394 | |
4395 | mov 8($disp),%rsi # disp->ImageBase | |
4396 | mov 56($disp),%r11 # disp->HandlerData | |
4397 | |
4398 | mov 0(%r11),%r10d # HandlerData[0] | |
4399 | lea (%rsi,%r10),%r10 # end of prologue label | |
4400 | cmp %r10,%rbx # context->Rip<end of prologue label | |
4401 | jb .Lcommon_seh_tail | |
4402 | |
4403 | mov 152($context),%rax # pull context->Rsp | |
4404 | |
4405 | mov 4(%r11),%r10d # HandlerData[1] | |
4406 | lea (%rsi,%r10),%r10 # epilogue label | |
4407 | cmp %r10,%rbx # context->Rip>=epilogue label | |
4408 | jae .Lcommon_seh_tail | |
4409 | |
4410 | mov 8(%r11),%r10d # HandlerData[2] | |
4411 | lea (%rax,%r10),%rax | |
4412 | |
79ca382d AP |
4413 | mov -8(%rax),%rbp |
4414 | mov -16(%rax),%rbx | |
384e6de4 AP |
4415 | mov -24(%rax),%r12 |
4416 | mov -32(%rax),%r13 | |
4417 | mov -40(%rax),%r14 | |
4418 | mov -48(%rax),%r15 | |
4419 | mov %rbx,144($context) # restore context->Rbx | |
4420 | mov %rbp,160($context) # restore context->Rbp | |
4421 | mov %r12,216($context) # restore context->R12 | |
4422 | mov %r13,224($context) # restore context->R13 | |
4423 | mov %r14,232($context) # restore context->R14 | |
4424 | mov %r15,240($context) # restore context->R15 | |
4425 | |
4426 | .Lcommon_seh_tail: | |
4427 | mov 8(%rax),%rdi | |
4428 | mov 16(%rax),%rsi | |
4429 | mov %rax,152($context) # restore context->Rsp | |
4430 | mov %rsi,168($context) # restore context->Rsi | |
4431 | mov %rdi,176($context) # restore context->Rdi | |
4432 | |
4433 | mov 40($disp),%rdi # disp->ContextRecord | |
4434 | mov $context,%rsi # context | |
4435 | mov \$154,%ecx # sizeof(CONTEXT) | |
4436 | .long 0xa548f3fc # cld; rep movsq | |
4437 | |
4438 | mov $disp,%rsi | |
4439 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
4440 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
4441 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
4442 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
4443 | mov 40(%rsi),%r10 # disp->ContextRecord | |
4444 | lea 56(%rsi),%r11 # &disp->HandlerData | |
4445 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
4446 | mov %r10,32(%rsp) # arg5 | |
4447 | mov %r11,40(%rsp) # arg6 | |
4448 | mov %r12,48(%rsp) # arg7 | |
4449 | mov %rcx,56(%rsp) # arg8, (NULL) | |
4450 | call *__imp_RtlVirtualUnwind(%rip) | |
4451 | |
4452 | mov \$1,%eax # ExceptionContinueSearch | |
4453 | add \$64,%rsp | |
4454 | popfq | |
4455 | pop %r15 | |
4456 | pop %r14 | |
4457 | pop %r13 | |
4458 | pop %r12 | |
4459 | pop %rbp | |
4460 | pop %rbx | |
4461 | pop %rdi | |
4462 | pop %rsi | |
4463 | ret | |
4464 | .size full_handler,.-full_handler | |
4465 | |
4466 | .section .pdata | |
4467 | .align 4 | |
4468 | .rva .LSEH_begin_ecp_nistz256_mul_by_2 | |
4469 | .rva .LSEH_end_ecp_nistz256_mul_by_2 | |
4470 | .rva .LSEH_info_ecp_nistz256_mul_by_2 | |
4471 | |
4472 | .rva .LSEH_begin_ecp_nistz256_div_by_2 | |
4473 | .rva .LSEH_end_ecp_nistz256_div_by_2 | |
4474 | .rva .LSEH_info_ecp_nistz256_div_by_2 | |
4475 | |
4476 | .rva .LSEH_begin_ecp_nistz256_mul_by_3 | |
4477 | .rva .LSEH_end_ecp_nistz256_mul_by_3 | |
4478 | .rva .LSEH_info_ecp_nistz256_mul_by_3 | |
4479 | |
4480 | .rva .LSEH_begin_ecp_nistz256_add | |
4481 | .rva .LSEH_end_ecp_nistz256_add | |
4482 | .rva .LSEH_info_ecp_nistz256_add | |
4483 | |
4484 | .rva .LSEH_begin_ecp_nistz256_sub | |
4485 | .rva .LSEH_end_ecp_nistz256_sub | |
4486 | .rva .LSEH_info_ecp_nistz256_sub | |
4487 | |
4488 | .rva .LSEH_begin_ecp_nistz256_neg | |
4489 | .rva .LSEH_end_ecp_nistz256_neg | |
4490 | .rva .LSEH_info_ecp_nistz256_neg | |
4491 | |
d5e11843 AP |
4492 | .rva .LSEH_begin_ecp_nistz256_ord_mul_mont |
4493 | .rva .LSEH_end_ecp_nistz256_ord_mul_mont | |
4494 | .rva .LSEH_info_ecp_nistz256_ord_mul_mont | |
4495 | |
4496 | .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont | |
4497 | .rva .LSEH_end_ecp_nistz256_ord_sqr_mont | |
4498 | .rva .LSEH_info_ecp_nistz256_ord_sqr_mont | |
4499 | ___ | |
# .pdata entries for the ord_*_montx routines, only when $addx is set.
4500 | $code.=<<___ if ($addx); | |
4501 | .rva .LSEH_begin_ecp_nistz256_ord_mul_montx | |
4502 | .rva .LSEH_end_ecp_nistz256_ord_mul_montx | |
4503 | .rva .LSEH_info_ecp_nistz256_ord_mul_montx | |
4504 | |
4505 | .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx | |
4506 | .rva .LSEH_end_ecp_nistz256_ord_sqr_montx | |
4507 | .rva .LSEH_info_ecp_nistz256_ord_sqr_montx | |
4508 | ___ | |
# Unconditional .pdata entries; both gather_w5 and gather_w7 share the single
# .LSEH_info_ecp_nistz256_gather_wX descriptor.
4509 | $code.=<<___; | |
384e6de4 AP |
4510 | .rva .LSEH_begin_ecp_nistz256_to_mont |
4511 | .rva .LSEH_end_ecp_nistz256_to_mont | |
4512 | .rva .LSEH_info_ecp_nistz256_to_mont | |
4513 | |
4514 | .rva .LSEH_begin_ecp_nistz256_mul_mont | |
4515 | .rva .LSEH_end_ecp_nistz256_mul_mont | |
4516 | .rva .LSEH_info_ecp_nistz256_mul_mont | |
4517 | |
4518 | .rva .LSEH_begin_ecp_nistz256_sqr_mont | |
4519 | .rva .LSEH_end_ecp_nistz256_sqr_mont | |
4520 | .rva .LSEH_info_ecp_nistz256_sqr_mont | |
4521 | |
4522 | .rva .LSEH_begin_ecp_nistz256_from_mont | |
4523 | .rva .LSEH_end_ecp_nistz256_from_mont | |
4524 | .rva .LSEH_info_ecp_nistz256_from_mont | |
4525 | |
4526 | .rva .LSEH_begin_ecp_nistz256_gather_w5 | |
4527 | .rva .LSEH_end_ecp_nistz256_gather_w5 | |
4528 | .rva .LSEH_info_ecp_nistz256_gather_wX | |
4529 | |
4530 | .rva .LSEH_begin_ecp_nistz256_gather_w7 | |
4531 | .rva .LSEH_end_ecp_nistz256_gather_w7 | |
4532 | .rva .LSEH_info_ecp_nistz256_gather_wX | |
4533 | ___ | |
# .pdata entries for the AVX2 gather routines, only when $avx>1.
4534 | $code.=<<___ if ($avx>1); | |
4535 | .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 | |
4536 | .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 | |
4537 | .rva .LSEH_info_ecp_nistz256_avx2_gather_wX | |
4538 | |
4539 | .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 | |
4540 | .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 | |
4541 | .rva .LSEH_info_ecp_nistz256_avx2_gather_wX | |
4542 | ___ | |
# .pdata entries for the default point routines.
4543 | $code.=<<___; | |
4544 | .rva .LSEH_begin_ecp_nistz256_point_double | |
4545 | .rva .LSEH_end_ecp_nistz256_point_double | |
4546 | .rva .LSEH_info_ecp_nistz256_point_double | |
4547 | |
4548 | .rva .LSEH_begin_ecp_nistz256_point_add | |
4549 | .rva .LSEH_end_ecp_nistz256_point_add | |
4550 | .rva .LSEH_info_ecp_nistz256_point_add | |
4551 | |
4552 | .rva .LSEH_begin_ecp_nistz256_point_add_affine | |
4553 | .rva .LSEH_end_ecp_nistz256_point_add_affine | |
4554 | .rva .LSEH_info_ecp_nistz256_point_add_affine | |
4555 | ___ | |
# .pdata entries for the "x" point routines, only when $addx is set.
4556 | $code.=<<___ if ($addx); | |
4557 | .rva .LSEH_begin_ecp_nistz256_point_doublex | |
4558 | .rva .LSEH_end_ecp_nistz256_point_doublex | |
4559 | .rva .LSEH_info_ecp_nistz256_point_doublex | |
4560 | |
4561 | .rva .LSEH_begin_ecp_nistz256_point_addx | |
4562 | .rva .LSEH_end_ecp_nistz256_point_addx | |
4563 | .rva .LSEH_info_ecp_nistz256_point_addx | |
4564 | |
4565 | .rva .LSEH_begin_ecp_nistz256_point_add_affinex | |
4566 | .rva .LSEH_end_ecp_nistz256_point_add_affinex | |
4567 | .rva .LSEH_info_ecp_nistz256_point_add_affinex | |
4568 | ___ | |
# .xdata descriptors.  Handler-based ones list the body and epilogue labels
# (read by the handlers as HandlerData[0] and HandlerData[1]); those that use
# full_handler add a ".long N,0" whose first word is HandlerData[2], the
# offset full_handler adds to context->Rsp.
4569 | $code.=<<___; | |
4570 | |
4571 | .section .xdata | |
4572 | .align 8 | |
4573 | .LSEH_info_ecp_nistz256_mul_by_2: | |
4574 | .byte 9,0,0,0 | |
4575 | .rva short_handler | |
4576 | .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] | |
4577 | .LSEH_info_ecp_nistz256_div_by_2: | |
4578 | .byte 9,0,0,0 | |
4579 | .rva short_handler | |
4580 | .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] | |
4581 | .LSEH_info_ecp_nistz256_mul_by_3: | |
4582 | .byte 9,0,0,0 | |
4583 | .rva short_handler | |
4584 | .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] | |
4585 | .LSEH_info_ecp_nistz256_add: | |
4586 | .byte 9,0,0,0 | |
4587 | .rva short_handler | |
4588 | .rva .Ladd_body,.Ladd_epilogue # HandlerData[] | |
4589 | .LSEH_info_ecp_nistz256_sub: | |
4590 | .byte 9,0,0,0 | |
4591 | .rva short_handler | |
4592 | .rva .Lsub_body,.Lsub_epilogue # HandlerData[] | |
4593 | .LSEH_info_ecp_nistz256_neg: | |
4594 | .byte 9,0,0,0 | |
4595 | .rva short_handler | |
4596 | .rva .Lneg_body,.Lneg_epilogue # HandlerData[] | |
d5e11843 AP |
4597 | .LSEH_info_ecp_nistz256_ord_mul_mont: |
4598 | .byte 9,0,0,0 | |
4599 | .rva full_handler | |
4600 | .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] | |
4601 | .long 48,0 | |
4602 | .LSEH_info_ecp_nistz256_ord_sqr_mont: | |
4603 | .byte 9,0,0,0 | |
4604 | .rva full_handler | |
4605 | .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] | |
4606 | .long 48,0 | |
4607 | ___ | |
# Descriptors for the ord_*_montx routines, only when $addx is set.
4608 | $code.=<<___ if ($addx); | |
4609 | .LSEH_info_ecp_nistz256_ord_mul_montx: | |
4610 | .byte 9,0,0,0 | |
4611 | .rva full_handler | |
4612 | .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] | |
4613 | .long 48,0 | |
4614 | .LSEH_info_ecp_nistz256_ord_sqr_montx: | |
4615 | .byte 9,0,0,0 | |
4616 | .rva full_handler | |
4617 | .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] | |
4618 | .long 48,0 | |
4619 | ___ | |
# to_mont reuses the .Lmul_body/.Lmul_epilogue labels of mul_mont.  The
# gather_wX descriptor uses no handler; it is a raw list of unwind codes, as
# annotated per line.
4620 | $code.=<<___; | |
384e6de4 AP |
4621 | .LSEH_info_ecp_nistz256_to_mont: |
4622 | .byte 9,0,0,0 | |
4623 | .rva full_handler | |
4624 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | |
4625 | .long 48,0 | |
4626 | .LSEH_info_ecp_nistz256_mul_mont: | |
4627 | .byte 9,0,0,0 | |
4628 | .rva full_handler | |
4629 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | |
4630 | .long 48,0 | |
4631 | .LSEH_info_ecp_nistz256_sqr_mont: | |
4632 | .byte 9,0,0,0 | |
4633 | .rva full_handler | |
4634 | .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] | |
4635 | .long 48,0 | |
4636 | .LSEH_info_ecp_nistz256_from_mont: | |
4637 | .byte 9,0,0,0 | |
4638 | .rva short_handler | |
4639 | .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] | |
4640 | .LSEH_info_ecp_nistz256_gather_wX: | |
4641 | .byte 0x01,0x33,0x16,0x00 | |
4642 | .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 | |
4643 | .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 | |
4644 | .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 | |
4645 | .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 | |
4646 | .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 | |
4647 | .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 | |
4648 | .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 | |
4649 | .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 | |
4650 | .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | |
4651 | .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 | |
4652 | .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 | |
4653 | .align 8 | |
4654 | ___ | |
# Raw unwind codes for the AVX2 gather routines, only when $avx>1.
4655 | $code.=<<___ if ($avx>1); | |
4656 | .LSEH_info_ecp_nistz256_avx2_gather_wX: | |
4657 | .byte 0x01,0x36,0x17,0x0b | |
4658 | .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 | |
4659 | .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 | |
4660 | .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 | |
4661 | .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 | |
4662 | .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 | |
4663 | .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 | |
4664 | .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 | |
4665 | .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 | |
4666 | .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 | |
4667 | .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 | |
4668 | .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 | |
4669 | .byte 0x00,0xb3,0x00,0x00 # set_frame r11 | |
4670 | .align 8 | |
4671 | ___ | |
# Descriptors for the default point routines.  HandlerData[2] for
# point_add_affine is 32*15+56, the same displacement the point_add_affine
# epilogue applies to %rsp before reloading the saved registers.
4672 | $code.=<<___; | |
4673 | .LSEH_info_ecp_nistz256_point_double: | |
4674 | .byte 9,0,0,0 | |
4675 | .rva full_handler | |
4676 | .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] | |
4677 | .long 32*5+56,0 | |
4678 | .LSEH_info_ecp_nistz256_point_add: | |
4679 | .byte 9,0,0,0 | |
4680 | .rva full_handler | |
4681 | .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] | |
4682 | .long 32*18+56,0 | |
4683 | .LSEH_info_ecp_nistz256_point_add_affine: | |
4684 | .byte 9,0,0,0 | |
4685 | .rva full_handler | |
4686 | .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] | |
4687 | .long 32*15+56,0 | |
4688 | ___ | |
# Descriptors for the "x" point routines, only when $addx is set.
4689 | $code.=<<___ if ($addx); | |
4690 | .align 8 | |
4691 | .LSEH_info_ecp_nistz256_point_doublex: | |
4692 | .byte 9,0,0,0 | |
4693 | .rva full_handler | |
4694 | .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] | |
4695 | .long 32*5+56,0 | |
4696 | .LSEH_info_ecp_nistz256_point_addx: | |
4697 | .byte 9,0,0,0 | |
4698 | .rva full_handler | |
4699 | .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] | |
4700 | .long 32*18+56,0 | |
4701 | .LSEH_info_ecp_nistz256_point_add_affinex: | |
4702 | .byte 9,0,0,0 | |
4703 | .rva full_handler | |
4704 | .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] | |
4705 | .long 32*15+56,0 | |
4706 | ___ | |
4707 | } | |
4708 | ||
3ff08e1d AP |
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz256_gather_w7
#
# The table source is looked for in the current directory first and then one
# level above the directory held in $dir (expected to be set earlier in this
# script).  Three-argument open with a lexical handle is used so that the
# file name can never be interpreted as an open mode.
my $table_fh;
open($table_fh, '<', "ecp_nistz256_table.c") or
open($table_fh, '<', "${dir}../ecp_nistz256_table.c") or
    die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Read the table line by line.  Every TOBN(hi,lo) pair contributes two
# 32-bit words to @arr, low word first.
while (my $row = <$table_fh>) {
    while ($row =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr, hex($2), hex($1);
    }
}
close $table_fh;

# Sanity check: exactly 64*16*37 words must have been collected.
die "insane number of elements" if ($#arr != 64*16*37-1);

# The table is printed before $code, as a page-aligned object.
print <<___;
.text
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,\@object
.align 4096
ecp_nistz256_precomputed:
___
# Sixteen words per output line.
while (my @words = splice(@arr,0,16)) {
    print ".long\t",join(',',map { sprintf "0x%08x",$_} @words),"\n";
}
print <<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
___

# Expand every `...` span in the accumulated assembly by evaluating it as
# Perl, then emit the result.  A failure to flush STDOUT is fatal, so a
# truncated output file cannot go unnoticed.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";