# [scrape artifact — gitweb page header, preserved as comments so the file parses]
# git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/ec/asm/x25519-x86_64.pl
# Commit: Add some missing cfi frame info in x25519-x86_64.pl
# [thirdparty/openssl.git] / crypto / ec / asm / x25519-x86_64.pl
1 #!/usr/bin/env perl
2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # X25519 lower-level primitives for x86_64.
17 #
18 # February 2018.
19 #
20 # This module implements radix 2^51 multiplication and squaring, and
21 # radix 2^64 multiplication, squaring, addition, subtraction and final
22 # reduction. Latter radix is used on ADCX/ADOX-capable processors such
23 # as Broadwell. On related note one should mention that there are
24 # vector implementations that provide significantly better performance
25 # on some processors(*), but they are large and overly complex. Which
26 # in combination with them being effectively processor-specific makes
27 # the undertaking hard to justify. The goal for this implementation
28 # is rather versatility and simplicity [and ultimately formal
29 # verification].
30 #
31 # (*) For example sandy2x should provide ~30% improvement on Sandy
32 # Bridge, but only nominal ~5% on Haswell [and big loss on
33 # Broadwell and successors].
34 #
35 ######################################################################
36 # Improvement coefficients:
37 #
38 # amd64-51(*) gcc-5.x(**)
39 #
40 # P4 +22% +40%
41 # Sandy Bridge -3% +11%
42 # Haswell -1% +13%
43 # Broadwell(***) +30% +35%
44 # Skylake(***) +33% +47%
45 # Silvermont +20% +26%
46 # Goldmont +40% +50%
47 # Bulldozer +20% +9%
48 # Ryzen(***) +43% +40%
49 # VIA +170% +120%
50 #
51 # (*) amd64-51 is popular assembly implementation with 2^51 radix,
52 # only multiplication and squaring subroutines were linked
53 # for comparison, but not complete ladder step; gain on most
54 # processors is because this module refrains from shld, and
55 # minor regression on others is because this does result in
56 # higher instruction count;
57 # (**) compiler is free to inline functions, in assembly one would
58 # need to implement ladder step to do that, and it will improve
59 # performance by several percent;
60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
61 # C implementation, so that comparison is always against
62 # 2^51 radix;
63
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows is recognized either by assembler flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script (or in ../../perlasm)
# and pipe everything we print through it; it converts this AT&T-style
# source to the requested assembler dialect.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX support; the base 2^64 code path below
# is only emitted when $addx is true.  Minimum versions probed, in order:
# GNU as 2.23, nasm 2.10, MASM (ml64) 12, clang/LLVM 3.3.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
99
$code.=<<___;
.text

# void x25519_fe51_mul(uint64_t h[5], const uint64_t f[5],
#                      const uint64_t g[5]);
# Radix-2^51 field multiplication; falls through to the shared .Lreduce51
# carry-propagation tail, which also restores registers and returns.
.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul
253
# void x25519_fe51_sqr(uint64_t h[5], const uint64_t f[5]);
# Radix-2^51 field squaring; shares the .Lreduce51 tail below with
# x25519_fe51_mul and x25519_fe51_mul121666.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

# Shared tail: fold the 128-bit limbs h0..h4 back to radix 2^51 (the
# overflow above bit 51 of h4 is multiplied by 19), store the result at
# %rdi, then restore callee-saved registers and return.
.align	32
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
433
# void x25519_fe51_mul121666(uint64_t h[5], const uint64_t f[5]);
# Multiply a radix-2^51 element by the curve constant 121666; falls
# through to the shared .Lreduce51 tail for carry propagation and return.
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax		# reload the constant (mulq clobbered it)
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
if ($addx) {
# $acc0..$acc7 name %r8..%r15; they interpolate into the heredocs below.
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
486
$code.=<<___;
.extern	OPENSSL_ia32cap_P
# int x25519_fe64_eligible(void);
# Returns non-zero iff the CPU advertises both ADCX/ADOX (bit 19 of the
# extended-features word) and BMI2/MULX (bit 8), i.e. the bits tested by
# the 0x80100 mask against OPENSSL_ia32cap_P[2].
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

# void x25519_fe64_mul(uint64_t h[4], const uint64_t a[4],
#                      const uint64_t b[4]);
# Radix-2^64 multiplication using MULX with interleaved ADCX/ADOX carry
# chains; falls through to the shared .Lreduce64 tail (expects the
# constant 38 in %rdx and zero in %rdi on entry there).
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
594
# void x25519_fe64_sqr(uint64_t h[4], const uint64_t a[4]);
# Radix-2^64 squaring (MULX + ADCX/ADOX); shares the .Lreduce64 tail
# below with x25519_fe64_mul.
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64

# Shared tail: fold acc4..acc7 into acc0..acc3 via *38 (= 2*19), store
# to the offloaded dst, restore callee-saved registers and return.
# On entry: %rdx = 38, %rdi = 0, cf = of = 0.
.align	32
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
.cfi_adjust_cfa_offset	-72
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr
723
# void x25519_fe64_mul121666(uint64_t h[4], const uint64_t a[4]);
# Leaf function, no stack frame: multiply by 121666 and reduce mod
# 2*(2^255-19) (carry folded back via *38).
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

# void x25519_fe64_add(uint64_t h[4], const uint64_t a[4],
#                      const uint64_t b[4]);
# Leaf function: h = a + b mod 2*(2^255-19); the final-bit carry is
# folded back twice via the *38 trick.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add

# void x25519_fe64_sub(uint64_t h[4], const uint64_t a[4],
#                      const uint64_t b[4]);
# Leaf function: h = a - b mod 2*(2^255-19); borrow handled like the
# carry in x25519_fe64_add, with subtraction throughout.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub

# void x25519_fe64_tobytes(uint8_t out[32], const uint64_t a[4]);
# Leaf function: fully reduce modulo 2^255-19 and store little-endian.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# No ADCX/ADOX-capable assembler: x25519_fe64_eligible reports 0 so the
# fe64 entry points are never taken; they share a single ud2 stub that
# traps if called anyway.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
# Version banner embedded in the object file.
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
914
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# Unwind handler for the frameless leaf functions: the only state to
# recover is Rsp (nothing is pushed between body and epilogue labels).
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

# Unwind handler for the functions with a full prologue: when Rip is
# inside the body, recover the six callee-saved registers from the
# frame (HandlerData[2] is the distance from Rsp to the saved area).
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
# The fe64 SEH records exist only when the fe64 code path was emitted.
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
# HandlerData layout: [0]=body label, [1]=epilogue label, [2]=frame size
# (88 for the fe51 functions, 72 for fe64 mul/sqr; leaf functions use
# short_handler and carry no frame size).
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___	if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}
1130
# Expand `...` expressions embedded in the assembly text, then emit it
# through the xlate pipe opened above.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into x86_64-xlate.pl; buffered write and child-exit
# errors only surface at close, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";