#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter/gathering can be tuned without
# bn_exp.c modifications.
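#
# A rough sketch of the cache-neutral gather (illustration only, not
# part of the generated code): each 64-bit word of b[i] is fetched by
# touching one qword in every cache line of the 256-byte stride and
# masking out all but the one selected by the secret index, e.g.
#
#	$word  = 0;
#	$word |= $line[$_] & $mask[$_] for (0..3);	# all 4 lines touched
#
# so the data cache footprint is independent of the power being gathered.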

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=11);
}
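
# Summary of the probes above: the ADX path ($addx) is emitted only when
# the assembler understands adcx/adox, i.e. GNU as 2.23+, NASM 2.10+ or
# MSVC ml64 version 11+ (VS2012).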

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
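				# For example, with idx=3 the gathered
				# multiplicand is b[0]=$bp[3],
				# b[1]=$bp[35], b[2]=$bp[67], and so on
				# in steps of 2^5.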
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8;		# 5 is "window size"
$N=$STRIDE/4;		# should match cache line size
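
# How the gather index is decomposed below (a worked example): %r10 holds
# the 7th argument idx; %r11 = idx%8 becomes the qword offset within a
# cache line, while %r10 = ~(idx/8)&3 selects which of the four
# .Lmagic_masks quadruples is all-ones. E.g. idx=13 gives offset 5 and
# cache line 1, so only %xmm5 survives the pand/por network.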
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
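
	# The and/not/or sequence above is a constant-time select: %rax is
	# 0 or all-ones depending on the final borrow, so it computes
	# $ap = ($ap&%rax)|($rp&~%rax), i.e. ap = borrow ? tp : rp,
	# without a data-dependent branch.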
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul4x_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8;		# 5 is "window size"
$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]
	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	xor	$j,$j			# j=0
	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}
if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont_gather5,\@function,6
.align	32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	mov	%rsp,%r11		# put aside %rsp
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
	and	\$-128,%rsp
##############################################################
# Stack layout
# +0	num
# +8	off-loaded &b[i]
# +16	end of b[num]
# +24	saved n0
# +32	saved rp
# +40
# +48	inner counter
# +56	saved %rsp
# +64	tmp[num+1]
#
	mov	$num,0(%rsp)		# save $num
	shl	\$5,$num
	lea	256($bp,$num),%r10
	shr	\$5+5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	$num,48(%rsp)		# inner counter
	mov	%r11,56(%rsp)		# save original %rsp
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
my $STRIDE=2**5*8;	# 5 is "window size"
my $N=$STRIDE/4;	# should match cache line size
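
# A note on the ADX idiom used throughout the mulx path (illustration
# only): mulx leaves the flags untouched, adcx uses only CF and adox uses
# only OF, so two carry chains can be interleaved in one loop, e.g.
#
#	mulx	0*8($nptr),%rax,%r10	# flags preserved
#	adcx	%rax,%r11		# carry chain #1 via CF
#	adox	%r12,%r10		# carry chain #2 via OF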
$code.=<<___;
	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bptr),%xmm0
	movq	`1*$STRIDE/4-96`($bptr),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bptr),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bptr),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bptr),$bptr
	por	%xmm3,%xmm0

	movq	%xmm0,%rdx		# bp[0]
	movq	`0*$STRIDE/4-96`($bptr),%xmm0
	movq	`1*$STRIDE/4-96`($bptr),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bptr),%xmm2
	pand	%xmm5,%xmm1

	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi
	xor	$zero,$zero		# of=0,cf=0

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%r13	# ...
	adcx	%r14,%r12
	adcx	$zero,%r13

	movq	`3*$STRIDE/4-96`($bptr),%xmm3
	lea	$STRIDE($bptr),%r10	# next &b[i]
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	por	%xmm2,%xmm0
	por	%xmm3,%xmm0
	mov	%r10,8(%rsp)		# off-load &b[i]

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x66,0x66
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	.byte	0x3e
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	mov	%r11,-4*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	movq	%xmm0,%rdx		# bp[1]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	mov	64(%rsp),%r10
	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	xor	$zero,$zero		# cf=0, of=0
	mov	%rdx,$bi

	movq	`0*$STRIDE/4-96`($bptr),%xmm0
	movq	`1*$STRIDE/4-96`($bptr),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bptr),%xmm2
	pand	%xmm5,%xmm1

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[i]
	adox	%r10,$mi
	mov	1*8($tptr),%r10
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[i]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%r13	# ...
	adox	%r10,%r11
	adcx	%r14,%r12
	adox	$zero,%r12
	adcx	$zero,%r13

	movq	`3*$STRIDE/4-96`($bptr),%xmm3
	lea	$STRIDE($bptr),%r10	# next &b[i]
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	por	%xmm2,%xmm0
	por	%xmm3,%xmm0
	mov	%r10,8(%rsp)		# off-load &b[i]
	mov	2*8($tptr),%r10

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adox	%r10,%r12
	adcx	%rax,%r13
	adox	3*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	.byte	0x3e
	mov	%r10,-4*8($tptr)
	.byte	0x3e
	mov	0*8($tptr),%r10
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)
	lea	4*8($nptr),$nptr

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r10,%r14
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	mov	1*8($tptr),%r13
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r13,%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	.byte	0x48,0x8d,0x9b,0x20,0x00,0x00,0x00	# lea 4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0x79,0x00	# mulx 0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	mov	0*8($tptr),%r10
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	movq	%xmm0,%rdx		# bp[i+1]
	adc	$zero,%r15		# modulo-scheduled
	sub	%r10,$zero		# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jb	.Lmulx4x_outer

	neg	$num
	mov	32(%rsp),$rptr		# restore rp
	lea	64(%rsp),$tptr

	xor	%rdx,%rdx
	pxor	%xmm0,%xmm0
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	neg	%r8
	jmp	.Lmulx4x_sub_entry

.align	32
.Lmulx4x_sub:
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	not	%r8
.Lmulx4x_sub_entry:
	mov	2*8($nptr,$num),%r10
	not	%r9
	and	%r15,%r8
	mov	3*8($nptr,$num),%r11
	not	%r10
	and	%r15,%r9
	not	%r11
	and	%r15,%r10
	and	%r15,%r11

	neg	%rdx			# mov %rdx,%cf
	adc	0*8($tptr),%r8
	adc	1*8($tptr),%r9
	movdqa	%xmm0,($tptr)
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	movdqa	%xmm0,16($tptr)
	lea	4*8($tptr),$tptr
	sbb	%rdx,%rdx		# mov %cf,%rdx

	mov	%r8,0*8($rptr)
	mov	%r9,1*8($rptr)
	mov	%r10,2*8($rptr)
	mov	%r11,3*8($rptr)
	lea	4*8($rptr),$rptr

	add	\$32,$num
	jnz	.Lmulx4x_sub

	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmulx4x_epilogue:
	ret
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
___
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
___
$code.=<<___ if ($win64);
.LSEH_begin_bn_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	$idx,%r11
	shr	\$`log($N/8)/log(2)`,$idx
	and	\$`$N/8-1`,%r11
	not	$idx
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,$idx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	`0*$STRIDE/4-96`($tbl),%xmm0
	movq	`1*$STRIDE/4-96`($tbl),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($tbl),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($tbl),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($tbl),$tbl
	por	%xmm3,%xmm0

	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	lea	0x28(%rsp),%rsp
___
$code.=<<___;
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
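
# In Perl terms, the pair above implements (illustration only; the table
# holds 2^5 interlaced bignums, element j of power idx at $tbl[32*$j+$idx]):
#
#	sub scatter5 { my ($inp,$num,$tbl,$idx) = @_;
#		$tbl->[32*$_+$idx] = $inp->[$_] for (0..$num-1); }
#	sub gather5  { my ($out,$num,$tbl,$idx) = @_;
#		$out->[$_] = $tbl->[32*$_+$idx] for (0..$num-1); }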
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0,  0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	`40+48`(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	movaps	(%rax),%xmm0
	movaps	16(%rax),%xmm1
	lea	`40+48`(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0d,0x05,0x00
	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;