]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
361512da AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # August 2011. | |
18 | # | |
19 | # Companion to x86_64-mont.pl that optimizes cache-timing attack | |
20 | # countermeasures. The subroutines are produced by replacing bp[i] | |
21 | # references in their x86_64-mont.pl counterparts with cache-neutral | |
22 | # references to the powers table computed in BN_mod_exp_mont_consttime. | |
23 | # In addition, a subroutine that scatters elements of the powers table | |
24 | # is implemented, so that scatter/gather can be tuned without | |
25 | # bn_exp.c modifications. | |
26 | ||
ec9cc70f AP |
27 | # August 2013. |
28 | # | |
29 | # Add MULX/AD*X code paths and additional interfaces to optimize for | |
30 | # the branch prediction unit. For input lengths that are multiples of 8, | |
31 | # the np argument is not just the modulus value, but one interleaved | |
32 | # with 0. This is to optimize the post-condition... | |
33 | ||
1aa89a7a RL |
34 | # $output is the last argument if it looks like a file (it has an extension); it is popped off @ARGV |
35 | # $flavour is the first argument if it doesn't look like a file (contains no '.'); it is shifted off @ARGV | |
36 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
37 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
361512da AP |
38 | |
# $win64 is set for a nasm/masm/mingw64 flavour or for an output name ending
# in ".asm". Further down it selects the stack offset (56 vs 8) from which
# the 7th argument is loaded.
39 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
40 | ||
# Locate the perlasm translator: first in the directory this script was
# started from ($dir is taken from $0), then in ../../perlasm relative to
# it. Die if neither file exists.
41 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
42 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
43 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
44 | die "can't locate x86_64-xlate.pl"; | |
45 | ||
# Start the translator under the running perl ($^X) with the flavour and the
# output name as arguments, then alias STDOUT to the pipe so that whatever
# is printed to STDOUT is fed to the translator.
1aa89a7a RL |
46 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" |
47 | or die "can't call $xlate: $!"; | |
4568182a | 48 | *STDOUT=*OUT; |
361512da | 49 | |
# $addx gates the conditionally emitted code further down (the
# OPENSSL_ia32cap_P load and the "AD*X+BMI2+BMI1" dispatch). It is set when
# one of the probes below reports a new enough tool:
#   GNU assembler >= 2.23, NASM >= 2.10, ml64 >= 12, clang/LLVM >= 3.3.
# Each later probe is short-circuited by !$addx, so its external command is
# run only if no earlier probe succeeded.
# NOTE(review): $ENV{CC} is interpolated into the shell command as-is;
# presumably an unset CC just makes the match fail - confirm with the build
# system.
a5bb5bca AP |
50 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
51 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
30b9c234 | 52 | $addx = ($1>=2.23); |
a5bb5bca AP |
53 | } |
54 | ||
55 | if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
56 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
57 | $addx = ($1>=2.10); | |
58 | } | |
59 | ||
60 | if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
61 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
1b0fe79f | 62 | $addx = ($1>=12); |
a5bb5bca AP |
63 | } |
64 | ||
9bb3e5fd | 65 | if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { |
569e2d12 AP |
66 | my $ver = $2 + $3/100.0; # major + minor/100, so minors compare numerically: 3.1->3.01, 3.10->3.10 |
67 | $addx = ($ver>=3.03); | |
68 | } | |
69 | ||
361512da AP |
70 | # int bn_mul_mont_gather5( |
71 | $rp="%rdi"; # BN_ULONG *rp, |
72 | $ap="%rsi"; # const BN_ULONG *ap, |
73 | $bp="%rdx"; # const BN_ULONG *bp, |
74 | $np="%rcx"; # const BN_ULONG *np, |
75 | $n0="%r8"; # const BN_ULONG *n0, |
76 | $num="%r9"; # int num, |
77 | # int idx); # 0 to 2^5-1, "index" in $bp holding |
78 | # pre-computed powers of a', interlaced |
79 | # in such a manner that b[0] is $bp[idx], |
80 | # b[1] is $bp[2^5+idx], etc. |
# Register names used by the generic (non-4x) code path emitted below.
81 | $lo0="%r10"; | |
82 | $hi0="%r11"; | |
83 | $hi1="%r13"; | |
84 | $i="%r14"; | |
85 | $j="%r15"; | |
86 | $m0="%rbx"; | |
87 | $m1="%rbp"; | |
88 | ||
# Entry point. The low three bits of num are tested: if any is set (num is
# not a multiple of 8) control goes to the generic path at .Lmul_enter,
# otherwise it jumps to .Lmul4x_enter inside bn_mul4x_mont_gather5.
89 | $code=<<___; | |
90 | .text | |
91 | ||
a5bb5bca AP |
92 | .extern OPENSSL_ia32cap_P |
93 | ||
361512da AP |
94 | .globl bn_mul_mont_gather5 |
95 | .type bn_mul_mont_gather5,\@function,6 | |
96 | .align 64 | |
97 | bn_mul_mont_gather5: | |
76e624a0 | 98 | .cfi_startproc |
3ba1ef82 AP |
99 | mov ${num}d,${num}d |
100 | mov %rsp,%rax | |
76e624a0 | 101 | .cfi_def_cfa_register %rax |
ec9cc70f | 102 | test \$7,${num}d |
361512da | 103 | jnz .Lmul_enter |
a5bb5bca AP |
104 | ___ |
# Emitted only when $addx is set: preload the 32-bit word at byte offset 8
# of OPENSSL_ia32cap_P into %r11d; the code at .Lmul4x_enter masks and
# compares that register to choose its code path.
105 | $code.=<<___ if ($addx); | |
106 | mov OPENSSL_ia32cap_P+8(%rip),%r11d | |
107 | ___ | |
# Generic path (.Lmul_enter): load the 7th argument (idx) from the stack,
# save the callee-saved registers, carve the tp[] scratch area plus mask
# space off the stack while touching every page on the way down, and store
# the caller's %rsp at tp[num+1].
108 | $code.=<<___; | |
361512da AP |
109 | jmp .Lmul4x_enter |
110 | ||
111 | .align 16 | |
112 | .Lmul_enter: | |
317be638 | 113 | movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
361512da | 114 | push %rbx |
76e624a0 | 115 | .cfi_push %rbx |
361512da | 116 | push %rbp |
76e624a0 | 117 | .cfi_push %rbp |
361512da | 118 | push %r12 |
76e624a0 | 119 | .cfi_push %r12 |
361512da | 120 | push %r13 |
76e624a0 | 121 | .cfi_push %r13 |
361512da | 122 | push %r14 |
76e624a0 | 123 | .cfi_push %r14 |
361512da | 124 | push %r15 |
25b802bb | 125 | .cfi_push %r15 |
317be638 | 126 | |
3ba1ef82 AP |
127 | neg $num |
128 | mov %rsp,%r11 | |
129 | lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) | |
130 | neg $num # restore $num | |
131 | and \$-1024,%r10 # minimize TLB usage | |
361512da | 132 | |
0a86f668 EK |
133 | # An OS-agnostic version of __chkstk. |
134 | # | |
1bf80d93 | 135 | # Some OSes (Windows) insist on stack being "wired" to |
adc4f1fc AP |
136 | # physical memory in strictly sequential manner, i.e. if stack |
137 | # allocation spans two pages, then reference to farmost one can | |
138 | # be punishable by SEGV. But page walking can do good even on | |
139 | # other OSes, because it guarantees that villain thread hits | |
140 | # the guard page before it can make damage to innocent one... | |
3ba1ef82 AP |
141 | sub %r10,%r11 |
142 | and \$-4096,%r11 | |
143 | lea (%r10,%r11),%rsp | |
144 | mov (%rsp),%r11 | |
145 | cmp %r10,%rsp | |
146 | ja .Lmul_page_walk | |
147 | jmp .Lmul_page_walk_done | |
148 | ||
adc4f1fc | 149 | .Lmul_page_walk: |
3ba1ef82 AP |
150 | lea -4096(%rsp),%rsp |
151 | mov (%rsp),%r11 | |
152 | cmp %r10,%rsp | |
153 | ja .Lmul_page_walk | |
154 | .Lmul_page_walk_done: | |
155 | ||
156 | lea .Linc(%rip),%r10 | |
157 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | |
76e624a0 | 158 | .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 |
3ba1ef82 | 159 | .Lmul_body: |
adc4f1fc | 160 | |
8fc8f486 | 161 | lea 128($bp),%r12 # reassign $bp (+size optimization) |
361512da AP |
162 | ___ |
# From here on $bp names %r12, which the assembly above loaded with the
# incoming bp plus 128; table offsets below are therefore written as
# 16*k-128.
163 | $bp="%r12"; | |
164 | $STRIDE=2**5*8; # bytes from one b[] word to the next: 2^5 table entries of 8 bytes each (5 is "window size") |
165 | $N=$STRIDE/4; # should match cache line size | |
166 | $code.=<<___; | |
317be638 AP |
167 | movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
168 | movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 | |
169 | lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) | |
170 | and \$-16,%r10 | |
8fc8f486 | 171 | |
317be638 AP |
172 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
173 | movdqa %xmm1,%xmm4 | |
174 | movdqa %xmm1,%xmm2 | |
175 | ___ | |
176 | ######################################################################## | |
177 | # calculate mask by comparing 0..31 to index and save result to stack | |
178 | # | |
# $STRIDE/16 = 16 mask vectors of 16 bytes are stored in total. The loop
# below emits the stores for the first 12 ($k = 0, 4, 8); the unrolled tail
# after it emits the last four and leaves $k at 12.
179 | $code.=<<___; | |
180 | paddd %xmm0,%xmm1 | |
181 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 | |
182 | .byte 0x67 | |
183 | movdqa %xmm4,%xmm3 | |
184 | ___ | |
185 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | |
186 | $code.=<<___; | |
187 | paddd %xmm1,%xmm2 | |
188 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | |
189 | movdqa %xmm0,`16*($k+0)+112`(%r10) | |
190 | movdqa %xmm4,%xmm0 | |
361512da | 191 | |
317be638 AP |
192 | paddd %xmm2,%xmm3 |
193 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | |
194 | movdqa %xmm1,`16*($k+1)+112`(%r10) | |
195 | movdqa %xmm4,%xmm1 | |
196 | ||
197 | paddd %xmm3,%xmm0 | |
198 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | |
199 | movdqa %xmm2,`16*($k+2)+112`(%r10) | |
200 | movdqa %xmm4,%xmm2 | |
201 | ||
202 | paddd %xmm0,%xmm1 | |
203 | pcmpeqd %xmm5,%xmm0 | |
204 | movdqa %xmm3,`16*($k+3)+112`(%r10) | |
205 | movdqa %xmm4,%xmm3 | |
206 | ___ | |
207 | } | |
208 | $code.=<<___; # last iteration can be optimized | |
209 | paddd %xmm1,%xmm2 | |
210 | pcmpeqd %xmm5,%xmm1 | |
211 | movdqa %xmm0,`16*($k+0)+112`(%r10) | |
212 | ||
213 | paddd %xmm2,%xmm3 | |
214 | .byte 0x67 | |
215 | pcmpeqd %xmm5,%xmm2 | |
216 | movdqa %xmm1,`16*($k+1)+112`(%r10) | |
217 | ||
218 | pcmpeqd %xmm5,%xmm3 | |
219 | movdqa %xmm2,`16*($k+2)+112`(%r10) | |
220 | pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register | |
221 | ||
222 | pand `16*($k+1)-128`($bp),%xmm1 | |
223 | pand `16*($k+2)-128`($bp),%xmm2 | |
224 | movdqa %xmm3,`16*($k+3)+112`(%r10) | |
225 | pand `16*($k+3)-128`($bp),%xmm3 | |
226 | por %xmm2,%xmm0 | |
227 | por %xmm3,%xmm1 | |
228 | ___ | |
# Gather for b[0]: AND table vectors 0..11 with their stored masks and OR
# the results into %xmm0/%xmm1. Vectors 12..15 were already ANDed by the
# tail above, while their masks were still in registers.
229 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | |
230 | $code.=<<___; | |
231 | movdqa `16*($k+0)-128`($bp),%xmm4 | |
232 | movdqa `16*($k+1)-128`($bp),%xmm5 | |
233 | movdqa `16*($k+2)-128`($bp),%xmm2 | |
234 | pand `16*($k+0)+112`(%r10),%xmm4 | |
235 | movdqa `16*($k+3)-128`($bp),%xmm3 | |
236 | pand `16*($k+1)+112`(%r10),%xmm5 | |
237 | por %xmm4,%xmm0 | |
238 | pand `16*($k+2)+112`(%r10),%xmm2 | |
239 | por %xmm5,%xmm1 | |
240 | pand `16*($k+3)+112`(%r10),%xmm3 | |
241 | por %xmm2,%xmm0 | |
242 | por %xmm3,%xmm1 | |
243 | ___ | |
244 | } | |
# Fold the two accumulators and both 64-bit halves into m0=bp[0], advance
# $bp by one stride, then run the first pass over ap[]/np[] (.L1st) and fall
# into the outer loop.
245 | $code.=<<___; | |
246 | por %xmm1,%xmm0 | |
247 | pshufd \$0x4e,%xmm0,%xmm1 | |
248 | por %xmm1,%xmm0 | |
249 | lea $STRIDE($bp),$bp | |
361512da AP |
250 | movq %xmm0,$m0 # m0=bp[0] |
251 | ||
252 | mov ($n0),$n0 # pull n0[0] value | |
253 | mov ($ap),%rax | |
254 | ||
255 | xor $i,$i # i=0 | |
256 | xor $j,$j # j=0 | |
257 | ||
361512da AP |
258 | mov $n0,$m1 |
259 | mulq $m0 # ap[0]*bp[0] | |
260 | mov %rax,$lo0 | |
261 | mov ($np),%rax | |
262 | ||
361512da AP |
263 | imulq $lo0,$m1 # "tp[0]"*n0 |
264 | mov %rdx,$hi0 | |
265 | ||
361512da AP |
266 | mulq $m1 # np[0]*m1 |
267 | add %rax,$lo0 # discarded | |
268 | mov 8($ap),%rax | |
269 | adc \$0,%rdx | |
270 | mov %rdx,$hi1 | |
271 | ||
272 | lea 1($j),$j # j++ | |
273 | jmp .L1st_enter | |
274 | ||
275 | .align 16 | |
276 | .L1st: | |
277 | add %rax,$hi1 | |
278 | mov ($ap,$j,8),%rax | |
279 | adc \$0,%rdx | |
280 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | |
281 | mov $lo0,$hi0 | |
282 | adc \$0,%rdx | |
283 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | |
284 | mov %rdx,$hi1 | |
285 | ||
286 | .L1st_enter: | |
287 | mulq $m0 # ap[j]*bp[0] | |
288 | add %rax,$hi0 | |
289 | mov ($np,$j,8),%rax | |
290 | adc \$0,%rdx | |
291 | lea 1($j),$j # j++ | |
292 | mov %rdx,$lo0 | |
293 | ||
294 | mulq $m1 # np[j]*m1 | |
295 | cmp $num,$j | |
317be638 AP |
296 | jne .L1st # note that upon exit $j==$num, so |
297 | # they can be used interchangeably | |
361512da AP |
298 | |
299 | add %rax,$hi1 | |
361512da AP |
300 | adc \$0,%rdx |
301 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | |
302 | adc \$0,%rdx | |
317be638 | 303 | mov $hi1,-16(%rsp,$num,8) # tp[num-1] |
361512da AP |
304 | mov %rdx,$hi1 |
305 | mov $lo0,$hi0 | |
306 | ||
307 | xor %rdx,%rdx | |
308 | add $hi0,$hi1 | |
309 | adc \$0,%rdx | |
310 | mov $hi1,-8(%rsp,$num,8) | |
311 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | |
312 | ||
313 | lea 1($i),$i # i++ | |
314 | jmp .Louter | |
315 | .align 16 | |
316 | .Louter: | |
317be638 AP |
317 | lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) |
318 | and \$-16,%rdx | |
319 | pxor %xmm4,%xmm4 | |
320 | pxor %xmm5,%xmm5 | |
321 | ___ | |
# Outer-loop gather for bp[i]: all 16 table vectors this time, each ANDed
# with its mask read back from the stack through %rdx and ORed into
# %xmm4/%xmm5.
322 | for($k=0;$k<$STRIDE/16;$k+=4) { | |
323 | $code.=<<___; | |
324 | movdqa `16*($k+0)-128`($bp),%xmm0 | |
325 | movdqa `16*($k+1)-128`($bp),%xmm1 | |
326 | movdqa `16*($k+2)-128`($bp),%xmm2 | |
327 | movdqa `16*($k+3)-128`($bp),%xmm3 | |
328 | pand `16*($k+0)-128`(%rdx),%xmm0 | |
329 | pand `16*($k+1)-128`(%rdx),%xmm1 | |
330 | por %xmm0,%xmm4 | |
331 | pand `16*($k+2)-128`(%rdx),%xmm2 | |
332 | por %xmm1,%xmm5 | |
333 | pand `16*($k+3)-128`(%rdx),%xmm3 | |
334 | por %xmm2,%xmm4 | |
335 | por %xmm3,%xmm5 | |
336 | ___ | |
337 | } | |
# Rest of .Louter (inner multiply-accumulate loop over tp[]), followed by
# the final subtraction of np[] into rp[] (.Lsub), the masked selection
# between rp[] and tp[] that also overwrites tp[] (.Lcopy), and the epilogue
# that restores the saved registers and returns 1 in %rax.
338 | $code.=<<___; | |
339 | por %xmm5,%xmm4 | |
340 | pshufd \$0x4e,%xmm4,%xmm0 | |
341 | por %xmm4,%xmm0 | |
8fc8f486 | 342 | lea $STRIDE($bp),$bp |
8fc8f486 AP |
343 | |
344 | mov ($ap),%rax # ap[0] | |
345 | movq %xmm0,$m0 # m0=bp[i] | |
346 | ||
361512da AP |
347 | xor $j,$j # j=0 |
348 | mov $n0,$m1 | |
349 | mov (%rsp),$lo0 | |
350 | ||
361512da AP |
351 | mulq $m0 # ap[0]*bp[i] |
352 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | |
353 | mov ($np),%rax | |
354 | adc \$0,%rdx | |
355 | ||
361512da AP |
356 | imulq $lo0,$m1 # tp[0]*n0 |
357 | mov %rdx,$hi0 | |
358 | ||
361512da AP |
359 | mulq $m1 # np[0]*m1 |
360 | add %rax,$lo0 # discarded | |
361 | mov 8($ap),%rax | |
362 | adc \$0,%rdx | |
363 | mov 8(%rsp),$lo0 # tp[1] | |
364 | mov %rdx,$hi1 | |
365 | ||
366 | lea 1($j),$j # j++ | |
367 | jmp .Linner_enter | |
368 | ||
369 | .align 16 | |
370 | .Linner: | |
371 | add %rax,$hi1 | |
372 | mov ($ap,$j,8),%rax | |
373 | adc \$0,%rdx | |
374 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | |
375 | mov (%rsp,$j,8),$lo0 | |
376 | adc \$0,%rdx | |
377 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | |
378 | mov %rdx,$hi1 | |
379 | ||
380 | .Linner_enter: | |
381 | mulq $m0 # ap[j]*bp[i] | |
382 | add %rax,$hi0 | |
383 | mov ($np,$j,8),%rax | |
384 | adc \$0,%rdx | |
385 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | |
386 | mov %rdx,$hi0 | |
387 | adc \$0,$hi0 | |
388 | lea 1($j),$j # j++ | |
389 | ||
390 | mulq $m1 # np[j]*m1 | |
391 | cmp $num,$j | |
317be638 AP |
392 | jne .Linner # note that upon exit $j==$num, so |
393 | # they can be used interchangeably | |
361512da | 394 | add %rax,$hi1 |
361512da AP |
395 | adc \$0,%rdx |
396 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | |
8fc8f486 | 397 | mov (%rsp,$num,8),$lo0 |
361512da | 398 | adc \$0,%rdx |
317be638 | 399 | mov $hi1,-16(%rsp,$num,8) # tp[num-1] |
361512da AP |
400 | mov %rdx,$hi1 |
401 | ||
402 | xor %rdx,%rdx | |
403 | add $hi0,$hi1 | |
404 | adc \$0,%rdx | |
405 | add $lo0,$hi1 # pull upmost overflow bit | |
406 | adc \$0,%rdx | |
407 | mov $hi1,-8(%rsp,$num,8) | |
408 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | |
409 | ||
410 | lea 1($i),$i # i++ | |
411 | cmp $num,$i | |
ec9cc70f | 412 | jb .Louter |
361512da AP |
413 | |
414 | xor $i,$i # i=0 and clear CF! | |
415 | mov (%rsp),%rax # tp[0] | |
416 | lea (%rsp),$ap # borrow ap for tp | |
417 | mov $num,$j # j=num | |
418 | jmp .Lsub | |
419 | .align 16 | |
420 | .Lsub: sbb ($np,$i,8),%rax | |
421 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | |
422 | mov 8($ap,$i,8),%rax # tp[i+1] | |
423 | lea 1($i),$i # i++ | |
46f4e1be | 424 | dec $j # doesn't affect CF! |
361512da AP |
425 | jnz .Lsub |
426 | ||
427 | sbb \$0,%rax # handle upmost overflow bit | |
774ff8fe AP |
428 | mov \$-1,%rbx |
429 | xor %rax,%rbx | |
361512da | 430 | xor $i,$i |
361512da | 431 | mov $num,$j # j=num |
774ff8fe AP |
432 | |
433 | .Lcopy: # conditional copy | |
434 | mov ($rp,$i,8),%rcx | |
435 | mov (%rsp,$i,8),%rdx | |
436 | and %rbx,%rcx | |
437 | and %rax,%rdx | |
361512da | 438 | mov $i,(%rsp,$i,8) # zap temporary vector |
774ff8fe AP |
439 | or %rcx,%rdx |
440 | mov %rdx,($rp,$i,8) # rp[i]=tp[i] | |
361512da AP |
441 | lea 1($i),$i |
442 | sub \$1,$j | |
443 | jnz .Lcopy | |
444 | ||
445 | mov 8(%rsp,$num,8),%rsi # restore %rsp | |
76e624a0 | 446 | .cfi_def_cfa %rsi,8 |
361512da | 447 | mov \$1,%rax |
317be638 | 448 | |
ec9cc70f | 449 | mov -48(%rsi),%r15 |
76e624a0 | 450 | .cfi_restore %r15 |
ec9cc70f | 451 | mov -40(%rsi),%r14 |
3f55ec67 | 452 | .cfi_restore %r14 |
ec9cc70f | 453 | mov -32(%rsi),%r13 |
3f55ec67 | 454 | .cfi_restore %r13 |
ec9cc70f | 455 | mov -24(%rsi),%r12 |
3f55ec67 | 456 | .cfi_restore %r12 |
ec9cc70f | 457 | mov -16(%rsi),%rbp |
3f55ec67 | 458 | .cfi_restore %rbp |
ec9cc70f | 459 | mov -8(%rsi),%rbx |
3f55ec67 | 460 | .cfi_restore %rbx |
ec9cc70f | 461 | lea (%rsi),%rsp |
76e624a0 | 462 | .cfi_def_cfa_register %rsp |
361512da AP |
463 | .Lmul_epilogue: |
464 | ret | |
76e624a0 | 465 | .cfi_endproc |
361512da AP |
466 | .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 |
467 | ___ | |
468 | {{{ | |
# @A and @N are two-register sets used by the 4x code: @A receives the
# ap[j]*bp[i] partial products and @N the np[j]*m1 ones. $N[1] is %rdi,
# i.e. the same register as $rp; the 4x code stores $rp to the stack
# ("save $rp") early in mul4x_internal.
469 | my @A=("%r10","%r11"); | |
470 | my @N=("%r13","%rdi"); | |
# bn_mul4x_mont_gather5: no .globl directive is emitted for it here;
# bn_mul_mont_gather5 reaches this code by jumping to .Lmul4x_enter.
471 | $code.=<<___; | |
472 | .type bn_mul4x_mont_gather5,\@function,6 | |
ec9cc70f | 473 | .align 32 |
361512da | 474 | bn_mul4x_mont_gather5: |
76e624a0 | 475 | .cfi_startproc |
3ba1ef82 AP |
476 | .byte 0x67 |
477 | mov %rsp,%rax | |
76e624a0 | 478 | .cfi_def_cfa_register %rax |
361512da | 479 | .Lmul4x_enter: |
a5bb5bca AP |
480 | ___ |
# Emitted only when $addx is set: %r11d (loaded from OPENSSL_ia32cap_P+8
# at the bn_mul_mont_gather5 entry) is masked with 0x80108 and, if every
# one of those bits is set, control goes to .Lmulx4x_enter.
# NOTE(review): .Lmulx4x_enter is not defined in this part of the file -
# presumably it belongs to the MULX/AD*X variant further down; confirm.
481 | $code.=<<___ if ($addx); | |
8fc8f486 AP |
482 | and \$0x80108,%r11d |
483 | cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 | |
a5bb5bca AP |
484 | je .Lmulx4x_enter |
485 | ___ | |
# Common 4x prologue and epilogue: save the callee-saved registers, convert
# num to bytes, position the frame relative to rp as explained in the
# assembly comment below, walk the stack pages, store the caller's %rsp at
# 40(%rsp), call mul4x_internal, then restore the registers and return 1 in
# %rax. This heredoc continues past the .size directive into
# mul4x_internal.
486 | $code.=<<___; | |
361512da | 487 | push %rbx |
76e624a0 | 488 | .cfi_push %rbx |
361512da | 489 | push %rbp |
76e624a0 | 490 | .cfi_push %rbp |
361512da | 491 | push %r12 |
76e624a0 | 492 | .cfi_push %r12 |
361512da | 493 | push %r13 |
76e624a0 | 494 | .cfi_push %r13 |
361512da | 495 | push %r14 |
76e624a0 | 496 | .cfi_push %r14 |
361512da | 497 | push %r15 |
76e624a0 | 498 | .cfi_push %r15 |
3ba1ef82 | 499 | .Lmul4x_prologue: |
8fc8f486 | 500 | |
ec9cc70f | 501 | .byte 0x67 |
8fc8f486 AP |
502 | shl \$3,${num}d # convert $num to bytes |
503 | lea ($num,$num,2),%r10 # 3*$num in bytes | |
ec9cc70f | 504 | neg $num # -$num |
361512da | 505 | |
ec9cc70f | 506 | ############################################################## |
8fc8f486 AP |
507 | # Ensure that stack frame doesn't alias with $rptr+3*$num |
508 | # modulo 4096, which covers ret[num], am[num] and n[num] | |
509 | # (see bn_exp.c). This is done to allow memory disambiguation | |
510 | # logic do its magic. [Extra [num] is allocated in order | |
511 | # to align with bn_power5's frame, which is cleansed after | |
512 | # completing exponentiation. Extra 256 bytes is for power mask | |
513 | # calculated from 7th argument, the index.] | |
ec9cc70f | 514 | # |
8fc8f486 | 515 | lea -320(%rsp,$num,2),%r11 |
3ba1ef82 | 516 | mov %rsp,%rbp |
8fc8f486 | 517 | sub $rp,%r11 |
ec9cc70f AP |
518 | and \$4095,%r11 |
519 | cmp %r11,%r10 | |
520 | jb .Lmul4xsp_alt | |
3ba1ef82 AP |
521 | sub %r11,%rbp # align with $rp |
522 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) | |
ec9cc70f AP |
523 | jmp .Lmul4xsp_done |
524 | ||
525 | .align 32 | |
526 | .Lmul4xsp_alt: | |
8fc8f486 | 527 | lea 4096-320(,$num,2),%r10 |
3ba1ef82 | 528 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) |
ec9cc70f AP |
529 | sub %r10,%r11 |
530 | mov \$0,%r10 | |
531 | cmovc %r10,%r11 | |
3ba1ef82 | 532 | sub %r11,%rbp |
ec9cc70f | 533 | .Lmul4xsp_done: |
3ba1ef82 AP |
534 | and \$-64,%rbp |
535 | mov %rsp,%r11 | |
536 | sub %rbp,%r11 | |
adc4f1fc | 537 | and \$-4096,%r11 |
3ba1ef82 AP |
538 | lea (%rbp,%r11),%rsp |
539 | mov (%rsp),%r10 | |
540 | cmp %rbp,%rsp | |
541 | ja .Lmul4x_page_walk | |
542 | jmp .Lmul4x_page_walk_done | |
543 | ||
adc4f1fc | 544 | .Lmul4x_page_walk: |
3ba1ef82 AP |
545 | lea -4096(%rsp),%rsp |
546 | mov (%rsp),%r10 | |
547 | cmp %rbp,%rsp | |
548 | ja .Lmul4x_page_walk | |
549 | .Lmul4x_page_walk_done: | |
adc4f1fc | 550 | |
ec9cc70f AP |
551 | neg $num |
552 | ||
553 | mov %rax,40(%rsp) | |
76e624a0 | 554 | .cfi_cfa_expression %rsp+40,deref,+8 |
361512da | 555 | .Lmul4x_body: |
ec9cc70f AP |
556 | |
557 | call mul4x_internal | |
558 | ||
559 | mov 40(%rsp),%rsi # restore %rsp | |
76e624a0 | 560 | .cfi_def_cfa %rsi,8 |
ec9cc70f | 561 | mov \$1,%rax |
8fc8f486 | 562 | |
ec9cc70f | 563 | mov -48(%rsi),%r15 |
76e624a0 | 564 | .cfi_restore %r15 |
ec9cc70f | 565 | mov -40(%rsi),%r14 |
76e624a0 | 566 | .cfi_restore %r14 |
ec9cc70f | 567 | mov -32(%rsi),%r13 |
76e624a0 | 568 | .cfi_restore %r13 |
ec9cc70f | 569 | mov -24(%rsi),%r12 |
76e624a0 | 570 | .cfi_restore %r12 |
ec9cc70f | 571 | mov -16(%rsi),%rbp |
76e624a0 | 572 | .cfi_restore %rbp |
ec9cc70f | 573 | mov -8(%rsi),%rbx |
76e624a0 | 574 | .cfi_restore %rbx |
ec9cc70f | 575 | lea (%rsi),%rsp |
76e624a0 | 576 | .cfi_def_cfa_register %rsp |
ec9cc70f AP |
577 | .Lmul4x_epilogue: |
578 | ret | |
76e624a0 | 579 | .cfi_endproc |
ec9cc70f AP |
580 | .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 |
581 | ||
582 | .type mul4x_internal,\@abi-omnipotent | |
583 | .align 32 | |
584 | mul4x_internal: | |
0190c52a | 585 | .cfi_startproc |
8fc8f486 AP |
586 | shl \$5,$num # $num was in bytes |
587 | movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index | |
588 | lea .Linc(%rip),%rax | |
589 | lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) | |
ec9cc70f | 590 | shr \$5,$num # restore $num |
361512da AP |
591 | ___ |
592 | $bp="%r12"; | |
593 | $STRIDE=2**5*8; # 5 is "window size" | |
594 | $N=$STRIDE/4; # should match cache line size | |
ec9cc70f | 595 | $tp=$i; |
361512da | 596 | $code.=<<___; |
8fc8f486 AP |
597 | movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 |
598 | movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 | |
599 | lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) | |
600 | lea 128(%rdx),$bp # size optimization | |
601 | ||
602 | pshufd \$0,%xmm5,%xmm5 # broadcast index | |
603 | movdqa %xmm1,%xmm4 | |
604 | .byte 0x67,0x67 | |
605 | movdqa %xmm1,%xmm2 | |
606 | ___ | |
607 | ######################################################################## | |
608 | # calculate mask by comparing 0..31 to index and save result to stack | |
609 | # | |
610 | $code.=<<___; | |
611 | paddd %xmm0,%xmm1 | |
612 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 | |
ec9cc70f | 613 | .byte 0x67 |
8fc8f486 AP |
614 | movdqa %xmm4,%xmm3 |
615 | ___ | |
616 | for($i=0;$i<$STRIDE/16-4;$i+=4) { | |
617 | $code.=<<___; | |
618 | paddd %xmm1,%xmm2 | |
619 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | |
620 | movdqa %xmm0,`16*($i+0)+112`(%r10) | |
621 | movdqa %xmm4,%xmm0 | |
622 | ||
623 | paddd %xmm2,%xmm3 | |
624 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | |
625 | movdqa %xmm1,`16*($i+1)+112`(%r10) | |
626 | movdqa %xmm4,%xmm1 | |
627 | ||
628 | paddd %xmm3,%xmm0 | |
629 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | |
630 | movdqa %xmm2,`16*($i+2)+112`(%r10) | |
631 | movdqa %xmm4,%xmm2 | |
632 | ||
633 | paddd %xmm0,%xmm1 | |
634 | pcmpeqd %xmm5,%xmm0 | |
635 | movdqa %xmm3,`16*($i+3)+112`(%r10) | |
636 | movdqa %xmm4,%xmm3 | |
637 | ___ | |
638 | } | |
639 | $code.=<<___; # last iteration can be optimized | |
640 | paddd %xmm1,%xmm2 | |
641 | pcmpeqd %xmm5,%xmm1 | |
642 | movdqa %xmm0,`16*($i+0)+112`(%r10) | |
643 | ||
644 | paddd %xmm2,%xmm3 | |
ec9cc70f | 645 | .byte 0x67 |
8fc8f486 AP |
646 | pcmpeqd %xmm5,%xmm2 |
647 | movdqa %xmm1,`16*($i+1)+112`(%r10) | |
361512da | 648 | |
8fc8f486 AP |
649 | pcmpeqd %xmm5,%xmm3 |
650 | movdqa %xmm2,`16*($i+2)+112`(%r10) | |
651 | pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register | |
652 | ||
653 | pand `16*($i+1)-128`($bp),%xmm1 | |
654 | pand `16*($i+2)-128`($bp),%xmm2 | |
655 | movdqa %xmm3,`16*($i+3)+112`(%r10) | |
656 | pand `16*($i+3)-128`($bp),%xmm3 | |
657 | por %xmm2,%xmm0 | |
658 | por %xmm3,%xmm1 | |
659 | ___ | |
660 | for($i=0;$i<$STRIDE/16-4;$i+=4) { | |
661 | $code.=<<___; | |
662 | movdqa `16*($i+0)-128`($bp),%xmm4 | |
663 | movdqa `16*($i+1)-128`($bp),%xmm5 | |
664 | movdqa `16*($i+2)-128`($bp),%xmm2 | |
665 | pand `16*($i+0)+112`(%r10),%xmm4 | |
666 | movdqa `16*($i+3)-128`($bp),%xmm3 | |
667 | pand `16*($i+1)+112`(%r10),%xmm5 | |
668 | por %xmm4,%xmm0 | |
669 | pand `16*($i+2)+112`(%r10),%xmm2 | |
670 | por %xmm5,%xmm1 | |
671 | pand `16*($i+3)+112`(%r10),%xmm3 | |
672 | por %xmm2,%xmm0 | |
673 | por %xmm3,%xmm1 | |
674 | ___ | |
675 | } | |
676 | $code.=<<___; | |
677 | por %xmm1,%xmm0 | |
678 | pshufd \$0x4e,%xmm0,%xmm1 | |
679 | por %xmm1,%xmm0 | |
680 | lea $STRIDE($bp),$bp | |
361512da | 681 | movq %xmm0,$m0 # m0=bp[0] |
8fc8f486 | 682 | |
ec9cc70f AP |
683 | mov %r13,16+8(%rsp) # save end of b[num] |
684 | mov $rp, 56+8(%rsp) # save $rp | |
685 | ||
361512da AP |
686 | mov ($n0),$n0 # pull n0[0] value |
687 | mov ($ap),%rax | |
ec9cc70f AP |
688 | lea ($ap,$num),$ap # end of a[num] |
689 | neg $num | |
361512da AP |
690 | |
691 | mov $n0,$m1 | |
692 | mulq $m0 # ap[0]*bp[0] | |
693 | mov %rax,$A[0] | |
694 | mov ($np),%rax | |
695 | ||
361512da | 696 | imulq $A[0],$m1 # "tp[0]"*n0 |
8fc8f486 | 697 | lea 64+8(%rsp),$tp |
361512da AP |
698 | mov %rdx,$A[1] |
699 | ||
361512da AP |
700 | mulq $m1 # np[0]*m1 |
701 | add %rax,$A[0] # discarded | |
ec9cc70f | 702 | mov 8($ap,$num),%rax |
361512da AP |
703 | adc \$0,%rdx |
704 | mov %rdx,$N[1] | |
705 | ||
706 | mulq $m0 | |
707 | add %rax,$A[1] | |
8fc8f486 | 708 | mov 8*1($np),%rax |
361512da AP |
709 | adc \$0,%rdx |
710 | mov %rdx,$A[0] | |
711 | ||
712 | mulq $m1 | |
713 | add %rax,$N[1] | |
ec9cc70f | 714 | mov 16($ap,$num),%rax |
361512da AP |
715 | adc \$0,%rdx |
716 | add $A[1],$N[1] | |
ec9cc70f | 717 | lea 4*8($num),$j # j=4 |
8fc8f486 | 718 | lea 8*4($np),$np |
361512da | 719 | adc \$0,%rdx |
ec9cc70f | 720 | mov $N[1],($tp) |
361512da AP |
721 | mov %rdx,$N[0] |
722 | jmp .L1st4x | |
ec9cc70f AP |
723 | |
724 | .align 32 | |
361512da AP |
725 | .L1st4x: |
726 | mulq $m0 # ap[j]*bp[0] | |
727 | add %rax,$A[0] | |
8fc8f486 | 728 | mov -8*2($np),%rax |
ec9cc70f | 729 | lea 32($tp),$tp |
361512da AP |
730 | adc \$0,%rdx |
731 | mov %rdx,$A[1] | |
732 | ||
733 | mulq $m1 # np[j]*m1 | |
734 | add %rax,$N[0] | |
ec9cc70f | 735 | mov -8($ap,$j),%rax |
361512da AP |
736 | adc \$0,%rdx |
737 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | |
738 | adc \$0,%rdx | |
ec9cc70f | 739 | mov $N[0],-24($tp) # tp[j-1] |
361512da AP |
740 | mov %rdx,$N[1] |
741 | ||
742 | mulq $m0 # ap[j]*bp[0] | |
743 | add %rax,$A[1] | |
8fc8f486 | 744 | mov -8*1($np),%rax |
361512da AP |
745 | adc \$0,%rdx |
746 | mov %rdx,$A[0] | |
747 | ||
748 | mulq $m1 # np[j]*m1 | |
749 | add %rax,$N[1] | |
ec9cc70f | 750 | mov ($ap,$j),%rax |
361512da AP |
751 | adc \$0,%rdx |
752 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | |
753 | adc \$0,%rdx | |
ec9cc70f | 754 | mov $N[1],-16($tp) # tp[j-1] |
361512da AP |
755 | mov %rdx,$N[0] |
756 | ||
757 | mulq $m0 # ap[j]*bp[0] | |
758 | add %rax,$A[0] | |
8fc8f486 | 759 | mov 8*0($np),%rax |
361512da AP |
760 | adc \$0,%rdx |
761 | mov %rdx,$A[1] | |
762 | ||
763 | mulq $m1 # np[j]*m1 | |
764 | add %rax,$N[0] | |
ec9cc70f | 765 | mov 8($ap,$j),%rax |
361512da AP |
766 | adc \$0,%rdx |
767 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | |
768 | adc \$0,%rdx | |
ec9cc70f | 769 | mov $N[0],-8($tp) # tp[j-1] |
361512da AP |
770 | mov %rdx,$N[1] |
771 | ||
772 | mulq $m0 # ap[j]*bp[0] | |
773 | add %rax,$A[1] | |
8fc8f486 | 774 | mov 8*1($np),%rax |
361512da | 775 | adc \$0,%rdx |
361512da AP |
776 | mov %rdx,$A[0] |
777 | ||
778 | mulq $m1 # np[j]*m1 | |
779 | add %rax,$N[1] | |
ec9cc70f | 780 | mov 16($ap,$j),%rax |
361512da AP |
781 | adc \$0,%rdx |
782 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | |
8fc8f486 | 783 | lea 8*4($np),$np |
361512da | 784 | adc \$0,%rdx |
ec9cc70f | 785 | mov $N[1],($tp) # tp[j-1] |
361512da | 786 | mov %rdx,$N[0] |
ec9cc70f AP |
787 | |
788 | add \$32,$j # j+=4 | |
789 | jnz .L1st4x | |
361512da AP |
790 | |
791 | mulq $m0 # ap[j]*bp[0] | |
792 | add %rax,$A[0] | |
8fc8f486 | 793 | mov -8*2($np),%rax |
ec9cc70f | 794 | lea 32($tp),$tp |
361512da AP |
795 | adc \$0,%rdx |
796 | mov %rdx,$A[1] | |
797 | ||
798 | mulq $m1 # np[j]*m1 | |
799 | add %rax,$N[0] | |
ec9cc70f | 800 | mov -8($ap),%rax |
361512da AP |
801 | adc \$0,%rdx |
802 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | |
803 | adc \$0,%rdx | |
ec9cc70f | 804 | mov $N[0],-24($tp) # tp[j-1] |
361512da AP |
805 | mov %rdx,$N[1] |
806 | ||
807 | mulq $m0 # ap[j]*bp[0] | |
808 | add %rax,$A[1] | |
8fc8f486 | 809 | mov -8*1($np),%rax |
361512da AP |
810 | adc \$0,%rdx |
811 | mov %rdx,$A[0] | |
812 | ||
813 | mulq $m1 # np[j]*m1 | |
814 | add %rax,$N[1] | |
ec9cc70f | 815 | mov ($ap,$num),%rax # ap[0] |
361512da AP |
816 | adc \$0,%rdx |
817 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | |
818 | adc \$0,%rdx | |
ec9cc70f | 819 | mov $N[1],-16($tp) # tp[j-1] |
361512da AP |
820 | mov %rdx,$N[0] |
821 | ||
8fc8f486 | 822 | lea ($np,$num),$np # rewind $np |
361512da AP |
823 | |
824 | xor $N[1],$N[1] | |
825 | add $A[0],$N[0] | |
826 | adc \$0,$N[1] | |
ec9cc70f | 827 | mov $N[0],-8($tp) |
361512da | 828 | |
ec9cc70f | 829 | jmp .Louter4x |
361512da | 830 | |
ec9cc70f AP |
831 | .align 32 |
832 | .Louter4x: | |
8fc8f486 AP |
833 | lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) |
834 | pxor %xmm4,%xmm4 | |
835 | pxor %xmm5,%xmm5 | |
836 | ___ | |
837 | for($i=0;$i<$STRIDE/16;$i+=4) { | |
838 | $code.=<<___; | |
839 | movdqa `16*($i+0)-128`($bp),%xmm0 | |
840 | movdqa `16*($i+1)-128`($bp),%xmm1 | |
841 | movdqa `16*($i+2)-128`($bp),%xmm2 | |
842 | movdqa `16*($i+3)-128`($bp),%xmm3 | |
843 | pand `16*($i+0)-128`(%rdx),%xmm0 | |
844 | pand `16*($i+1)-128`(%rdx),%xmm1 | |
845 | por %xmm0,%xmm4 | |
846 | pand `16*($i+2)-128`(%rdx),%xmm2 | |
847 | por %xmm1,%xmm5 | |
848 | pand `16*($i+3)-128`(%rdx),%xmm3 | |
849 | por %xmm2,%xmm4 | |
850 | por %xmm3,%xmm5 | |
851 | ___ | |
852 | } | |
853 | $code.=<<___; | |
854 | por %xmm5,%xmm4 | |
855 | pshufd \$0x4e,%xmm4,%xmm0 | |
856 | por %xmm4,%xmm0 | |
857 | lea $STRIDE($bp),$bp | |
858 | movq %xmm0,$m0 # m0=bp[i] | |
859 | ||
ec9cc70f | 860 | mov ($tp,$num),$A[0] |
361512da AP |
861 | mov $n0,$m1 |
862 | mulq $m0 # ap[0]*bp[i] | |
863 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | |
864 | mov ($np),%rax | |
865 | adc \$0,%rdx | |
866 | ||
361512da AP |
867 | imulq $A[0],$m1 # tp[0]*n0 |
868 | mov %rdx,$A[1] | |
ec9cc70f | 869 | mov $N[1],($tp) # store upmost overflow bit |
361512da | 870 | |
ec9cc70f | 871 | lea ($tp,$num),$tp # rewind $tp |
361512da AP |
872 | |
873 | mulq $m1 # np[0]*m1 | |
874 | add %rax,$A[0] # "$N[0]", discarded | |
ec9cc70f | 875 | mov 8($ap,$num),%rax |
361512da AP |
876 | adc \$0,%rdx |
877 | mov %rdx,$N[1] | |
878 | ||
879 | mulq $m0 # ap[j]*bp[i] | |
880 | add %rax,$A[1] | |
8fc8f486 | 881 | mov 8*1($np),%rax |
361512da | 882 | adc \$0,%rdx |
ec9cc70f | 883 | add 8($tp),$A[1] # +tp[1] |
361512da AP |
884 | adc \$0,%rdx |
885 | mov %rdx,$A[0] | |
886 | ||
887 | mulq $m1 # np[j]*m1 | |
888 | add %rax,$N[1] | |
ec9cc70f | 889 | mov 16($ap,$num),%rax |
361512da AP |
890 | adc \$0,%rdx |
891 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | |
ec9cc70f | 892 | lea 4*8($num),$j # j=4 |
8fc8f486 | 893 | lea 8*4($np),$np |
361512da | 894 | adc \$0,%rdx |
361512da AP |
895 | mov %rdx,$N[0] |
896 | jmp .Linner4x | |
ec9cc70f AP |
897 | |
898 | .align 32 | |
361512da AP |
899 | .Linner4x: |
900 | mulq $m0 # ap[j]*bp[i] | |
901 | add %rax,$A[0] | |
8fc8f486 | 902 | mov -8*2($np),%rax |
361512da | 903 | adc \$0,%rdx |
ec9cc70f AP |
904 | add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] |
905 | lea 32($tp),$tp | |
361512da AP |
906 | adc \$0,%rdx |
907 | mov %rdx,$A[1] | |
908 | ||
909 | mulq $m1 # np[j]*m1 | |
910 | add %rax,$N[0] | |
ec9cc70f | 911 | mov -8($ap,$j),%rax |
361512da AP |
912 | adc \$0,%rdx |
913 | add $A[0],$N[0] | |
914 | adc \$0,%rdx | |
ec9cc70f | 915 | mov $N[1],-32($tp) # tp[j-1] |
361512da AP |
916 | mov %rdx,$N[1] |
917 | ||
918 | mulq $m0 # ap[j]*bp[i] | |
919 | add %rax,$A[1] | |
8fc8f486 | 920 | mov -8*1($np),%rax |
361512da | 921 | adc \$0,%rdx |
ec9cc70f | 922 | add -8($tp),$A[1] |
361512da AP |
923 | adc \$0,%rdx |
924 | mov %rdx,$A[0] | |
925 | ||
926 | mulq $m1 # np[j]*m1 | |
927 | add %rax,$N[1] | |
ec9cc70f | 928 | mov ($ap,$j),%rax |
361512da AP |
929 | adc \$0,%rdx |
930 | add $A[1],$N[1] | |
931 | adc \$0,%rdx | |
ec9cc70f | 932 | mov $N[0],-24($tp) # tp[j-1] |
361512da AP |
933 | mov %rdx,$N[0] |
934 | ||
935 | mulq $m0 # ap[j]*bp[i] | |
936 | add %rax,$A[0] | |
8fc8f486 | 937 | mov 8*0($np),%rax |
361512da | 938 | adc \$0,%rdx |
ec9cc70f | 939 | add ($tp),$A[0] # ap[j]*bp[i]+tp[j] |
361512da AP |
940 | adc \$0,%rdx |
941 | mov %rdx,$A[1] | |
942 | ||
943 | mulq $m1 # np[j]*m1 | |
944 | add %rax,$N[0] | |
ec9cc70f | 945 | mov 8($ap,$j),%rax |
361512da AP |
946 | adc \$0,%rdx |
947 | add $A[0],$N[0] | |
948 | adc \$0,%rdx | |
ec9cc70f | 949 | mov $N[1],-16($tp) # tp[j-1] |
361512da AP |
950 | mov %rdx,$N[1] |
951 | ||
952 | mulq $m0 # ap[j]*bp[i] | |
953 | add %rax,$A[1] | |
8fc8f486 | 954 | mov 8*1($np),%rax |
361512da | 955 | adc \$0,%rdx |
ec9cc70f | 956 | add 8($tp),$A[1] |
361512da | 957 | adc \$0,%rdx |
361512da AP |
958 | mov %rdx,$A[0] |
959 | ||
960 | mulq $m1 # np[j]*m1 | |
961 | add %rax,$N[1] | |
ec9cc70f | 962 | mov 16($ap,$j),%rax |
361512da AP |
963 | adc \$0,%rdx |
964 | add $A[1],$N[1] | |
8fc8f486 | 965 | lea 8*4($np),$np |
361512da | 966 | adc \$0,%rdx |
ec9cc70f | 967 | mov $N[0],-8($tp) # tp[j-1] |
361512da | 968 | mov %rdx,$N[0] |
ec9cc70f AP |
969 | |
970 | add \$32,$j # j+=4 | |
971 | jnz .Linner4x | |
361512da AP |
972 | |
973 | mulq $m0 # ap[j]*bp[i] | |
974 | add %rax,$A[0] | |
8fc8f486 | 975 | mov -8*2($np),%rax |
361512da | 976 | adc \$0,%rdx |
ec9cc70f AP |
977 | add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] |
978 | lea 32($tp),$tp | |
361512da AP |
979 | adc \$0,%rdx |
980 | mov %rdx,$A[1] | |
981 | ||
982 | mulq $m1 # np[j]*m1 | |
983 | add %rax,$N[0] | |
ec9cc70f | 984 | mov -8($ap),%rax |
361512da AP |
985 | adc \$0,%rdx |
986 | add $A[0],$N[0] | |
987 | adc \$0,%rdx | |
ec9cc70f | 988 | mov $N[1],-32($tp) # tp[j-1] |
361512da AP |
989 | mov %rdx,$N[1] |
990 | ||
991 | mulq $m0 # ap[j]*bp[i] | |
992 | add %rax,$A[1] | |
ec9cc70f | 993 | mov $m1,%rax |
8fc8f486 | 994 | mov -8*1($np),$m1 |
361512da | 995 | adc \$0,%rdx |
ec9cc70f | 996 | add -8($tp),$A[1] |
361512da | 997 | adc \$0,%rdx |
361512da AP |
998 | mov %rdx,$A[0] |
999 | ||
1000 | mulq $m1 # np[j]*m1 | |
1001 | add %rax,$N[1] | |
ec9cc70f | 1002 | mov ($ap,$num),%rax # ap[0] |
361512da AP |
1003 | adc \$0,%rdx |
1004 | add $A[1],$N[1] | |
1005 | adc \$0,%rdx | |
ec9cc70f | 1006 | mov $N[0],-24($tp) # tp[j-1] |
361512da AP |
1007 | mov %rdx,$N[0] |
1008 | ||
ec9cc70f | 1009 | mov $N[1],-16($tp) # tp[j-1] |
8fc8f486 | 1010 | lea ($np,$num),$np # rewind $np |
361512da AP |
1011 | |
1012 | xor $N[1],$N[1] | |
1013 | add $A[0],$N[0] | |
1014 | adc \$0,$N[1] | |
ec9cc70f AP |
1015 | add ($tp),$N[0] # pull upmost overflow bit |
1016 | adc \$0,$N[1] # upmost overflow bit | |
1017 | mov $N[0],-8($tp) | |
361512da | 1018 | |
ec9cc70f AP |
1019 | cmp 16+8(%rsp),$bp |
1020 | jb .Louter4x | |
361512da | 1021 | ___ |
ec9cc70f | 1022 | if (1) { |
361512da | 1023 | $code.=<<___; |
8fc8f486 | 1024 | xor %rax,%rax |
ec9cc70f AP |
1025 | sub $N[0],$m1 # compare top-most words |
1026 | adc $j,$j # $j is zero | |
1027 | or $j,$N[1] | |
8fc8f486 | 1028 | sub $N[1],%rax # %rax=-$N[1] |
ec9cc70f | 1029 | lea ($tp,$num),%rbx # tptr in .sqr4x_sub |
8fc8f486 AP |
1030 | mov ($np),%r12 |
1031 | lea ($np),%rbp # nptr in .sqr4x_sub | |
ec9cc70f | 1032 | mov %r9,%rcx |
8fc8f486 | 1033 | sar \$3+2,%rcx |
ec9cc70f | 1034 | mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub |
8fc8f486 AP |
1035 | dec %r12 # so that after 'not' we get -n[0] |
1036 | xor %r10,%r10 | |
1037 | mov 8*1(%rbp),%r13 | |
1038 | mov 8*2(%rbp),%r14 | |
1039 | mov 8*3(%rbp),%r15 | |
1040 | jmp .Lsqr4x_sub_entry | |
ec9cc70f AP |
1041 | ___ |
1042 | } else { | |
1043 | my @ri=("%rax",$bp,$m0,$m1); | |
1044 | my $rp="%rdx"; | |
1045 | $code.=<<___ | |
1046 | xor \$1,$N[1] | |
1047 | lea ($tp,$num),$tp # rewind $tp | |
1048 | sar \$5,$num # cf=0 | |
1049 | lea ($np,$N[1],8),$np | |
1050 | mov 56+8(%rsp),$rp # restore $rp | |
361512da | 1051 | jmp .Lsub4x |
ec9cc70f AP |
1052 | |
1053 | .align 32 | |
361512da | 1054 | .Lsub4x: |
ec9cc70f AP |
1055 | .byte 0x66 |
1056 | mov 8*0($tp),@ri[0] | |
1057 | mov 8*1($tp),@ri[1] | |
1058 | .byte 0x66 | |
1059 | sbb 16*0($np),@ri[0] | |
1060 | mov 8*2($tp),@ri[2] | |
1061 | sbb 16*1($np),@ri[1] | |
1062 | mov 3*8($tp),@ri[3] | |
1063 | lea 4*8($tp),$tp | |
1064 | sbb 16*2($np),@ri[2] | |
1065 | mov @ri[0],8*0($rp) | |
1066 | sbb 16*3($np),@ri[3] | |
1067 | lea 16*4($np),$np | |
1068 | mov @ri[1],8*1($rp) | |
1069 | mov @ri[2],8*2($rp) | |
1070 | mov @ri[3],8*3($rp) | |
1071 | lea 8*4($rp),$rp | |
1072 | ||
1073 | inc $num | |
361512da AP |
1074 | jnz .Lsub4x |
1075 | ||
ec9cc70f | 1076 | ret |
361512da AP |
1077 | ___ |
1078 | } | |
1079 | $code.=<<___; | |
0190c52a | 1080 | .cfi_endproc |
ec9cc70f | 1081 | .size mul4x_internal,.-mul4x_internal |
361512da AP |
1082 | ___ |
1083 | }}} | |
ec9cc70f AP |
1084 | \f{{{ |
1085 | ###################################################################### | |
# bn_power5 argument and scratch register names.
# The routine emitted for this prototype makes five
# __bn_sqr8x_internal/__bn_post4x_internal passes and then one
# mul4x_internal call, and returns 1 in %rax.
# NOTE(review): presumably that is a^(2^5) times the table entry selected
# by pwr, in Montgomery form -- confirm against bn_exp.c.
1086 | # void bn_power5( | |
1087 | my $rptr="%rdi"; # BN_ULONG *rptr, | |
1088 | my $aptr="%rsi"; # const BN_ULONG *aptr, | |
1089 | my $bptr="%rdx"; # const void *table, | |
1090 | my $nptr="%rcx"; # const BN_ULONG *nptr, | |
1091 | my $n0 ="%r8"; # const BN_ULONG *n0, | |
1092 | my $num ="%r9"; # int num, has to be divisible by 8 | |
609b0852 | 1093 | # int pwr); |
ec9cc70f AP |
1094 | |
# Scratch names for the squaring code.  $j shares %rcx with $nptr and
# $tptr shares its register with $rptr, so bn_power5 parks $rptr and
# $nptr in %xmm registers before calling __bn_sqr8x_internal.
1095 | my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); | |
# @A0/@A1: two pairs of accumulator registers.
1096 | my @A0=("%r10","%r11"); | |
1097 | my @A1=("%r12","%r13"); | |
# $a0/$a1/$ai: registers that hold words loaded from a[].
1098 | my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); | |
a5bb5bca AP |
1099 | |
# Emit bn_power5: the entry sequence copies the incoming %rsp to %rax,
# then (below) callee-saved registers are pushed, a 64-byte aligned frame
# is carved out with a page walk, five square passes and one multiply
# pass are called, and the epilogue restores registers and returns 1.
1100 | $code.=<<___; | |
ec9cc70f AP |
1101 | .globl bn_power5 |
1102 | .type bn_power5,\@function,6 | |
a5bb5bca | 1103 | .align 32 |
ec9cc70f | 1104 | bn_power5: |
76e624a0 | 1105 | .cfi_startproc |
3ba1ef82 | 1106 | mov %rsp,%rax |
76e624a0 | 1107 | .cfi_def_cfa_register %rax |
ec9cc70f AP |
1108 | ___ |
# Emitted only when $addx is set: jump to .Lpowerx5_enter if every bit of
# 0x80108 is present in the capability word at OPENSSL_ia32cap_P+8.
1109 | $code.=<<___ if ($addx); | |
1110 | mov OPENSSL_ia32cap_P+8(%rip),%r11d | |
8fc8f486 AP |
1111 | and \$0x80108,%r11d |
1112 | cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 | |
ec9cc70f AP |
1113 | je .Lpowerx5_enter |
1114 | ___ | |
1115 | $code.=<<___; | |
a5bb5bca | 1116 | push %rbx |
76e624a0 | 1117 | .cfi_push %rbx |
a5bb5bca | 1118 | push %rbp |
76e624a0 | 1119 | .cfi_push %rbp |
a5bb5bca | 1120 | push %r12 |
76e624a0 | 1121 | .cfi_push %r12 |
a5bb5bca | 1122 | push %r13 |
76e624a0 | 1123 | .cfi_push %r13 |
a5bb5bca | 1124 | push %r14 |
76e624a0 | 1125 | .cfi_push %r14 |
a5bb5bca | 1126 | push %r15 |
76e624a0 | 1127 | .cfi_push %r15 |
3ba1ef82 | 1128 | .Lpower5_prologue: |
8fc8f486 | 1129 | |
a5bb5bca | 1130 | shl \$3,${num}d # convert $num to bytes |
8fc8f486 | 1131 | lea ($num,$num,2),%r10d # 3*$num |
ec9cc70f | 1132 | neg $num |
a5bb5bca | 1133 | mov ($n0),$n0 # *n0 |
ec9cc70f AP |
1134 | |
1135 | ############################################################## | |
8fc8f486 AP |
1136 | # Ensure that stack frame doesn't alias with $rptr+3*$num |
1137 | # modulo 4096, which covers ret[num], am[num] and n[num] | |
1138 | # (see bn_exp.c). This is done to allow memory disambiguation | |
1139 | # logic to do its magic. [Extra 256 bytes is for power mask | |
1140 | # calculated from 7th argument, the index.] | |
ec9cc70f | 1141 | # |
8fc8f486 | 1142 | lea -320(%rsp,$num,2),%r11 |
3ba1ef82 | 1143 | mov %rsp,%rbp |
8fc8f486 | 1144 | sub $rptr,%r11 |
ec9cc70f AP |
1145 | and \$4095,%r11 |
1146 | cmp %r11,%r10 | |
1147 | jb .Lpwr_sp_alt | |
3ba1ef82 AP |
1148 | sub %r11,%rbp # align with $rptr |
1149 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) | |
ec9cc70f AP |
1150 | jmp .Lpwr_sp_done |
1151 | | |
1152 | .align 32 | |
1153 | .Lpwr_sp_alt: | |
8fc8f486 | 1154 | lea 4096-320(,$num,2),%r10 |
3ba1ef82 | 1155 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) |
ec9cc70f AP |
1156 | sub %r10,%r11 |
1157 | mov \$0,%r10 | |
1158 | cmovc %r10,%r11 | |
3ba1ef82 | 1159 | sub %r11,%rbp |
ec9cc70f | 1160 | .Lpwr_sp_done: |
	# round the new frame address down to 64 bytes, then lower %rsp to
	# it one 4096-byte page at a time, loading from each page on the way
3ba1ef82 AP |
1161 | and \$-64,%rbp |
1162 | mov %rsp,%r11 | |
1163 | sub %rbp,%r11 | |
adc4f1fc | 1164 | and \$-4096,%r11 |
3ba1ef82 AP |
1165 | lea (%rbp,%r11),%rsp |
1166 | mov (%rsp),%r10 | |
1167 | cmp %rbp,%rsp | |
1168 | ja .Lpwr_page_walk | |
1169 | jmp .Lpwr_page_walk_done | |
1170 | | |
adc4f1fc | 1171 | .Lpwr_page_walk: |
3ba1ef82 AP |
1172 | lea -4096(%rsp),%rsp |
1173 | mov (%rsp),%r10 | |
1174 | cmp %rbp,%rsp | |
1175 | ja .Lpwr_page_walk | |
1176 | .Lpwr_page_walk_done: | |
adc4f1fc | 1177 | |
609b0852 | 1178 | mov $num,%r10 |
ec9cc70f AP |
1179 | neg $num |
1180 | | |
a5bb5bca AP |
1181 | ############################################################## |
1182 | # Stack layout | |
a5bb5bca | 1183 | # |
ec9cc70f AP |
1184 | # +0 end of n[], used in reduction section |
1185 | # +8 &t[2*$num], used in reduction section | |
1186 | # +32 saved *n0 | |
1187 | # +40 saved %rsp | |
1188 | # +48 t[2*$num] | |
1189 | # | |
1190 | mov $n0, 32(%rsp) | |
1191 | mov %rax, 40(%rsp) # save original %rsp | |
76e624a0 | 1192 | .cfi_cfa_expression %rsp+40,deref,+8 |
ec9cc70f | 1193 | .Lpower5_body: |
8fc8f486 | 1194 | movq $rptr,%xmm1 # save $rptr, used in sqr8x |
ec9cc70f | 1195 | movq $nptr,%xmm2 # save $nptr |
8fc8f486 | 1196 | movq %r10, %xmm3 # -$num, used in sqr8x |
ec9cc70f AP |
1197 | movq $bptr,%xmm4 |
1198 | | |
	# five squaring passes, each completed by __bn_post4x_internal
1199 | call __bn_sqr8x_internal | |
317be638 | 1200 | call __bn_post4x_internal |
ec9cc70f | 1201 | call __bn_sqr8x_internal |
317be638 | 1202 | call __bn_post4x_internal |
ec9cc70f | 1203 | call __bn_sqr8x_internal |
317be638 | 1204 | call __bn_post4x_internal |
ec9cc70f | 1205 | call __bn_sqr8x_internal |
317be638 | 1206 | call __bn_post4x_internal |
ec9cc70f | 1207 | call __bn_sqr8x_internal |
317be638 | 1208 | call __bn_post4x_internal |
ec9cc70f | 1209 | |
	# before mul4x_internal: reload the pointers parked in %xmm2/%xmm4,
	# copy $aptr into $rptr, put the original %rsp in %rax and the
	# address of the saved *n0 in $n0
eedab524 | 1210 | movq %xmm2,$nptr |
ec9cc70f AP |
1211 | movq %xmm4,$bptr |
1212 | mov $aptr,$rptr | |
1213 | mov 40(%rsp),%rax | |
1214 | lea 32(%rsp),$n0 | |
1215 | | |
1216 | call mul4x_internal | |
1217 | | |
	# epilogue: return 1 and reload the six pushed registers from just
	# below the original %rsp
1218 | mov 40(%rsp),%rsi # restore %rsp | |
76e624a0 | 1219 | .cfi_def_cfa %rsi,8 |
ec9cc70f AP |
1220 | mov \$1,%rax |
1221 | mov -48(%rsi),%r15 | |
76e624a0 | 1222 | .cfi_restore %r15 |
ec9cc70f | 1223 | mov -40(%rsi),%r14 |
76e624a0 | 1224 | .cfi_restore %r14 |
ec9cc70f | 1225 | mov -32(%rsi),%r13 |
76e624a0 | 1226 | .cfi_restore %r13 |
ec9cc70f | 1227 | mov -24(%rsi),%r12 |
76e624a0 | 1228 | .cfi_restore %r12 |
ec9cc70f | 1229 | mov -16(%rsi),%rbp |
76e624a0 | 1230 | .cfi_restore %rbp |
ec9cc70f | 1231 | mov -8(%rsi),%rbx |
76e624a0 | 1232 | .cfi_restore %rbx |
ec9cc70f | 1233 | lea (%rsi),%rsp |
76e624a0 | 1234 | .cfi_def_cfa_register %rsp |
ec9cc70f AP |
1235 | .Lpower5_epilogue: |
1236 | ret | |
76e624a0 | 1237 | .cfi_endproc |
ec9cc70f | 1238 | .size bn_power5,.-bn_power5 |
a5bb5bca | 1239 | |
ec9cc70f AP |
1240 | .globl bn_sqr8x_internal |
1241 | .hidden bn_sqr8x_internal | |
1242 | .type bn_sqr8x_internal,\@abi-omnipotent | |
a5bb5bca | 1243 | .align 32 |
ec9cc70f AP |
1244 | bn_sqr8x_internal: |
1245 | __bn_sqr8x_internal: | |
0190c52a | 1246 | .cfi_startproc |
ec9cc70f AP |
1247 | ############################################################## |
1248 | # Squaring part: | |
1249 | # | |
1250 | # a) multiply-n-add everything but a[i]*a[i]; | |
1251 | # b) shift result of a) by 1 to the left and accumulate | |
1252 | # a[i]*a[i] products; | |
1253 | # | |
1254 | ############################################################## | |
1255 | # a[1]a[0] | |
1256 | # a[2]a[0] | |
1257 | # a[3]a[0] | |
1258 | # a[2]a[1] | |
1259 | # a[4]a[0] | |
1260 | # a[3]a[1] | |
1261 | # a[5]a[0] | |
1262 | # a[4]a[1] | |
1263 | # a[3]a[2] | |
1264 | # a[6]a[0] | |
1265 | # a[5]a[1] | |
1266 | # a[4]a[2] | |
1267 | # a[7]a[0] | |
1268 | # a[6]a[1] | |
1269 | # a[5]a[2] | |
1270 | # a[4]a[3] | |
1271 | # a[7]a[1] | |
1272 | # a[6]a[2] | |
1273 | # a[5]a[3] | |
1274 | # a[7]a[2] | |
1275 | # a[6]a[3] | |
1276 | # a[5]a[4] | |
1277 | # a[7]a[3] | |
1278 | # a[6]a[4] | |
1279 | # a[7]a[4] | |
1280 | # a[6]a[5] | |
1281 | # a[7]a[5] | |
1282 | # a[7]a[6] | |
1283 | # a[1]a[0] | |
1284 | # a[2]a[0] | |
1285 | # a[3]a[0] | |
1286 | # a[4]a[0] | |
1287 | # a[5]a[0] | |
1288 | # a[6]a[0] | |
1289 | # a[7]a[0] | |
1290 | # a[2]a[1] | |
1291 | # a[3]a[1] | |
1292 | # a[4]a[1] | |
1293 | # a[5]a[1] | |
1294 | # a[6]a[1] | |
1295 | # a[7]a[1] | |
1296 | # a[3]a[2] | |
1297 | # a[4]a[2] | |
1298 | # a[5]a[2] | |
1299 | # a[6]a[2] | |
1300 | # a[7]a[2] | |
1301 | # a[4]a[3] | |
1302 | # a[5]a[3] | |
1303 | # a[6]a[3] | |
1304 | # a[7]a[3] | |
1305 | # a[5]a[4] | |
1306 | # a[6]a[4] | |
1307 | # a[7]a[4] | |
1308 | # a[6]a[5] | |
1309 | # a[7]a[5] | |
1310 | # a[7]a[6] | |
1311 | # a[0]a[0] | |
1312 | # a[1]a[1] | |
1313 | # a[2]a[2] | |
1314 | # a[3]a[3] | |
1315 | # a[4]a[4] | |
1316 | # a[5]a[5] | |
1317 | # a[6]a[6] | |
1318 | # a[7]a[7] | |
1319 | ||
1320 | lea 32(%r10),$i # $i=-($num-32) | |
1321 | lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] | |
1322 | ||
1323 | mov $num,$j # $j=$num | |
1324 | ||
1325 | # comments apply to $num==8 case | |
1326 | mov -32($aptr,$i),$a0 # a[0] | |
1327 | lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | |
1328 | mov -24($aptr,$i),%rax # a[1] | |
1329 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | |
1330 | mov -16($aptr,$i),$ai # a[2] | |
1331 | mov %rax,$a1 | |
1332 | ||
1333 | mul $a0 # a[1]*a[0] | |
1334 | mov %rax,$A0[0] # a[1]*a[0] | |
1335 | mov $ai,%rax # a[2] | |
1336 | mov %rdx,$A0[1] | |
1337 | mov $A0[0],-24($tptr,$i) # t[1] | |
1338 | ||
1339 | mul $a0 # a[2]*a[0] | |
1340 | add %rax,$A0[1] | |
1341 | mov $ai,%rax | |
1342 | adc \$0,%rdx | |
1343 | mov $A0[1],-16($tptr,$i) # t[2] | |
1344 | mov %rdx,$A0[0] | |
1345 | ||
1346 | ||
1347 | mov -8($aptr,$i),$ai # a[3] | |
1348 | mul $a1 # a[2]*a[1] | |
1349 | mov %rax,$A1[0] # a[2]*a[1]+t[3] | |
1350 | mov $ai,%rax | |
1351 | mov %rdx,$A1[1] | |
1352 | ||
1353 | lea ($i),$j | |
1354 | mul $a0 # a[3]*a[0] | |
1355 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | |
1356 | mov $ai,%rax | |
1357 | mov %rdx,$A0[1] | |
1358 | adc \$0,$A0[1] | |
1359 | add $A1[0],$A0[0] | |
1360 | adc \$0,$A0[1] | |
1361 | mov $A0[0],-8($tptr,$j) # t[3] | |
1362 | jmp .Lsqr4x_1st | |
1363 | ||
1364 | .align 32 | |
1365 | .Lsqr4x_1st: | |
1366 | mov ($aptr,$j),$ai # a[4] | |
1367 | mul $a1 # a[3]*a[1] | |
1368 | add %rax,$A1[1] # a[3]*a[1]+t[4] | |
1369 | mov $ai,%rax | |
1370 | mov %rdx,$A1[0] | |
1371 | adc \$0,$A1[0] | |
1372 | ||
1373 | mul $a0 # a[4]*a[0] | |
1374 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | |
1375 | mov $ai,%rax # a[3] | |
1376 | mov 8($aptr,$j),$ai # a[5] | |
1377 | mov %rdx,$A0[0] | |
1378 | adc \$0,$A0[0] | |
1379 | add $A1[1],$A0[1] | |
1380 | adc \$0,$A0[0] | |
1381 | ||
1382 | ||
1383 | mul $a1 # a[4]*a[3] | |
1384 | add %rax,$A1[0] # a[4]*a[3]+t[5] | |
1385 | mov $ai,%rax | |
1386 | mov $A0[1],($tptr,$j) # t[4] | |
1387 | mov %rdx,$A1[1] | |
1388 | adc \$0,$A1[1] | |
1389 | ||
1390 | mul $a0 # a[5]*a[2] | |
1391 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | |
1392 | mov $ai,%rax | |
1393 | mov 16($aptr,$j),$ai # a[6] | |
1394 | mov %rdx,$A0[1] | |
1395 | adc \$0,$A0[1] | |
1396 | add $A1[0],$A0[0] | |
1397 | adc \$0,$A0[1] | |
1398 | ||
1399 | mul $a1 # a[5]*a[3] | |
1400 | add %rax,$A1[1] # a[5]*a[3]+t[6] | |
1401 | mov $ai,%rax | |
1402 | mov $A0[0],8($tptr,$j) # t[5] | |
1403 | mov %rdx,$A1[0] | |
1404 | adc \$0,$A1[0] | |
1405 | ||
1406 | mul $a0 # a[6]*a[2] | |
1407 | add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] | |
1408 | mov $ai,%rax # a[3] | |
1409 | mov 24($aptr,$j),$ai # a[7] | |
1410 | mov %rdx,$A0[0] | |
1411 | adc \$0,$A0[0] | |
1412 | add $A1[1],$A0[1] | |
1413 | adc \$0,$A0[0] | |
1414 | ||
1415 | ||
1416 | mul $a1 # a[6]*a[5] | |
1417 | add %rax,$A1[0] # a[6]*a[5]+t[7] | |
1418 | mov $ai,%rax | |
1419 | mov $A0[1],16($tptr,$j) # t[6] | |
1420 | mov %rdx,$A1[1] | |
1421 | adc \$0,$A1[1] | |
1422 | lea 32($j),$j | |
1423 | ||
1424 | mul $a0 # a[7]*a[4] | |
1425 | add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] | |
1426 | mov $ai,%rax | |
1427 | mov %rdx,$A0[1] | |
1428 | adc \$0,$A0[1] | |
1429 | add $A1[0],$A0[0] | |
1430 | adc \$0,$A0[1] | |
1431 | mov $A0[0],-8($tptr,$j) # t[7] | |
1432 | ||
1433 | cmp \$0,$j | |
1434 | jne .Lsqr4x_1st | |
1435 | ||
1436 | mul $a1 # a[7]*a[5] | |
1437 | add %rax,$A1[1] | |
1438 | lea 16($i),$i | |
1439 | adc \$0,%rdx | |
1440 | add $A0[1],$A1[1] | |
1441 | adc \$0,%rdx | |
1442 | ||
1443 | mov $A1[1],($tptr) # t[8] | |
1444 | mov %rdx,$A1[0] | |
1445 | mov %rdx,8($tptr) # t[9] | |
1446 | jmp .Lsqr4x_outer | |
1447 | ||
1448 | .align 32 | |
1449 | .Lsqr4x_outer: # comments apply to $num==6 case | |
1450 | mov -32($aptr,$i),$a0 # a[0] | |
1451 | lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | |
1452 | mov -24($aptr,$i),%rax # a[1] | |
1453 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | |
1454 | mov -16($aptr,$i),$ai # a[2] | |
1455 | mov %rax,$a1 | |
1456 | ||
1457 | mul $a0 # a[1]*a[0] | |
1458 | mov -24($tptr,$i),$A0[0] # t[1] | |
1459 | add %rax,$A0[0] # a[1]*a[0]+t[1] | |
1460 | mov $ai,%rax # a[2] | |
1461 | adc \$0,%rdx | |
1462 | mov $A0[0],-24($tptr,$i) # t[1] | |
1463 | mov %rdx,$A0[1] | |
1464 | ||
1465 | mul $a0 # a[2]*a[0] | |
1466 | add %rax,$A0[1] | |
1467 | mov $ai,%rax | |
1468 | adc \$0,%rdx | |
1469 | add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] | |
1470 | mov %rdx,$A0[0] | |
1471 | adc \$0,$A0[0] | |
1472 | mov $A0[1],-16($tptr,$i) # t[2] | |
1473 | ||
1474 | xor $A1[0],$A1[0] | |
1475 | ||
1476 | mov -8($aptr,$i),$ai # a[3] | |
1477 | mul $a1 # a[2]*a[1] | |
1478 | add %rax,$A1[0] # a[2]*a[1]+t[3] | |
1479 | mov $ai,%rax | |
1480 | adc \$0,%rdx | |
1481 | add -8($tptr,$i),$A1[0] | |
1482 | mov %rdx,$A1[1] | |
1483 | adc \$0,$A1[1] | |
1484 | ||
1485 | mul $a0 # a[3]*a[0] | |
1486 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | |
1487 | mov $ai,%rax | |
1488 | adc \$0,%rdx | |
1489 | add $A1[0],$A0[0] | |
1490 | mov %rdx,$A0[1] | |
1491 | adc \$0,$A0[1] | |
1492 | mov $A0[0],-8($tptr,$i) # t[3] | |
1493 | ||
1494 | lea ($i),$j | |
1495 | jmp .Lsqr4x_inner | |
1496 | ||
1497 | .align 32 | |
1498 | .Lsqr4x_inner: | |
1499 | mov ($aptr,$j),$ai # a[4] | |
1500 | mul $a1 # a[3]*a[1] | |
1501 | add %rax,$A1[1] # a[3]*a[1]+t[4] | |
1502 | mov $ai,%rax | |
1503 | mov %rdx,$A1[0] | |
1504 | adc \$0,$A1[0] | |
1505 | add ($tptr,$j),$A1[1] | |
1506 | adc \$0,$A1[0] | |
1507 | ||
1508 | .byte 0x67 | |
1509 | mul $a0 # a[4]*a[0] | |
1510 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | |
1511 | mov $ai,%rax # a[3] | |
1512 | mov 8($aptr,$j),$ai # a[5] | |
1513 | mov %rdx,$A0[0] | |
1514 | adc \$0,$A0[0] | |
1515 | add $A1[1],$A0[1] | |
1516 | adc \$0,$A0[0] | |
1517 | ||
1518 | mul $a1 # a[4]*a[3] | |
1519 | add %rax,$A1[0] # a[4]*a[3]+t[5] | |
1520 | mov $A0[1],($tptr,$j) # t[4] | |
1521 | mov $ai,%rax | |
1522 | mov %rdx,$A1[1] | |
1523 | adc \$0,$A1[1] | |
1524 | add 8($tptr,$j),$A1[0] | |
1525 | lea 16($j),$j # j++ | |
1526 | adc \$0,$A1[1] | |
1527 | ||
1528 | mul $a0 # a[5]*a[2] | |
1529 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | |
1530 | mov $ai,%rax | |
1531 | adc \$0,%rdx | |
1532 | add $A1[0],$A0[0] | |
1533 | mov %rdx,$A0[1] | |
1534 | adc \$0,$A0[1] | |
1535 | mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below | |
1536 | ||
1537 | cmp \$0,$j | |
1538 | jne .Lsqr4x_inner | |
1539 | ||
1540 | .byte 0x67 | |
1541 | mul $a1 # a[5]*a[3] | |
1542 | add %rax,$A1[1] | |
1543 | adc \$0,%rdx | |
1544 | add $A0[1],$A1[1] | |
1545 | adc \$0,%rdx | |
1546 | ||
1547 | mov $A1[1],($tptr) # t[6], "preloaded t[2]" below | |
1548 | mov %rdx,$A1[0] | |
1549 | mov %rdx,8($tptr) # t[7], "preloaded t[3]" below | |
1550 | ||
1551 | add \$16,$i | |
1552 | jnz .Lsqr4x_outer | |
1553 | ||
1554 | # comments apply to $num==4 case | |
1555 | mov -32($aptr),$a0 # a[0] | |
1556 | lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | |
1557 | mov -24($aptr),%rax # a[1] | |
1558 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | |
1559 | mov -16($aptr),$ai # a[2] | |
1560 | mov %rax,$a1 | |
1561 | ||
1562 | mul $a0 # a[1]*a[0] | |
1563 | add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] | |
1564 | mov $ai,%rax # a[2] | |
1565 | mov %rdx,$A0[1] | |
1566 | adc \$0,$A0[1] | |
1567 | ||
1568 | mul $a0 # a[2]*a[0] | |
1569 | add %rax,$A0[1] | |
1570 | mov $ai,%rax | |
1571 | mov $A0[0],-24($tptr) # t[1] | |
1572 | mov %rdx,$A0[0] | |
1573 | adc \$0,$A0[0] | |
1574 | add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] | |
1575 | mov -8($aptr),$ai # a[3] | |
1576 | adc \$0,$A0[0] | |
1577 | ||
1578 | mul $a1 # a[2]*a[1] | |
1579 | add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] | |
1580 | mov $ai,%rax | |
1581 | mov $A0[1],-16($tptr) # t[2] | |
1582 | mov %rdx,$A1[1] | |
1583 | adc \$0,$A1[1] | |
1584 | ||
1585 | mul $a0 # a[3]*a[0] | |
1586 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | |
1587 | mov $ai,%rax | |
1588 | mov %rdx,$A0[1] | |
1589 | adc \$0,$A0[1] | |
1590 | add $A1[0],$A0[0] | |
1591 | adc \$0,$A0[1] | |
1592 | mov $A0[0],-8($tptr) # t[3] | |
1593 | ||
1594 | mul $a1 # a[3]*a[1] | |
1595 | add %rax,$A1[1] | |
1596 | mov -16($aptr),%rax # a[2] | |
1597 | adc \$0,%rdx | |
1598 | add $A0[1],$A1[1] | |
1599 | adc \$0,%rdx | |
1600 | ||
1601 | mov $A1[1],($tptr) # t[4] | |
1602 | mov %rdx,$A1[0] | |
1603 | mov %rdx,8($tptr) # t[5] | |
1604 | ||
1605 | mul $ai # a[2]*a[3] | |
a5bb5bca | 1606 | ___ |
ec9cc70f AP |
1607 | { |
1608 | my ($shift,$carry)=($a0,$a1); | |
1609 | my @S=(@A1,$ai,$n0); | |
a5bb5bca | 1610 | $code.=<<___; |
ec9cc70f AP |
1611 | add \$16,$i |
1612 | xor $shift,$shift | |
1613 | sub $num,$i # $i=16-$num | |
1614 | xor $carry,$carry | |
1615 | ||
1616 | add $A1[0],%rax # t[5] | |
1617 | adc \$0,%rdx | |
1618 | mov %rax,8($tptr) # t[5] | |
1619 | mov %rdx,16($tptr) # t[6] | |
1620 | mov $carry,24($tptr) # t[7] | |
1621 | ||
1622 | mov -16($aptr,$i),%rax # a[0] | |
1623 | lea 48+8(%rsp),$tptr | |
1624 | xor $A0[0],$A0[0] # t[0] | |
1625 | mov 8($tptr),$A0[1] # t[1] | |
1626 | ||
1627 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | |
1628 | shr \$63,$A0[0] | |
1629 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | |
1630 | shr \$63,$A0[1] | |
1631 | or $A0[0],$S[1] # | t[2*i]>>63 | |
1632 | mov 16($tptr),$A0[0] # t[2*i+2] # prefetch | |
1633 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1634 | mul %rax # a[i]*a[i] | |
1635 | neg $carry # mov $carry,cf | |
1636 | mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1637 | adc %rax,$S[0] | |
1638 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | |
1639 | mov $S[0],($tptr) | |
1640 | adc %rdx,$S[1] | |
1641 | ||
1642 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | |
1643 | mov $S[1],8($tptr) | |
1644 | sbb $carry,$carry # mov cf,$carry | |
1645 | shr \$63,$A0[0] | |
1646 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | |
1647 | shr \$63,$A0[1] | |
1648 | or $A0[0],$S[3] # | t[2*i]>>63 | |
1649 | mov 32($tptr),$A0[0] # t[2*i+2] # prefetch | |
1650 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1651 | mul %rax # a[i]*a[i] | |
1652 | neg $carry # mov $carry,cf | |
1653 | mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1654 | adc %rax,$S[2] | |
1655 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | |
1656 | mov $S[2],16($tptr) | |
1657 | adc %rdx,$S[3] | |
1658 | lea 16($i),$i | |
1659 | mov $S[3],24($tptr) | |
1660 | sbb $carry,$carry # mov cf,$carry | |
1661 | lea 64($tptr),$tptr | |
1662 | jmp .Lsqr4x_shift_n_add | |
a5bb5bca | 1663 | |
ec9cc70f AP |
1664 | .align 32 |
1665 | .Lsqr4x_shift_n_add: | |
1666 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | |
1667 | shr \$63,$A0[0] | |
1668 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | |
1669 | shr \$63,$A0[1] | |
1670 | or $A0[0],$S[1] # | t[2*i]>>63 | |
1671 | mov -16($tptr),$A0[0] # t[2*i+2] # prefetch | |
1672 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1673 | mul %rax # a[i]*a[i] | |
1674 | neg $carry # mov $carry,cf | |
1675 | mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1676 | adc %rax,$S[0] | |
1677 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | |
1678 | mov $S[0],-32($tptr) | |
1679 | adc %rdx,$S[1] | |
1680 | ||
1681 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | |
1682 | mov $S[1],-24($tptr) | |
1683 | sbb $carry,$carry # mov cf,$carry | |
1684 | shr \$63,$A0[0] | |
1685 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | |
1686 | shr \$63,$A0[1] | |
1687 | or $A0[0],$S[3] # | t[2*i]>>63 | |
1688 | mov 0($tptr),$A0[0] # t[2*i+2] # prefetch | |
1689 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1690 | mul %rax # a[i]*a[i] | |
1691 | neg $carry # mov $carry,cf | |
1692 | mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1693 | adc %rax,$S[2] | |
1694 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | |
1695 | mov $S[2],-16($tptr) | |
1696 | adc %rdx,$S[3] | |
1697 | ||
1698 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | |
1699 | mov $S[3],-8($tptr) | |
1700 | sbb $carry,$carry # mov cf,$carry | |
1701 | shr \$63,$A0[0] | |
1702 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | |
1703 | shr \$63,$A0[1] | |
1704 | or $A0[0],$S[1] # | t[2*i]>>63 | |
1705 | mov 16($tptr),$A0[0] # t[2*i+2] # prefetch | |
1706 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1707 | mul %rax # a[i]*a[i] | |
1708 | neg $carry # mov $carry,cf | |
1709 | mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1710 | adc %rax,$S[0] | |
1711 | mov 8($aptr,$i),%rax # a[i+1] # prefetch | |
1712 | mov $S[0],0($tptr) | |
1713 | adc %rdx,$S[1] | |
1714 | ||
1715 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | |
1716 | mov $S[1],8($tptr) | |
1717 | sbb $carry,$carry # mov cf,$carry | |
1718 | shr \$63,$A0[0] | |
1719 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | |
1720 | shr \$63,$A0[1] | |
1721 | or $A0[0],$S[3] # | t[2*i]>>63 | |
1722 | mov 32($tptr),$A0[0] # t[2*i+2] # prefetch | |
1723 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1724 | mul %rax # a[i]*a[i] | |
1725 | neg $carry # mov $carry,cf | |
1726 | mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1727 | adc %rax,$S[2] | |
1728 | mov 16($aptr,$i),%rax # a[i+1] # prefetch | |
1729 | mov $S[2],16($tptr) | |
1730 | adc %rdx,$S[3] | |
1731 | mov $S[3],24($tptr) | |
1732 | sbb $carry,$carry # mov cf,$carry | |
1733 | lea 64($tptr),$tptr | |
1734 | add \$32,$i | |
1735 | jnz .Lsqr4x_shift_n_add | |
1736 | ||
1737 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | |
1738 | .byte 0x67 | |
1739 | shr \$63,$A0[0] | |
1740 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | |
1741 | shr \$63,$A0[1] | |
1742 | or $A0[0],$S[1] # | t[2*i]>>63 | |
1743 | mov -16($tptr),$A0[0] # t[2*i+2] # prefetch | |
1744 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | |
1745 | mul %rax # a[i]*a[i] | |
1746 | neg $carry # mov $carry,cf | |
1747 | mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch | |
1748 | adc %rax,$S[0] | |
1749 | mov -8($aptr),%rax # a[i+1] # prefetch | |
1750 | mov $S[0],-32($tptr) | |
1751 | adc %rdx,$S[1] | |
1752 | ||
1753 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift | |
1754 | mov $S[1],-24($tptr) | |
1755 | sbb $carry,$carry # mov cf,$carry | |
1756 | shr \$63,$A0[0] | |
1757 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | |
1758 | shr \$63,$A0[1] | |
1759 | or $A0[0],$S[3] # | t[2*i]>>63 | |
1760 | mul %rax # a[i]*a[i] | |
1761 | neg $carry # mov $carry,cf | |
1762 | adc %rax,$S[2] | |
1763 | adc %rdx,$S[3] | |
1764 | mov $S[2],-16($tptr) | |
1765 | mov $S[3],-8($tptr) | |
1766 | ___ | |
1767 | }\f | |
1768 | ###################################################################### | |
1769 | # Montgomery reduction part, "word-by-word" algorithm. | |
1770 | # | |
1771 | # This new path is inspired by multiple submissions from Intel, by | |
1772 | # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, | |
1773 | # Vinodh Gopal... | |
1774 | { | |
1775 | my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); | |
a5bb5bca | 1776 | |
ec9cc70f AP |
1777 | $code.=<<___; |
1778 | movq %xmm2,$nptr | |
317be638 | 1779 | __bn_sqr8x_reduction: |
ec9cc70f | 1780 | xor %rax,%rax |
8fc8f486 | 1781 | lea ($nptr,$num),%rcx # end of n[] |
ec9cc70f AP |
1782 | lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer |
1783 | mov %rcx,0+8(%rsp) | |
1784 | lea 48+8(%rsp,$num),$tptr # end of initial t[] window | |
1785 | mov %rdx,8+8(%rsp) | |
1786 | neg $num | |
1787 | jmp .L8x_reduction_loop | |
a5bb5bca | 1788 | |
ec9cc70f AP |
1789 | .align 32 |
1790 | .L8x_reduction_loop: | |
1791 | lea ($tptr,$num),$tptr # start of current t[] window | |
1792 | .byte 0x66 | |
1793 | mov 8*0($tptr),$m0 | |
1794 | mov 8*1($tptr),%r9 | |
1795 | mov 8*2($tptr),%r10 | |
1796 | mov 8*3($tptr),%r11 | |
1797 | mov 8*4($tptr),%r12 | |
1798 | mov 8*5($tptr),%r13 | |
1799 | mov 8*6($tptr),%r14 | |
1800 | mov 8*7($tptr),%r15 | |
1801 | mov %rax,(%rdx) # store top-most carry bit | |
1802 | lea 8*8($tptr),$tptr | |
1803 | ||
1804 | .byte 0x67 | |
1805 | mov $m0,%r8 | |
1806 | imulq 32+8(%rsp),$m0 # n0*a[0] | |
8fc8f486 | 1807 | mov 8*0($nptr),%rax # n[0] |
ec9cc70f AP |
1808 | mov \$8,%ecx |
1809 | jmp .L8x_reduce | |
a5bb5bca | 1810 | |
ec9cc70f AP |
1811 | .align 32 |
1812 | .L8x_reduce: | |
1813 | mulq $m0 | |
8fc8f486 | 1814 | mov 8*1($nptr),%rax # n[1] |
ec9cc70f AP |
1815 | neg %r8 |
1816 | mov %rdx,%r8 | |
1817 | adc \$0,%r8 | |
a5bb5bca | 1818 | |
ec9cc70f AP |
1819 | mulq $m0 |
1820 | add %rax,%r9 | |
8fc8f486 | 1821 | mov 8*2($nptr),%rax |
ec9cc70f AP |
1822 | adc \$0,%rdx |
1823 | add %r9,%r8 | |
1824 | mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] | |
1825 | mov %rdx,%r9 | |
1826 | adc \$0,%r9 | |
361512da | 1827 | |
ec9cc70f AP |
1828 | mulq $m0 |
1829 | add %rax,%r10 | |
8fc8f486 | 1830 | mov 8*3($nptr),%rax |
ec9cc70f AP |
1831 | adc \$0,%rdx |
1832 | add %r10,%r9 | |
1833 | mov 32+8(%rsp),$carry # pull n0, borrow $carry | |
1834 | mov %rdx,%r10 | |
1835 | adc \$0,%r10 | |
a5bb5bca | 1836 | |
ec9cc70f AP |
1837 | mulq $m0 |
1838 | add %rax,%r11 | |
8fc8f486 | 1839 | mov 8*4($nptr),%rax |
ec9cc70f AP |
1840 | adc \$0,%rdx |
1841 | imulq %r8,$carry # modulo-scheduled | |
1842 | add %r11,%r10 | |
1843 | mov %rdx,%r11 | |
1844 | adc \$0,%r11 | |
a5bb5bca | 1845 | |
ec9cc70f AP |
1846 | mulq $m0 |
1847 | add %rax,%r12 | |
8fc8f486 | 1848 | mov 8*5($nptr),%rax |
ec9cc70f AP |
1849 | adc \$0,%rdx |
1850 | add %r12,%r11 | |
1851 | mov %rdx,%r12 | |
1852 | adc \$0,%r12 | |
a5bb5bca | 1853 | |
ec9cc70f AP |
1854 | mulq $m0 |
1855 | add %rax,%r13 | |
8fc8f486 | 1856 | mov 8*6($nptr),%rax |
ec9cc70f AP |
1857 | adc \$0,%rdx |
1858 | add %r13,%r12 | |
1859 | mov %rdx,%r13 | |
1860 | adc \$0,%r13 | |
1861 | ||
1862 | mulq $m0 | |
1863 | add %rax,%r14 | |
8fc8f486 | 1864 | mov 8*7($nptr),%rax |
ec9cc70f AP |
1865 | adc \$0,%rdx |
1866 | add %r14,%r13 | |
1867 | mov %rdx,%r14 | |
1868 | adc \$0,%r14 | |
a5bb5bca | 1869 | |
ec9cc70f AP |
1870 | mulq $m0 |
1871 | mov $carry,$m0 # n0*a[i] | |
1872 | add %rax,%r15 | |
8fc8f486 | 1873 | mov 8*0($nptr),%rax # n[0] |
ec9cc70f AP |
1874 | adc \$0,%rdx |
1875 | add %r15,%r14 | |
1876 | mov %rdx,%r15 | |
1877 | adc \$0,%r15 | |
1878 | ||
1879 | dec %ecx | |
1880 | jnz .L8x_reduce | |
1881 | ||
8fc8f486 | 1882 | lea 8*8($nptr),$nptr |
ec9cc70f AP |
1883 | xor %rax,%rax |
1884 | mov 8+8(%rsp),%rdx # pull end of t[] | |
1885 | cmp 0+8(%rsp),$nptr # end of n[]? | |
1886 | jae .L8x_no_tail | |
1887 | ||
1888 | .byte 0x66 | |
1889 | add 8*0($tptr),%r8 | |
1890 | adc 8*1($tptr),%r9 | |
1891 | adc 8*2($tptr),%r10 | |
1892 | adc 8*3($tptr),%r11 | |
1893 | adc 8*4($tptr),%r12 | |
1894 | adc 8*5($tptr),%r13 | |
1895 | adc 8*6($tptr),%r14 | |
1896 | adc 8*7($tptr),%r15 | |
1897 | sbb $carry,$carry # top carry | |
1898 | ||
1899 | mov 48+56+8(%rsp),$m0 # pull n0*a[0] | |
1900 | mov \$8,%ecx | |
8fc8f486 | 1901 | mov 8*0($nptr),%rax |
ec9cc70f | 1902 | jmp .L8x_tail |
a5bb5bca AP |
1903 | |
1904 | .align 32 | |
ec9cc70f AP |
1905 | .L8x_tail: |
1906 | mulq $m0 | |
1907 | add %rax,%r8 | |
8fc8f486 | 1908 | mov 8*1($nptr),%rax |
ec9cc70f AP |
1909 | mov %r8,($tptr) # save result |
1910 | mov %rdx,%r8 | |
1911 | adc \$0,%r8 | |
1912 | ||
1913 | mulq $m0 | |
1914 | add %rax,%r9 | |
8fc8f486 | 1915 | mov 8*2($nptr),%rax |
ec9cc70f AP |
1916 | adc \$0,%rdx |
1917 | add %r9,%r8 | |
1918 | lea 8($tptr),$tptr # $tptr++ | |
1919 | mov %rdx,%r9 | |
1920 | adc \$0,%r9 | |
1921 | ||
1922 | mulq $m0 | |
1923 | add %rax,%r10 | |
8fc8f486 | 1924 | mov 8*3($nptr),%rax |
ec9cc70f AP |
1925 | adc \$0,%rdx |
1926 | add %r10,%r9 | |
1927 | mov %rdx,%r10 | |
1928 | adc \$0,%r10 | |
1929 | ||
1930 | mulq $m0 | |
1931 | add %rax,%r11 | |
8fc8f486 | 1932 | mov 8*4($nptr),%rax |
ec9cc70f AP |
1933 | adc \$0,%rdx |
1934 | add %r11,%r10 | |
1935 | mov %rdx,%r11 | |
1936 | adc \$0,%r11 | |
1937 | ||
1938 | mulq $m0 | |
1939 | add %rax,%r12 | |
8fc8f486 | 1940 | mov 8*5($nptr),%rax |
ec9cc70f AP |
1941 | adc \$0,%rdx |
1942 | add %r12,%r11 | |
1943 | mov %rdx,%r12 | |
1944 | adc \$0,%r12 | |
1945 | ||
1946 | mulq $m0 | |
1947 | add %rax,%r13 | |
8fc8f486 | 1948 | mov 8*6($nptr),%rax |
ec9cc70f AP |
1949 | adc \$0,%rdx |
1950 | add %r13,%r12 | |
1951 | mov %rdx,%r13 | |
1952 | adc \$0,%r13 | |
1953 | ||
1954 | mulq $m0 | |
1955 | add %rax,%r14 | |
8fc8f486 | 1956 | mov 8*7($nptr),%rax |
ec9cc70f AP |
1957 | adc \$0,%rdx |
1958 | add %r14,%r13 | |
1959 | mov %rdx,%r14 | |
1960 | adc \$0,%r14 | |
1961 | ||
1962 | mulq $m0 | |
1963 | mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] | |
1964 | add %rax,%r15 | |
1965 | adc \$0,%rdx | |
1966 | add %r15,%r14 | |
8fc8f486 | 1967 | mov 8*0($nptr),%rax # pull n[0] |
ec9cc70f AP |
1968 | mov %rdx,%r15 |
1969 | adc \$0,%r15 | |
1970 | ||
1971 | dec %ecx | |
1972 | jnz .L8x_tail | |
1973 | ||
8fc8f486 | 1974 | lea 8*8($nptr),$nptr |
ec9cc70f AP |
1975 | mov 8+8(%rsp),%rdx # pull end of t[] |
1976 | cmp 0+8(%rsp),$nptr # end of n[]? | |
1977 | jae .L8x_tail_done # break out of loop | |
1978 | ||
1979 | mov 48+56+8(%rsp),$m0 # pull n0*a[0] | |
1980 | neg $carry | |
1981 | mov 8*0($nptr),%rax # pull n[0] | |
1982 | adc 8*0($tptr),%r8 | |
1983 | adc 8*1($tptr),%r9 | |
1984 | adc 8*2($tptr),%r10 | |
1985 | adc 8*3($tptr),%r11 | |
1986 | adc 8*4($tptr),%r12 | |
1987 | adc 8*5($tptr),%r13 | |
1988 | adc 8*6($tptr),%r14 | |
1989 | adc 8*7($tptr),%r15 | |
1990 | sbb $carry,$carry # top carry | |
1991 | ||
1992 | mov \$8,%ecx | |
1993 | jmp .L8x_tail | |
1994 | ||
1995 | .align 32 | |
1996 | .L8x_tail_done: | |
3f4bcf5b | 1997 | xor %rax,%rax |
ec9cc70f | 1998 | add (%rdx),%r8 # can this overflow? |
29851264 AP |
1999 | adc \$0,%r9 |
2000 | adc \$0,%r10 | |
2001 | adc \$0,%r11 | |
2002 | adc \$0,%r12 | |
2003 | adc \$0,%r13 | |
2004 | adc \$0,%r14 | |
3f4bcf5b AP |
2005 | adc \$0,%r15 |
2006 | adc \$0,%rax | |
ec9cc70f AP |
2007 | |
2008 | neg $carry | |
2009 | .L8x_no_tail: | |
2010 | adc 8*0($tptr),%r8 | |
2011 | adc 8*1($tptr),%r9 | |
2012 | adc 8*2($tptr),%r10 | |
2013 | adc 8*3($tptr),%r11 | |
2014 | adc 8*4($tptr),%r12 | |
2015 | adc 8*5($tptr),%r13 | |
2016 | adc 8*6($tptr),%r14 | |
2017 | adc 8*7($tptr),%r15 | |
2018 | adc \$0,%rax # top-most carry | |
8fc8f486 | 2019 | mov -8($nptr),%rcx # np[num-1] |
ec9cc70f AP |
2020 | xor $carry,$carry |
2021 | ||
2022 | movq %xmm2,$nptr # restore $nptr | |
2023 | ||
2024 | mov %r8,8*0($tptr) # store top 512 bits | |
2025 | mov %r9,8*1($tptr) | |
2026 | movq %xmm3,$num # $num is %r9, can't be moved upwards | |
2027 | mov %r10,8*2($tptr) | |
2028 | mov %r11,8*3($tptr) | |
2029 | mov %r12,8*4($tptr) | |
2030 | mov %r13,8*5($tptr) | |
2031 | mov %r14,8*6($tptr) | |
2032 | mov %r15,8*7($tptr) | |
2033 | lea 8*8($tptr),$tptr | |
2034 | ||
2035 | cmp %rdx,$tptr # end of t[]? | |
2036 | jb .L8x_reduction_loop | |
317be638 | 2037 | ret |
0190c52a | 2038 | .cfi_endproc |
317be638 | 2039 | .size bn_sqr8x_internal,.-bn_sqr8x_internal |
ec9cc70f AP |
2040 | ___ |
2041 | }\f | |
2042 | ############################################################## | |
2043 | # Post-condition, 4x unrolled: stores t[]-n[] to $rptr when %rax is 1 on entry, t[] unchanged when %rax is 0 | |
2044 | # (mask is -%rax; $num is presumably negative and in bytes here, so %rcx=$num/32 counts up to zero) | |
2045 | { | |
2046 | my ($tptr,$nptr)=("%rbx","%rbp"); | |
2047 | $code.=<<___; | |
317be638 AP |
2048 | .type __bn_post4x_internal,\@abi-omnipotent |
2049 | .align 32 | |
2050 | __bn_post4x_internal: | |
0190c52a | 2051 | .cfi_startproc |
8fc8f486 | 2052 | mov 8*0($nptr),%r12 |
ec9cc70f | 2053 | lea (%rdi,$num),$tptr # %rdi was $tptr above |
ec9cc70f | 2054 | mov $num,%rcx |
ec9cc70f | 2055 | movq %xmm1,$rptr # restore $rptr |
8fc8f486 | 2056 | neg %rax |
ec9cc70f | 2057 | movq %xmm1,$aptr # prepare for back-to-back call |
8fc8f486 AP |
2058 | sar \$3+2,%rcx |
2059 | dec %r12 # so that after 'not' we get -n[0] | |
2060 | xor %r10,%r10 | |
2061 | mov 8*1($nptr),%r13 | |
2062 | mov 8*2($nptr),%r14 | |
2063 | mov 8*3($nptr),%r15 | |
2064 | jmp .Lsqr4x_sub_entry | |
ec9cc70f | 2065 | |
8fc8f486 | 2066 | .align 16 |
ec9cc70f | 2067 | .Lsqr4x_sub: |
8fc8f486 AP |
2068 | mov 8*0($nptr),%r12 |
2069 | mov 8*1($nptr),%r13 | |
2070 | mov 8*2($nptr),%r14 | |
2071 | mov 8*3($nptr),%r15 | |
2072 | .Lsqr4x_sub_entry: | |
2073 | lea 8*4($nptr),$nptr | |
2074 | not %r12 | |
2075 | not %r13 | |
2076 | not %r14 | |
2077 | not %r15 | |
2078 | and %rax,%r12 | |
2079 | and %rax,%r13 | |
2080 | and %rax,%r14 | |
2081 | and %rax,%r15 | |
2082 | ||
2083 | neg %r10 # mov %r10,%cf (re-inject the carry saved by sbb below) | |
2084 | adc 8*0($tptr),%r12 | |
2085 | adc 8*1($tptr),%r13 | |
2086 | adc 8*2($tptr),%r14 | |
2087 | adc 8*3($tptr),%r15 | |
ec9cc70f | 2088 | mov %r12,8*0($rptr) |
8fc8f486 | 2089 | lea 8*4($tptr),$tptr |
ec9cc70f | 2090 | mov %r13,8*1($rptr) |
8fc8f486 | 2091 | sbb %r10,%r10 # mov %cf,%r10 (saved because the and's above clobber %cf) |
ec9cc70f AP |
2092 | mov %r14,8*2($rptr) |
2093 | mov %r15,8*3($rptr) | |
2094 | lea 8*4($rptr),$rptr | |
2095 | ||
2096 | inc %rcx # pass %cf | |
2097 | jnz .Lsqr4x_sub | |
317be638 | 2098 | |
ec9cc70f | 2099 | mov $num,%r10 # prepare for back-to-back call |
609b0852 | 2100 | neg $num # restore $num |
ec9cc70f | 2101 | ret |
0190c52a | 2102 | .cfi_endproc |
317be638 | 2103 | .size __bn_post4x_internal,.-__bn_post4x_internal |
ec9cc70f | 2104 | ___ |
317be638 | 2105 | } |
ec9cc70f AP |
2106 | { | |
# bn_from_montgomery: dispatch stub. When the num argument (%r9d, or
# 48(%rsp) on Win64) is a multiple of 8 it jumps to bn_from_mont8x;
# otherwise it returns 0.
# bn_from_mont8x: copies a[num] into a 2*num-word stack buffer t[] whose
# upper half is zeroed (.Lmul_by_1), runs the 8x reduction followed by the
# 4x post-condition on it, zeroes the stack buffer starting at 48(%rsp)
# (.Lfrom_mont_zero) and returns 1.
2107 | $code.=<<___; | |
2108 | .globl bn_from_montgomery | |
2109 | .type bn_from_montgomery,\@abi-omnipotent | |
2110 | .align 32 | |
2111 | bn_from_montgomery: | |
0190c52a | 2112 | .cfi_startproc |
ec9cc70f AP |
2113 | testl \$7,`($win64?"48(%rsp)":"%r9d")` |
2114 | jz bn_from_mont8x | |
2115 | xor %eax,%eax | |
2116 | ret | |
0190c52a | 2117 | .cfi_endproc |
ec9cc70f AP |
2118 | .size bn_from_montgomery,.-bn_from_montgomery |
2119 | ||
2120 | .type bn_from_mont8x,\@function,6 | |
2121 | .align 32 | |
2122 | bn_from_mont8x: | |
76e624a0 | 2123 | .cfi_startproc |
ec9cc70f AP |
2124 | .byte 0x67 |
2125 | mov %rsp,%rax | |
76e624a0 | 2126 | .cfi_def_cfa_register %rax |
ec9cc70f | 2127 | push %rbx |
76e624a0 | 2128 | .cfi_push %rbx |
ec9cc70f | 2129 | push %rbp |
76e624a0 | 2130 | .cfi_push %rbp |
ec9cc70f | 2131 | push %r12 |
76e624a0 | 2132 | .cfi_push %r12 |
ec9cc70f | 2133 | push %r13 |
76e624a0 | 2134 | .cfi_push %r13 |
ec9cc70f | 2135 | push %r14 |
76e624a0 | 2136 | .cfi_push %r14 |
ec9cc70f | 2137 | push %r15 |
76e624a0 | 2138 | .cfi_push %r15 |
3ba1ef82 | 2139 | .Lfrom_prologue: |
8fc8f486 | 2140 | |
ec9cc70f | 2141 | shl \$3,${num}d # convert $num to bytes |
8fc8f486 | 2142 | lea ($num,$num,2),%r10 # 3*$num in bytes |
ec9cc70f AP |
2143 | neg $num |
2144 | mov ($n0),$n0 # *n0 | |
2145 | ||
2146 | ############################################################## | |
8fc8f486 AP |
2147 | # Ensure that stack frame doesn't alias with $rptr+3*$num |
2148 | # modulo 4096, which covers ret[num], am[num] and n[num] | |
2149 | # (see bn_exp.c). The stack is allocated so as to align with | |
2150 | # bn_power5's frame, and as bn_from_montgomery happens to be | |
2151 | # last operation, we use the opportunity to cleanse it. | |
ec9cc70f | 2152 | # |
8fc8f486 | 2153 | lea -320(%rsp,$num,2),%r11 |
3ba1ef82 | 2154 | mov %rsp,%rbp |
8fc8f486 | 2155 | sub $rptr,%r11 |
ec9cc70f AP |
2156 | and \$4095,%r11 |
2157 | cmp %r11,%r10 | |
2158 | jb .Lfrom_sp_alt | |
3ba1ef82 AP |
2159 | sub %r11,%rbp # align with $rptr (modulo 4096) |
2160 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) | |
ec9cc70f AP |
2161 | jmp .Lfrom_sp_done |
2162 | ||
2163 | .align 32 | |
2164 | .Lfrom_sp_alt: | |
8fc8f486 | 2165 | lea 4096-320(,$num,2),%r10 |
3ba1ef82 | 2166 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) |
ec9cc70f AP |
2167 | sub %r10,%r11 |
2168 | mov \$0,%r10 | |
2169 | cmovc %r10,%r11 | |
3ba1ef82 | 2170 | sub %r11,%rbp |
ec9cc70f | 2171 | .Lfrom_sp_done: |
3ba1ef82 AP |
2172 | and \$-64,%rbp |
2173 | mov %rsp,%r11 | |
2174 | sub %rbp,%r11 | |
adc4f1fc | 2175 | and \$-4096,%r11 |
3ba1ef82 AP |
2176 | lea (%rbp,%r11),%rsp |
2177 | mov (%rsp),%r10 | |
2178 | cmp %rbp,%rsp | |
2179 | ja .Lfrom_page_walk | |
2180 | jmp .Lfrom_page_walk_done | |
2181 | ||
adc4f1fc | 2182 | .Lfrom_page_walk: |
3ba1ef82 AP |
2183 | lea -4096(%rsp),%rsp |
2184 | mov (%rsp),%r10 | |
2185 | cmp %rbp,%rsp | |
2186 | ja .Lfrom_page_walk | |
2187 | .Lfrom_page_walk_done: | |
adc4f1fc AP |
2188 | |
2189 | mov $num,%r10 | |
ec9cc70f AP |
2190 | neg $num |
2191 | ||
2192 | ############################################################## | |
2193 | # Stack layout | |
2194 | # | |
2195 | # +0 saved $num, used in reduction section | |
2196 | # +8 &t[2*$num], used in reduction section | |
2197 | # +32 saved *n0 | |
2198 | # +40 saved %rsp | |
2199 | # +48 t[2*$num] | |
2200 | # | |
2201 | mov $n0, 32(%rsp) | |
2202 | mov %rax, 40(%rsp) # save original %rsp | |
76e624a0 | 2203 | .cfi_cfa_expression %rsp+40,deref,+8 |
ec9cc70f AP |
2204 | .Lfrom_body: |
2205 | mov $num,%r11 | |
2206 | lea 48(%rsp),%rax | |
2207 | pxor %xmm0,%xmm0 | |
2208 | jmp .Lmul_by_1 | |
2209 | ||
2210 | .align 32 | |
2211 | .Lmul_by_1: | |
2212 | movdqu ($aptr),%xmm1 | |
2213 | movdqu 16($aptr),%xmm2 | |
2214 | movdqu 32($aptr),%xmm3 | |
2215 | movdqa %xmm0,(%rax,$num) | |
2216 | movdqu 48($aptr),%xmm4 | |
2217 | movdqa %xmm0,16(%rax,$num) | |
2218 | .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr | |
2219 | movdqa %xmm1,(%rax) | |
2220 | movdqa %xmm0,32(%rax,$num) | |
2221 | movdqa %xmm2,16(%rax) | |
2222 | movdqa %xmm0,48(%rax,$num) | |
2223 | movdqa %xmm3,32(%rax) | |
2224 | movdqa %xmm4,48(%rax) | |
2225 | lea 64(%rax),%rax | |
2226 | sub \$64,%r11 | |
2227 | jnz .Lmul_by_1 | |
2228 | ||
2229 | movq $rptr,%xmm1 | |
2230 | movq $nptr,%xmm2 | |
2231 | .byte 0x67 | |
2232 | mov $nptr,%rbp | |
2233 | movq %r10, %xmm3 # -num (in bytes) | |
2234 | ___ | |
2235 | $code.=<<___ if ($addx); | |
2236 | mov OPENSSL_ia32cap_P+8(%rip),%r11d | |
8fc8f486 AP |
2237 | and \$0x80108,%r11d |
2238 | cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 | |
ec9cc70f AP |
2239 | jne .Lfrom_mont_nox |
2240 | ||
2241 | lea (%rax,$num),$rptr | |
317be638 AP |
2242 | call __bn_sqrx8x_reduction |
2243 | call __bn_postx4x_internal | |
ec9cc70f AP |
2244 | |
2245 | pxor %xmm0,%xmm0 | |
2246 | lea 48(%rsp),%rax | |
ec9cc70f AP |
2247 | jmp .Lfrom_mont_zero |
2248 | ||
2249 | .align 32 | |
2250 | .Lfrom_mont_nox: | |
2251 | ___ | |
2252 | $code.=<<___; | |
317be638 AP |
2253 | call __bn_sqr8x_reduction |
2254 | call __bn_post4x_internal | |
ec9cc70f AP |
2255 | |
2256 | pxor %xmm0,%xmm0 | |
2257 | lea 48(%rsp),%rax | |
ec9cc70f AP |
2258 | jmp .Lfrom_mont_zero |
2259 | ||
2260 | .align 32 | |
2261 | .Lfrom_mont_zero: | |
76e624a0 AP |
2262 | mov 40(%rsp),%rsi # restore %rsp |
2263 | .cfi_def_cfa %rsi,8 | |
ec9cc70f AP |
2264 | movdqa %xmm0,16*0(%rax) |
2265 | movdqa %xmm0,16*1(%rax) | |
2266 | movdqa %xmm0,16*2(%rax) | |
2267 | movdqa %xmm0,16*3(%rax) | |
2268 | lea 16*4(%rax),%rax | |
2269 | sub \$32,$num | |
2270 | jnz .Lfrom_mont_zero | |
2271 | ||
2272 | mov \$1,%rax | |
2273 | mov -48(%rsi),%r15 | |
76e624a0 | 2274 | .cfi_restore %r15 |
ec9cc70f | 2275 | mov -40(%rsi),%r14 |
76e624a0 | 2276 | .cfi_restore %r14 |
ec9cc70f | 2277 | mov -32(%rsi),%r13 |
76e624a0 | 2278 | .cfi_restore %r13 |
ec9cc70f | 2279 | mov -24(%rsi),%r12 |
76e624a0 | 2280 | .cfi_restore %r12 |
ec9cc70f | 2281 | mov -16(%rsi),%rbp |
76e624a0 | 2282 | .cfi_restore %rbp |
ec9cc70f | 2283 | mov -8(%rsi),%rbx |
76e624a0 | 2284 | .cfi_restore %rbx |
ec9cc70f | 2285 | lea (%rsi),%rsp |
76e624a0 | 2286 | .cfi_def_cfa_register %rsp |
ec9cc70f AP |
2287 | .Lfrom_epilogue: |
2288 | ret | |
76e624a0 | 2289 | .cfi_endproc |
ec9cc70f AP |
2290 | .size bn_from_mont8x,.-bn_from_mont8x |
2291 | ___ | |
2292 | } | |
2293 | }}} | |
2294 | \f | |
2295 | if ($addx) {{{ | |
2296 | my $bp="%rdx"; # restore original value | |
2297 | ||
# bn_mulx4x_mont_gather5: entry point of the MULX/AD*X multiplication path.
# Saves the callee-saved registers, carves a 64-byte-aligned frame out of
# the stack (touching it in 4096-byte steps on the way down), calls
# mulx4x_internal, then restores the registers and returns 1.
2298 | $code.=<<___; | |
2299 | .type bn_mulx4x_mont_gather5,\@function,6 | |
2300 | .align 32 | |
2301 | bn_mulx4x_mont_gather5: | |
76e624a0 | 2302 | .cfi_startproc |
ec9cc70f | 2303 | mov %rsp,%rax |
76e624a0 | 2304 | .cfi_def_cfa_register %rax |
3ba1ef82 | 2305 | .Lmulx4x_enter: |
ec9cc70f | 2306 | push %rbx |
76e624a0 | 2307 | .cfi_push %rbx |
ec9cc70f | 2308 | push %rbp |
76e624a0 | 2309 | .cfi_push %rbp |
ec9cc70f | 2310 | push %r12 |
76e624a0 | 2311 | .cfi_push %r12 |
ec9cc70f | 2312 | push %r13 |
76e624a0 | 2313 | .cfi_push %r13 |
ec9cc70f | 2314 | push %r14 |
76e624a0 | 2315 | .cfi_push %r14 |
ec9cc70f | 2316 | push %r15 |
76e624a0 | 2317 | .cfi_push %r15 |
3ba1ef82 | 2318 | .Lmulx4x_prologue: |
8fc8f486 | 2319 | |
ec9cc70f | 2320 | shl \$3,${num}d # convert $num to bytes |
8fc8f486 | 2321 | lea ($num,$num,2),%r10 # 3*$num in bytes |
ec9cc70f AP |
2322 | neg $num # -$num |
2323 | mov ($n0),$n0 # *n0 | |
2324 | ||
2325 | ############################################################## | |
8fc8f486 AP |
2326 | # Ensure that stack frame doesn't alias with $rptr+3*$num |
2327 | # modulo 4096, which covers ret[num], am[num] and n[num] | |
2328 | # (see bn_exp.c). This is done to allow memory disambiguation | |
2329 | # logic to do its magic. [Extra [num] is allocated in order | |
2330 | # to align with bn_power5's frame, which is cleansed after | |
2331 | # completing exponentiation. Extra 256 bytes is for power mask | |
2332 | # calculated from 7th argument, the index.] | |
ec9cc70f | 2333 | # |
8fc8f486 | 2334 | lea -320(%rsp,$num,2),%r11 |
3ba1ef82 | 2335 | mov %rsp,%rbp |
8fc8f486 | 2336 | sub $rp,%r11 |
ec9cc70f AP |
2337 | and \$4095,%r11 |
2338 | cmp %r11,%r10 | |
2339 | jb .Lmulx4xsp_alt | |
3ba1ef82 AP |
2340 | sub %r11,%rbp # align with $rp (modulo 4096) |
2341 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) | |
ec9cc70f AP |
2342 | jmp .Lmulx4xsp_done |
2343 | ||
ec9cc70f | 2344 | .Lmulx4xsp_alt: |
8fc8f486 | 2345 | lea 4096-320(,$num,2),%r10 |
3ba1ef82 | 2346 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) |
ec9cc70f AP |
2347 | sub %r10,%r11 |
2348 | mov \$0,%r10 | |
2349 | cmovc %r10,%r11 | |
3ba1ef82 | 2350 | sub %r11,%rbp |
609b0852 | 2351 | .Lmulx4xsp_done: |
3ba1ef82 AP |
2352 | and \$-64,%rbp # ensure alignment |
2353 | mov %rsp,%r11 | |
2354 | sub %rbp,%r11 | |
adc4f1fc | 2355 | and \$-4096,%r11 |
3ba1ef82 AP |
2356 | lea (%rbp,%r11),%rsp |
2357 | mov (%rsp),%r10 | |
2358 | cmp %rbp,%rsp | |
2359 | ja .Lmulx4x_page_walk | |
2360 | jmp .Lmulx4x_page_walk_done | |
2361 | ||
adc4f1fc | 2362 | .Lmulx4x_page_walk: |
3ba1ef82 AP |
2363 | lea -4096(%rsp),%rsp |
2364 | mov (%rsp),%r10 | |
2365 | cmp %rbp,%rsp | |
2366 | ja .Lmulx4x_page_walk | |
2367 | .Lmulx4x_page_walk_done: | |
adc4f1fc | 2368 | |
ec9cc70f AP |
2369 | ############################################################## |
2370 | # Stack layout | |
2371 | # +0 -num | |
2372 | # +8 off-loaded &b[i] | |
2373 | # +16 end of b[num] | |
2374 | # +24 inner counter | |
2375 | # +32 saved n0 | |
2376 | # +40 saved %rsp | |
2377 | # +48 | |
2378 | # +56 saved rp | |
2379 | # +64 tmp[num+1] | |
2380 | # | |
2381 | mov $n0, 32(%rsp) # save *n0 | |
2382 | mov %rax,40(%rsp) # save original %rsp | |
76e624a0 | 2383 | .cfi_cfa_expression %rsp+40,deref,+8 |
ec9cc70f AP |
2384 | .Lmulx4x_body: |
2385 | call mulx4x_internal | |
2386 | ||
2387 | mov 40(%rsp),%rsi # restore %rsp | |
76e624a0 | 2388 | .cfi_def_cfa %rsi,8 |
ec9cc70f | 2389 | mov \$1,%rax |
8fc8f486 | 2390 | |
ec9cc70f | 2391 | mov -48(%rsi),%r15 |
76e624a0 | 2392 | .cfi_restore %r15 |
ec9cc70f | 2393 | mov -40(%rsi),%r14 |
76e624a0 | 2394 | .cfi_restore %r14 |
ec9cc70f | 2395 | mov -32(%rsi),%r13 |
76e624a0 | 2396 | .cfi_restore %r13 |
ec9cc70f | 2397 | mov -24(%rsi),%r12 |
76e624a0 | 2398 | .cfi_restore %r12 |
ec9cc70f | 2399 | mov -16(%rsi),%rbp |
76e624a0 | 2400 | .cfi_restore %rbp |
ec9cc70f | 2401 | mov -8(%rsi),%rbx |
76e624a0 | 2402 | .cfi_restore %rbx |
ec9cc70f | 2403 | lea (%rsi),%rsp |
76e624a0 | 2404 | .cfi_def_cfa_register %rsp |
ec9cc70f AP |
2405 | .Lmulx4x_epilogue: |
2406 | ret | |
76e624a0 | 2407 | .cfi_endproc |
ec9cc70f AP |
2408 | .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 |
2409 | ||
# mulx4x_internal: multiplication body called from bn_mulx4x_mont_gather5.
# It runs on the frame prepared by the caller; stack offsets used here are
# 8 higher than in the caller's layout comment because of the pushed
# return address. A 256-byte mask is built from the 7th argument and
# stored on the stack; every b[i] is then gathered by and-ing all entries
# of the current table row with that mask and or-ing the results together.
# Each pass of the inner loops handles four words, interleaving the
# a[]*b[i] products with the n[]*m reduction products. The outer loop ends
# when the table pointer reaches the saved end address, after which control
# jumps to .Lsqrx4x_sub_entry, the common post-condition.
2410 | .type mulx4x_internal,\@abi-omnipotent | |
2411 | .align 32 | |
2412 | mulx4x_internal: | |
0190c52a | 2413 | .cfi_startproc |
8fc8f486 AP |
2414 | mov $num,8(%rsp) # save -$num (it was in bytes) |
2415 | mov $num,%r10 | |
ec9cc70f AP |
2416 | neg $num # restore $num |
2417 | shl \$5,$num | |
8fc8f486 AP |
2418 | neg %r10 # %r10 = +num (in bytes) |
2419 | lea 128($bp,$num),%r13 # end of powers table (+size optimization) | |
ec9cc70f | 2420 | shr \$5+5,$num |
8fc8f486 | 2421 | movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument |
ec9cc70f | 2422 | sub \$1,$num |
8fc8f486 | 2423 | lea .Linc(%rip),%rax |
ec9cc70f AP |
2424 | mov %r13,16+8(%rsp) # end of b[num] |
2425 | mov $num,24+8(%rsp) # inner counter | |
2426 | mov $rp, 56+8(%rsp) # save $rp | |
2427 | ___ | |
2428 | my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= | |
2429 | ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); | |
2430 | my $rptr=$bptr; | |
2431 | my $STRIDE=2**5*8; # 5 is "window size" | |
2432 | my $N=$STRIDE/4; # should match cache line size | |
2433 | $code.=<<___; | |
8fc8f486 AP |
2434 | movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 |
2435 | movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 | |
46f4e1be | 2436 | lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) |
8fc8f486 | 2437 | lea 128($bp),$bptr # size optimization |
ec9cc70f | 2438 | |
8fc8f486 AP |
2439 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
2440 | movdqa %xmm1,%xmm4 | |
2441 | .byte 0x67 | |
2442 | movdqa %xmm1,%xmm2 | |
2443 | ___ | |
2444 | ######################################################################## | |
2445 | # calculate mask by comparing 0..31 to index and save result to stack | |
2446 | # | |
2447 | $code.=<<___; | |
2448 | .byte 0x67 | |
2449 | paddd %xmm0,%xmm1 | |
2450 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 | |
2451 | movdqa %xmm4,%xmm3 | |
2452 | ___ | |
2453 | for($i=0;$i<$STRIDE/16-4;$i+=4) { | |
2454 | $code.=<<___; | |
2455 | paddd %xmm1,%xmm2 | |
2456 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | |
2457 | movdqa %xmm0,`16*($i+0)+112`(%r10) | |
2458 | movdqa %xmm4,%xmm0 | |
2459 | ||
2460 | paddd %xmm2,%xmm3 | |
2461 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | |
2462 | movdqa %xmm1,`16*($i+1)+112`(%r10) | |
2463 | movdqa %xmm4,%xmm1 | |
2464 | ||
2465 | paddd %xmm3,%xmm0 | |
2466 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | |
2467 | movdqa %xmm2,`16*($i+2)+112`(%r10) | |
2468 | movdqa %xmm4,%xmm2 | |
2469 | ||
2470 | paddd %xmm0,%xmm1 | |
2471 | pcmpeqd %xmm5,%xmm0 | |
2472 | movdqa %xmm3,`16*($i+3)+112`(%r10) | |
2473 | movdqa %xmm4,%xmm3 | |
2474 | ___ | |
2475 | } | |
2476 | $code.=<<___; # last iteration can be optimized | |
2477 | .byte 0x67 | |
2478 | paddd %xmm1,%xmm2 | |
2479 | pcmpeqd %xmm5,%xmm1 | |
2480 | movdqa %xmm0,`16*($i+0)+112`(%r10) | |
2481 | ||
2482 | paddd %xmm2,%xmm3 | |
2483 | pcmpeqd %xmm5,%xmm2 | |
2484 | movdqa %xmm1,`16*($i+1)+112`(%r10) | |
2485 | ||
2486 | pcmpeqd %xmm5,%xmm3 | |
2487 | movdqa %xmm2,`16*($i+2)+112`(%r10) | |
2488 | ||
2489 | pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register | |
2490 | pand `16*($i+1)-128`($bptr),%xmm1 | |
2491 | pand `16*($i+2)-128`($bptr),%xmm2 | |
2492 | movdqa %xmm3,`16*($i+3)+112`(%r10) | |
2493 | pand `16*($i+3)-128`($bptr),%xmm3 | |
2494 | por %xmm2,%xmm0 | |
2495 | por %xmm3,%xmm1 | |
2496 | ___ | |
2497 | for($i=0;$i<$STRIDE/16-4;$i+=4) { | |
2498 | $code.=<<___; | |
2499 | movdqa `16*($i+0)-128`($bptr),%xmm4 | |
2500 | movdqa `16*($i+1)-128`($bptr),%xmm5 | |
2501 | movdqa `16*($i+2)-128`($bptr),%xmm2 | |
2502 | pand `16*($i+0)+112`(%r10),%xmm4 | |
2503 | movdqa `16*($i+3)-128`($bptr),%xmm3 | |
2504 | pand `16*($i+1)+112`(%r10),%xmm5 | |
2505 | por %xmm4,%xmm0 | |
2506 | pand `16*($i+2)+112`(%r10),%xmm2 | |
2507 | por %xmm5,%xmm1 | |
2508 | pand `16*($i+3)+112`(%r10),%xmm3 | |
2509 | por %xmm2,%xmm0 | |
2510 | por %xmm3,%xmm1 | |
2511 | ___ | |
2512 | } | |
2513 | $code.=<<___; | |
2514 | pxor %xmm1,%xmm0 | |
2515 | pshufd \$0x4e,%xmm0,%xmm1 | |
2516 | por %xmm1,%xmm0 | |
2517 | lea $STRIDE($bptr),$bptr | |
ec9cc70f | 2518 | movq %xmm0,%rdx # bp[0] |
8fc8f486 | 2519 | lea 64+8*4+8(%rsp),$tptr |
ec9cc70f AP |
2520 | |
2521 | mov %rdx,$bi | |
2522 | mulx 0*8($aptr),$mi,%rax # a[0]*b[0] | |
2523 | mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] | |
2524 | add %rax,%r11 | |
2525 | mulx 2*8($aptr),%rax,%r13 # ... | |
2526 | adc %rax,%r12 | |
2527 | adc \$0,%r13 | |
2528 | mulx 3*8($aptr),%rax,%r14 | |
2529 | ||
2530 | mov $mi,%r15 | |
2531 | imulq 32+8(%rsp),$mi # "t[0]"*n0 | |
2532 | xor $zero,$zero # cf=0, of=0 | |
2533 | mov $mi,%rdx | |
2534 | ||
ec9cc70f | 2535 | mov $bptr,8+8(%rsp) # off-load &b[i] |
ec9cc70f | 2536 | |
8fc8f486 | 2537 | lea 4*8($aptr),$aptr |
ec9cc70f AP |
2538 | adcx %rax,%r13 |
2539 | adcx $zero,%r14 # cf=0 | |
2540 | ||
8fc8f486 | 2541 | mulx 0*8($nptr),%rax,%r10 |
ec9cc70f AP |
2542 | adcx %rax,%r15 # discarded |
2543 | adox %r11,%r10 | |
8fc8f486 | 2544 | mulx 1*8($nptr),%rax,%r11 |
ec9cc70f AP |
2545 | adcx %rax,%r10 |
2546 | adox %r12,%r11 | |
8fc8f486 | 2547 | mulx 2*8($nptr),%rax,%r12 |
ec9cc70f | 2548 | mov 24+8(%rsp),$bptr # counter value |
ec9cc70f AP |
2549 | mov %r10,-8*4($tptr) |
2550 | adcx %rax,%r11 | |
2551 | adox %r13,%r12 | |
8fc8f486 | 2552 | mulx 3*8($nptr),%rax,%r15 |
ec9cc70f AP |
2553 | mov $bi,%rdx |
2554 | mov %r11,-8*3($tptr) | |
2555 | adcx %rax,%r12 | |
2556 | adox $zero,%r15 # of=0 | |
8fc8f486 | 2557 | lea 4*8($nptr),$nptr |
ec9cc70f | 2558 | mov %r12,-8*2($tptr) |
8fc8f486 | 2559 | jmp .Lmulx4x_1st |
ec9cc70f AP |
2560 | |
2561 | .align 32 | |
2562 | .Lmulx4x_1st: | |
2563 | adcx $zero,%r15 # cf=0, modulo-scheduled | |
2564 | mulx 0*8($aptr),%r10,%rax # a[4]*b[0] | |
2565 | adcx %r14,%r10 | |
2566 | mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] | |
2567 | adcx %rax,%r11 | |
2568 | mulx 2*8($aptr),%r12,%rax # ... | |
a5bb5bca AP |
2569 | adcx %r14,%r12 |
2570 | mulx 3*8($aptr),%r13,%r14 | |
ec9cc70f | 2571 | .byte 0x67,0x67 |
a5bb5bca AP |
2572 | mov $mi,%rdx |
2573 | adcx %rax,%r13 | |
2574 | adcx $zero,%r14 # cf=0 | |
2575 | lea 4*8($aptr),$aptr | |
2576 | lea 4*8($tptr),$tptr | |
2577 | ||
2578 | adox %r15,%r10 | |
8fc8f486 | 2579 | mulx 0*8($nptr),%rax,%r15 |
a5bb5bca AP |
2580 | adcx %rax,%r10 |
2581 | adox %r15,%r11 | |
8fc8f486 | 2582 | mulx 1*8($nptr),%rax,%r15 |
a5bb5bca AP |
2583 | adcx %rax,%r11 |
2584 | adox %r15,%r12 | |
8fc8f486 | 2585 | mulx 2*8($nptr),%rax,%r15 |
a5bb5bca | 2586 | mov %r10,-5*8($tptr) |
a5bb5bca | 2587 | adcx %rax,%r12 |
ec9cc70f | 2588 | mov %r11,-4*8($tptr) |
a5bb5bca | 2589 | adox %r15,%r13 |
8fc8f486 | 2590 | mulx 3*8($nptr),%rax,%r15 |
a5bb5bca AP |
2591 | mov $bi,%rdx |
2592 | mov %r12,-3*8($tptr) | |
2593 | adcx %rax,%r13 | |
2594 | adox $zero,%r15 | |
8fc8f486 | 2595 | lea 4*8($nptr),$nptr |
a5bb5bca AP |
2596 | mov %r13,-2*8($tptr) |
2597 | ||
2598 | dec $bptr # of=0, pass cf | |
2599 | jnz .Lmulx4x_1st | |
2600 | ||
ec9cc70f | 2601 | mov 8(%rsp),$num # load -num |
a5bb5bca | 2602 | adc $zero,%r15 # modulo-scheduled |
ec9cc70f | 2603 | lea ($aptr,$num),$aptr # rewind $aptr |
a5bb5bca | 2604 | add %r15,%r14 |
ec9cc70f AP |
2605 | mov 8+8(%rsp),$bptr # re-load &b[i] |
2606 | adc $zero,$zero # top-most carry | |
a5bb5bca AP |
2607 | mov %r14,-1*8($tptr) |
2608 | jmp .Lmulx4x_outer | |
2609 | ||
2610 | .align 32 | |
2611 | .Lmulx4x_outer: | |
8fc8f486 AP |
2612 | lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) |
2613 | pxor %xmm4,%xmm4 | |
2614 | .byte 0x67,0x67 | |
2615 | pxor %xmm5,%xmm5 | |
2616 | ___ | |
2617 | for($i=0;$i<$STRIDE/16;$i+=4) { | |
2618 | $code.=<<___; | |
2619 | movdqa `16*($i+0)-128`($bptr),%xmm0 | |
2620 | movdqa `16*($i+1)-128`($bptr),%xmm1 | |
2621 | movdqa `16*($i+2)-128`($bptr),%xmm2 | |
2622 | pand `16*($i+0)+256`(%r10),%xmm0 | |
2623 | movdqa `16*($i+3)-128`($bptr),%xmm3 | |
2624 | pand `16*($i+1)+256`(%r10),%xmm1 | |
2625 | por %xmm0,%xmm4 | |
2626 | pand `16*($i+2)+256`(%r10),%xmm2 | |
2627 | por %xmm1,%xmm5 | |
2628 | pand `16*($i+3)+256`(%r10),%xmm3 | |
2629 | por %xmm2,%xmm4 | |
2630 | por %xmm3,%xmm5 | |
2631 | ___ | |
2632 | } | |
2633 | $code.=<<___; | |
2634 | por %xmm5,%xmm4 | |
2635 | pshufd \$0x4e,%xmm4,%xmm0 | |
2636 | por %xmm4,%xmm0 | |
2637 | lea $STRIDE($bptr),$bptr | |
2638 | movq %xmm0,%rdx # m0=bp[i] | |
2639 | ||
ec9cc70f AP |
2640 | mov $zero,($tptr) # save top-most carry |
2641 | lea 4*8($tptr,$num),$tptr # rewind $tptr | |
2642 | mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] | |
a5bb5bca AP |
2643 | xor $zero,$zero # cf=0, of=0 |
2644 | mov %rdx,$bi | |
ec9cc70f AP |
2645 | mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] |
2646 | adox -4*8($tptr),$mi # +t[0] | |
2647 | adcx %r14,%r11 | |
2648 | mulx 2*8($aptr),%r15,%r13 # ... | |
2649 | adox -3*8($tptr),%r11 | |
2650 | adcx %r15,%r12 | |
2651 | mulx 3*8($aptr),%rdx,%r14 | |
2652 | adox -2*8($tptr),%r12 | |
2653 | adcx %rdx,%r13 | |
8fc8f486 | 2654 | lea ($nptr,$num),$nptr # rewind $nptr |
ec9cc70f AP |
2655 | lea 4*8($aptr),$aptr |
2656 | adox -1*8($tptr),%r13 | |
2657 | adcx $zero,%r14 | |
2658 | adox $zero,%r14 | |
2659 | ||
ec9cc70f AP |
2660 | mov $mi,%r15 |
2661 | imulq 32+8(%rsp),$mi # "t[0]"*n0 | |
a5bb5bca | 2662 | |
ec9cc70f | 2663 | mov $mi,%rdx |
a5bb5bca | 2664 | xor $zero,$zero # cf=0, of=0 |
ec9cc70f | 2665 | mov $bptr,8+8(%rsp) # off-load &b[i] |
a5bb5bca | 2666 | |
8fc8f486 | 2667 | mulx 0*8($nptr),%rax,%r10 |
ec9cc70f | 2668 | adcx %rax,%r15 # discarded |
a5bb5bca | 2669 | adox %r11,%r10 |
8fc8f486 | 2670 | mulx 1*8($nptr),%rax,%r11 |
a5bb5bca AP |
2671 | adcx %rax,%r10 |
2672 | adox %r12,%r11 | |
8fc8f486 | 2673 | mulx 2*8($nptr),%rax,%r12 |
a5bb5bca AP |
2674 | adcx %rax,%r11 |
2675 | adox %r13,%r12 | |
8fc8f486 | 2676 | mulx 3*8($nptr),%rax,%r15 |
a5bb5bca | 2677 | mov $bi,%rdx |
ec9cc70f AP |
2678 | mov 24+8(%rsp),$bptr # counter value |
2679 | mov %r10,-8*4($tptr) | |
a5bb5bca | 2680 | adcx %rax,%r12 |
ec9cc70f | 2681 | mov %r11,-8*3($tptr) |
a5bb5bca | 2682 | adox $zero,%r15 # of=0 |
ec9cc70f | 2683 | mov %r12,-8*2($tptr) |
8fc8f486 | 2684 | lea 4*8($nptr),$nptr |
a5bb5bca AP |
2685 | jmp .Lmulx4x_inner |
2686 | ||
2687 | .align 32 | |
2688 | .Lmulx4x_inner: | |
a5bb5bca | 2689 | mulx 0*8($aptr),%r10,%rax # a[4]*b[i] |
ec9cc70f AP |
2690 | adcx $zero,%r15 # cf=0, modulo-scheduled |
2691 | adox %r14,%r10 | |
a5bb5bca | 2692 | mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] |
ec9cc70f | 2693 | adcx 0*8($tptr),%r10 |
a5bb5bca AP |
2694 | adox %rax,%r11 |
2695 | mulx 2*8($aptr),%r12,%rax # ... | |
ec9cc70f | 2696 | adcx 1*8($tptr),%r11 |
a5bb5bca AP |
2697 | adox %r14,%r12 |
2698 | mulx 3*8($aptr),%r13,%r14 | |
2699 | mov $mi,%rdx | |
2700 | adcx 2*8($tptr),%r12 | |
2701 | adox %rax,%r13 | |
2702 | adcx 3*8($tptr),%r13 | |
2703 | adox $zero,%r14 # of=0 | |
2704 | lea 4*8($aptr),$aptr | |
ec9cc70f | 2705 | lea 4*8($tptr),$tptr |
a5bb5bca AP |
2706 | adcx $zero,%r14 # cf=0 |
2707 | ||
2708 | adox %r15,%r10 | |
8fc8f486 | 2709 | mulx 0*8($nptr),%rax,%r15 |
a5bb5bca AP |
2710 | adcx %rax,%r10 |
2711 | adox %r15,%r11 | |
8fc8f486 | 2712 | mulx 1*8($nptr),%rax,%r15 |
a5bb5bca AP |
2713 | adcx %rax,%r11 |
2714 | adox %r15,%r12 | |
8fc8f486 | 2715 | mulx 2*8($nptr),%rax,%r15 |
a5bb5bca | 2716 | mov %r10,-5*8($tptr) |
a5bb5bca AP |
2717 | adcx %rax,%r12 |
2718 | adox %r15,%r13 | |
a5bb5bca | 2719 | mov %r11,-4*8($tptr) |
8fc8f486 | 2720 | mulx 3*8($nptr),%rax,%r15 |
ec9cc70f | 2721 | mov $bi,%rdx |
8fc8f486 | 2722 | lea 4*8($nptr),$nptr |
a5bb5bca AP |
2723 | mov %r12,-3*8($tptr) |
2724 | adcx %rax,%r13 | |
2725 | adox $zero,%r15 | |
a5bb5bca AP |
2726 | mov %r13,-2*8($tptr) |
2727 | ||
2728 | dec $bptr # of=0, pass cf | |
2729 | jnz .Lmulx4x_inner | |
2730 | ||
ec9cc70f | 2731 | mov 0+8(%rsp),$num # load -num |
a5bb5bca | 2732 | adc $zero,%r15 # modulo-scheduled |
ec9cc70f AP |
2733 | sub 0*8($tptr),$bptr # pull top-most carry to %cf |
2734 | mov 8+8(%rsp),$bptr # re-load &b[i] | |
2735 | mov 16+8(%rsp),%r10 | |
a5bb5bca | 2736 | adc %r15,%r14 |
ec9cc70f AP |
2737 | lea ($aptr,$num),$aptr # rewind $aptr |
2738 | adc $zero,$zero # top-most carry | |
a5bb5bca AP |
2739 | mov %r14,-1*8($tptr) |
2740 | ||
ec9cc70f | 2741 | cmp %r10,$bptr |
a5bb5bca AP |
2742 | jb .Lmulx4x_outer |
2743 | ||
8fc8f486 AP |
2744 | mov -8($nptr),%r10 |
2745 | mov $zero,%r8 | |
2746 | mov ($nptr,$num),%r12 | |
2747 | lea ($nptr,$num),%rbp # rewind $nptr | |
2748 | mov $num,%rcx | |
2749 | lea ($tptr,$num),%rdi # rewind $tptr | |
2750 | xor %eax,%eax | |
ec9cc70f AP |
2751 | xor %r15,%r15 |
2752 | sub %r14,%r10 # compare top-most words | |
2753 | adc %r15,%r15 | |
8fc8f486 AP |
2754 | or %r15,%r8 |
2755 | sar \$3+2,%rcx | |
2756 | sub %r8,%rax # %rax=-%r8 | |
ec9cc70f | 2757 | mov 56+8(%rsp),%rdx # restore rp |
8fc8f486 AP |
2758 | dec %r12 # so that after 'not' we get -n[0] |
2759 | mov 8*1(%rbp),%r13 | |
2760 | xor %r8,%r8 | |
2761 | mov 8*2(%rbp),%r14 | |
2762 | mov 8*3(%rbp),%r15 | |
2763 | jmp .Lsqrx4x_sub_entry # common post-condition | |
0190c52a | 2764 | .cfi_endproc |
ec9cc70f AP |
2765 | .size mulx4x_internal,.-mulx4x_internal |
2766 | ___ | |
2767 | }\f{ | |
2768 | ###################################################################### | |
2769 | # void bn_power5( | |
2770 | my $rptr="%rdi"; # BN_ULONG *rptr, | |
2771 | my $aptr="%rsi"; # const BN_ULONG *aptr, | |
2772 | my $bptr="%rdx"; # const void *table, | |
2773 | my $nptr="%rcx"; # const BN_ULONG *nptr, | |
2774 | my $n0 ="%r8"; # const BN_ULONG *n0, | |
2775 | my $num ="%r9"; # int num, has to be divisible by 8 | |
2776 | # int pwr); | |
2777 | ||
2778 | my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); | |
2779 | my @A0=("%r10","%r11"); | |
2780 | my @A1=("%r12","%r13"); | |
2781 | my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); | |
2782 | ||
2783 | $code.=<<___; | |
2784 | .type bn_powerx5,\@function,6 | |
2785 | .align 32 | |
2786 | bn_powerx5: | |
76e624a0 | 2787 | .cfi_startproc |
ec9cc70f | 2788 | mov %rsp,%rax |
76e624a0 | 2789 | .cfi_def_cfa_register %rax |
3ba1ef82 | 2790 | .Lpowerx5_enter: |
ec9cc70f | 2791 | push %rbx |
76e624a0 | 2792 | .cfi_push %rbx |
ec9cc70f | 2793 | push %rbp |
76e624a0 | 2794 | .cfi_push %rbp |
ec9cc70f | 2795 | push %r12 |
76e624a0 | 2796 | .cfi_push %r12 |
ec9cc70f | 2797 | push %r13 |
76e624a0 | 2798 | .cfi_push %r13 |
ec9cc70f | 2799 | push %r14 |
76e624a0 | 2800 | .cfi_push %r14 |
ec9cc70f | 2801 | push %r15 |
76e624a0 | 2802 | .cfi_push %r15 |
3ba1ef82 | 2803 | .Lpowerx5_prologue: |
8fc8f486 | 2804 | |
ec9cc70f | 2805 | shl \$3,${num}d # convert $num to bytes |
8fc8f486 | 2806 | lea ($num,$num,2),%r10 # 3*$num in bytes |
a5bb5bca | 2807 | neg $num |
ec9cc70f | 2808 | mov ($n0),$n0 # *n0 |
a5bb5bca | 2809 | |
ec9cc70f | 2810 | ############################################################## |
8fc8f486 AP |
2811 | # Ensure that stack frame doesn't alias with $rptr+3*$num |
2812 | # modulo 4096, which covers ret[num], am[num] and n[num] | |
2813 | # (see bn_exp.c). This is done to allow memory disambiguation | |
2814 | # logic to do its magic. [Extra 256 bytes is for power mask | |
2815 | # calculated from 7th argument, the index.] | |
ec9cc70f | 2816 | # |
8fc8f486 | 2817 | lea -320(%rsp,$num,2),%r11 |
3ba1ef82 | 2818 | mov %rsp,%rbp |
8fc8f486 | 2819 | sub $rptr,%r11 |
ec9cc70f AP |
2820 | and \$4095,%r11 |
2821 | cmp %r11,%r10 | |
2822 | jb .Lpwrx_sp_alt | |
3ba1ef82 AP |
2823 | sub %r11,%rbp # align with $aptr |
2824 | lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) | |
ec9cc70f AP |
2825 | jmp .Lpwrx_sp_done |
2826 | ||
2827 | .align 32 | |
2828 | .Lpwrx_sp_alt: | |
8fc8f486 | 2829 | lea 4096-320(,$num,2),%r10 |
3ba1ef82 | 2830 | lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) |
ec9cc70f AP |
2831 | sub %r10,%r11 |
2832 | mov \$0,%r10 | |
2833 | cmovc %r10,%r11 | |
3ba1ef82 | 2834 | sub %r11,%rbp |
ec9cc70f | 2835 | .Lpwrx_sp_done: |
3ba1ef82 AP |
2836 | and \$-64,%rbp |
2837 | mov %rsp,%r11 | |
2838 | sub %rbp,%r11 | |
adc4f1fc | 2839 | and \$-4096,%r11 |
3ba1ef82 AP |
2840 | lea (%rbp,%r11),%rsp |
2841 | mov (%rsp),%r10 | |
2842 | cmp %rbp,%rsp | |
2843 | ja .Lpwrx_page_walk | |
2844 | jmp .Lpwrx_page_walk_done | |
2845 | ||
adc4f1fc | 2846 | .Lpwrx_page_walk: |
3ba1ef82 AP |
2847 | lea -4096(%rsp),%rsp |
2848 | mov (%rsp),%r10 | |
2849 | cmp %rbp,%rsp | |
2850 | ja .Lpwrx_page_walk | |
2851 | .Lpwrx_page_walk_done: | |
adc4f1fc | 2852 | |
609b0852 | 2853 | mov $num,%r10 |
ec9cc70f AP |
2854 | neg $num |
2855 | ||
2856 | ############################################################## | |
2857 | # Stack layout | |
2858 | # | |
2859 | # +0 saved $num, used in reduction section | |
2860 | # +8 &t[2*$num], used in reduction section | |
2861 | # +16 intermediate carry bit | |
2862 | # +24 top-most carry bit, used in reduction section | |
2863 | # +32 saved *n0 | |
2864 | # +40 saved %rsp | |
2865 | # +48 t[2*$num] | |
2866 | # | |
a5bb5bca | 2867 | pxor %xmm0,%xmm0 |
ec9cc70f AP |
2868 | movq $rptr,%xmm1 # save $rptr |
2869 | movq $nptr,%xmm2 # save $nptr | |
2870 | movq %r10, %xmm3 # -$num | |
2871 | movq $bptr,%xmm4 | |
2872 | mov $n0, 32(%rsp) | |
2873 | mov %rax, 40(%rsp) # save original %rsp | |
76e624a0 | 2874 | .cfi_cfa_expression %rsp+40,deref,+8 |
ec9cc70f AP |
2875 | .Lpowerx5_body: |
2876 | ||
2877 | call __bn_sqrx8x_internal | |
317be638 | 2878 | call __bn_postx4x_internal |
ec9cc70f | 2879 | call __bn_sqrx8x_internal |
317be638 | 2880 | call __bn_postx4x_internal |
ec9cc70f | 2881 | call __bn_sqrx8x_internal |
317be638 | 2882 | call __bn_postx4x_internal |
ec9cc70f | 2883 | call __bn_sqrx8x_internal |
317be638 | 2884 | call __bn_postx4x_internal |
ec9cc70f | 2885 | call __bn_sqrx8x_internal |
317be638 | 2886 | call __bn_postx4x_internal |
ec9cc70f AP |
2887 | |
2888 | mov %r10,$num # -num | |
2889 | mov $aptr,$rptr | |
2890 | movq %xmm2,$nptr | |
2891 | movq %xmm4,$bptr | |
2892 | mov 40(%rsp),%rax | |
2893 | ||
2894 | call mulx4x_internal | |
2895 | ||
2896 | mov 40(%rsp),%rsi # restore %rsp | |
76e624a0 | 2897 | .cfi_def_cfa %rsi,8 |
ec9cc70f | 2898 | mov \$1,%rax |
8fc8f486 | 2899 | |
ec9cc70f | 2900 | mov -48(%rsi),%r15 |
76e624a0 | 2901 | .cfi_restore %r15 |
ec9cc70f | 2902 | mov -40(%rsi),%r14 |
76e624a0 | 2903 | .cfi_restore %r14 |
ec9cc70f | 2904 | mov -32(%rsi),%r13 |
76e624a0 | 2905 | .cfi_restore %r13 |
ec9cc70f | 2906 | mov -24(%rsi),%r12 |
76e624a0 | 2907 | .cfi_restore %r12 |
ec9cc70f | 2908 | mov -16(%rsi),%rbp |
76e624a0 | 2909 | .cfi_restore %rbp |
ec9cc70f | 2910 | mov -8(%rsi),%rbx |
76e624a0 | 2911 | .cfi_restore %rbx |
ec9cc70f | 2912 | lea (%rsi),%rsp |
76e624a0 | 2913 | .cfi_def_cfa_register %rsp |
ec9cc70f AP |
2914 | .Lpowerx5_epilogue: |
2915 | ret | |
76e624a0 | 2916 | .cfi_endproc |
ec9cc70f | 2917 | .size bn_powerx5,.-bn_powerx5 |
a5bb5bca | 2918 | |
ec9cc70f AP |
2919 | .globl bn_sqrx8x_internal |
2920 | .hidden bn_sqrx8x_internal | |
2921 | .type bn_sqrx8x_internal,\@abi-omnipotent | |
a5bb5bca | 2922 | .align 32 |
ec9cc70f AP |
2923 | bn_sqrx8x_internal: |
2924 | __bn_sqrx8x_internal: | |
c0e8e500 | 2925 | .cfi_startproc |
# Shared body, reached by call: every frame slot is addressed as
# N+8(%rsp), i.e. the caller's slot N moved up by the return address.
# Falls through from the squaring part into the reduction part and
# returns from the end of the latter.
ec9cc70f AP |
2926 | ################################################################## |
2927 | # Squaring part: | |
2928 | # | |
2929 | # a) multiply-n-add everything but a[i]*a[i]; | |
2930 | # b) shift result of a) by 1 to the left and accumulate | |
2931 | # a[i]*a[i] products; | |
2932 | # | |
2933 | ################################################################## | |
2934 | # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] | |
2935 | # a[1]a[0] | |
2936 | # a[2]a[0] | |
2937 | # a[3]a[0] | |
2938 | # a[2]a[1] | |
2939 | # a[3]a[1] | |
2940 | # a[3]a[2] | |
2941 | # | |
2942 | # a[4]a[0] | |
2943 | # a[5]a[0] | |
2944 | # a[6]a[0] | |
2945 | # a[7]a[0] | |
2946 | # a[4]a[1] | |
2947 | # a[5]a[1] | |
2948 | # a[6]a[1] | |
2949 | # a[7]a[1] | |
2950 | # a[4]a[2] | |
2951 | # a[5]a[2] | |
2952 | # a[6]a[2] | |
2953 | # a[7]a[2] | |
2954 | # a[4]a[3] | |
2955 | # a[5]a[3] | |
2956 | # a[6]a[3] | |
2957 | # a[7]a[3] | |
2958 | # | |
2959 | # a[5]a[4] | |
2960 | # a[6]a[4] | |
2961 | # a[7]a[4] | |
2962 | # a[6]a[5] | |
2963 | # a[7]a[5] | |
2964 | # a[7]a[6] | |
2965 | # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] | |
2966 | ___ | |
2967 | { | |
# $aaptr aliases $zero (both %rbp): the register serves as a zero constant
# in the outer loop and as a second a[] pointer in the inner loop.
2968 | my ($zero,$carry)=("%rbp","%rcx"); | |
2969 | my $aaptr=$zero; | |
2970 | $code.=<<___; | |
2971 | lea 48+8(%rsp),$tptr | |
2972 | lea ($aptr,$num),$aaptr | |
2973 | mov $num,0+8(%rsp) # save $num | |
2974 | mov $aaptr,8+8(%rsp) # save end of $aptr | |
# Clear t[] with %xmm0 (zeroed by the visible caller, bn_powerx5). The
# loop is entered at its midpoint, so the first pass stores only at
# offsets 8*8..14*8 from the initial pointer.
2975 | jmp .Lsqr8x_zero_start | |
2976 | ||
2977 | .align 32 | |
2978 | .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 | |
2979 | .Lsqrx8x_zero: | |
2980 | .byte 0x3e | |
2981 | movdqa %xmm0,0*8($tptr) | |
2982 | movdqa %xmm0,2*8($tptr) | |
2983 | movdqa %xmm0,4*8($tptr) | |
2984 | movdqa %xmm0,6*8($tptr) | |
2985 | .Lsqr8x_zero_start: # aligned at 32 | |
2986 | movdqa %xmm0,8*8($tptr) | |
2987 | movdqa %xmm0,10*8($tptr) | |
2988 | movdqa %xmm0,12*8($tptr) | |
2989 | movdqa %xmm0,14*8($tptr) | |
2990 | lea 16*8($tptr),$tptr | |
2991 | sub \$64,$num | |
2992 | jnz .Lsqrx8x_zero | |
2993 | ||
2994 | mov 0*8($aptr),%rdx # a[0], modulo-scheduled | |
2995 | #xor %r9,%r9 # t[1], ex-$num, zero already | |
2996 | xor %r10,%r10 | |
2997 | xor %r11,%r11 | |
2998 | xor %r12,%r12 | |
2999 | xor %r13,%r13 | |
3000 | xor %r14,%r14 | |
3001 | xor %r15,%r15 | |
3002 | lea 48+8(%rsp),$tptr | |
3003 | xor $zero,$zero # cf=0, of=0 | |
3004 | jmp .Lsqrx8x_outer_loop | |
3005 | ||
3006 | .align 32 | |
3007 | .Lsqrx8x_outer_loop: | |
# Cross products inside the current 8-limb window of a[]: for each
# a[i] held in %rdx, mulx forms a[j]*a[i] for j>i and the results are
# added into t[] through two independent carry chains, adcx on CF and
# adox on OF. Each chain is terminated by adding the zero register,
# which leaves CF and OF clear for the next row.
3008 | mulx 1*8($aptr),%r8,%rax # a[1]*a[0] | |
3009 | adcx %r9,%r8 # a[1]*a[0]+=t[1] | |
3010 | adox %rax,%r10 | |
3011 | mulx 2*8($aptr),%r9,%rax # a[2]*a[0] | |
3012 | adcx %r10,%r9 | |
3013 | adox %rax,%r11 | |
3014 | .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... | |
3015 | adcx %r11,%r10 | |
3016 | adox %rax,%r12 | |
3017 | .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax | |
3018 | adcx %r12,%r11 | |
3019 | adox %rax,%r13 | |
3020 | mulx 5*8($aptr),%r12,%rax | |
3021 | adcx %r13,%r12 | |
3022 | adox %rax,%r14 | |
3023 | mulx 6*8($aptr),%r13,%rax | |
3024 | adcx %r14,%r13 | |
3025 | adox %r15,%rax | |
3026 | mulx 7*8($aptr),%r14,%r15 | |
3027 | mov 1*8($aptr),%rdx # a[1] | |
3028 | adcx %rax,%r14 | |
3029 | adox $zero,%r15 | |
3030 | adc 8*8($tptr),%r15 | |
3031 | mov %r8,1*8($tptr) # t[1] | |
3032 | mov %r9,2*8($tptr) # t[2] | |
3033 | sbb $carry,$carry # mov %cf,$carry | |
3034 | xor $zero,$zero # cf=0, of=0 | |
3035 | ||
3036 | ||
3037 | mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] | |
3038 | mulx 3*8($aptr),%r9,%rax # a[3]*a[1] | |
3039 | adcx %r10,%r8 | |
3040 | adox %rbx,%r9 | |
3041 | mulx 4*8($aptr),%r10,%rbx # ... | |
3042 | adcx %r11,%r9 | |
3043 | adox %rax,%r10 | |
3044 | .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax | |
3045 | adcx %r12,%r10 | |
3046 | adox %rbx,%r11 | |
3047 | .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx | |
3048 | adcx %r13,%r11 | |
3049 | adox %r14,%r12 | |
3050 | .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 | |
3051 | mov 2*8($aptr),%rdx # a[2] | |
3052 | adcx %rax,%r12 | |
3053 | adox %rbx,%r13 | |
3054 | adcx %r15,%r13 | |
3055 | adox $zero,%r14 # of=0 | |
3056 | adcx $zero,%r14 # cf=0 | |
3057 | ||
3058 | mov %r8,3*8($tptr) # t[3] | |
3059 | mov %r9,4*8($tptr) # t[4] | |
3060 | ||
3061 | mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] | |
3062 | mulx 4*8($aptr),%r9,%rax # a[4]*a[2] | |
3063 | adcx %r10,%r8 | |
3064 | adox %rbx,%r9 | |
3065 | mulx 5*8($aptr),%r10,%rbx # ... | |
3066 | adcx %r11,%r9 | |
3067 | adox %rax,%r10 | |
3068 | .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax | |
3069 | adcx %r12,%r10 | |
3070 | adox %r13,%r11 | |
3071 | .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 | |
3072 | .byte 0x3e | |
3073 | mov 3*8($aptr),%rdx # a[3] | |
3074 | adcx %rbx,%r11 | |
3075 | adox %rax,%r12 | |
3076 | adcx %r14,%r12 | |
3077 | mov %r8,5*8($tptr) # t[5] | |
3078 | mov %r9,6*8($tptr) # t[6] | |
3079 | mulx 4*8($aptr),%r8,%rax # a[4]*a[3] | |
3080 | adox $zero,%r13 # of=0 | |
3081 | adcx $zero,%r13 # cf=0 | |
3082 | ||
3083 | mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] | |
3084 | adcx %r10,%r8 | |
3085 | adox %rax,%r9 | |
3086 | mulx 6*8($aptr),%r10,%rax # ... | |
3087 | adcx %r11,%r9 | |
3088 | adox %r12,%r10 | |
3089 | mulx 7*8($aptr),%r11,%r12 | |
3090 | mov 4*8($aptr),%rdx # a[4] | |
3091 | mov 5*8($aptr),%r14 # a[5] | |
3092 | adcx %rbx,%r10 | |
3093 | adox %rax,%r11 | |
3094 | mov 6*8($aptr),%r15 # a[6] | |
3095 | adcx %r13,%r11 | |
3096 | adox $zero,%r12 # of=0 | |
3097 | adcx $zero,%r12 # cf=0 | |
3098 | ||
3099 | mov %r8,7*8($tptr) # t[7] | |
3100 | mov %r9,8*8($tptr) # t[8] | |
3101 | ||
3102 | mulx %r14,%r9,%rax # a[5]*a[4] | |
3103 | mov 7*8($aptr),%r8 # a[7] | |
3104 | adcx %r10,%r9 | |
3105 | mulx %r15,%r10,%rbx # a[6]*a[4] | |
3106 | adox %rax,%r10 | |
3107 | adcx %r11,%r10 | |
3108 | mulx %r8,%r11,%rax # a[7]*a[4] | |
3109 | mov %r14,%rdx # a[5] | |
3110 | adox %rbx,%r11 | |
3111 | adcx %r12,%r11 | |
3112 | #adox $zero,%rax # of=0 | |
3113 | adcx $zero,%rax # cf=0 | |
3114 | ||
3115 | mulx %r15,%r14,%rbx # a[6]*a[5] | |
3116 | mulx %r8,%r12,%r13 # a[7]*a[5] | |
3117 | mov %r15,%rdx # a[6] | |
3118 | lea 8*8($aptr),$aptr | |
3119 | adcx %r14,%r11 | |
3120 | adox %rbx,%r12 | |
3121 | adcx %rax,%r12 | |
3122 | adox $zero,%r13 | |
3123 | ||
3124 | .byte 0x67,0x67 | |
3125 | mulx %r8,%r8,%r14 # a[7]*a[6] | |
3126 | adcx %r8,%r13 | |
3127 | adcx $zero,%r14 | |
3128 | ||
3129 | cmp 8+8(%rsp),$aptr | |
3130 | je .Lsqrx8x_outer_break | |
3131 | ||
# More of a[] remains: add t[8..15] into the carried-out limbs, save
# the carry at 16+8(%rsp) and the advanced t pointer at 24+8(%rsp),
# then let the inner loop multiply the later windows (via aaptr) by
# the limbs of this window.
3132 | neg $carry # mov $carry,%cf | |
3133 | mov \$-8,%rcx | |
3134 | mov $zero,%r15 | |
3135 | mov 8*8($tptr),%r8 | |
3136 | adcx 9*8($tptr),%r9 # +=t[9] | |
3137 | adcx 10*8($tptr),%r10 # ... | |
3138 | adcx 11*8($tptr),%r11 | |
3139 | adc 12*8($tptr),%r12 | |
3140 | adc 13*8($tptr),%r13 | |
3141 | adc 14*8($tptr),%r14 | |
3142 | adc 15*8($tptr),%r15 | |
3143 | lea ($aptr),$aaptr | |
3144 | lea 2*64($tptr),$tptr | |
3145 | sbb %rax,%rax # mov %cf,%rax | |
3146 | ||
3147 | mov -64($aptr),%rdx # a[0] | |
3148 | mov %rax,16+8(%rsp) # offload $carry | |
3149 | mov $tptr,24+8(%rsp) | |
3150 | ||
3151 | #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above | |
3152 | xor %eax,%eax # cf=0, of=0 | |
3153 | jmp .Lsqrx8x_loop | |
3154 | ||
3155 | .align 32 | |
3156 | .Lsqrx8x_loop: | |
# Inner loop: %rcx counts -8..-1. Each iteration multiplies the eight
# limbs at aaptr by the multiplicand in %rdx, adds them to %r8-%r15,
# stores the completed low limb relative to the t pointer and loads
# the next multiplicand relative to the a pointer.
3157 | mov %r8,%rbx | |
3158 | mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] | |
3159 | adcx %rax,%rbx # +=t[8] | |
3160 | adox %r9,%r8 | |
3161 | ||
3162 | mulx 1*8($aaptr),%rax,%r9 # ... | |
3163 | adcx %rax,%r8 | |
3164 | adox %r10,%r9 | |
3165 | ||
3166 | mulx 2*8($aaptr),%rax,%r10 | |
3167 | adcx %rax,%r9 | |
3168 | adox %r11,%r10 | |
3169 | ||
3170 | mulx 3*8($aaptr),%rax,%r11 | |
3171 | adcx %rax,%r10 | |
3172 | adox %r12,%r11 | |
3173 | ||
3174 | .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 | |
3175 | adcx %rax,%r11 | |
3176 | adox %r13,%r12 | |
3177 | ||
3178 | mulx 5*8($aaptr),%rax,%r13 | |
3179 | adcx %rax,%r12 | |
3180 | adox %r14,%r13 | |
3181 | ||
3182 | mulx 6*8($aaptr),%rax,%r14 | |
3183 | mov %rbx,($tptr,%rcx,8) # store t[8+i] | |
3184 | mov \$0,%ebx | |
3185 | adcx %rax,%r13 | |
3186 | adox %r15,%r14 | |
3187 | ||
3188 | .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 | |
3189 | mov 8($aptr,%rcx,8),%rdx # a[i] | |
3190 | adcx %rax,%r14 | |
3191 | adox %rbx,%r15 # %rbx is 0, of=0 | |
3192 | adcx %rbx,%r15 # cf=0 | |
3193 | ||
3194 | .byte 0x67 | |
3195 | inc %rcx # of=0 | |
3196 | jnz .Lsqrx8x_loop | |
3197 | ||
# Window done: advance aaptr and stop once it reaches the end of a[]
# saved at 8+8(%rsp); otherwise add the next eight limbs of t[] and
# go round again.
3198 | lea 8*8($aaptr),$aaptr | |
3199 | mov \$-8,%rcx | |
3200 | cmp 8+8(%rsp),$aaptr # done? | |
3201 | je .Lsqrx8x_break | |
3202 | ||
3203 | sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf | |
3204 | .byte 0x66 | |
3205 | mov -64($aptr),%rdx | |
3206 | adcx 0*8($tptr),%r8 | |
3207 | adcx 1*8($tptr),%r9 | |
a5bb5bca AP |
3208 | adc 2*8($tptr),%r10 |
3209 | adc 3*8($tptr),%r11 | |
ec9cc70f AP |
3210 | adc 4*8($tptr),%r12 |
3211 | adc 5*8($tptr),%r13 | |
3212 | adc 6*8($tptr),%r14 | |
3213 | adc 7*8($tptr),%r15 | |
3214 | lea 8*8($tptr),$tptr | |
3215 | .byte 0x67 | |
3216 | sbb %rax,%rax # mov %cf,%rax | |
3217 | xor %ebx,%ebx # cf=0, of=0 | |
3218 | mov %rax,16+8(%rsp) # offload carry | |
3219 | jmp .Lsqrx8x_loop | |
a5bb5bca | 3220 | |
ec9cc70f AP |
3221 | .align 32 |
3222 | .Lsqrx8x_break: | |
# All windows visited for this row block. %rbx is zero here (cleared
# in the inner loop), so the subtraction below turns the 0/-1 value
# saved by sbb back into CF, which is then propagated through
# %r8-%r15. If the t pointer equals the value saved at 24+8(%rsp) the
# registers are carried straight into the next outer pass; otherwise
# %r9-%r15 are spilled and reloaded from the saved position.
668a709a AP |
3223 | xor $zero,$zero |
3224 | sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf | |
3225 | adcx $zero,%r8 | |
ec9cc70f | 3226 | mov 24+8(%rsp),$carry # initial $tptr, borrow $carry |
668a709a | 3227 | adcx $zero,%r9 |
ec9cc70f | 3228 | mov 0*8($aptr),%rdx # a[8], modulo-scheduled |
668a709a | 3229 | adc \$0,%r10 |
ec9cc70f | 3230 | mov %r8,0*8($tptr) |
668a709a AP |
3231 | adc \$0,%r11 |
3232 | adc \$0,%r12 | |
3233 | adc \$0,%r13 | |
3234 | adc \$0,%r14 | |
3235 | adc \$0,%r15 | |
ec9cc70f AP |
3236 | cmp $carry,$tptr # cf=0, of=0 |
3237 | je .Lsqrx8x_outer_loop | |
3238 | ||
3239 | mov %r9,1*8($tptr) | |
3240 | mov 1*8($carry),%r9 | |
3241 | mov %r10,2*8($tptr) | |
3242 | mov 2*8($carry),%r10 | |
3243 | mov %r11,3*8($tptr) | |
3244 | mov 3*8($carry),%r11 | |
3245 | mov %r12,4*8($tptr) | |
3246 | mov 4*8($carry),%r12 | |
3247 | mov %r13,5*8($tptr) | |
3248 | mov 5*8($carry),%r13 | |
3249 | mov %r14,6*8($tptr) | |
3250 | mov 6*8($carry),%r14 | |
3251 | mov %r15,7*8($tptr) | |
3252 | mov 7*8($carry),%r15 | |
3253 | mov $carry,$tptr | |
3254 | jmp .Lsqrx8x_outer_loop | |
a5bb5bca | 3255 | |
ec9cc70f AP |
3256 | .align 32 |
3257 | .Lsqrx8x_outer_break: | |
# Cross products complete: spill the remaining limbs, then make the
# second pass over t[], doubling it and adding the squares a[i]*a[i].
3258 | mov %r9,9*8($tptr) # t[9] | |
3259 | movq %xmm3,%rcx # -$num | |
3260 | mov %r10,10*8($tptr) # ... | |
3261 | mov %r11,11*8($tptr) | |
3262 | mov %r12,12*8($tptr) | |
3263 | mov %r13,13*8($tptr) | |
3264 | mov %r14,14*8($tptr) | |
3265 | ___ | |
3266 | }\f{ | |
3267 | my $i="%rcx"; | |
3268 | $code.=<<___; | |
3269 | lea 48+8(%rsp),$tptr | |
3270 | mov ($aptr,$i),%rdx # a[0] | |
a5bb5bca | 3271 | |
ec9cc70f AP |
3272 | mov 8($tptr),$A0[1] # t[1] |
3273 | xor $A0[0],$A0[0] # t[0], of=0, cf=0 | |
3274 | mov 0+8(%rsp),$num # restore $num | |
3275 | adox $A0[1],$A0[1] | |
3276 | mov 16($tptr),$A1[0] # t[2] # prefetch | |
3277 | mov 24($tptr),$A1[1] # t[3] # prefetch | |
3278 | #jmp .Lsqrx4x_shift_n_add # happens to be aligned | |
3279 | ||
# adox x,x doubles each limb with the shifted-out bit travelling in OF,
# while adcx adds the two halves of mulx %rdx,... (a[i]*a[i]) with
# carries in CF. The index register counts up from -num in steps of
# 32 and jrcxz leaves the loop when it reaches zero.
3280 | .align 32 | |
3281 | .Lsqrx4x_shift_n_add: | |
3282 | mulx %rdx,%rax,%rbx | |
3283 | adox $A1[0],$A1[0] | |
3284 | adcx $A0[0],%rax | |
3285 | .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch | |
3286 | .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch | |
3287 | adox $A1[1],$A1[1] | |
3288 | adcx $A0[1],%rbx | |
3289 | mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch | |
3290 | mov %rax,0($tptr) | |
3291 | mov %rbx,8($tptr) | |
3292 | ||
3293 | mulx %rdx,%rax,%rbx | |
3294 | adox $A0[0],$A0[0] | |
3295 | adcx $A1[0],%rax | |
3296 | mov 16($aptr,$i),%rdx # a[i+2] # prefetch | |
3297 | mov 48($tptr),$A1[0] # t[2*i+6] # prefetch | |
3298 | adox $A0[1],$A0[1] | |
3299 | adcx $A1[1],%rbx | |
3300 | mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch | |
3301 | mov %rax,16($tptr) | |
3302 | mov %rbx,24($tptr) | |
3303 | ||
3304 | mulx %rdx,%rax,%rbx | |
3305 | adox $A1[0],$A1[0] | |
3306 | adcx $A0[0],%rax | |
3307 | mov 24($aptr,$i),%rdx # a[i+3] # prefetch | |
3308 | lea 32($i),$i | |
3309 | mov 64($tptr),$A0[0] # t[2*i+8] # prefetch | |
3310 | adox $A1[1],$A1[1] | |
3311 | adcx $A0[1],%rbx | |
3312 | mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch | |
3313 | mov %rax,32($tptr) | |
3314 | mov %rbx,40($tptr) | |
3315 | ||
3316 | mulx %rdx,%rax,%rbx | |
3317 | adox $A0[0],$A0[0] | |
3318 | adcx $A1[0],%rax | |
3319 | jrcxz .Lsqrx4x_shift_n_add_break | |
3320 | .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch | |
3321 | adox $A0[1],$A0[1] | |
3322 | adcx $A1[1],%rbx | |
3323 | mov 80($tptr),$A1[0] # t[2*i+10] # prefetch | |
3324 | mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch | |
3325 | mov %rax,48($tptr) | |
3326 | mov %rbx,56($tptr) | |
3327 | lea 64($tptr),$tptr | |
3328 | nop | |
3329 | jmp .Lsqrx4x_shift_n_add | |
3330 | ||
3331 | .align 32 | |
3332 | .Lsqrx4x_shift_n_add_break: | |
3333 | adcx $A1[1],%rbx | |
3334 | mov %rax,48($tptr) | |
3335 | mov %rbx,56($tptr) | |
3336 | lea 64($tptr),$tptr # end of t[] buffer | |
a5bb5bca | 3337 | ___ |
ec9cc70f AP |
3338 | }\f |
3339 | ###################################################################### | |
3340 | # Montgomery reduction part, "word-by-word" algorithm. | |
3341 | # | |
3342 | # This new path is inspired by multiple submissions from Intel, by | |
3343 | # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, | |
3344 | # Vinodh Gopal... | |
3345 | { | |
# Register roles for the reduction part: %rbp becomes the n[] pointer and
# %rsi the zero/carry helper.
3346 | my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); | |
3347 | ||
3348 | $code.=<<___; | |
3349 | movq %xmm2,$nptr | |
317be638 | 3350 | __bn_sqrx8x_reduction: |
# Reduction entry. Frame slots 0+8 and 8+8 are re-purposed here: they
# now hold the end of n[] (less one 8-limb window) and the end of t[].
ec9cc70f AP |
3351 | xor %eax,%eax # initial top-most carry bit |
3352 | mov 32+8(%rsp),%rbx # n0 | |
3353 | mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) | |
8fc8f486 | 3354 | lea -8*8($nptr,$num),%rcx # end of n[] |
ec9cc70f AP |
3355 | #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer |
3356 | mov %rcx, 0+8(%rsp) # save end of n[] | |
3357 | mov $tptr,8+8(%rsp) # save end of t[] | |
3358 | ||
3359 | lea 48+8(%rsp),$tptr # initial t[] window | |
3360 | jmp .Lsqrx8x_reduction_loop | |
3361 | ||
3362 | .align 32 | |
3363 | .Lsqrx8x_reduction_loop: | |
# Outer reduction loop, one 8-limb window of t[] per pass. %rdx
# arrives holding the lowest limb of the window and is multiplied by
# n0 (%rbx) to form the first multiplier.
3364 | mov 8*1($tptr),%r9 | |
3365 | mov 8*2($tptr),%r10 | |
3366 | mov 8*3($tptr),%r11 | |
3367 | mov 8*4($tptr),%r12 | |
3368 | mov %rdx,%r8 | |
3369 | imulq %rbx,%rdx # n0*a[i] | |
3370 | mov 8*5($tptr),%r13 | |
3371 | mov 8*6($tptr),%r14 | |
3372 | mov 8*7($tptr),%r15 | |
3373 | mov %rax,24+8(%rsp) # store top-most carry bit | |
3374 | ||
3375 | lea 8*8($tptr),$tptr | |
3376 | xor $carry,$carry # cf=0,of=0 | |
3377 | mov \$-8,%rcx | |
3378 | jmp .Lsqrx8x_reduce | |
3379 | ||
3380 | .align 32 | |
3381 | .Lsqrx8x_reduce: | |
# Eight iterations (%rcx = -8..-1): add multiplier * n[0..7] to
# %r8-%r15, dropping the low limb; the multiplier is parked in the
# frame for the tail loop and the next one is computed as n0 times
# the new low limb.
3382 | mov %r8, %rbx | |
8fc8f486 | 3383 | mulx 8*0($nptr),%rax,%r8 # n[0] |
ec9cc70f AP |
3384 | adcx %rbx,%rax # discarded |
3385 | adox %r9,%r8 | |
3386 | ||
8fc8f486 | 3387 | mulx 8*1($nptr),%rbx,%r9 # n[1] |
ec9cc70f AP |
3388 | adcx %rbx,%r8 |
3389 | adox %r10,%r9 | |
3390 | ||
8fc8f486 | 3391 | mulx 8*2($nptr),%rbx,%r10 |
ec9cc70f AP |
3392 | adcx %rbx,%r9 |
3393 | adox %r11,%r10 | |
3394 | ||
8fc8f486 | 3395 | mulx 8*3($nptr),%rbx,%r11 |
ec9cc70f AP |
3396 | adcx %rbx,%r10 |
3397 | adox %r12,%r11 | |
3398 | ||
8fc8f486 | 3399 | .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 |
ec9cc70f AP |
3400 | mov %rdx,%rax |
3401 | mov %r8,%rdx | |
3402 | adcx %rbx,%r11 | |
3403 | adox %r13,%r12 | |
3404 | ||
3405 | mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded | |
3406 | mov %rax,%rdx | |
3407 | mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] | |
3408 | ||
8fc8f486 | 3409 | mulx 8*5($nptr),%rax,%r13 |
ec9cc70f AP |
3410 | adcx %rax,%r12 |
3411 | adox %r14,%r13 | |
3412 | ||
8fc8f486 | 3413 | mulx 8*6($nptr),%rax,%r14 |
ec9cc70f AP |
3414 | adcx %rax,%r13 |
3415 | adox %r15,%r14 | |
3416 | ||
8fc8f486 | 3417 | mulx 8*7($nptr),%rax,%r15 |
ec9cc70f AP |
3418 | mov %rbx,%rdx |
3419 | adcx %rax,%r14 | |
3420 | adox $carry,%r15 # $carry is 0 | |
3421 | adcx $carry,%r15 # cf=0 | |
3422 | ||
3423 | .byte 0x67,0x67,0x67 | |
3424 | inc %rcx # of=0 | |
3425 | jnz .Lsqrx8x_reduce | |
3426 | ||
3427 | mov $carry,%rax # xor %rax,%rax | |
3428 | cmp 0+8(%rsp),$nptr # end of n[]? | |
3429 | jae .Lsqrx8x_no_tail | |
3430 | ||
3431 | mov 48+8(%rsp),%rdx # pull n0*a[0] | |
3432 | add 8*0($tptr),%r8 | |
8fc8f486 | 3433 | lea 8*8($nptr),$nptr |
ec9cc70f AP |
3434 | mov \$-8,%rcx |
3435 | adcx 8*1($tptr),%r9 | |
3436 | adcx 8*2($tptr),%r10 | |
3437 | adc 8*3($tptr),%r11 | |
3438 | adc 8*4($tptr),%r12 | |
3439 | adc 8*5($tptr),%r13 | |
3440 | adc 8*6($tptr),%r14 | |
3441 | adc 8*7($tptr),%r15 | |
3442 | lea 8*8($tptr),$tptr | |
3443 | sbb %rax,%rax # top carry | |
3444 | ||
3445 | xor $carry,$carry # of=0, cf=0 | |
3446 | mov %rax,16+8(%rsp) | |
3447 | jmp .Lsqrx8x_tail | |
3448 | ||
3449 | .align 32 | |
3450 | .Lsqrx8x_tail: | |
# Tail: apply the eight parked multipliers (reloaded into %rdx each
# iteration) to the following 8-limb windows of n[], accumulating into
# t[]; repeats until the n pointer reaches the saved end of n[].
3451 | mov %r8,%rbx | |
8fc8f486 | 3452 | mulx 8*0($nptr),%rax,%r8 |
ec9cc70f AP |
3453 | adcx %rax,%rbx |
3454 | adox %r9,%r8 | |
3455 | ||
8fc8f486 | 3456 | mulx 8*1($nptr),%rax,%r9 |
ec9cc70f AP |
3457 | adcx %rax,%r8 |
3458 | adox %r10,%r9 | |
3459 | ||
8fc8f486 | 3460 | mulx 8*2($nptr),%rax,%r10 |
ec9cc70f AP |
3461 | adcx %rax,%r9 |
3462 | adox %r11,%r10 | |
3463 | ||
8fc8f486 | 3464 | mulx 8*3($nptr),%rax,%r11 |
ec9cc70f AP |
3465 | adcx %rax,%r10 |
3466 | adox %r12,%r11 | |
3467 | ||
8fc8f486 | 3468 | .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 |
ec9cc70f AP |
3469 | adcx %rax,%r11 |
3470 | adox %r13,%r12 | |
3471 | ||
8fc8f486 | 3472 | mulx 8*5($nptr),%rax,%r13 |
ec9cc70f AP |
3473 | adcx %rax,%r12 |
3474 | adox %r14,%r13 | |
3475 | ||
8fc8f486 | 3476 | mulx 8*6($nptr),%rax,%r14 |
ec9cc70f AP |
3477 | adcx %rax,%r13 |
3478 | adox %r15,%r14 | |
3479 | ||
8fc8f486 | 3480 | mulx 8*7($nptr),%rax,%r15 |
ec9cc70f AP |
3481 | mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] |
3482 | adcx %rax,%r14 | |
3483 | adox $carry,%r15 | |
3484 | mov %rbx,($tptr,%rcx,8) # save result | |
3485 | mov %r8,%rbx | |
3486 | adcx $carry,%r15 # cf=0 | |
3487 | ||
3488 | inc %rcx # of=0 | |
3489 | jnz .Lsqrx8x_tail | |
3490 | ||
3491 | cmp 0+8(%rsp),$nptr # end of n[]? | |
3492 | jae .Lsqrx8x_tail_done # break out of loop | |
3493 | ||
3494 | sub 16+8(%rsp),$carry # mov 16(%rsp),%cf | |
3495 | mov 48+8(%rsp),%rdx # pull n0*a[0] | |
8fc8f486 | 3496 | lea 8*8($nptr),$nptr |
ec9cc70f AP |
3497 | adc 8*0($tptr),%r8 |
3498 | adc 8*1($tptr),%r9 | |
3499 | adc 8*2($tptr),%r10 | |
3500 | adc 8*3($tptr),%r11 | |
3501 | adc 8*4($tptr),%r12 | |
3502 | adc 8*5($tptr),%r13 | |
3503 | adc 8*6($tptr),%r14 | |
3504 | adc 8*7($tptr),%r15 | |
3505 | lea 8*8($tptr),$tptr | |
3506 | sbb %rax,%rax | |
3507 | sub \$8,%rcx # mov \$-8,%rcx | |
3508 | ||
3509 | xor $carry,$carry # of=0, cf=0 | |
3510 | mov %rax,16+8(%rsp) | |
3511 | jmp .Lsqrx8x_tail | |
3512 | ||
3513 | .align 32 | |
3514 | .Lsqrx8x_tail_done: | |
# Fold in the top-most carry saved at 24+8(%rsp); a carry out of that
# addition is collected in %rax.
3f4bcf5b | 3515 | xor %rax,%rax |
ec9cc70f | 3516 | add 24+8(%rsp),%r8 # can this overflow? |
29851264 AP |
3517 | adc \$0,%r9 |
3518 | adc \$0,%r10 | |
3519 | adc \$0,%r11 | |
3520 | adc \$0,%r12 | |
3521 | adc \$0,%r13 | |
3522 | adc \$0,%r14 | |
3f4bcf5b AP |
3523 | adc \$0,%r15 |
3524 | adc \$0,%rax | |
ec9cc70f AP |
3525 | |
3526 | sub 16+8(%rsp),$carry # mov 16(%rsp),%cf | |
3527 | .Lsqrx8x_no_tail: # %cf is 0 if jumped here | |
3528 | adc 8*0($tptr),%r8 | |
3529 | movq %xmm3,%rcx | |
3530 | adc 8*1($tptr),%r9 | |
8fc8f486 | 3531 | mov 8*7($nptr),$carry |
ec9cc70f AP |
3532 | movq %xmm2,$nptr # restore $nptr |
3533 | adc 8*2($tptr),%r10 | |
3534 | adc 8*3($tptr),%r11 | |
3535 | adc 8*4($tptr),%r12 | |
3536 | adc 8*5($tptr),%r13 | |
3537 | adc 8*6($tptr),%r14 | |
3538 | adc 8*7($tptr),%r15 | |
3f4bcf5b | 3539 | adc \$0,%rax # top-most carry |
ec9cc70f AP |
3540 | |
3541 | mov 32+8(%rsp),%rbx # n0 | |
3542 | mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" | |
3543 | ||
3544 | mov %r8,8*0($tptr) # store top 512 bits | |
3545 | lea 8*8($tptr),%r8 # borrow %r8 | |
3546 | mov %r9,8*1($tptr) | |
3547 | mov %r10,8*2($tptr) | |
3548 | mov %r11,8*3($tptr) | |
3549 | mov %r12,8*4($tptr) | |
3550 | mov %r13,8*5($tptr) | |
3551 | mov %r14,8*6($tptr) | |
3552 | mov %r15,8*7($tptr) | |
3553 | ||
3554 | lea 8*8($tptr,%rcx),$tptr # start of current t[] window | |
3555 | cmp 8+8(%rsp),%r8 # end of t[]? | |
3556 | jb .Lsqrx8x_reduction_loop | |
# Done: %rax holds the top-most carry on return.
317be638 | 3557 | ret |
c0e8e500 | 3558 | .cfi_endproc |
317be638 | 3559 | .size bn_sqrx8x_internal,.-bn_sqrx8x_internal |
a5bb5bca | 3560 | ___ |
ec9cc70f AP |
3561 | }\f |
3562 | ############################################################## | |
3563 | # Post-condition, 4x unrolled | |
3564 | # | |
3565 | { | |
# __bn_postx4x_internal: writes t[] to the result buffer, four limbs per
# iteration, adding (~n[i] & mask) with carry.  The mask is the negated
# %rax: with %rax == 1 on entry it is all ones and the modulus is
# subtracted (n[0] is pre-decremented so that its complement equals -n[0]);
# with %rax == 0 the limbs of t[] are copied unchanged.  The t pointer is
# not set up here; it is inherited from the caller.  %rcx must hold -num in
# bytes on entry.
3566 | my ($rptr,$nptr)=("%rdx","%rbp"); | |
a5bb5bca | 3567 | $code.=<<___; |
317be638 AP |
3568 | .align 32 |
3569 | __bn_postx4x_internal: | |
0190c52a | 3570 | .cfi_startproc |
8fc8f486 | 3571 | mov 8*0($nptr),%r12 |
ec9cc70f | 3572 | mov %rcx,%r10 # -$num |
ec9cc70f | 3573 | mov %rcx,%r9 # -$num |
8fc8f486 AP |
3574 | neg %rax |
3575 | sar \$3+2,%rcx | |
ec9cc70f | 3576 | #lea 48+8(%rsp,%r9),$tptr |
ec9cc70f AP |
3577 | movq %xmm1,$rptr # restore $rptr |
3578 | movq %xmm1,$aptr # prepare for back-to-back call | |
8fc8f486 AP |
3579 | dec %r12 # so that after 'not' we get -n[0] |
3580 | mov 8*1($nptr),%r13 | |
3581 | xor %r8,%r8 | |
3582 | mov 8*2($nptr),%r14 | |
3583 | mov 8*3($nptr),%r15 | |
3584 | jmp .Lsqrx4x_sub_entry | |
ec9cc70f | 3585 | |
317be638 | 3586 | .align 16 |
ec9cc70f | 3587 | .Lsqrx4x_sub: |
8fc8f486 AP |
3588 | mov 8*0($nptr),%r12 |
3589 | mov 8*1($nptr),%r13 | |
3590 | mov 8*2($nptr),%r14 | |
3591 | mov 8*3($nptr),%r15 | |
3592 | .Lsqrx4x_sub_entry: | |
3593 | andn %rax,%r12,%r12 | |
3594 | lea 8*4($nptr),$nptr | |
3595 | andn %rax,%r13,%r13 | |
3596 | andn %rax,%r14,%r14 | |
3597 | andn %rax,%r15,%r15 | |
3598 | ||
3599 | neg %r8 # mov %r8,%cf | |
3600 | adc 8*0($tptr),%r12 | |
3601 | adc 8*1($tptr),%r13 | |
3602 | adc 8*2($tptr),%r14 | |
3603 | adc 8*3($tptr),%r15 | |
ec9cc70f | 3604 | mov %r12,8*0($rptr) |
8fc8f486 | 3605 | lea 8*4($tptr),$tptr |
ec9cc70f | 3606 | mov %r13,8*1($rptr) |
8fc8f486 | 3607 | sbb %r8,%r8 # mov %cf,%r8 |
ec9cc70f AP |
3608 | mov %r14,8*2($rptr) |
3609 | mov %r15,8*3($rptr) | |
3610 | lea 8*4($rptr),$rptr | |
3611 | ||
3612 | inc %rcx | |
3613 | jnz .Lsqrx4x_sub | |
317be638 | 3614 | |
ec9cc70f AP |
3615 | neg %r9 # restore $num |
3616 | ||
a5bb5bca | 3617 | ret |
0190c52a | 3618 | .cfi_endproc |
317be638 | 3619 | .size __bn_postx4x_internal,.-__bn_postx4x_internal |
a5bb5bca | 3620 | ___ |
317be638 | 3621 | } |
a5bb5bca | 3622 | }}} |
361512da | 3623 | { |
# Argument registers for bn_get_bits5, bn_scatter5 and bn_gather5, chosen
# by calling convention.  $STRIDE is 256 bytes, i.e. 2**5 entries of 8
# bytes each.
ec9cc70f AP |
3624 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order |
3625 | ("%rdi","%esi","%rdx","%ecx"); # Unix order | |
8329e2e7 AP |
3626 | my $out=$inp; |
3627 | my $STRIDE=2**5*8; | |
3628 | my $N=$STRIDE/4; | |
3629 | ||
361512da | 3630 | $code.=<<___; |
ec9cc70f AP |
3631 | .globl bn_get_bits5 |
3632 | .type bn_get_bits5,\@abi-omnipotent | |
3633 | .align 16 | |
3634 | bn_get_bits5: | |
0190c52a | 3635 | .cfi_startproc |
# Returns in %eax the 5 bits starting at bit offset num of the array
# at inp. The array is read as 16-bit words: word index num/16, shift
# num mod 16. When the shift exceeds 11 the window would cross the
# word, so the load is moved one byte up and the shift reduced by 8.
# inp is copied to %r10/%r11 first because %ecx is overwritten right
# after (under the Win64 mapping inp is %rcx).
69567687 AP |
3636 | lea 0($inp),%r10 |
3637 | lea 1($inp),%r11 | |
ec9cc70f | 3638 | mov $num,%ecx |
69567687 AP |
3639 | shr \$4,$num |
3640 | and \$15,%ecx | |
3641 | lea -8(%ecx),%eax | |
3642 | cmp \$11,%ecx | |
3643 | cmova %r11,%r10 | |
3644 | cmova %eax,%ecx | |
3645 | movzw (%r10,$num,2),%eax | |
ec9cc70f AP |
3646 | shrl %cl,%eax |
3647 | and \$31,%eax | |
3648 | ret | |
0190c52a | 3649 | .cfi_endproc |
ec9cc70f AP |
3650 | .size bn_get_bits5,.-bn_get_bits5 |
3651 | ||
361512da AP |
3652 | .globl bn_scatter5 |
3653 | .type bn_scatter5,\@abi-omnipotent | |
3654 | .align 16 | |
3655 | bn_scatter5: | |
0190c52a | 3656 | .cfi_startproc |
# Copies num 8-byte words from inp into the table, starting at
# tbl+idx*8 and advancing 32*8 bytes per word. Returns at once when
# num is zero.
cdfe0fdd BM |
3657 | cmp \$0, $num |
3658 | jz .Lscatter_epilogue | |
361512da AP |
3659 | lea ($tbl,$idx,8),$tbl |
3660 | .Lscatter: | |
3661 | mov ($inp),%rax | |
3662 | lea 8($inp),$inp | |
3663 | mov %rax,($tbl) | |
3664 | lea 32*8($tbl),$tbl | |
3665 | sub \$1,$num | |
3666 | jnz .Lscatter | |
cdfe0fdd | 3667 | .Lscatter_epilogue: |
361512da | 3668 | ret |
0190c52a | 3669 | .cfi_endproc |
361512da | 3670 | .size bn_scatter5,.-bn_scatter5 |
8329e2e7 AP |
3671 | |
.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
.cfi_startproc
	# bn_gather5: reads num 8-byte words of table entry idx into out.
	# Every pass of .Lgather loads the same 256 bytes of the current
	# stride whatever idx is; the entry is selected by AND-ing with 16
	# precomputed 16-byte masks kept on the stack and OR-ing the results.
	#
	# The original %rsp is kept in %r10 while a 0x108-byte, 16-byte
	# aligned scratch area is carved out.  The CFA is tracked through
	# %r10 for that period, otherwise the unwind info would keep
	# describing the frame relative to the already lowered %rsp.  The
	# CFI directives emit no instruction bytes, so the hand-encoded
	# prologue below keeps its byte offsets.
	#
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
.cfi_def_cfa_register	%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	\$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
# Each pass emits four compare/store groups; the store of %xmm3 is delayed
# by one pass, which is why the last one is emitted after the loop.  $i is
# deliberately not loop-scoped: its final value is used by that last store.
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# One stride: AND all 16 vectors with their masks and OR them into two
# accumulators.
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0	# swap 64-bit halves so both fold into the low one
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
.cfi_def_cfa_register	%rsp
	ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
361512da AP |
3764 | ___ |
3765 | } | |
# .Linc: two 4x32-bit constant vectors, {0,0,1,1} and {2,2,2,2}.  bn_gather5
# loads them from offsets 0 and 16 as the start value and increment for the
# counters it compares against the broadcast index.  The .asciz line is the
# module's identification string.
3766 | $code.=<<___; | |
3767 | .align 64 | |
8fc8f486 AP |
3768 | .Linc: |
3769 | .long 0,0, 1,1 | |
3770 | .long 2,2, 2,2 | |
361512da AP |
3771 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
3772 | ___ | |
3773 | ||
# Win64-only tail of the module.  When $win64 is set, the block below appends
# an exception handler (mul_handler) and the .pdata/.xdata records that
# describe each entry point; nothing is emitted for other flavours.
e7d1363d AP |
3774 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
3775 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
3776 | if ($win64) { | |
# $rec/$frame/$context/$disp name the registers holding the handler's four
# arguments, in the order listed in the prototype comment above.
3777 | $rec="%rcx"; | |
3778 | $frame="%rdx"; | |
3779 | $context="%r8"; | |
3780 | $disp="%r9"; | |
3781 | ||
# mul_handler, as emitted below: it compares context->Rip with the three
# label RVAs stored in HandlerData[] (rebased on disp->ImageBase) to decide
# where in the function the fault occurred, recovers the saved stack pointer
# and writes rbx, rbp and r12-r15 back into *context when they have been
# saved, restores Rsp/Rsi/Rdi, copies the context to disp->ContextRecord,
# calls RtlVirtualUnwind and returns ExceptionContinueSearch.
# The same heredoc then opens .pdata with the begin/end/info RVA triples of
# the entry points that are emitted unconditionally.
3782 | $code.=<<___; | |
3783 | .extern __imp_RtlVirtualUnwind | |
3784 | .type mul_handler,\@abi-omnipotent | |
3785 | .align 16 | |
3786 | mul_handler: | |
3787 | push %rsi | |
3788 | push %rdi | |
3789 | push %rbx | |
3790 | push %rbp | |
3791 | push %r12 | |
3792 | push %r13 | |
3793 | push %r14 | |
3794 | push %r15 | |
3795 | pushfq | |
3796 | sub \$64,%rsp | |
3797 | ||
3798 | mov 120($context),%rax # pull context->Rax | |
3799 | mov 248($context),%rbx # pull context->Rip | |
3800 | ||
3801 | mov 8($disp),%rsi # disp->ImageBase | |
3802 | mov 56($disp),%r11 # disp->HandlerData | |
3803 | ||
3804 | mov 0(%r11),%r10d # HandlerData[0] | |
3805 | lea (%rsi,%r10),%r10 # end of prologue label | |
3806 | cmp %r10,%rbx # context->Rip<end of prologue label | |
3807 | jb .Lcommon_seh_tail | |
3808 | ||
3ba1ef82 | 3809 | mov 4(%r11),%r10d # HandlerData[1] |
384e6de4 AP |
3810 | lea (%rsi,%r10),%r10 # beginning of body label |
3811 | cmp %r10,%rbx # context->Rip<body label | |
3ba1ef82 AP |
3812 | jb .Lcommon_pop_regs |
3813 | ||
e7d1363d AP |
3814 | mov 152($context),%rax # pull context->Rsp |
3815 | ||
3ba1ef82 | 3816 | mov 8(%r11),%r10d # HandlerData[2] |
e7d1363d AP |
3817 | lea (%rsi,%r10),%r10 # epilogue label |
3818 | cmp %r10,%rbx # context->Rip>=epilogue label | |
3819 | jae .Lcommon_seh_tail | |
3820 | ||
ec9cc70f AP |
3821 | lea .Lmul_epilogue(%rip),%r10 |
3822 | cmp %r10,%rbx | |
8fc8f486 | 3823 | ja .Lbody_40 |
ec9cc70f | 3824 | |
e7d1363d AP |
3825 | mov 192($context),%r10 # pull $num |
3826 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | |
8fc8f486 | 3827 | |
3ba1ef82 | 3828 | jmp .Lcommon_pop_regs |
e7d1363d | 3829 | |
ec9cc70f AP |
3830 | .Lbody_40: |
3831 | mov 40(%rax),%rax # pull saved stack pointer | |
3ba1ef82 | 3832 | .Lcommon_pop_regs: |
e7d1363d AP |
3833 | mov -8(%rax),%rbx |
3834 | mov -16(%rax),%rbp | |
3835 | mov -24(%rax),%r12 | |
3836 | mov -32(%rax),%r13 | |
3837 | mov -40(%rax),%r14 | |
3838 | mov -48(%rax),%r15 | |
3839 | mov %rbx,144($context) # restore context->Rbx | |
3840 | mov %rbp,160($context) # restore context->Rbp | |
3841 | mov %r12,216($context) # restore context->R12 | |
3842 | mov %r13,224($context) # restore context->R13 | |
3843 | mov %r14,232($context) # restore context->R14 | |
3844 | mov %r15,240($context) # restore context->R15 | |
e7d1363d AP |
3845 | |
3846 | .Lcommon_seh_tail: | |
3847 | mov 8(%rax),%rdi | |
3848 | mov 16(%rax),%rsi | |
3849 | mov %rax,152($context) # restore context->Rsp | |
3850 | mov %rsi,168($context) # restore context->Rsi | |
3851 | mov %rdi,176($context) # restore context->Rdi | |
3852 | ||
3853 | mov 40($disp),%rdi # disp->ContextRecord | |
3854 | mov $context,%rsi # context | |
3855 | mov \$154,%ecx # sizeof(CONTEXT) | |
3856 | .long 0xa548f3fc # cld; rep movsq | |
3857 | ||
3858 | mov $disp,%rsi | |
3859 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
3860 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
3861 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
3862 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
3863 | mov 40(%rsi),%r10 # disp->ContextRecord | |
3864 | lea 56(%rsi),%r11 # &disp->HandlerData | |
3865 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
3866 | mov %r10,32(%rsp) # arg5 | |
3867 | mov %r11,40(%rsp) # arg6 | |
3868 | mov %r12,48(%rsp) # arg7 | |
3869 | mov %rcx,56(%rsp) # arg8, (NULL) | |
3870 | call *__imp_RtlVirtualUnwind(%rip) | |
3871 | ||
3872 | mov \$1,%eax # ExceptionContinueSearch | |
3873 | add \$64,%rsp | |
3874 | popfq | |
3875 | pop %r15 | |
3876 | pop %r14 | |
3877 | pop %r13 | |
3878 | pop %r12 | |
3879 | pop %rbp | |
3880 | pop %rbx | |
3881 | pop %rdi | |
3882 | pop %rsi | |
3883 | ret | |
3884 | .size mul_handler,.-mul_handler | |
3885 | ||
3886 | .section .pdata | |
3887 | .align 4 | |
3888 | .rva .LSEH_begin_bn_mul_mont_gather5 | |
3889 | .rva .LSEH_end_bn_mul_mont_gather5 | |
3890 | .rva .LSEH_info_bn_mul_mont_gather5 | |
3891 | ||
3892 | .rva .LSEH_begin_bn_mul4x_mont_gather5 | |
3893 | .rva .LSEH_end_bn_mul4x_mont_gather5 | |
3894 | .rva .LSEH_info_bn_mul4x_mont_gather5 | |
3895 | ||
ec9cc70f AP |
3896 | .rva .LSEH_begin_bn_power5 |
3897 | .rva .LSEH_end_bn_power5 | |
3898 | .rva .LSEH_info_bn_power5 | |
3899 | ||
3900 | .rva .LSEH_begin_bn_from_mont8x | |
3901 | .rva .LSEH_end_bn_from_mont8x | |
3902 | .rva .LSEH_info_bn_from_mont8x | |
3903 | ___ | |
# The .pdata rows for bn_mulx4x_mont_gather5 and bn_powerx5 are appended only
# when $addx is true.
3904 | $code.=<<___ if ($addx); | |
3905 | .rva .LSEH_begin_bn_mulx4x_mont_gather5 | |
3906 | .rva .LSEH_end_bn_mulx4x_mont_gather5 | |
3907 | .rva .LSEH_info_bn_mulx4x_mont_gather5 | |
3908 | ||
3909 | .rva .LSEH_begin_bn_powerx5 | |
3910 | .rva .LSEH_end_bn_powerx5 | |
3911 | .rva .LSEH_info_bn_powerx5 | |
3912 | ___ | |
# Last .pdata row (bn_gather5), then the .xdata section.  Each .LSEH_info_*
# record emitted here names mul_handler and supplies three label RVAs
# (commented as HandlerData[]); the handler reads them at offsets 0, 4 and 8.
3913 | $code.=<<___; | |
8329e2e7 AP |
3914 | .rva .LSEH_begin_bn_gather5 |
3915 | .rva .LSEH_end_bn_gather5 | |
3916 | .rva .LSEH_info_bn_gather5 | |
3917 | ||
e7d1363d AP |
3918 | .section .xdata |
3919 | .align 8 | |
3920 | .LSEH_info_bn_mul_mont_gather5: | |
3921 | .byte 9,0,0,0 | |
3922 | .rva mul_handler | |
3ba1ef82 | 3923 | .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] |
e7d1363d AP |
3924 | .align 8 |
3925 | .LSEH_info_bn_mul4x_mont_gather5: | |
3926 | .byte 9,0,0,0 | |
3927 | .rva mul_handler | |
3ba1ef82 | 3928 | .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
ec9cc70f AP |
3929 | .align 8 |
3930 | .LSEH_info_bn_power5: | |
3931 | .byte 9,0,0,0 | |
3932 | .rva mul_handler | |
3ba1ef82 | 3933 | .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] |
ec9cc70f AP |
3934 | .align 8 |
3935 | .LSEH_info_bn_from_mont8x: | |
3936 | .byte 9,0,0,0 | |
3937 | .rva mul_handler | |
3ba1ef82 | 3938 | .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] |
ec9cc70f AP |
3939 | ___ |
# Matching .xdata records for the two entry points that exist only when
# $addx is true.
3940 | $code.=<<___ if ($addx); | |
3941 | .align 8 | |
3942 | .LSEH_info_bn_mulx4x_mont_gather5: | |
3943 | .byte 9,0,0,0 | |
3944 | .rva mul_handler | |
3ba1ef82 | 3945 | .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] |
ec9cc70f AP |
3946 | .align 8 |
3947 | .LSEH_info_bn_powerx5: | |
3948 | .byte 9,0,0,0 | |
3949 | .rva mul_handler | |
3ba1ef82 | 3950 | .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] |
ec9cc70f AP |
3951 | ___ |
# bn_gather5's record does not name mul_handler: it is a run of raw bytes
# whose trailing comments describe them as "sub rsp,0x108" and
# "lea r10,(rsp)".
3952 | $code.=<<___; | |
e7d1363d | 3953 | .align 8 |
8329e2e7 | 3954 | .LSEH_info_bn_gather5: |
317be638 AP |
3955 | .byte 0x01,0x0b,0x03,0x0a |
3956 | .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 | |
3957 | .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) | |
8329e2e7 | 3958 | .align 8 |
e7d1363d AP |
3959 | ___ |
3960 | } | |
3961 | ||
# Final pass over the generated text.  Every `expr` placeholder still present
# in $code is replaced by the value obtained from evaluating expr as Perl,
# and the result is written to STDOUT.
#
# A placeholder that fails to evaluate aborts the run: an unchecked eval
# would yield undef, which the substitution would turn into an empty string
# and silently emit as malformed assembly.
$code =~ s{\`([^\`]*)\`}{
	my $expr  = $1;			# copy before anything can reset $1
	my $value = eval($expr);
	die "cannot evaluate '$expr': $@" if $@;
	$value
}gem;

# Write errors normally surface at close, but check the print as well so a
# failure is reported at the point where it happens.
print($code) or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";