]>
Commit | Line | Data |
---|---|---|
da1c088f | 1 | # Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved. |
c781eb1c AM |
2 | # Copyright (c) 2020, Intel Corporation. All Rights Reserved. |
3 | # | |
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | # | |
9 | # | |
f87b4c4e AM |
10 | # Originally written by Sergey Kirillov and Andrey Matyukov. |
11 | # Special thanks to Ilya Albrekht for his valuable hints. | |
c781eb1c AM |
12 | # Intel Corporation |
13 | # | |
14 | # December 2020 | |
15 | # | |
16 | # Initial release. | |
17 | # | |
18 | # Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues. | |
19 | # | |
20 | # IceLake-Client @ 1.3GHz | |
21 | # |---------+----------------------+--------------+-------------| | |
22 | # | | OpenSSL 3.0.0-alpha9 | this | Unit | | |
23 | # |---------+----------------------+--------------+-------------| | |
24 | # | rsa2048 | 2 127 659 | 1 015 625 | cycles/sign | | |
25 | # | | 611 | 1280 / +109% | sign/s | | |
26 | # |---------+----------------------+--------------+-------------| | |
27 | # | |
28 | ||
29 | # $output is the last argument if it looks like a file (it has an extension) | |
30 | # $flavour is the first argument if it doesn't look like a file | |
31 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
32 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
33 | ||
34 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
35 | $avx512ifma=0; | |
36 | ||
37 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
38 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
39 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
40 | die "can't locate x86_64-xlate.pl"; | |
41 | ||
42 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
43 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
44 | $avx512ifma = ($1>=2.26); | |
45 | } | |
46 | ||
e5dd7327 | 47 | if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
c781eb1c AM |
48 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
49 | $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12); | |
50 | } | |
51 | ||
e5dd7327 | 52 | if (!$avx512ifma && `$ENV{CC} -v 2>&1` |
523e0577 RL |
53 | =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { |
54 | my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 | |
55 | if ($1) { | |
56 | # Apple conditions, they use a different version series, see | |
57 | # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2 | |
58 | # clang 7.0.0 is Apple clang 10.0.1 | |
59 | $avx512ifma = ($ver>=10.0001) | |
60 | } else { | |
61 | $avx512ifma = ($ver>=7.0); | |
62 | } | |
c781eb1c AM |
63 | } |
64 | ||
65 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" | |
66 | or die "can't call $xlate: $!"; | |
67 | *STDOUT=*OUT; | |
68 | ||
69 | if ($avx512ifma>0) {{{ | |
70 | @_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | |
71 | ||
72 | $code.=<<___; | |
73 | .extern OPENSSL_ia32cap_P | |
e475d9a4 P |
74 | .globl ossl_rsaz_avx512ifma_eligible |
75 | .type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent | |
c781eb1c | 76 | .align 32 |
e475d9a4 | 77 | ossl_rsaz_avx512ifma_eligible: |
c781eb1c AM |
78 | mov OPENSSL_ia32cap_P+8(%rip), %ecx |
79 | xor %eax,%eax | |
80 | and \$`1<<31|1<<21|1<<17|1<<16`, %ecx # avx512vl + avx512ifma + avx512dq + avx512f | |
81 | cmp \$`1<<31|1<<21|1<<17|1<<16`, %ecx | |
82 | cmove %ecx,%eax | |
83 | ret | |
e475d9a4 | 84 | .size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible |
c781eb1c AM |
85 | ___ |
86 | ||
87 | ############################################################################### | |
88 | # Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52. | |
89 | # | |
f87b4c4e | 90 | # AMM is defined as presented in the paper [1]. |
c781eb1c AM |
91 | # |
92 | # The input and output are presented in 2^52 radix domain, i.e. | |
93 | # |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed. | |
94 | # |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 | |
c781eb1c | 95 | # |
f87b4c4e AM |
96 | # NB: the AMM implementation does not perform "conditional" subtraction step |
97 | # specified in the original algorithm as according to the Lemma 1 from the paper | |
98 | # [2], the result will be always < 2*m and can be used as a direct input to | |
99 | # the next AMM iteration. This post-condition is true, provided the correct | |
e304aa87 | 100 | # parameter |s| (notation of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, |
f87b4c4e | 101 | # which matches our case: 1040 > 1024 + 2 * 1. |
c781eb1c | 102 | # |
f87b4c4e AM |
103 | # [1] Gueron, S. Efficient software implementations of modular exponentiation. |
104 | # DOI: 10.1007/s13389-012-0031-5 | |
105 | # [2] Gueron, S. Enhanced Montgomery Multiplication. | |
106 | # DOI: 10.1007/3-540-36400-5_5 | |
107 | # | |
108 | # void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, | |
109 | # const BN_ULONG *a, | |
110 | # const BN_ULONG *b, | |
111 | # const BN_ULONG *m, | |
112 | # BN_ULONG k0); | |
c781eb1c AM |
113 | ############################################################################### |
114 | { | |
115 | # input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") | |
116 | my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; | |
117 | ||
118 | my $mask52 = "%rax"; | |
119 | my $acc0_0 = "%r9"; | |
120 | my $acc0_0_low = "%r9d"; | |
121 | my $acc0_1 = "%r15"; | |
122 | my $acc0_1_low = "%r15d"; | |
123 | my $b_ptr = "%r11"; | |
124 | ||
125 | my $iter = "%ebx"; | |
126 | ||
127 | my $zero = "%ymm0"; | |
f87b4c4e AM |
128 | my $Bi = "%ymm1"; |
129 | my $Yi = "%ymm2"; | |
130 | my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19))); | |
131 | my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23))); | |
c781eb1c AM |
132 | |
133 | # Registers mapping for normalization. | |
f87b4c4e | 134 | my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26))); |
c781eb1c AM |
135 | |
136 | sub amm52x20_x1() { | |
137 | # _data_offset - offset in the |a| or |m| arrays pointing to the beginning | |
138 | # of data for corresponding AMM operation; | |
139 | # _b_offset - offset in the |b| array pointing to the next qword digit; | |
140 | my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_; | |
d136db21 MC |
141 | my $_R0_xmm = $_R0; |
142 | $_R0_xmm =~ s/%y/%x/; | |
c781eb1c AM |
143 | $code.=<<___; |
144 | movq $_b_offset($b_ptr), %r13 # b[i] | |
145 | ||
146 | vpbroadcastq %r13, $Bi # broadcast b[i] | |
147 | movq $_data_offset($a), %rdx | |
148 | mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2) | |
149 | addq %r13, $_acc # acc += t0 | |
150 | movq %r12, %r10 | |
151 | adcq \$0, %r10 # t2 += CF | |
152 | ||
153 | movq $_k0, %r13 | |
154 | imulq $_acc, %r13 # acc * k0 | |
155 | andq $mask52, %r13 # yi = (acc * k0) & mask52 | |
156 | ||
157 | vpbroadcastq %r13, $Yi # broadcast y[i] | |
158 | movq $_data_offset($m), %rdx | |
159 | mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1) | |
160 | addq %r13, $_acc # acc += t0 | |
161 | adcq %r12, %r10 # t2 += (t1 + CF) | |
162 | ||
163 | shrq \$52, $_acc | |
164 | salq \$12, %r10 | |
165 | or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12)) | |
166 | ||
167 | vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0 | |
168 | vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h | |
169 | vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1 | |
170 | vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h | |
171 | vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2 | |
172 | ||
173 | vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0 | |
174 | vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h | |
175 | vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1 | |
176 | vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h | |
177 | vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2 | |
178 | ||
179 | # Shift accumulators right by 1 qword, zero extending the highest one | |
180 | valignq \$1, $_R0, $_R0h, $_R0 | |
181 | valignq \$1, $_R0h, $_R1, $_R0h | |
182 | valignq \$1, $_R1, $_R1h, $_R1 | |
183 | valignq \$1, $_R1h, $_R2, $_R1h | |
184 | valignq \$1, $_R2, $zero, $_R2 | |
185 | ||
186 | vmovq $_R0_xmm, %r13 | |
187 | addq %r13, $_acc # acc += R0[0] | |
188 | ||
189 | vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0 | |
190 | vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h | |
191 | vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1 | |
192 | vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h | |
193 | vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2 | |
194 | ||
195 | vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0 | |
196 | vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h | |
197 | vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1 | |
198 | vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h | |
199 | vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2 | |
200 | ___ | |
201 | } | |
202 | ||
f87b4c4e AM |
203 | # Normalization routine: handles carry bits and gets bignum qwords to normalized |
204 | # 2^52 representation. | |
c781eb1c AM |
205 | # |
206 | # Uses %r8-14,%e[bcd]x | |
207 | sub amm52x20_x1_norm { | |
208 | my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_; | |
209 | $code.=<<___; | |
210 | # Put accumulator to low qword in R0 | |
f87b4c4e AM |
211 | vpbroadcastq $_acc, $T0 |
212 | vpblendd \$3, $T0, $_R0, $_R0 | |
c781eb1c AM |
213 | |
214 | # Extract "carries" (12 high bits) from each QW of R0..R2 | |
215 | # Save them to LSB of QWs in T0..T2 | |
216 | vpsrlq \$52, $_R0, $T0 | |
217 | vpsrlq \$52, $_R0h, $T0h | |
218 | vpsrlq \$52, $_R1, $T1 | |
219 | vpsrlq \$52, $_R1h, $T1h | |
220 | vpsrlq \$52, $_R2, $T2 | |
221 | ||
222 | # "Shift left" T0..T2 by 1 QW | |
223 | valignq \$3, $T1h, $T2, $T2 | |
224 | valignq \$3, $T1, $T1h, $T1h | |
225 | valignq \$3, $T0h, $T1, $T1 | |
226 | valignq \$3, $T0, $T0h, $T0h | |
f87b4c4e | 227 | valignq \$3, .Lzeros(%rip), $T0, $T0 |
c781eb1c AM |
228 | |
229 | # Drop "carries" from R0..R2 QWs | |
f87b4c4e AM |
230 | vpandq .Lmask52x4(%rip), $_R0, $_R0 |
231 | vpandq .Lmask52x4(%rip), $_R0h, $_R0h | |
232 | vpandq .Lmask52x4(%rip), $_R1, $_R1 | |
233 | vpandq .Lmask52x4(%rip), $_R1h, $_R1h | |
234 | vpandq .Lmask52x4(%rip), $_R2, $_R2 | |
c781eb1c AM |
235 | |
236 | # Sum R0..R2 with corresponding adjusted carries | |
237 | vpaddq $T0, $_R0, $_R0 | |
238 | vpaddq $T0h, $_R0h, $_R0h | |
239 | vpaddq $T1, $_R1, $_R1 | |
240 | vpaddq $T1h, $_R1h, $_R1h | |
241 | vpaddq $T2, $_R2, $_R2 | |
242 | ||
243 | # Now handle carry bits from this addition | |
244 | # Get mask of QWs which 52-bit parts overflow... | |
f87b4c4e AM |
245 | vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt) |
246 | vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2 | |
247 | vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3 | |
248 | vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4 | |
249 | vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5 | |
c781eb1c AM |
250 | kmovb %k1, %r14d # k1 |
251 | kmovb %k2, %r13d # k1h | |
252 | kmovb %k3, %r12d # k2 | |
253 | kmovb %k4, %r11d # k2h | |
254 | kmovb %k5, %r10d # k3 | |
255 | ||
256 | # ...or saturated | |
f87b4c4e AM |
257 | vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq |
258 | vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2 | |
259 | vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3 | |
260 | vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4 | |
261 | vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5 | |
c781eb1c AM |
262 | kmovb %k1, %r9d # k4 |
263 | kmovb %k2, %r8d # k4h | |
264 | kmovb %k3, %ebx # k5 | |
265 | kmovb %k4, %ecx # k5h | |
266 | kmovb %k5, %edx # k6 | |
267 | ||
268 | # Get mask of QWs where carries shall be propagated to. | |
269 | # Merge 4-bit masks to 8-bit values to use add with carry. | |
270 | shl \$4, %r13b | |
271 | or %r13b, %r14b | |
272 | shl \$4, %r11b | |
273 | or %r11b, %r12b | |
274 | ||
275 | add %r14b, %r14b | |
276 | adc %r12b, %r12b | |
277 | adc %r10b, %r10b | |
278 | ||
279 | shl \$4, %r8b | |
280 | or %r8b,%r9b | |
281 | shl \$4, %cl | |
282 | or %cl, %bl | |
283 | ||
284 | add %r9b, %r14b | |
285 | adc %bl, %r12b | |
286 | adc %dl, %r10b | |
287 | ||
288 | xor %r9b, %r14b | |
289 | xor %bl, %r12b | |
290 | xor %dl, %r10b | |
291 | ||
292 | kmovb %r14d, %k1 | |
293 | shr \$4, %r14b | |
294 | kmovb %r14d, %k2 | |
295 | kmovb %r12d, %k3 | |
296 | shr \$4, %r12b | |
297 | kmovb %r12d, %k4 | |
298 | kmovb %r10d, %k5 | |
299 | ||
300 | # Add carries according to the obtained mask | |
f87b4c4e AM |
301 | vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1} |
302 | vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2} | |
303 | vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3} | |
304 | vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4} | |
305 | vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5} | |
306 | ||
307 | vpandq .Lmask52x4(%rip), $_R0, $_R0 | |
308 | vpandq .Lmask52x4(%rip), $_R0h, $_R0h | |
309 | vpandq .Lmask52x4(%rip), $_R1, $_R1 | |
310 | vpandq .Lmask52x4(%rip), $_R1h, $_R1h | |
311 | vpandq .Lmask52x4(%rip), $_R2, $_R2 | |
c781eb1c AM |
312 | ___ |
313 | } | |
314 | ||
315 | $code.=<<___; | |
316 | .text | |
317 | ||
f87b4c4e AM |
318 | .globl ossl_rsaz_amm52x20_x1_ifma256 |
319 | .type ossl_rsaz_amm52x20_x1_ifma256,\@function,5 | |
c781eb1c | 320 | .align 32 |
f87b4c4e | 321 | ossl_rsaz_amm52x20_x1_ifma256: |
c781eb1c AM |
322 | .cfi_startproc |
323 | endbranch | |
324 | push %rbx | |
325 | .cfi_push %rbx | |
326 | push %rbp | |
327 | .cfi_push %rbp | |
328 | push %r12 | |
329 | .cfi_push %r12 | |
330 | push %r13 | |
331 | .cfi_push %r13 | |
332 | push %r14 | |
333 | .cfi_push %r14 | |
334 | push %r15 | |
335 | .cfi_push %r15 | |
f87b4c4e | 336 | .Lossl_rsaz_amm52x20_x1_ifma256_body: |
c781eb1c AM |
337 | |
338 | # Zeroing accumulators | |
339 | vpxord $zero, $zero, $zero | |
340 | vmovdqa64 $zero, $R0_0 | |
341 | vmovdqa64 $zero, $R0_0h | |
342 | vmovdqa64 $zero, $R1_0 | |
343 | vmovdqa64 $zero, $R1_0h | |
344 | vmovdqa64 $zero, $R2_0 | |
345 | ||
346 | xorl $acc0_0_low, $acc0_0_low | |
347 | ||
348 | movq $b, $b_ptr # backup address of b | |
349 | movq \$0xfffffffffffff, $mask52 # 52-bit mask | |
350 | ||
351 | # Loop over 20 digits unrolled by 4 | |
352 | mov \$5, $iter | |
353 | ||
354 | .align 32 | |
355 | .Lloop5: | |
356 | ___ | |
357 | foreach my $idx (0..3) { | |
358 | &amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0); | |
359 | } | |
360 | $code.=<<___; | |
361 | lea `4*8`($b_ptr), $b_ptr | |
362 | dec $iter | |
363 | jne .Lloop5 | |
c781eb1c AM |
364 | ___ |
365 | &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); | |
366 | $code.=<<___; | |
367 | ||
f87b4c4e AM |
368 | vmovdqu64 $R0_0, `0*32`($res) |
369 | vmovdqu64 $R0_0h, `1*32`($res) | |
370 | vmovdqu64 $R1_0, `2*32`($res) | |
371 | vmovdqu64 $R1_0h, `3*32`($res) | |
372 | vmovdqu64 $R2_0, `4*32`($res) | |
c781eb1c AM |
373 | |
374 | vzeroupper | |
375 | mov 0(%rsp),%r15 | |
376 | .cfi_restore %r15 | |
377 | mov 8(%rsp),%r14 | |
378 | .cfi_restore %r14 | |
379 | mov 16(%rsp),%r13 | |
380 | .cfi_restore %r13 | |
381 | mov 24(%rsp),%r12 | |
382 | .cfi_restore %r12 | |
383 | mov 32(%rsp),%rbp | |
384 | .cfi_restore %rbp | |
385 | mov 40(%rsp),%rbx | |
386 | .cfi_restore %rbx | |
387 | lea 48(%rsp),%rsp | |
388 | .cfi_adjust_cfa_offset -48 | |
f87b4c4e | 389 | .Lossl_rsaz_amm52x20_x1_ifma256_epilogue: |
c781eb1c AM |
390 | ret |
391 | .cfi_endproc | |
f87b4c4e | 392 | .size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 |
c781eb1c AM |
393 | ___ |
394 | ||
395 | $code.=<<___; | |
396 | .data | |
397 | .align 32 | |
398 | .Lmask52x4: | |
399 | .quad 0xfffffffffffff | |
400 | .quad 0xfffffffffffff | |
401 | .quad 0xfffffffffffff | |
402 | .quad 0xfffffffffffff | |
403 | ___ | |
404 | ||
405 | ############################################################################### | |
406 | # Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52 | |
407 | # | |
f87b4c4e | 408 | # See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost |
c781eb1c AM |
409 | # Montgomery Multiplication algorithm and function input parameters description. |
410 | # | |
411 | # This function does two AMMs for two independent inputs, hence dual. | |
412 | # | |
f87b4c4e AM |
413 | # void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], |
414 | # const BN_ULONG a[2][20], | |
415 | # const BN_ULONG b[2][20], | |
416 | # const BN_ULONG m[2][20], | |
417 | # const BN_ULONG k0[2]); | |
c781eb1c AM |
418 | ############################################################################### |
419 | ||
420 | $code.=<<___; | |
421 | .text | |
422 | ||
f87b4c4e AM |
423 | .globl ossl_rsaz_amm52x20_x2_ifma256 |
424 | .type ossl_rsaz_amm52x20_x2_ifma256,\@function,5 | |
c781eb1c | 425 | .align 32 |
f87b4c4e | 426 | ossl_rsaz_amm52x20_x2_ifma256: |
c781eb1c AM |
427 | .cfi_startproc |
428 | endbranch | |
429 | push %rbx | |
430 | .cfi_push %rbx | |
431 | push %rbp | |
432 | .cfi_push %rbp | |
433 | push %r12 | |
434 | .cfi_push %r12 | |
435 | push %r13 | |
436 | .cfi_push %r13 | |
437 | push %r14 | |
438 | .cfi_push %r14 | |
439 | push %r15 | |
440 | .cfi_push %r15 | |
f87b4c4e | 441 | .Lossl_rsaz_amm52x20_x2_ifma256_body: |
c781eb1c AM |
442 | |
443 | # Zeroing accumulators | |
444 | vpxord $zero, $zero, $zero | |
445 | vmovdqa64 $zero, $R0_0 | |
446 | vmovdqa64 $zero, $R0_0h | |
447 | vmovdqa64 $zero, $R1_0 | |
448 | vmovdqa64 $zero, $R1_0h | |
449 | vmovdqa64 $zero, $R2_0 | |
450 | vmovdqa64 $zero, $R0_1 | |
451 | vmovdqa64 $zero, $R0_1h | |
452 | vmovdqa64 $zero, $R1_1 | |
453 | vmovdqa64 $zero, $R1_1h | |
454 | vmovdqa64 $zero, $R2_1 | |
455 | ||
456 | xorl $acc0_0_low, $acc0_0_low | |
457 | xorl $acc0_1_low, $acc0_1_low | |
458 | ||
459 | movq $b, $b_ptr # backup address of b | |
460 | movq \$0xfffffffffffff, $mask52 # 52-bit mask | |
461 | ||
462 | mov \$20, $iter | |
463 | ||
464 | .align 32 | |
465 | .Lloop20: | |
466 | ___ | |
467 | &amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)"); | |
468 | # 20*8 = offset of the next dimension in two-dimension array | |
469 | &amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)"); | |
470 | $code.=<<___; | |
471 | lea 8($b_ptr), $b_ptr | |
472 | dec $iter | |
473 | jne .Lloop20 | |
c781eb1c AM |
474 | ___ |
475 | &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); | |
476 | &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1); | |
477 | $code.=<<___; | |
478 | ||
f87b4c4e AM |
479 | vmovdqu64 $R0_0, `0*32`($res) |
480 | vmovdqu64 $R0_0h, `1*32`($res) | |
481 | vmovdqu64 $R1_0, `2*32`($res) | |
482 | vmovdqu64 $R1_0h, `3*32`($res) | |
483 | vmovdqu64 $R2_0, `4*32`($res) | |
c781eb1c | 484 | |
f87b4c4e AM |
485 | vmovdqu64 $R0_1, `5*32`($res) |
486 | vmovdqu64 $R0_1h, `6*32`($res) | |
487 | vmovdqu64 $R1_1, `7*32`($res) | |
488 | vmovdqu64 $R1_1h, `8*32`($res) | |
489 | vmovdqu64 $R2_1, `9*32`($res) | |
c781eb1c AM |
490 | |
491 | vzeroupper | |
492 | mov 0(%rsp),%r15 | |
493 | .cfi_restore %r15 | |
494 | mov 8(%rsp),%r14 | |
495 | .cfi_restore %r14 | |
496 | mov 16(%rsp),%r13 | |
497 | .cfi_restore %r13 | |
498 | mov 24(%rsp),%r12 | |
499 | .cfi_restore %r12 | |
500 | mov 32(%rsp),%rbp | |
501 | .cfi_restore %rbp | |
502 | mov 40(%rsp),%rbx | |
503 | .cfi_restore %rbx | |
504 | lea 48(%rsp),%rsp | |
505 | .cfi_adjust_cfa_offset -48 | |
f87b4c4e | 506 | .Lossl_rsaz_amm52x20_x2_ifma256_epilogue: |
c781eb1c AM |
507 | ret |
508 | .cfi_endproc | |
f87b4c4e | 509 | .size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256 |
c781eb1c AM |
510 | ___ |
511 | } | |
512 | ||
513 | ############################################################################### | |
514 | # Constant time extraction from the precomputed table of powers base^i, where | |
515 | # i = 0..2^EXP_WIN_SIZE-1 | |
516 | # | |
f87b4c4e AM |
517 | # The input |red_table| contains precomputations for two independent base values. |
518 | # |red_table_idx1| and |red_table_idx2| are corresponding power indexes. | |
c781eb1c | 519 | # |
f87b4c4e | 520 | # Extracted value (output) is two 20-digit numbers in 2^52 radix. |
c781eb1c | 521 | # |
190c029e P |
522 | # void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, |
523 | # const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20], | |
f87b4c4e | 524 | # int red_table_idx1, int red_table_idx2); |
c781eb1c AM |
525 | # |
526 | # EXP_WIN_SIZE = 5 | |
527 | ############################################################################### | |
528 | { | |
529 | # input parameters | |
f87b4c4e AM |
530 | my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
531 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | |
c781eb1c | 532 | |
f87b4c4e AM |
533 | my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5)); |
534 | my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19)); | |
535 | my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24)); | |
536 | ||
537 | my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9); | |
538 | my $t0xmm = $t0; | |
539 | $t0xmm =~ s/%y/%x/; | |
c781eb1c AM |
540 | |
541 | $code.=<<___; | |
542 | .text | |
543 | ||
544 | .align 32 | |
190c029e | 545 | .globl ossl_extract_multiplier_2x20_win5 |
f87b4c4e | 546 | .type ossl_extract_multiplier_2x20_win5,\@abi-omnipotent |
190c029e | 547 | ossl_extract_multiplier_2x20_win5: |
c781eb1c AM |
548 | .cfi_startproc |
549 | endbranch | |
c781eb1c | 550 | vmovdqa64 .Lones(%rip), $ones # broadcast ones |
f87b4c4e AM |
551 | vpbroadcastq $red_tbl_idx1, $idx1 |
552 | vpbroadcastq $red_tbl_idx2, $idx2 | |
c781eb1c AM |
553 | leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl |
554 | ||
f87b4c4e AM |
555 | # zeroing t0..n, cur_idx |
556 | vpxor $t0xmm, $t0xmm, $t0xmm | |
557 | vmovdqa64 $t0, $cur_idx | |
558 | ___ | |
559 | foreach (1..9) { | |
560 | $code.="vmovdqa64 $t0, $t[$_] \n"; | |
561 | } | |
562 | $code.=<<___; | |
c781eb1c AM |
563 | |
564 | .align 32 | |
565 | .Lloop: | |
f87b4c4e AM |
566 | vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx) |
567 | vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx) | |
568 | ___ | |
569 | foreach (0..9) { | |
570 | my $mask = $_<5?"%k1":"%k2"; | |
571 | $code.=<<___; | |
572 | vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl | |
573 | vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero | |
574 | ___ | |
575 | } | |
576 | $code.=<<___; | |
577 | vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx | |
578 | addq \$`2*20*8`, $red_tbl | |
c781eb1c AM |
579 | cmpq $red_tbl, %rax |
580 | jne .Lloop | |
f87b4c4e AM |
581 | ___ |
582 | # store t0..n | |
583 | foreach (0..9) { | |
584 | $code.="vmovdqu64 $t[$_], `${_}*32`($out) \n"; | |
585 | } | |
586 | $code.=<<___; | |
c781eb1c AM |
587 | ret |
588 | .cfi_endproc | |
190c029e | 589 | .size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 |
c781eb1c AM |
590 | ___ |
591 | $code.=<<___; | |
592 | .data | |
593 | .align 32 | |
594 | .Lones: | |
595 | .quad 1,1,1,1 | |
f87b4c4e AM |
596 | .Lzeros: |
597 | .quad 0,0,0,0 | |
c781eb1c AM |
598 | ___ |
599 | } | |
600 | ||
601 | if ($win64) { | |
602 | $rec="%rcx"; | |
603 | $frame="%rdx"; | |
604 | $context="%r8"; | |
605 | $disp="%r9"; | |
606 | ||
f87b4c4e | 607 | $code.=<<___; |
c781eb1c AM |
608 | .extern __imp_RtlVirtualUnwind |
609 | .type rsaz_def_handler,\@abi-omnipotent | |
610 | .align 16 | |
611 | rsaz_def_handler: | |
612 | push %rsi | |
613 | push %rdi | |
614 | push %rbx | |
615 | push %rbp | |
616 | push %r12 | |
617 | push %r13 | |
618 | push %r14 | |
619 | push %r15 | |
620 | pushfq | |
621 | sub \$64,%rsp | |
622 | ||
623 | mov 120($context),%rax # pull context->Rax | |
624 | mov 248($context),%rbx # pull context->Rip | |
625 | ||
626 | mov 8($disp),%rsi # disp->ImageBase | |
627 | mov 56($disp),%r11 # disp->HandlerData | |
628 | ||
629 | mov 0(%r11),%r10d # HandlerData[0] | |
630 | lea (%rsi,%r10),%r10 # prologue label | |
631 | cmp %r10,%rbx # context->Rip<.Lprologue | |
632 | jb .Lcommon_seh_tail | |
633 | ||
634 | mov 152($context),%rax # pull context->Rsp | |
635 | ||
636 | mov 4(%r11),%r10d # HandlerData[1] | |
637 | lea (%rsi,%r10),%r10 # epilogue label | |
638 | cmp %r10,%rbx # context->Rip>=.Lepilogue | |
639 | jae .Lcommon_seh_tail | |
640 | ||
641 | lea 48(%rax),%rax | |
642 | ||
643 | mov -8(%rax),%rbx | |
644 | mov -16(%rax),%rbp | |
645 | mov -24(%rax),%r12 | |
646 | mov -32(%rax),%r13 | |
647 | mov -40(%rax),%r14 | |
648 | mov -48(%rax),%r15 | |
649 | mov %rbx,144($context) # restore context->Rbx | |
650 | mov %rbp,160($context) # restore context->Rbp | |
651 | mov %r12,216($context) # restore context->R12 | |
652 | mov %r13,224($context) # restore context->R13 | |
653 | mov %r14,232($context) # restore context->R14 | |
654 | mov %r15,240($context) # restore context->R15 | |
655 | ||
656 | .Lcommon_seh_tail: | |
657 | mov 8(%rax),%rdi | |
658 | mov 16(%rax),%rsi | |
659 | mov %rax,152($context) # restore context->Rsp | |
660 | mov %rsi,168($context) # restore context->Rsi | |
661 | mov %rdi,176($context) # restore context->Rdi | |
662 | ||
663 | mov 40($disp),%rdi # disp->ContextRecord | |
664 | mov $context,%rsi # context | |
665 | mov \$154,%ecx # sizeof(CONTEXT) | |
666 | .long 0xa548f3fc # cld; rep movsq | |
667 | ||
668 | mov $disp,%rsi | |
669 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
670 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
671 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
672 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
673 | mov 40(%rsi),%r10 # disp->ContextRecord | |
674 | lea 56(%rsi),%r11 # &disp->HandlerData | |
675 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
676 | mov %r10,32(%rsp) # arg5 | |
677 | mov %r11,40(%rsp) # arg6 | |
678 | mov %r12,48(%rsp) # arg7 | |
679 | mov %rcx,56(%rsp) # arg8, (NULL) | |
680 | call *__imp_RtlVirtualUnwind(%rip) | |
681 | ||
682 | mov \$1,%eax # ExceptionContinueSearch | |
683 | add \$64,%rsp | |
684 | popfq | |
685 | pop %r15 | |
686 | pop %r14 | |
687 | pop %r13 | |
688 | pop %r12 | |
689 | pop %rbp | |
690 | pop %rbx | |
691 | pop %rdi | |
692 | pop %rsi | |
693 | ret | |
694 | .size rsaz_def_handler,.-rsaz_def_handler | |
695 | ||
696 | .section .pdata | |
697 | .align 4 | |
f87b4c4e AM |
698 | .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256 |
699 | .rva .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256 | |
700 | .rva .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256 | |
c781eb1c | 701 | |
f87b4c4e AM |
702 | .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256 |
703 | .rva .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256 | |
704 | .rva .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256 | |
b238e78f | 705 | |
c781eb1c AM |
706 | .section .xdata |
707 | .align 8 | |
f87b4c4e | 708 | .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256: |
c781eb1c AM |
709 | .byte 9,0,0,0 |
710 | .rva rsaz_def_handler | |
f87b4c4e AM |
711 | .rva .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue |
712 | .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256: | |
b238e78f AM |
713 | .byte 9,0,0,0 |
714 | .rva rsaz_def_handler | |
f87b4c4e | 715 | .rva .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue |
c781eb1c AM |
716 | ___ |
717 | } | |
718 | }}} else {{{ # fallback for old assembler | |
719 | $code.=<<___; | |
720 | .text | |
721 | ||
e475d9a4 P |
722 | .globl ossl_rsaz_avx512ifma_eligible |
723 | .type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent | |
724 | ossl_rsaz_avx512ifma_eligible: | |
c781eb1c AM |
725 | xor %eax,%eax |
726 | ret | |
e475d9a4 | 727 | .size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible |
c781eb1c | 728 | |
f87b4c4e AM |
729 | .globl ossl_rsaz_amm52x20_x1_ifma256 |
730 | .globl ossl_rsaz_amm52x20_x2_ifma256 | |
190c029e | 731 | .globl ossl_extract_multiplier_2x20_win5 |
f87b4c4e AM |
732 | .type ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent |
733 | ossl_rsaz_amm52x20_x1_ifma256: | |
734 | ossl_rsaz_amm52x20_x2_ifma256: | |
190c029e | 735 | ossl_extract_multiplier_2x20_win5: |
c781eb1c AM |
736 | .byte 0x0f,0x0b # ud2 |
737 | ret | |
f87b4c4e | 738 | .size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 |
c781eb1c AM |
739 | ___ |
740 | }}} | |
741 | ||
742 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
743 | print $code; | |
744 | close STDOUT or die "error closing STDOUT: $!"; |