]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
58cc21fd AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # May 2011 | |
18 | # | |
19 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | |
20 | # in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for | |
21 | # the time being... Except that it has three code paths: pure integer | |
22 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | |
23 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | |
053fa39a | 24 | # from one benchmark and µ-arch to another. Below are interval values |
58cc21fd AP |
25 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated |
26 | # code: | |
27 | # | |
28 | # PIII 16%-30% | |
29 | # P4 12%-12% | |
30 | # Opteron 18%-40% | |
31 | # Core2 19%-44% | |
32 | # Atom 38%-64% | |
33 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | |
34 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | |
35 | # | |
36 | # Note that the above improvement coefficients are not coefficients for | |
37 | # bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the result | |
38 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | |
39 | # is more and more dominated by other subroutines, most notably by | |
40 | # BN_GF2m_mod[_mul]_arr... | |
41 | ||
# Take the directory part of $0 (everything up to the last '/' or '\') and
# add it, together with ../../perlasm relative to it, to @INC so that the
# require of x86asm.pl below can be resolved.
42 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
43 | push(@INC,"${dir}","${dir}../../perlasm"); | |
44 | require "x86asm.pl"; | |
45 | ||
# The last command-line argument is popped off @ARGV as the output file name
# and, if it is a true value, STDOUT is reopened onto it. The result of the
# open itself is not checked here.
1aa89a7a | 46 | $output = pop and open STDOUT,">$output"; |
6bd7a4d9 | 47 | |
# $x86only becomes true when the last remaining argument is exactly "386";
# it is passed to asm_init and later suppresses the MMX/PCLMULQDQ code paths.
e195c8a2 | 48 | &asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); |
58cc21fd AP |
49 | |
# $sse2 is set when any remaining argument contains -DOPENSSL_IA32_SSE2; it
# gates the external OPENSSL_ia32cap_P declaration here and the PCLMULQDQ
# path further down.
50 | $sse2=0; | |
51 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |
52 | ||
53 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | |
54 | ||
# Register aliases shared by the 1x1 multiply generators:
# $a/$b hold the two 32-bit inputs, $a1/$a2/$a4 are used to build the
# lookup table of multiples of $a.
55 | $a="eax"; | |
56 | $b="ebx"; | |
57 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | |
58 | ||
# MMX aliases: $R accumulates the result, @T are temporaries, @i are the
# integer registers holding table indices. Note that $A and $T[1] both name
# mm2.
59 | $R="mm0"; | |
60 | @T=("mm1","mm2"); | |
61 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | |
62 | @i=("esi","edi"); | |
63 | ||
# Emit _mul_1x1_mmx (omitted entirely when $x86only is set).
#
# Input:  $a (eax) and $b (ebx), two 32-bit polynomials.
# Output: their 64-bit product in $R (mm0).
# Writes ecx, edx, ebp, esi, edi and mm1-mm5, and shifts ebx right as it
# consumes it.
#
# Method: an 8-entry dword table at 0..7*4(esp) is filled with the XOR
# combinations of a1 = a & 0x3fffffff, a2 = (a+a) & 0x7fffffff and
# a4 = 4*a (entry 0 is zero). $b is then consumed three bits at a time; each
# 3-bit window selects a table entry which is shifted left by 3*n and XORed
# into $R. Bits 30 and 31 of $a are masked out of the table and are handled
# separately through $B30 and $B31. The table stores are interleaved with the
# MMX setup, so the statement order below is the emitted instruction order.
64 | if (!$x86only) { | |
65 | &function_begin_B("_mul_1x1_mmx"); | |
66 | &sub ("esp",32+4); | |
67 | &mov ($a1,$a); | |
68 | &lea ($a2,&DWP(0,$a,$a)); | |
69 | &and ($a1,0x3fffffff); | |
70 | &lea ($a4,&DWP(0,$a2,$a2)); | |
71 | &mov (&DWP(0*4,"esp"),0); | |
72 | &and ($a2,0x7fffffff); | |
73 | &movd ($A,$a); | |
74 | &movd ($B,$b); | |
75 | &mov (&DWP(1*4,"esp"),$a1); # a1 | |
76 | &xor ($a1,$a2); # a1^a2 | |
77 | &pxor ($B31,$B31); | |
78 | &pxor ($B30,$B30); | |
79 | &mov (&DWP(2*4,"esp"),$a2); # a2 | |
80 | &xor ($a2,$a4); # a2^a4 | |
81 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
82 | &pcmpgtd($B31,$A); # broadcast 31st bit | |
83 | &paddd ($A,$A); # $A<<=1 | |
84 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
85 | &mov (&DWP(4*4,"esp"),$a4); # a4 | |
86 | &xor ($a4,$a2); # a2=a4^a2^a4 | |
87 | &pand ($B31,$B); | |
88 | &pcmpgtd($B30,$A); # broadcast 30th bit | |
89 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
90 | &xor ($a4,$a1); # a1^a2^a4 | |
91 | &psllq ($B31,31); | |
92 | &pand ($B30,$B); | |
93 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
# From here on $a4 (ebp) is reused to hold the constant window mask 0x7.
94 | &mov (@i[0],0x7); | |
95 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
96 | &mov ($a4,@i[0]); | |
97 | &and (@i[0],$b); | |
98 | &shr ($b,3); | |
99 | &mov (@i[1],$a4); | |
100 | &psllq ($B30,30); | |
101 | &and (@i[1],$b); | |
102 | &shr ($b,3); | |
# Window 0 needs no shift and initialises $R directly.
103 | &movd ($R,&DWP(0,"esp",@i[0],4)); | |
104 | &mov (@i[0],$a4); | |
105 | &and (@i[0],$b); | |
106 | &shr ($b,3); | |
# Windows 1..8. @i and @T are rotated at the end of every iteration so that
# consecutive windows alternate between the two index registers and the two
# MMX temporaries.
107 | for($n=1;$n<9;$n++) { | |
108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
109 | &mov (@i[1],$a4); | |
110 | &psllq (@T[1],3*$n); | |
111 | &and (@i[1],$b); | |
112 | &shr ($b,3); | |
113 | &pxor ($R,@T[1]); | |
114 | ||
115 | push(@i,shift(@i)); push(@T,shift(@T)); | |
116 | } | |
# $n is 9 on loop exit: the window below is shifted by 27 and, after the
# post-increment, the final window by 30. The $B30/$B31 corrections for the
# two top bits of $a are folded in alongside.
117 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
118 | &pxor ($R,$B30); | |
119 | &psllq (@T[1],3*$n++); | |
120 | &pxor ($R,@T[1]); | |
121 | ||
122 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | |
123 | &pxor ($R,$B31); | |
124 | &psllq (@T[0],3*$n); | |
125 | &add ("esp",32+4); | |
126 | &pxor ($R,@T[0]); | |
127 | &ret (); | |
128 | &function_end_B("_mul_1x1_mmx"); | |
129 | } | |
130 | ||
# Emit _mul_1x1_ialu, the integer-only counterpart of _mul_1x1_mmx.
#
# Input:  $a (eax) and $b (ebx), two 32-bit polynomials.
# Output: their 64-bit product in $hi:$lo (edx:eax).
# Writes ecx, ebp, esi and edi as scratch and shifts ebx right as it
# consumes it.
#
# Register aliasing to keep in mind: $lo is the same register as $a (eax),
# $hi the same as $a2 (edx), and @T reuses $a1 (ecx) and $a4 (ebp). Each
# alias is only written after the last use of the name it shadows, which is
# why the statement order below must not be changed.
#
# Method: same 8-entry table at 0..7*4(esp) and 3-bit windows over $b as in
# the MMX version. Because the accumulator is split over two 32-bit
# registers, every table entry is shifted left by 3*n into $lo and right by
# 32-3*n into $hi.
131 | ($lo,$hi)=("eax","edx"); | |
132 | @T=("ecx","ebp"); | |
133 | ||
134 | &function_begin_B("_mul_1x1_ialu"); | |
135 | &sub ("esp",32+4); | |
136 | &mov ($a1,$a); | |
137 | &lea ($a2,&DWP(0,$a,$a)); | |
138 | &lea ($a4,&DWP(0,"",$a,4)); | |
139 | &and ($a1,0x3fffffff); | |
# @i[1] = a<<1 and $lo = a are kept so that the sar-by-31 instructions can
# turn bit 30 and bit 31 of a into all-ones/all-zeros masks.
140 | &lea (@i[1],&DWP(0,$lo,$lo)); | |
141 | &sar ($lo,31); # broadcast 31st bit | |
142 | &mov (&DWP(0*4,"esp"),0); | |
143 | &and ($a2,0x7fffffff); | |
144 | &mov (&DWP(1*4,"esp"),$a1); # a1 | |
145 | &xor ($a1,$a2); # a1^a2 | |
146 | &mov (&DWP(2*4,"esp"),$a2); # a2 | |
147 | &xor ($a2,$a4); # a2^a4 | |
148 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
149 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
150 | &mov (&DWP(4*4,"esp"),$a4); # a4 | |
151 | &xor ($a4,$a2); # a2=a4^a2^a4 | |
152 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
153 | &xor ($a4,$a1); # a1^a2^a4 | |
46f4e1be | 154 | &sar (@i[1],31); # broadcast 30th bit |
58cc21fd AP |
155 | &and ($lo,$b); |
156 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
157 | &and (@i[1],$b); | |
158 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
# Seed $hi:$lo with (b if bit 31 of a is set) << 31, then XOR in
# (b if bit 30 of a is set) << 30; these are the contributions of the two
# top bits of a that the table does not cover.
159 | &mov ($hi,$lo); | |
160 | &shl ($lo,31); | |
161 | &mov (@T[0],@i[1]); | |
162 | &shr ($hi,1); | |
163 | ||
164 | &mov (@i[0],0x7); | |
165 | &shl (@i[1],30); | |
166 | &and (@i[0],$b); | |
167 | &shr (@T[0],2); | |
168 | &xor ($lo,@i[1]); | |
169 | ||
170 | &shr ($b,3); | |
171 | &mov (@i[1],0x7); # 5-byte instruction!? | |
172 | &and (@i[1],$b); | |
173 | &shr ($b,3); | |
174 | &xor ($hi,@T[0]); | |
# Window 0 is unshifted and only affects $lo.
175 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | |
176 | &mov (@i[0],0x7); | |
177 | &and (@i[0],$b); | |
178 | &shr ($b,3); | |
# Windows 1..8. @i and @T are rotated after every iteration so that
# consecutive windows alternate registers.
179 | for($n=1;$n<9;$n++) { | |
180 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
181 | &mov (@i[1],0x7); | |
182 | &mov (@T[0],@T[1]); | |
183 | &shl (@T[1],3*$n); | |
184 | &and (@i[1],$b); | |
185 | &shr (@T[0],32-3*$n); | |
186 | &xor ($lo,@T[1]); | |
187 | &shr ($b,3); | |
188 | &xor ($hi,@T[0]); | |
189 | ||
190 | push(@i,shift(@i)); push(@T,shift(@T)); | |
191 | } | |
# Last two windows: $n is 9 for the first and is bumped to 10 for the
# second. No further indices are needed, so the @i registers are reused as
# temporaries for the final table entry.
192 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
193 | &mov (@T[0],@T[1]); | |
194 | &shl (@T[1],3*$n); | |
195 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | |
196 | &shr (@T[0],32-3*$n); $n++; | |
197 | &mov (@i[0],@i[1]); | |
198 | &xor ($lo,@T[1]); | |
199 | &shl (@i[1],3*$n); | |
200 | &xor ($hi,@T[0]); | |
201 | &shr (@i[0],32-3*$n); | |
202 | &xor ($lo,@i[1]); | |
203 | &xor ($hi,@i[0]); | |
204 | ||
205 | &add ("esp",32+4); | |
206 | &ret (); | |
207 | &function_end_B("_mul_1x1_ialu"); | |
208 | ||
58cc21fd AP |
# Emit the public entry point bn_GF2m_mul_2x2. It multiplies the two-word
# polynomials a1:a0 and b1:b0 and stores the four-word product at r.
#
# Unless $x86only is set, the code first loads the two dwords at
# OPENSSL_ia32cap_P and dispatches at run time:
#   - MMX bit clear                         -> integer path ("ialu" label);
#   - with $sse2, FXSR and PCLMULQDQ set    -> single PCLMULQDQ instruction;
#   - otherwise                             -> MMX path.
# The MMX and integer paths both use three 1x1 multiplications:
# a1*b1, a0*b0 and (a0+a1)*(b0+b1), where + is XOR.
# NOTE(review): &wparam(n) is presumably the n-th stack argument adjusted for
# the pushes made so far - confirm against x86asm.pl.
209 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); |
210 | &function_begin_B("bn_GF2m_mul_2x2"); | |
211 | if (!$x86only) { | |
212 | &picmeup("edx","OPENSSL_ia32cap_P"); | |
213 | &mov ("eax",&DWP(0,"edx")); | |
214 | &mov ("edx",&DWP(4,"edx")); | |
215 | &test ("eax",1<<23); # check MMX bit | |
216 | &jz (&label("ialu")); | |
217 | if ($sse2) { | |
218 | &test ("eax",1<<24); # check FXSR bit | |
219 | &jz (&label("mmx")); | |
220 | &test ("edx",1<<1); # check PCLMULQDQ bit | |
221 | &jz (&label("mmx")); | |
222 | ||
# PCLMULQDQ path: nothing has been pushed yet, so 8(esp) is read directly.
# Per the prototype above this should be the 16 bytes a1,a0,b1,b0 (assumes
# arguments are laid out consecutively after the return address and r).
# The shuffle swaps the two dwords inside each 64-bit half, and the multiply
# then combines the two halves of xmm0 into a 128-bit product stored at r.
223 | &movups ("xmm0",&QWP(8,"esp")); | |
224 | &shufps ("xmm0","xmm0",0b10110001); | |
225 | &pclmulqdq ("xmm0","xmm0",1); | |
226 | &mov ("eax",&DWP(4,"esp")); | |
227 | &movups (&QWP(0,"eax"),"xmm0"); | |
228 | &ret (); | |
229 | ||
230 | &set_label("mmx",16); | |
231 | } | |
# MMX path. ebp/ebx/esi/edi are saved because _mul_1x1_mmx writes them.
# mm7 keeps a1*b1 and mm6 keeps a0*b0 across the following calls.
232 | &push ("ebp"); | |
233 | &push ("ebx"); | |
234 | &push ("esi"); | |
235 | &push ("edi"); | |
236 | &mov ($a,&wparam(1)); | |
237 | &mov ($b,&wparam(3)); | |
053fa39a | 238 | &call ("_mul_1x1_mmx"); # a1·b1 |
58cc21fd AP |
239 | &movq ("mm7",$R); |
240 | ||
241 | &mov ($a,&wparam(2)); | |
242 | &mov ($b,&wparam(4)); | |
053fa39a | 243 | &call ("_mul_1x1_mmx"); # a0·b0 |
58cc21fd AP |
244 | &movq ("mm6",$R); |
245 | ||
246 | &mov ($a,&wparam(1)); | |
247 | &mov ($b,&wparam(3)); | |
248 | &xor ($a,&wparam(2)); | |
249 | &xor ($b,&wparam(4)); | |
053fa39a | 250 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) |
58cc21fd AP |
251 | &pxor ($R,"mm7"); |
252 | &mov ($a,&wparam(0)); | |
053fa39a | 253 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 |
58cc21fd AP |
254 | |
# Recombine: the middle term is split at bit 32; its low half shifted up is
# XORed with a0*b0 to form the low 64 bits of the result (stored at 0(r)),
# its high half shifted down is XORed with a1*b1 to form the high 64 bits
# (stored at 8(r)). Register restores are interleaved with the MMX work.
255 | &movq ($A,$R); | |
256 | &psllq ($R,32); | |
257 | &pop ("edi"); | |
258 | &psrlq ($A,32); | |
259 | &pop ("esi"); | |
260 | &pxor ($R,"mm6"); | |
261 | &pop ("ebx"); | |
262 | &pxor ($A,"mm7"); | |
263 | &movq (&QWP(0,$a),$R); | |
264 | &pop ("ebp"); | |
265 | &movq (&QWP(8,$a),$A); | |
266 | &emms (); | |
267 | &ret (); | |
268 | &set_label("ialu",16); | |
269 | } | |
# Integer path. The stack_push call below presumably reserves 4+1 dwords of
# scratch (confirm in x86asm.pl); 0..4(esp) receives a0*b0 and 8..12(esp)
# receives a1*b1.
270 | &push ("ebp"); | |
271 | &push ("ebx"); | |
272 | &push ("esi"); | |
273 | &push ("edi"); | |
274 | &stack_push(4+1); | |
275 | ||
276 | &mov ($a,&wparam(1)); | |
277 | &mov ($b,&wparam(3)); | |
053fa39a | 278 | &call ("_mul_1x1_ialu"); # a1·b1 |
58cc21fd AP |
279 | &mov (&DWP(8,"esp"),$lo); |
280 | &mov (&DWP(12,"esp"),$hi); | |
281 | ||
282 | &mov ($a,&wparam(2)); | |
283 | &mov ($b,&wparam(4)); | |
053fa39a | 284 | &call ("_mul_1x1_ialu"); # a0·b0 |
58cc21fd AP |
285 | &mov (&DWP(0,"esp"),$lo); |
286 | &mov (&DWP(4,"esp"),$hi); | |
287 | ||
288 | &mov ($a,&wparam(1)); | |
289 | &mov ($b,&wparam(3)); | |
290 | &xor ($a,&wparam(2)); | |
291 | &xor ($b,&wparam(4)); | |
053fa39a | 292 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) |
58cc21fd AP |
293 | |
# Recombine with $hi:$lo holding the middle product m and @r holding
# a0*b0 (r0,r1) and a1*b1 (r2,r3). The XOR chain below ends with
#   r[0] = r0
#   r[1] = m.lo ^ r0 ^ r1 ^ r2
#   r[2] = m.hi ^ r1 ^ r2 ^ r3
#   r[3] = r3
# Stores and register restores are interleaved with the XORs.
294 | &mov ("ebp",&wparam(0)); | |
295 | @r=("ebx","ecx","edi","esi"); | |
296 | &mov (@r[0],&DWP(0,"esp")); | |
297 | &mov (@r[1],&DWP(4,"esp")); | |
298 | &mov (@r[2],&DWP(8,"esp")); | |
299 | &mov (@r[3],&DWP(12,"esp")); | |
300 | ||
301 | &xor ($lo,$hi); | |
302 | &xor ($hi,@r[1]); | |
303 | &xor ($lo,@r[0]); | |
304 | &mov (&DWP(0,"ebp"),@r[0]); | |
305 | &xor ($hi,@r[2]); | |
306 | &mov (&DWP(12,"ebp"),@r[3]); | |
307 | &xor ($lo,@r[3]); | |
308 | &stack_pop(4+1); | |
309 | &xor ($hi,@r[3]); | |
310 | &pop ("edi"); | |
311 | &xor ($lo,$hi); | |
312 | &pop ("esi"); | |
313 | &mov (&DWP(8,"ebp"),$hi); | |
314 | &pop ("ebx"); | |
315 | &mov (&DWP(4,"ebp"),$lo); | |
316 | &pop ("ebp"); | |
317 | &ret (); | |
318 | &function_end_B("bn_GF2m_mul_2x2"); | |
319 | ||
# Embed an identification string in the generated module.
2b9a8ca1 | 320 | &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
58cc21fd AP |
321 | |
# Hand control back to x86asm.pl to finish the output.
322 | &asm_finish(); | |
6bd7a4d9 | 323 | |
# Closing STDOUT explicitly makes buffered write errors fatal instead of
# silently leaving a truncated output file.
a21314db | 324 | close STDOUT or die "error closing STDOUT: $!"; |