]>
Commit | Line | Data |
---|---|---|
aa9db2d2 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | # | |
10 | # ECP_NISTZ256 module for x86/SSE2. | |
11 | # | |
12 | # October 2014. | |
13 | # | |
14 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | |
15 | # http://eprint.iacr.org/2013/816. In the process of adaptation | |
16 | # original .c module was made 32-bit savvy in order to make this | |
17 | # implementation possible. | |
18 | # | |
19 | # with/without -DECP_NISTZ256_ASM | |
20 | # Pentium +66-163% | |
21 | # PIII +72-172% | |
22 | # P4 +65-132% | |
23 | # Core2 +90-215% | |
24 | # Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) | |
25 | # Atom +65-155% | |
26 | # Opteron +54-110% | |
27 | # Bulldozer +99-240% | |
28 | # VIA Nano +93-290% | |
29 | # | |
30 | # Ranges denote minimum and maximum improvement coefficients depending | |
31 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | |
32 | # operation. Keep in mind that +200% means 3x improvement. | |
33 | ||
# Derive the directory part of the invoked script path (including the
# trailing separator) so that x86asm.pl can be located either next to
# this script or under ../../perlasm relative to it.
($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};
push @INC, "${dir}", "${dir}../../perlasm";
require "x86asm.pl";

# The first command-line argument and the script name are handed to
# asm_init; its third argument is true when the last command-line
# argument is the literal "386".
&asm_init($ARGV[0], "ecp_nistz256-x86.pl", $ARGV[$#ARGV] eq "386");

# $sse2 is 1 when any argument contains -DOPENSSL_IA32_SSE2, else 0.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

# The capability vector is only referenced by the SSE2-enabled code.
&external_label("OPENSSL_ia32cap_P") if ($sse2);
44 | ||
45 | ||
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked for in the current directory first and then
# one level above the directory this script lives in.
my $table_fh;
open($table_fh, "<", "ecp_nistz256_table.c") or
open($table_fh, "<", "${dir}../ecp_nistz256_table.c") or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Every TOBN(x,y) literal contributes two array elements: the value of
# its second argument followed by the value of its first argument.
my @arr;
while (my $row = <$table_fh>) {
    while ($row =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr, hex($2), hex($1);
    }
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
die "insane number of elements" if (scalar(@arr) != 64*16*37);

&public_label("ecp_nistz256_precomputed");
&align(4096);
&set_label("ecp_nistz256_precomputed");

########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 sub-tables holds 64 entries of 16 words.  Output row $i
# of a sub-table collects byte number $i (word $i/4, byte $i%4 within
# that word, least significant first) of each of the 64 entries.
for (1..37) {
    my @tbl = splice(@arr, 0, 64*16);
    for my $i (0..63) {
        my @line = map { ($tbl[$_*16 + $i/4] >> (($i%4)*8)) & 0xff } (0..63);
        &data_byte(join(',', map { sprintf "0x%02x",$_ } @line));
    }
}
84 | ||
########################################################################
# Keep in mind that constants are stored least to most significant word
{
    my @rr       = (3,0,-1,-5,-2,-1,-3,4);	# 2^512 mod P-256
    my @one_mont = (1,0,0,-1,-1,-1,-2,0);	# 2^256 mod P-256
    my @one      = (1,0,0,0,0,0,0,0);		# plain integer 1

    &static_label("RR");
    &set_label("RR",64);
    &data_word(@rr);

    &static_label("ONE_mont");
    &set_label("ONE_mont");
    &data_word(@one_mont);

    &static_label("ONE");
    &set_label("ONE");
    &data_word(@one);
}
# Identification string emitted into the output; its spelling is kept
# exactly as is so that the generated module does not change.
&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>");
&align(64);
100 | ||
########################################################################
# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Doubling is emitted as an addition of the input to itself: both input
# pointers handed to _ecp_nistz256_add refer to the same operand.
#
# Common pattern for internal functions is that %edi is result pointer,
# %esi and %ebp are input ones, %ebp being optional. %edi is preserved.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_mul_by_2");
        &mov    ($ap, &wparam(1));
        &mov    ($rp, &wparam(0));
        &mov    ($bp, $ap);                 # second operand = first
        &call   ("_ecp_nistz256_add");
    &function_end("ecp_nistz256_mul_by_2");
}
112 | ||
########################################################################
# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Multiplication by 3 is performed as 2*n+n.  The intermediate 2*n can
# not be written to the output, because if the output pointer equals the
# input one the second addition would compute 2*n+2*n.  A 256-bit
# (8-word) buffer is therefore allocated on the stack for 2*n.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_mul_by_3");
        &mov        ($ap, &wparam(1));
        &stack_push (8);                    # 8-word temporary
        &mov        ($rp, "esp");           # result -> temporary
        &mov        ($bp, $ap);
        &call       ("_ecp_nistz256_add");  # tmp = n+n
        &lea        ($ap, &DWP(0, $rp));    # first operand = tmp
        &mov        ($bp, &wparam(1));      # second operand = n
        &mov        ($rp, &wparam(0));      # result -> caller's buffer
        &call       ("_ecp_nistz256_add");  # out = tmp+n
        &stack_pop  (8);
    &function_end("ecp_nistz256_mul_by_3");
}
133 | ||
########################################################################
# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Public entry point: loads the input pointer into %esi and the output
# pointer into %edi, then defers to the internal routine.
{
    my ($rp, $ap) = ("edi", "esi");

    &function_begin("ecp_nistz256_div_by_2");
        &mov    ($ap, &wparam(1));
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_div_by_2");
    &function_end("ecp_nistz256_div_by_2");
}
141 | ||
# Internal halving routine: %esi points at the 8-word input, %edi at the
# 8-word output.  %esi is overwritten (it ends up holding the carry
# mask); %edi is left pointing at the result.
{
    my $src = sub { &DWP(4*$_[0], "esi") };     # i-th input word
    my $dst = sub { &DWP(4*$_[0], "edi") };     # i-th output word

    &function_begin_B("_ecp_nistz256_div_by_2");
        # tmp = a is odd ? a+mod : a
        #
        # Because mod has special form, i.e. consists of 0xffffffff, 1
        # and 0s, it is synthesized conditionally: the least significant
        # bit of the input goes to %ebp and its negative to %edx, which
        # are then added to the matching words.

        &mov    ("ebp", $src->(0));
        &xor    ("edx", "edx");
        &mov    ("ebx", $src->(1));
        &mov    ("eax", "ebp");
        &and    ("ebp", 1);                 # ebp = a&1
        &mov    ("ecx", $src->(2));
        &sub    ("edx", "ebp");             # edx = -(a&1)

        # words 0..2 receive edx
        &add    ("eax", "edx");
        &adc    ("ebx", "edx");
        &mov    ($dst->(0), "eax");
        &adc    ("ecx", "edx");
        &mov    ($dst->(1), "ebx");
        &mov    ($dst->(2), "ecx");

        # words 3..5 only propagate the carry
        &mov    ("eax", $src->(3));
        &mov    ("ebx", $src->(4));
        &adc    ("eax", 0);
        &mov    ("ecx", $src->(5));
        &adc    ("ebx", 0);
        &mov    ($dst->(3), "eax");
        &adc    ("ecx", 0);
        &mov    ($dst->(4), "ebx");
        &mov    ($dst->(5), "ecx");

        # word 6 receives ebp, word 7 receives edx
        &mov    ("eax", $src->(6));
        &mov    ("ebx", $src->(7));
        &adc    ("eax", "ebp");
        &adc    ("ebx", "edx");
        &mov    ($dst->(6), "eax");
        &sbb    ("esi", "esi");             # broadcast carry bit
        &mov    ($dst->(7), "ebx");

        # ret = tmp >> 1
        #
        # Each output word is (word >> 1) | (next word << 31); the
        # registers rotate roles so that loads, shifts and stores of
        # neighbouring words interleave.

        &mov    ("eax", $dst->(0));
        &mov    ("ebx", $dst->(1));
        &mov    ("ecx", $dst->(2));
        &mov    ("edx", $dst->(3));

        &shr    ("eax", 1);
        &mov    ("ebp", "ebx");
        &shl    ("ebx", 31);
        &or     ("eax", "ebx");

        &shr    ("ebp", 1);
        &mov    ("ebx", "ecx");
        &shl    ("ecx", 31);
        &mov    ($dst->(0), "eax");
        &or     ("ebp", "ecx");
        &mov    ("eax", $dst->(4));

        &shr    ("ebx", 1);
        &mov    ("ecx", "edx");
        &shl    ("edx", 31);
        &mov    ($dst->(1), "ebp");
        &or     ("ebx", "edx");
        &mov    ("ebp", $dst->(5));

        &shr    ("ecx", 1);
        &mov    ("edx", "eax");
        &shl    ("eax", 31);
        &mov    ($dst->(2), "ebx");
        &or     ("ecx", "eax");
        &mov    ("ebx", $dst->(6));

        &shr    ("edx", 1);
        &mov    ("eax", "ebp");
        &shl    ("ebp", 31);
        &mov    ($dst->(3), "ecx");
        &or     ("edx", "ebp");
        &mov    ("ecx", $dst->(7));

        &shr    ("eax", 1);
        &mov    ("ebp", "ebx");
        &shl    ("ebx", 31);
        &mov    ($dst->(4), "edx");
        &or     ("eax", "ebx");

        &shr    ("ebp", 1);
        &mov    ("ebx", "ecx");
        &shl    ("ecx", 31);
        &mov    ($dst->(5), "eax");
        &or     ("ebp", "ecx");

        &shr    ("ebx", 1);
        &shl    ("esi", 31);
        &mov    ($dst->(6), "ebp");
        &or     ("ebx", "esi");             # handle top-most carry bit
        &mov    ($dst->(7), "ebx");

        &ret    ();
    &function_end_B("_ecp_nistz256_div_by_2");
}
243 | ||
########################################################################
# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
#
# Public entry point: loads the two input pointers into %esi/%ebp and
# the output pointer into %edi, then defers to the internal routine.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_add");
        &mov    ($ap, &wparam(1));
        &mov    ($bp, &wparam(2));
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_add");
    &function_end("ecp_nistz256_add");
}
253 | ||
# Internal addition: %esi and %ebp point at the 8-word inputs, %edi at
# the 8-word output.  %esi and %ebp are overwritten; %edi is preserved.
{
    my $A = sub { &DWP(4*$_[0], "esi") };       # i-th word of 1st input
    my $B = sub { &DWP(4*$_[0], "ebp") };       # i-th word of 2nd input
    my $R = sub { &DWP(4*$_[0], "edi") };       # i-th word of result

    &function_begin_B("_ecp_nistz256_add");
        # result = a+b, words 0..3
        &mov    ("eax", $A->(0));
        &mov    ("ebx", $A->(1));
        &mov    ("ecx", $A->(2));
        &add    ("eax", $B->(0));
        &mov    ("edx", $A->(3));
        &adc    ("ebx", $B->(1));
        &mov    ($R->(0), "eax");
        &adc    ("ecx", $B->(2));
        &mov    ($R->(1), "ebx");
        &adc    ("edx", $B->(3));
        &mov    ($R->(2), "ecx");
        &mov    ($R->(3), "edx");

        # result = a+b, words 4..7
        &mov    ("eax", $A->(4));
        &mov    ("ebx", $A->(5));
        &mov    ("ecx", $A->(6));
        &adc    ("eax", $B->(4));
        &mov    ("edx", $A->(7));
        &adc    ("ebx", $B->(5));
        &mov    ($R->(4), "eax");
        &adc    ("ecx", $B->(6));
        &mov    ($R->(5), "ebx");
        &adc    ("edx", $B->(7));
        &mov    ($R->(6), "ecx");
        &sbb    ("esi", "esi");             # broadcast carry bit
        &mov    ($R->(7), "edx");

        # if a+b carries, subtract modulus.
        #
        # Because mod has special form, i.e. consists of 0xffffffff, 1
        # and 0s, it is synthesized conditionally from the carry: %esi
        # already holds the carry broadcast to all bits and %ebp gets
        # the carry as a single bit.
        #
        # NOTE(review): the subtraction is keyed on the carry out of the
        # top word only; a sum that does not carry is stored as is, even
        # if it is not below the modulus - confirm callers accept that.

        &mov    ("eax", $R->(0));
        &mov    ("ebp", "esi");
        &mov    ("ebx", $R->(1));
        &shr    ("ebp", 31);                # ebp = carry as 0/1
        &mov    ("ecx", $R->(2));
        &sub    ("eax", "esi");
        &mov    ("edx", $R->(3));
        &sbb    ("ebx", "esi");
        &mov    ($R->(0), "eax");
        &sbb    ("ecx", "esi");
        &mov    ($R->(1), "ebx");
        &sbb    ("edx", 0);
        &mov    ($R->(2), "ecx");
        &mov    ($R->(3), "edx");

        &mov    ("eax", $R->(4));
        &mov    ("ebx", $R->(5));
        &mov    ("ecx", $R->(6));
        &sbb    ("eax", 0);
        &mov    ("edx", $R->(7));
        &sbb    ("ebx", 0);
        &mov    ($R->(4), "eax");
        &sbb    ("ecx", "ebp");
        &mov    ($R->(5), "ebx");
        &sbb    ("edx", "esi");
        &mov    ($R->(6), "ecx");
        &mov    ($R->(7), "edx");

        &ret    ();
    &function_end_B("_ecp_nistz256_add");
}
319 | ||
########################################################################
# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
#
# Public entry point: loads the two input pointers into %esi/%ebp and
# the output pointer into %edi, then defers to the internal routine.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_sub");
        &mov    ($ap, &wparam(1));
        &mov    ($bp, &wparam(2));
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_sub");
    &function_end("ecp_nistz256_sub");
}
329 | ||
# Internal subtraction: %esi points at the minuend, %ebp at the
# subtrahend, %edi at the 8-word output.  %esi and %ebp are overwritten;
# %edi is preserved.
{
    my $A = sub { &DWP(4*$_[0], "esi") };       # i-th word of minuend
    my $B = sub { &DWP(4*$_[0], "ebp") };       # i-th word of subtrahend
    my $R = sub { &DWP(4*$_[0], "edi") };       # i-th word of result

    &function_begin_B("_ecp_nistz256_sub");
        # result = a-b, words 0..3
        &mov    ("eax", $A->(0));
        &mov    ("ebx", $A->(1));
        &mov    ("ecx", $A->(2));
        &sub    ("eax", $B->(0));
        &mov    ("edx", $A->(3));
        &sbb    ("ebx", $B->(1));
        &mov    ($R->(0), "eax");
        &sbb    ("ecx", $B->(2));
        &mov    ($R->(1), "ebx");
        &sbb    ("edx", $B->(3));
        &mov    ($R->(2), "ecx");
        &mov    ($R->(3), "edx");

        # result = a-b, words 4..7
        &mov    ("eax", $A->(4));
        &mov    ("ebx", $A->(5));
        &mov    ("ecx", $A->(6));
        &sbb    ("eax", $B->(4));
        &mov    ("edx", $A->(7));
        &sbb    ("ebx", $B->(5));
        &sbb    ("ecx", $B->(6));
        &mov    ($R->(4), "eax");
        &sbb    ("edx", $B->(7));
        &mov    ($R->(5), "ebx");
        &sbb    ("esi", "esi");             # broadcast borrow bit
        &mov    ($R->(6), "ecx");
        &mov    ($R->(7), "edx");

        # if a-b borrows, add modulus.
        #
        # Because mod has special form, i.e. consists of 0xffffffff, 1
        # and 0s, it is synthesized conditionally from the borrow: %esi
        # already holds the borrow broadcast to all bits and %ebp gets
        # the borrow as a single bit.

        &mov    ("eax", $R->(0));
        &mov    ("ebp", "esi");
        &mov    ("ebx", $R->(1));
        &shr    ("ebp", 31);                # ebp = borrow as 0/1
        &mov    ("ecx", $R->(2));
        &add    ("eax", "esi");
        &mov    ("edx", $R->(3));
        &adc    ("ebx", "esi");
        &mov    ($R->(0), "eax");
        &adc    ("ecx", "esi");
        &mov    ($R->(1), "ebx");
        &adc    ("edx", 0);
        &mov    ($R->(2), "ecx");
        &mov    ($R->(3), "edx");

        &mov    ("eax", $R->(4));
        &mov    ("ebx", $R->(5));
        &mov    ("ecx", $R->(6));
        &adc    ("eax", 0);
        &mov    ("edx", $R->(7));
        &adc    ("ebx", 0);
        &mov    ($R->(4), "eax");
        &adc    ("ecx", "ebp");
        &mov    ($R->(5), "ebx");
        &adc    ("edx", "esi");
        &mov    ($R->(6), "ecx");
        &mov    ($R->(7), "edx");

        &ret    ();
    &function_end_B("_ecp_nistz256_sub");
}
395 | ||
########################################################################
# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Negation is emitted as a subtraction from zero: an 8-word buffer on
# the stack is cleared and passed to _ecp_nistz256_sub as the minuend
# (%esi), with the caller's input as the subtrahend (%ebp).
{
    my ($rp, $zero, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_neg");
        &mov        ($bp, &wparam(1));
        &mov        ($rp, &wparam(0));

        &xor        ("eax", "eax");
        &stack_push (8);
        &mov        (&DWP(0, "esp"), "eax");
        &mov        ($zero, "esp");         # minuend = zeroed buffer
        &mov        (&DWP(4*$_, "esp"), "eax") for (1..7);

        &call       ("_ecp_nistz256_sub");

        &stack_pop  (8);
    &function_end("ecp_nistz256_neg");
}
418 | ||
# Helper for position-independent addressing: copies the dword at the
# top of the stack, i.e. the caller's return address, into %eax.
{
    my $top_of_stack = &DWP(0, "esp");

    &function_begin_B("_picup_eax");
        &mov    ("eax", $top_of_stack);
        &ret    ();
    &function_end_B("_picup_eax");
}
423 | ||
########################################################################
# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Emitted as a Montgomery multiplication of the input by the RR constant
# (2^512 mod P-256).  The address of RR is formed relative to the "pic"
# label, whose run-time address _picup_eax delivers in %eax.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_to_mont");
        &mov    ($ap, &wparam(1));
        &call   ("_picup_eax");
    &set_label("pic");
        &lea    ($bp, &DWP(&label("RR")."-".&label("pic"), "eax"));
        if ($sse2) {
            # first word of OPENSSL_ia32cap_P -> %eax; it is examined
            # by _ecp_nistz256_mul_mont
            &picmeup("eax", "OPENSSL_ia32cap_P", "eax", &label("pic"));
            &mov    ("eax", &DWP(0, "eax"));
        }
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_mul_mont");
    &function_end("ecp_nistz256_to_mont");
}
437 | ||
########################################################################
# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Emitted as a Montgomery multiplication of the input by the ONE
# constant (plain integer 1).  The address of ONE is formed relative to
# the "pic" label, whose run-time address _picup_eax delivers in %eax.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_from_mont");
        &mov    ($ap, &wparam(1));
        &call   ("_picup_eax");
    &set_label("pic");
        &lea    ($bp, &DWP(&label("ONE")."-".&label("pic"), "eax"));
        if ($sse2) {
            # first word of OPENSSL_ia32cap_P -> %eax; it is examined
            # by _ecp_nistz256_mul_mont
            &picmeup("eax", "OPENSSL_ia32cap_P", "eax", &label("pic"));
            &mov    ("eax", &DWP(0, "eax"));
        }
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_mul_mont");
    &function_end("ecp_nistz256_from_mont");
}
451 | ||
########################################################################
# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
#					     const BN_ULONG ebp[8]);
#
# Public entry point: loads the operand pointers into %esi/%ebp and the
# output pointer into %edi, then defers to the internal routine.  In
# SSE2-enabled builds the first word of OPENSSL_ia32cap_P is loaded into
# %eax beforehand.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_mul_mont");
        &mov    ($ap, &wparam(1));
        &mov    ($bp, &wparam(2));
        if ($sse2) {
            &call   ("_picup_eax");
    &set_label("pic");
            &picmeup("eax", "OPENSSL_ia32cap_P", "eax", &label("pic"));
            &mov    ("eax", &DWP(0, "eax"));
        }
        &mov    ($rp, &wparam(0));
        &call   ("_ecp_nistz256_mul_mont");
    &function_end("ecp_nistz256_mul_mont");
}
466 | ||
########################################################################
# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Squaring is emitted as a Montgomery multiplication of the input by
# itself: both operand pointers refer to the same buffer.  In
# SSE2-enabled builds the first word of OPENSSL_ia32cap_P is loaded into
# %eax beforehand.
{
    my ($rp, $ap, $bp) = ("edi", "esi", "ebp");

    &function_begin("ecp_nistz256_sqr_mont");
        &mov    ($ap, &wparam(1));
        if ($sse2) {
            &call   ("_picup_eax");
    &set_label("pic");
            &picmeup("eax", "OPENSSL_ia32cap_P", "eax", &label("pic"));
            &mov    ("eax", &DWP(0, "eax"));
        }
        &mov    ($rp, &wparam(0));
        &mov    ($bp, $ap);                 # second operand = first
        &call   ("_ecp_nistz256_mul_mont");
    &function_end("ecp_nistz256_sqr_mont");
}
480 | ||
481 | &function_begin_B("_ecp_nistz256_mul_mont"); | |
482 | if ($sse2) { | |
483 | &and ("eax",1<<24|1<<26); | |
484 | &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on | |
485 | &jne (&label("mul_mont_ialu")); | |
486 | ||
487 | ######################################## | |
488 | # SSE2 code path featuring 32x16-bit | |
489 | # multiplications is ~2x faster than | |
490 | # IALU counterpart (except on Atom)... | |
491 | ######################################## | |
492 | # stack layout: | |
493 | # +------------------------------------+< %esp | |
494 | # | 7 16-byte temporary XMM words, | | |
495 | # | "sliding" toward lower address | | |
496 | # . . | |
497 | # +------------------------------------+ | |
498 | # | unused XMM word | | |
499 | # +------------------------------------+< +128,%ebx | |
500 | # | 8 16-byte XMM words holding copies | | |
501 | # | of a[i]<<64|a[i] | | |
502 | # . . | |
503 | # . . | |
504 | # +------------------------------------+< +256 | |
505 | &mov ("edx","esp"); | |
506 | &sub ("esp",0x100); | |
507 | ||
508 | &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy | |
509 | &lea ("ebp",&DWP(4,"ebp")); | |
510 | &pcmpeqd("xmm6","xmm6"); | |
511 | &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff | |
512 | ||
513 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | |
514 | &and ("esp",-64); | |
515 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
516 | &lea ("ebx",&DWP(0x80,"esp")); | |
517 | ||
518 | &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy | |
519 | &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy | |
520 | &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... | |
521 | &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] | |
522 | &pmuludq("xmm0","xmm7"); # a[0]*b[0] | |
523 | ||
524 | &movd ("xmm2",&DWP(4*2,"esi")); | |
525 | &pshufd ("xmm1","xmm1",0b11001100); | |
526 | &movdqa (&QWP(0x10,"ebx"),"xmm1"); | |
527 | &pmuludq("xmm1","xmm7"); # a[1]*b[0] | |
528 | ||
529 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
530 | &pslldq("xmm4",6); | |
531 | &paddq ("xmm4","xmm0"); | |
532 | &movdqa("xmm5","xmm4"); | |
533 | &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] | |
534 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] | |
535 | ||
536 | # Upper half of a[0]*b[i] is carried into next multiplication | |
537 | # iteration, while lower one "participates" in actual reduction. | |
538 | # Normally latter is done by accumulating result of multiplication | |
539 | # of modulus by "magic" digit, but thanks to special form of modulus | |
540 | # and "magic" digit it can be performed only with additions and | |
541 | # subtractions (see note in IALU section below). Note that we are | |
542 | # not bothered with carry bits, they are accumulated in "flatten" | |
543 | # phase after all multiplications and reductions. | |
544 | ||
545 | &movd ("xmm3",&DWP(4*3,"esi")); | |
546 | &pshufd ("xmm2","xmm2",0b11001100); | |
547 | &movdqa (&QWP(0x20,"ebx"),"xmm2"); | |
548 | &pmuludq("xmm2","xmm7"); # a[2]*b[0] | |
549 | &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry | |
550 | &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] | |
551 | ||
552 | &movd ("xmm0",&DWP(4*4,"esi")); | |
553 | &pshufd ("xmm3","xmm3",0b11001100); | |
554 | &movdqa (&QWP(0x30,"ebx"),"xmm3"); | |
555 | &pmuludq("xmm3","xmm7"); # a[3]*b[0] | |
556 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
557 | ||
558 | &movd ("xmm1",&DWP(4*5,"esi")); | |
559 | &pshufd ("xmm0","xmm0",0b11001100); | |
560 | &movdqa (&QWP(0x40,"ebx"),"xmm0"); | |
561 | &pmuludq("xmm0","xmm7"); # a[4]*b[0] | |
562 | &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step | |
563 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
564 | ||
565 | &movd ("xmm2",&DWP(4*6,"esi")); | |
566 | &pshufd ("xmm1","xmm1",0b11001100); | |
567 | &movdqa (&QWP(0x50,"ebx"),"xmm1"); | |
568 | &pmuludq("xmm1","xmm7"); # a[5]*b[0] | |
569 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
570 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
571 | ||
572 | &movd ("xmm3",&DWP(4*7,"esi")); | |
573 | &pshufd ("xmm2","xmm2",0b11001100); | |
574 | &movdqa (&QWP(0x60,"ebx"),"xmm2"); | |
575 | &pmuludq("xmm2","xmm7"); # a[6]*b[0] | |
576 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
577 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
578 | ||
579 | &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy | |
580 | &pshufd ("xmm3","xmm3",0b11001100); | |
581 | &movdqa (&QWP(0x70,"ebx"),"xmm3"); | |
582 | &pmuludq("xmm3","xmm7"); # a[7]*b[0] | |
583 | ||
584 | &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y | |
585 | &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | |
586 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
587 | ||
588 | &mov ("ecx",6); | |
589 | &lea ("ebp",&DWP(4,"ebp")); | |
590 | &jmp (&label("madd_sse2")); | |
591 | ||
592 | &set_label("madd_sse2",16); | |
593 | &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] | |
594 | &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] | |
595 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | |
596 | &pmuludq("xmm0","xmm7"); # a[0]*b[i] | |
597 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
598 | ||
599 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | |
600 | &pmuludq("xmm1","xmm7"); # a[1]*b[i] | |
601 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
602 | &paddq ("xmm0",&QWP(0x00,"esp")); | |
603 | ||
604 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | |
605 | &pmuludq("xmm2","xmm7"); # a[2]*b[i] | |
606 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
607 | &pslldq("xmm4",6); | |
608 | &paddq ("xmm1",&QWP(0x10,"esp")); | |
609 | &paddq ("xmm4","xmm0"); | |
610 | &movdqa("xmm5","xmm4"); | |
611 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | |
612 | ||
613 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | |
614 | &pmuludq("xmm3","xmm7"); # a[3]*b[i] | |
615 | &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry | |
616 | &paddq ("xmm2",&QWP(0x20,"esp")); | |
617 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | |
618 | ||
619 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | |
620 | &pmuludq("xmm0","xmm7"); # a[4]*b[i] | |
621 | &paddq ("xmm3",&QWP(0x30,"esp")); | |
622 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
623 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | |
624 | ||
50292917 | 625 | &movdqa ("xmm2",&QWP(0x60,"ebx")); |
aa9db2d2 AP |
626 | &pmuludq("xmm1","xmm7"); # a[5]*b[i] |
627 | &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step | |
628 | &paddq ("xmm0",&QWP(0x40,"esp")); | |
629 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
630 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
631 | ||
632 | &movdqa ("xmm3","xmm7"); | |
633 | &pmuludq("xmm2","xmm7"); # a[6]*b[i] | |
634 | &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy | |
635 | &lea ("ebp",&DWP(4,"ebp")); | |
636 | &paddq ("xmm1",&QWP(0x50,"esp")); | |
637 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
638 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
639 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | |
640 | ||
641 | &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] | |
642 | &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
643 | &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | |
644 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
645 | &paddq ("xmm2",&QWP(0x60,"esp")); | |
646 | ||
647 | &dec ("ecx"); | |
648 | &jnz (&label("madd_sse2")); | |
649 | ||
650 | &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] | |
651 | &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] | |
652 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | |
653 | &pmuludq("xmm0","xmm7"); # a[0]*b[7] | |
654 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
655 | ||
656 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | |
657 | &pmuludq("xmm1","xmm7"); # a[1]*b[7] | |
658 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
659 | &paddq ("xmm0",&QWP(0x00,"esp")); | |
660 | ||
661 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | |
662 | &pmuludq("xmm2","xmm7"); # a[2]*b[7] | |
663 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
664 | &pslldq("xmm4",6); | |
665 | &paddq ("xmm1",&QWP(0x10,"esp")); | |
666 | &paddq ("xmm4","xmm0"); | |
667 | &movdqa("xmm5","xmm4"); | |
668 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | |
669 | ||
670 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | |
671 | &pmuludq("xmm3","xmm7"); # a[3]*b[7] | |
672 | &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry | |
673 | &paddq ("xmm2",&QWP(0x20,"esp")); | |
674 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | |
675 | ||
676 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | |
677 | &pmuludq("xmm0","xmm7"); # a[4]*b[7] | |
678 | &paddq ("xmm3",&QWP(0x30,"esp")); | |
679 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
680 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | |
681 | ||
50292917 | 682 | &movdqa ("xmm2",&QWP(0x60,"ebx")); |
aa9db2d2 AP |
683 | &pmuludq("xmm1","xmm7"); # a[5]*b[7] |
684 | &paddq ("xmm3","xmm5"); # reduction step | |
685 | &paddq ("xmm0",&QWP(0x40,"esp")); | |
686 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
687 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
688 | ||
689 | &movdqa ("xmm3",&QWP(0x70,"ebx")); | |
690 | &pmuludq("xmm2","xmm7"); # a[6]*b[7] | |
691 | &paddq ("xmm1",&QWP(0x50,"esp")); | |
692 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
693 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
694 | ||
695 | &pmuludq("xmm3","xmm7"); # a[7]*b[7] | |
696 | &pcmpeqd("xmm7","xmm7"); | |
697 | &movdqa ("xmm0",&QWP(0x00,"esp")); | |
698 | &pslldq ("xmm7",8); | |
699 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
700 | &paddq ("xmm2",&QWP(0x60,"esp")); | |
701 | ||
702 | &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step | |
703 | &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step | |
704 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
705 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
706 | ||
707 | &movdqa ("xmm1",&QWP(0x10,"esp")); | |
708 | &movdqa ("xmm2",&QWP(0x20,"esp")); | |
709 | &movdqa ("xmm3",&QWP(0x30,"esp")); | |
710 | ||
711 | &movq ("xmm4","xmm0"); # "flatten" | |
712 | &pand ("xmm0","xmm7"); | |
713 | &xor ("ebp","ebp"); | |
714 | &pslldq ("xmm4",6); | |
715 | &movq ("xmm5","xmm1"); | |
716 | &paddq ("xmm0","xmm4"); | |
717 | &pand ("xmm1","xmm7"); | |
718 | &psrldq ("xmm0",6); | |
719 | &movd ("eax","xmm0"); | |
720 | &psrldq ("xmm0",4); | |
721 | ||
722 | &paddq ("xmm5","xmm0"); | |
723 | &movdqa ("xmm0",&QWP(0x40,"esp")); | |
724 | &sub ("eax",-1); # start subtracting modulus, | |
725 | # this is used to determine | |
726 | # if result is larger/smaller | |
727 | # than modulus (see below) | |
728 | &pslldq ("xmm5",6); | |
729 | &movq ("xmm4","xmm2"); | |
730 | &paddq ("xmm1","xmm5"); | |
731 | &pand ("xmm2","xmm7"); | |
732 | &psrldq ("xmm1",6); | |
733 | &mov (&DWP(4*0,"edi"),"eax"); | |
734 | &movd ("eax","xmm1"); | |
735 | &psrldq ("xmm1",4); | |
736 | ||
737 | &paddq ("xmm4","xmm1"); | |
738 | &movdqa ("xmm1",&QWP(0x50,"esp")); | |
739 | &sbb ("eax",-1); | |
740 | &pslldq ("xmm4",6); | |
741 | &movq ("xmm5","xmm3"); | |
742 | &paddq ("xmm2","xmm4"); | |
743 | &pand ("xmm3","xmm7"); | |
744 | &psrldq ("xmm2",6); | |
745 | &mov (&DWP(4*1,"edi"),"eax"); | |
746 | &movd ("eax","xmm2"); | |
747 | &psrldq ("xmm2",4); | |
748 | ||
749 | &paddq ("xmm5","xmm2"); | |
750 | &movdqa ("xmm2",&QWP(0x60,"esp")); | |
751 | &sbb ("eax",-1); | |
752 | &pslldq ("xmm5",6); | |
753 | &movq ("xmm4","xmm0"); | |
754 | &paddq ("xmm3","xmm5"); | |
755 | &pand ("xmm0","xmm7"); | |
756 | &psrldq ("xmm3",6); | |
757 | &mov (&DWP(4*2,"edi"),"eax"); | |
758 | &movd ("eax","xmm3"); | |
759 | &psrldq ("xmm3",4); | |
760 | ||
761 | &paddq ("xmm4","xmm3"); | |
762 | &sbb ("eax",0); | |
763 | &pslldq ("xmm4",6); | |
764 | &movq ("xmm5","xmm1"); | |
765 | &paddq ("xmm0","xmm4"); | |
766 | &pand ("xmm1","xmm7"); | |
767 | &psrldq ("xmm0",6); | |
768 | &mov (&DWP(4*3,"edi"),"eax"); | |
769 | &movd ("eax","xmm0"); | |
770 | &psrldq ("xmm0",4); | |
771 | ||
772 | &paddq ("xmm5","xmm0"); | |
773 | &sbb ("eax",0); | |
774 | &pslldq ("xmm5",6); | |
775 | &movq ("xmm4","xmm2"); | |
776 | &paddq ("xmm1","xmm5"); | |
777 | &pand ("xmm2","xmm7"); | |
778 | &psrldq ("xmm1",6); | |
779 | &movd ("ebx","xmm1"); | |
780 | &psrldq ("xmm1",4); | |
781 | &mov ("esp","edx"); | |
782 | ||
783 | &paddq ("xmm4","xmm1"); | |
784 | &pslldq ("xmm4",6); | |
785 | &paddq ("xmm2","xmm4"); | |
786 | &psrldq ("xmm2",6); | |
787 | &movd ("ecx","xmm2"); | |
788 | &psrldq ("xmm2",4); | |
789 | &sbb ("ebx",0); | |
790 | &movd ("edx","xmm2"); | |
791 | &pextrw ("esi","xmm2",2); # top-most overflow bit | |
792 | &sbb ("ecx",1); | |
793 | &sbb ("edx",-1); | |
794 | &sbb ("esi",0); # borrow from subtraction | |
795 | ||
796 | # Final step is "if result > mod, subtract mod", and at this point | |
797 | # we have result - mod written to output buffer, as well as borrow | |
798 | # bit from this subtraction, and if borrow bit is set, we add | |
799 | # modulus back. | |
800 | # | |
801 | # Note that because mod has special form, i.e. consists of | |
802 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | |
803 | # assigning borrow bit to one register, %ebp, and its negative | |
804 | # to another, %esi. But we started by calculating %esi... | |
805 | ||
806 | &sub ("ebp","esi"); | |
807 | &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero | |
808 | &adc (&DWP(4*1,"edi"),"esi"); | |
809 | &adc (&DWP(4*2,"edi"),"esi"); | |
810 | &adc (&DWP(4*3,"edi"),0); | |
811 | &adc ("eax",0); | |
812 | &adc ("ebx",0); | |
813 | &mov (&DWP(4*4,"edi"),"eax"); | |
814 | &adc ("ecx","ebp"); | |
815 | &mov (&DWP(4*5,"edi"),"ebx"); | |
816 | &adc ("edx","esi"); | |
817 | &mov (&DWP(4*6,"edi"),"ecx"); | |
818 | &mov (&DWP(4*7,"edi"),"edx"); | |
819 | ||
820 | &ret (); | |
821 | ||
822 | &set_label("mul_mont_ialu",16); } | |
823 | ||
########################################
# IALU code path suitable for all CPUs.
########################################
# stack layout:
# +------------------------------------+< %esp
# | 8 32-bit temporary words, accessed |
# | as circular buffer                 |
# .                                    .
# .                                    .
# +------------------------------------+< +32
# | offloaded destination pointer      |
# +------------------------------------+
# | unused                             |
# +------------------------------------+< +40
#
# First row of the schoolbook multiplication: t[0..8] = a[0..7]*b[0].
# %esi points at a[], %ebp at b[], and %edi (the destination) is parked
# on the stack so the register can serve as the top-most carry.  When
# this run ends t[0..6] live in the stack buffer, %ecx holds t[7], %edx
# holds t[8], %eax holds t[0] and %edi is zero.
    &sub ("esp",10*4);

    &mov ("eax",&DWP(0*4,"esi"));           # a[0]
    &mov ("ebx",&DWP(0*4,"ebp"));           # b[0]
    &mov (&DWP(8*4,"esp"),"edi");           # off-load dst ptr

    &mul ("ebx");                           # a[0]*b[0]
    &mov (&DWP(0*4,"esp"),"eax");           # t[0]
    &mov ("eax",&DWP(1*4,"esi"));
    &mov ("ecx","edx");

    # a[1]..a[6] all follow the same pattern: accumulate the low half
    # of the product into the running carry word, store it as t[k] and
    # start over with the high half.
    for my $k (1..6) {
        &mul ("ebx");                       # a[k]*b[0]
        &add ("ecx","eax");
        &mov ("eax",&DWP(($k+1)*4,"esi"));  # pre-load a[k+1]
        &adc ("edx",0);
        &mov (&DWP($k*4,"esp"),"ecx");      # t[k]
        &mov ("ecx","edx");
    }

    &xor ("edi","edi");                     # initial top-most carry
    &mul ("ebx");                           # a[7]*b[0]
    &add ("ecx","eax");                     # t[7]
    &mov ("eax",&DWP(0*4,"esp"));           # t[0]
    &adc ("edx",0);                         # t[8]
896 | ||
# Seven interleaved "reduce, then multiply by the next word of b[]"
# rounds.  t[] is addressed modulo 8 as a circular buffer on the stack;
# %ecx/%edx carry t[7]/t[8] between rounds, %eax carries t[0] and %edi
# the top-most carry.  $i is deliberately the file-level variable: the
# code that follows the loop relies on its final value.
for ($i=0;$i<7;$i++) {
    my $j=$i+1;

    # Reduction iteration is normally performed by accumulating
    # result of multiplication of modulus by "magic" digit [and
    # omitting least significant word, which is guaranteed to
    # be 0], but thanks to special form of modulus and "magic"
    # digit being equal to least significant word, it can be
    # performed with additions and subtractions alone. Indeed:
    #
    #            ffff.0001.0000.0000.0000.ffff.ffff.ffff
    # *                                             abcd
    # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
    #
    # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
    # rewrite above as:
    #
    #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
    # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
    # -      abcd.0000.0000.0000.0000.0000.0000.abcd
    #
    # or marking redundant operations:
    #
    #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
    # + abcd.0000.abcd.0000.0000.abcd.----.----.----
    # -      abcd.----.----.----.----.----.----.----

    &add (&DWP((($i+3)%8)*4,"esp"),"eax");  # t[3]+=t[0]
    &adc (&DWP((($i+4)%8)*4,"esp"),0);      # t[4]+=0
    &adc (&DWP((($i+5)%8)*4,"esp"),0);      # t[5]+=0
    &adc (&DWP((($i+6)%8)*4,"esp"),"eax");  # t[6]+=t[0]
    &adc ("ecx",0);                         # t[7]+=0
    &adc ("edx","eax");                     # t[8]+=t[0]
    &adc ("edi",0);                         # top-most carry
    &mov ("ebx",&DWP($j*4,"ebp"));          # next word of b[], b[$j]
    &sub ("ecx","eax");                     # t[7]-=t[0]
    &mov ("eax",&DWP(0*4,"esi"));           # a[0]
    &sbb ("edx",0);                         # t[8]-=0
    &mov (&DWP((($i+7)%8)*4,"esp"),"ecx");
    # Top-most carry.  Keep in mind that the net result is *addition*
    # of a value with (abcd<<32)-abcd on top, so underflow is
    # impossible, because (abcd<<32)-abcd doesn't underflow.
    &sbb ("edi",0);
    &mov (&DWP((($i+8)%8)*4,"esp"),"edx");

    &mul ("ebx");                           # a[0]*b[$j]
    &add ("eax",&DWP((($j+0)%8)*4,"esp"));
    &adc ("edx",0);
    &mov (&DWP((($j+0)%8)*4,"esp"),"eax");
    &mov ("eax",&DWP(1*4,"esi"));
    &mov ("ecx","edx");

    # a[1]..a[6] share one pattern: add the stored t[] word and the
    # low product half into the running carry word, write it back and
    # continue with the high half.
    for my $k (1..6) {
        &mul ("ebx");                       # a[k]*b[$j]
        &add ("ecx",&DWP((($j+$k)%8)*4,"esp"));
        &adc ("edx",0);
        &add ("ecx","eax");
        &adc ("edx",0);
        &mov ("eax",&DWP(($k+1)*4,"esi"));  # pre-load a[k+1]
        &mov (&DWP((($j+$k)%8)*4,"esp"),"ecx");
        &mov ("ecx","edx");
    }

    &mul ("ebx");                           # a[7]*b[$j]
    &add ("ecx",&DWP((($j+7)%8)*4,"esp"));
    &adc ("edx",0);
    &add ("ecx","eax");                     # t[7]
    &mov ("eax",&DWP((($j+0)%8)*4,"esp"));  # t[0]
    &adc ("edx","edi");                     # t[8]
    &mov ("edi",0);                         # mov, not xor: keeps CF intact
    &adc ("edi",0);                         # top-most carry
}
1018 | &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr | |
1019 | &xor ("esi","esi"); | |
1020 | my $j=$i+1; | |
1021 | ||
1022 | # last multiplication-less reduction | |
1023 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | |
1024 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | |
1025 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | |
1026 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | |
1027 | &adc ("ecx",0); # t[7]+=0 | |
1028 | &adc ("edx","eax"); # t[8]+=t[0] | |
1029 | &adc ("edi",0); # top-most carry | |
1030 | &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); | |
1031 | &sub ("ecx","eax"); # t[7]-=t[0] | |
1032 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); | |
1033 | &sbb ("edx",0); # t[8]-=0 | |
1034 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | |
1035 | &sbb ("edi",0); # top-most carry | |
1036 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | |
1037 | ||
1038 | # Final step is "if result > mod, subtract mod", but we do it | |
1039 | # "other way around", namely write result - mod to output buffer | |
1040 | # and if subtraction borrowed, add modulus back. | |
1041 | ||
1042 | &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); | |
1043 | &sub ("eax",-1); | |
1044 | &mov ("edx",&DWP((($j+3)%8)*4,"esp")); | |
1045 | &sbb ("ebx",-1); | |
1046 | &mov (&DWP(0*4,"ebp"),"eax"); | |
1047 | &sbb ("ecx",-1); | |
1048 | &mov (&DWP(1*4,"ebp"),"ebx"); | |
1049 | &sbb ("edx",0); | |
1050 | &mov (&DWP(2*4,"ebp"),"ecx"); | |
1051 | &mov (&DWP(3*4,"ebp"),"edx"); | |
1052 | ||
1053 | &mov ("eax",&DWP((($j+4)%8)*4,"esp")); | |
1054 | &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); | |
1055 | &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); | |
1056 | &sbb ("eax",0); | |
1057 | &mov ("edx",&DWP((($j+7)%8)*4,"esp")); | |
1058 | &sbb ("ebx",0); | |
1059 | &sbb ("ecx",1); | |
1060 | &sbb ("edx",-1); | |
1061 | &sbb ("edi",0); | |
1062 | ||
1063 | # Note that because mod has special form, i.e. consists of | |
1064 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | |
1065 | # assigning borrow bit to one register, %ebp, and its negative | |
1066 | # to another, %esi. But we started by calculating %esi... | |
1067 | ||
1068 | &sub ("esi","edi"); | |
1069 | &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero | |
1070 | &adc (&DWP(1*4,"ebp"),"edi"); | |
1071 | &adc (&DWP(2*4,"ebp"),"edi"); | |
1072 | &adc (&DWP(3*4,"ebp"),0); | |
1073 | &adc ("eax",0); | |
1074 | &adc ("ebx",0); | |
1075 | &mov (&DWP(4*4,"ebp"),"eax"); | |
1076 | &adc ("ecx","esi"); | |
1077 | &mov (&DWP(5*4,"ebp"),"ebx"); | |
1078 | &adc ("edx","edi"); | |
1079 | &mov (&DWP(6*4,"ebp"),"ecx"); | |
1080 | &mov ("edi","ebp"); # fulfill contract | |
1081 | &mov (&DWP(7*4,"ebp"),"edx"); | |
1082 | ||
1083 | &add ("esp",10*4); | |
1084 | &ret (); | |
1085 | &function_end_B("_ecp_nistz256_mul_mont"); | |
1086 | ||
########################################################################
# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi,
#                              int ebp);
#
# Copies the 96 bytes (24 32-bit words) at %esi into the table at %edi.
# The words are interleaved: word w of entry number %ebp is written to
# table + 4*(ebp-1) + 64*w, i.e. the index is 1-based and consecutive
# words of one entry are 64 bytes apart.
&function_begin("ecp_nistz256_scatter_w5");
    &mov ("edi",&wparam(0));
    &mov ("esi",&wparam(1));
    &mov ("ebp",&wparam(2));

    # Bias the pointer by +128 so the four stores below can use the
    # displacements -128..+64; the -4 accounts for the 1-based index.
    &lea ("edi",&DWP(128-4,"edi","ebp",4));
    &mov ("ebp",96/16);                     # 6 rounds of 4 words each
&set_label("scatter_w5_loop");
    &mov ("eax",&DWP(0,"esi"));
    &mov ("ebx",&DWP(4,"esi"));
    &mov ("ecx",&DWP(8,"esi"));
    &mov ("edx",&DWP(12,"esi"));
    &lea ("esi",&DWP(16,"esi"));
    &mov (&DWP(64*0-128,"edi"),"eax");
    &mov (&DWP(64*1-128,"edi"),"ebx");
    &mov (&DWP(64*2-128,"edi"),"ecx");
    &mov (&DWP(64*3-128,"edi"),"edx");
    &lea ("edi",&DWP(64*4,"edi"));
    &dec ("ebp");
    &jnz (&label("scatter_w5_loop"));
&function_end("ecp_nistz256_scatter_w5");
1111 | ||
########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi,
#                             int ebp);
#
# Reads back the 24-word entry number %ebp from the interleaved table
# at %esi (word w lives at table + 4*(ebp-1) + 64*w) and stores it
# contiguously at %edi.  A zero index yields an all-zero result: the
# mask computed below is 0 in that case and is ANDed into every word.
&function_begin("ecp_nistz256_gather_w5");
    &mov ("esi",&wparam(1));
    &mov ("ebp",&wparam(2));

    &lea ("esi",&DWP(0,"esi","ebp",4));     # table + 4*index
    &neg ("ebp");
    &sar ("ebp",31);                        # -1 for positive index, else 0
    &mov ("edi",&wparam(0));
    &lea ("esi",&DWP(0,"esi","ebp",4));     # step back one slot when mask is -1

    for($i=0;$i<24;$i+=4) {
        &mov ("eax",&DWP(64*($i+0),"esi"));
        &mov ("ebx",&DWP(64*($i+1),"esi"));
        &mov ("ecx",&DWP(64*($i+2),"esi"));
        &mov ("edx",&DWP(64*($i+3),"esi"));
        &and ("eax","ebp");
        &and ("ebx","ebp");
        &and ("ecx","ebp");
        &and ("edx","ebp");
        &mov (&DWP(4*($i+0),"edi"),"eax");
        &mov (&DWP(4*($i+1),"edi"),"ebx");
        &mov (&DWP(4*($i+2),"edi"),"ecx");
        &mov (&DWP(4*($i+3),"edi"),"edx");
    }
&function_end("ecp_nistz256_gather_w5");
1140 | ||
########################################################################
# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi,
#                              int ebp);
#
# Byte-interleaved variant of the scatter: the 64 bytes at %esi are
# written one byte at a time, byte b of entry number %ebp landing at
# table + (ebp-1) + 64*b.
&function_begin("ecp_nistz256_scatter_w7");
    &mov ("edi",&wparam(0));
    &mov ("esi",&wparam(1));
    &mov ("ebp",&wparam(2));

    &lea ("edi",&DWP(-1,"edi","ebp"));      # 1-based index
    &mov ("ebp",64/4);                      # 16 rounds of 4 bytes each
&set_label("scatter_w7_loop");
    &mov ("eax",&DWP(0,"esi"));
    &lea ("esi",&DWP(4,"esi"));
    &mov (&BP(64*0,"edi"),"al");
    &mov (&BP(64*1,"edi"),"ah");
    &shr ("eax",16);                        # expose the upper two bytes
    &mov (&BP(64*2,"edi"),"al");
    &mov (&BP(64*3,"edi"),"ah");
    &lea ("edi",&DWP(64*4,"edi"));
    &dec ("ebp");
    &jnz (&label("scatter_w7_loop"));
&function_end("ecp_nistz256_scatter_w7");
1163 | ||
########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi,
#                             int ebp);
#
# Reads back the 64-byte entry number %ebp from the byte-interleaved
# table at %esi (byte b lives at table + (ebp-1) + 64*b) and stores it
# contiguously at %edi.  A zero index yields an all-zero result: the
# mask computed below is 0 in that case and is ANDed into every byte.
&function_begin("ecp_nistz256_gather_w7");
    &mov ("esi",&wparam(1));
    &mov ("ebp",&wparam(2));

    &add ("esi","ebp");                     # table + index
    &neg ("ebp");
    &sar ("ebp",31);                        # -1 for positive index, else 0
    &mov ("edi",&wparam(0));
    &lea ("esi",&DWP(0,"esi","ebp"));       # step back one byte when mask is -1

    for($i=0;$i<64;$i+=4) {
        &movz ("eax",&BP(64*($i+0),"esi"));
        &movz ("ebx",&BP(64*($i+1),"esi"));
        &movz ("ecx",&BP(64*($i+2),"esi"));
        &and ("eax","ebp");
        &movz ("edx",&BP(64*($i+3),"esi"));
        &and ("ebx","ebp");
        &mov (&BP($i+0,"edi"),"al");
        &and ("ecx","ebp");
        &mov (&BP($i+1,"edi"),"bl");
        &and ("edx","ebp");
        &mov (&BP($i+2,"edi"),"cl");
        &mov (&BP($i+3,"edi"),"dl");
    }
&function_end("ecp_nistz256_gather_w7");
1192 | ||
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
# Each step below loads %esi/%ebp (operands) and %edi (destination) and
# calls one of the _ecp_nistz256_* helpers; the trailing comment names
# the equivalent C-level operation.  _ecp_nistz256_mul_mont additionally
# receives the saved OPENSSL_ia32cap_P word in %eax.
#
# "point_double_shortcut" is a second entry point that skips the
# prologue: it expects the frame to be allocated already, %esi to point
# at the input point and %ebp to hold the OPENSSL_ia32cap_P copy.
&static_label("point_double_shortcut");
&function_begin("ecp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

    &mov ("esi",&wparam(1));

    # above map() describes stack layout with 5 temporary
    # 256-bit vectors on top, then we take extra word for
    # OPENSSL_ia32cap_P copy.
    &stack_push(8*5+1);
                        if ($sse2) {
    &call ("_picup_eax");
    &set_label("pic");
    &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
    &mov ("ebp",&DWP(0,"edx"));         }

&set_label("point_double_shortcut");
    # copy in_x to the stack, four words per pass
    for my $half (0,16) {
        &mov ("eax",&DWP($half+0,"esi"));
        &mov ("ebx",&DWP($half+4,"esi"));
        &mov ("ecx",&DWP($half+8,"esi"));
        &mov ("edx",&DWP($half+12,"esi"));
        &mov (&DWP($in_x+$half+0,"esp"),"eax");
        &mov (&DWP($in_x+$half+4,"esp"),"ebx");
        &mov (&DWP($in_x+$half+8,"esp"),"ecx");
        &mov (&DWP($in_x+$half+12,"esp"),"edx");
    }
    &mov (&DWP(32*5,"esp"),"ebp");      # OPENSSL_ia32cap_P copy

    &lea ("ebp",&DWP(32,"esi"));
    &lea ("esi",&DWP(32,"esi"));
    &lea ("edi",&DWP($S,"esp"));
    &call ("_ecp_nistz256_add");        # p256_mul_by_2(S, in_y);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &mov ("esi",64);
    &add ("esi",&wparam(1));
    &lea ("edi",&DWP($Zsqr,"esp"));
    &mov ("ebp","esi");
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(Zsqr, in_z);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($S,"esp"));
    &lea ("ebp",&DWP($S,"esp"));
    &lea ("edi",&DWP($S,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(S, S);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &mov ("ebp",&wparam(1));
    &lea ("esi",&DWP(32,"ebp"));
    &lea ("ebp",&DWP(64,"ebp"));
    &lea ("edi",&DWP($tmp0,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(tmp0, in_z, in_y);

    &lea ("esi",&DWP($in_x,"esp"));
    &lea ("ebp",&DWP($Zsqr,"esp"));
    &lea ("edi",&DWP($M,"esp"));
    &call ("_ecp_nistz256_add");        # p256_add(M, in_x, Zsqr);

    &mov ("edi",64);
    &lea ("esi",&DWP($tmp0,"esp"));
    &lea ("ebp",&DWP($tmp0,"esp"));
    &add ("edi",&wparam(0));
    &call ("_ecp_nistz256_add");        # p256_mul_by_2(res_z, tmp0);

    &lea ("esi",&DWP($in_x,"esp"));
    &lea ("ebp",&DWP($Zsqr,"esp"));
    &lea ("edi",&DWP($Zsqr,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(Zsqr, in_x, Zsqr);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($S,"esp"));
    &lea ("ebp",&DWP($S,"esp"));
    &lea ("edi",&DWP($tmp0,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(tmp0, S);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($M,"esp"));
    &lea ("ebp",&DWP($Zsqr,"esp"));
    &lea ("edi",&DWP($M,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(M, M, Zsqr);

    &mov ("edi",32);
    &lea ("esi",&DWP($tmp0,"esp"));
    &add ("edi",&wparam(0));
    &call ("_ecp_nistz256_div_by_2");   # p256_div_by_2(res_y, tmp0);

    &lea ("esi",&DWP($M,"esp"));
    &lea ("ebp",&DWP($M,"esp"));
    &lea ("edi",&DWP($tmp0,"esp"));
    &call ("_ecp_nistz256_add");        # 1/2 p256_mul_by_3(M, M);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in_x,"esp"));
    &lea ("ebp",&DWP($S,"esp"));
    &lea ("edi",&DWP($S,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S, S, in_x);

    &lea ("esi",&DWP($tmp0,"esp"));
    &lea ("ebp",&DWP($M,"esp"));
    &lea ("edi",&DWP($M,"esp"));
    &call ("_ecp_nistz256_add");        # 2/2 p256_mul_by_3(M, M);

    &lea ("esi",&DWP($S,"esp"));
    &lea ("ebp",&DWP($S,"esp"));
    &lea ("edi",&DWP($tmp0,"esp"));
    &call ("_ecp_nistz256_add");        # p256_mul_by_2(tmp0, S);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($M,"esp"));
    &lea ("ebp",&DWP($M,"esp"));
    &mov ("edi",&wparam(0));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(res_x, M);

    &mov ("esi","edi");                 # %edi is still res_x here
    &lea ("ebp",&DWP($tmp0,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(res_x, res_x, tmp0);

    &lea ("esi",&DWP($S,"esp"));
    &mov ("ebp","edi");                 # %edi is still res_x
    &lea ("edi",&DWP($S,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(S, S, res_x);

    &mov ("eax",&DWP(32*5,"esp"));      # OPENSSL_ia32cap_P copy
    &mov ("esi","edi");                 # %edi is still &S
    &lea ("ebp",&DWP($M,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S, S, M);

    &mov ("ebp",32);
    &lea ("esi",&DWP($S,"esp"));
    &add ("ebp",&wparam(0));
    &mov ("edi","ebp");
    &call ("_ecp_nistz256_sub");        # p256_sub(res_y, S, res_y);

    &stack_pop(8*5+1);
} &function_end("ecp_nistz256_point_double");
1342 | ||
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#                             const P256_POINT *in2);
#
# Both inputs are first copied to the stack; while copying, the first
# 64 bytes of each are ORed together to derive the !in1infty and
# !in2infty masks (all ones when any of those bytes is non-zero).
# Each later step loads %esi/%ebp (operands) and %edi (destination)
# and calls a _ecp_nistz256_* helper; the trailing comment names the
# equivalent C-level operation.  The result is finally selected between
# the computed sum, in2 and in1 with the two masks.
&function_begin("ecp_nistz256_point_add");
{   my ($res_x,$res_y,$res_z,
        $in1_x,$in1_y,$in1_z,
        $in2_x,$in2_y,$in2_z,
        $H,$Hsqr,$R,$Rsqr,$Hcub,
        $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

    &mov ("esi",&wparam(2));

    # above map() describes stack layout with 18 temporary
    # 256-bit vectors on top, then we take extra words for
    # !in1infty, !in2infty, result of check for zero and
    # OPENSSL_ia32cap_P copy. [one unused word for padding]
    &stack_push(8*18+5);
                        if ($sse2) {
    &call ("_picup_eax");
    &set_label("pic");
    &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
    &mov ("ebp",&DWP(0,"edx"));         }

    &lea ("edi",&DWP($in2_x,"esp"));
    for($i=0;$i<96;$i+=16) {
        &mov ("eax",&DWP($i+0,"esi"));      # copy in2
        &mov ("ebx",&DWP($i+4,"esi"));
        &mov ("ecx",&DWP($i+8,"esi"));
        &mov ("edx",&DWP($i+12,"esi"));
        &mov (&DWP($i+0,"edi"),"eax");
        # %ebp still holds the capability word here; park it before
        # the register is reused as the OR accumulator.
        &mov (&DWP(32*18+12,"esp"),"ebp")   if ($i==0);
        &mov ("ebp","eax")                  if ($i==0);
        &or  ("ebp","eax")                  if ($i!=0 && $i<64);
        &mov (&DWP($i+4,"edi"),"ebx");
        &or  ("ebp","ebx")                  if ($i<64);
        &mov (&DWP($i+8,"edi"),"ecx");
        &or  ("ebp","ecx")                  if ($i<64);
        &mov (&DWP($i+12,"edi"),"edx");
        &or  ("ebp","edx")                  if ($i<64);
    }
    &xor ("eax","eax");
    &mov ("esi",&wparam(1));
    &sub ("eax","ebp");
    &or  ("ebp","eax");                 # sign bit set iff %ebp was non-zero
    &sar ("ebp",31);
    &mov (&DWP(32*18+4,"esp"),"ebp");   # !in2infty

    &lea ("edi",&DWP($in1_x,"esp"));
    for($i=0;$i<96;$i+=16) {
        &mov ("eax",&DWP($i+0,"esi"));      # copy in1
        &mov ("ebx",&DWP($i+4,"esi"));
        &mov ("ecx",&DWP($i+8,"esi"));
        &mov ("edx",&DWP($i+12,"esi"));
        &mov (&DWP($i+0,"edi"),"eax");
        &mov ("ebp","eax")                  if ($i==0);
        &or  ("ebp","eax")                  if ($i!=0 && $i<64);
        &mov (&DWP($i+4,"edi"),"ebx");
        &or  ("ebp","ebx")                  if ($i<64);
        &mov (&DWP($i+8,"edi"),"ecx");
        &or  ("ebp","ecx")                  if ($i<64);
        &mov (&DWP($i+12,"edi"),"edx");
        &or  ("ebp","edx")                  if ($i<64);
    }
    &xor ("eax","eax");
    &sub ("eax","ebp");
    &or  ("ebp","eax");                 # sign bit set iff %ebp was non-zero
    &sar ("ebp",31);
    &mov (&DWP(32*18+0,"esp"),"ebp");   # !in1infty

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in2_z,"esp"));
    &lea ("ebp",&DWP($in2_z,"esp"));
    &lea ("edi",&DWP($Z2sqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(Z2sqr, in2_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in1_z,"esp"));
    &lea ("ebp",&DWP($in1_z,"esp"));
    &lea ("edi",&DWP($Z1sqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(Z1sqr, in1_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($Z2sqr,"esp"));
    &lea ("ebp",&DWP($in2_z,"esp"));
    &lea ("edi",&DWP($S1,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S1, Z2sqr, in2_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($Z1sqr,"esp"));
    &lea ("ebp",&DWP($in1_z,"esp"));
    &lea ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S2, Z1sqr, in1_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in1_y,"esp"));
    &lea ("ebp",&DWP($S1,"esp"));
    &lea ("edi",&DWP($S1,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S1, S1, in1_y);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in2_y,"esp"));
    &lea ("ebp",&DWP($S2,"esp"));
    &lea ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S2, S2, in2_y);

    &lea ("esi",&DWP($S2,"esp"));
    &lea ("ebp",&DWP($S1,"esp"));
    &lea ("edi",&DWP($R,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(R, S2, S1);

    &or  ("ebx","eax");                 # see if result is zero
    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &or  ("ebx","ecx");
    &or  ("ebx","edx");
    &or  ("ebx",&DWP(0,"edi"));
    &or  ("ebx",&DWP(4,"edi"));
    &lea ("esi",&DWP($in1_x,"esp"));
    &or  ("ebx",&DWP(8,"edi"));
    &lea ("ebp",&DWP($Z2sqr,"esp"));
    &or  ("ebx",&DWP(12,"edi"));
    &lea ("edi",&DWP($U1,"esp"));
    &mov (&DWP(32*18+8,"esp"),"ebx");   # remember the zero check on R

    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(U1, in1_x, Z2sqr);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in2_x,"esp"));
    &lea ("ebp",&DWP($Z1sqr,"esp"));
    &lea ("edi",&DWP($U2,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(U2, in2_x, Z1sqr);

    &lea ("esi",&DWP($U2,"esp"));
    &lea ("ebp",&DWP($U1,"esp"));
    &lea ("edi",&DWP($H,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(H, U2, U1);

    &or  ("eax","ebx");                 # see if result is zero
    &or  ("eax","ecx");
    &or  ("eax","edx");
    &or  ("eax",&DWP(0,"edi"));
    &or  ("eax",&DWP(4,"edi"));
    &or  ("eax",&DWP(8,"edi"));
    &or  ("eax",&DWP(12,"edi"));

    &data_byte(0x3e);                   # predict taken
    &jnz (&label("add_proceed"));       # is_equal(U1,U2)?

    &mov ("eax",&DWP(32*18+0,"esp"));
    &and ("eax",&DWP(32*18+4,"esp"));
    &mov ("ebx",&DWP(32*18+8,"esp"));
    &jz  (&label("add_proceed"));       # (in1infty || in2infty)?
    &test ("ebx","ebx");
    &jz  (&label("add_double"));        # is_equal(S1,S2)?

    # U1==U2 but S1!=S2 with both inputs finite: store 96 zero
    # bytes to the output.
    &mov ("edi",&wparam(0));
    &xor ("eax","eax");
    &mov ("ecx",96/4);
    &data_byte(0xfc,0xf3,0xab);         # cld; rep stosd
    &jmp (&label("add_done"));

&set_label("add_double",16);
    # Same point twice: hand over to ecp_nistz256_point_double with
    # its expected register state and a frame shrunk to its size.
    &mov ("esi",&wparam(1));
    &mov ("ebp",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &add ("esp",4*((8*18+5)-(8*5+1)));  # difference in frame sizes
    &jmp (&label("point_double_shortcut"));

&set_label("add_proceed",16);
    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($R,"esp"));
    &lea ("ebp",&DWP($R,"esp"));
    &lea ("edi",&DWP($Rsqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(Rsqr, R);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($H,"esp"));
    &lea ("ebp",&DWP($in1_z,"esp"));
    &lea ("edi",&DWP($res_z,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(res_z, H, in1_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($H,"esp"));
    &lea ("ebp",&DWP($H,"esp"));
    &lea ("edi",&DWP($Hsqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_sqr_mont(Hsqr, H);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($in2_z,"esp"));
    &lea ("ebp",&DWP($res_z,"esp"));
    &lea ("edi",&DWP($res_z,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(res_z, res_z, in2_z);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($Hsqr,"esp"));
    &lea ("ebp",&DWP($U1,"esp"));
    &lea ("edi",&DWP($U2,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(U2, U1, Hsqr);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($H,"esp"));
    &lea ("ebp",&DWP($Hsqr,"esp"));
    &lea ("edi",&DWP($Hcub,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(Hcub, Hsqr, H);

    &lea ("esi",&DWP($U2,"esp"));
    &lea ("ebp",&DWP($U2,"esp"));
    &lea ("edi",&DWP($Hsqr,"esp"));
    &call ("_ecp_nistz256_add");        # p256_mul_by_2(Hsqr, U2);

    &lea ("esi",&DWP($Rsqr,"esp"));
    &lea ("ebp",&DWP($Hsqr,"esp"));
    &lea ("edi",&DWP($res_x,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(res_x, Rsqr, Hsqr);

    &lea ("esi",&DWP($res_x,"esp"));
    &lea ("ebp",&DWP($Hcub,"esp"));
    &lea ("edi",&DWP($res_x,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(res_x, res_x, Hcub);

    &lea ("esi",&DWP($U2,"esp"));
    &lea ("ebp",&DWP($res_x,"esp"));
    &lea ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(res_y, U2, res_x);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($Hcub,"esp"));
    &lea ("ebp",&DWP($S1,"esp"));
    &lea ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(S2, S1, Hcub);

    &mov ("eax",&DWP(32*18+12,"esp"));  # OPENSSL_ia32cap_P copy
    &lea ("esi",&DWP($R,"esp"));
    &lea ("ebp",&DWP($res_y,"esp"));
    &lea ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_mul_mont");   # p256_mul_mont(res_y, R, res_y);

    &lea ("esi",&DWP($res_y,"esp"));
    &lea ("ebp",&DWP($S2,"esp"));
    &lea ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_sub");        # p256_sub(res_y, res_y, S2);

    # Build three selection masks:
    #   %edx = !in1infty & !in2infty   -> take the computed result
    #   %ebp =  in1infty & !in2infty   -> take in2
    #   %esi =  in2infty               -> take in1
    &mov ("ebp",&DWP(32*18+0,"esp"));   # !in1infty
    &mov ("esi",&DWP(32*18+4,"esp"));   # !in2infty
    &mov ("edi",&wparam(0));
    &mov ("edx","ebp");
    &not ("ebp");
    &and ("edx","esi");
    &and ("ebp","esi");
    &not ("esi");

    ########################################
    # conditional moves
    for($i=64;$i<96;$i+=4) {
        &mov ("eax","edx");
        &and ("eax",&DWP($res_x+$i,"esp"));
        &mov ("ebx","ebp");
        &and ("ebx",&DWP($in2_x+$i,"esp"));
        &mov ("ecx","esi");
        &and ("ecx",&DWP($in1_x+$i,"esp"));
        &or  ("eax","ebx");
        &or  ("eax","ecx");
        &mov (&DWP($i,"edi"),"eax");
    }
    for($i=0;$i<64;$i+=4) {
        &mov ("eax","edx");
        &and ("eax",&DWP($res_x+$i,"esp"));
        &mov ("ebx","ebp");
        &and ("ebx",&DWP($in2_x+$i,"esp"));
        &mov ("ecx","esi");
        &and ("ecx",&DWP($in1_x+$i,"esp"));
        &or  ("eax","ebx");
        &or  ("eax","ecx");
        &mov (&DWP($i,"edi"),"eax");
    }
&set_label("add_done");
    &stack_pop(8*18+5);
} &function_end("ecp_nistz256_point_add");
1620 | ||
########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,
#                                    const P256_POINT *in1,
#                                    const P256_POINT_AFFINE *in2);
#
# Emits the x86 routine that adds the affine point in2 (x,y only, 64
# bytes) to the point in1 (x,y,z, 96 bytes) and writes 96 bytes to out.
#
# Outline of the generated code:
#   1. Copy in1 and in2 into the stack frame and, while copying, OR
#      their words together to build two all-ones/all-zeros masks:
#      "!in1infty" (non-zero words among bytes 0..63 of in1, i.e. its
#      x and y slots) and "!in2infty" (non-zero words among the 64
#      bytes of in2).
#   2. Run the field-arithmetic sequence annotated call by call below
#      to produce res_x, res_y, res_z.
#   3. Select the output word by word with the masks (no branches):
#        both masks set        -> computed result
#        only !in2infty set    -> in2, with z taken from @ONE_mont
#        !in2infty clear       -> in1
#
# Helper register usage, as implied by the call-site annotations
# (helpers are defined elsewhere in this file - confirm there):
#   esi, ebp - operand pointers; edi - result pointer;
#   eax      - OPENSSL_ia32cap_P word, reloaded before every
#              _ecp_nistz256_mul_mont call.
&function_begin("ecp_nistz256_point_add_affine");
{
    # Byte offsets of the fifteen 256-bit temporaries inside the frame.
    my ($res_x,$res_y,$res_z,
        $in1_x,$in1_y,$in1_z,
        $in2_x,$in2_y,
        $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
    # Z1sqr shares its slot with S2.
    my $Z1sqr = $S2;
    # 2^256 mod p as eight 32-bit words, lowest address first, i.e. the
    # value 1 in Montgomery form; used as z when in2 is selected.
    my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

    &mov  ("esi",&wparam(1));                # in1, fetched before the frame is set up

    # above map() describes stack layout with 15 temporary
    # 256-bit vectors on top, then we take extra words for
    # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy.
    &stack_push(8*15+3);
                                                if ($sse2) {
    &call ("_picup_eax");
    &set_label("pic");
    &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
    &mov  ("ebp",&DWP(0,"edx"));             }

    # Copy the 96 bytes of in1 to the frame, 16 bytes per iteration.
    # ebp first carries the capability word (saved on the first
    # iteration, before ebp is reused), then accumulates the OR of the
    # words at offsets 0..63.
    # NOTE(review): without SSE2 ebp is not loaded above, so the saved
    # word is only meaningful in SSE2 builds - presumably ignored
    # otherwise; confirm in _ecp_nistz256_mul_mont.
    &lea  ("edi",&DWP($in1_x,"esp"));
    for($i=0;$i<96;$i+=16) {
        &mov  ("eax",&DWP($i+0,"esi"));      # copy in1
        &mov  ("ebx",&DWP($i+4,"esi"));
        &mov  ("ecx",&DWP($i+8,"esi"));
        &mov  ("edx",&DWP($i+12,"esi"));
        &mov  (&DWP($i+0,"edi"),"eax");
        &mov  (&DWP(32*15+8,"esp"),"ebp")   if ($i==0);
        &mov  ("ebp","eax")                 if ($i==0);
        &or   ("ebp","eax")                 if ($i!=0 && $i<64);
        &mov  (&DWP($i+4,"edi"),"ebx");
        &or   ("ebp","ebx")                 if ($i<64);
        &mov  (&DWP($i+8,"edi"),"ecx");
        &or   ("ebp","ecx")                 if ($i<64);
        &mov  (&DWP($i+12,"edi"),"edx");
        &or   ("ebp","edx")                 if ($i<64);
    }
    # (v | -v) >> 31 (arithmetic) is -1 when v != 0 and 0 when v == 0.
    &xor  ("eax","eax");
    &mov  ("esi",&wparam(2));                # in2
    &sub  ("eax","ebp");
    &or   ("ebp","eax");
    &sar  ("ebp",31);
    &mov  (&DWP(32*15+0,"esp"),"ebp");       # !in1infty

    # Copy the 64 bytes of in2 and OR all of its words into ebp.
    &lea  ("edi",&DWP($in2_x,"esp"));
    for($i=0;$i<64;$i+=16) {
        &mov  ("eax",&DWP($i+0,"esi"));      # copy in2
        &mov  ("ebx",&DWP($i+4,"esi"));
        &mov  ("ecx",&DWP($i+8,"esi"));
        &mov  ("edx",&DWP($i+12,"esi"));
        &mov  (&DWP($i+0,"edi"),"eax");
        &mov  ("ebp","eax")                 if ($i==0);
        &or   ("ebp","eax")                 if ($i!=0);
        &mov  (&DWP($i+4,"edi"),"ebx");
        &or   ("ebp","ebx");
        &mov  (&DWP($i+8,"edi"),"ecx");
        &or   ("ebp","ecx");
        &mov  (&DWP($i+12,"edi"),"edx");
        &or   ("ebp","edx");
    }
    # Same zero test as above, interleaved with the argument set-up
    # for the first multiplication.
    &xor  ("ebx","ebx");
    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &sub  ("ebx","ebp");
    &lea  ("esi",&DWP($in1_z,"esp"));
    &or   ("ebx","ebp");
    &lea  ("ebp",&DWP($in1_z,"esp"));
    &sar  ("ebx",31);
    &lea  ("edi",&DWP($Z1sqr,"esp"));
    &mov  (&DWP(32*15+4,"esp"),"ebx");       # !in2infty

    &call ("_ecp_nistz256_mul_mont");        # p256_sqr_mont(Z1sqr, in1_z);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($in2_x,"esp"));
    &mov  ("ebp","edi");                     # %edi is still &Z1sqr
    &lea  ("edi",&DWP($U2,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(U2, Z1sqr, in2_x);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($in1_z,"esp"));
    &lea  ("ebp",&DWP($Z1sqr,"esp"));
    &lea  ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(S2, Z1sqr, in1_z);

    &lea  ("esi",&DWP($U2,"esp"));
    &lea  ("ebp",&DWP($in1_x,"esp"));
    &lea  ("edi",&DWP($H,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(H, U2, in1_x);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($in2_y,"esp"));
    &lea  ("ebp",&DWP($S2,"esp"));
    &lea  ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(S2, S2, in2_y);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($in1_z,"esp"));
    &lea  ("ebp",&DWP($H,"esp"));
    &lea  ("edi",&DWP($res_z,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(res_z, H, in1_z);

    &lea  ("esi",&DWP($S2,"esp"));
    &lea  ("ebp",&DWP($in1_y,"esp"));
    &lea  ("edi",&DWP($R,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(R, S2, in1_y);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($H,"esp"));
    &lea  ("ebp",&DWP($H,"esp"));
    &lea  ("edi",&DWP($Hsqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_sqr_mont(Hsqr, H);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($R,"esp"));
    &lea  ("ebp",&DWP($R,"esp"));
    &lea  ("edi",&DWP($Rsqr,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_sqr_mont(Rsqr, R);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($in1_x,"esp"));
    &lea  ("ebp",&DWP($Hsqr,"esp"));
    &lea  ("edi",&DWP($U2,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(U2, in1_x, Hsqr);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($H,"esp"));
    &lea  ("ebp",&DWP($Hsqr,"esp"));
    &lea  ("edi",&DWP($Hcub,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(Hcub, Hsqr, H);

    # Doubling is done as U2+U2; the Hsqr slot is reused for the sum.
    &lea  ("esi",&DWP($U2,"esp"));
    &lea  ("ebp",&DWP($U2,"esp"));
    &lea  ("edi",&DWP($Hsqr,"esp"));
    &call ("_ecp_nistz256_add");             # p256_mul_by_2(Hsqr, U2);

    &lea  ("esi",&DWP($Rsqr,"esp"));
    &lea  ("ebp",&DWP($Hsqr,"esp"));
    &lea  ("edi",&DWP($res_x,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(res_x, Rsqr, Hsqr);

    &lea  ("esi",&DWP($res_x,"esp"));
    &lea  ("ebp",&DWP($Hcub,"esp"));
    &lea  ("edi",&DWP($res_x,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(res_x, res_x, Hcub);

    &lea  ("esi",&DWP($U2,"esp"));
    &lea  ("ebp",&DWP($res_x,"esp"));
    &lea  ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(res_y, U2, res_x);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($Hcub,"esp"));
    &lea  ("ebp",&DWP($in1_y,"esp"));
    &lea  ("edi",&DWP($S2,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(S2, Hcub, in1_y);

    &mov  ("eax",&DWP(32*15+8,"esp"));       # OPENSSL_ia32cap_P copy
    &lea  ("esi",&DWP($R,"esp"));
    &lea  ("ebp",&DWP($res_y,"esp"));
    &lea  ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_mul_mont");        # p256_mul_mont(res_y, res_y, R);

    &lea  ("esi",&DWP($res_y,"esp"));
    &lea  ("ebp",&DWP($S2,"esp"));
    &lea  ("edi",&DWP($res_y,"esp"));
    &call ("_ecp_nistz256_sub");             # p256_sub(res_y, res_y, S2);

    # Build the three selection masks:
    #   edx =  !in1infty & !in2infty   -> take the computed result
    #   ebp = ~!in1infty & !in2infty   -> take in2 (z from @ONE_mont)
    #   esi = ~!in2infty               -> take in1
    &mov  ("ebp",&DWP(32*15+0,"esp"));       # !in1infty
    &mov  ("esi",&DWP(32*15+4,"esp"));       # !in2infty
    &mov  ("edi",&wparam(0));                # out
    &mov  ("edx","ebp");
    &not  ("ebp");
    &and  ("edx","esi");
    &and  ("ebp","esi");
    &not  ("esi");

    ########################################
    # conditional moves
    # z coordinate (bytes 64..95): in2 has no stored z, so the in2 term
    # is the matching @ONE_mont word, folded in at generation time:
    #   word == 0   -> term omitted
    #   word == -1  -> the mask itself (mask & -1)
    #   otherwise   -> mask & word
    for($i=64;$i<96;$i+=4) {
        my $one=$ONE_mont[($i-64)/4];

        &mov  ("eax","edx");
        &and  ("eax",&DWP($res_x+$i,"esp"));
        &mov  ("ebx","ebp")                 if ($one && $one!=-1);
        &and  ("ebx",$one)                  if ($one && $one!=-1);
        &mov  ("ecx","esi");
        &and  ("ecx",&DWP($in1_x+$i,"esp"));
        &or   ("eax",$one==-1?"ebp":"ebx")  if ($one);
        &or   ("eax","ecx");
        &mov  (&DWP($i,"edi"),"eax");
    }
    # x and y coordinates (bytes 0..63): plain three-way masked select.
    for($i=0;$i<64;$i+=4) {
        &mov  ("eax","edx");
        &and  ("eax",&DWP($res_x+$i,"esp"));
        &mov  ("ebx","ebp");
        &and  ("ebx",&DWP($in2_x+$i,"esp"));
        &mov  ("ecx","esi");
        &and  ("ecx",&DWP($in1_x+$i,"esp"));
        &or   ("eax","ebx");
        &or   ("eax","ecx");
        &mov  (&DWP($i,"edi"),"eax");
    }
    &stack_pop(8*15+3);
} &function_end("ecp_nistz256_point_add_affine");
1830 | ||
1831 | &asm_finish(); |