#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for x86/SSE2.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#		with/without -DECP_NISTZ256_ASM
# Pentium	+66-163%
# PIII		+72-172%
# P4		+65-132%
# Core2		+90-215%
# Sandy Bridge	+105-265% (contemporary i[57]-* are all close to this)
# Atom		+65-155%
# Opteron	+54-110%
# Bulldozer	+99-240%
# VIA Nano	+93-290%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
# Locate this script's directory so the perlasm framework can be found
# relative to it, then pull in the x86 assembler-generation helpers.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Last command-line argument is the output file; everything emitted via
# perlasm goes through STDOUT, so redirect it there.
$output=pop;
open STDOUT,">$output";

# First argument selects the output flavour (e.g. ELF/COFF/a.out);
# a trailing "386" argument restricts code generation to i386.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Enable the SSE2 code path only when the build passes
# -DOPENSSL_IA32_SSE2; run-time capability is still checked via
# OPENSSL_ia32cap_P in that case.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);
55 | ||
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up first in the current directory, then
# relative to this script; failing both is fatal.
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Each TOBN(hi,lo) pair in the C table becomes two 32-bit words in @arr,
# least-significant word first (hence hex($2) before hex($1)).
foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index or @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

&public_label("ecp_nistz256_precomputed");
&align(4096);
&set_label("ecp_nistz256_precomputed");

########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {			# 37 sub-tables of 64 points each
	@tbl	= splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {	# emit one byte-interleaved row per iteration
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		&data_byte(join(',',map { sprintf "0x%02x",$_} @line));
	}
}
94 | ||
########################################################################
# Keep in mind that constants are stored least to most significant word
&static_label("RR");
&set_label("RR",64);
&data_word(3,0,-1,-5,-2,-1,-3,4);	# 2^512 mod P-256

&static_label("ONE_mont");
&set_label("ONE_mont");
&data_word(1,0,0,-1,-1,-1,-2,0);	# presumably 1 in Montgomery
					# representation (2^256 mod P-256)
					# given the label name

&static_label("ONE");
&set_label("ONE");
&data_word(1,0,0,0,0,0,0,0);		# plain integer 1
&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>");
&align(64);
110 | ||
########################################################################
# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Doubles a 256-bit value by adding the input to itself via the shared
# modular-addition helper.
&function_begin("ecp_nistz256_mul_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&mov	("ebp","esi");
########################################################################
# common pattern for internal functions is that %edi is result pointer,
# %esi and %ebp are input ones, %ebp being optional. %edi is preserved.
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_mul_by_2");
122 | ||
########################################################################
# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("ecp_nistz256_mul_by_3");
	&mov	("esi",&wparam(1));
					# multiplication by 3 is performed
					# as 2*n+n, but we can't use output
					# to store 2*n, because if output
					# pointer equals to input, then
					# we'll get 2*n+2*n.
	&stack_push(8);			# therefore we need to allocate
					# 256-bit intermediate buffer.
	&mov	("edi","esp");
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_add");	# tmp = a+a
	&lea	("esi",&DWP(0,"edi"));	# first addend is the temporary
	&mov	("ebp",&wparam(1));	# second addend is the original input
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");	# out = tmp+a
	&stack_pop(8);
&function_end("ecp_nistz256_mul_by_3");
143 | ||
########################################################################
# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Public entry point; loads the argument registers expected by the
# internal helper and delegates to it.
&function_begin("ecp_nistz256_div_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");
&function_end("ecp_nistz256_div_by_2");
151 | ||
# Internal helper: halve the 256-bit value at %esi modulo the prime,
# writing the result to %edi.  First makes the value even by adding the
# modulus when the input is odd, then shifts the 288-bit intermediate
# (including the carry captured in %esi) right by one.
&function_begin_B("_ecp_nistz256_div_by_2");
	# tmp = a is odd ? a+mod : a
	#
	# note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning least significant bit of input to one register,
	# %ebp, and its negative to another, %edx.

	&mov	("ebp",&DWP(0,"esi"));
	&xor	("edx","edx");
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("eax","ebp");
	&and	("ebp",1);		# %ebp = LSB of input (0 or 1)
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("edx","ebp");		# %edx = 0 or 0xffffffff

	&add	("eax","edx");
	&adc	("ebx","edx");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","edx");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");

	&mov	("eax",&DWP(12,"esi"));
	&mov	("ebx",&DWP(16,"esi"));
	&adc	("eax",0);
	&mov	("ecx",&DWP(20,"esi"));
	&adc	("ebx",0);
	&mov	(&DWP(12,"edi"),"eax");
	&adc	("ecx",0);
	&mov	(&DWP(16,"edi"),"ebx");
	&mov	(&DWP(20,"edi"),"ecx");

	&mov	("eax",&DWP(24,"esi"));
	&mov	("ebx",&DWP(28,"esi"));
	&adc	("eax","ebp");
	&adc	("ebx","edx");
	&mov	(&DWP(24,"edi"),"eax");
	&sbb	("esi","esi");		# broadcast carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	# ret = tmp >> 1
	#
	# Each step shifts one word right and pulls in the low bit of the
	# next-higher word via a shl-by-31/or pairing; loads and stores to
	# %edi are interleaved to keep the pipeline busy.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&mov	("edx",&DWP(12,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(0,"edi"),"eax");
	&or	("ebp","ecx");
	&mov	("eax",&DWP(16,"edi"));

	&shr	("ebx",1);
	&mov	("ecx","edx");
	&shl	("edx",31);
	&mov	(&DWP(4,"edi"),"ebp");
	&or	("ebx","edx");
	&mov	("ebp",&DWP(20,"edi"));

	&shr	("ecx",1);
	&mov	("edx","eax");
	&shl	("eax",31);
	&mov	(&DWP(8,"edi"),"ebx");
	&or	("ecx","eax");
	&mov	("ebx",&DWP(24,"edi"));

	&shr	("edx",1);
	&mov	("eax","ebp");
	&shl	("ebp",31);
	&mov	(&DWP(12,"edi"),"ecx");
	&or	("edx","ebp");
	&mov	("ecx",&DWP(28,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&mov	(&DWP(16,"edi"),"edx");
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(20,"edi"),"eax");
	&or	("ebp","ecx");

	&shr	("ebx",1);
	&shl	("esi",31);
	&mov	(&DWP(24,"edi"),"ebp");
	&or	("ebx","esi");		# handle top-most carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	&ret	();
&function_end_B("_ecp_nistz256_div_by_2");
253 | ||
########################################################################
# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
#
# Public entry point for modular addition; loads the register-based
# calling convention (%edi=out, %esi/%ebp=inputs) and delegates.
&function_begin("ecp_nistz256_add");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_add");
263 | ||
# Internal helper: %edi = (%esi + %ebp) mod P-256.  Performs a plain
# 256-bit addition (carry-out captured in %esi), then a trial
# subtraction of the modulus to decide whether reduction is needed.
&function_begin_B("_ecp_nistz256_add");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&add	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&adc	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&adc	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&adc	("ebx",&DWP(20,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	("esi",0);		# note: mov doesn't touch flags
	&adc	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(24,"edi"),"ecx");
	&adc	("esi",0);		# %esi = carry out of the addition
	&mov	(&DWP(28,"edi"),"edx");

	# if a+b >= modulus, subtract modulus.
	#
	# But since comparison implies subtraction, we subtract modulus
	# to see if it borrows, and then subtract it for real if
	# subtraction didn't borrow.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax",-1);		# modulus words are -1,-1,-1,0,0,0,1,-1
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx",-1);
	&mov	("eax",&DWP(16,"edi"));
	&sbb	("ecx",-1);
	&mov	("ebx",&DWP(20,"edi"));
	&sbb	("edx",0);
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);		# fold borrow into saved carry

	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it
	# by using borrow.

	&not	("esi");		# %esi = 0 (keep) or -1 (reduce)
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);		# %ebp = 0 or 1 (the "1" word of mod)
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_add");
352 | ||
########################################################################
# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
#
# Public entry point for modular subtraction; loads the register-based
# calling convention (%edi=out, %esi/%ebp=inputs) and delegates.
&function_begin("ecp_nistz256_sub");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_sub");
&function_end("ecp_nistz256_sub");
362 | ||
# Internal helper: %edi = (%esi - %ebp) mod P-256.  Performs a plain
# 256-bit subtraction, then conditionally adds the modulus back when
# the subtraction borrowed.
&function_begin_B("_ecp_nistz256_sub");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&sbb	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&sbb	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&sbb	("ebx",&DWP(20,"ebp"));
	&sbb	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("esi","esi");		# broadcast borrow bit
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# if a-b borrows, add modulus.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);		# %ebp = 0 or 1 (the "1" word of mod)
	&mov	("ecx",&DWP(8,"edi"));
	&add	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&adc	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&adc	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&adc	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_sub");
428 | ||
########################################################################
# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Negation is implemented as 0 - a (mod P-256): a 256-bit zero is built
# on the stack and the shared modular-subtraction helper is invoked.
&function_begin("ecp_nistz256_neg");
	&mov	("ebp",&wparam(1));	# subtrahend is the input
	&mov	("edi",&wparam(0));

	&xor	("eax","eax");
	&stack_push(8);			# 256-bit zero buffer
	&mov	(&DWP(0,"esp"),"eax");
	&mov	("esi","esp");		# minuend is the zero buffer
	&mov	(&DWP(4,"esp"),"eax");
	&mov	(&DWP(8,"esp"),"eax");
	&mov	(&DWP(12,"esp"),"eax");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	(&DWP(20,"esp"),"eax");
	&mov	(&DWP(24,"esp"),"eax");
	&mov	(&DWP(28,"esp"),"eax");

	&call	("_ecp_nistz256_sub");

	&stack_pop(8);
&function_end("ecp_nistz256_neg");
451 | ||
# Position-independence helper: returns its own return address in %eax,
# giving the caller a known code address to compute PIC offsets from.
&function_begin_B("_picup_eax");
	&mov	("eax",&DWP(0,"esp"));
	&ret	();
&function_end_B("_picup_eax");
456 | ||
########################################################################
# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Conversion to Montgomery form is a Montgomery multiplication by RR
# (2^512 mod P-256), located PIC-relative via _picup_eax.
&function_begin("ecp_nistz256_to_mont");
	&mov	("esi",&wparam(1));
	&call	("_picup_eax");
    &set_label("pic");
	&lea	("ebp",&DWP(&label("RR")."-".&label("pic"),"eax"));
						if ($sse2) {
	&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));		}	# capability word for SSE2 dispatch
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_to_mont");
470 | ||
########################################################################
# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Conversion from Montgomery form is a Montgomery multiplication by the
# plain integer 1, located PIC-relative via _picup_eax.
&function_begin("ecp_nistz256_from_mont");
	&mov	("esi",&wparam(1));
	&call	("_picup_eax");
    &set_label("pic");
	&lea	("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax"));
						if ($sse2) {
	&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));		}	# capability word for SSE2 dispatch
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_from_mont");
484 | ||
########################################################################
# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
#					     const BN_ULONG ebp[8]);
#
# Public Montgomery-multiplication entry point; loads arguments and,
# when built with SSE2 support, the capability word the internal helper
# dispatches on.
&function_begin("ecp_nistz256_mul_mont");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));		}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_mul_mont");
499 | ||
########################################################################
# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Squaring reuses the multiplication helper with both inputs aliased
# to the same buffer.
&function_begin("ecp_nistz256_sqr_mont");
	&mov	("esi",&wparam(1));
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));		}
	&mov	("edi",&wparam(0));
	&mov	("ebp","esi");			# b = a
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_sqr_mont");
513 | ||
514 | &function_begin_B("_ecp_nistz256_mul_mont"); | |
515 | if ($sse2) { | |
516 | &and ("eax",1<<24|1<<26); | |
517 | &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on | |
518 | &jne (&label("mul_mont_ialu")); | |
519 | ||
520 | ######################################## | |
521 | # SSE2 code path featuring 32x16-bit | |
522 | # multiplications is ~2x faster than | |
523 | # IALU counterpart (except on Atom)... | |
524 | ######################################## | |
525 | # stack layout: | |
526 | # +------------------------------------+< %esp | |
527 | # | 7 16-byte temporary XMM words, | | |
528 | # | "sliding" toward lower address | | |
529 | # . . | |
530 | # +------------------------------------+ | |
531 | # | unused XMM word | | |
532 | # +------------------------------------+< +128,%ebx | |
533 | # | 8 16-byte XMM words holding copies | | |
534 | # | of a[i]<<64|a[i] | | |
535 | # . . | |
536 | # . . | |
537 | # +------------------------------------+< +256 | |
538 | &mov ("edx","esp"); | |
539 | &sub ("esp",0x100); | |
540 | ||
541 | &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy | |
542 | &lea ("ebp",&DWP(4,"ebp")); | |
543 | &pcmpeqd("xmm6","xmm6"); | |
544 | &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff | |
545 | ||
546 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | |
547 | &and ("esp",-64); | |
548 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
549 | &lea ("ebx",&DWP(0x80,"esp")); | |
550 | ||
551 | &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy | |
552 | &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy | |
553 | &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... | |
554 | &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] | |
555 | &pmuludq("xmm0","xmm7"); # a[0]*b[0] | |
556 | ||
557 | &movd ("xmm2",&DWP(4*2,"esi")); | |
558 | &pshufd ("xmm1","xmm1",0b11001100); | |
559 | &movdqa (&QWP(0x10,"ebx"),"xmm1"); | |
560 | &pmuludq("xmm1","xmm7"); # a[1]*b[0] | |
561 | ||
562 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
563 | &pslldq("xmm4",6); | |
564 | &paddq ("xmm4","xmm0"); | |
565 | &movdqa("xmm5","xmm4"); | |
566 | &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] | |
567 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] | |
568 | ||
569 | # Upper half of a[0]*b[i] is carried into next multiplication | |
570 | # iteration, while lower one "participates" in actual reduction. | |
571 | # Normally latter is done by accumulating result of multiplication | |
572 | # of modulus by "magic" digit, but thanks to special form of modulus | |
573 | # and "magic" digit it can be performed only with additions and | |
574 | # subtractions (see note in IALU section below). Note that we are | |
575 | # not bothered with carry bits, they are accumulated in "flatten" | |
576 | # phase after all multiplications and reductions. | |
577 | ||
578 | &movd ("xmm3",&DWP(4*3,"esi")); | |
579 | &pshufd ("xmm2","xmm2",0b11001100); | |
580 | &movdqa (&QWP(0x20,"ebx"),"xmm2"); | |
581 | &pmuludq("xmm2","xmm7"); # a[2]*b[0] | |
582 | &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry | |
583 | &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] | |
584 | ||
585 | &movd ("xmm0",&DWP(4*4,"esi")); | |
586 | &pshufd ("xmm3","xmm3",0b11001100); | |
587 | &movdqa (&QWP(0x30,"ebx"),"xmm3"); | |
588 | &pmuludq("xmm3","xmm7"); # a[3]*b[0] | |
589 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
590 | ||
591 | &movd ("xmm1",&DWP(4*5,"esi")); | |
592 | &pshufd ("xmm0","xmm0",0b11001100); | |
593 | &movdqa (&QWP(0x40,"ebx"),"xmm0"); | |
594 | &pmuludq("xmm0","xmm7"); # a[4]*b[0] | |
595 | &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step | |
596 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
597 | ||
598 | &movd ("xmm2",&DWP(4*6,"esi")); | |
599 | &pshufd ("xmm1","xmm1",0b11001100); | |
600 | &movdqa (&QWP(0x50,"ebx"),"xmm1"); | |
601 | &pmuludq("xmm1","xmm7"); # a[5]*b[0] | |
602 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
603 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
604 | ||
605 | &movd ("xmm3",&DWP(4*7,"esi")); | |
606 | &pshufd ("xmm2","xmm2",0b11001100); | |
607 | &movdqa (&QWP(0x60,"ebx"),"xmm2"); | |
608 | &pmuludq("xmm2","xmm7"); # a[6]*b[0] | |
609 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
610 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
611 | ||
612 | &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy | |
613 | &pshufd ("xmm3","xmm3",0b11001100); | |
614 | &movdqa (&QWP(0x70,"ebx"),"xmm3"); | |
615 | &pmuludq("xmm3","xmm7"); # a[7]*b[0] | |
616 | ||
617 | &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y | |
618 | &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | |
619 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
620 | ||
621 | &mov ("ecx",6); | |
622 | &lea ("ebp",&DWP(4,"ebp")); | |
623 | &jmp (&label("madd_sse2")); | |
624 | ||
625 | &set_label("madd_sse2",16); | |
626 | &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] | |
627 | &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] | |
628 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | |
629 | &pmuludq("xmm0","xmm7"); # a[0]*b[i] | |
630 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
631 | ||
632 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | |
633 | &pmuludq("xmm1","xmm7"); # a[1]*b[i] | |
634 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
635 | &paddq ("xmm0",&QWP(0x00,"esp")); | |
636 | ||
637 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | |
638 | &pmuludq("xmm2","xmm7"); # a[2]*b[i] | |
639 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
640 | &pslldq("xmm4",6); | |
641 | &paddq ("xmm1",&QWP(0x10,"esp")); | |
642 | &paddq ("xmm4","xmm0"); | |
643 | &movdqa("xmm5","xmm4"); | |
644 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | |
645 | ||
646 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | |
647 | &pmuludq("xmm3","xmm7"); # a[3]*b[i] | |
648 | &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry | |
649 | &paddq ("xmm2",&QWP(0x20,"esp")); | |
650 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | |
651 | ||
652 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | |
653 | &pmuludq("xmm0","xmm7"); # a[4]*b[i] | |
654 | &paddq ("xmm3",&QWP(0x30,"esp")); | |
655 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
656 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | |
657 | ||
50292917 | 658 | &movdqa ("xmm2",&QWP(0x60,"ebx")); |
aa9db2d2 AP |
659 | &pmuludq("xmm1","xmm7"); # a[5]*b[i] |
660 | &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step | |
661 | &paddq ("xmm0",&QWP(0x40,"esp")); | |
662 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
663 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
664 | ||
665 | &movdqa ("xmm3","xmm7"); | |
666 | &pmuludq("xmm2","xmm7"); # a[6]*b[i] | |
667 | &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy | |
668 | &lea ("ebp",&DWP(4,"ebp")); | |
669 | &paddq ("xmm1",&QWP(0x50,"esp")); | |
670 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
671 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
672 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | |
673 | ||
674 | &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] | |
675 | &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | |
676 | &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | |
677 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
678 | &paddq ("xmm2",&QWP(0x60,"esp")); | |
679 | ||
680 | &dec ("ecx"); | |
681 | &jnz (&label("madd_sse2")); | |
682 | ||
683 | &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] | |
684 | &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] | |
685 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | |
686 | &pmuludq("xmm0","xmm7"); # a[0]*b[7] | |
687 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
688 | ||
689 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | |
690 | &pmuludq("xmm1","xmm7"); # a[1]*b[7] | |
691 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
692 | &paddq ("xmm0",&QWP(0x00,"esp")); | |
693 | ||
694 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | |
695 | &pmuludq("xmm2","xmm7"); # a[2]*b[7] | |
696 | &movq ("xmm4","xmm0"); # clear upper 64 bits | |
697 | &pslldq("xmm4",6); | |
698 | &paddq ("xmm1",&QWP(0x10,"esp")); | |
699 | &paddq ("xmm4","xmm0"); | |
700 | &movdqa("xmm5","xmm4"); | |
701 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | |
702 | ||
703 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | |
704 | &pmuludq("xmm3","xmm7"); # a[3]*b[7] | |
705 | &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry | |
706 | &paddq ("xmm2",&QWP(0x20,"esp")); | |
707 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | |
708 | ||
709 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | |
710 | &pmuludq("xmm0","xmm7"); # a[4]*b[7] | |
711 | &paddq ("xmm3",&QWP(0x30,"esp")); | |
712 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | |
713 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | |
714 | ||
50292917 | 715 | &movdqa ("xmm2",&QWP(0x60,"ebx")); |
aa9db2d2 AP |
716 | &pmuludq("xmm1","xmm7"); # a[5]*b[7] |
717 | &paddq ("xmm3","xmm5"); # reduction step | |
718 | &paddq ("xmm0",&QWP(0x40,"esp")); | |
719 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | |
720 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | |
721 | ||
722 | &movdqa ("xmm3",&QWP(0x70,"ebx")); | |
723 | &pmuludq("xmm2","xmm7"); # a[6]*b[7] | |
724 | &paddq ("xmm1",&QWP(0x50,"esp")); | |
725 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | |
726 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | |
727 | ||
728 | &pmuludq("xmm3","xmm7"); # a[7]*b[7] | |
729 | &pcmpeqd("xmm7","xmm7"); | |
730 | &movdqa ("xmm0",&QWP(0x00,"esp")); | |
731 | &pslldq ("xmm7",8); | |
732 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | |
733 | &paddq ("xmm2",&QWP(0x60,"esp")); | |
734 | ||
735 | &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step | |
736 | &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step | |
737 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | |
738 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | |
739 | ||
740 | &movdqa ("xmm1",&QWP(0x10,"esp")); | |
741 | &movdqa ("xmm2",&QWP(0x20,"esp")); | |
742 | &movdqa ("xmm3",&QWP(0x30,"esp")); | |
743 | ||
744 | &movq ("xmm4","xmm0"); # "flatten" | |
745 | &pand ("xmm0","xmm7"); | |
746 | &xor ("ebp","ebp"); | |
747 | &pslldq ("xmm4",6); | |
748 | &movq ("xmm5","xmm1"); | |
749 | &paddq ("xmm0","xmm4"); | |
750 | &pand ("xmm1","xmm7"); | |
751 | &psrldq ("xmm0",6); | |
752 | &movd ("eax","xmm0"); | |
753 | &psrldq ("xmm0",4); | |
754 | ||
755 | &paddq ("xmm5","xmm0"); | |
756 | &movdqa ("xmm0",&QWP(0x40,"esp")); | |
757 | &sub ("eax",-1); # start subtracting modulus, | |
758 | # this is used to determine | |
759 | # if result is larger/smaller | |
760 | # than modulus (see below) | |
761 | &pslldq ("xmm5",6); | |
762 | &movq ("xmm4","xmm2"); | |
763 | &paddq ("xmm1","xmm5"); | |
764 | &pand ("xmm2","xmm7"); | |
765 | &psrldq ("xmm1",6); | |
766 | &mov (&DWP(4*0,"edi"),"eax"); | |
767 | &movd ("eax","xmm1"); | |
768 | &psrldq ("xmm1",4); | |
769 | ||
770 | &paddq ("xmm4","xmm1"); | |
771 | &movdqa ("xmm1",&QWP(0x50,"esp")); | |
772 | &sbb ("eax",-1); | |
773 | &pslldq ("xmm4",6); | |
774 | &movq ("xmm5","xmm3"); | |
775 | &paddq ("xmm2","xmm4"); | |
776 | &pand ("xmm3","xmm7"); | |
777 | &psrldq ("xmm2",6); | |
778 | &mov (&DWP(4*1,"edi"),"eax"); | |
779 | &movd ("eax","xmm2"); | |
780 | &psrldq ("xmm2",4); | |
781 | ||
782 | &paddq ("xmm5","xmm2"); | |
783 | &movdqa ("xmm2",&QWP(0x60,"esp")); | |
784 | &sbb ("eax",-1); | |
785 | &pslldq ("xmm5",6); | |
786 | &movq ("xmm4","xmm0"); | |
787 | &paddq ("xmm3","xmm5"); | |
788 | &pand ("xmm0","xmm7"); | |
789 | &psrldq ("xmm3",6); | |
790 | &mov (&DWP(4*2,"edi"),"eax"); | |
791 | &movd ("eax","xmm3"); | |
792 | &psrldq ("xmm3",4); | |
793 | ||
794 | &paddq ("xmm4","xmm3"); | |
795 | &sbb ("eax",0); | |
796 | &pslldq ("xmm4",6); | |
797 | &movq ("xmm5","xmm1"); | |
798 | &paddq ("xmm0","xmm4"); | |
799 | &pand ("xmm1","xmm7"); | |
800 | &psrldq ("xmm0",6); | |
801 | &mov (&DWP(4*3,"edi"),"eax"); | |
802 | &movd ("eax","xmm0"); | |
803 | &psrldq ("xmm0",4); | |
804 | ||
805 | &paddq ("xmm5","xmm0"); | |
806 | &sbb ("eax",0); | |
807 | &pslldq ("xmm5",6); | |
808 | &movq ("xmm4","xmm2"); | |
809 | &paddq ("xmm1","xmm5"); | |
810 | &pand ("xmm2","xmm7"); | |
811 | &psrldq ("xmm1",6); | |
812 | &movd ("ebx","xmm1"); | |
813 | &psrldq ("xmm1",4); | |
814 | &mov ("esp","edx"); | |
815 | ||
816 | &paddq ("xmm4","xmm1"); | |
817 | &pslldq ("xmm4",6); | |
818 | &paddq ("xmm2","xmm4"); | |
819 | &psrldq ("xmm2",6); | |
820 | &movd ("ecx","xmm2"); | |
821 | &psrldq ("xmm2",4); | |
822 | &sbb ("ebx",0); | |
823 | &movd ("edx","xmm2"); | |
824 | &pextrw ("esi","xmm2",2); # top-most overflow bit | |
825 | &sbb ("ecx",1); | |
826 | &sbb ("edx",-1); | |
827 | &sbb ("esi",0); # borrow from subtraction | |
828 | ||
829 | # Final step is "if result > mod, subtract mod", and at this point | |
830 | # we have result - mod written to output buffer, as well as borrow | |
831 | # bit from this subtraction, and if borrow bit is set, we add | |
832 | # modulus back. | |
833 | # | |
834 | # Note that because mod has special form, i.e. consists of | |
835 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | |
836 | # assigning borrow bit to one register, %ebp, and its negative | |
837 | # to another, %esi. But we started by calculating %esi... | |
838 | ||
839 | &sub ("ebp","esi"); | |
840 | &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero | |
841 | &adc (&DWP(4*1,"edi"),"esi"); | |
842 | &adc (&DWP(4*2,"edi"),"esi"); | |
843 | &adc (&DWP(4*3,"edi"),0); | |
844 | &adc ("eax",0); | |
845 | &adc ("ebx",0); | |
846 | &mov (&DWP(4*4,"edi"),"eax"); | |
847 | &adc ("ecx","ebp"); | |
848 | &mov (&DWP(4*5,"edi"),"ebx"); | |
849 | &adc ("edx","esi"); | |
850 | &mov (&DWP(4*6,"edi"),"ecx"); | |
851 | &mov (&DWP(4*7,"edi"),"edx"); | |
852 | ||
853 | &ret (); | |
854 | ||
855 | &set_label("mul_mont_ialu",16); } | |
856 | ||
857 | ######################################## | |
858 | # IALU code path suitable for all CPUs. | |
859 | ######################################## | |
860 | # stack layout: | |
861 | # +------------------------------------+< %esp | |
862 | # | 8 32-bit temporary words, accessed | | |
863 | # | as circular buffer | | |
864 | # . . | |
865 | # . . | |
866 | # +------------------------------------+< +32 | |
867 | # | offloaded destination pointer | | |
868 | # +------------------------------------+ | |
869 | # | unused | | |
870 | # +------------------------------------+< +40 | |
871 | &sub ("esp",10*4); | |
872 | ||
873 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | |
874 | &mov ("ebx",&DWP(0*4,"ebp")); # b[0] | |
875 | &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr | |
876 | ||
877 | &mul ("ebx"); # a[0]*b[0] | |
878 | &mov (&DWP(0*4,"esp"),"eax"); # t[0] | |
879 | &mov ("eax",&DWP(1*4,"esi")); | |
880 | &mov ("ecx","edx") | |
881 | ||
882 | &mul ("ebx"); # a[1]*b[0] | |
883 | &add ("ecx","eax"); | |
884 | &mov ("eax",&DWP(2*4,"esi")); | |
885 | &adc ("edx",0); | |
886 | &mov (&DWP(1*4,"esp"),"ecx"); # t[1] | |
887 | &mov ("ecx","edx"); | |
888 | ||
889 | &mul ("ebx"); # a[2]*b[0] | |
890 | &add ("ecx","eax"); | |
891 | &mov ("eax",&DWP(3*4,"esi")); | |
892 | &adc ("edx",0); | |
893 | &mov (&DWP(2*4,"esp"),"ecx"); # t[2] | |
894 | &mov ("ecx","edx"); | |
895 | ||
896 | &mul ("ebx"); # a[3]*b[0] | |
897 | &add ("ecx","eax"); | |
898 | &mov ("eax",&DWP(4*4,"esi")); | |
899 | &adc ("edx",0); | |
900 | &mov (&DWP(3*4,"esp"),"ecx"); # t[3] | |
901 | &mov ("ecx","edx"); | |
902 | ||
903 | &mul ("ebx"); # a[4]*b[0] | |
904 | &add ("ecx","eax"); | |
905 | &mov ("eax",&DWP(5*4,"esi")); | |
906 | &adc ("edx",0); | |
907 | &mov (&DWP(4*4,"esp"),"ecx"); # t[4] | |
908 | &mov ("ecx","edx"); | |
909 | ||
910 | &mul ("ebx"); # a[5]*b[0] | |
911 | &add ("ecx","eax"); | |
912 | &mov ("eax",&DWP(6*4,"esi")); | |
913 | &adc ("edx",0); | |
914 | &mov (&DWP(5*4,"esp"),"ecx"); # t[5] | |
915 | &mov ("ecx","edx"); | |
916 | ||
917 | &mul ("ebx"); # a[6]*b[0] | |
918 | &add ("ecx","eax"); | |
919 | &mov ("eax",&DWP(7*4,"esi")); | |
920 | &adc ("edx",0); | |
921 | &mov (&DWP(6*4,"esp"),"ecx"); # t[6] | |
922 | &mov ("ecx","edx"); | |
923 | ||
924 | &xor ("edi","edi"); # initial top-most carry | |
925 | &mul ("ebx"); # a[7]*b[0] | |
926 | &add ("ecx","eax"); # t[7] | |
927 | &mov ("eax",&DWP(0*4,"esp")); # t[0] | |
928 | &adc ("edx",0); # t[8] | |
929 | ||
930 | for ($i=0;$i<7;$i++) { | |
931 | my $j=$i+1; | |
932 | ||
933 | # Reduction iteration is normally performed by accumulating | |
934 | # result of multiplication of modulus by "magic" digit [and | |
935 | # omitting least significant word, which is guaranteed to | |
936 | # be 0], but thanks to special form of modulus and "magic" | |
937 | # digit being equal to least significant word, it can be | |
938 | # performed with additions and subtractions alone. Indeed: | |
939 | # | |
940 | # ffff.0001.0000.0000.0000.ffff.ffff.ffff | |
941 | # * abcd | |
942 | # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | |
943 | # | |
944 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | |
945 | # rewrite above as: | |
946 | # | |
947 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | |
948 | # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 | |
949 | # - abcd.0000.0000.0000.0000.0000.0000.abcd | |
950 | # | |
951 | # or marking redundant operations: | |
952 | # | |
953 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- | |
954 | # + abcd.0000.abcd.0000.0000.abcd.----.----.---- | |
955 | # - abcd.----.----.----.----.----.----.---- | |
956 | ||
957 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | |
958 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | |
959 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | |
960 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | |
961 | &adc ("ecx",0); # t[7]+=0 | |
962 | &adc ("edx","eax"); # t[8]+=t[0] | |
963 | &adc ("edi",0); # top-most carry | |
964 | &mov ("ebx",&DWP($j*4,"ebp")); # b[i] | |
965 | &sub ("ecx","eax"); # t[7]-=t[0] | |
966 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | |
967 | &sbb ("edx",0); # t[8]-=0 | |
968 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | |
969 | &sbb ("edi",0); # top-most carry, | |
970 | # keep in mind that | |
971 | # netto result is | |
972 | # *addition* of value | |
973 | # with (abcd<<32)-abcd | |
974 | # on top, so that | |
975 | # underflow is | |
976 | # impossible, because | |
977 | # (abcd<<32)-abcd | |
978 | # doesn't underflow | |
979 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | |
980 | ||
981 | &mul ("ebx"); # a[0]*b[i] | |
982 | &add ("eax",&DWP((($j+0)%8)*4,"esp")); | |
983 | &adc ("edx",0); | |
984 | &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); | |
985 | &mov ("eax",&DWP(1*4,"esi")); | |
986 | &mov ("ecx","edx") | |
987 | ||
988 | &mul ("ebx"); # a[1]*b[i] | |
989 | &add ("ecx",&DWP((($j+1)%8)*4,"esp")); | |
990 | &adc ("edx",0); | |
991 | &add ("ecx","eax"); | |
992 | &adc ("edx",0); | |
993 | &mov ("eax",&DWP(2*4,"esi")); | |
994 | &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); | |
995 | &mov ("ecx","edx"); | |
996 | ||
997 | &mul ("ebx"); # a[2]*b[i] | |
998 | &add ("ecx",&DWP((($j+2)%8)*4,"esp")); | |
999 | &adc ("edx",0); | |
1000 | &add ("ecx","eax"); | |
1001 | &adc ("edx",0); | |
1002 | &mov ("eax",&DWP(3*4,"esi")); | |
1003 | &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); | |
1004 | &mov ("ecx","edx"); | |
1005 | ||
1006 | &mul ("ebx"); # a[3]*b[i] | |
1007 | &add ("ecx",&DWP((($j+3)%8)*4,"esp")); | |
1008 | &adc ("edx",0); | |
1009 | &add ("ecx","eax"); | |
1010 | &adc ("edx",0); | |
1011 | &mov ("eax",&DWP(4*4,"esi")); | |
1012 | &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); | |
1013 | &mov ("ecx","edx"); | |
1014 | ||
1015 | &mul ("ebx"); # a[4]*b[i] | |
1016 | &add ("ecx",&DWP((($j+4)%8)*4,"esp")); | |
1017 | &adc ("edx",0); | |
1018 | &add ("ecx","eax"); | |
1019 | &adc ("edx",0); | |
1020 | &mov ("eax",&DWP(5*4,"esi")); | |
1021 | &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); | |
1022 | &mov ("ecx","edx"); | |
1023 | ||
1024 | &mul ("ebx"); # a[5]*b[i] | |
1025 | &add ("ecx",&DWP((($j+5)%8)*4,"esp")); | |
1026 | &adc ("edx",0); | |
1027 | &add ("ecx","eax"); | |
1028 | &adc ("edx",0); | |
1029 | &mov ("eax",&DWP(6*4,"esi")); | |
1030 | &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); | |
1031 | &mov ("ecx","edx"); | |
1032 | ||
1033 | &mul ("ebx"); # a[6]*b[i] | |
1034 | &add ("ecx",&DWP((($j+6)%8)*4,"esp")); | |
1035 | &adc ("edx",0); | |
1036 | &add ("ecx","eax"); | |
1037 | &adc ("edx",0); | |
1038 | &mov ("eax",&DWP(7*4,"esi")); | |
1039 | &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); | |
1040 | &mov ("ecx","edx"); | |
1041 | ||
1042 | &mul ("ebx"); # a[7]*b[i] | |
1043 | &add ("ecx",&DWP((($j+7)%8)*4,"esp")); | |
1044 | &adc ("edx",0); | |
1045 | &add ("ecx","eax"); # t[7] | |
1046 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] | |
1047 | &adc ("edx","edi"); # t[8] | |
1048 | &mov ("edi",0); | |
1049 | &adc ("edi",0); # top-most carry | |
1050 | } | |
1051 | &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr | |
1052 | &xor ("esi","esi"); | |
1053 | my $j=$i+1; | |
1054 | ||
1055 | # last multiplication-less reduction | |
1056 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | |
1057 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | |
1058 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | |
1059 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | |
1060 | &adc ("ecx",0); # t[7]+=0 | |
1061 | &adc ("edx","eax"); # t[8]+=t[0] | |
1062 | &adc ("edi",0); # top-most carry | |
1063 | &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); | |
1064 | &sub ("ecx","eax"); # t[7]-=t[0] | |
1065 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); | |
1066 | &sbb ("edx",0); # t[8]-=0 | |
1067 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | |
1068 | &sbb ("edi",0); # top-most carry | |
1069 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | |
1070 | ||
1071 | # Final step is "if result > mod, subtract mod", but we do it | |
1072 | # "other way around", namely write result - mod to output buffer | |
1073 | # and if subtraction borrowed, add modulus back. | |
1074 | ||
1075 | &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); | |
1076 | &sub ("eax",-1); | |
1077 | &mov ("edx",&DWP((($j+3)%8)*4,"esp")); | |
1078 | &sbb ("ebx",-1); | |
1079 | &mov (&DWP(0*4,"ebp"),"eax"); | |
1080 | &sbb ("ecx",-1); | |
1081 | &mov (&DWP(1*4,"ebp"),"ebx"); | |
1082 | &sbb ("edx",0); | |
1083 | &mov (&DWP(2*4,"ebp"),"ecx"); | |
1084 | &mov (&DWP(3*4,"ebp"),"edx"); | |
1085 | ||
1086 | &mov ("eax",&DWP((($j+4)%8)*4,"esp")); | |
1087 | &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); | |
1088 | &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); | |
1089 | &sbb ("eax",0); | |
1090 | &mov ("edx",&DWP((($j+7)%8)*4,"esp")); | |
1091 | &sbb ("ebx",0); | |
1092 | &sbb ("ecx",1); | |
1093 | &sbb ("edx",-1); | |
1094 | &sbb ("edi",0); | |
1095 | ||
1096 | # Note that because mod has special form, i.e. consists of | |
1097 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | |
1098 | # assigning borrow bit to one register, %ebp, and its negative | |
1099 | # to another, %esi. But we started by calculating %esi... | |
1100 | ||
1101 | &sub ("esi","edi"); | |
1102 | &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero | |
1103 | &adc (&DWP(1*4,"ebp"),"edi"); | |
1104 | &adc (&DWP(2*4,"ebp"),"edi"); | |
1105 | &adc (&DWP(3*4,"ebp"),0); | |
1106 | &adc ("eax",0); | |
1107 | &adc ("ebx",0); | |
1108 | &mov (&DWP(4*4,"ebp"),"eax"); | |
1109 | &adc ("ecx","esi"); | |
1110 | &mov (&DWP(5*4,"ebp"),"ebx"); | |
1111 | &adc ("edx","edi"); | |
1112 | &mov (&DWP(6*4,"ebp"),"ecx"); | |
1113 | &mov ("edi","ebp"); # fulfill contract | |
1114 | &mov (&DWP(7*4,"ebp"),"edx"); | |
1115 | ||
1116 | &add ("esp",10*4); | |
1117 | &ret (); | |
1118 | &function_end_B("_ecp_nistz256_mul_mont"); | |
1119 | ||
1120 | ######################################################################## | |
1121 | # void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi, | |
1122 | # int ebp); | |
########################################################################
# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi,
#					int ebp);
#
# Store one 96-byte P256_POINT into slot `index` (ebp) of a table laid
# out column-wise: consecutive 4-byte words of the point are placed
# 64 bytes apart, so that a later constant-time gather touches every
# slot.  The loop runs 96/16 = 6 times, handling 16 bytes per pass.
&function_begin("ecp_nistz256_scatter_w5");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	# edi = table + index*4 - 4 + 128; the -128 bias in the stores
	# below cancels the +128, keeping displacements in signed range.
	&lea	("edi",&DWP(128-4,"edi","ebp",4));
	&mov	("ebp",96/16);			# 6 iterations of 16 bytes
&set_label("scatter_w5_loop");
	&mov	("eax",&DWP(0,"esi"));		# load 4 words of the point
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&lea	("esi",&DWP(16,"esi"));
	&mov	(&DWP(64*0-128,"edi"),"eax");	# scatter with 64-byte stride
	&mov	(&DWP(64*1-128,"edi"),"ebx");
	&mov	(&DWP(64*2-128,"edi"),"ecx");
	&mov	(&DWP(64*3-128,"edi"),"edx");
	&lea	("edi",&DWP(64*4,"edi"));
	&dec	("ebp");
	&jnz	(&label("scatter_w5_loop"));
&function_end("ecp_nistz256_scatter_w5");
1144 | ||
1145 | ######################################################################## | |
1146 | # void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi, | |
1147 | # int ebp); | |
########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi,
#					int ebp);
#
# Constant-time companion of scatter_w5: read back the 24 words of a
# point from the column-wise table.  A mask is derived from the index
# (all-ones when non-zero, zero when the index is 0) and AND-ed into
# every word, so index 0 yields an all-zero point without branching.
&function_begin("ecp_nistz256_gather_w5");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&lea	("esi",&DWP(0,"esi","ebp",4));
	&neg	("ebp");
	&sar	("ebp",31);			# ebp = index ? -1 : 0 (mask)
	&mov	("edi",&wparam(0));
	&lea	("esi",&DWP(0,"esi","ebp",4));	# back up one slot if index!=0

    # 24 words, processed four at a time through eax..edx.
    foreach my $quad (0..5) {
	my $w = 4*$quad;
	my @reg = ("eax","ebx","ecx","edx");
	&mov	($reg[$_],&DWP(64*($w+$_),"esi"))	for (0..3);
	&and	($reg[$_],"ebp")			for (0..3);
	&mov	(&DWP(4*($w+$_),"edi"),$reg[$_])	for (0..3);
    }
&function_end("ecp_nistz256_gather_w5");
1173 | ||
1174 | ######################################################################## | |
1175 | # void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi, | |
1176 | # int ebp); | |
########################################################################
# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi,
#					int ebp);
#
# Byte-granular scatter for the affine (64-byte) table: each byte of
# the input point is stored 64 bytes apart, starting at table+index.
# The loop runs 64/4 = 16 times, emitting 4 bytes per pass.
&function_begin("ecp_nistz256_scatter_w7");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&lea	("edi",&DWP(0,"edi","ebp"));	# edi = table + index
	&mov	("ebp",64/4);			# 16 iterations of 4 bytes
&set_label("scatter_w7_loop");
	&mov	("eax",&DWP(0,"esi"));		# pick up 4 bytes at once
	&lea	("esi",&DWP(4,"esi"));
	&mov	(&BP(64*0,"edi"),"al");		# scatter them byte by byte,
	&mov	(&BP(64*1,"edi"),"ah");		# 64 bytes apart
	&shr	("eax",16);
	&mov	(&BP(64*2,"edi"),"al");
	&mov	(&BP(64*3,"edi"),"ah");
	&lea	("edi",&DWP(64*4,"edi"));
	&dec	("ebp");
	&jnz	(&label("scatter_w7_loop"));
&function_end("ecp_nistz256_scatter_w7");
1196 | ||
1197 | ######################################################################## | |
1198 | # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi, | |
1199 | # int ebp); | |
########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi,
#					int ebp);
#
# Constant-time companion of scatter_w7: reassemble the 64 bytes of an
# affine point from the byte-interleaved table.  As in gather_w5, a
# mask derived from the index (all-ones when non-zero, zero otherwise)
# is AND-ed into every byte, so index 0 produces an all-zero point
# without any data-dependent branch.
&function_begin("ecp_nistz256_gather_w7");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&add	("esi","ebp");			# esi = table + index
	&neg	("ebp");			# was "&neg (\"ebp\")," — comma
						# operator; use ';' for a plain
						# statement, matching gather_w5
	&sar	("ebp",31);			# ebp = index ? -1 : 0 (mask)
	&mov	("edi",&wparam(0));
	&lea	("esi",&DWP(0,"esi","ebp"));	# back up one slot if index!=0

    for($i=0;$i<64;$i+=4) {
	&movz	("eax",&BP(64*($i+0),"esi"));	# gather with 64-byte stride
	&movz	("ebx",&BP(64*($i+1),"esi"));
	&movz	("ecx",&BP(64*($i+2),"esi"));
	&and	("eax","ebp");			# zero the byte if index==0
	&movz	("edx",&BP(64*($i+3),"esi"));
	&and	("ebx","ebp");
	&mov	(&BP($i+0,"edi"),"al");
	&and	("ecx","ebp");
	&mov	(&BP($i+1,"edi"),"bl");
	&and	("edx","ebp");
	&mov	(&BP($i+2,"edi"),"cl");
	&mov	(&BP($i+3,"edi"),"dl");
    }
&function_end("ecp_nistz256_gather_w7");
1225 | ||
1226 | ######################################################################## | |
1227 | # following subroutines are "literal" implementation of those found in | |
1228 | # ecp_nistz256.c | |
1229 | # | |
1230 | ######################################################################## | |
1231 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | |
1232 | # | |
&static_label("point_double_shortcut");

########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
# Jacobian point doubling.  The sequence of _ecp_nistz256_{mul_mont,
# add,sub,div_by_2} calls below mirrors the reference C implementation;
# their order and the aliasing of the stack temporaries are deliberate,
# so no step may be reordered.  %eax carries the OPENSSL_ia32cap_P copy
# into every mul_mont call (it selects the SSE2 vs IALU path there).
&function_begin("ecp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

	&mov	("esi",&wparam(1));

	# above map() describes stack layout with 5 temporary
	# 256-bit vectors on top, then we take extra word for
	# OPENSSL_ia32cap_P copy.
	&stack_push(8*5+1);
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("ebp",&DWP(0,"edx"));		}

# point_add jumps here (with esi and ebp pre-set) when it detects that
# its two inputs are the same point; its larger frame has already been
# trimmed to this function's layout.
&set_label("point_double_shortcut");
	&mov	("eax",&DWP(0,"esi"));		# copy in_x
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP($in_x+0,"esp"),"eax");
	&mov	(&DWP($in_x+4,"esp"),"ebx");
	&mov	(&DWP($in_x+8,"esp"),"ecx");
	&mov	(&DWP($in_x+12,"esp"),"edx");
	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edx",&DWP(28,"esi"));
	&mov	(&DWP($in_x+16,"esp"),"eax");
	&mov	(&DWP($in_x+20,"esp"),"ebx");
	&mov	(&DWP($in_x+24,"esp"),"ecx");
	&mov	(&DWP($in_x+28,"esp"),"edx");
	&mov	(&DWP(32*5,"esp"),"ebp");	# OPENSSL_ia32cap_P copy

	&lea	("ebp",&DWP(32,"esi"));		# in_y
	&lea	("esi",&DWP(32,"esi"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(S, in_y);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&mov	("esi",64);			# offset of in_z
	&add	("esi",&wparam(1));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Zsqr, in_z);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(S, S);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&mov	("ebp",&wparam(1));
	&lea	("esi",&DWP(32,"ebp"));		# in_y
	&lea	("ebp",&DWP(64,"ebp"));		# in_z
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(tmp0, in_z, in_y);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# p256_add(M, in_x, Zsqr);

	&mov	("edi",64);			# offset of res_z
	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(res_z, tmp0);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(Zsqr, in_x, Zsqr);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(M, M, Zsqr);

	&mov	("edi",32);			# offset of res_y
	&lea	("esi",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");	# p256_div_by_2(res_y, tmp0);

	# M = 3*M is done as tmp0 = 2*M followed by M = tmp0 + M.
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# 1/2 p256_mul_by_3(M, M);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, in_x);

	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# 2/2 p256_mul_by_3(M, M);

	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(res_x, M);

	&mov	("esi","edi");			# %edi is still res_x here
	&lea	("ebp",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, tmp0);

	&lea	("esi",&DWP($S,"esp"));
	&mov	("ebp","edi");			# %edi is still res_x
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(S, S, res_x);

	&mov	("eax",&DWP(32*5,"esp"));	# OPENSSL_ia32cap_P copy
	&mov	("esi","edi");			# %edi is still &S
	&lea	("ebp",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, M);

	&mov	("ebp",32);			# offset of res_y
	&lea	("esi",&DWP($S,"esp"));
	&add	("ebp",&wparam(0));
	&mov	("edi","ebp");
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, S, res_y);

	&stack_pop(8*5+1);
} &function_end("ecp_nistz256_point_double");
1375 | ||
1376 | ######################################################################## | |
1377 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | |
1378 | # const P256_POINT *in2); | |
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
#
# Jacobian point addition.  Besides the arithmetic, the routine
# computes three flags while it works: ~in1infty and ~in2infty
# (all-ones when the respective input's Z coordinate is non-zero,
# i.e. the point is NOT at infinity) and zero-checks on S2-S1 and
# U2-U1; if both differences are zero and neither input is at
# infinity, the inputs are the same point and control transfers into
# point_double's shortcut.  The final result is selected between
# res/in1/in2 with branch-free masking.  Statement order is
# load-bearing throughout; do not reorder.
&function_begin("ecp_nistz256_point_add");
{   my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,$in2_z,
	$H,$Hsqr,$R,$Rsqr,$Hcub,
	$U1,$U2,$S1,$S2)=map(32*$_,(0..17));
    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);	# aliased: squares are consumed
						# before H/R overwrite them

	&mov	("esi",&wparam(2));

	# above map() describes stack layout with 18 temporary
	# 256-bit vectors on top, then we take extra words for
	# ~in1infty, ~in2infty, result of check for zero and
	# OPENSSL_ia32cap_P copy. [one unused word for padding]
	&stack_push(8*18+5);
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
	&mov	("ebp",&DWP(0,"edx"));		}

	&lea	("edi",&DWP($in2_x,"esp"));
    for($i=0;$i<96;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in2
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	&mov	(&DWP(32*18+12,"esp"),"ebp")	if ($i==0);	# cap copy
	&mov	("ebp","eax")			if ($i==64);	# OR together
	&or	("ebp","eax")			if ($i>64);	# all in2_z words
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx")			if ($i>=64);
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx")			if ($i>=64);
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx")			if ($i>=64);
    }
	# ebp = OR of in2_z words; turn "non-zero?" into a full-width mask:
	# eax = -ebp; ebp |= eax sets the sign bit iff ebp was non-zero;
	# sar 31 smears it, giving -1 (not infinity) or 0 (infinity).
	&xor	("eax","eax");
	&mov	("esi",&wparam(1));
	&sub	("eax","ebp");
	&or	("ebp","eax");
	&sar	("ebp",31);
	&mov	(&DWP(32*18+4,"esp"),"ebp");	# ~in2infty

	&lea	("edi",&DWP($in1_x,"esp"));
    for($i=0;$i<96;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in1
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	&mov	("ebp","eax")			if ($i==64);	# OR together
	&or	("ebp","eax")			if ($i>64);	# all in1_z words
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx")			if ($i>=64);
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx")			if ($i>=64);
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx")			if ($i>=64);
    }
	&xor	("eax","eax");			# same mask trick as above
	&sub	("eax","ebp");
	&or	("ebp","eax");
	&sar	("ebp",31);
	&mov	(&DWP(32*18+0,"esp"),"ebp");	# ~in1infty

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_z,"esp"));
	&lea	("ebp",&DWP($in2_z,"esp"));
	&lea	("edi",&DWP($Z2sqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z2sqr, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in1_z,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($Z1sqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z1sqr, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Z2sqr,"esp"));
	&lea	("ebp",&DWP($in2_z,"esp"));
	&lea	("edi",&DWP($S1,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S1, Z2sqr, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Z1sqr,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Z1sqr, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in1_y,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($S1,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S1, S1, in1_y);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S2, in2_y);

	&lea	("esi",&DWP($S2,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($R,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(R, S2, S1);

	# _ecp_nistz256_sub leaves the top four result words in
	# eax/ebx/ecx/edx; OR them with the low four in memory to test R==0.
	&or	("ebx","eax");			# see if result is zero
	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&or	("ebx","ecx");
	&or	("ebx","edx");
	&or	("ebx",&DWP(0,"edi"));
	&or	("ebx",&DWP(4,"edi"));
	&lea	("esi",&DWP($in1_x,"esp"));
	&or	("ebx",&DWP(8,"edi"));
	&lea	("ebp",&DWP($Z2sqr,"esp"));
	&or	("ebx",&DWP(12,"edi"));
	&lea	("edi",&DWP($U1,"esp"));
	&mov	(&DWP(32*18+8,"esp"),"ebx");	# ~is_equal(S1,S2)

	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U1, in1_x, Z2sqr);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_x,"esp"));
	&lea	("ebp",&DWP($Z1sqr,"esp"));
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, in2_x, Z1sqr);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($U1,"esp"));
	&lea	("edi",&DWP($H,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(H, U2, U1);

	&or	("eax","ebx");			# see if result is zero
	&or	("eax","ecx");
	&or	("eax","edx");
	&or	("eax",&DWP(0,"edi"));
	&or	("eax",&DWP(4,"edi"));
	&or	("eax",&DWP(8,"edi"));
	&or	("eax",&DWP(12,"edi"));		# ~is_equal(U1,U2)

	&mov	("ebx",&DWP(32*18+0,"esp"));	# ~in1infty
	&not	("ebx");			# -1/0 -> 0/-1
	&or	("eax","ebx");
	&mov	("ebx",&DWP(32*18+4,"esp"));	# ~in2infty
	&not	("ebx");			# -1/0 -> 0/-1
	&or	("eax","ebx");
	&or	("eax",&DWP(32*18+8,"esp"));	# ~is_equal(S1,S2)

	# if (~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
	&data_byte(0x3e);			# predict taken
	&jnz	(&label("add_proceed"));

# Inputs are the same finite point: fall back to doubling.  Shrink this
# frame to point_double's layout and jump into its shortcut with esi
# (input point) and ebp (ia32cap copy) pre-loaded.
&set_label("add_double",16);
	&mov	("esi",&wparam(1));
	&mov	("ebp",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&add	("esp",4*((8*18+5)-(8*5+1)));	# difference in frame sizes
	&jmp	(&label("point_double_shortcut"));

&set_label("add_proceed",16);
	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($R,"esp"));
	&lea	("edi",&DWP($Rsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Rsqr, R);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($res_z,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, H, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($H,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Hsqr, H);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_z,"esp"));
	&lea	("ebp",&DWP($res_z,"esp"));
	&lea	("edi",&DWP($res_z,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, res_z, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Hsqr,"esp"));
	&lea	("ebp",&DWP($U1,"esp"));
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, U1, Hsqr);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($Hcub,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(Hcub, Hsqr, H);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($U2,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(Hsqr, U2);

	&lea	("esi",&DWP($Rsqr,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, Rsqr, Hsqr);

	&lea	("esi",&DWP($res_x,"esp"));
	&lea	("ebp",&DWP($Hcub,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, Hcub);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($res_x,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, U2, res_x);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Hcub,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S1, Hcub);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($res_y,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_y, R, res_y);

	&lea	("esi",&DWP($res_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, res_y, S2);

	&mov	("ebp",&DWP(32*18+0,"esp"));	# ~in1infty
	&mov	("esi",&DWP(32*18+4,"esp"));	# ~in2infty
	&mov	("edi",&wparam(0));
	&mov	("edx","ebp");
	&not	("ebp");
	&and	("edx","esi");			# ~in1infty & ~in2infty
	&and	("ebp","esi");			# in1infty & ~in2infty
	&not	("esi");			# in2infty

	########################################
	# conditional moves: out = (both finite ? res
	#			    : in1 at infinity ? in2 : in1),
	# selected word-by-word with the three disjoint masks above.
	# z coordinates (words 64..95) first, then x|y (words 0..63).
    for($i=64;$i<96;$i+=4) {
	&mov	("eax","edx");			# ~in1infty & ~in2infty
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp");			# in1infty & ~in2infty
	&and	("ebx",&DWP($in2_x+$i,"esp"));
	&mov	("ecx","esi");			# in2infty
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax","ebx");
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
    }
    for($i=0;$i<64;$i+=4) {
	&mov	("eax","edx");			# ~in1infty & ~in2infty
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp");			# in1infty & ~in2infty
	&and	("ebx",&DWP($in2_x+$i,"esp"));
	&mov	("ecx","esi");			# in2infty
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax","ebx");
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
    }
&set_label("add_done");
	&stack_pop(8*18+5);
} &function_end("ecp_nistz256_point_add");
1649 | ||
1650 | ######################################################################## | |
1651 | # void ecp_nistz256_point_add_affine(P256_POINT *out, | |
1652 | # const P256_POINT *in1, | |
1653 | # const P256_POINT_AFFINE *in2); | |
&function_begin("ecp_nistz256_point_add_affine");
{
    # Stack frame: 15 temporary 256-bit field elements addressed as
    # 32-byte offsets from %esp.  $Z1sqr aliases $S2 (their live ranges
    # do not overlap).  @ONE_mont is the constant 1 in Montgomery form,
    # i.e. the implicit Z coordinate of the affine input point in2.
    my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,
	$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
    my $Z1sqr = $S2;
    my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

    &mov	("esi",&wparam(1));

    # above map() describes stack layout with 15 temporary
    # 256-bit vectors on top, then we take extra words for
    # ~in1infty, ~in2infty, and OPENSSL_ia32cap_P copy.
    # [32*15+0]=~in1infty, [32*15+4]=~in2infty, [32*15+8]=ia32cap.
    &stack_push(8*15+3);
						if ($sse2) {
    &call	("_picup_eax");
    &set_label("pic");
    &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
    &mov	("ebp",&DWP(0,"edx"));		}

    # Copy in1 (96 bytes, Jacobian X|Y|Z) into the frame.  While copying,
    # OR all words of in1_z (byte offsets 64..95) into %ebp so that the
    # point at infinity (Z == 0) can be detected without branching.
    &lea	("edi",&DWP($in1_x,"esp"));
   for($i=0;$i<96;$i+=16) {
    &mov	("eax",&DWP($i+0,"esi"));	# copy in1
    &mov	("ebx",&DWP($i+4,"esi"));
    &mov	("ecx",&DWP($i+8,"esi"));
    &mov	("edx",&DWP($i+12,"esi"));
    &mov	(&DWP($i+0,"edi"),"eax");
    &mov	(&DWP(32*15+8,"esp"),"ebp")	if ($i==0);	# stash OPENSSL_ia32cap_P copy
    &mov	("ebp","eax")			if ($i==64);	# start accumulating in1_z
    &or		("ebp","eax")			if ($i>64);
    &mov	(&DWP($i+4,"edi"),"ebx");
    &or		("ebp","ebx")			if ($i>=64);
    &mov	(&DWP($i+8,"edi"),"ecx");
    &or		("ebp","ecx")			if ($i>=64);
    &mov	(&DWP($i+12,"edi"),"edx");
    &or		("ebp","edx")			if ($i>=64);
   }
    # Turn the OR-accumulator into a full-width mask:
    # %ebp = (in1_z != 0) ? -1 : 0, i.e. ~in1infty.
    &xor	("eax","eax");
    &mov	("esi",&wparam(2));
    &sub	("eax","ebp");
    &or		("ebp","eax");
    &sar	("ebp",31);
    &mov	(&DWP(32*15+0,"esp"),"ebp");	# ~in1infty

    # Copy in2 (64 bytes, affine X|Y) into the frame, OR-accumulating
    # every word: an all-zero affine point encodes the point at infinity.
    &lea	("edi",&DWP($in2_x,"esp"));
   for($i=0;$i<64;$i+=16) {
    &mov	("eax",&DWP($i+0,"esi"));	# copy in2
    &mov	("ebx",&DWP($i+4,"esi"));
    &mov	("ecx",&DWP($i+8,"esi"));
    &mov	("edx",&DWP($i+12,"esi"));
    &mov	(&DWP($i+0,"edi"),"eax");
    &mov	("ebp","eax")			if ($i==0);
    &or		("ebp","eax")			if ($i!=0);
    &mov	(&DWP($i+4,"edi"),"ebx");
    &or		("ebp","ebx");
    &mov	(&DWP($i+8,"edi"),"ecx");
    &or		("ebp","ecx");
    &mov	(&DWP($i+12,"edi"),"edx");
    &or		("ebp","edx");
   }
    # Compute the ~in2infty mask in %ebx, interleaved with setting up
    # arguments (%eax=capability word, %esi/%ebp/%edi operand pointers)
    # for the first multiplication below.
    &xor	("ebx","ebx");
    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &sub	("ebx","ebp");
    &lea	("esi",&DWP($in1_z,"esp"));
    &or		("ebx","ebp");
    &lea	("ebp",&DWP($in1_z,"esp"));
    &sar	("ebx",31);
    &lea	("edi",&DWP($Z1sqr,"esp"));
    &mov	(&DWP(32*15+4,"esp"),"ebx");	# ~in2infty

    # Mixed Jacobian+affine addition (in2 has implicit Z2 == 1):
    #   U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2-X1, R = S2-Y1,
    #   res_x = R^2 - H^3 - 2*X1*H^2,
    #   res_y = R*(X1*H^2 - res_x) - Y1*H^3, res_z = H*Z1.
    &call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z1sqr, in1_z);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($in2_x,"esp"));
    &mov	("ebp","edi");			# %edi still points at &Z1sqr
    &lea	("edi",&DWP($U2,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, Z1sqr, in2_x);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($in1_z,"esp"));
    &lea	("ebp",&DWP($Z1sqr,"esp"));
    &lea	("edi",&DWP($S2,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Z1sqr, in1_z);

    &lea	("esi",&DWP($U2,"esp"));
    &lea	("ebp",&DWP($in1_x,"esp"));
    &lea	("edi",&DWP($H,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(H, U2, in1_x);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($in2_y,"esp"));
    &lea	("ebp",&DWP($S2,"esp"));
    &lea	("edi",&DWP($S2,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S2, in2_y);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($in1_z,"esp"));
    &lea	("ebp",&DWP($H,"esp"));
    &lea	("edi",&DWP($res_z,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, H, in1_z);

    &lea	("esi",&DWP($S2,"esp"));
    &lea	("ebp",&DWP($in1_y,"esp"));
    &lea	("edi",&DWP($R,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(R, S2, in1_y);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($H,"esp"));
    &lea	("ebp",&DWP($H,"esp"));
    &lea	("edi",&DWP($Hsqr,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Hsqr, H);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($R,"esp"));
    &lea	("ebp",&DWP($R,"esp"));
    &lea	("edi",&DWP($Rsqr,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Rsqr, R);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($in1_x,"esp"));
    &lea	("ebp",&DWP($Hsqr,"esp"));
    &lea	("edi",&DWP($U2,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, in1_x, Hsqr);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($H,"esp"));
    &lea	("ebp",&DWP($Hsqr,"esp"));
    &lea	("edi",&DWP($Hcub,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(Hcub, Hsqr, H);

    &lea	("esi",&DWP($U2,"esp"));
    &lea	("ebp",&DWP($U2,"esp"));
    &lea	("edi",&DWP($Hsqr,"esp"));
    &call	("_ecp_nistz256_add");		# p256_mul_by_2(Hsqr, U2);

    &lea	("esi",&DWP($Rsqr,"esp"));
    &lea	("ebp",&DWP($Hsqr,"esp"));
    &lea	("edi",&DWP($res_x,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(res_x, Rsqr, Hsqr);

    &lea	("esi",&DWP($res_x,"esp"));
    &lea	("ebp",&DWP($Hcub,"esp"));
    &lea	("edi",&DWP($res_x,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, Hcub);

    &lea	("esi",&DWP($U2,"esp"));
    &lea	("ebp",&DWP($res_x,"esp"));
    &lea	("edi",&DWP($res_y,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(res_y, U2, res_x);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($Hcub,"esp"));
    &lea	("ebp",&DWP($in1_y,"esp"));
    &lea	("edi",&DWP($S2,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Hcub, in1_y);

    &mov	("eax",&DWP(32*15+8,"esp"));	# OPENSSL_ia32cap_P copy
    &lea	("esi",&DWP($R,"esp"));
    &lea	("ebp",&DWP($res_y,"esp"));
    &lea	("edi",&DWP($res_y,"esp"));
    &call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_y, res_y, R);

    &lea	("esi",&DWP($res_y,"esp"));
    &lea	("ebp",&DWP($S2,"esp"));
    &lea	("edi",&DWP($res_y,"esp"));
    &call	("_ecp_nistz256_sub");		# p256_sub(res_y, res_y, S2);

    # Derive the three mutually-exclusive selection masks:
    #   %edx = ~in1infty & ~in2infty  -> take the computed result,
    #   %ebp =  in1infty & ~in2infty  -> take in2 (with Z = ONE_mont),
    #   %esi =  in2infty              -> take in1.
    &mov	("ebp",&DWP(32*15+0,"esp"));	# ~in1infty
    &mov	("esi",&DWP(32*15+4,"esp"));	# ~in2infty
    &mov	("edi",&wparam(0));
    &mov	("edx","ebp");
    &not	("ebp");
    &and	("edx","esi");			# ~in1infty & ~in2infty
    &and	("ebp","esi");			# in1infty & ~in2infty
    &not	("esi");			# in2infty

    ########################################
    # conditional moves (branch-free selection, as in2 may be secret).
    # First the Z words (byte offsets 64..95): in2's Z coordinate does
    # not exist in memory, so the candidate is the ONE_mont constant —
    # a zero word contributes nothing, a -1 word is the raw mask %ebp,
    # anything else is masked with an immediate AND.
   for($i=64;$i<96;$i+=4) {
    my $one=@ONE_mont[($i-64)/4];

    &mov	("eax","edx");			# ~in1infty & ~in2infty
    &and	("eax",&DWP($res_x+$i,"esp"));
    &mov	("ebx","ebp")			if ($one && $one!=-1);
    &and	("ebx",$one)			if ($one && $one!=-1);
    &mov	("ecx","esi");			# in2infty
    &and	("ecx",&DWP($in1_x+$i,"esp"));
    &or		("eax",$one==-1?"ebp":"ebx")	if ($one);
    &or		("eax","ecx");
    &mov	(&DWP($i,"edi"),"eax");
   }
    # Then the X and Y words (byte offsets 0..63), selected from the
    # stack copies of the result, in2 and in1.
   for($i=0;$i<64;$i+=4) {
    &mov	("eax","edx");			# ~in1infty & ~in2infty
    &and	("eax",&DWP($res_x+$i,"esp"));
    &mov	("ebx","ebp");			# in1infty & ~in2infty
    &and	("ebx",&DWP($in2_x+$i,"esp"));
    &mov	("ecx","esi");			# in2infty
    &and	("ecx",&DWP($in1_x+$i,"esp"));
    &or		("eax","ebx");
    &or		("eax","ecx");
    &mov	(&DWP($i,"edi"),"eax");
   }
    &stack_pop(8*15+3);
} &function_end("ecp_nistz256_point_add_affine");
1859 | ||
# Emit the accumulated assembly and finish.  Check close() explicitly:
# STDOUT is typically redirected to the generated .s file, and buffered
# write errors (e.g. a full disk) only surface at close time — ignoring
# them would silently leave a truncated assembly file.
&asm_finish();

close STDOUT or die "error closing STDOUT: $!";