]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/bn/asm/x86_64-gcc.c
Conversion to UTF-8 where needed
[thirdparty/openssl.git] / crypto / bn / asm / x86_64-gcc.c
1 #include "../bn_lcl.h"
2 #if !(defined(__GNUC__) && __GNUC__>=2)
3 # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4 #else
5 /*-
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 * versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 * proof-of-concept hack. As you can see it's implemented with
19 * inline assembler, which means that you're bound to GCC and that
20 * there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
24 * why I decided to let the compiler take care of subroutine
25 * prologue/epilogue as well as register allocation. For reference.
26 * Win64 implements different ABI for AMD64, different from Linux.
27 *
28 * Q. How much faster does it get?
29 * A. 'apps/openssl speed rsa dsa' output with no-asm:
30 *
31 * sign verify sign/s verify/s
32 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
33 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
34 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
35 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
36 * sign verify sign/s verify/s
37 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
38 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
39 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
40 *
41 * 'apps/openssl speed rsa dsa' output with this module:
42 *
43 * sign verify sign/s verify/s
44 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
45 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
46 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
47 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
48 * sign verify sign/s verify/s
49 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
50 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
51 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
52 *
53 * For the reference. IA-32 assembler implementation performs
54 * very much like 64-bit code compiled with no-asm on the same
55 * machine.
56 */
57
58 # ifdef _WIN64
59 # define BN_ULONG unsigned long long
60 # else
61 # define BN_ULONG unsigned long
62 # endif
63
64 # undef mul
65 # undef mul_add
66 # undef sqr
67
68 /*-
 69  * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
70 * "g"(0) let the compiler to decide where does it
71 * want to keep the value of zero;
72 */
73 # define mul_add(r,a,word,carry) do { \
74 register BN_ULONG high,low; \
75 asm ("mulq %3" \
76 : "=a"(low),"=d"(high) \
77 : "a"(word),"m"(a) \
78 : "cc"); \
79 asm ("addq %2,%0; adcq %3,%1" \
80 : "+r"(carry),"+d"(high)\
81 : "a"(low),"g"(0) \
82 : "cc"); \
83 asm ("addq %2,%0; adcq %3,%1" \
84 : "+m"(r),"+d"(high) \
85 : "r"(carry),"g"(0) \
86 : "cc"); \
87 carry=high; \
88 } while (0)
89
90 # define mul(r,a,word,carry) do { \
91 register BN_ULONG high,low; \
92 asm ("mulq %3" \
93 : "=a"(low),"=d"(high) \
94 : "a"(word),"g"(a) \
95 : "cc"); \
96 asm ("addq %2,%0; adcq %3,%1" \
97 : "+r"(carry),"+d"(high)\
98 : "a"(low),"g"(0) \
99 : "cc"); \
100 (r)=carry, carry=high; \
101 } while (0)
102
103 # define sqr(r0,r1,a) \
104 asm ("mulq %2" \
105 : "=a"(r0),"=d"(r1) \
106 : "a"(a) \
107 : "cc");
108
109 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
110 BN_ULONG w)
111 {
112 BN_ULONG c1 = 0;
113
114 if (num <= 0)
115 return (c1);
116
117 while (num & ~3) {
118 mul_add(rp[0], ap[0], w, c1);
119 mul_add(rp[1], ap[1], w, c1);
120 mul_add(rp[2], ap[2], w, c1);
121 mul_add(rp[3], ap[3], w, c1);
122 ap += 4;
123 rp += 4;
124 num -= 4;
125 }
126 if (num) {
127 mul_add(rp[0], ap[0], w, c1);
128 if (--num == 0)
129 return c1;
130 mul_add(rp[1], ap[1], w, c1);
131 if (--num == 0)
132 return c1;
133 mul_add(rp[2], ap[2], w, c1);
134 return c1;
135 }
136
137 return (c1);
138 }
139
140 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
141 {
142 BN_ULONG c1 = 0;
143
144 if (num <= 0)
145 return (c1);
146
147 while (num & ~3) {
148 mul(rp[0], ap[0], w, c1);
149 mul(rp[1], ap[1], w, c1);
150 mul(rp[2], ap[2], w, c1);
151 mul(rp[3], ap[3], w, c1);
152 ap += 4;
153 rp += 4;
154 num -= 4;
155 }
156 if (num) {
157 mul(rp[0], ap[0], w, c1);
158 if (--num == 0)
159 return c1;
160 mul(rp[1], ap[1], w, c1);
161 if (--num == 0)
162 return c1;
163 mul(rp[2], ap[2], w, c1);
164 }
165 return (c1);
166 }
167
168 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
169 {
170 if (n <= 0)
171 return;
172
173 while (n & ~3) {
174 sqr(r[0], r[1], a[0]);
175 sqr(r[2], r[3], a[1]);
176 sqr(r[4], r[5], a[2]);
177 sqr(r[6], r[7], a[3]);
178 a += 4;
179 r += 8;
180 n -= 4;
181 }
182 if (n) {
183 sqr(r[0], r[1], a[0]);
184 if (--n == 0)
185 return;
186 sqr(r[2], r[3], a[1]);
187 if (--n == 0)
188 return;
189 sqr(r[4], r[5], a[2]);
190 }
191 }
192
/*
 * bn_div_words(h, l, d):
 *   returns the quotient of the 128-bit value (h:l) divided by d,
 *   via the divq instruction; the remainder (left in rdx) is
 *   discarded.  Per divq semantics the caller must ensure h < d,
 *   otherwise the quotient does not fit in 64 bits and the
 *   instruction raises a divide error.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm("divq %4":"=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "g"(d)
        : "cc");

    return ret;
}
203
/*
 * bn_add_words(rp, ap, bp, n):
 *   rp[i] = ap[i] + bp[i] with carry propagation for 0 <= i < n;
 *   returns the final carry (0 or 1).
 *
 * Written as one asm block so the carry flag survives between
 * iterations: "subq %2,%2" zeroes the index register AND clears CF,
 * adcq adds with carry, leaq advances the index without touching the
 * flags, and "loop" decrements rcx (bound to n via "+c") flag-free.
 * The closing "sbbq %0,%0" materialises CF as 0 or all-ones, which
 * "ret & 1" reduces to 0/1.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %2,%2 \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " adcq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " leaq 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
226
227 # ifndef SIMICS
/*
 * bn_sub_words(rp, ap, bp, n):
 *   rp[i] = ap[i] - bp[i] with borrow propagation for 0 <= i < n;
 *   returns the final borrow (0 or 1).
 *
 * Mirror image of bn_add_words(): same single asm block so the
 * carry/borrow flag survives the whole loop, with sbbq (subtract
 * with borrow) in place of adcq.  "subq %2,%2" clears both the index
 * and CF up front; "sbbq %0,%0" at the end turns the borrow flag
 * into 0 or all-ones, masked to 0/1 by "ret & 1".
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %2,%2 \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " sbbq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " leaq 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
                  "=&r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
250 # else
251 /* Simics 1.4<7 has buggy sbbq:-( */
252 # define BN_MASK2 0xffffffffffffffffL
253 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
254 {
255 BN_ULONG t1, t2;
256 int c = 0;
257
258 if (n <= 0)
259 return ((BN_ULONG)0);
260
261 for (;;) {
262 t1 = a[0];
263 t2 = b[0];
264 r[0] = (t1 - t2 - c) & BN_MASK2;
265 if (t1 != t2)
266 c = (t1 < t2);
267 if (--n <= 0)
268 break;
269
270 t1 = a[1];
271 t2 = b[1];
272 r[1] = (t1 - t2 - c) & BN_MASK2;
273 if (t1 != t2)
274 c = (t1 < t2);
275 if (--n <= 0)
276 break;
277
278 t1 = a[2];
279 t2 = b[2];
280 r[2] = (t1 - t2 - c) & BN_MASK2;
281 if (t1 != t2)
282 c = (t1 < t2);
283 if (--n <= 0)
284 break;
285
286 t1 = a[3];
287 t2 = b[3];
288 r[3] = (t1 - t2 - c) & BN_MASK2;
289 if (t1 != t2)
290 c = (t1 < t2);
291 if (--n <= 0)
292 break;
293
294 a += 4;
295 b += 4;
296 r += 4;
297 }
298 return (c);
299 }
300 # endif
301
302 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
303 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
304 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
305 /*
306 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
307 * c=(c2,c1,c0)
308 */
309
310 /*
311 * Keep in mind that carrying into high part of multiplication result
312 * can not overflow, because it cannot be all-ones.
313 */
314 # if 0
315 /* original macros are kept for reference purposes */
316 # define mul_add_c(a,b,c0,c1,c2) { \
317 BN_ULONG ta=(a),tb=(b); \
318 t1 = ta * tb; \
319 t2 = BN_UMULT_HIGH(ta,tb); \
320 c0 += t1; t2 += (c0<t1)?1:0; \
321 c1 += t2; c2 += (c1<t2)?1:0; \
322 }
323
324 # define mul_add_c2(a,b,c0,c1,c2) { \
325 BN_ULONG ta=(a),tb=(b),t0; \
326 t1 = BN_UMULT_HIGH(ta,tb); \
327 t0 = ta * tb; \
328 c0 += t0; t2 = t1+((c0<t0)?1:0);\
329 c1 += t2; c2 += (c1<t2)?1:0; \
330 c0 += t0; t1 += (c0<t0)?1:0; \
331 c1 += t1; c2 += (c1<t1)?1:0; \
332 }
333 # else
/*-
 * mul_add_c(a,b,c0,c1,c2):
 *   (c2,c1,c0) += a*b.  t1/t2 are scratch words declared by the
 *   enclosing comba routine and receive the low/high product words
 *   from mulq.  The first add/adc folds t1 into c0 and ripples into
 *   t2; the second adds t2 into c1 and propagates any carry into c2.
 *   Per the note above, the carry into the high product word cannot
 *   overflow because that word is never all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
348
/*-
 * sqr_add_c(a,i,c0,c1,c2):
 *   (c2,c1,c0) += a[i]^2.  Same accumulation scheme as mul_add_c()
 *   but the mulq squares a single array element (rax holds a[i] as
 *   both multiplicand and implicit multiplier).
 */
# define sqr_add_c(a,i,c0,c1,c2) do {   \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c0),"+d"(t2)     \
                : "a"(t1),"g"(0)        \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
363
/*-
 * mul_add_c2(a,b,c0,c1,c2):
 *   (c2,c1,c0) += 2*a*b.  Rather than doubling the product (which
 *   could itself carry out), the two-word product (t2:t1) is added
 *   into the accumulator twice, each time with a full three-word
 *   add/adc/adc chain so that a carry out of c1 is propagated into
 *   c2 on both passes.
 */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
378 # endif
379
/* sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j], i != j */
# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
382
/*
 * bn_mul_comba8(r, a, b): full 8x8-word product, r[0..15] = a * b.
 *
 * Comba (column-wise) multiplication: for each output word r[k] all
 * partial products a[i]*b[j] with i+j == k are accumulated into a
 * three-word accumulator held in (c1,c2,c3).  The roles of the three
 * variables rotate from column to column; the variable just stored
 * into r[k] is zeroed to become the new top word.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch words used by mul_add_c() */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 (widest: eight partial products) */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the leftover top word becomes r[15] */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
486
/*
 * bn_mul_comba4(r, a, b): full 4x4-word product, r[0..7] = a * b.
 * Same column-wise (comba) scheme as bn_mul_comba8(): each column k
 * sums all a[i]*b[j] with i+j == k into the rotating three-word
 * accumulator (c1,c2,c3).
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch words used by mul_add_c() */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 (widest) */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the leftover top word becomes r[7] */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
526
/*
 * bn_sqr_comba8(r, a): r[0..15] = a[0..7] squared.
 *
 * Comba squaring: column k of the result sums a[i]*a[j] with
 * i+j == k.  Off-diagonal terms occur twice and are handled by
 * sqr_add_c2() (adds 2*a[i]*a[j]); diagonal terms a[i]^2 appear in
 * even columns only, via sqr_add_c().  (c1,c2,c3) is the rotating
 * three-word accumulator, as in bn_mul_comba8().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;            /* scratch words used by the macros */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the leftover top word becomes r[15] */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
602
/*
 * bn_sqr_comba4(r, a): r[0..7] = a[0..3] squared.
 * Same comba-squaring scheme as bn_sqr_comba8(): sqr_add_c() for
 * diagonal a[i]^2 terms (even columns), sqr_add_c2() for doubled
 * off-diagonal terms, with (c1,c2,c3) as the rotating accumulator.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;            /* scratch words used by the macros */
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the leftover top word becomes r[7] */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
636 #endif