]> git.ipfire.org Git - thirdparty/gcc.git/blob - libphobos/src/std/internal/math/biguintx86.d
Add D front-end, libphobos library, and D2 testsuite.
[thirdparty/gcc.git] / libphobos / src / std / internal / math / biguintx86.d
1 /** Optimised asm arbitrary precision arithmetic ('bignum')
2 * routines for X86 processors.
3 *
4 * All functions operate on arrays of uints, stored LSB first.
5 * If there is a destination array, it will be the first parameter.
6 * Currently, all of these functions are subject to change, and are
7 * intended for internal use only.
8 * The symbol [#] indicates an array of machine words which is to be
9 * interpreted as a multi-byte number.
10 */
11
12 /* Copyright Don Clugston 2008 - 2010.
13 * Distributed under the Boost Software License, Version 1.0.
14 * (See accompanying file LICENSE_1_0.txt or copy at
15 * http://www.boost.org/LICENSE_1_0.txt)
16 */
17 /**
18 * In simple terms, there are 3 modern x86 microarchitectures:
19 * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
20 * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
21 * (c) the Pentium 4, produced by Marketing.
22 *
23 * This code has been optimised for the Intel P6 family.
24 * Generally the code remains near-optimal for Intel Core2/Corei7, after
25 * translating EAX-> RAX, etc, since all these CPUs use essentially the same
26 * pipeline, and are typically limited by memory access.
27 * The code uses techniques described in Agner Fog's superb Pentium manuals
28 * available at www.agner.org.
29 * Not optimised for AMD, which can do two memory loads per cycle (Intel
30 * CPUs can only do one). Despite this, performance is superior on AMD.
31 * Performance is dreadful on P4.
32 *
33 * Timing results (cycles per int)
34 * --Intel Pentium-- --AMD--
35 * PM P4 Core2 K7
36 * +,- 2.25 15.6 2.25 1.5
37 * <<,>> 2.0 6.6 2.0 5.0
38 * (<< MMX) 1.7 5.3 1.5 1.2
39 * * 5.0 15.0 4.0 4.3
40 * mulAdd 5.7 19.0 4.9 4.0
41 * div 30.0 32.0 32.0 22.4
42 * mulAcc(32) 6.5 20.0 5.4 4.9
43 *
44 * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
45 * function call overhead.
46 * The timing for Div is quite unpredictable, but it's probably too slow
47 * to be useful. On 64-bit processors, these times should
48 * halve if run in 64-bit mode, except for the MMX functions.
49 */
50
51 module std.internal.math.biguintx86;
52
53 @system:
54 pure:
55 nothrow:
56
57 /*
58 Naked asm is used throughout, because:
59 (a) it frees up the EBP register
60 (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
61 */
62
63 version (D_InlineAsm_X86)
64 {
65
66 private:
67
68 /* Duplicate string s, with n times, substituting index for '@'.
69 *
70 * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
71 * function for some of the asm routines.
72 */
73 string indexedLoopUnroll(int n, string s) pure @safe
74 {
75 string u;
76 for (int i = 0; i<n; ++i)
77 {
78 string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
79
80 int last = 0;
81 for (int j = 0; j<s.length; ++j)
82 {
83 if (s[j]=='@')
84 {
85 u ~= s[last .. j] ~ nstr;
86 last = j+1;
87 }
88 }
89 if (last<s.length) u = u ~ s[last..$];
90
91 }
92 return u;
93 }
94 @safe unittest
95 {
96 assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
97 }
98
99 public:
100
101 alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
102
103 // Limits for when to switch between multiplication algorithms.
104 enum : int { KARATSUBALIMIT = 18 }; // Minimum value for which Karatsuba is worthwhile.
105 enum : int { KARATSUBASQUARELIMIT=26 }; // Minimum value for which square Karatsuba is worthwhile
106
107 /** Multi-byte addition or subtraction
108 * dest[#] = src1[#] + src2[#] + carry (0 or 1).
109 * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
110 * Returns carry or borrow (0 or 1).
111 * Set op == '+' for addition, '-' for subtraction.
112 */
113 uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
114 src2, uint carry) pure
115 {
116 // Timing:
117 // Pentium M: 2.25/int
118 // P6 family, Core2 have a partial flags stall when reading the carry flag in
119 // an ADC, SBB operation after an operation such as INC or DEC which
120 // modifies some, but not all, flags. We avoid this by storing carry into
121 // a resister (AL), and restoring it after the branch.
122
123 enum { LASTPARAM = 4*4 } // 3* pushes + return address.
124 asm pure nothrow {
125 naked;
126 push EDI;
127 push EBX;
128 push ESI;
129 mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
130 mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
131 mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
132 mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
133 // Carry is in EAX
134 // Count UP to zero (from -len) to minimize loop overhead.
135 lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
136 lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
137 lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
138
139 neg ECX;
140 add ECX, 8;
141 jb L2; // if length < 8 , bypass the unrolled loop.
142 L_unrolled:
143 shr AL, 1; // get carry from EAX
144 }
145 mixin(" asm pure nothrow {"
146 ~ indexedLoopUnroll( 8,
147 "mov EAX, [@*4-8*4+EDX+ECX*4];"
148 ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
149 ~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
150 ~ "}");
151 asm pure nothrow {
152 setc AL; // save carry
153 add ECX, 8;
154 ja L_unrolled;
155 L2: // Do the residual 1 .. 7 ints.
156
157 sub ECX, 8;
158 jz done;
159 L_residual:
160 shr AL, 1; // get carry from EAX
161 }
162 mixin(" asm pure nothrow {"
163 ~ indexedLoopUnroll( 1,
164 "mov EAX, [@*4+EDX+ECX*4];"
165 ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
166 ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
167 asm pure nothrow {
168 setc AL; // save carry
169 add ECX, 1;
170 jnz L_residual;
171 done:
172 and EAX, 1; // make it O or 1.
173 pop ESI;
174 pop EBX;
175 pop EDI;
176 ret 6*4;
177 }
178 }
179
180 @system unittest
181 {
182 uint [] a = new uint[40];
183 uint [] b = new uint[40];
184 uint [] c = new uint[40];
185 for (int i=0; i<a.length; ++i)
186 {
187 if (i&1) a[i]=0x8000_0000 + i;
188 else a[i]=i;
189 b[i]= 0x8000_0003;
190 }
191 c[19]=0x3333_3333;
192 uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
193 assert(carry == 1);
194 assert(c[0]==0x8000_0003);
195 assert(c[1]==4);
196 assert(c[19]==0x3333_3333); // check for overrun
197 for (int i=0; i<a.length; ++i)
198 {
199 a[i]=b[i]=c[i]=0;
200 }
201 a[8]=0x048D159E;
202 b[8]=0x048D159E;
203 a[10]=0x1D950C84;
204 b[10]=0x1D950C84;
205 a[5] =0x44444444;
206 carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
207 assert(a[11]==0);
208 for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
209
210 for (int q=3; q<36;++q)
211 {
212 for (int i=0; i<a.length; ++i)
213 {
214 a[i]=b[i]=c[i]=0;
215 }
216 a[q-2]=0x040000;
217 b[q-2]=0x040000;
218 carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
219 assert(a[q-2]==0);
220 }
221 }
222
223 /** dest[#] += carry, or dest[#] -= carry.
224 * op must be '+' or '-'
225 * Returns final carry or borrow (0 or 1)
226 */
227 uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure
228 {
229 enum { LASTPARAM = 1*4 } // 0* pushes + return address.
230 asm pure nothrow {
231 naked;
232 mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
233 mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
234 // EAX = carry
235 L1: ;
236 }
237 static if (op=='+')
238 asm pure nothrow { add [EDX], EAX; }
239 else
240 asm pure nothrow { sub [EDX], EAX; }
241 asm pure nothrow {
242 mov EAX, 1;
243 jnc L2;
244 add EDX, 4;
245 dec ECX;
246 jnz L1;
247 mov EAX, 2;
248 L2: dec EAX;
249 ret 2*4;
250 }
251 }
252
253 /** dest[#] = src[#] << numbits
254 * numbits must be in the range 1 .. 31
255 * Returns the overflow
256 */
257 uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure
258 {
259 // Timing: Optimal for P6 family.
260 // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
261 // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
262 enum { LASTPARAM = 4*4 } // 3* pushes + return address.
263 asm pure nothrow {
264 naked;
265 push ESI;
266 push EDI;
267 push EBX;
268 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
269 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
270 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
271 mov ECX, EAX; // numbits;
272
273 mov EAX, [-4+ESI + 4*EBX];
274 mov EDX, 0;
275 shld EDX, EAX, CL;
276 push EDX; // Save return value
277 cmp EBX, 1;
278 jz L_last;
279 mov EDX, [-4+ESI + 4*EBX];
280 test EBX, 1;
281 jz L_odd;
282 sub EBX, 1;
283 L_even:
284 mov EDX, [-4+ ESI + 4*EBX];
285 shld EAX, EDX, CL;
286 mov [EDI+4*EBX], EAX;
287 L_odd:
288 mov EAX, [-8+ESI + 4*EBX];
289 shld EDX, EAX, CL;
290 mov [-4+EDI + 4*EBX], EDX;
291 sub EBX, 2;
292 jg L_even;
293 L_last:
294 shl EAX, CL;
295 mov [EDI], EAX;
296 pop EAX; // pop return value
297 pop EBX;
298 pop EDI;
299 pop ESI;
300 ret 4*4;
301 }
302 }
303
304 /** dest[#] = src[#] >> numbits
305 * numbits must be in the range 1 .. 31
306 * This version uses MMX.
307 */
308 uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure
309 {
310 // Timing:
311 // K7 1.2/int. PM 1.7/int P4 5.3/int
312 enum { LASTPARAM = 4*4 } // 3* pushes + return address.
313 asm pure nothrow {
314 naked;
315 push ESI;
316 push EDI;
317 push EBX;
318 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
319 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
320 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
321
322 movd MM3, EAX; // numbits = bits to shift left
323 xor EAX, 63;
324 align 16;
325 inc EAX;
326 movd MM4, EAX ; // 64-numbits = bits to shift right
327
328 // Get the return value into EAX
329 and EAX, 31; // EAX = 32-numbits
330 movd MM2, EAX; // 32-numbits
331 movd MM1, [ESI+4*EBX-4];
332 psrlq MM1, MM2;
333 movd EAX, MM1; // EAX = return value
334 test EBX, 1;
335 jz L_even;
336 L_odd:
337 cmp EBX, 1;
338 jz L_length1;
339
340 // deal with odd lengths
341 movq MM1, [ESI+4*EBX-8];
342 psrlq MM1, MM2;
343 movd [EDI +4*EBX-4], MM1;
344 sub EBX, 1;
345 L_even: // It's either singly or doubly even
346 movq MM2, [ESI + 4*EBX - 8];
347 psllq MM2, MM3;
348 sub EBX, 2;
349 jle L_last;
350 movq MM1, MM2;
351 add EBX, 2;
352 test EBX, 2;
353 jz L_onceeven;
354 sub EBX, 2;
355
356 // MAIN LOOP -- 128 bytes per iteration
357 L_twiceeven: // here MM2 is the carry
358 movq MM0, [ESI + 4*EBX-8];
359 psrlq MM0, MM4;
360 movq MM1, [ESI + 4*EBX-8];
361 psllq MM1, MM3;
362 por MM2, MM0;
363 movq [EDI +4*EBX], MM2;
364 L_onceeven: // here MM1 is the carry
365 movq MM0, [ESI + 4*EBX-16];
366 psrlq MM0, MM4;
367 movq MM2, [ESI + 4*EBX-16];
368 por MM1, MM0;
369 movq [EDI +4*EBX-8], MM1;
370 psllq MM2, MM3;
371 sub EBX, 4;
372 jg L_twiceeven;
373 L_last:
374 movq [EDI +4*EBX], MM2;
375 L_alldone:
376 emms; // NOTE: costs 6 cycles on Intel CPUs
377 pop EBX;
378 pop EDI;
379 pop ESI;
380 ret 4*4;
381
382 L_length1:
383 // length 1 is a special case
384 movd MM1, [ESI];
385 psllq MM1, MM3;
386 movd [EDI], MM1;
387 jmp L_alldone;
388 }
389 }
390
391 void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure
392 {
393 enum { LASTPARAM = 4*4 } // 3* pushes + return address.
394 asm pure nothrow {
395 naked;
396 push ESI;
397 push EDI;
398 push EBX;
399 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
400 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
401 align 16;
402 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
403 lea EDI, [EDI + 4*EBX]; // EDI = end of dest
404 lea ESI, [ESI + 4*EBX]; // ESI = end of src
405 neg EBX; // count UP to zero.
406
407 movd MM3, EAX; // numbits = bits to shift right
408 xor EAX, 63;
409 inc EAX;
410 movd MM4, EAX ; // 64-numbits = bits to shift left
411
412 test EBX, 1;
413 jz L_even;
414 L_odd:
415 // deal with odd lengths
416 and EAX, 31; // EAX = 32-numbits
417 movd MM2, EAX; // 32-numbits
418 cmp EBX, -1;
419 jz L_length1;
420
421 movq MM0, [ESI+4*EBX];
422 psrlq MM0, MM3;
423 movd [EDI +4*EBX], MM0;
424 add EBX, 1;
425 L_even:
426 movq MM2, [ESI + 4*EBX];
427 psrlq MM2, MM3;
428
429 movq MM1, MM2;
430 add EBX, 4;
431 cmp EBX, -2+4;
432 jz L_last;
433 // It's either singly or doubly even
434 sub EBX, 2;
435 test EBX, 2;
436 jnz L_onceeven;
437 add EBX, 2;
438
439 // MAIN LOOP -- 128 bytes per iteration
440 L_twiceeven: // here MM2 is the carry
441 movq MM0, [ESI + 4*EBX-8];
442 psllq MM0, MM4;
443 movq MM1, [ESI + 4*EBX-8];
444 psrlq MM1, MM3;
445 por MM2, MM0;
446 movq [EDI +4*EBX-16], MM2;
447 L_onceeven: // here MM1 is the carry
448 movq MM0, [ESI + 4*EBX];
449 psllq MM0, MM4;
450 movq MM2, [ESI + 4*EBX];
451 por MM1, MM0;
452 movq [EDI +4*EBX-8], MM1;
453 psrlq MM2, MM3;
454 add EBX, 4;
455 jl L_twiceeven;
456 L_last:
457 movq [EDI +4*EBX-16], MM2;
458 L_alldone:
459 emms; // NOTE: costs 6 cycles on Intel CPUs
460 pop EBX;
461 pop EDI;
462 pop ESI;
463 ret 4*4;
464
465 L_length1:
466 // length 1 is a special case
467 movd MM1, [ESI+4*EBX];
468 psrlq MM1, MM3;
469 movd [EDI +4*EBX], MM1;
470 jmp L_alldone;
471
472 }
473 }
474
475 /** dest[#] = src[#] >> numbits
476 * numbits must be in the range 1 .. 31
477 */
478 void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure
479 {
480 // Timing: Optimal for P6 family.
481 // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
482 // Terrible performance on AMD64, which has 7 cycles for SHRD!!
483 enum { LASTPARAM = 4*4 } // 3* pushes + return address.
484 asm pure nothrow {
485 naked;
486 push ESI;
487 push EDI;
488 push EBX;
489 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
490 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
491 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
492 mov ECX, EAX; // numbits;
493
494 lea EDI, [EDI + 4*EBX]; // EDI = end of dest
495 lea ESI, [ESI + 4*EBX]; // ESI = end of src
496 neg EBX; // count UP to zero.
497 mov EAX, [ESI + 4*EBX];
498 cmp EBX, -1;
499 jz L_last;
500 mov EDX, [ESI + 4*EBX];
501 test EBX, 1;
502 jz L_odd;
503 add EBX, 1;
504 L_even:
505 mov EDX, [ ESI + 4*EBX];
506 shrd EAX, EDX, CL;
507 mov [-4 + EDI+4*EBX], EAX;
508 L_odd:
509 mov EAX, [4 + ESI + 4*EBX];
510 shrd EDX, EAX, CL;
511 mov [EDI + 4*EBX], EDX;
512 add EBX, 2;
513 jl L_even;
514 L_last:
515 shr EAX, CL;
516 mov [-4 + EDI], EAX;
517
518 pop EBX;
519 pop EDI;
520 pop ESI;
521 ret 4*4;
522 }
523 }
524
525 @system unittest
526 {
527
528 uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
529 multibyteShr(aa[0..$-1], aa, 4);
530 assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
531 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
532
533 aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
534 multibyteShr(aa[2..$-1], aa[2..$-1], 4);
535 assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
536 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
537
538 aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
539 multibyteShr(aa[0..$-2], aa, 4);
540 assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
541 assert(aa[0]==0x6122_2222);
542 assert(aa[3]==0xBCCC_CCCD);
543
544
545 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
546 uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
547 assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
548 && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
549 assert(r == 8);
550
551 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
552 r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
553 assert(aa[0] == 0xF0FF_FFFF
554 && aa[2]==0x5555_5561);
555 assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
556 assert(r == 8);
557 assert(aa[1]==0x2222_2230);
558
559 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
560 r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
561 }
562
563 /** dest[#] = src[#] * multiplier + carry.
564 * Returns carry.
565 */
566 uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
567 pure
568 {
569 // Timing: definitely not optimal.
570 // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
571 // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
572
573 enum { LASTPARAM = 4*4 } // 4* pushes + return address.
574 // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
575 // when initializing variables to zero.
576 version (D_PIC)
577 {
578 enum { zero = 0 }
579 }
580 else
581 {
582 __gshared int zero = 0;
583 }
584 asm pure nothrow {
585 naked;
586 push ESI;
587 push EDI;
588 push EBX;
589
590 mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
591 mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
592 mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
593 align 16;
594 lea EDI, [EDI + 4*EBX]; // EDI = end of dest
595 lea ESI, [ESI + 4*EBX]; // ESI = end of src
596 mov ECX, EAX; // [carry]; -- last param is in EAX.
597 neg EBX; // count UP to zero.
598 test EBX, 1;
599 jnz L_odd;
600 add EBX, 1;
601 L1:
602 mov EAX, [-4 + ESI + 4*EBX];
603 mul int ptr [ESP+LASTPARAM]; //[multiplier];
604 add EAX, ECX;
605 mov ECX, zero;
606 mov [-4+EDI + 4*EBX], EAX;
607 adc ECX, EDX;
608 L_odd:
609 mov EAX, [ESI + 4*EBX]; // p2
610 mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
611 add EAX, ECX;
612 mov ECX, zero;
613 adc ECX, EDX;
614 mov [EDI + 4*EBX], EAX;
615 add EBX, 2;
616 jl L1;
617
618 mov EAX, ECX; // get final carry
619
620 pop EBX;
621 pop EDI;
622 pop ESI;
623 ret 5*4;
624 }
625 }
626
627 @system unittest
628 {
629 uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
630 multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
631 assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
632 aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
633 }
634
635 // The inner multiply-and-add loop, together with the Even entry point.
636 // Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
637 // This is the most time-critical code in the BigInt library.
638 // It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
639 string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure {
640 // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
641 // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
642 // (unit p2) for initializing registers to zero.
643 // There are also dependencies between the instructions, and we run up against the
644 // ROB-read limit (can only read 2 registers per cycle).
645 // We also need the number of uops in the loop to be a multiple of 3.
646 // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
647 // if Position-Independent Code is required.
648
649 // Register usage
650 // ESI = end of src
651 // EDI = end of dest
652 // EBX = index. Counts up to zero (in steps of 2).
653 // EDX:EAX = scratch, used in multiply.
654 // ECX = carry1.
655 // EBP = carry2.
656 // ESP = points to the multiplier.
657
658 // The first member of 'dest' which will be modified is [EDI+4*EBX].
659 // EAX must already contain the first member of 'src', [ESI+4*EBX].
660
661 version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
662 return "
663 // Entry point for even length
664 add EBX, 1;
665 mov EBP, ECX; // carry
666
667 mul int ptr [" ~ M_ADDRESS ~ "]; // M
668 mov ECX, 0;
669
670 add EBP, EAX;
671 mov EAX, [ESI+4*EBX];
672 adc ECX, EDX;
673
674 mul int ptr [" ~ M_ADDRESS ~ "]; // M
675 " ~ OP ~ " [-4+EDI+4*EBX], EBP;
676 mov EBP, zero;
677
678 adc ECX, EAX;
679 mov EAX, [4+ESI+4*EBX];
680
681 adc EBP, EDX;
682 add EBX, 2;
683 jnl L_done;
684 L1:
685 mul int ptr [" ~ M_ADDRESS ~ "];
686 " ~ OP ~ " [-8+EDI+4*EBX], ECX;
687 adc EBP, EAX;
688 mov ECX, zero;
689 mov EAX, [ESI+4*EBX];
690 adc ECX, EDX;
691 " ~
692 (using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
693 ~ "
694 mul int ptr [" ~ M_ADDRESS ~ "];
695 " ~ OP ~ " [-4+EDI+4*EBX], EBP;
696 mov EBP, zero;
697
698 adc ECX, EAX;
699 mov EAX, [4+ESI+4*EBX];
700
701 adc EBP, EDX;
702 add EBX, 2;
703 jl L1;
704 L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
705 adc EBP, 0;
706 ";
707 // final carry is now in EBP
708 }
709
710 string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure
711 {
712 return "
713 mul int ptr [" ~M_ADDRESS ~"];
714 mov EBP, zero;
715 add ECX, EAX;
716 mov EAX, [4+ESI+4*EBX];
717
718 adc EBP, EDX;
719 add EBX, 2;
720 jl L1;
721 jmp L_done;
722 ";
723 }
724
725
726
727 /**
728 * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
729 * where op == '+' or '-'
730 * Returns carry out of MSB (0 .. FFFF_FFFF).
731 */
732 uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
733 multiplier, uint carry) pure {
734 // Timing: This is the most time-critical bignum function.
735 // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
736
737 // The main loop is pipelined and unrolled by 2,
738 // so entry to the loop is also complicated.
739
740 // Register usage
741 // EDX:EAX = multiply
742 // EBX = counter
743 // ECX = carry1
744 // EBP = carry2
745 // EDI = dest
746 // ESI = src
747
748 enum string OP = (op=='+')? "add" : "sub";
749 version (D_PIC)
750 {
751 enum { zero = 0 }
752 }
753 else
754 {
755 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
756 // when initializing registers to zero.
757 __gshared int zero = 0;
758 // use p3/p4 units
759 __gshared int storagenop; // write-only
760 }
761
762 enum { LASTPARAM = 5*4 } // 4* pushes + return address.
763 asm pure nothrow {
764 naked;
765
766 push ESI;
767 push EDI;
768 push EBX;
769 push EBP;
770 mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
771 mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
772 align 16;
773 nop;
774 mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
775 lea EDI, [EDI + 4*EBX]; // EDI = end of dest
776 lea ESI, [ESI + 4*EBX]; // ESI = end of src
777 mov EBP, 0;
778 mov ECX, EAX; // ECX = input carry.
779 neg EBX; // count UP to zero.
780 mov EAX, [ESI+4*EBX];
781 test EBX, 1;
782 jnz L_enter_odd;
783 }
784 // Main loop, with entry point for even length
785 mixin("asm pure nothrow {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
786 asm pure nothrow {
787 mov EAX, EBP; // get final carry
788 pop EBP;
789 pop EBX;
790 pop EDI;
791 pop ESI;
792 ret 5*4;
793 }
794 L_enter_odd:
795 mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
796 }
797
798 @system unittest
799 {
800
801 uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
802 uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
803 multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
804 assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
805 assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
806 && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
807 }
808
809 /**
810 Sets result[#] = result[0 .. left.length] + left[#] * right[#]
811
812 It is defined in this way to allow cache-efficient multiplication.
813 This function is equivalent to:
814 ----
815 for (int i = 0; i< right.length; ++i)
816 {
817 dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
818 left, right[i], 0);
819 }
820 ----
821 */
822 void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
823 const uint [] right) pure {
824 // Register usage
825 // EDX:EAX = used in multiply
826 // EBX = index
827 // ECX = carry1
828 // EBP = carry2
829 // EDI = end of dest for this pass through the loop. Index for outer loop.
830 // ESI = end of left. never changes
831 // [ESP] = M = right[i] = multiplier for this pass through the loop.
832 // right.length is changed into dest.ptr+dest.length
833 version (D_PIC)
834 {
835 enum { zero = 0 }
836 }
837 else
838 {
839 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
840 // when initializing registers to zero.
841 __gshared int zero = 0;
842 // use p3/p4 units
843 __gshared int storagenop; // write-only
844 }
845
846 enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
847 asm pure nothrow {
848 naked;
849
850 push ESI;
851 push EDI;
852 align 16;
853 push EBX;
854 push EBP;
855 push EAX; // local variable M
856 mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
857 mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
858 mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr
859 lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
860
861 mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
862 lea EAX, [EDI + 4*EAX];
863 mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
864
865 lea ESI, [ESI + 4*EBX]; // ESI = end of left
866 mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
867 mov EAX, [EAX];
868 mov [ESP], EAX; // M
869 outer_loop:
870 mov EBP, 0;
871 mov ECX, 0; // ECX = input carry.
872 neg EBX; // count UP to zero.
873 mov EAX, [ESI+4*EBX];
874 test EBX, 1;
875 jnz L_enter_odd;
876 }
877 // -- Inner loop, with even entry point
878 mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
879 asm pure nothrow {
880 mov [-4+EDI+4*EBX], EBP;
881 add EDI, 4;
882 cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
883 jz outer_done;
884 mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
885 mov EAX, [EAX+4]; // get new M
886 mov [ESP], EAX; // save new M
887 add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
888 mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
889 jmp outer_loop;
890 outer_done:
891 pop EAX;
892 pop EBP;
893 pop EBX;
894 pop EDI;
895 pop ESI;
896 ret 6*4;
897 }
898 L_enter_odd:
899 mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
900 }
901
902 /** dest[#] /= divisor.
903 * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
904 * divisor must not be a power of 2 (use right shift for that case;
905 * A division by zero will occur if divisor is a power of 2).
906 * Returns the final remainder
907 *
908 * Based on public domain code by Eric Bainville.
909 * (http://www.bealto.com/) Used with permission.
910 */
911 uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure
912 {
913 // Timing: limited by a horrible dependency chain.
914 // Pentium M: 18 cycles/op, 8 resource stalls/op.
915 // EAX, EDX = scratch, used by MUL
916 // EDI = dest
917 // CL = shift
918 // ESI = quotient
919 // EBX = remainderhi
920 // EBP = remainderlo
921 // [ESP-4] = mask
922 // [ESP] = kinv (2^64 /divisor)
923 enum { LASTPARAM = 5*4 } // 4* pushes + return address.
924 enum { LOCALS = 2*4} // MASK, KINV
925 asm pure nothrow {
926 naked;
927
928 push ESI;
929 push EDI;
930 push EBX;
931 push EBP;
932
933 mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
934 mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
935
936 // Loop from msb to lsb
937 lea EDI, [EDI + 4*EBX];
938 mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
939 // Build the pseudo-inverse of divisor k: 2^64/k
940 // First determine the shift in ecx to get the max number of bits in kinv
941 xor ECX, ECX;
942 mov EAX, [ESP + LASTPARAM]; //divisor;
943 mov EDX, 1;
944 kinv1:
945 inc ECX;
946 ror EDX, 1;
947 shl EAX, 1;
948 jnc kinv1;
949 dec ECX;
950 // Here, ecx is a left shift moving the msb of k to bit 32
951
952 mov EAX, 1;
953 shl EAX, CL;
954 dec EAX;
955 ror EAX, CL ; //ecx bits at msb
956 push EAX; // MASK
957
958 // Then divide 2^(32+cx) by divisor (edx already ok)
959 xor EAX, EAX;
960 div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor;
961 push EAX; // kinv
962 align 16;
963 L2:
964 // Get 32 bits of quotient approx, multiplying
965 // most significant word of (rem*2^32+input)
966 mov EAX, [ESP+4]; //MASK;
967 and EAX, [EDI - 4];
968 or EAX, EBP;
969 rol EAX, CL;
970 mov EBX, EBP;
971 mov EBP, [EDI - 4];
972 mul int ptr [ESP]; //KINV;
973
974 shl EAX, 1;
975 rcl EDX, 1;
976
977 // Multiply by k and subtract to get remainder
978 // Subtraction must be done on two words
979 mov EAX, EDX;
980 mov ESI, EDX; // quot = high word
981 mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
982 sub EBP, EAX;
983 sbb EBX, EDX;
984 jz Lb; // high word is 0, goto adjust on single word
985
986 // Adjust quotient and remainder on two words
987 Ld: inc ESI;
988 sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
989 sbb EBX, 0;
990 jnz Ld;
991
992 // Adjust quotient and remainder on single word
993 Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
994 jc Lc; // rem in 0 .. divisor-1, OK
995 sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
996 inc ESI;
997 jmp Lb;
998
999 // Store result
1000 Lc:
1001 mov [EDI - 4], ESI;
1002 lea EDI, [EDI - 4];
1003 dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
1004 jnz L2;
1005
1006 pop EAX; // discard kinv
1007 pop EAX; // discard mask
1008
1009 mov EAX, EBP; // return final remainder
1010 pop EBP;
1011 pop EBX;
1012 pop EDI;
1013 pop ESI;
1014 ret 3*4;
1015 }
1016 }
1017
1018 @system unittest
1019 {
1020 uint [] aa = new uint[101];
1021 for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
1022 uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
1023 uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
1024 for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
1025 assert(r == 0x33FF_7461);
1026 }
1027
1028 // Set dest[2*i .. 2*i+1]+=src[i]*src[i]
1029 void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure
1030 {
1031 /* Unlike mulAdd, the carry is only 1 bit,
1032 since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
1033 Note also that on the last iteration, no carry can occur.
1034 As for multibyteAdd, we save & restore carry flag through the loop.
1035
1036 The timing is entirely dictated by the dependency chain. We could
1037 improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
1038 */
1039 enum { LASTPARAM = 4*5 } // 4* pushes + return address.
1040 asm pure nothrow {
1041 naked;
1042 push ESI;
1043 push EDI;
1044 push EBX;
1045 push ECX;
1046 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
1047 mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
1048 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
1049 lea EDI, [EDI + 8*EBX]; // EDI = end of dest
1050 lea ESI, [ESI + 4*EBX]; // ESI = end of src
1051 neg EBX; // count UP to zero.
1052 xor ECX, ECX; // initial carry = 0.
1053 L1:
1054 mov EAX, [ESI + 4*EBX];
1055 mul EAX, EAX;
1056 shr CL, 1; // get carry
1057 adc [EDI + 8*EBX], EAX;
1058 adc [EDI + 8*EBX + 4], EDX;
1059 setc CL; // save carry
1060 inc EBX;
1061 jnz L1;
1062
1063 pop ECX;
1064 pop EBX;
1065 pop EDI;
1066 pop ESI;
1067 ret 4*4;
1068 }
1069 }
1070
1071 @system unittest
1072 {
1073 uint [] aa = new uint[13];
1074 uint [] bb = new uint[6];
1075 for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
1076 for (int i=0; i<bb.length; ++i) bb[i] = i;
1077 aa[$-1]= 7;
1078 multibyteAddDiagonalSquares(aa[0..$-1], bb);
1079 assert(aa[$-1]==7);
1080 for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
1081 }
1082
1083 void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure
1084 {
1085 for (int i = 0; i < x.length-3; ++i)
1086 {
1087 dest[i+x.length] = multibyteMulAdd!('+')(
1088 dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
1089 }
1090 ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
1091 dest[$-5] = cast(uint) c;
1092 c >>= 32;
1093 c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
1094 dest[$-4] = cast(uint) c;
1095 c >>= 32;
1096 length2:
1097 c += cast(ulong)(x[$-2]) * x[$-1];
1098 dest[$-3] = cast(uint) c;
1099 c >>= 32;
1100 dest[$-2] = cast(uint) c;
1101 }
1102
1103 //dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1104 // assert(dest.length = src.length*2);
1105 // assert(src.length >= 3);
1106 void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure
1107 {
1108 // Register usage
1109 // EDX:EAX = used in multiply
1110 // EBX = index
1111 // ECX = carry1
1112 // EBP = carry2
1113 // EDI = end of dest for this pass through the loop. Index for outer loop.
1114 // ESI = end of src. never changes
1115 // [ESP] = M = src[i] = multiplier for this pass through the loop.
1116 // dest.length is changed into dest.ptr+dest.length
1117 version (D_PIC)
1118 {
1119 enum { zero = 0 }
1120 }
1121 else
1122 {
1123 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
1124 // when initializing registers to zero.
1125 __gshared int zero = 0;
1126 // use p3/p4 units
1127 __gshared int storagenop; // write-only
1128 }
1129
1130 enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
1131 asm pure nothrow {
1132 naked;
1133
1134 push ESI;
1135 push EDI;
1136 align 16;
1137 push EBX;
1138 push EBP;
1139 push EAX; // local variable M= src[i]
1140 mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
1141 mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1142 mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr
1143
1144 lea ESI, [ESI + 4*EBX]; // ESI = end of left
1145 add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
1146
1147 // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
1148 lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
1149
1150 lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
1151 mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3]
1152
1153 cmp EBX, 3;
1154 jz length_is_3;
1155
1156 // We start at src[1], not src[0].
1157 dec EBX;
1158 mov [ESP + LASTPARAM + 4*0], EBX;
1159
1160 outer_loop:
1161 mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1162 mov EBP, 0;
1163 mov ECX, 0; // ECX = input carry.
1164 dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
1165 neg EBX; // count UP to zero.
1166
1167 mov EAX, [ESI + 4*EBX - 4*1]; // get new M
1168 mov [ESP], EAX; // save new M
1169
1170 mov EAX, [ESI+4*EBX];
1171 test EBX, 1;
1172 jnz L_enter_odd;
1173 }
1174 // -- Inner loop, with even entry point
1175 mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
1176 asm pure nothrow {
1177 mov [-4+EDI+4*EBX], EBP;
1178 add EDI, 4;
1179 cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
1180 jnz outer_loop;
1181 length_is_3:
1182 mov EAX, [ESI - 4*3];
1183 mul EAX, [ESI - 4*2];
1184 mov ECX, 0;
1185 add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2]
1186 adc ECX, EDX;
1187
1188 mov EAX, [ESI - 4*3];
1189 mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
1190 add EAX, ECX;
1191 mov ECX, 0;
1192 adc EDX, 0;
1193 // now EDX: EAX = c + x[$-3] * x[$-1]
1194 add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
1195 adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row
1196 // do length == 2
1197 mov EAX, [ESI - 4*2];
1198 mul EAX, [ESI - 4*1];
1199 add ECX, EAX;
1200 adc EDX, 0;
1201 mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
1202 mov [EDI + 1*4], EDX;
1203
1204 pop EAX;
1205 pop EBP;
1206 pop EBX;
1207 pop EDI;
1208 pop ESI;
1209 ret 4*4;
1210 }
1211 L_enter_odd:
1212 mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
1213 }
1214
1215 @system unittest
1216 {
1217 uint [] aa = new uint[200];
1218 uint [] a = aa[0 .. 100];
1219 uint [] b = new uint [100];
1220 aa[] = 761;
1221 a[] = 0;
1222 b[] = 0;
1223 a[3] = 6;
1224 b[0]=1;
1225 b[1] = 17;
1226 b[50 .. 100]=78;
1227 multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
1228 uint [] c = new uint[100];
1229 c[] = 0;
1230 c[1] = 17;
1231 c[3] = 6;
1232 assert(a[]==c[]);
1233 assert(a[0]==0);
1234 aa[] = 0xFFFF_FFFF;
1235 a[] = 0;
1236 b[] = 0;
1237 b[0]= 0xbf6a1f01;
1238 b[1]= 0x6e38ed64;
1239 b[2]= 0xdaa797ed;
1240 b[3] = 0;
1241
1242 multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
1243 assert(a[1]==0x3a600964);
1244 assert(a[2]==0x339974f6);
1245 assert(a[3]==0x46736fce);
1246 assert(a[4]==0x5e24a2b4);
1247
1248 b[3] = 0xe93ff9f4;
1249 b[4] = 0x184f03;
1250 a[]=0;
1251 multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
1252 assert(a[3]==0x79fff5c2);
1253 assert(a[4]==0xcf384241);
1254 assert(a[5]== 0x4a17fc8);
1255 assert(a[6]==0x4d549025);
1256 }
1257
1258
1259 void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure
1260 {
1261 if (x.length < 4)
1262 {
1263 // Special cases, not worth doing triangular.
1264 result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
1265 multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
1266 return;
1267 }
1268 // Do half a square multiply.
1269 // dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1270 result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
1271 multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
1272 // Multiply by 2
1273 result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
1274 // And add the diagonal elements
1275 result[0] = 0;
1276 multibyteAddDiagonalSquares(result, x);
1277 }
1278
1279 version (BignumPerformanceTest)
1280 {
1281 import core.stdc.stdio;
1282 int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
1283
1284 __gshared uint [2200] X1;
1285 __gshared uint [2200] Y1;
1286 __gshared uint [4000] Z1;
1287
1288 void testPerformance() pure
1289 {
1290 // The performance results at the top of this file were obtained using
1291 // a Windows device driver to access the CPU performance counters.
1292 // The code below is less accurate but more widely usable.
1293 // The value for division is quite inconsistent.
1294 for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
1295 int t, t0;
1296 multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1297 t0 = clock();
1298 multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
1299 t = clock();
1300 multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1301 auto shltime = (clock() - t) - (t - t0);
1302 t0 = clock();
1303 multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
1304 t = clock();
1305 multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
1306 auto shrtime = (clock() - t) - (t - t0);
1307 t0 = clock();
1308 multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
1309 t = clock();
1310 multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
1311 auto addtime = (clock() - t) - (t-t0);
1312 t0 = clock();
1313 multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
1314 t = clock();
1315 multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
1316 auto multime = (clock() - t) - (t - t0);
1317 multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1318 t0 = clock();
1319 multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
1320 t = clock();
1321 multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1322 auto muladdtime = (clock() - t) - (t - t0);
1323 multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1324 t = clock();
1325 multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1326 auto accumtime = clock() - t;
1327 t0 = clock();
1328 multibyteDivAssign(Z1[0 .. 2000], 217, 0);
1329 t = clock();
1330 multibyteDivAssign(Z1[0 .. 1000], 37, 0);
1331 auto divtime = (t - t0) - (clock() - t);
1332 t= clock();
1333 multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
1334 auto squaretime = clock() - t;
1335
1336 printf("-- BigInt asm performance (cycles/int) --\n");
1337 printf("Add: %.2f\n", addtime/1000.0);
1338 printf("Shl: %.2f\n", shltime/1000.0);
1339 printf("Shr: %.2f\n", shrtime/1000.0);
1340 printf("Mul: %.2f\n", multime/1000.0);
1341 printf("MulAdd: %.2f\n", muladdtime/1000.0);
1342 printf("Div: %.2f\n", divtime/1000.0);
1343 printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
1344 printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
1345 }
1346
1347 static this()
1348 {
1349 testPerformance();
1350 }
1351 }
1352
1353 } // version (D_InlineAsm_X86)