libphobos/src/std/internal/math/biguintx86.d

   1 /** Optimised asm arbitrary precision arithmetic ('bignum')
   2  * routines for X86 processors.
   3  *
   4  * All functions operate on arrays of uints, stored LSB first.
   5  * If there is a destination array, it will be the first parameter.
   6  * Currently, all of these functions are subject to change, and are
   7  * intended for internal use only.
   8  * The symbol [#] indicates an array of machine words which is to be
   9  * interpreted as a multi-byte number.
  10  */
  11
  12 /*          Copyright Don Clugston 2008 - 2010.
  13  * Distributed under the Boost Software License, Version 1.0.
  14  *    (See accompanying file LICENSE_1_0.txt or copy at
  15  *          http://www.boost.org/LICENSE_1_0.txt)
  16  */
  17 /**
  18  * In simple terms, there are 3 modern x86 microarchitectures:
  19  * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
  20  * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
  21  * (c) the Pentium 4, produced by Marketing.
  22  *
  23  * This code has been optimised for the Intel P6 family.
  24  * Generally the code remains near-optimal for Intel Core2/Corei7, after
  25  * translating EAX-> RAX, etc, since all these CPUs use essentially the same
  26  * pipeline, and are typically limited by memory access.
  27  * The code uses techniques described in Agner Fog's superb Pentium manuals
  28  * available at www.agner.org.
  29  * Not optimised for AMD, which can do two memory loads per cycle (Intel
  30  * CPUs can only do one). Despite this, performance is superior on AMD.
  31  * Performance is dreadful on P4.
  32  *
  33  *  Timing results (cycles per int)
  34  *              --Intel Pentium--  --AMD--
  35  *              PM     P4   Core2   K7
  36  *  +,-         2.25  15.6   2.25   1.5
  37  *  <<,>>       2.0    6.6   2.0    5.0
  38  *    (<< MMX)  1.7    5.3   1.5    1.2
  39  *  *           5.0   15.0   4.0    4.3
  40  *  mulAdd      5.7   19.0   4.9    4.0
  41  *  div        30.0   32.0  32.0   22.4
  42  *  mulAcc(32)  6.5   20.0   5.4    4.9
  43  *
  44  * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
  45  * function call overhead.
  46  * The timing for Div is quite unpredictable, but it's probably too slow
  47  * to be useful. On 64-bit processors, these times should
  48  * halve if run in 64-bit mode, except for the MMX functions.
  49  */
  50
  51 module std.internal.math.biguintx86;
  52
  53 @system:
  54 pure:
  55 nothrow:
  56
  57 /*
  58   Naked asm is used throughout, because:
  59   (a) it frees up the EBP register
  60   (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
  61 */
  62
  63 version (D_InlineAsm_X86)
  64 {
  65
  66 private:
  67
  68 /* Duplicate string s, with n times, substituting index for '@'.
  69  *
  70  * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
  71  * function for some of the asm routines.
  72  */
  73 string indexedLoopUnroll(int n, string s) pure @safe
  74 {
  75     string u;
  76     for (int i = 0; i<n; ++i)
  77     {
  78         string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
  79
  80         int last = 0;
  81         for (int j = 0; j<s.length; ++j)
  82         {
  83             if (s[j]=='@')
  84             {
  85                 u ~= s[last .. j] ~ nstr;
  86                 last = j+1;
  87             }
  88         }
  89         if (last<s.length) u = u ~ s[last..$];
  90
  91     }
  92     return u;
  93 }
  94 @safe unittest
  95 {
  96     assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
  97 }
  98
  99 public:
 100
 101 alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
 102
 103 // Limits for when to switch between multiplication algorithms.
 104 enum : int { KARATSUBALIMIT = 18 }; // Minimum value for which Karatsuba is worthwhile.
 105 enum : int { KARATSUBASQUARELIMIT=26 }; // Minimum value for which square Karatsuba is worthwhile
 106
 107 /** Multi-byte addition or subtraction
 108  *    dest[#] = src1[#] + src2[#] + carry (0 or 1).
 109  * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
 110  * Returns carry or borrow (0 or 1).
 111  * Set op == '+' for addition, '-' for subtraction.
 112  */
 113 uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
 114         src2, uint carry) pure
 115 {
 116     // Timing:
 117     // Pentium M: 2.25/int
 118     // P6 family, Core2 have a partial flags stall when reading the carry flag in
 119     // an ADC, SBB operation after an operation such as INC or DEC which
 120     // modifies some, but not all, flags. We avoid this by storing carry into
 121     // a resister (AL), and restoring it after the branch.
 122
 123     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
 124     asm pure nothrow {
 125         naked;
 126         push EDI;
 127         push EBX;
 128         push ESI;
 129         mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
 130         mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
 131         mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
 132         mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
 133              // Carry is in EAX
 134         // Count UP to zero (from -len) to minimize loop overhead.
 135         lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
 136         lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
 137         lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
 138
 139         neg ECX;
 140         add ECX, 8;
 141         jb L2;  // if length < 8 , bypass the unrolled loop.
 142 L_unrolled:
 143         shr AL, 1; // get carry from EAX
 144     }
 145     mixin(" asm pure nothrow {"
 146         ~ indexedLoopUnroll( 8,
 147         "mov EAX, [@*4-8*4+EDX+ECX*4];"
 148         ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
 149         ~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
 150         ~ "}");
 151     asm pure nothrow {
 152         setc AL; // save carry
 153         add ECX, 8;
 154         ja L_unrolled;
 155 L2:     // Do the residual 1 .. 7 ints.
 156
 157         sub ECX, 8;
 158         jz done;
 159 L_residual:
 160         shr AL, 1; // get carry from EAX
 161     }
 162     mixin(" asm pure nothrow {"
 163         ~ indexedLoopUnroll( 1,
 164         "mov EAX, [@*4+EDX+ECX*4];"
 165         ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
 166         ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
 167     asm pure nothrow {
 168         setc AL; // save carry
 169         add ECX, 1;
 170         jnz L_residual;
 171 done:
 172         and EAX, 1; // make it O or 1.
 173         pop ESI;
 174         pop EBX;
 175         pop EDI;
 176         ret 6*4;
 177     }
 178 }
 179
 180 @system unittest
 181 {
 182     uint [] a = new uint[40];
 183     uint [] b = new uint[40];
 184     uint [] c = new uint[40];
 185     for (int i=0; i<a.length; ++i)
 186     {
 187         if (i&1) a[i]=0x8000_0000 + i;
 188         else a[i]=i;
 189         b[i]= 0x8000_0003;
 190     }
 191     c[19]=0x3333_3333;
 192     uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
 193     assert(carry == 1);
 194     assert(c[0]==0x8000_0003);
 195     assert(c[1]==4);
 196     assert(c[19]==0x3333_3333); // check for overrun
 197     for (int i=0; i<a.length; ++i)
 198     {
 199         a[i]=b[i]=c[i]=0;
 200     }
 201     a[8]=0x048D159E;
 202     b[8]=0x048D159E;
 203     a[10]=0x1D950C84;
 204     b[10]=0x1D950C84;
 205     a[5] =0x44444444;
 206     carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
 207     assert(a[11]==0);
 208     for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
 209
 210     for (int q=3; q<36;++q)
 211     {
 212         for (int i=0; i<a.length; ++i)
 213         {
 214             a[i]=b[i]=c[i]=0;
 215         }
 216         a[q-2]=0x040000;
 217         b[q-2]=0x040000;
 218        carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
 219        assert(a[q-2]==0);
 220     }
 221 }
 222
 223 /** dest[#] += carry, or dest[#] -= carry.
 224  *  op must be '+' or '-'
 225  *  Returns final carry or borrow (0 or 1)
 226  */
 227 uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure
 228 {
 229     enum { LASTPARAM = 1*4 } // 0* pushes + return address.
 230     asm pure nothrow {
 231         naked;
 232         mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
 233         mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
 234         // EAX  = carry
 235 L1: ;
 236     }
 237     static if (op=='+')
 238         asm pure nothrow { add [EDX], EAX; }
 239     else
 240         asm pure nothrow { sub [EDX], EAX; }
 241     asm pure nothrow {
 242         mov EAX, 1;
 243         jnc L2;
 244         add EDX, 4;
 245         dec ECX;
 246         jnz L1;
 247         mov EAX, 2;
 248 L2:     dec EAX;
 249         ret 2*4;
 250     }
 251 }
 252
 253 /** dest[#] = src[#] << numbits
 254  *  numbits must be in the range 1 .. 31
 255  *  Returns the overflow
 256  */
 257 uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure
 258 {
 259     // Timing: Optimal for P6 family.
 260     // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
 261     // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
 262     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
 263     asm pure nothrow {
 264         naked;
 265         push ESI;
 266         push EDI;
 267         push EBX;
 268         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
 269         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
 270         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
 271         mov ECX, EAX; // numbits;
 272
 273         mov EAX, [-4+ESI + 4*EBX];
 274         mov EDX, 0;
 275         shld EDX, EAX, CL;
 276         push EDX; // Save return value
 277         cmp EBX, 1;
 278         jz L_last;
 279         mov EDX, [-4+ESI + 4*EBX];
 280         test EBX, 1;
 281         jz L_odd;
 282         sub EBX, 1;
 283 L_even:
 284         mov EDX, [-4+ ESI + 4*EBX];
 285         shld EAX, EDX, CL;
 286         mov [EDI+4*EBX], EAX;
 287 L_odd:
 288         mov EAX, [-8+ESI + 4*EBX];
 289         shld EDX, EAX, CL;
 290         mov [-4+EDI + 4*EBX], EDX;
 291         sub EBX, 2;
 292         jg L_even;
 293 L_last:
 294         shl EAX, CL;
 295         mov [EDI], EAX;
 296         pop EAX; // pop return value
 297         pop EBX;
 298         pop EDI;
 299         pop ESI;
 300         ret 4*4;
 301     }
 302 }
 303
 304 /** dest[#] = src[#] >> numbits
 305  *  numbits must be in the range 1 .. 31
 306  * This version uses MMX.
 307  */
 308 uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure
 309 {
 310     // Timing:
 311     // K7 1.2/int. PM 1.7/int P4 5.3/int
 312     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
 313     asm pure nothrow {
 314         naked;
 315         push ESI;
 316         push EDI;
 317         push EBX;
 318         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
 319         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
 320         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
 321
 322         movd MM3, EAX; // numbits = bits to shift left
 323         xor EAX, 63;
 324         align   16;
 325         inc EAX;
 326         movd MM4, EAX ; // 64-numbits = bits to shift right
 327
 328         // Get the return value into EAX
 329         and EAX, 31; // EAX = 32-numbits
 330         movd MM2, EAX; // 32-numbits
 331         movd MM1, [ESI+4*EBX-4];
 332         psrlq MM1, MM2;
 333         movd EAX, MM1;  // EAX = return value
 334         test EBX, 1;
 335         jz L_even;
 336 L_odd:
 337         cmp EBX, 1;
 338         jz L_length1;
 339
 340          // deal with odd lengths
 341         movq MM1, [ESI+4*EBX-8];
 342         psrlq MM1, MM2;
 343         movd    [EDI +4*EBX-4], MM1;
 344         sub EBX, 1;
 345 L_even: // It's either singly or doubly even
 346         movq    MM2, [ESI + 4*EBX - 8];
 347         psllq   MM2, MM3;
 348         sub EBX, 2;
 349         jle L_last;
 350         movq MM1, MM2;
 351         add EBX, 2;
 352         test EBX, 2;
 353         jz L_onceeven;
 354         sub EBX, 2;
 355
 356         // MAIN LOOP -- 128 bytes per iteration
 357  L_twiceeven:      // here MM2 is the carry
 358         movq    MM0, [ESI + 4*EBX-8];
 359         psrlq   MM0, MM4;
 360         movq    MM1, [ESI + 4*EBX-8];
 361         psllq   MM1, MM3;
 362         por     MM2, MM0;
 363         movq    [EDI +4*EBX], MM2;
 364 L_onceeven:        // here MM1 is the carry
 365         movq    MM0, [ESI + 4*EBX-16];
 366         psrlq   MM0, MM4;
 367         movq    MM2, [ESI + 4*EBX-16];
 368         por     MM1, MM0;
 369         movq    [EDI +4*EBX-8], MM1;
 370         psllq   MM2, MM3;
 371         sub EBX, 4;
 372         jg L_twiceeven;
 373 L_last:
 374         movq    [EDI +4*EBX], MM2;
 375 L_alldone:
 376         emms;  // NOTE: costs 6 cycles on Intel CPUs
 377         pop EBX;
 378         pop EDI;
 379         pop ESI;
 380         ret 4*4;
 381
 382 L_length1:
 383         // length 1 is a special case
 384         movd MM1, [ESI];
 385         psllq MM1, MM3;
 386         movd [EDI], MM1;
 387         jmp L_alldone;
 388     }
 389 }
 390
 391 void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure
 392 {
 393     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
 394     asm pure nothrow {
 395         naked;
 396         push ESI;
 397         push EDI;
 398         push EBX;
 399         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
 400         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
 401 align 16;
 402         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
 403         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
 404         lea ESI, [ESI + 4*EBX]; // ESI = end of src
 405         neg EBX;                // count UP to zero.
 406
 407         movd MM3, EAX; // numbits = bits to shift right
 408         xor EAX, 63;
 409         inc EAX;
 410         movd MM4, EAX ; // 64-numbits = bits to shift left
 411
 412         test EBX, 1;
 413         jz L_even;
 414 L_odd:
 415          // deal with odd lengths
 416         and EAX, 31; // EAX = 32-numbits
 417         movd MM2, EAX; // 32-numbits
 418         cmp EBX, -1;
 419         jz L_length1;
 420
 421         movq MM0, [ESI+4*EBX];
 422         psrlq MM0, MM3;
 423         movd    [EDI +4*EBX], MM0;
 424         add EBX, 1;
 425 L_even:
 426         movq    MM2, [ESI + 4*EBX];
 427         psrlq   MM2, MM3;
 428
 429         movq MM1, MM2;
 430         add EBX, 4;
 431         cmp EBX, -2+4;
 432         jz L_last;
 433         // It's either singly or doubly even
 434         sub EBX, 2;
 435         test EBX, 2;
 436         jnz L_onceeven;
 437         add EBX, 2;
 438
 439         // MAIN LOOP -- 128 bytes per iteration
 440  L_twiceeven:      // here MM2 is the carry
 441         movq    MM0, [ESI + 4*EBX-8];
 442         psllq   MM0, MM4;
 443         movq    MM1, [ESI + 4*EBX-8];
 444         psrlq   MM1, MM3;
 445         por     MM2, MM0;
 446         movq    [EDI +4*EBX-16], MM2;
 447 L_onceeven:        // here MM1 is the carry
 448         movq    MM0, [ESI + 4*EBX];
 449         psllq   MM0, MM4;
 450         movq    MM2, [ESI + 4*EBX];
 451         por     MM1, MM0;
 452         movq    [EDI +4*EBX-8], MM1;
 453         psrlq   MM2, MM3;
 454         add EBX, 4;
 455         jl L_twiceeven;
 456 L_last:
 457         movq    [EDI +4*EBX-16], MM2;
 458 L_alldone:
 459         emms;  // NOTE: costs 6 cycles on Intel CPUs
 460         pop EBX;
 461         pop EDI;
 462         pop ESI;
 463         ret 4*4;
 464
 465 L_length1:
 466         // length 1 is a special case
 467         movd MM1, [ESI+4*EBX];
 468         psrlq MM1, MM3;
 469         movd    [EDI +4*EBX], MM1;
 470         jmp L_alldone;
 471
 472     }
 473 }
 474
 475 /** dest[#] = src[#] >> numbits
 476  *  numbits must be in the range 1 .. 31
 477  */
 478 void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure
 479 {
 480     // Timing: Optimal for P6 family.
 481     // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
 482     // Terrible performance on AMD64, which has 7 cycles for SHRD!!
 483     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
 484     asm pure nothrow {
 485         naked;
 486         push ESI;
 487         push EDI;
 488         push EBX;
 489         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
 490         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
 491         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
 492         mov ECX, EAX; // numbits;
 493
 494         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
 495         lea ESI, [ESI + 4*EBX]; // ESI = end of src
 496         neg EBX;                // count UP to zero.
 497         mov EAX, [ESI + 4*EBX];
 498         cmp EBX, -1;
 499         jz L_last;
 500         mov EDX, [ESI + 4*EBX];
 501         test EBX, 1;
 502         jz L_odd;
 503         add EBX, 1;
 504 L_even:
 505         mov EDX, [ ESI + 4*EBX];
 506         shrd EAX, EDX, CL;
 507         mov [-4 + EDI+4*EBX], EAX;
 508 L_odd:
 509         mov EAX, [4 + ESI + 4*EBX];
 510         shrd EDX, EAX, CL;
 511         mov [EDI + 4*EBX], EDX;
 512         add EBX, 2;
 513         jl L_even;
 514 L_last:
 515         shr EAX, CL;
 516         mov [-4 + EDI], EAX;
 517
 518         pop EBX;
 519         pop EDI;
 520         pop ESI;
 521         ret 4*4;
 522     }
 523 }
 524
 525 @system unittest
 526 {
 527
 528     uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 529     multibyteShr(aa[0..$-1], aa, 4);
 530     assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
 531         && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
 532
 533     aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 534     multibyteShr(aa[2..$-1], aa[2..$-1], 4);
 535     assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
 536         && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
 537
 538     aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 539     multibyteShr(aa[0..$-2], aa, 4);
 540     assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
 541     assert(aa[0]==0x6122_2222);
 542     assert(aa[3]==0xBCCC_CCCD);
 543
 544
 545     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 546     uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
 547     assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
 548         && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
 549     assert(r == 8);
 550
 551     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 552     r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
 553     assert(aa[0] == 0xF0FF_FFFF
 554         && aa[2]==0x5555_5561);
 555         assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
 556     assert(r == 8);
 557         assert(aa[1]==0x2222_2230);
 558
 559     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 560     r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
 561 }
 562
 563 /** dest[#] = src[#] * multiplier + carry.
 564  * Returns carry.
 565  */
 566 uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
 567     pure
 568 {
 569     // Timing: definitely not optimal.
 570     // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
 571     // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
 572
 573     enum { LASTPARAM = 4*4 } // 4* pushes + return address.
 574     // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
 575     // when initializing variables to zero.
 576     version (D_PIC)
 577     {
 578         enum { zero = 0 }
 579     }
 580     else
 581     {
 582         __gshared int zero = 0;
 583     }
 584     asm pure nothrow {
 585         naked;
 586         push ESI;
 587         push EDI;
 588         push EBX;
 589
 590         mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
 591         mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
 592         mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
 593         align 16;
 594         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
 595         lea ESI, [ESI + 4*EBX]; // ESI = end of src
 596         mov ECX, EAX; // [carry]; -- last param is in EAX.
 597         neg EBX;                // count UP to zero.
 598         test EBX, 1;
 599         jnz L_odd;
 600         add EBX, 1;
 601  L1:
 602         mov EAX, [-4 + ESI + 4*EBX];
 603         mul int ptr [ESP+LASTPARAM]; //[multiplier];
 604         add EAX, ECX;
 605         mov ECX, zero;
 606         mov [-4+EDI + 4*EBX], EAX;
 607         adc ECX, EDX;
 608 L_odd:
 609         mov EAX, [ESI + 4*EBX];  // p2
 610         mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
 611         add EAX, ECX;
 612         mov ECX, zero;
 613         adc ECX, EDX;
 614         mov [EDI + 4*EBX], EAX;
 615         add EBX, 2;
 616         jl L1;
 617
 618         mov EAX, ECX; // get final carry
 619
 620         pop EBX;
 621         pop EDI;
 622         pop ESI;
 623         ret 5*4;
 624     }
 625 }
 626
 627 @system unittest
 628 {
 629     uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 630     multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
 631     assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
 632         aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
 633 }
 634
 635 // The inner multiply-and-add loop, together with the Even entry point.
 636 // Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
 637 // This is the most time-critical code in the BigInt library.
 638 // It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
 639 string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure {
 640     // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
 641     // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
 642     // (unit p2) for initializing registers to zero.
 643     // There are also dependencies between the instructions, and we run up against the
 644     // ROB-read limit (can only read 2 registers per cycle).
 645     // We also need the number of uops in the loop to be a multiple of 3.
 646     // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
 647         // if Position-Independent Code is required.
 648
 649         // Register usage
 650     // ESI = end of src
 651     // EDI = end of dest
 652     // EBX = index. Counts up to zero (in steps of 2).
 653     // EDX:EAX = scratch, used in multiply.
 654     // ECX = carry1.
 655     // EBP = carry2.
 656         // ESP = points to the multiplier.
 657
 658         // The first member of 'dest' which will be modified is [EDI+4*EBX].
 659         // EAX must already contain the first member of 'src', [ESI+4*EBX].
 660
 661     version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
 662     return "
 663         // Entry point for even length
 664         add EBX, 1;
 665         mov EBP, ECX; // carry
 666
 667         mul int ptr [" ~ M_ADDRESS ~ "]; // M
 668         mov ECX, 0;
 669
 670         add EBP, EAX;
 671         mov EAX, [ESI+4*EBX];
 672         adc ECX, EDX;
 673
 674         mul int ptr [" ~ M_ADDRESS ~ "]; // M
 675         " ~ OP ~ " [-4+EDI+4*EBX], EBP;
 676         mov EBP, zero;
 677
 678         adc ECX, EAX;
 679         mov EAX, [4+ESI+4*EBX];
 680
 681         adc EBP, EDX;
 682         add EBX, 2;
 683         jnl L_done;
 684 L1:
 685         mul int ptr [" ~ M_ADDRESS ~ "];
 686         " ~ OP ~ " [-8+EDI+4*EBX], ECX;
 687         adc EBP, EAX;
 688         mov ECX, zero;
 689         mov EAX, [ESI+4*EBX];
 690         adc ECX, EDX;
 691 " ~
 692         (using_PIC ? "" : "   mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
 693 ~ "
 694         mul int ptr [" ~ M_ADDRESS ~ "];
 695         " ~ OP ~ " [-4+EDI+4*EBX], EBP;
 696         mov EBP, zero;
 697
 698         adc ECX, EAX;
 699         mov EAX, [4+ESI+4*EBX];
 700
 701         adc EBP, EDX;
 702         add EBX, 2;
 703         jl L1;
 704 L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
 705         adc EBP, 0;
 706 ";
 707                 // final carry is now in EBP
 708 }
 709
 710 string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure
 711 {
 712     return "
 713         mul int ptr [" ~M_ADDRESS ~"];
 714         mov EBP, zero;
 715         add ECX, EAX;
 716         mov EAX, [4+ESI+4*EBX];
 717
 718         adc EBP, EDX;
 719         add EBX, 2;
 720         jl L1;
 721         jmp L_done;
 722 ";
 723 }
 724
 725
 726
 727 /**
 728  * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
 729  * where op == '+' or '-'
 730  * Returns carry out of MSB (0 .. FFFF_FFFF).
 731  */
 732 uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
 733         multiplier, uint carry) pure {
 734     // Timing: This is the most time-critical bignum function.
 735     // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
 736
 737     // The main loop is pipelined and unrolled by 2,
 738     //   so entry to the loop is also complicated.
 739
 740     // Register usage
 741     // EDX:EAX = multiply
 742     // EBX = counter
 743     // ECX = carry1
 744     // EBP = carry2
 745     // EDI = dest
 746     // ESI = src
 747
 748     enum string OP = (op=='+')? "add" : "sub";
 749     version (D_PIC)
 750     {
 751         enum { zero = 0 }
 752     }
 753     else
 754     {
 755         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
 756         // when initializing registers to zero.
 757         __gshared int zero = 0;
 758         // use p3/p4 units
 759         __gshared int storagenop; // write-only
 760     }
 761
 762     enum { LASTPARAM = 5*4 } // 4* pushes + return address.
 763     asm pure nothrow {
 764         naked;
 765
 766         push ESI;
 767         push EDI;
 768         push EBX;
 769         push EBP;
 770         mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
 771         mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
 772         align 16;
 773         nop;
 774         mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
 775         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
 776         lea ESI, [ESI + 4*EBX]; // ESI = end of src
 777         mov EBP, 0;
 778         mov ECX, EAX; // ECX = input carry.
 779         neg EBX;                // count UP to zero.
 780         mov EAX, [ESI+4*EBX];
 781         test EBX, 1;
 782         jnz L_enter_odd;
 783     }
 784     // Main loop, with entry point for even length
 785     mixin("asm pure nothrow {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
 786     asm pure nothrow {
 787         mov EAX, EBP; // get final carry
 788         pop EBP;
 789         pop EBX;
 790         pop EDI;
 791         pop ESI;
 792         ret 5*4;
 793     }
 794 L_enter_odd:
 795     mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
 796 }
 797
 798 @system unittest
 799 {
 800
 801     uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
 802     uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
 803     multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
 804     assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
 805     assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
 806          && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
 807 }
 808
 809 /**
 810    Sets result[#] = result[0 .. left.length] + left[#] * right[#]
 811
 812    It is defined in this way to allow cache-efficient multiplication.
 813    This function is equivalent to:
 814     ----
 815     for (int i = 0; i< right.length; ++i)
 816     {
 817         dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
 818                 left, right[i], 0);
 819     }
 820     ----
 821  */
 822 void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
 823         const uint [] right) pure {
 824     // Register usage
 825     // EDX:EAX = used in multiply
 826     // EBX = index
 827     // ECX = carry1
 828     // EBP = carry2
 829     // EDI = end of dest for this pass through the loop. Index for outer loop.
 830     // ESI = end of left. never changes
 831     // [ESP] = M = right[i] = multiplier for this pass through the loop.
 832     // right.length is changed into dest.ptr+dest.length
 833     version (D_PIC)
 834     {
 835         enum { zero = 0 }
 836     }
 837     else
 838     {
 839         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
 840         // when initializing registers to zero.
 841         __gshared int zero = 0;
 842         // use p3/p4 units
 843         __gshared int storagenop; // write-only
 844     }
 845
 846     enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
 847     asm pure nothrow {
 848         naked;
 849
 850         push ESI;
 851         push EDI;
 852         align 16;
 853         push EBX;
 854         push EBP;
 855         push EAX;    // local variable M
 856         mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
 857         mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
 858         mov ESI, [ESP + LASTPARAM + 4*3];  // left.ptr
 859         lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
 860
 861         mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
 862         lea EAX, [EDI + 4*EAX];
 863         mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
 864
 865         lea ESI, [ESI + 4*EBX]; // ESI = end of left
 866         mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
 867         mov EAX, [EAX];
 868         mov [ESP], EAX; // M
 869 outer_loop:
 870         mov EBP, 0;
 871         mov ECX, 0; // ECX = input carry.
 872         neg EBX;                // count UP to zero.
 873         mov EAX, [ESI+4*EBX];
 874         test EBX, 1;
 875         jnz L_enter_odd;
 876     }
 877     // -- Inner loop, with even entry point
 878     mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
 879     asm pure nothrow {
 880         mov [-4+EDI+4*EBX], EBP;
 881         add EDI, 4;
 882         cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
 883         jz outer_done;
 884         mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
 885         mov EAX, [EAX+4];                 // get new M
 886         mov [ESP], EAX;                   // save new M
 887         add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
 888         mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
 889         jmp outer_loop;
 890 outer_done:
 891         pop EAX;
 892         pop EBP;
 893         pop EBX;
 894         pop EDI;
 895         pop ESI;
 896         ret 6*4;
 897     }
 898 L_enter_odd:
 899     mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
 900 }
 901
 902 /**  dest[#] /= divisor.
 903  * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
 904  * divisor must not be a power of 2 (use right shift for that case;
 905  * A division by zero will occur if divisor is a power of 2).
 906  * Returns the final remainder
 907  *
 908  * Based on public domain code by Eric Bainville.
 909  * (http://www.bealto.com/) Used with permission.
 910  */
 911 uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure
 912 {
 913     // Timing: limited by a horrible dependency chain.
 914     // Pentium M: 18 cycles/op, 8 resource stalls/op.
 915     // EAX, EDX = scratch, used by MUL
 916     // EDI = dest
 917     // CL = shift
 918     // ESI = quotient
 919     // EBX = remainderhi
 920     // EBP = remainderlo
 921     // [ESP-4] = mask
 922     // [ESP] = kinv (2^64 /divisor)
 923     enum { LASTPARAM = 5*4 } // 4* pushes + return address.
 924     enum { LOCALS = 2*4} // MASK, KINV
 925     asm pure nothrow {
 926         naked;
 927
 928         push ESI;
 929         push EDI;
 930         push EBX;
 931         push EBP;
 932
 933         mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
 934         mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
 935
 936         // Loop from msb to lsb
 937         lea     EDI, [EDI + 4*EBX];
 938         mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
 939         // Build the pseudo-inverse of divisor k: 2^64/k
 940         // First determine the shift in ecx to get the max number of bits in kinv
 941         xor     ECX, ECX;
 942         mov     EAX, [ESP + LASTPARAM]; //divisor;
 943         mov     EDX, 1;
 944 kinv1:
 945         inc     ECX;
 946         ror     EDX, 1;
 947         shl     EAX, 1;
 948         jnc     kinv1;
 949         dec     ECX;
 950         // Here, ecx is a left shift moving the msb of k to bit 32
 951
 952         mov     EAX, 1;
 953         shl     EAX, CL;
 954         dec     EAX;
 955         ror     EAX, CL ; //ecx bits at msb
 956         push    EAX; // MASK
 957
 958         // Then divide 2^(32+cx) by divisor (edx already ok)
 959         xor     EAX, EAX;
 960         div     int ptr [ESP + LASTPARAM +  LOCALS-4*1]; //divisor;
 961         push    EAX; // kinv
 962         align   16;
 963 L2:
 964         // Get 32 bits of quotient approx, multiplying
 965         // most significant word of (rem*2^32+input)
 966         mov     EAX, [ESP+4]; //MASK;
 967         and     EAX, [EDI - 4];
 968         or      EAX, EBP;
 969         rol     EAX, CL;
 970         mov     EBX, EBP;
 971         mov     EBP, [EDI - 4];
 972         mul     int ptr [ESP]; //KINV;
 973
 974         shl     EAX, 1;
 975         rcl     EDX, 1;
 976
 977         // Multiply by k and subtract to get remainder
 978         // Subtraction must be done on two words
 979         mov     EAX, EDX;
 980         mov     ESI, EDX; // quot = high word
 981         mul     int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
 982         sub     EBP, EAX;
 983         sbb     EBX, EDX;
 984         jz      Lb;  // high word is 0, goto adjust on single word
 985
 986         // Adjust quotient and remainder on two words
 987 Ld:     inc     ESI;
 988         sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
 989         sbb     EBX, 0;
 990         jnz     Ld;
 991
 992         // Adjust quotient and remainder on single word
 993 Lb:     cmp     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
 994         jc      Lc; // rem in 0 .. divisor-1, OK
 995         sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
 996         inc     ESI;
 997         jmp     Lb;
 998
 999         // Store result
1000 Lc:
1001         mov     [EDI - 4], ESI;
1002         lea     EDI, [EDI - 4];
1003         dec     int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
1004         jnz    L2;
1005
1006         pop EAX; // discard kinv
1007         pop EAX; // discard mask
1008
1009         mov     EAX, EBP; // return final remainder
1010         pop     EBP;
1011         pop     EBX;
1012         pop     EDI;
1013         pop     ESI;
1014         ret     3*4;
1015     }
1016 }
1017
1018 @system unittest
1019 {
1020     uint [] aa = new uint[101];
1021     for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
1022     uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
1023     uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
1024     for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
1025     assert(r == 0x33FF_7461);
1026 }
1027
1028 // Set dest[2*i .. 2*i+1]+=src[i]*src[i]
1029 void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure
1030 {
1031     /* Unlike mulAdd, the carry is only 1 bit,
1032            since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
1033            Note also that on the last iteration, no carry can occur.
1034            As for multibyteAdd, we save & restore carry flag through the loop.
1035
1036            The timing is entirely dictated by the dependency chain. We could
1037            improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
1038     */
1039     enum { LASTPARAM = 4*5 } // 4* pushes + return address.
1040     asm pure nothrow {
1041         naked;
1042         push ESI;
1043         push EDI;
1044         push EBX;
1045             push ECX;
1046         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
1047         mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
1048         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
1049         lea EDI, [EDI + 8*EBX];      // EDI = end of dest
1050         lea ESI, [ESI + 4*EBX];      // ESI = end of src
1051         neg EBX;                     // count UP to zero.
1052         xor ECX, ECX;             // initial carry = 0.
1053 L1:
1054         mov EAX, [ESI + 4*EBX];
1055         mul EAX, EAX;
1056         shr CL, 1;                 // get carry
1057         adc [EDI + 8*EBX], EAX;
1058         adc [EDI + 8*EBX + 4], EDX;
1059         setc CL;                   // save carry
1060         inc EBX;
1061         jnz L1;
1062
1063         pop ECX;
1064         pop EBX;
1065         pop EDI;
1066         pop ESI;
1067         ret 4*4;
1068     }
1069 }
1070
1071 @system unittest
1072 {
1073     uint [] aa = new uint[13];
1074         uint [] bb = new uint[6];
1075     for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
1076     for (int i=0; i<bb.length; ++i) bb[i] = i;
1077         aa[$-1]= 7;
1078     multibyteAddDiagonalSquares(aa[0..$-1], bb);
1079         assert(aa[$-1]==7);
1080         for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
1081 }
1082
1083 void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure
1084 {
1085     for (int i = 0; i < x.length-3; ++i)
1086     {
1087         dest[i+x.length] = multibyteMulAdd!('+')(
1088              dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
1089     }
1090     ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
1091     dest[$-5] = cast(uint) c;
1092     c >>= 32;
1093     c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
1094     dest[$-4] = cast(uint) c;
1095     c >>= 32;
1096 length2:
1097     c += cast(ulong)(x[$-2]) * x[$-1];
1098         dest[$-3] = cast(uint) c;
1099         c >>= 32;
1100         dest[$-2] = cast(uint) c;
1101 }
1102
1103 //dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1104 // assert(dest.length = src.length*2);
1105 // assert(src.length >= 3);
1106 void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure
1107 {
1108     // Register usage
1109     // EDX:EAX = used in multiply
1110     // EBX = index
1111     // ECX = carry1
1112     // EBP = carry2
1113     // EDI = end of dest for this pass through the loop. Index for outer loop.
1114     // ESI = end of src. never changes
1115     // [ESP] = M = src[i] = multiplier for this pass through the loop.
1116     // dest.length is changed into dest.ptr+dest.length
1117     version (D_PIC)
1118     {
1119         enum { zero = 0 }
1120     }
1121     else
1122     {
1123         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
1124         // when initializing registers to zero.
1125         __gshared int zero = 0;
1126         // use p3/p4 units
1127         __gshared int storagenop; // write-only
1128     }
1129
1130     enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
1131     asm pure nothrow {
1132         naked;
1133
1134         push ESI;
1135         push EDI;
1136         align 16;
1137         push EBX;
1138         push EBP;
1139         push EAX;    // local variable M= src[i]
1140         mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
1141         mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1142         mov ESI, [ESP + LASTPARAM + 4*1];  // src.ptr
1143
1144         lea ESI, [ESI + 4*EBX]; // ESI = end of left
1145         add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
1146
1147         // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
1148         lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
1149
1150         lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
1151         mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI  = &dest[src.length*2 -3]
1152
1153         cmp EBX, 3;
1154         jz length_is_3;
1155
1156         // We start at src[1], not src[0].
1157         dec EBX;
1158         mov [ESP + LASTPARAM + 4*0], EBX;
1159
1160 outer_loop:
1161         mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1162         mov EBP, 0;
1163         mov ECX, 0; // ECX = input carry.
1164         dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
1165         neg EBX;                // count UP to zero.
1166
1167         mov EAX, [ESI + 4*EBX - 4*1]; // get new M
1168         mov [ESP], EAX;                   // save new M
1169
1170         mov EAX, [ESI+4*EBX];
1171         test EBX, 1;
1172         jnz L_enter_odd;
1173     }
1174     // -- Inner loop, with even entry point
1175     mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
1176     asm pure nothrow {
1177         mov [-4+EDI+4*EBX], EBP;
1178         add EDI, 4;
1179         cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
1180         jnz outer_loop;
1181 length_is_3:
1182         mov EAX, [ESI - 4*3];
1183         mul EAX, [ESI - 4*2];
1184         mov ECX, 0;
1185         add [EDI-2*4], EAX;  // ECX:dest[$-5] += x[$-3] * x[$-2]
1186         adc ECX, EDX;
1187
1188         mov EAX, [ESI - 4*3];
1189         mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
1190         add EAX, ECX;
1191         mov ECX, 0;
1192         adc EDX, 0;
1193         // now EDX: EAX = c + x[$-3] * x[$-1]
1194         add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
1195         adc ECX, EDX;  //  ECX holds dest[$-3], it acts as carry for the last row
1196 // do length == 2
1197         mov EAX, [ESI - 4*2];
1198         mul EAX, [ESI - 4*1];
1199         add ECX, EAX;
1200         adc EDX, 0;
1201         mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
1202         mov [EDI + 1*4], EDX;
1203
1204         pop EAX;
1205         pop EBP;
1206         pop EBX;
1207         pop EDI;
1208         pop ESI;
1209         ret 4*4;
1210     }
1211 L_enter_odd:
1212     mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
1213 }
1214
1215 @system unittest
1216 {
1217    uint [] aa = new uint[200];
1218    uint [] a  = aa[0 .. 100];
1219    uint [] b  = new uint [100];
1220    aa[] = 761;
1221    a[] = 0;
1222    b[] = 0;
1223    a[3] = 6;
1224    b[0]=1;
1225    b[1] = 17;
1226    b[50 .. 100]=78;
1227    multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
1228    uint [] c = new uint[100];
1229    c[] = 0;
1230    c[1] = 17;
1231    c[3] = 6;
1232    assert(a[]==c[]);
1233    assert(a[0]==0);
1234    aa[] = 0xFFFF_FFFF;
1235    a[] = 0;
1236    b[] = 0;
1237    b[0]= 0xbf6a1f01;
1238    b[1]=  0x6e38ed64;
1239    b[2]=  0xdaa797ed;
1240    b[3] = 0;
1241
1242    multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
1243    assert(a[1]==0x3a600964);
1244    assert(a[2]==0x339974f6);
1245    assert(a[3]==0x46736fce);
1246    assert(a[4]==0x5e24a2b4);
1247
1248    b[3] = 0xe93ff9f4;
1249    b[4] = 0x184f03;
1250    a[]=0;
1251    multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
1252    assert(a[3]==0x79fff5c2);
1253    assert(a[4]==0xcf384241);
1254    assert(a[5]== 0x4a17fc8);
1255    assert(a[6]==0x4d549025);
1256 }
1257
1258
1259 void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure
1260 {
1261     if (x.length < 4)
1262     {
1263         // Special cases, not worth doing triangular.
1264         result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
1265         multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
1266         return;
1267     }
1268     //  Do half a square multiply.
1269     //  dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1270     result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
1271     multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
1272     // Multiply by 2
1273     result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
1274     // And add the diagonal elements
1275     result[0] = 0;
1276     multibyteAddDiagonalSquares(result, x);
1277 }
1278
1279 version (BignumPerformanceTest)
1280 {
1281 import core.stdc.stdio;
1282 int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
1283
1284 __gshared uint [2200] X1;
1285 __gshared uint [2200] Y1;
1286 __gshared uint [4000] Z1;
1287
1288 void testPerformance() pure
1289 {
1290     // The performance results at the top of this file were obtained using
1291     // a Windows device driver to access the CPU performance counters.
1292     // The code below is less accurate but more widely usable.
1293     // The value for division is quite inconsistent.
1294     for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
1295     int t, t0;
1296     multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1297     t0 = clock();
1298     multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
1299     t = clock();
1300     multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1301     auto shltime = (clock() - t) - (t - t0);
1302     t0 = clock();
1303     multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
1304     t = clock();
1305     multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
1306     auto shrtime = (clock() - t) - (t - t0);
1307     t0 = clock();
1308     multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
1309     t = clock();
1310     multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
1311     auto addtime = (clock() - t) - (t-t0);
1312     t0 = clock();
1313     multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
1314     t = clock();
1315     multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
1316     auto multime = (clock() - t) - (t - t0);
1317     multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1318     t0 = clock();
1319     multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
1320     t = clock();
1321     multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1322     auto muladdtime = (clock() - t) - (t - t0);
1323     multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1324     t = clock();
1325     multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1326     auto accumtime = clock() - t;
1327     t0 = clock();
1328     multibyteDivAssign(Z1[0 .. 2000], 217, 0);
1329     t = clock();
1330     multibyteDivAssign(Z1[0 .. 1000], 37, 0);
1331     auto divtime = (t - t0) - (clock() - t);
1332         t= clock();
1333     multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
1334     auto squaretime = clock() - t;
1335
1336     printf("-- BigInt asm performance (cycles/int) --\n");
1337     printf("Add:        %.2f\n", addtime/1000.0);
1338     printf("Shl:        %.2f\n", shltime/1000.0);
1339     printf("Shr:        %.2f\n", shrtime/1000.0);
1340     printf("Mul:        %.2f\n", multime/1000.0);
1341     printf("MulAdd:     %.2f\n", muladdtime/1000.0);
1342     printf("Div:        %.2f\n", divtime/1000.0);
1343     printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
1344     printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
1345 }
1346
1347 static this()
1348 {
1349     testPerformance();
1350 }
1351 }
1352
1353 } // version (D_InlineAsm_X86)