1 // Written in the D programming language.
4 * Builtin SIMD intrinsics
6 * Source: $(DRUNTIMESRC core/_simd.d)
8 * Copyright: Copyright Digital Mars 2012.
9 * License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
10 * Authors: $(WEB digitalmars.com, Walter Bright),
13 /* NOTE: This file has been patched from the original DMD distribution to
14 * work with the GDC compiler.
23 /*******************************
24 * Create a vector type.
27 * T = one of double[2], float[4], void[16], byte[16], ubyte[16],
28 * short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
29 * For 256 bit vectors,
30 * one of double[4], float[8], void[32], byte[32], ubyte[32],
31 * short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
// Vector!T resolves to the compiler's built-in __vector(T) type; element
// type/length validation is done by the compiler itself, not this template.
36 /* __vector is compiler magic, hide it behind a template.
37 * The compiler will reject T's that don't work.
39 alias __vector(T) Vector;
// 64-bit (MMX-width) vector aliases — each is declared only when the
// target actually supports that vector type (hence the static if guards).
44 static if (is(Vector!(void[8]))) alias Vector!(void[8]) void8; ///
45 static if (is(Vector!(float[2]))) alias Vector!(float[2]) float2; ///
46 static if (is(Vector!(byte[8]))) alias Vector!(byte[8]) byte8; ///
47 static if (is(Vector!(ubyte[8]))) alias Vector!(ubyte[8]) ubyte8; ///
48 static if (is(Vector!(short[4]))) alias Vector!(short[4]) short4; ///
49 static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4]) ushort4; ///
50 static if (is(Vector!(int[2]))) alias Vector!(int[2]) int2; ///
51 static if (is(Vector!(uint[2]))) alias Vector!(uint[2]) uint2; ///
// 128-bit (XMM-width) vector aliases.
53 static if (is(Vector!(void[16]))) alias Vector!(void[16]) void16; ///
54 static if (is(Vector!(double[2]))) alias Vector!(double[2]) double2; ///
55 static if (is(Vector!(float[4]))) alias Vector!(float[4]) float4; ///
56 static if (is(Vector!(byte[16]))) alias Vector!(byte[16]) byte16; ///
57 static if (is(Vector!(ubyte[16]))) alias Vector!(ubyte[16]) ubyte16; ///
58 static if (is(Vector!(short[8]))) alias Vector!(short[8]) short8; ///
59 static if (is(Vector!(ushort[8]))) alias Vector!(ushort[8]) ushort8; ///
60 static if (is(Vector!(int[4]))) alias Vector!(int[4]) int4; ///
61 static if (is(Vector!(uint[4]))) alias Vector!(uint[4]) uint4; ///
62 static if (is(Vector!(long[2]))) alias Vector!(long[2]) long2; ///
63 static if (is(Vector!(ulong[2]))) alias Vector!(ulong[2]) ulong2; ///
// 256-bit (YMM-width) vector aliases.
65 static if (is(Vector!(void[32]))) alias Vector!(void[32]) void32; ///
66 static if (is(Vector!(double[4]))) alias Vector!(double[4]) double4; ///
67 static if (is(Vector!(float[8]))) alias Vector!(float[8]) float8; ///
68 static if (is(Vector!(byte[32]))) alias Vector!(byte[32]) byte32; ///
69 static if (is(Vector!(ubyte[32]))) alias Vector!(ubyte[32]) ubyte32; ///
70 static if (is(Vector!(short[16]))) alias Vector!(short[16]) short16; ///
71 static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16; ///
72 static if (is(Vector!(int[8]))) alias Vector!(int[8]) int8; ///
73 static if (is(Vector!(uint[8]))) alias Vector!(uint[8]) uint8; ///
74 static if (is(Vector!(long[4]))) alias Vector!(long[4]) long4; ///
75 static if (is(Vector!(ulong[4]))) alias Vector!(ulong[4]) ulong4; ///
79 /** XMM opcodes that conform to the following:
81 * opcode xmm1,xmm2/mem
83 * and do not have side effects (i.e. do not write to memory).
// Each enum value is the instruction's raw encoding bytes (mandatory
// prefix + 0F escape + opcode), e.g. 0x660F68 == "66 0F 68" == PUNPCKHBW.
125 // Use STO and LOD instead of MOV to distinguish the direction
131 STOD = 0x660F7E, // MOVD reg/mem64, xmm 66 0F 7E /r
139 LODD = 0x660F6E, // MOVD xmm, reg/mem64 66 0F 6E /r
142 LODDQU = 0xF30F6F, // MOVDQU xmm1, xmm2/mem128 F3 0F 6F /r
143 STODQU = 0xF30F7F, // MOVDQU xmm1/mem128, xmm2 F3 0F 7F /r
144 MOVDQ2Q = 0xF20FD6, // MOVDQ2Q mmx, xmm F2 0F D6 /r
145 MOVHLPS = 0x0F12, // MOVHLPS xmm1, xmm2 0F 12 /r
147 STOHPD = 0x660F17, // MOVHPD mem64, xmm 66 0F 17 /r
// PUNPCK*: interleave (unpack) high/low bytes, words and dwords
195 PUNPCKHBW = 0x660F68,
196 PUNPCKHDQ = 0x660F6A,
197 PUNPCKHWD = 0x660F69,
198 PUNPCKLBW = 0x660F60,
199 PUNPCKLDQ = 0x660F62,
200 PUNPCKLWD = 0x660F61,
// CVTT*: truncating (round-toward-zero) float -> integer conversions
228 CVTTPD2PI = 0x660F2C,
229 CVTTPD2DQ = 0x660FE6,
230 CVTTPS2DQ = 0xF30F5B,
232 CVTTSD2SI = 0xF20F2C,
233 CVTTSS2SI = 0xF30F2C,
234 MASKMOVDQU = 0x660FF7,
253 //PMOVMSKB = 0x660FD7,
258 PUNPCKHQDQ = 0x660F6D,
259 PUNPCKLQDQ = 0x660F6C,
284 // SSE3 Pentium 4 (Prescott)
300 PALIGNR = 0x660F3A0F,
303 PHADDSW = 0x660F3803,
311 PMADDUBSW = 0x660F3804,
312 PMULHRSW = 0x660F380B,
315 PHSUBSW = 0x660F3807,
319 BLENDPD = 0x660F3A0D,
320 BLENDPS = 0x660F3A0C,
321 BLENDVPD = 0x660F3815,
322 BLENDVPS = 0x660F3814,
325 EXTRACTPS = 0x660F3A17,
326 INSERTPS = 0x660F3A21,
327 MPSADBW = 0x660F3A42,
328 PBLENDVB = 0x660F3810,
329 PBLENDW = 0x660F3A0E,
336 MOVNTDQA = 0x660F382A,
337 PACKUSDW = 0x660F382B,
338 PCMPEQQ = 0x660F3829,
340 PHMINPOSUW = 0x660F3841,
// PMOVSX*/PMOVZX*: sign-/zero-extending packed element widening moves
349 PMOVSXBW = 0x660F3820,
350 PMOVSXBD = 0x660F3821,
351 PMOVSXBQ = 0x660F3822,
352 PMOVSXWD = 0x660F3823,
353 PMOVSXWQ = 0x660F3824,
354 PMOVSXDQ = 0x660F3825,
355 PMOVZXBW = 0x660F3830,
356 PMOVZXBD = 0x660F3831,
357 PMOVZXBQ = 0x660F3832,
358 PMOVZXWD = 0x660F3833,
359 PMOVZXWQ = 0x660F3834,
360 PMOVZXDQ = 0x660F3835,
// ROUND*: rounding mode is supplied via the imm8 operand
365 ROUNDPD = 0x660F3A09,
366 ROUNDPS = 0x660F3A08,
367 ROUNDSD = 0x660F3A0B,
368 ROUNDSS = 0x660F3A0A,
// PCMP*STR*: string compare; PCMPGTQ: packed 64-bit greater-than
371 PCMPESTRI = 0x660F3A61,
372 PCMPESTRM = 0x660F3A60,
373 PCMPISTRI = 0x660F3A63,
374 PCMPISTRM = 0x660F3A62,
375 PCMPGTQ = 0x660F3837,
379 // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS
381 // POPCNT and LZCNT (have their own CPUID bits)
387 * Generate two operand instruction with XMM 128 bit operands.
389 * This is a compiler magic function - it doesn't behave like
390 * regular D functions.
393 * opcode any of the XMM opcodes; it must be a compile time constant
// Binary form: returns the result of op1 <opcode> op2. Declaration only;
// the compiler substitutes the actual instruction.
399 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
402 * Unary SIMD instructions.
// Unary forms: a single vector, double or float operand.
404 pure @safe void16 __simd(XMM opcode, void16 op1);
405 pure @safe void16 __simd(XMM opcode, double d); ///
406 pure @safe void16 __simd(XMM opcode, float f); ///
// Three-operand form for instructions that also take an 8-bit immediate:
410 * CMPPD, CMPSS, CMPSD, CMPPS,
411 * PSHUFD, PSHUFHW, PSHUFLW,
412 * BLENDPD, BLENDPS, DPPD, DPPS,
414 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
416 * opcode any of the above XMM opcodes; it must be a compile time constant
419 * imm8 third operand; must be a compile time constant
423 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
// Immediate-shift form (the "ib" variants of the shift instructions):
426 * For instructions with the imm8 version:
427 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
430 * opcode any of the XMM opcodes; it must be a compile time constant
432 * imm8 second operand; must be a compile time constant
436 pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
439 * For "store" operations of the form:
// Store forms write through op1, hence not pure (see note below).
443 * These cannot be marked as pure, as semantic() doesn't check them.
445 @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
446 @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
447 @safe void16 __simd_sto(XMM opcode, float op1, void16 op2); ///
449 /* The following use overloading to ensure correct typing.
450 * Compile with inlining on for best performance.
// Element-wise equality compare of 8 signed shorts via PCMPEQW:
// each lane becomes all-ones (0xFFFF) where equal, 0 where not.
453 pure @safe short8 pcmpeq()(short8 v1, short8 v2)
455 return __simd(XMM.PCMPEQW, v1, v2);
// Unsigned variant uses the same instruction - equality is sign-agnostic.
458 pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
460 return __simd(XMM.PCMPEQW, v1, v2);
463 /*********************
464 * Emit prefetch instruction.
466 * address = address to be prefetched
467 * writeFetch = true for write fetch, false for read fetch
468 * locality = 0..3 (0 meaning least local, 3 meaning most local)
470 * The Intel mappings are:
472 * $(THEAD writeFetch, locality, Instruction)
473 * $(TROW false, 0, prefetchnta)
474 * $(TROW false, 1, prefetcht2)
475 * $(TROW false, 2, prefetcht1)
476 * $(TROW false, 3, prefetcht0)
477 * $(TROW true, 0, prefetchw)
478 * $(TROW true, 1, prefetchw)
479 * $(TROW true, 2, prefetchw)
480 * $(TROW true, 3, prefetchw)
483 void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
// Encoding 4 selects the write prefetch; 0..3 select the read forms, with
// locality inverted so locality 3 (most local) maps to encoding 0.
// NOTE(review): the exact encoding->instruction mapping is defined by
// __prefetch's implementation - confirm there before relying on it.
485 static if (writeFetch)
486 __prefetch(address, 4);
487 else static if (locality < 4)
488 __prefetch(address, 3 - locality);
// Rejected at compile time for out-of-range locality (>= 4).
490 static assert(0, "0..3 expected for locality");
// Compiler intrinsic (declaration only): emits the prefetch instruction
// selected by `encoding` for the given address.
493 private void __prefetch(const(void*) address, ubyte encoding);
495 /*************************************
496 * Load unaligned vector from address.
497 * This is a compiler intrinsic.
499 * p = pointer to vector
504 V loadUnaligned(V)(const V* p)
505 if (is(V == void16) ||
515 pragma(inline, true);
// Dispatch on V to the type-appropriate unaligned-load instruction:
// MOVUPD for double2, MOVUPS for float4, MOVDQU for all other 16-byte types.
516 static if (is(V == double2))
517 return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
518 else static if (is(V == float4))
519 return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
521 return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
524 /*************************************
525 * Store vector to unaligned address.
526 * This is a compiler intrinsic.
528 * p = pointer to vector
529 * value = value to store
534 V storeUnaligned(V)(V* p, V value)
535 if (is(V == void16) ||
545 pragma(inline, true);
// Dispatch on V to the type-appropriate unaligned-store instruction
// (MOVUPD / MOVUPS / MOVDQU); the stored value is also returned.
546 static if (is(V == double2))
547 return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
548 else static if (is(V == float4))
549 return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
551 return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);