1 // Written in the D programming language.
4 * Builtin SIMD intrinsics
6 * Source: $(DRUNTIMESRC core/_simd.d)
8 * Copyright: Copyright Digital Mars 2012.
9 * License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
10 * Authors: $(WEB digitalmars.com, Walter Bright),
13 /* NOTE: This file has been patched from the original DMD distribution to
14 * work with the GDC compiler.
23 /*******************************
24 * Create a vector type.
27 * T = one of double[2], float[4], void[16], byte[16], ubyte[16],
28 * short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
29 * For 256 bit vectors,
30 * one of double[4], float[8], void[32], byte[32], ubyte[32],
31 * short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
// Vector!T resolves to the compiler's built-in __vector(T) type; element
// type/length validation is done by the compiler itself, not this template.
36 /* __vector is compiler magic, hide it behind a template.
37 * The compiler will reject T's that don't work.
39 alias __vector(T) Vector;
// 64-bit (MMX-width) vector aliases — each is declared only when the
// target actually supports that vector type (hence the static if guards).
44 static if (is(Vector!(void[8]))) alias Vector!(void[8]) void8; ///
45 static if (is(Vector!(float[2]))) alias Vector!(float[2]) float2; ///
46 static if (is(Vector!(byte[8]))) alias Vector!(byte[8]) byte8; ///
47 static if (is(Vector!(ubyte[8]))) alias Vector!(ubyte[8]) ubyte8; ///
48 static if (is(Vector!(short[4]))) alias Vector!(short[4]) short4; ///
49 static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4]) ushort4; ///
50 static if (is(Vector!(int[2]))) alias Vector!(int[2]) int2; ///
51 static if (is(Vector!(uint[2]))) alias Vector!(uint[2]) uint2; ///
// 128-bit (XMM-width) vector aliases.
53 static if (is(Vector!(void[16]))) alias Vector!(void[16]) void16; ///
54 static if (is(Vector!(double[2]))) alias Vector!(double[2]) double2; ///
55 static if (is(Vector!(float[4]))) alias Vector!(float[4]) float4; ///
56 static if (is(Vector!(byte[16]))) alias Vector!(byte[16]) byte16; ///
57 static if (is(Vector!(ubyte[16]))) alias Vector!(ubyte[16]) ubyte16; ///
58 static if (is(Vector!(short[8]))) alias Vector!(short[8]) short8; ///
59 static if (is(Vector!(ushort[8]))) alias Vector!(ushort[8]) ushort8; ///
60 static if (is(Vector!(int[4]))) alias Vector!(int[4]) int4; ///
61 static if (is(Vector!(uint[4]))) alias Vector!(uint[4]) uint4; ///
62 static if (is(Vector!(long[2]))) alias Vector!(long[2]) long2; ///
63 static if (is(Vector!(ulong[2]))) alias Vector!(ulong[2]) ulong2; ///
// 256-bit (YMM-width) vector aliases.
65 static if (is(Vector!(void[32]))) alias Vector!(void[32]) void32; ///
66 static if (is(Vector!(double[4]))) alias Vector!(double[4]) double4; ///
67 static if (is(Vector!(float[8]))) alias Vector!(float[8]) float8; ///
68 static if (is(Vector!(byte[32]))) alias Vector!(byte[32]) byte32; ///
69 static if (is(Vector!(ubyte[32]))) alias Vector!(ubyte[32]) ubyte32; ///
70 static if (is(Vector!(short[16]))) alias Vector!(short[16]) short16; ///
71 static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16; ///
72 static if (is(Vector!(int[8]))) alias Vector!(int[8]) int8; ///
73 static if (is(Vector!(uint[8]))) alias Vector!(uint[8]) uint8; ///
74 static if (is(Vector!(long[4]))) alias Vector!(long[4]) long4; ///
75 static if (is(Vector!(ulong[4]))) alias Vector!(ulong[4]) ulong4; ///
79 /** XMM opcodes that conform to the following:
81 * opcode xmm1,xmm2/mem
83 * and do not have side effects (i.e. do not write to memory).
// Each enum value is the instruction's raw encoding bytes (mandatory
// prefix + 0F escape + opcode), e.g. 0x660F68 == "66 0F 68" == PUNPCKHBW.
125 // Use STO and LOD instead of MOV to distinguish the direction
131 STOD = 0x660F7E, // MOVD reg/mem64, xmm 66 0F 7E /r
139 LODD = 0x660F6E, // MOVD xmm, reg/mem64 66 0F 6E /r
142 LODDQU = 0xF30F6F, // MOVDQU xmm1, xmm2/mem128 F3 0F 6F /r
143 STODQU = 0xF30F7F, // MOVDQU xmm1/mem128, xmm2 F3 0F 7F /r
144 MOVDQ2Q = 0xF20FD6, // MOVDQ2Q mmx, xmm F2 0F D6 /r
145 MOVHLPS = 0x0F12, // MOVHLPS xmm1, xmm2 0F 12 /r
147 STOHPD = 0x660F17, // MOVHPD mem64, xmm 66 0F 17 /r
// PUNPCK*: interleave (unpack) high/low bytes, words and dwords
195 PUNPCKHBW = 0x660F68,
196 PUNPCKHDQ = 0x660F6A,
197 PUNPCKHWD = 0x660F69,
198 PUNPCKLBW = 0x660F60,
199 PUNPCKLDQ = 0x660F62,
200 PUNPCKLWD = 0x660F61,
// CVTT*: truncating (round-toward-zero) float -> integer conversions
228 CVTTPD2PI = 0x660F2C,
229 CVTTPD2DQ = 0x660FE6,
230 CVTTPS2DQ = 0xF30F5B,
232 CVTTSD2SI = 0xF20F2C,
233 CVTTSS2SI = 0xF30F2C,
234 MASKMOVDQU = 0x660FF7,
253 //PMOVMSKB = 0x660FD7,
258 PUNPCKHQDQ = 0x660F6D,
259 PUNPCKLQDQ = 0x660F6C,
284 // SSE3 Pentium 4 (Prescott)
300 PALIGNR = 0x660F3A0F,
303 PHADDSW = 0x660F3803,
311 PMADDUBSW = 0x660F3804,
312 PMULHRSW = 0x660F380B,
315 PHSUBSW = 0x660F3807,
319 BLENDPD = 0x660F3A0D,
320 BLENDPS = 0x660F3A0C,
321 BLENDVPD = 0x660F3815,
322 BLENDVPS = 0x660F3814,
325 EXTRACTPS = 0x660F3A17,
326 INSERTPS = 0x660F3A21,
327 MPSADBW = 0x660F3A42,
328 PBLENDVB = 0x660F3810,
329 PBLENDW = 0x660F3A0E,
336 MOVNTDQA = 0x660F382A,
337 PACKUSDW = 0x660F382B,
338 PCMPEQQ = 0x660F3829,
340 PHMINPOSUW = 0x660F3841,
// PMOVSX*/PMOVZX*: sign-/zero-extending packed element widening moves
349 PMOVSXBW = 0x660F3820,
350 PMOVSXBD = 0x660F3821,
351 PMOVSXBQ = 0x660F3822,
352 PMOVSXWD = 0x660F3823,
353 PMOVSXWQ = 0x660F3824,
354 PMOVSXDQ = 0x660F3825,
355 PMOVZXBW = 0x660F3830,
356 PMOVZXBD = 0x660F3831,
357 PMOVZXBQ = 0x660F3832,
358 PMOVZXWD = 0x660F3833,
359 PMOVZXWQ = 0x660F3834,
360 PMOVZXDQ = 0x660F3835,
// ROUND*: rounding mode is supplied via the imm8 operand
365 ROUNDPD = 0x660F3A09,
366 ROUNDPS = 0x660F3A08,
367 ROUNDSD = 0x660F3A0B,
368 ROUNDSS = 0x660F3A0A,
// PCMP*STR*: string compare; PCMPGTQ: packed 64-bit greater-than
371 PCMPESTRI = 0x660F3A61,
372 PCMPESTRM = 0x660F3A60,
373 PCMPISTRI = 0x660F3A63,
374 PCMPISTRM = 0x660F3A62,
375 PCMPGTQ = 0x660F3837,
379 // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS
381 // POPCNT and LZCNT (have their own CPUID bits)
387 * Generate two operand instruction with XMM 128 bit operands.
389 * This is a compiler magic function - it doesn't behave like
390 * regular D functions.
393 * opcode any of the XMM opcodes; it must be a compile time constant
// Binary form: returns the result of op1 <opcode> op2. Declaration only;
// the compiler substitutes the actual instruction.
399 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
402 * Unary SIMD instructions.
// Unary forms: a single vector, double or float operand.
404 pure @safe void16 __simd(XMM opcode, void16 op1);
405 pure @safe void16 __simd(XMM opcode, double d); ///
406 pure @safe void16 __simd(XMM opcode, float f); ///
// Three-operand form for instructions that also take an 8-bit immediate:
410 * CMPPD, CMPSS, CMPSD, CMPPS,
411 * PSHUFD, PSHUFHW, PSHUFLW,
412 * BLENDPD, BLENDPS, DPPD, DPPS,
414 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
416 * opcode any of the above XMM opcodes; it must be a compile time constant
419 * imm8 third operand; must be a compile time constant
423 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
// Immediate-shift form (the "ib" variants of the shift instructions):
426 * For instructions with the imm8 version:
427 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
430 * opcode any of the XMM opcodes; it must be a compile time constant
432 * imm8 second operand; must be a compile time constant
436 pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
439 * For "store" operations of the form:
// Store forms write through op1, hence not pure (see note below).
443 * These cannot be marked as pure, as semantic() doesn't check them.
445 @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
446 @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
447 @safe void16 __simd_sto(XMM opcode, float op1, void16 op2); ///
449 /* The following use overloading to ensure correct typing.
450 * Compile with inlining on for best performance.
// Element-wise equality compare of 8 signed shorts via PCMPEQW:
// each lane becomes all-ones (0xFFFF) where equal, 0 where not.
453 pure @safe short8 pcmpeq()(short8 v1, short8 v2)
455 return __simd(XMM.PCMPEQW, v1, v2);
// Unsigned variant uses the same instruction - equality is sign-agnostic.
458 pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
460 return __simd(XMM.PCMPEQW, v1, v2);
463 /*********************
464 * Emit prefetch instruction.
466 * address = address to be prefetched
467 * writeFetch = true for write fetch, false for read fetch
468 * locality = 0..3 (0 meaning least local, 3 meaning most local)
470 * The Intel mappings are:
472 * $(THEAD writeFetch, locality, Instruction)
473 * $(TROW false, 0, prefetchnta)
474 * $(TROW false, 1, prefetcht2)
475 * $(TROW false, 2, prefetcht1)
476 * $(TROW false, 3, prefetcht0)
477 * $(TROW true, 0, prefetchw)
478 * $(TROW true, 1, prefetchw)
479 * $(TROW true, 2, prefetchw)
480 * $(TROW true, 3, prefetchw)
483 void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
// Encoding 4 selects the write prefetch; 0..3 select the read forms, with
// locality inverted so locality 3 (most local) maps to encoding 0.
// NOTE(review): the exact encoding->instruction mapping is defined by
// __prefetch's implementation - confirm there before relying on it.
485 static if (writeFetch)
486 __prefetch(address, 4);
487 else static if (locality < 4)
488 __prefetch(address, 3 - locality);
// Rejected at compile time for out-of-range locality (>= 4).
490 static assert(0, "0..3 expected for locality");
// Compiler intrinsic (declaration only): emits the prefetch instruction
// selected by `encoding` for the given address.
493 private void __prefetch(const(void*) address, ubyte encoding);
495 /*************************************
496 * Load unaligned vector from address.
497 * This is a compiler intrinsic.
499 * p = pointer to vector
504 V loadUnaligned(V)(const V* p)
505 if (is(V == void16) ||
515 pragma(inline, true);
// Dispatch on V to the type-appropriate unaligned-load instruction:
// MOVUPD for double2, MOVUPS for float4, MOVDQU for all other 16-byte types.
516 static if (is(V == double2))
517 return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
518 else static if (is(V == float4))
519 return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
521 return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
524 /*************************************
525 * Store vector to unaligned address.
526 * This is a compiler intrinsic.
528 * p = pointer to vector
529 * value = value to store
534 V storeUnaligned(V)(V* p, V value)
535 if (is(V == void16) ||
545 pragma(inline, true);
// Dispatch on V to the type-appropriate unaligned-store instruction
// (MOVUPD / MOVUPS / MOVDQU); the stored value is also returned.
546 static if (is(V == double2))
547 return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
548 else static if (is(V == float4))
549 return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
551 return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);