]> git.ipfire.org Git - thirdparty/gcc.git/blob - libphobos/libdruntime/core/simd.d
Add D front-end, libphobos library, and D2 testsuite.
[thirdparty/gcc.git] / libphobos / libdruntime / core / simd.d
// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012.
 * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(WEB digitalmars.com, Walter Bright)
 */
12
13 /* NOTE: This file has been patched from the original DMD distribution to
14 * work with the GDC compiler.
15 */
16 module core.simd;
17
18 pure:
19 nothrow:
20 @safe:
21 @nogc:
22
23 /*******************************
24 * Create a vector type.
25 *
26 * Parameters:
27 * T = one of double[2], float[4], void[16], byte[16], ubyte[16],
28 * short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
29 * For 256 bit vectors,
30 * one of double[4], float[8], void[32], byte[32], ubyte[32],
31 * short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
32 */
33
34 template Vector(T)
35 {
36 /* __vector is compiler magic, hide it behind a template.
37 * The compiler will reject T's that don't work.
38 */
39 alias __vector(T) Vector;
40 }
41
42 /* Handy aliases
43 */
44 static if (is(Vector!(void[8]))) alias Vector!(void[8]) void8; ///
45 static if (is(Vector!(float[2]))) alias Vector!(float[2]) float2; ///
46 static if (is(Vector!(byte[8]))) alias Vector!(byte[8]) byte8; ///
47 static if (is(Vector!(ubyte[8]))) alias Vector!(ubyte[8]) ubyte8; ///
48 static if (is(Vector!(short[4]))) alias Vector!(short[4]) short4; ///
49 static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4]) ushort4; ///
50 static if (is(Vector!(int[2]))) alias Vector!(int[2]) int2; ///
51 static if (is(Vector!(uint[2]))) alias Vector!(uint[2]) uint2; ///
52
53 static if (is(Vector!(void[16]))) alias Vector!(void[16]) void16; ///
54 static if (is(Vector!(double[2]))) alias Vector!(double[2]) double2; ///
55 static if (is(Vector!(float[4]))) alias Vector!(float[4]) float4; ///
56 static if (is(Vector!(byte[16]))) alias Vector!(byte[16]) byte16; ///
57 static if (is(Vector!(ubyte[16]))) alias Vector!(ubyte[16]) ubyte16; ///
58 static if (is(Vector!(short[8]))) alias Vector!(short[8]) short8; ///
59 static if (is(Vector!(ushort[8]))) alias Vector!(ushort[8]) ushort8; ///
60 static if (is(Vector!(int[4]))) alias Vector!(int[4]) int4; ///
61 static if (is(Vector!(uint[4]))) alias Vector!(uint[4]) uint4; ///
62 static if (is(Vector!(long[2]))) alias Vector!(long[2]) long2; ///
63 static if (is(Vector!(ulong[2]))) alias Vector!(ulong[2]) ulong2; ///
64
65 static if (is(Vector!(void[32]))) alias Vector!(void[32]) void32; ///
66 static if (is(Vector!(double[4]))) alias Vector!(double[4]) double4; ///
67 static if (is(Vector!(float[8]))) alias Vector!(float[8]) float8; ///
68 static if (is(Vector!(byte[32]))) alias Vector!(byte[32]) byte32; ///
69 static if (is(Vector!(ubyte[32]))) alias Vector!(ubyte[32]) ubyte32; ///
70 static if (is(Vector!(short[16]))) alias Vector!(short[16]) short16; ///
71 static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16; ///
72 static if (is(Vector!(int[8]))) alias Vector!(int[8]) int8; ///
73 static if (is(Vector!(uint[8]))) alias Vector!(uint[8]) uint8; ///
74 static if (is(Vector!(long[4]))) alias Vector!(long[4]) long4; ///
75 static if (is(Vector!(ulong[4]))) alias Vector!(ulong[4]) ulong4; ///
76
77 version (D_SIMD)
78 {
79 /** XMM opcodes that conform to the following:
80 *
81 * opcode xmm1,xmm2/mem
82 *
83 * and do not have side effects (i.e. do not write to memory).
84 */
85 enum XMM
86 {
87 ADDSS = 0xF30F58,
88 ADDSD = 0xF20F58,
89 ADDPS = 0x000F58,
90 ADDPD = 0x660F58,
91 PADDB = 0x660FFC,
92 PADDW = 0x660FFD,
93 PADDD = 0x660FFE,
94 PADDQ = 0x660FD4,
95
96 SUBSS = 0xF30F5C,
97 SUBSD = 0xF20F5C,
98 SUBPS = 0x000F5C,
99 SUBPD = 0x660F5C,
100 PSUBB = 0x660FF8,
101 PSUBW = 0x660FF9,
102 PSUBD = 0x660FFA,
103 PSUBQ = 0x660FFB,
104
105 MULSS = 0xF30F59,
106 MULSD = 0xF20F59,
107 MULPS = 0x000F59,
108 MULPD = 0x660F59,
109 PMULLW = 0x660FD5,
110
111 DIVSS = 0xF30F5E,
112 DIVSD = 0xF20F5E,
113 DIVPS = 0x000F5E,
114 DIVPD = 0x660F5E,
115
116 PAND = 0x660FDB,
117 POR = 0x660FEB,
118
119 UCOMISS = 0x000F2E,
120 UCOMISD = 0x660F2E,
121
122 XORPS = 0x000F57,
123 XORPD = 0x660F57,
124
125 // Use STO and LOD instead of MOV to distinguish the direction
126 STOSS = 0xF30F11,
127 STOSD = 0xF20F11,
128 STOAPS = 0x000F29,
129 STOAPD = 0x660F29,
130 STODQA = 0x660F7F,
131 STOD = 0x660F7E, // MOVD reg/mem64, xmm 66 0F 7E /r
132 STOQ = 0x660FD6,
133
134 LODSS = 0xF30F10,
135 LODSD = 0xF20F10,
136 LODAPS = 0x000F28,
137 LODAPD = 0x660F28,
138 LODDQA = 0x660F6F,
139 LODD = 0x660F6E, // MOVD xmm, reg/mem64 66 0F 6E /r
140 LODQ = 0xF30F7E,
141
142 LODDQU = 0xF30F6F, // MOVDQU xmm1, xmm2/mem128 F3 0F 6F /r
143 STODQU = 0xF30F7F, // MOVDQU xmm1/mem128, xmm2 F3 0F 7F /r
144 MOVDQ2Q = 0xF20FD6, // MOVDQ2Q mmx, xmm F2 0F D6 /r
145 MOVHLPS = 0x0F12, // MOVHLPS xmm1, xmm2 0F 12 /r
146 LODHPD = 0x660F16,
147 STOHPD = 0x660F17, // MOVHPD mem64, xmm 66 0F 17 /r
148 LODHPS = 0x0F16,
149 STOHPS = 0x0F17,
150 MOVLHPS = 0x0F16,
151 LODLPD = 0x660F12,
152 STOLPD = 0x660F13,
153 LODLPS = 0x0F12,
154 STOLPS = 0x0F13,
155 MOVMSKPD = 0x660F50,
156 MOVMSKPS = 0x0F50,
157 MOVNTDQ = 0x660FE7,
158 MOVNTI = 0x0FC3,
159 MOVNTPD = 0x660F2B,
160 MOVNTPS = 0x0F2B,
161 MOVNTQ = 0x0FE7,
162 MOVQ2DQ = 0xF30FD6,
163 LODUPD = 0x660F10,
164 STOUPD = 0x660F11,
165 LODUPS = 0x0F10,
166 STOUPS = 0x0F11,
167
168 PACKSSDW = 0x660F6B,
169 PACKSSWB = 0x660F63,
170 PACKUSWB = 0x660F67,
171 PADDSB = 0x660FEC,
172 PADDSW = 0x660FED,
173 PADDUSB = 0x660FDC,
174 PADDUSW = 0x660FDD,
175 PANDN = 0x660FDF,
176 PCMPEQB = 0x660F74,
177 PCMPEQD = 0x660F76,
178 PCMPEQW = 0x660F75,
179 PCMPGTB = 0x660F64,
180 PCMPGTD = 0x660F66,
181 PCMPGTW = 0x660F65,
182 PMADDWD = 0x660FF5,
183 PSLLW = 0x660FF1,
184 PSLLD = 0x660FF2,
185 PSLLQ = 0x660FF3,
186 PSRAW = 0x660FE1,
187 PSRAD = 0x660FE2,
188 PSRLW = 0x660FD1,
189 PSRLD = 0x660FD2,
190 PSRLQ = 0x660FD3,
191 PSUBSB = 0x660FE8,
192 PSUBSW = 0x660FE9,
193 PSUBUSB = 0x660FD8,
194 PSUBUSW = 0x660FD9,
195 PUNPCKHBW = 0x660F68,
196 PUNPCKHDQ = 0x660F6A,
197 PUNPCKHWD = 0x660F69,
198 PUNPCKLBW = 0x660F60,
199 PUNPCKLDQ = 0x660F62,
200 PUNPCKLWD = 0x660F61,
201 PXOR = 0x660FEF,
202 ANDPD = 0x660F54,
203 ANDPS = 0x0F54,
204 ANDNPD = 0x660F55,
205 ANDNPS = 0x0F55,
206 CMPPS = 0x0FC2,
207 CMPPD = 0x660FC2,
208 CMPSD = 0xF20FC2,
209 CMPSS = 0xF30FC2,
210 COMISD = 0x660F2F,
211 COMISS = 0x0F2F,
212 CVTDQ2PD = 0xF30FE6,
213 CVTDQ2PS = 0x0F5B,
214 CVTPD2DQ = 0xF20FE6,
215 CVTPD2PI = 0x660F2D,
216 CVTPD2PS = 0x660F5A,
217 CVTPI2PD = 0x660F2A,
218 CVTPI2PS = 0x0F2A,
219 CVTPS2DQ = 0x660F5B,
220 CVTPS2PD = 0x0F5A,
221 CVTPS2PI = 0x0F2D,
222 CVTSD2SI = 0xF20F2D,
223 CVTSD2SS = 0xF20F5A,
224 CVTSI2SD = 0xF20F2A,
225 CVTSI2SS = 0xF30F2A,
226 CVTSS2SD = 0xF30F5A,
227 CVTSS2SI = 0xF30F2D,
228 CVTTPD2PI = 0x660F2C,
229 CVTTPD2DQ = 0x660FE6,
230 CVTTPS2DQ = 0xF30F5B,
231 CVTTPS2PI = 0x0F2C,
232 CVTTSD2SI = 0xF20F2C,
233 CVTTSS2SI = 0xF30F2C,
234 MASKMOVDQU = 0x660FF7,
235 MASKMOVQ = 0x0FF7,
236 MAXPD = 0x660F5F,
237 MAXPS = 0x0F5F,
238 MAXSD = 0xF20F5F,
239 MAXSS = 0xF30F5F,
240 MINPD = 0x660F5D,
241 MINPS = 0x0F5D,
242 MINSD = 0xF20F5D,
243 MINSS = 0xF30F5D,
244 ORPD = 0x660F56,
245 ORPS = 0x0F56,
246 PAVGB = 0x660FE0,
247 PAVGW = 0x660FE3,
248 PMAXSW = 0x660FEE,
249 //PINSRW = 0x660FC4,
250 PMAXUB = 0x660FDE,
251 PMINSW = 0x660FEA,
252 PMINUB = 0x660FDA,
253 //PMOVMSKB = 0x660FD7,
254 PMULHUW = 0x660FE4,
255 PMULHW = 0x660FE5,
256 PMULUDQ = 0x660FF4,
257 PSADBW = 0x660FF6,
258 PUNPCKHQDQ = 0x660F6D,
259 PUNPCKLQDQ = 0x660F6C,
260 RCPPS = 0x0F53,
261 RCPSS = 0xF30F53,
262 RSQRTPS = 0x0F52,
263 RSQRTSS = 0xF30F52,
264 SQRTPD = 0x660F51,
265 SHUFPD = 0x660FC6,
266 SHUFPS = 0x0FC6,
267 SQRTPS = 0x0F51,
268 SQRTSD = 0xF20F51,
269 SQRTSS = 0xF30F51,
270 UNPCKHPD = 0x660F15,
271 UNPCKHPS = 0x0F15,
272 UNPCKLPD = 0x660F14,
273 UNPCKLPS = 0x0F14,
274
275 PSHUFD = 0x660F70,
276 PSHUFHW = 0xF30F70,
277 PSHUFLW = 0xF20F70,
278 PSHUFW = 0x0F70,
279 PSLLDQ = 0x07660F73,
280 PSRLDQ = 0x03660F73,
281
282 //PREFETCH = 0x0F18,
283
284 // SSE3 Pentium 4 (Prescott)
285
286 ADDSUBPD = 0x660FD0,
287 ADDSUBPS = 0xF20FD0,
288 HADDPD = 0x660F7C,
289 HADDPS = 0xF20F7C,
290 HSUBPD = 0x660F7D,
291 HSUBPS = 0xF20F7D,
292 MOVDDUP = 0xF20F12,
293 MOVSHDUP = 0xF30F16,
294 MOVSLDUP = 0xF30F12,
295 LDDQU = 0xF20FF0,
296 MONITOR = 0x0F01C8,
297 MWAIT = 0x0F01C9,
298
299 // SSSE3
300 PALIGNR = 0x660F3A0F,
301 PHADDD = 0x660F3802,
302 PHADDW = 0x660F3801,
303 PHADDSW = 0x660F3803,
304 PABSB = 0x660F381C,
305 PABSD = 0x660F381E,
306 PABSW = 0x660F381D,
307 PSIGNB = 0x660F3808,
308 PSIGND = 0x660F380A,
309 PSIGNW = 0x660F3809,
310 PSHUFB = 0x660F3800,
311 PMADDUBSW = 0x660F3804,
312 PMULHRSW = 0x660F380B,
313 PHSUBD = 0x660F3806,
314 PHSUBW = 0x660F3805,
315 PHSUBSW = 0x660F3807,
316
317 // SSE4.1
318
319 BLENDPD = 0x660F3A0D,
320 BLENDPS = 0x660F3A0C,
321 BLENDVPD = 0x660F3815,
322 BLENDVPS = 0x660F3814,
323 DPPD = 0x660F3A41,
324 DPPS = 0x660F3A40,
325 EXTRACTPS = 0x660F3A17,
326 INSERTPS = 0x660F3A21,
327 MPSADBW = 0x660F3A42,
328 PBLENDVB = 0x660F3810,
329 PBLENDW = 0x660F3A0E,
330 PEXTRD = 0x660F3A16,
331 PEXTRQ = 0x660F3A16,
332 PINSRB = 0x660F3A20,
333 PINSRD = 0x660F3A22,
334 PINSRQ = 0x660F3A22,
335
336 MOVNTDQA = 0x660F382A,
337 PACKUSDW = 0x660F382B,
338 PCMPEQQ = 0x660F3829,
339 PEXTRB = 0x660F3A14,
340 PHMINPOSUW = 0x660F3841,
341 PMAXSB = 0x660F383C,
342 PMAXSD = 0x660F383D,
343 PMAXUD = 0x660F383F,
344 PMAXUW = 0x660F383E,
345 PMINSB = 0x660F3838,
346 PMINSD = 0x660F3839,
347 PMINUD = 0x660F383B,
348 PMINUW = 0x660F383A,
349 PMOVSXBW = 0x660F3820,
350 PMOVSXBD = 0x660F3821,
351 PMOVSXBQ = 0x660F3822,
352 PMOVSXWD = 0x660F3823,
353 PMOVSXWQ = 0x660F3824,
354 PMOVSXDQ = 0x660F3825,
355 PMOVZXBW = 0x660F3830,
356 PMOVZXBD = 0x660F3831,
357 PMOVZXBQ = 0x660F3832,
358 PMOVZXWD = 0x660F3833,
359 PMOVZXWQ = 0x660F3834,
360 PMOVZXDQ = 0x660F3835,
361 PMULDQ = 0x660F3828,
362 PMULLD = 0x660F3840,
363 PTEST = 0x660F3817,
364
365 ROUNDPD = 0x660F3A09,
366 ROUNDPS = 0x660F3A08,
367 ROUNDSD = 0x660F3A0B,
368 ROUNDSS = 0x660F3A0A,
369
370 // SSE4.2
371 PCMPESTRI = 0x660F3A61,
372 PCMPESTRM = 0x660F3A60,
373 PCMPISTRI = 0x660F3A63,
374 PCMPISTRM = 0x660F3A62,
375 PCMPGTQ = 0x660F3837,
376 //CRC32
377
378 // SSE4a (AMD only)
379 // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS
380
381 // POPCNT and LZCNT (have their own CPUID bits)
382 POPCNT = 0xF30FB8,
383 // LZCNT
384 }
385
386 /**
387 * Generate two operand instruction with XMM 128 bit operands.
388 *
389 * This is a compiler magic function - it doesn't behave like
390 * regular D functions.
391 *
392 * Parameters:
393 * opcode any of the XMM opcodes; it must be a compile time constant
394 * op1 first operand
395 * op2 second operand
396 * Returns:
397 * result of opcode
398 */
399 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
400
401 /**
402 * Unary SIMD instructions.
403 */
404 pure @safe void16 __simd(XMM opcode, void16 op1);
405 pure @safe void16 __simd(XMM opcode, double d); ///
406 pure @safe void16 __simd(XMM opcode, float f); ///
407
408 /****
409 * For instructions:
410 * CMPPD, CMPSS, CMPSD, CMPPS,
411 * PSHUFD, PSHUFHW, PSHUFLW,
412 * BLENDPD, BLENDPS, DPPD, DPPS,
413 * MPSADBW, PBLENDW,
414 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
415 * Parameters:
416 * opcode any of the above XMM opcodes; it must be a compile time constant
417 * op1 first operand
418 * op2 second operand
419 * imm8 third operand; must be a compile time constant
420 * Returns:
421 * result of opcode
422 */
423 pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
424
425 /***
426 * For instructions with the imm8 version:
427 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
428 * PSRLDQ, PSLLDQ
429 * Parameters:
430 * opcode any of the XMM opcodes; it must be a compile time constant
431 * op1 first operand
432 * imm8 second operand; must be a compile time constant
433 * Returns:
434 * result of opcode
435 */
436 pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
437
438 /*****
439 * For "store" operations of the form:
440 * op1 op= op2
441 * Returns:
442 * op2
443 * These cannot be marked as pure, as semantic() doesn't check them.
444 */
445 @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
446 @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
447 @safe void16 __simd_sto(XMM opcode, float op1, void16 op2); ///
448
449 /* The following use overloading to ensure correct typing.
450 * Compile with inlining on for best performance.
451 */
452
453 pure @safe short8 pcmpeq()(short8 v1, short8 v2)
454 {
455 return __simd(XMM.PCMPEQW, v1, v2);
456 }
457
458 pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
459 {
460 return __simd(XMM.PCMPEQW, v1, v2);
461 }
462
463 /*********************
464 * Emit prefetch instruction.
465 * Params:
466 * address = address to be prefetched
467 * writeFetch = true for write fetch, false for read fetch
468 * locality = 0..3 (0 meaning least local, 3 meaning most local)
469 * Note:
470 * The Intel mappings are:
471 * $(TABLE
472 * $(THEAD writeFetch, locality, Instruction)
473 * $(TROW false, 0, prefetchnta)
474 * $(TROW false, 1, prefetch2)
475 * $(TROW false, 2, prefetch1)
476 * $(TROW false, 3, prefetch0)
477 * $(TROW false, 0, prefetchw)
478 * $(TROW false, 1, prefetchw)
479 * $(TROW false, 2, prefetchw)
480 * $(TROW false, 3, prefetchw)
481 * )
482 */
483 void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
484 {
485 static if (writeFetch)
486 __prefetch(address, 4);
487 else static if (locality < 4)
488 __prefetch(address, 3 - locality);
489 else
490 static assert(0, "0..3 expected for locality");
491 }
492
493 private void __prefetch(const(void*) address, ubyte encoding);
494
495 /*************************************
496 * Load unaligned vector from address.
497 * This is a compiler intrinsic.
498 * Params:
499 * p = pointer to vector
500 * Returns:
501 * vector
502 */
503
504 V loadUnaligned(V)(const V* p)
505 if (is(V == void16) ||
506 is(V == byte16) ||
507 is(V == ubyte16) ||
508 is(V == short8) ||
509 is(V == ushort8) ||
510 is(V == int4) ||
511 is(V == uint4) ||
512 is(V == long2) ||
513 is(V == ulong2))
514 {
515 pragma(inline, true);
516 static if (is(V == double2))
517 return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
518 else static if (is(V == float4))
519 return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
520 else
521 return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
522 }
523
524 /*************************************
525 * Store vector to unaligned address.
526 * This is a compiler intrinsic.
527 * Params:
528 * p = pointer to vector
529 * value = value to store
530 * Returns:
531 * value
532 */
533
534 V storeUnaligned(V)(V* p, V value)
535 if (is(V == void16) ||
536 is(V == byte16) ||
537 is(V == ubyte16) ||
538 is(V == short8) ||
539 is(V == ushort8) ||
540 is(V == int4) ||
541 is(V == uint4) ||
542 is(V == long2) ||
543 is(V == ulong2))
544 {
545 pragma(inline, true);
546 static if (is(V == double2))
547 return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
548 else static if (is(V == float4))
549 return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
550 else
551 return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
552 }
553 }