/* Copyright (C) 2003-2021 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However the horizontal
   add/sub operations require the data pairs to be permuted into separate
   registers with vertical even/odd alignment for the operation.
   And the addsub operations require the sign of only the even-numbered
   elements to be flipped (XORed with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions there is
   no direct equivalent in the PowerISA at this time.  So those
   intrinsics are not implemented.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef _PMMINTRIN_H_INCLUDED
#define _PMMINTRIN_H_INCLUDED

/* We need definitions from the SSE2 and SSE header files.  */
#include <emmintrin.h>

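/* Alternately subtract and add:
     result = {__X[0] - __Y[0], __X[1] + __Y[1],
	       __X[2] - __Y[2], __X[3] + __Y[3]}.
   Flipping the sign of the even-numbered elements of __Y (XOR with
   -0.0) reduces this to a single vec_add.  */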
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

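/* Double-precision variant: result = {__X[0] - __Y[0], __X[1] + __Y[1]}.  */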
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

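/* Horizontal add: result = {__X[0] + __X[1], __X[2] + __X[3],
   __Y[0] + __Y[1], __Y[2] + __Y[3]}.  xform2 gathers the even-numbered
   floats {X0, X2, Y0, Y2} and xform1 the odd-numbered {X1, X3, Y1, Y3},
   so a single vec_add of the two permuted vectors yields all four sums.  */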
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

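/* Horizontal subtract: result = {__X[0] - __X[1], __X[2] - __X[3],
   __Y[0] - __Y[1], __Y[2] - __Y[3]}, using the same even/odd permutes
   as _mm_hadd_ps above.  */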
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

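/* Horizontal add of doubles: vec_mergeh picks {X0, Y0} and vec_mergel
   picks {X1, Y1}, so the sum is {__X[0] + __X[1], __Y[0] + __Y[1]}.  */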
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
			    vec_mergel ((__v2df) __X, (__v2df)__Y));
}

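/* Horizontal subtract of doubles: result = {__X[0] - __X[1], __Y[0] - __Y[1]}.  */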
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
			    vec_mergel ((__v2df) __X, (__v2df)__Y));
}

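/* Duplicate the odd-numbered elements: result = {__X[1], __X[1], __X[3], __X[3]}.  */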
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}

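/* Duplicate the even-numbered elements: result = {__X[0], __X[0], __X[2], __X[2]}.  */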
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}

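/* Load a double and splat it to both elements: result = {*__P, *__P}.  */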
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

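/* Duplicate the low element: result = {__X[0], __X[0]}.  */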
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

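/* Unaligned 128-bit integer load; vec_vsx_ld does not require 16-byte
   alignment, matching the relaxed-alignment semantics of lddqu.  */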
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */

#endif /* _PMMINTRIN_H_INCLUDED */