/* Copyright (C) 2008-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */
37fe763d UB |
27 | #ifndef _IMMINTRIN_H_INCLUDED |
28 | # error "Never use <avxintrin.h> directly; include <immintrin.h> instead." | |
29 | #endif | |
95879c72 | 30 | |
97db2bf7 ST |
31 | #ifndef _AVXINTRIN_H_INCLUDED |
32 | #define _AVXINTRIN_H_INCLUDED | |
33 | ||
34 | #ifndef __AVX__ | |
35 | #pragma GCC push_options | |
36 | #pragma GCC target("avx") | |
37 | #define __DISABLE_AVX__ | |
38 | #endif /* __AVX__ */ | |
39 | ||
95879c72 L |
40 | /* Internal data types for implementing the intrinsics. */ |
41 | typedef double __v4df __attribute__ ((__vector_size__ (32))); | |
42 | typedef float __v8sf __attribute__ ((__vector_size__ (32))); | |
43 | typedef long long __v4di __attribute__ ((__vector_size__ (32))); | |
2069d6fc | 44 | typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); |
95879c72 | 45 | typedef int __v8si __attribute__ ((__vector_size__ (32))); |
2069d6fc | 46 | typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); |
95879c72 | 47 | typedef short __v16hi __attribute__ ((__vector_size__ (32))); |
2069d6fc | 48 | typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); |
95879c72 | 49 | typedef char __v32qi __attribute__ ((__vector_size__ (32))); |
b245befc | 50 | typedef signed char __v32qs __attribute__ ((__vector_size__ (32))); |
2069d6fc | 51 | typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); |
95879c72 L |
52 | |
53 | /* The Intel API is flexible enough that we must allow aliasing with other | |
54 | vector types, and their scalar components. */ | |
55 | typedef float __m256 __attribute__ ((__vector_size__ (32), | |
56 | __may_alias__)); | |
57 | typedef long long __m256i __attribute__ ((__vector_size__ (32), | |
58 | __may_alias__)); | |
59 | typedef double __m256d __attribute__ ((__vector_size__ (32), | |
60 | __may_alias__)); | |
61 | ||
c6b0037d MG |
62 | /* Unaligned version of the same types. */ |
63 | typedef float __m256_u __attribute__ ((__vector_size__ (32), | |
64 | __may_alias__, | |
65 | __aligned__ (1))); | |
66 | typedef long long __m256i_u __attribute__ ((__vector_size__ (32), | |
67 | __may_alias__, | |
68 | __aligned__ (1))); | |
69 | typedef double __m256d_u __attribute__ ((__vector_size__ (32), | |
70 | __may_alias__, | |
71 | __aligned__ (1))); | |
/* Compare predicates for scalar and packed compare intrinsics
   (immediate operand of VCMPPD/VCMPPS/VCMPSD/VCMPSS).  The low three
   bits select the relation, the remaining bits select ordered vs.
   unordered and signaling vs. non-signaling behavior on QNaN.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
123 | ||
124 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
125 | _mm256_add_pd (__m256d __A, __m256d __B) | |
126 | { | |
2069d6fc | 127 | return (__m256d) ((__v4df)__A + (__v4df)__B); |
95879c72 L |
128 | } |
129 | ||
130 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
131 | _mm256_add_ps (__m256 __A, __m256 __B) | |
132 | { | |
2069d6fc | 133 | return (__m256) ((__v8sf)__A + (__v8sf)__B); |
95879c72 L |
134 | } |
135 | ||
136 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
137 | _mm256_addsub_pd (__m256d __A, __m256d __B) | |
138 | { | |
139 | return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); | |
140 | } | |
141 | ||
142 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
143 | _mm256_addsub_ps (__m256 __A, __m256 __B) | |
144 | { | |
145 | return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); | |
146 | } | |
147 | ||
148 | ||
149 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
150 | _mm256_and_pd (__m256d __A, __m256d __B) | |
151 | { | |
152 | return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); | |
153 | } | |
154 | ||
155 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
156 | _mm256_and_ps (__m256 __A, __m256 __B) | |
157 | { | |
158 | return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); | |
159 | } | |
160 | ||
161 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
162 | _mm256_andnot_pd (__m256d __A, __m256d __B) | |
163 | { | |
164 | return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); | |
165 | } | |
166 | ||
167 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
168 | _mm256_andnot_ps (__m256 __A, __m256 __B) | |
169 | { | |
170 | return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); | |
171 | } | |
172 | ||
173 | /* Double/single precision floating point blend instructions - select | |
174 | data from 2 sources using constant/variable mask. */ | |
175 | ||
176 | #ifdef __OPTIMIZE__ | |
177 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
178 | _mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) | |
179 | { | |
180 | return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, | |
181 | (__v4df)__Y, | |
182 | __M); | |
183 | } | |
184 | ||
185 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
186 | _mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) | |
187 | { | |
188 | return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, | |
189 | (__v8sf)__Y, | |
190 | __M); | |
191 | } | |
192 | #else | |
193 | #define _mm256_blend_pd(X, Y, M) \ | |
194 | ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \ | |
195 | (__v4df)(__m256d)(Y), (int)(M))) | |
196 | ||
197 | #define _mm256_blend_ps(X, Y, M) \ | |
198 | ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \ | |
199 | (__v8sf)(__m256)(Y), (int)(M))) | |
200 | #endif | |
201 | ||
202 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
203 | _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) | |
204 | { | |
205 | return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, | |
206 | (__v4df)__Y, | |
207 | (__v4df)__M); | |
208 | } | |
209 | ||
210 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
211 | _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) | |
212 | { | |
213 | return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, | |
214 | (__v8sf)__Y, | |
215 | (__v8sf)__M); | |
216 | } | |
217 | ||
218 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
219 | _mm256_div_pd (__m256d __A, __m256d __B) | |
220 | { | |
2069d6fc | 221 | return (__m256d) ((__v4df)__A / (__v4df)__B); |
95879c72 L |
222 | } |
223 | ||
224 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
225 | _mm256_div_ps (__m256 __A, __m256 __B) | |
226 | { | |
2069d6fc | 227 | return (__m256) ((__v8sf)__A / (__v8sf)__B); |
95879c72 L |
228 | } |
229 | ||
230 | /* Dot product instructions with mask-defined summing and zeroing parts | |
231 | of result. */ | |
232 | ||
233 | #ifdef __OPTIMIZE__ | |
234 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
235 | _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) | |
236 | { | |
237 | return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, | |
238 | (__v8sf)__Y, | |
239 | __M); | |
240 | } | |
241 | #else | |
242 | #define _mm256_dp_ps(X, Y, M) \ | |
243 | ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \ | |
244 | (__v8sf)(__m256)(Y), (int)(M))) | |
245 | #endif | |
246 | ||
247 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
248 | _mm256_hadd_pd (__m256d __X, __m256d __Y) | |
249 | { | |
250 | return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); | |
251 | } | |
252 | ||
253 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
254 | _mm256_hadd_ps (__m256 __X, __m256 __Y) | |
255 | { | |
256 | return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); | |
257 | } | |
258 | ||
259 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
260 | _mm256_hsub_pd (__m256d __X, __m256d __Y) | |
261 | { | |
262 | return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); | |
263 | } | |
264 | ||
265 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
266 | _mm256_hsub_ps (__m256 __X, __m256 __Y) | |
267 | { | |
268 | return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); | |
269 | } | |
270 | ||
271 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
272 | _mm256_max_pd (__m256d __A, __m256d __B) | |
273 | { | |
274 | return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B); | |
275 | } | |
276 | ||
277 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
278 | _mm256_max_ps (__m256 __A, __m256 __B) | |
279 | { | |
280 | return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); | |
281 | } | |
282 | ||
283 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
284 | _mm256_min_pd (__m256d __A, __m256d __B) | |
285 | { | |
286 | return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); | |
287 | } | |
288 | ||
289 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
290 | _mm256_min_ps (__m256 __A, __m256 __B) | |
291 | { | |
292 | return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); | |
293 | } | |
294 | ||
295 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
296 | _mm256_mul_pd (__m256d __A, __m256d __B) | |
297 | { | |
2069d6fc | 298 | return (__m256d) ((__v4df)__A * (__v4df)__B); |
95879c72 L |
299 | } |
300 | ||
301 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
302 | _mm256_mul_ps (__m256 __A, __m256 __B) | |
303 | { | |
2069d6fc | 304 | return (__m256) ((__v8sf)__A * (__v8sf)__B); |
95879c72 L |
305 | } |
306 | ||
307 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
308 | _mm256_or_pd (__m256d __A, __m256d __B) | |
309 | { | |
310 | return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); | |
311 | } | |
312 | ||
313 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
314 | _mm256_or_ps (__m256 __A, __m256 __B) | |
315 | { | |
316 | return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); | |
317 | } | |
#ifdef __OPTIMIZE__
/* Shuffle double-precision elements of A and B within each 128-bit
   lane, selected by the immediate mask (VSHUFPD).  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

/* Shuffle single-precision elements of A and B within each 128-bit
   lane, selected by the immediate mask (VSHUFPS).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
/* Macro forms so the mask stays a compile-time constant at -O0.  */
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif
342 | ||
343 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
344 | _mm256_sub_pd (__m256d __A, __m256d __B) | |
345 | { | |
2069d6fc | 346 | return (__m256d) ((__v4df)__A - (__v4df)__B); |
95879c72 L |
347 | } |
348 | ||
349 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
350 | _mm256_sub_ps (__m256 __A, __m256 __B) | |
351 | { | |
2069d6fc | 352 | return (__m256) ((__v8sf)__A - (__v8sf)__B); |
95879c72 L |
353 | } |
354 | ||
355 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
356 | _mm256_xor_pd (__m256d __A, __m256d __B) | |
357 | { | |
358 | return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); | |
359 | } | |
360 | ||
361 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
362 | _mm256_xor_ps (__m256 __A, __m256 __B) | |
363 | { | |
364 | return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); | |
365 | } | |
#ifdef __OPTIMIZE__
/* Compare packed double-precision elements of X and Y using predicate
   P (one of the _CMP_* constants); each result element is all-ones on
   true, all-zeros on false (VCMPPD).  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

/* Compare packed single-precision elements of X and Y using predicate
   P (VCMPPS).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}
#else
/* Macro forms so the predicate stays a compile-time constant at -O0.  */
#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))
#endif
390 | ||
93103603 SP |
391 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
392 | _mm256_cvtsi256_si32 (__m256i __A) | |
393 | { | |
394 | __v8si __B = (__v8si) __A; | |
395 | return __B[0]; | |
396 | } | |
397 | ||
95879c72 L |
398 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
399 | _mm256_cvtepi32_pd (__m128i __A) | |
400 | { | |
401 | return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); | |
402 | } | |
403 | ||
404 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
405 | _mm256_cvtepi32_ps (__m256i __A) | |
406 | { | |
407 | return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A); | |
408 | } | |
409 | ||
410 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
411 | _mm256_cvtpd_ps (__m256d __A) | |
412 | { | |
413 | return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); | |
414 | } | |
415 | ||
416 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
417 | _mm256_cvtps_epi32 (__m256 __A) | |
418 | { | |
419 | return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); | |
420 | } | |
421 | ||
422 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
423 | _mm256_cvtps_pd (__m128 __A) | |
424 | { | |
425 | return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); | |
426 | } | |
427 | ||
428 | extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
429 | _mm256_cvttpd_epi32 (__m256d __A) | |
430 | { | |
431 | return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); | |
432 | } | |
433 | ||
434 | extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
435 | _mm256_cvtpd_epi32 (__m256d __A) | |
436 | { | |
437 | return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); | |
438 | } | |
439 | ||
440 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
441 | _mm256_cvttps_epi32 (__m256 __A) | |
442 | { | |
443 | return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); | |
444 | } | |
445 | ||
dcb2c527 JJ |
446 | extern __inline double |
447 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
448 | _mm256_cvtsd_f64 (__m256d __A) | |
449 | { | |
450 | return __A[0]; | |
451 | } | |
452 | ||
453 | extern __inline float | |
454 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
455 | _mm256_cvtss_f32 (__m256 __A) | |
456 | { | |
457 | return __A[0]; | |
458 | } | |
#ifdef __OPTIMIZE__
/* Extract 128-bit half N (0 or 1) of X as doubles (VEXTRACTF128).  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

/* Extract 128-bit half N of X as floats (VEXTRACTF128).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

/* Extract 128-bit half N of X as integers (VEXTRACTF128).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

/* Extract 32-bit element N of X: pick the containing 128-bit half,
   then index within it.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

/* Extract 16-bit element N of X.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

/* Extract 8-bit element N of X.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
/* Extract 64-bit element N of X (64-bit targets only).  */
extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
/* Macro forms so the selectors stay compile-time constants at -O0.  */
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
/* Zero all YMM registers (VZEROALL).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

/* Zero the upper 128 bits of all YMM registers (VZEROUPPER); avoids
   AVX/SSE transition penalties.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
563 | ||
564 | extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
565 | _mm_permutevar_pd (__m128d __A, __m128i __C) | |
566 | { | |
567 | return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, | |
568 | (__v2di)__C); | |
569 | } | |
570 | ||
571 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
572 | _mm256_permutevar_pd (__m256d __A, __m256i __C) | |
573 | { | |
574 | return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, | |
575 | (__v4di)__C); | |
576 | } | |
577 | ||
578 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
579 | _mm_permutevar_ps (__m128 __A, __m128i __C) | |
580 | { | |
581 | return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, | |
582 | (__v4si)__C); | |
583 | } | |
584 | ||
585 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
586 | _mm256_permutevar_ps (__m256 __A, __m256i __C) | |
587 | { | |
588 | return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, | |
589 | (__v8si)__C); | |
590 | } | |
#ifdef __OPTIMIZE__
/* Permute double-precision elements of X by immediate control C
   (VPERMILPD).  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

/* 256-bit immediate permute of double-precision elements (VPERMILPD).  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

/* Immediate permute of single-precision elements (VPERMILPS).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

/* 256-bit immediate permute of single-precision elements (VPERMILPS).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
/* Macro forms so the control stays a compile-time constant at -O0.  */
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
/* Select/permute 128-bit lanes from X and Y by immediate control C
   (VPERM2F128), double-precision view.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

/* VPERM2F128, single-precision view.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

/* VPERM2F128, integer view.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
/* Macro forms so the control stays a compile-time constant at -O0.  */
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
670 | ||
671 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
672 | _mm_broadcast_ss (float const *__X) | |
673 | { | |
674 | return (__m128) __builtin_ia32_vbroadcastss (__X); | |
675 | } | |
676 | ||
677 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
678 | _mm256_broadcast_sd (double const *__X) | |
679 | { | |
680 | return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); | |
681 | } | |
682 | ||
683 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
684 | _mm256_broadcast_ss (float const *__X) | |
685 | { | |
686 | return (__m256) __builtin_ia32_vbroadcastss256 (__X); | |
687 | } | |
688 | ||
689 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
690 | _mm256_broadcast_pd (__m128d const *__X) | |
691 | { | |
692 | return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); | |
693 | } | |
694 | ||
695 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
696 | _mm256_broadcast_ps (__m128 const *__X) | |
697 | { | |
698 | return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); | |
699 | } | |
700 | ||
701 | #ifdef __OPTIMIZE__ | |
702 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
703 | _mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) | |
704 | { | |
705 | return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, | |
706 | (__v2df)__Y, | |
707 | __O); | |
708 | } | |
709 | ||
710 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
711 | _mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) | |
712 | { | |
713 | return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, | |
714 | (__v4sf)__Y, | |
715 | __O); | |
716 | } | |
717 | ||
718 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
719 | _mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) | |
720 | { | |
721 | return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, | |
722 | (__v4si)__Y, | |
723 | __O); | |
724 | } | |
725 | ||
726 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
727 | _mm256_insert_epi32 (__m256i __X, int __D, int const __N) | |
728 | { | |
729 | __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); | |
23e0d930 | 730 | __Y = _mm_insert_epi32 (__Y, __D, __N % 4); |
95879c72 L |
731 | return _mm256_insertf128_si256 (__X, __Y, __N >> 2); |
732 | } | |
733 | ||
734 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
735 | _mm256_insert_epi16 (__m256i __X, int __D, int const __N) | |
736 | { | |
737 | __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); | |
738 | __Y = _mm_insert_epi16 (__Y, __D, __N % 8); | |
739 | return _mm256_insertf128_si256 (__X, __Y, __N >> 3); | |
740 | } | |
741 | ||
742 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
743 | _mm256_insert_epi8 (__m256i __X, int __D, int const __N) | |
744 | { | |
745 | __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); | |
746 | __Y = _mm_insert_epi8 (__Y, __D, __N % 16); | |
747 | return _mm256_insertf128_si256 (__X, __Y, __N >> 4); | |
748 | } | |
749 | ||
750 | #ifdef __x86_64__ | |
751 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
fcff2e9c | 752 | _mm256_insert_epi64 (__m256i __X, long long __D, int const __N) |
95879c72 L |
753 | { |
754 | __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); | |
23e0d930 | 755 | __Y = _mm_insert_epi64 (__Y, __D, __N % 2); |
95879c72 L |
756 | return _mm256_insertf128_si256 (__X, __Y, __N >> 1); |
757 | } | |
758 | #endif | |
#else
/* Non-__OPTIMIZE__ branch: macro forms of the insert operations, so
   the immediate operand stays a literal compile-time constant even
   when the inline wrappers above would not be folded.  */
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

/* Statement-expression forms of the scalar-element inserts; the shift
   selects the 128-bit lane, the remainder the element inside it.  */
#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
809 | ||
/* 32-byte aligned loads and stores, implemented as plain dereferences
   through (naturally aligned) 256-bit vector pointer types.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

/* Unaligned loads and stores: access goes through the __m256d_u /
   __m256_u / __m256i_u pointer types declared earlier in this header,
   which tolerate arbitrary alignment.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

/* Integer-vector variants of the above.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}
881 | ||
/* Masked loads and stores (VMASKMOVPD/VMASKMOVPS): the integer mask
   __M gates, per element, which memory locations are actually
   accessed.  NOTE(review): per the ISA the gate is each mask
   element's sign bit and unselected load elements read as zero --
   confirm against the SDM, this header only forwards to the
   builtins.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
933 | ||
/* Element-duplication shuffles (VMOVSHDUP / VMOVSLDUP / VMOVDDUP).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

/* 256-bit unaligned integer load via VLDDQU.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}
957 | ||
65b82caa L |
/* Streaming (non-temporal, MOVNT*) 256-bit stores.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
975 | ||
95879c72 L |
/* Approximate reciprocal and reciprocal square root (VRCPPS /
   VRSQRTPS) and exact square root (VSQRTPD / VSQRTPS).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
999 | ||
/* Round each element under control of the _MM_FROUND_* immediate __M
   (used by the ceil/floor wrappers below).  Inline when optimizing,
   macro otherwise so the immediate stays a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)    _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)   _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)    _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)   _mm256_round_ps ((V), _MM_FROUND_FLOOR)
1024 | ||
/* Interleave elements of __A and __B (VUNPCKHPD/VUNPCKLPD and
   VUNPCKHPS/VUNPCKLPS).  NOTE(review): the underlying instructions
   operate within each 128-bit lane, not across the full 256 bits --
   see the SDM.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
1048 | ||
/* VTESTPD / VTESTPS / VPTEST predicates.  The *testz* forms return
   the ZF result of the test, *testc* the CF result, and *testnzc*
   the "neither ZF nor CF" combination.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

/* 256-bit variants of the same predicates.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

/* Whole-register integer tests (VPTEST).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
1138 | ||
/* Gather the sign bit of each element into the low bits of the
   result (VMOVMSKPD / VMOVMSKPS).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}
1150 | ||
0b192937 UD |
/* Return a vector with indeterminate contents.  The deliberate
   self-initialization keeps the value formally "initialized" without
   emitting any code; the pragmas suppress the -Winit-self warning it
   would otherwise trigger.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256d __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256 __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256i __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}
1180 | ||
95879c72 L |
/* All-zero 256-bit vectors.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
1199 | ||
/* Create the vector [A B C D].  Note the initializers are written in
   reverse: the LAST argument becomes element 0.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

/* Sixteen 16-bit elements; __q00 (the last argument) is element 0.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Thirty-two 8-bit elements; __q00 (the last argument) is element 0.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

/* Four 64-bit elements; __D (the last argument) is element 0.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
1261 | ||
/* Create a vector with all elements equal to A (broadcast).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

/* 16- and 8-bit broadcasts delegate to the full set functions.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
1306 | ||
1307 | /* Create vectors of elements in the reversed order from the | |
1308 | _mm256_set_XXX functions. */ | |
1309 | ||
1310 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1311 | _mm256_setr_pd (double __A, double __B, double __C, double __D) | |
1312 | { | |
1313 | return _mm256_set_pd (__D, __C, __B, __A); | |
1314 | } | |
1315 | ||
1316 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1317 | _mm256_setr_ps (float __A, float __B, float __C, float __D, | |
1318 | float __E, float __F, float __G, float __H) | |
1319 | { | |
1320 | return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A); | |
1321 | } | |
1322 | ||
1323 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1324 | _mm256_setr_epi32 (int __A, int __B, int __C, int __D, | |
1325 | int __E, int __F, int __G, int __H) | |
1326 | { | |
1327 | return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A); | |
1328 | } | |
1329 | ||
1330 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1331 | _mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12, | |
1332 | short __q11, short __q10, short __q09, short __q08, | |
1333 | short __q07, short __q06, short __q05, short __q04, | |
1334 | short __q03, short __q02, short __q01, short __q00) | |
1335 | { | |
1336 | return _mm256_set_epi16 (__q00, __q01, __q02, __q03, | |
1337 | __q04, __q05, __q06, __q07, | |
1338 | __q08, __q09, __q10, __q11, | |
1339 | __q12, __q13, __q14, __q15); | |
1340 | } | |
1341 | ||
1342 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1343 | _mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28, | |
1344 | char __q27, char __q26, char __q25, char __q24, | |
1345 | char __q23, char __q22, char __q21, char __q20, | |
1346 | char __q19, char __q18, char __q17, char __q16, | |
1347 | char __q15, char __q14, char __q13, char __q12, | |
1348 | char __q11, char __q10, char __q09, char __q08, | |
1349 | char __q07, char __q06, char __q05, char __q04, | |
1350 | char __q03, char __q02, char __q01, char __q00) | |
1351 | { | |
1352 | return _mm256_set_epi8 (__q00, __q01, __q02, __q03, | |
1353 | __q04, __q05, __q06, __q07, | |
1354 | __q08, __q09, __q10, __q11, | |
1355 | __q12, __q13, __q14, __q15, | |
1356 | __q16, __q17, __q18, __q19, | |
1357 | __q20, __q21, __q22, __q23, | |
1358 | __q24, __q25, __q26, __q27, | |
1359 | __q28, __q29, __q30, __q31); | |
1360 | } | |
1361 | ||
1362 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1363 | _mm256_setr_epi64x (long long __A, long long __B, long long __C, | |
1364 | long long __D) | |
1365 | { | |
1366 | return _mm256_set_epi64x (__D, __C, __B, __A); | |
1367 | } | |
1368 | ||
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type (bit reinterpretation).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}
1406 | ||
/* Narrowing casts: return the low 128 bits of a 256-bit vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
97db2bf7 | 1447 | |
e6b2dc24 JJ |
/* Similarly, but with zero extension instead of undefined values:
   the 128-bit source lands in the low half and the upper 128 bits
   are cleared by inserting into an all-zero vector.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextpd128_pd256 (__m128d __A)
{
  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextps128_ps256 (__m128 __A)
{
  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextsi128_si256 (__m128i __A)
{
  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
}
1467 | ||
f4ee3a9e UB |
/* Concatenate two 128-bit values: __L becomes the low half of the
   result and __H is inserted as the high half.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 ( __m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

/* Same concatenation with the argument order reversed (low half
   first).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}
1503 | ||
96d5c6dc JJ |
1504 | extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1505 | _mm256_loadu2_m128 (float const *__PH, float const *__PL) | |
1506 | { | |
1507 | return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)), | |
1508 | _mm_loadu_ps (__PH), 1); | |
1509 | } | |
1510 | ||
1511 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1512 | _mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A) | |
1513 | { | |
1514 | _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A)); | |
1515 | _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1)); | |
1516 | } | |
1517 | ||
1518 | extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1519 | _mm256_loadu2_m128d (double const *__PH, double const *__PL) | |
1520 | { | |
1521 | return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)), | |
1522 | _mm_loadu_pd (__PH), 1); | |
1523 | } | |
1524 | ||
1525 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1526 | _mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A) | |
1527 | { | |
1528 | _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A)); | |
1529 | _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1)); | |
1530 | } | |
1531 | ||
1532 | extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1533 | _mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL) | |
1534 | { | |
1535 | return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)), | |
1536 | _mm_loadu_si128 (__PH), 1); | |
1537 | } | |
1538 | ||
1539 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1540 | _mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A) | |
1541 | { | |
1542 | _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A)); | |
1543 | _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1)); | |
1544 | } | |
1545 | ||
97db2bf7 ST |
1546 | #ifdef __DISABLE_AVX__ |
1547 | #undef __DISABLE_AVX__ | |
1548 | #pragma GCC pop_options | |
1549 | #endif /* __DISABLE_AVX__ */ | |
1550 | ||
1551 | #endif /* _AVXINTRIN_H_INCLUDED */ |