/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f
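
/* Usage sketch (illustrative, not part of this header): these predicates
   are the immediate argument of the VEX compare intrinsics defined
   below.  Each result element is all-ones when the predicate holds and
   all-zeros otherwise:

     __m256d mask = _mm256_cmp_pd (a, b, _CMP_LT_OQ);  // a < b, quiet
     int     bits = _mm256_movemask_pd (mask);         // one bit per lane

   `a' and `b' are assumed __m256d values supplied by the caller.  */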

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}
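
/* A common use of the and-not forms (a sketch, not part of the API):
   (~A) & B with A holding only sign bits yields a vector fabs.  The
   helper name is hypothetical.

     static __inline __m256d
     my_abs_pd (__m256d __x)
     {
       __m256d __sign = _mm256_set1_pd (-0.0);     // only sign bits set
       return _mm256_andnot_pd (__sign, __x);      // (~sign) & x
     }
*/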

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif
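
/* Immediate-blend sketch (illustrative): bit i of M selects result
   element i from Y when set, from X when clear.

     __m256d lo_x_hi_y = _mm256_blend_pd (x, y, 0x0c);  // elements 0-1
                                                        // from x, 2-3 from y

   `x' and `y' are assumed __m256d inputs.  */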

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
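
/* Mask sketch for _mm256_dp_ps (illustrative): within each 128-bit half,
   the high nibble of M selects which products enter the sum and the low
   nibble selects which result elements receive it (others are zeroed).
   A full 4-element dot product per half, broadcast to all elements:

     __m256 dp = _mm256_dp_ps (x, y, 0xff);

   `x' and `y' are assumed __m256 inputs.  */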

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif
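
/* Control sketch (illustrative): _mm256_shuffle_pd selects independently
   within each 128-bit half; bit 0 picks the element of A for result
   element 0, bit 1 the element of B for element 1, and bits 2-3 repeat
   that for the upper half.

     __m256d dup_hi = _mm256_shuffle_pd (a, a, 0xf);  // upper double of
                                                      // each half, twice

   `a' is an assumed __m256d input.  */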

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif
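
/* Combined sketch (illustrative): a full-width compare result doubles as
   a blendv mask, giving a branchless per-element select.  The helper
   name is hypothetical.

     static __inline __m256d
     my_min_pd (__m256d __a, __m256d __b)
     {
       __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
       return _mm256_blendv_pd (__b, __a, __lt);   // a where a < b, else b
     }
*/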

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}
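
/* Conversion sketch (illustrative): the double<->int forms change
   register width, since four doubles occupy 256 bits but four ints only
   128.

     __m128i i4 = _mm256_cvttpd_epi32 (d);  // 4 doubles -> 4 ints, truncated
     __m256d d4 = _mm256_cvtepi32_pd (i4);  // and back, widening

   `d' is an assumed __m256d input; the round trip drops any fraction.  */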

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
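
/* Extraction sketch (illustrative): the selector must be a compile-time
   constant; the scalar forms count from element 0 upward.

     __m128d hi    = _mm256_extractf128_pd (v, 1);   // upper 128 bits
     int     lane6 = _mm256_extract_epi32 (iv, 6);   // 32-bit element 6

   `v' (__m256d) and `iv' (__m256i) are assumed inputs.  */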

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
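
/* Note (a usage sketch, not new API): executing vzeroupper before
   calling legacy, non-VEX SSE code avoids the AVX/SSE transition
   penalty:

     _mm256_zeroupper ();
     legacy_sse_function ();   // hypothetical non-VEX-encoded callee
*/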

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}
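
/* Variable-permute sketch (illustrative): each control element selects a
   source element from within its own 128-bit half; the pd forms use bit
   1 of each 64-bit control element.

     __m256d swapped
       = _mm256_permutevar_pd (a, _mm256_set_epi64x (0, 2, 0, 2));
     // swaps the two doubles inside each half; `a' is an assumed input.
*/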

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
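
/* Lane-permute sketch (illustrative): the control byte picks whole
   128-bit halves; selectors 0-1 take a half of X, 2-3 a half of Y, and
   bit 3 (or bit 7) zeroes that destination half instead.

     __m256d swap = _mm256_permute2f128_pd (x, x, 0x01);  // swap halves

   `x' is an assumed __m256d input.  */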

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}
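
/* Broadcast sketch (illustrative): a scalar or 128-bit value is loaded
   from memory and replicated across the destination.

     double c = 3.0;                          // assumed local constant
     __m256d cv = _mm256_broadcast_sd (&c);   // { 3.0, 3.0, 3.0, 3.0 }
*/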

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
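
/* Insertion sketch (illustrative): the scalar forms above compose an
   extract of the affected half, a 128-bit element insert, and a
   re-insert of that half.

     __m256i v2 = _mm256_insert_epi32 (v, 42, 5);   // replace element 5

   `v' is an assumed __m256i input; the index must be constant.  */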

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
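
/* Alignment sketch (illustrative): the plain load/store forms require
   32-byte-aligned addresses, the *u forms accept any alignment.

     double buf[4];                                 // alignment unknown
     _mm256_storeu_pd (buf, _mm256_set1_pd (1.0));  // safe either way
*/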

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}
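
/* Masked-access sketch (illustrative): only elements whose mask sign bit
   is set are touched; masked-off load elements read as zero and
   masked-off store elements leave memory intact, which makes loop
   remainders safe.

     __m256d m = _mm256_castsi256_pd
       (_mm256_set_epi64x (0, 0, -1, -1));     // low two elements only
     __m256d v = _mm256_maskload_pd (p, m);    // `p' is an assumed pointer
*/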

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
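
/* Precision sketch (illustrative): vrcpps/vrsqrtps return roughly 12-bit
   approximations; one Newton-Raphson step is the usual refinement.  The
   helper name is hypothetical.

     static __inline __m256
     my_rsqrt_nr_ps (__m256 __x)
     {
       __m256 __y   = _mm256_rsqrt_ps (__x);          // ~12-bit estimate
       __m256 __xyy = _mm256_mul_ps (_mm256_mul_ps (__x, __y), __y);
       return _mm256_mul_ps (_mm256_mul_ps (_mm256_set1_ps (0.5f), __y),
                             _mm256_sub_ps (_mm256_set1_ps (3.0f), __xyy));
     }
*/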

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
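
/* Rounding sketch (illustrative): M combines a rounding mode with
   _MM_FROUND_NO_EXC (both defined in <smmintrin.h>); the ceil/floor
   wrappers above cover the common cases.

     __m256d r = _mm256_round_pd (v, _MM_FROUND_TO_NEAREST_INT
                                     | _MM_FROUND_NO_EXC);

   `v' is an assumed __m256d input.  */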

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
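
/* Test sketch (illustrative): testz returns 1 when the sign bits of
   M & V are all zero, testc when those of ~M & V are.  A quick
   all-elements check on a compare result:

     __m256d eq = _mm256_cmp_pd (a, b, _CMP_EQ_OQ);
     int none_equal = _mm256_testz_pd (eq, eq);   // 1 iff no element matched

   `a' and `b' are assumed __m256d inputs.  */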

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
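
/* Ordering note (illustrative): the _mm256_set_* arguments run from the
   highest element down to element 0, so

     __m256i v = _mm256_set_epi64x (3, 2, 1, 0);   // element 0 holds 0

   puts 0 in the lowest element and 3 in the highest; the _mm256_setr_*
   forms below take arguments in memory order instead.  */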

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128- to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */
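
/* Widening-cast sketch (illustrative): when the upper half must be
   defined, combine the cast with an explicit 128-bit insert.

     __m256d wide = _mm256_insertf128_pd (_mm256_castpd128_pd256 (lo),
                                          hi, 1);   // both halves defined

   `lo' and `hi' are assumed __m128d inputs.  */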

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}