/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f

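/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   each packed compare yields an all-ones or all-zeros mask per element.
   Ordered predicates are false and unordered predicates are true when
   either operand is NaN; the signaling forms additionally raise an
   invalid exception on quiet NaNs.

     __m256d __a = _mm256_setr_pd (1.0, 2.0, __builtin_nan (""), 4.0);
     __m256d __b = _mm256_set1_pd (2.0);
     __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     int __bits = _mm256_movemask_pd (__lt);   // 0x1: only element 0 is less
 */
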
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

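/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the addsub forms subtract in the even-indexed elements and add in the
   odd-indexed ones, which suits interleaved complex arithmetic.

     __m256d __a = _mm256_setr_pd (1.0, 2.0, 3.0, 4.0);
     __m256d __b = _mm256_set1_pd (1.0);
     __m256d __r = _mm256_addsub_pd (__a, __b);   // { 0.0, 3.0, 2.0, 5.0 }
 */
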
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from two sources using a constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

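/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   in the immediate forms, mask bit I selects element I from Y when set
   and from X when clear; the blendv forms instead use the sign bit of
   each element of the mask operand.

     __m256d __x = _mm256_set1_pd (0.0);
     __m256d __y = _mm256_set1_pd (1.0);
     __m256d __r = _mm256_blend_pd (__x, __y, 0x5);   // { 1.0, 0.0, 1.0, 0.0 }
 */
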
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions, with mask-defined summing and zeroing of
   the parts of the result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

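/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   _mm256_dp_ps works on each 128-bit lane separately; the high four
   mask bits choose which element products enter the sum, and the low
   four choose which result elements receive it (the rest are zeroed).

     __m256 __a = _mm256_set1_ps (1.0f);
     __m256 __b = _mm256_set1_ps (2.0f);
     __m256 __d = _mm256_dp_ps (__a, __b, 0xf1);
     // each lane: element 0 holds 8.0f, elements 1-3 hold 0.0f
 */
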
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

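/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the 256-bit shuffles apply the usual 128-bit selector to the two
   lanes independently; no element crosses a lane boundary.

     __m256 __a = _mm256_setr_ps (0, 1, 2, 3, 4, 5, 6, 7);
     __m256 __r = _mm256_shuffle_ps (__a, __a, _MM_SHUFFLE (0, 1, 2, 3));
     // __r == { 3, 2, 1, 0, 7, 6, 5, 4 }
 */
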
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

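/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the scalar forms compare only the low element; the remaining result
   elements are copied from the first operand.

     __m128d __x = _mm_setr_pd (1.0, 7.0);
     __m128d __y = _mm_setr_pd (2.0, 9.0);
     __m128d __m = _mm_cmp_sd (__x, __y, _CMP_LT_OQ);
     // low element: all-ones mask (1.0 < 2.0); high element: 7.0
 */
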
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
						(int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
					       (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
						(int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

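/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the element extracts select the 128-bit half with the high bits of
   the index and the element within that half with the low bits.

     __m256i __v = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
     int __e = _mm256_extract_epi32 (__v, 5);   // half 1, element 1: 5
 */
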
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
					      (__v4df)(__m256d)(Y), \
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
					     (__v8sf)(__m256)(Y), \
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
					      (__v8si)(__m256i)(Y), \
					      (int)(C)))
#endif

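/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   in the control byte, the low nibble picks the source of the result's
   low 128-bit lane and the high nibble that of the high lane (0/1 are
   the lanes of X, 2/3 the lanes of Y); setting bit 3 of a nibble zeroes
   that lane instead.

     __m256d __x = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);
     __m256d __y = _mm256_setr_pd (4.0, 5.0, 6.0, 7.0);
     __m256d __r = _mm256_permute2f128_pd (__x, __y, 0x21);
     // __r == { 2.0, 3.0, 4.0, 5.0 }
 */
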
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

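/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the element inserts mirror the extracts: fetch the selected 128-bit
   half, insert into it, then write the half back.

     __m256i __v = _mm256_setzero_si256 ();
     __v = _mm256_insert_epi32 (__v, 42, 6);   // element 6 becomes 42
 */
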
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

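/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the plain load/store forms require 32-byte aligned addresses; the
   loadu/storeu forms accept any alignment.

     double __buf[4] __attribute__ ((__aligned__ (32))) = { 0 };
     __m256d __v = _mm256_load_pd (__buf);   // __buf is 32-byte aligned
     _mm256_storeu_pd (__buf, __v);          // no alignment requirement
 */
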
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}

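/* Illustrative usage sketch (an editorial addition, assuming -mavx;
   note that this version of the header types the mask as a
   floating-point vector): an element is loaded or stored only when the
   most significant bit of the corresponding mask element is set; on
   loads the other elements are zeroed, and on stores the other memory
   locations are left untouched.

     double __p[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256d __m = _mm256_castsi256_pd (_mm256_set_epi64x (0, -1, 0, -1));
     __m256d __v = _mm256_maskload_pd (__p, __m);   // { 1.0, 0.0, 3.0, 0.0 }
 */
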
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

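/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the stream stores use a non-temporal hint to bypass the cache and
   require 32-byte aligned destinations; order them against later
   accesses with _mm_sfence ().

     double __dst[4] __attribute__ ((__aligned__ (32)));
     __m256d __v = _mm256_set1_pd (1.0);
     _mm256_stream_pd (__dst, __v);
     _mm_sfence ();
 */
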
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

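/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the rounding mode comes from the immediate; the _MM_FROUND_*
   constants are provided by <smmintrin.h> via <immintrin.h>.

     __m256d __v = _mm256_set1_pd (2.5);
     __m256d __f = _mm256_floor_pd (__v);   // 2.0 in every element
     __m256d __n = _mm256_round_pd (__v, _MM_FROUND_TO_NEAREST_INT
					 | _MM_FROUND_NO_EXC);   // 2.0 (ties to even)
 */
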
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

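/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   _mm256_set_* takes elements from the highest index down and
   _mm256_setr_* from the lowest up, so these two calls build the same
   vector.

     __m256i __a = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     __m256i __b = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
 */
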
/* Casts between various SP, DP, INT vector types.  Note that these do
   no conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128
   bits of the 256-bit result contain the source parameter value and
   the upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */
1412
1413extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1414_mm256_castpd128_pd256 (__m128d __A)
1415{
1416 return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
1417}
1418
1419extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420_mm256_castps128_ps256 (__m128 __A)
1421{
1422 return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
1423}
1424
1425extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426_mm256_castsi128_si256 (__m128i __A)
1427{
1428 return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
1429}
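
/* Illustrative usage sketch (an editorial addition, assuming -mavx):
   the casts generate no instructions, but after a 128-to-256-bit cast
   the upper half is undefined, so fill it before reading the full
   vector.

     __m128d __lo = _mm_set1_pd (1.0);
     __m256d __v = _mm256_castpd128_pd256 (__lo);   // upper 128 bits undefined
     __v = _mm256_insertf128_pd (__v, __lo, 1);     // now fully defined
 */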