/* Copyright (C) 2008-2014 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f

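/* Usage sketch (illustrative, not part of the original header): the
   predicate names encode two properties.  O/U selects ordered
   vs. unordered handling of NaN operands, and S/Q selects whether a
   quiet NaN operand signals an invalid-operation exception.  With
   _mm256_cmp_pd and _mm256_movemask_pd from later in this header:

     __m256d __a = _mm256_set1_pd (1.0);
     __m256d __b = _mm256_set1_pd (2.0);
     __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     int __bits = _mm256_movemask_pd (__lt);   // 0xf: every lane true

   A true lane is all-ones, a false lane all-zeros.  */
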
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

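/* Usage sketch: the addsub forms subtract in even-indexed elements and
   add in odd-indexed ones, the building block of complex arithmetic:

     __m256d __x = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     __m256d __r = _mm256_addsub_pd (__x, _mm256_set1_pd (1.0));
     // __r = { 0.0, 3.0, 2.0, 5.0 } in element order.  */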

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

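/* Usage sketch: the andnot forms complement their first operand and
   compute (~__A) & __B.  A common idiom masks off the sign bits to take
   absolute values (__v is a hypothetical operand):

     __m256d __signs = _mm256_set1_pd (-0.0);          // only sign bits set
     __m256d __abs = _mm256_andnot_pd (__signs, __v);  */
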
/* Double/single precision floating-point blend instructions - select
   data from two sources using a constant or variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

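/* Usage sketch: _mm256_blendv_pd selects per element on the sign bit of
   the mask __M: where it is set the element comes from __Y, otherwise
   from __X.  Since compare results are all-ones for true, they can be
   used directly (__a and __b are hypothetical operands):

     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     __m256d __r = _mm256_blendv_pd (__a, __b, __m);
     // per-lane maximum, ignoring NaN subtleties.  */
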
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of the result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif

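/* Usage sketch: the two 128-bit lanes are processed independently.  In
   the immediate, the high nibble selects which element pairs of each
   lane are multiplied and summed, and the low nibble selects which
   result positions receive the sum (the rest become zero).  With
   hypothetical operands __x and __y:

     __m256 __d = _mm256_dp_ps (__x, __y, 0xf1);
     // element 0 holds the low-lane dot product, element 4 the high-lane one.  */
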
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

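/* Usage sketch: unlike their 128-bit counterparts, the 256-bit shuffles
   operate on the two 128-bit lanes independently; no element crosses a
   lane.  For _mm256_shuffle_pd, immediate bits 0-1 control the low lane
   and bits 2-3 the high lane, so (with __a a hypothetical operand):

     __m256d __r = _mm256_shuffle_pd (__a, __a, 0x0);
     // __r = { __a[0], __a[0], __a[2], __a[2] }  */
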
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif

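/* Usage sketch: a compare result is all-ones per true lane, so it
   combines directly with the logical and blend operations above.
   Clamping negative lanes of a hypothetical __v to zero:

     __m256 __neg = _mm256_cmp_ps (__v, _mm256_setzero_ps (), _CMP_LT_OQ);
     __m256 __r = _mm256_andnot_ps (__neg, __v);   // zero where __v < 0  */
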
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

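/* Usage sketch: the cvt forms round according to the current MXCSR
   rounding mode (round-to-nearest-even by default), while the cvtt forms
   truncate toward zero like a C cast:

     __m256d __v = _mm256_set1_pd (2.7);
     __m128i __r = _mm256_cvtpd_epi32 (__v);    // each element becomes 3
     __m128i __t = _mm256_cvttpd_epi32 (__v);   // each element becomes 2  */
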
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

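/* Usage sketch: the scalar extracts above decompose into a 128-bit lane
   extract plus an SSE4.1 element extract, so the index must be a
   compile-time constant.  Reading element 5 of eight ints:

     __m256i __v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int __e = _mm256_extract_epi32 (__v, 5);   // 5  */
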
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

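/* Usage sketch: on some microarchitectures, mixing 256-bit AVX code with
   legacy (non-VEX-encoded) SSE code incurs a state-transition penalty;
   issuing _mm256_zeroupper when leaving an AVX region avoids it:

     compute_with_avx ();           // hypothetical AVX-using helper
     _mm256_zeroupper ();
     call_legacy_sse_library ();    // hypothetical non-VEX callee  */
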
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif

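/* Usage sketch: each nibble of the permute2f128 control byte selects a
   source 128-bit lane for the corresponding result lane (0-1 index the
   lanes of __X, 2-3 those of __Y), and bit 3 of the nibble zeroes the
   lane instead.  Swapping the halves of a hypothetical __v:

     __m256d __sw = _mm256_permute2f128_pd (__v, __v, 0x01);
     // __sw = { __v[2], __v[3], __v[0], __v[1] }  */
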
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

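/* Usage sketch: the plain load/store forms require 32-byte-aligned
   addresses and fault on misalignment; the loadu/storeu forms accept any
   address:

     double __buf[8] __attribute__ ((aligned (32)));
     __m256d __a = _mm256_load_pd (__buf);         // aligned, OK
     __m256d __b = _mm256_loadu_pd (__buf + 1);    // misaligned, needs loadu  */
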
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

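/* Usage sketch: only elements whose mask element has its sign (top) bit
   set are transferred; masked-off loads read as zero and masked-off
   stores leave memory untouched, without faulting on the skipped
   elements.  Loading just the first three doubles of a row through a
   hypothetical pointer __p:

     __m256i __m = _mm256_set_epi64x (0, -1, -1, -1);
     __m256d __v = _mm256_maskload_pd (__p, __m);   // element 3 reads as 0.0  */
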
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

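/* Usage sketch: the stream (non-temporal) stores bypass the caches and
   are weakly ordered, so a store fence is conventionally issued before
   the data may be observed by other code (__dst is a hypothetical
   32-byte-aligned destination):

     _mm256_stream_pd (__dst, __v);
     _mm_sfence ();   // from <xmmintrin.h>, available via <immintrin.h>  */
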
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

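/* Usage sketch: the rounding-control macros come from <smmintrin.h>; the
   ceil/floor wrappers above simply fix the mode.  Rounding to nearest
   while suppressing precision exceptions (with a hypothetical __v):

     __m256d __n
       = _mm256_round_pd (__v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);  */
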
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

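/* Usage sketch: as with the shuffles, the 256-bit unpacks interleave
   within each 128-bit lane rather than across the whole vector (__a and
   __b hypothetical):

     __m256d __r = _mm256_unpacklo_pd (__a, __b);
     // __r = { __a[0], __b[0], __a[2], __b[2] },
     // not { __a[0], __b[0], __a[1], __b[1] }  */
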
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

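/* Usage sketch: for the pd/ps test forms only the sign bit of each
   element participates: testz reports whether __M & __V has no sign bits
   set (ZF), testc whether ~__M & __V has none (CF), and testnzc whether
   both flags are clear.  Checking a compare result for any true lane
   (__a and __b hypothetical):

     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     if (!_mm256_testz_pd (__m, __m))
       handle_any_true ();   // hypothetical handler  */
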
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

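/* Usage sketch: note the argument order; _mm256_set_pd takes the
   highest-indexed element first, so the last argument lands lowest in
   memory:

     double __out[4];
     _mm256_storeu_pd (__out, _mm256_set_pd (4.0, 3.0, 2.0, 1.0));
     // __out = { 1.0, 2.0, 3.0, 4.0 }  */
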
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors with elements in reversed order relative to the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

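/* Usage sketch: because the upper 128 bits are undefined after a
   128-to-256 cast, code that needs a zeroed upper half should widen
   explicitly instead (__lo is a hypothetical __m128d):

     __m256d __wide = _mm256_insertf128_pd (_mm256_setzero_pd (), __lo, 0);  */
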
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */