/* Copyright (C) 2008-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Unaligned version of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
				       __may_alias__,
				       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
					    __may_alias__,
					    __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
					 __may_alias__,
					 __aligned__ (1)));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

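/* Illustrative sketch (not part of the original header): a packed
   compare with one of the predicates above yields a per-lane mask of
   all-ones or all-zeros, which can then drive a variable blend:

     __m256d __a = _mm256_set1_pd (1.0);
     __m256d __b = _mm256_set1_pd (2.0);
     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     __m256d __r = _mm256_blendv_pd (__b, __a, __m);

   Here every lane of __m is all-ones (1.0 < 2.0 is ordered and true),
   so __r takes __a in all four lanes.  */
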
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

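/* Illustrative sketch (not part of the original header): blend picks
   each result lane from __X or __Y; the immediate form uses one mask
   bit per lane (bit set selects __Y), the variable form uses the sign
   bit of the corresponding mask lane.

     __m256d __x = _mm256_set1_pd (1.0);
     __m256d __y = _mm256_set1_pd (2.0);
     __m256d __r = _mm256_blend_pd (__x, __y, 0x5);  // { 2, 1, 2, 1 }
  */
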
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

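/* Illustrative sketch (not part of the original header): in the 8-bit
   immediate, the high four bits select which products within each
   128-bit lane enter the sum, and the low four bits select which
   result elements receive that sum; the rest are zeroed.

     __m256 __x = _mm256_set1_ps (1.0f);
     __m256 __y = _mm256_set1_ps (2.0f);
     __m256 __d = _mm256_dp_ps (__x, __y, 0xF1);

   With mask 0xF1, each 128-bit half sums all four products (8.0f) and
   writes the sum only to element 0 of that half.  */
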
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

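/* Illustrative note (not part of the original header): the horizontal
   operations work within each 128-bit lane, so for
   __r = _mm256_hadd_pd (__x, __y) the result layout is

     __r[0] = __x[0] + __x[1];   __r[1] = __y[0] + __y[1];
     __r[2] = __x[2] + __x[3];   __r[3] = __y[2] + __y[3];

   which is why a full 4-element reduction still needs a cross-lane
   permute or extract afterwards.  */
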
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}
#else
#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
				     (__v8sf)(__m256)(Y), (int)(P)))
#endif

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsi256_si32 (__m256i __A)
{
  __v8si __B = (__v8si) __A;
  return __B[0];
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

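/* Illustrative note (not part of the original header): the "cvt"
   float-to-integer conversions round according to the current MXCSR
   rounding mode (round-to-nearest-even by default), while the "cvtt"
   variants always truncate toward zero.

     __m256 __v = _mm256_set1_ps (2.7f);
     __m256i __r = _mm256_cvtps_epi32 (__v);   // 3 in every lane
     __m256i __t = _mm256_cvttps_epi32 (__v);  // 2 in every lane
  */
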
extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsd_f64 (__m256d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtss_f32 (__m256 __A)
{
  return __A[0];
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
						(int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
					       (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
						(int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

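/* Illustrative sketch (not part of the original header): the element
   index is split into a half selector (high bits) and an index within
   that 128-bit half (low bits):

     __m256i __v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int __e5 = _mm256_extract_epi32 (__v, 5);   // half 1, slot 1 -> 5

   The index must be a compile-time constant so the immediate of the
   underlying _mm_extract_* can fold.  */
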
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

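/* Illustrative note (not part of the original header): executing
   legacy SSE instructions while the upper halves of the ymm registers
   are dirty can incur a state-transition penalty on some
   microarchitectures, so a vzeroupper is conventionally issued when
   leaving a hand-written AVX region:

     process_with_avx (buf);    // hypothetical AVX helper
     _mm256_zeroupper ();
     process_with_sse (buf);    // hypothetical SSE helper

   Compilers insert this automatically for code they generate.  */
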
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
					      (__v4df)(__m256d)(Y), \
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
					     (__v8sf)(__m256)(Y), \
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
					      (__v8si)(__m256i)(Y), \
					      (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

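/* Illustrative sketch (not part of the original header): the broadcast
   forms load one scalar (or one 128-bit block) from memory and
   replicate it across the destination, a common way to splat a
   coefficient inside a loop:

     static const float __k = 3.0f;
     __m256 __kv = _mm256_broadcast_ss (&__k);  // 3.0f in all 8 lanes
  */
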
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

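/* Illustrative sketch (not part of the original header): element
   insertion is the extract pattern run in reverse; the modified
   128-bit half is written back into a copy of the source:

     __m256i __v = _mm256_setzero_si256 ();
     __v = _mm256_insert_epi32 (__v, 42, 6);   // half 1, slot 2

   Only lane 6 changes; the other seven lanes keep their values.  */
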
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

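/* Illustrative note (not part of the original header): the plain
   load/store forms require 32-byte alignment (misaligned addresses
   fault), while the loadu/storeu forms accept any address via the
   __aligned__ (1) types defined above.

     float __buf[8] __attribute__ ((aligned (32))) = { 0 };
     __m256 __v = _mm256_load_ps (__buf);        // alignment guaranteed
     __m256 __w = _mm256_loadu_ps (__buf + 1);   // misaligned: loadu only
  */
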
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

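/* Illustrative sketch (not part of the original header): the mask uses
   the sign bit of each integer element, so loop remainders can be
   handled without a scalar fallback; masked-off loads read as zero and
   masked-off stores leave memory untouched.

     double __src[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256i __m = _mm256_set_epi64x (0, 0, -1, -1);  // low 2 lanes on
     __m256d __v = _mm256_maskload_pd (__src, __m);   // { 1, 2, 0, 0 }
  */
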
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

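/* Illustrative note (not part of the original header): the stream
   stores are non-temporal; they write around the cache, which helps
   when filling buffers larger than the last-level cache.  They still
   require 32-byte-aligned destinations, and because they are weakly
   ordered, an _mm_sfence () is needed before other threads may safely
   observe the data.  */
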
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

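/* Illustrative sketch (not part of the original header): the
   _MM_FROUND_* constants from <smmintrin.h> select the rounding mode
   per call instead of going through MXCSR:

     __m256d __v = _mm256_set1_pd (2.5);
     __m256d __f = _mm256_floor_pd (__v);  // 2.0 in every lane
     __m256d __c = _mm256_ceil_pd (__v);   // 3.0 in every lane
  */
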
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

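/* Illustrative sketch (not part of the original header): testz returns
   1 when (__M & __V) is all zeros, testc returns 1 when (~__M & __V)
   is all zeros, and testnzc returns 1 when neither holds.  The pd/ps
   forms examine only the sign bit of each lane; the si256 form tests
   every bit.  A common idiom checks whether any compare lane fired:

     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);  // __a, __b given
     if (!_mm256_testz_pd (__m, __m))
       handle_any_lane_lt ();   // hypothetical handler
  */
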
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

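/* Illustrative sketch (not part of the original header): movemask
   packs the sign bit of each lane into the low bits of an int, which
   pairs naturally with a packed compare to locate matching lanes:

     __m256 __m = _mm256_cmp_ps (__x, __y, _CMP_EQ_OQ);  // __x, __y given
     int __bits = _mm256_movemask_ps (__m);
     int __first = __builtin_ctz (__bits);   // first equal lane,
					     // valid only if __bits != 0
  */
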
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256d __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256 __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256i __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

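/* Illustrative note (not part of the original header): _mm256_set_*
   takes elements from the highest lane down to the lowest, while
   _mm256_setr_* takes them in memory (reversed) order, so the two
   calls below build the same vector:

     __m256i __a = _mm256_set_epi32  (7, 6, 5, 4, 3, 2, 1, 0);
     __m256i __b = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  */
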
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

/* Similarly, but with zero extension instead of undefined values.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextpd128_pd256 (__m128d __A)
{
  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextps128_ps256 (__m128 __A)
{
  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextsi128_si256 (__m128i __A)
{
  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 (__m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}

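/* Illustrative sketch (not part of the original header): building a
   256-bit vector from two 128-bit halves is the usual way to widen
   existing SSE code paths:

     __m128 __lo = _mm_set1_ps (1.0f);
     __m128 __hi = _mm_set1_ps (2.0f);
     __m256 __v  = _mm256_set_m128 (__hi, __lo);  // low half 1s, high half 2s
  */
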
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128 (float const *__PH, float const *__PL)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
			       _mm_loadu_ps (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
{
  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128d (double const *__PH, double const *__PL)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
			       _mm_loadu_pd (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
{
  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
				  _mm_loadu_si128 (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
{
  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
}

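/* Illustrative sketch (not part of the original header): the loadu2 /
   storeu2 pairs gather and scatter two independent 128-bit halves, so
   the two pointers need not be adjacent or aligned:

     float __a[4] = { 0, 1, 2, 3 }, __b[4] = { 4, 5, 6, 7 };
     __m256 __v = _mm256_loadu2_m128 (__b, __a);  // __a low, __b high
  */
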
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */