/* gcc/config/i386/avxintrin.h — AVX intrinsics header (GCC, thirdparty/gcc.git). */
8d9254fc 1/* Copyright (C) 2008-2020 Free Software Foundation, Inc.
95879c72
L
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
748086b7 7 the Free Software Foundation; either version 3, or (at your option)
95879c72
L
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
748086b7
JJ
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
95879c72
L
23
24/* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 11.0. */
26
37fe763d
UB
27#ifndef _IMMINTRIN_H_INCLUDED
28# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
29#endif
95879c72 30
97db2bf7
ST
31#ifndef _AVXINTRIN_H_INCLUDED
32#define _AVXINTRIN_H_INCLUDED
33
34#ifndef __AVX__
35#pragma GCC push_options
36#pragma GCC target("avx")
37#define __DISABLE_AVX__
38#endif /* __AVX__ */
39
/* Internal data types for implementing the intrinsics.  These are the
   plain (non-may_alias) 256-bit vector types used for casts inside the
   intrinsic bodies.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Unaligned version of the same types: __aligned__ (1) lets the unaligned
   load/store intrinsics dereference arbitrary addresses.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
				       __may_alias__,
				       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
					    __may_alias__,
					    __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
					 __may_alias__,
					 __aligned__ (1)));
72
/* Compare predicates for scalar and packed compare intrinsics
   (the immediate operand of vcmppd/vcmpps/vcmpsd/vcmpss).  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (nonsignaling)   */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
139
140extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141_mm256_add_pd (__m256d __A, __m256d __B)
142{
2069d6fc 143 return (__m256d) ((__v4df)__A + (__v4df)__B);
95879c72
L
144}
145
146extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm256_add_ps (__m256 __A, __m256 __B)
148{
2069d6fc 149 return (__m256) ((__v8sf)__A + (__v8sf)__B);
95879c72
L
150}
151
152extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153_mm256_addsub_pd (__m256d __A, __m256d __B)
154{
155 return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
156}
157
158extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159_mm256_addsub_ps (__m256 __A, __m256 __B)
160{
161 return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
162}
163
164
165extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166_mm256_and_pd (__m256d __A, __m256d __B)
167{
168 return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
169}
170
171extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
172_mm256_and_ps (__m256 __A, __m256 __B)
173{
174 return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
175}
176
177extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
178_mm256_andnot_pd (__m256d __A, __m256d __B)
179{
180 return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
181}
182
183extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184_mm256_andnot_ps (__m256 __A, __m256 __B)
185{
186 return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
187}
188
189/* Double/single precision floating point blend instructions - select
190 data from 2 sources using constant/variable mask. */
191
192#ifdef __OPTIMIZE__
193extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
195{
196 return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
197 (__v4df)__Y,
198 __M);
199}
200
201extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
203{
204 return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
205 (__v8sf)__Y,
206 __M);
207}
208#else
209#define _mm256_blend_pd(X, Y, M) \
210 ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
211 (__v4df)(__m256d)(Y), (int)(M)))
212
213#define _mm256_blend_ps(X, Y, M) \
214 ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
215 (__v8sf)(__m256)(Y), (int)(M)))
216#endif
217
218extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
220{
221 return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
222 (__v4df)__Y,
223 (__v4df)__M);
224}
225
226extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
227_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
228{
229 return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
230 (__v8sf)__Y,
231 (__v8sf)__M);
232}
233
234extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235_mm256_div_pd (__m256d __A, __m256d __B)
236{
2069d6fc 237 return (__m256d) ((__v4df)__A / (__v4df)__B);
95879c72
L
238}
239
240extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241_mm256_div_ps (__m256 __A, __m256 __B)
242{
2069d6fc 243 return (__m256) ((__v8sf)__A / (__v8sf)__B);
95879c72
L
244}
245
246/* Dot product instructions with mask-defined summing and zeroing parts
247 of result. */
248
249#ifdef __OPTIMIZE__
250extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
251_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
252{
253 return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
254 (__v8sf)__Y,
255 __M);
256}
257#else
258#define _mm256_dp_ps(X, Y, M) \
259 ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
260 (__v8sf)(__m256)(Y), (int)(M)))
261#endif
262
263extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264_mm256_hadd_pd (__m256d __X, __m256d __Y)
265{
266 return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
267}
268
269extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270_mm256_hadd_ps (__m256 __X, __m256 __Y)
271{
272 return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
273}
274
275extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276_mm256_hsub_pd (__m256d __X, __m256d __Y)
277{
278 return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
279}
280
281extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282_mm256_hsub_ps (__m256 __X, __m256 __Y)
283{
284 return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
285}
286
287extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288_mm256_max_pd (__m256d __A, __m256d __B)
289{
290 return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
291}
292
293extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294_mm256_max_ps (__m256 __A, __m256 __B)
295{
296 return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
297}
298
299extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300_mm256_min_pd (__m256d __A, __m256d __B)
301{
302 return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
303}
304
305extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
306_mm256_min_ps (__m256 __A, __m256 __B)
307{
308 return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
309}
310
311extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312_mm256_mul_pd (__m256d __A, __m256d __B)
313{
2069d6fc 314 return (__m256d) ((__v4df)__A * (__v4df)__B);
95879c72
L
315}
316
317extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318_mm256_mul_ps (__m256 __A, __m256 __B)
319{
2069d6fc 320 return (__m256) ((__v8sf)__A * (__v8sf)__B);
95879c72
L
321}
322
323extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
324_mm256_or_pd (__m256d __A, __m256d __B)
325{
326 return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
327}
328
329extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
330_mm256_or_ps (__m256 __A, __m256 __B)
331{
332 return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
333}
334
/* Shuffle lanes of __A/__B selected by the immediate mask.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif
358
359extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
360_mm256_sub_pd (__m256d __A, __m256d __B)
361{
2069d6fc 362 return (__m256d) ((__v4df)__A - (__v4df)__B);
95879c72
L
363}
364
365extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
366_mm256_sub_ps (__m256 __A, __m256 __B)
367{
2069d6fc 368 return (__m256) ((__v8sf)__A - (__v8sf)__B);
95879c72
L
369}
370
371extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
372_mm256_xor_pd (__m256d __A, __m256d __B)
373{
374 return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
375}
376
377extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
378_mm256_xor_ps (__m256 __A, __m256 __B)
379{
380 return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
381}
382
/* Compare intrinsics; __P is one of the _CMP_* predicates above.  */
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

/* Scalar forms: compare only the low element.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif
446
447extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
448_mm256_cvtepi32_pd (__m128i __A)
449{
450 return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
451}
452
453extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454_mm256_cvtepi32_ps (__m256i __A)
455{
456 return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
457}
458
459extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460_mm256_cvtpd_ps (__m256d __A)
461{
462 return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
463}
464
465extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466_mm256_cvtps_epi32 (__m256 __A)
467{
468 return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
469}
470
471extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472_mm256_cvtps_pd (__m128 __A)
473{
474 return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
475}
476
477extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478_mm256_cvttpd_epi32 (__m256d __A)
479{
480 return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
481}
482
483extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484_mm256_cvtpd_epi32 (__m256d __A)
485{
486 return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
487}
488
489extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490_mm256_cvttps_epi32 (__m256 __A)
491{
492 return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
493}
494
dcb2c527
JJ
495extern __inline double
496__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
497_mm256_cvtsd_f64 (__m256d __A)
498{
499 return __A[0];
500}
501
502extern __inline float
503__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
504_mm256_cvtss_f32 (__m256 __A)
505{
506 return __A[0];
507}
508
/* Extract a 128-bit half (__N selects low/high) or a single integer
   element out of a 256-bit vector.  */
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

/* Element extracts: pick the containing 128-bit half, then use the
   SSE extract on it.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
600
/* Zero all YMM registers / the upper halves of the YMM registers.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
612
613extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
614_mm_permutevar_pd (__m128d __A, __m128i __C)
615{
616 return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
617 (__v2di)__C);
618}
619
620extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621_mm256_permutevar_pd (__m256d __A, __m256i __C)
622{
623 return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
624 (__v4di)__C);
625}
626
627extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628_mm_permutevar_ps (__m128 __A, __m128i __C)
629{
630 return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
631 (__v4si)__C);
632}
633
634extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635_mm256_permutevar_ps (__m256 __A, __m256i __C)
636{
637 return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
638 (__v8si)__C);
639}
640
/* Permute lanes within each 128-bit half using an immediate control.  */
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
678
/* Select/permute 128-bit halves from two source vectors according to
   the immediate control __C.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
719
720extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721_mm_broadcast_ss (float const *__X)
722{
723 return (__m128) __builtin_ia32_vbroadcastss (__X);
724}
725
726extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
727_mm256_broadcast_sd (double const *__X)
728{
729 return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
730}
731
732extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733_mm256_broadcast_ss (float const *__X)
734{
735 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
736}
737
738extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
739_mm256_broadcast_pd (__m128d const *__X)
740{
741 return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
742}
743
744extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745_mm256_broadcast_ps (__m128 const *__X)
746{
747 return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
748}
749
/* Insert a 128-bit half (__O selects low/high) or a single integer
   element into a 256-bit vector.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

/* Element inserts: extract the containing 128-bit half, insert the
   element with the SSE insert, then put the half back.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
858
859extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
860_mm256_load_pd (double const *__P)
861{
862 return *(__m256d *)__P;
863}
864
865extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
866_mm256_store_pd (double *__P, __m256d __A)
867{
868 *(__m256d *)__P = __A;
869}
870
871extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872_mm256_load_ps (float const *__P)
873{
874 return *(__m256 *)__P;
875}
876
877extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878_mm256_store_ps (float *__P, __m256 __A)
879{
880 *(__m256 *)__P = __A;
881}
882
883extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884_mm256_loadu_pd (double const *__P)
885{
c6b0037d 886 return *(__m256d_u *)__P;
95879c72
L
887}
888
889extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
890_mm256_storeu_pd (double *__P, __m256d __A)
891{
c6b0037d 892 *(__m256d_u *)__P = __A;
95879c72
L
893}
894
895extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896_mm256_loadu_ps (float const *__P)
897{
c6b0037d 898 return *(__m256_u *)__P;
95879c72
L
899}
900
901extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
902_mm256_storeu_ps (float *__P, __m256 __A)
903{
c6b0037d 904 *(__m256_u *)__P = __A;
95879c72
L
905}
906
907extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
908_mm256_load_si256 (__m256i const *__P)
909{
910 return *__P;
911}
912
913extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914_mm256_store_si256 (__m256i *__P, __m256i __A)
915{
916 *__P = __A;
917}
918
919extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
c6b0037d 920_mm256_loadu_si256 (__m256i_u const *__P)
95879c72 921{
c6b0037d 922 return *__P;
95879c72
L
923}
924
925extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
c6b0037d 926_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
95879c72 927{
c6b0037d 928 *__P = __A;
95879c72
L
929}
930
931extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 932_mm_maskload_pd (double const *__P, __m128i __M)
95879c72
L
933{
934 return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
98c6d93c 935 (__v2di)__M);
95879c72
L
936}
937
938extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 939_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
95879c72 940{
98c6d93c 941 __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
95879c72
L
942}
943
944extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 945_mm256_maskload_pd (double const *__P, __m256i __M)
95879c72
L
946{
947 return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
98c6d93c 948 (__v4di)__M);
95879c72
L
949}
950
951extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 952_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
95879c72 953{
98c6d93c 954 __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
95879c72
L
955}
956
957extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 958_mm_maskload_ps (float const *__P, __m128i __M)
95879c72
L
959{
960 return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
98c6d93c 961 (__v4si)__M);
95879c72
L
962}
963
964extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 965_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
95879c72 966{
98c6d93c 967 __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
95879c72
L
968}
969
970extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 971_mm256_maskload_ps (float const *__P, __m256i __M)
95879c72
L
972{
973 return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
98c6d93c 974 (__v8si)__M);
95879c72
L
975}
976
977extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98c6d93c 978_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
95879c72 979{
98c6d93c 980 __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
95879c72
L
981}
982
983extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984_mm256_movehdup_ps (__m256 __X)
985{
986 return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
987}
988
989extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990_mm256_moveldup_ps (__m256 __X)
991{
992 return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
993}
994
995extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996_mm256_movedup_pd (__m256d __X)
997{
998 return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
999}
1000
1001extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002_mm256_lddqu_si256 (__m256i const *__P)
1003{
1004 return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
1005}
1006
65b82caa
L
1007extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1008_mm256_stream_si256 (__m256i *__A, __m256i __B)
1009{
1010 __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
1011}
1012
1013extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1014_mm256_stream_pd (double *__A, __m256d __B)
1015{
1016 __builtin_ia32_movntpd256 (__A, (__v4df)__B);
1017}
1018
1019extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1020_mm256_stream_ps (float *__P, __m256 __A)
1021{
1022 __builtin_ia32_movntps256 (__P, (__v8sf)__A);
1023}
1024
95879c72
L
1025extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026_mm256_rcp_ps (__m256 __A)
1027{
1028 return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
1029}
1030
1031extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032_mm256_rsqrt_ps (__m256 __A)
1033{
1034 return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
1035}
1036
1037extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1038_mm256_sqrt_pd (__m256d __A)
1039{
1040 return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
1041}
1042
1043extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044_mm256_sqrt_ps (__m256 __A)
1045{
1046 return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
1047}
1048
/* Round each element according to the _MM_FROUND_* mode __M.
   vroundpd/vroundps require an immediate rounding operand, so an
   inline function is only usable when the compiler can fold the
   constant (__OPTIMIZE__); otherwise a macro keeps the argument
   literal.  */
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif
1068
/* Convenience wrappers over _mm256_round_* with fixed modes.  */
#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
1073
1074extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075_mm256_unpackhi_pd (__m256d __A, __m256d __B)
1076{
1077 return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
1078}
1079
1080extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081_mm256_unpacklo_pd (__m256d __A, __m256d __B)
1082{
1083 return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
1084}
1085
1086extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087_mm256_unpackhi_ps (__m256 __A, __m256 __B)
1088{
1089 return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
1090}
1091
1092extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1093_mm256_unpacklo_ps (__m256 __A, __m256 __B)
1094{
1095 return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
1096}
1097
1098extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1099_mm_testz_pd (__m128d __M, __m128d __V)
1100{
1101 return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
1102}
1103
1104extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1105_mm_testc_pd (__m128d __M, __m128d __V)
1106{
1107 return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
1108}
1109
1110extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1111_mm_testnzc_pd (__m128d __M, __m128d __V)
1112{
1113 return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
1114}
1115
1116extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1117_mm_testz_ps (__m128 __M, __m128 __V)
1118{
1119 return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
1120}
1121
1122extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1123_mm_testc_ps (__m128 __M, __m128 __V)
1124{
1125 return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
1126}
1127
1128extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1129_mm_testnzc_ps (__m128 __M, __m128 __V)
1130{
1131 return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
1132}
1133
1134extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1135_mm256_testz_pd (__m256d __M, __m256d __V)
1136{
1137 return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
1138}
1139
1140extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141_mm256_testc_pd (__m256d __M, __m256d __V)
1142{
1143 return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
1144}
1145
1146extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147_mm256_testnzc_pd (__m256d __M, __m256d __V)
1148{
1149 return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
1150}
1151
1152extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1153_mm256_testz_ps (__m256 __M, __m256 __V)
1154{
1155 return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
1156}
1157
1158extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159_mm256_testc_ps (__m256 __M, __m256 __V)
1160{
1161 return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
1162}
1163
1164extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1165_mm256_testnzc_ps (__m256 __M, __m256 __V)
1166{
1167 return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
1168}
1169
1170extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171_mm256_testz_si256 (__m256i __M, __m256i __V)
1172{
1173 return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
1174}
1175
1176extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177_mm256_testc_si256 (__m256i __M, __m256i __V)
1178{
1179 return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
1180}
1181
1182extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1183_mm256_testnzc_si256 (__m256i __M, __m256i __V)
1184{
1185 return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
1186}
1187
1188extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1189_mm256_movemask_pd (__m256d __A)
1190{
1191 return __builtin_ia32_movmskpd256 ((__v4df)__A);
1192}
1193
1194extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195_mm256_movemask_ps (__m256 __A)
1196{
1197 return __builtin_ia32_movmskps256 ((__v8sf)__A);
1198}
1199
0b192937
UD
1200extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1201_mm256_undefined_pd (void)
1202{
1203 __m256d __Y = __Y;
1204 return __Y;
1205}
1206
1207extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208_mm256_undefined_ps (void)
1209{
1210 __m256 __Y = __Y;
1211 return __Y;
1212}
1213
1214extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215_mm256_undefined_si256 (void)
1216{
1217 __m256i __Y = __Y;
1218 return __Y;
1219}
1220
95879c72
L
1221extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1222_mm256_setzero_pd (void)
1223{
1224 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
1225}
1226
1227extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228_mm256_setzero_ps (void)
1229{
1230 return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
1231 0.0, 0.0, 0.0, 0.0 };
1232}
1233
1234extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235_mm256_setzero_si256 (void)
1236{
1237 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
1238}
1239
1240/* Create the vector [A B C D]. */
1241extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242_mm256_set_pd (double __A, double __B, double __C, double __D)
1243{
1244 return __extension__ (__m256d){ __D, __C, __B, __A };
1245}
1246
1247/* Create the vector [A B C D E F G H]. */
1248extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1249_mm256_set_ps (float __A, float __B, float __C, float __D,
1250 float __E, float __F, float __G, float __H)
1251{
1252 return __extension__ (__m256){ __H, __G, __F, __E,
1253 __D, __C, __B, __A };
1254}
1255
1256/* Create the vector [A B C D E F G H]. */
1257extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258_mm256_set_epi32 (int __A, int __B, int __C, int __D,
1259 int __E, int __F, int __G, int __H)
1260{
1261 return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
1262 __D, __C, __B, __A };
1263}
1264
1265extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
1267 short __q11, short __q10, short __q09, short __q08,
1268 short __q07, short __q06, short __q05, short __q04,
1269 short __q03, short __q02, short __q01, short __q00)
1270{
1271 return __extension__ (__m256i)(__v16hi){
1272 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1273 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
1274 };
1275}
1276
1277extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
1279 char __q27, char __q26, char __q25, char __q24,
1280 char __q23, char __q22, char __q21, char __q20,
1281 char __q19, char __q18, char __q17, char __q16,
1282 char __q15, char __q14, char __q13, char __q12,
1283 char __q11, char __q10, char __q09, char __q08,
1284 char __q07, char __q06, char __q05, char __q04,
1285 char __q03, char __q02, char __q01, char __q00)
1286{
1287 return __extension__ (__m256i)(__v32qi){
1288 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1289 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
1290 __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
1291 __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
1292 };
1293}
1294
1295extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1296_mm256_set_epi64x (long long __A, long long __B, long long __C,
1297 long long __D)
1298{
1299 return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
1300}
1301
1302/* Create a vector with all elements equal to A. */
1303extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304_mm256_set1_pd (double __A)
1305{
1306 return __extension__ (__m256d){ __A, __A, __A, __A };
1307}
1308
1309/* Create a vector with all elements equal to A. */
1310extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311_mm256_set1_ps (float __A)
1312{
1313 return __extension__ (__m256){ __A, __A, __A, __A,
1314 __A, __A, __A, __A };
1315}
1316
1317/* Create a vector with all elements equal to A. */
1318extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1319_mm256_set1_epi32 (int __A)
1320{
1321 return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
1322 __A, __A, __A, __A };
1323}
1324
1325extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326_mm256_set1_epi16 (short __A)
1327{
1328 return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
1329 __A, __A, __A, __A, __A, __A, __A, __A);
1330}
1331
1332extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333_mm256_set1_epi8 (char __A)
1334{
1335 return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
1336 __A, __A, __A, __A, __A, __A, __A, __A,
1337 __A, __A, __A, __A, __A, __A, __A, __A,
1338 __A, __A, __A, __A, __A, __A, __A, __A);
1339}
1340
1341extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1342_mm256_set1_epi64x (long long __A)
1343{
1344 return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
1345}
1346
1347/* Create vectors of elements in the reversed order from the
1348 _mm256_set_XXX functions. */
1349
1350extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351_mm256_setr_pd (double __A, double __B, double __C, double __D)
1352{
1353 return _mm256_set_pd (__D, __C, __B, __A);
1354}
1355
1356extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357_mm256_setr_ps (float __A, float __B, float __C, float __D,
1358 float __E, float __F, float __G, float __H)
1359{
1360 return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
1361}
1362
1363extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
1365 int __E, int __F, int __G, int __H)
1366{
1367 return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
1368}
1369
1370extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
1372 short __q11, short __q10, short __q09, short __q08,
1373 short __q07, short __q06, short __q05, short __q04,
1374 short __q03, short __q02, short __q01, short __q00)
1375{
1376 return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
1377 __q04, __q05, __q06, __q07,
1378 __q08, __q09, __q10, __q11,
1379 __q12, __q13, __q14, __q15);
1380}
1381
1382extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
1384 char __q27, char __q26, char __q25, char __q24,
1385 char __q23, char __q22, char __q21, char __q20,
1386 char __q19, char __q18, char __q17, char __q16,
1387 char __q15, char __q14, char __q13, char __q12,
1388 char __q11, char __q10, char __q09, char __q08,
1389 char __q07, char __q06, char __q05, char __q04,
1390 char __q03, char __q02, char __q01, char __q00)
1391{
1392 return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
1393 __q04, __q05, __q06, __q07,
1394 __q08, __q09, __q10, __q11,
1395 __q12, __q13, __q14, __q15,
1396 __q16, __q17, __q18, __q19,
1397 __q20, __q21, __q22, __q23,
1398 __q24, __q25, __q26, __q27,
1399 __q28, __q29, __q30, __q31);
1400}
1401
1402extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403_mm256_setr_epi64x (long long __A, long long __B, long long __C,
1404 long long __D)
1405{
1406 return _mm256_set_epi64x (__D, __C, __B, __A);
1407}
1408
1409/* Casts between various SP, DP, INT vector types. Note that these do no
1410 conversion of values, they just change the type. */
1411extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1412_mm256_castpd_ps (__m256d __A)
1413{
1414 return (__m256) __A;
1415}
1416
1417extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1418_mm256_castpd_si256 (__m256d __A)
1419{
1420 return (__m256i) __A;
1421}
1422
1423extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1424_mm256_castps_pd (__m256 __A)
1425{
1426 return (__m256d) __A;
1427}
1428
1429extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1430_mm256_castps_si256(__m256 __A)
1431{
1432 return (__m256i) __A;
1433}
1434
1435extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436_mm256_castsi256_ps (__m256i __A)
1437{
1438 return (__m256) __A;
1439}
1440
1441extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1442_mm256_castsi256_pd (__m256i __A)
1443{
1444 return (__m256d) __A;
1445}
1446
1447extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1448_mm256_castpd256_pd128 (__m256d __A)
1449{
1450 return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
1451}
1452
1453extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1454_mm256_castps256_ps128 (__m256 __A)
1455{
1456 return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
1457}
1458
1459extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1460_mm256_castsi256_si128 (__m256i __A)
1461{
1462 return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
1463}
1464
1465/* When cast is done from a 128 to 256-bit type, the low 128 bits of
1466 the 256-bit result contain source parameter value and the upper 128
1467 bits of the result are undefined. Those intrinsics shouldn't
1468 generate any extra moves. */
1469
1470extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1471_mm256_castpd128_pd256 (__m128d __A)
1472{
1473 return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
1474}
1475
1476extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477_mm256_castps128_ps256 (__m128 __A)
1478{
1479 return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
1480}
1481
1482extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1483_mm256_castsi128_si256 (__m128i __A)
1484{
1485 return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
1486}
97db2bf7 1487
e6b2dc24
JJ
1488/* Similarly, but with zero extension instead of undefined values. */
1489
1490extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1491_mm256_zextpd128_pd256 (__m128d __A)
1492{
1493 return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
1494}
1495
1496extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497_mm256_zextps128_ps256 (__m128 __A)
1498{
1499 return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
1500}
1501
1502extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1503_mm256_zextsi128_si256 (__m128i __A)
1504{
1505 return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
1506}
1507
f4ee3a9e
UB
1508extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1509_mm256_set_m128 ( __m128 __H, __m128 __L)
1510{
1511 return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
1512}
1513
1514extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1515_mm256_set_m128d (__m128d __H, __m128d __L)
1516{
1517 return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
1518}
1519
1520extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1521_mm256_set_m128i (__m128i __H, __m128i __L)
1522{
1523 return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
1524}
1525
1526extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1527_mm256_setr_m128 (__m128 __L, __m128 __H)
1528{
1529 return _mm256_set_m128 (__H, __L);
1530}
1531
1532extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1533_mm256_setr_m128d (__m128d __L, __m128d __H)
1534{
1535 return _mm256_set_m128d (__H, __L);
1536}
1537
1538extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539_mm256_setr_m128i (__m128i __L, __m128i __H)
1540{
1541 return _mm256_set_m128i (__H, __L);
1542}
1543
96d5c6dc
JJ
1544extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1545_mm256_loadu2_m128 (float const *__PH, float const *__PL)
1546{
1547 return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
1548 _mm_loadu_ps (__PH), 1);
1549}
1550
1551extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1552_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
1553{
1554 _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
1555 _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
1556}
1557
1558extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1559_mm256_loadu2_m128d (double const *__PH, double const *__PL)
1560{
1561 return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
1562 _mm_loadu_pd (__PH), 1);
1563}
1564
1565extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1566_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
1567{
1568 _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
1569 _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
1570}
1571
1572extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1573_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
1574{
1575 return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
1576 _mm_loadu_si128 (__PH), 1);
1577}
1578
1579extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1580_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
1581{
1582 _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
1583 _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
1584}
1585
97db2bf7
ST
1586#ifdef __DISABLE_AVX__
1587#undef __DISABLE_AVX__
1588#pragma GCC pop_options
1589#endif /* __DISABLE_AVX__ */
1590
1591#endif /* _AVXINTRIN_H_INCLUDED */