/* Copyright (C) 2008-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Unaligned version of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
                                       __may_alias__,
                                       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
                                            __may_alias__,
                                            __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
                                         __may_alias__,
                                         __aligned__ (1)));

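/* Usage sketch (illustrative, not part of the original header): the
   unaligned types back the _mm256_loadu_* / _mm256_storeu_* intrinsics
   below, which accept any address, whereas _mm256_load_* /
   _mm256_store_* require 32-byte alignment:

     float __buf[9];
     __m256 __v = _mm256_loadu_ps (__buf + 1);  // any alignment works
     _mm256_storeu_ps (__buf + 1, __v);
     // _mm256_load_ps (__buf + 1) would be valid only if __buf + 1
     // happened to be 32-byte aligned.  */
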
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ    0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS    0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS    0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q  0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ   0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US   0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US   0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q    0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ    0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US   0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US   0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS    0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS    0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ  0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS    0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ    0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S  0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US   0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ   0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S    0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US    0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ   0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ    0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US  0x1f

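/* Usage sketch (illustrative, not part of the original header): the
   predicates above are passed as the immediate argument of the compare
   intrinsics defined later in this file, e.g.

     __m256 __a  = _mm256_set1_ps (1.0f);
     __m256 __b  = _mm256_set1_ps (2.0f);
     __m256 __lt = _mm256_cmp_ps (__a, __b, _CMP_LT_OQ);
     int __bits  = _mm256_movemask_ps (__lt);  // 0xff: all lanes true

   Each lane of __lt is all-ones where the compare holds and all-zeros
   elsewhere; "O"/"U" select ordered/unordered NaN handling and "S"/"Q"
   select signaling/quiet (non-signaling) behavior.  */
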
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from two sources using a constant (blend) or variable (blendv)
   mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

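/* Usage sketch (illustrative, not part of the original header): with a
   constant mask, bit i of the immediate selects element i from the
   second source, e.g.

     __m256d __x = _mm256_set1_pd (1.0);
     __m256d __y = _mm256_set1_pd (2.0);
     __m256d __r = _mm256_blend_pd (__x, __y, 0x5);  // { 2, 1, 2, 1 }

   _mm256_blendv_pd performs the same per-element selection, but driven
   by the sign bit of each element of the runtime mask __M.  */
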
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing of
   parts of the result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif

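/* Usage sketch (illustrative, not part of the original header): vdpps
   works within each 128-bit lane.  The high four mask bits choose which
   products are summed and the low four choose which result elements
   receive the sum, so

     __m256 __r = _mm256_dp_ps (__x, __y, 0xf1);

   leaves the 4-element dot product of the low lanes of __x and __y in
   element 0, that of the high lanes in element 4, and zeros elsewhere.  */
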
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsd_f64 (__m256d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtss_f32 (__m256 __A)
{
  return __A[0];
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

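/* Usage sketch (illustrative, not part of the original header): the
   element extractors above split the index into a 128-bit lane
   selector and a position within that lane, e.g. for 32-bit elements

     __m256i __v  = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int     __e5 = _mm256_extract_epi32 (__v, 5);  // lane 5 >> 2 == 1,
                                                    // slot 5 % 4 == 1 -> 5
*/
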
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

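/* Usage note (an editorial addition, not from the original source):
   vzeroall clears all YMM registers and vzeroupper clears their upper
   128 bits; calling _mm256_zeroupper () before executing legacy
   non-VEX SSE code avoids the AVX/SSE transition penalty on processors
   that incur one.  */
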
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

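/* Usage sketch (illustrative, not part of the original header): the
   mask load/store intrinsics use the sign bit of each mask element;
   only lanes whose mask element has its top bit set are loaded or
   stored, and masked-off load lanes read as zero:

     double  __buf[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256i __m = _mm256_set_epi64x (0, -1, 0, -1);  // lanes 0 and 2
     __m256d __v = _mm256_maskload_pd (__buf, __m);   // { 1, 0, 3, 0 }
*/
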
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

/* The deliberate self-initialization below yields an undefined value
   without provoking an uninitialized-variable warning.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

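/* Usage sketch (illustrative, not part of the original header):
   _mm256_set_pd takes its arguments most-significant element first, so

     __m256d __v = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     // __v[0] == 1.0 ... __v[3] == 4.0; _mm256_setr_pd below takes the
     // same values in ascending (memory) order.  */
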
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do
   no conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

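/* Usage sketch (illustrative, not part of the original header): a cast
   reinterprets the same 256 bits, so a round trip through another
   vector type is value-preserving:

     __m256d __d    = _mm256_set1_pd (1.5);
     __m256i __bits = _mm256_castpd_si256 (__d);     // same bit pattern
     __m256d __back = _mm256_castsi256_pd (__bits);  // still all 1.5  */
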
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

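/* Usage sketch (illustrative, not part of the original header): because
   the upper half is undefined after a 128->256 cast, a fully defined
   256-bit value is built by inserting the high lane explicitly; the
   _mm256_set_m128* helpers below wrap exactly this pattern:

     __m128 __lo = _mm_set1_ps (1.0f);
     __m128 __hi = _mm_set1_ps (2.0f);
     __m256 __v  = _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo),
                                         __hi, 1);  */
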
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 (__m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */