/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
#endif

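/* Illustrative usage sketch (not part of the original header): software
   prefetching ahead of a streaming loop.  The 16-element lookahead below is
   an assumption chosen for the example, not a value mandated by the API.

     void scale (float *dst, const float *src, int n, float k)
     {
       for (int i = 0; i < n; i++)
         {
           if (i + 16 < n)
             _mm_prefetch ((const char *) (src + i + 16), _MM_HINT_T0);
           dst[i] = src[i] * k;
         }
     }
*/
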
#ifndef __SSE__
#pragma GCC push_options
#pragma GCC target("sse")
#define __DISABLE_SSE__
#endif /* __SSE__ */

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

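/* Illustrative note (not part of the original header): each two-bit field of
   the selector picks one source element, so, for example,

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   (identity ordering)
     _MM_SHUFFLE (0, 1, 2, 3) == 0x1B   (reverse the four elements)

   The resulting constant is what _mm_shuffle_ps and _mm_shuffle_pi16 expect
   as their immediate argument.  */
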
/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

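/* Illustrative usage sketch (not part of the original header): _mm_rcp_ss
   returns only an approximation of 1/x (roughly 12 bits of precision), so a
   common pattern is one Newton-Raphson step to refine it.  The helper name
   below is an assumption made up for the example.

     static float fast_recip (float x)
     {
       __m128 a = _mm_set_ss (x);
       __m128 r = _mm_rcp_ss (a);
       r = _mm_mul_ss (r, _mm_sub_ss (_mm_set_ss (2.0f), _mm_mul_ss (a, r)));
       return _mm_cvtss_f32 (r);
     }
*/
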
/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

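/* Illustrative usage sketch (not part of the original header): a SAXPY-style
   loop over the packed arithmetic above.  It assumes N is a multiple of 4 and
   that X and Y are 16-byte aligned; neither is required by the API.

     void saxpy (float *y, const float *x, int n, float a)
     {
       __m128 va = _mm_set1_ps (a);
       for (int i = 0; i < n; i += 4)
         {
           __m128 vy = _mm_load_ps (y + i);
           __m128 vx = _mm_load_ps (x + i);
           _mm_store_ps (y + i, _mm_add_ps (vy, _mm_mul_ps (va, vx)));
         }
     }
*/
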
/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

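/* Illustrative usage sketch (not part of the original header): the bitwise
   operations are typically used to manipulate the IEEE sign bit directly.
   _mm_set1_ps (-0.0f) produces a mask with only the sign bit set in each
   element, so andnot clears the sign (absolute value) and xor flips it
   (negation).  The helper names are assumptions made up for the example.

     static __m128 abs_ps (__m128 x)
     {
       return _mm_andnot_ps (_mm_set1_ps (-0.0f), x);
     }

     static __m128 negate_ps (__m128 x)
     {
       return _mm_xor_ps (x, _mm_set1_ps (-0.0f));
     }
*/
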
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

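/* Illustrative usage sketch (not part of the original header): the all-ones /
   all-zeros masks produced by the packed comparisons are meant to be combined
   with the bitwise operations for branchless selection.  The helper name is
   an assumption made up for the example; it picks a[i] where a[i] < b[i] and
   b[i] otherwise, and the same pattern works for any of the comparisons.

     static __m128 select_lt (__m128 a, __m128 b)
     {
       __m128 mask = _mm_cmplt_ps (a, b);
       return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
     }
*/
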
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

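/* Illustrative note (not part of the original header): _mm_cvtss_si32 honours
   the current MXCSR rounding mode (round-to-nearest-even by default), while
   _mm_cvttss_si32 always truncates toward zero.  For example, under the
   default rounding mode:

     int a = _mm_cvtss_si32 (_mm_set_ss (2.5f));
     int b = _mm_cvttss_si32 (_mm_set_ss (2.5f));
     int c = _mm_cvtss_si32 (_mm_set_ss (2.7f));
     int d = _mm_cvttss_si32 (_mm_set_ss (2.7f));

   yields a == 2 (ties round to even), b == 2, c == 3 and d == 2.  */
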
#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \
                                   (__v4sf)(__m128)(B), (int)(MASK)))
#endif

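/* Illustrative usage sketch (not part of the original header): the low two
   selector fields pick elements of A, the high two pick elements of B.  The
   helper name is an assumption made up for the example; it broadcasts
   element 2 of V into all four lanes.

     static __m128 broadcast2 (__m128 v)
     {
       return _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 2, 2, 2));
     }
*/
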
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

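/* Illustrative usage sketch (not part of the original header): combining a
   packed comparison with _mm_movemask_ps gives a cheap "any/all" test.  The
   helper name is an assumption made up for the example.

     static int any_negative (__m128 v)
     {
       return _mm_movemask_ps (_mm_cmplt_ps (v, _mm_setzero_ps ())) != 0;
     }
*/
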
/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

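/* Illustrative usage sketch (not part of the original header): temporarily
   switching the rounding mode and enabling flush-to-zero around a block of
   SSE code, then restoring the previous MXCSR state.

     unsigned int saved = _mm_getcsr ();
     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
     ... computation that wants these modes ...
     _mm_setcsr (saved);
*/
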
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return *(__m128 *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return *(__m128_u *)__P;
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

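/* Illustrative note (not part of the original header): _mm_set_ps lists its
   arguments from the highest element down to element 0, while _mm_setr_ps
   lists them in memory order.  For example:

     __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
     __m128 b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);

   a and b hold the same value, with element 0 equal to 1.0f in both.  */
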
/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
                                     __extension__
                                     (__attribute__((__vector_size__ (16))) int)
                                     {4,1,2,3});
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \
                                        (int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
#ifdef __MMX_WITH_SSE__
  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
     64:127 at address __P.  */
  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
  /* Zero-extend __A and __N to 128 bits.  */
  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };

  /* Check the alignment of __P.  */
  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
  if (offset)
    {
      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
         Otherwise, subtract __P by the misalignment.  */
      if (offset > 8)
        offset = 8;
      __P = (char *) (((__SIZE_TYPE__) __P) - offset);

      /* Shift __A128 and __N128 to the left by the adjustment.  */
      switch (offset)
        {
        case 1:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
          break;
        case 2:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
          break;
        case 3:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
          break;
        case 4:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
          break;
        case 5:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
          break;
        case 6:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
          break;
        case 7:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
          break;
        case 8:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
          break;
        default:
          break;
        }
    }
  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
#else
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
#endif
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

6f1a6c5b 1238/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1359ef39 1239extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1240_mm_avg_pu8 (__m64 __A, __m64 __B)
1241{
1242 return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
1243}
1244
1359ef39 1245extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
c220e3a9
L
1246_m_pavgb (__m64 __A, __m64 __B)
1247{
1248 return _mm_avg_pu8 (__A, __B);
1249}
1250
6f1a6c5b 1251/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1359ef39 1252extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1253_mm_avg_pu16 (__m64 __A, __m64 __B)
1254{
1255 return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
1256}
1257
1359ef39 1258extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
c220e3a9
L
1259_m_pavgw (__m64 __A, __m64 __B)
1260{
1261 return _mm_avg_pu16 (__A, __B);
1262}
1263
6f1a6c5b
RH
1264/* Compute the sum of the absolute differences of the unsigned 8-bit
1265 values in A and B. Return the value in the lower 16-bit word; the
1266 upper words are cleared. */
1359ef39 1267extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1268_mm_sad_pu8 (__m64 __A, __m64 __B)
1269{
1270 return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
1271}
1272
1359ef39 1273extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
c220e3a9
L
1274_m_psadbw (__m64 __A, __m64 __B)
1275{
1276 return _mm_sad_pu8 (__A, __B);
1277}
1278
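/* Illustrative usage sketch (not part of the original header): the sum of
   absolute differences is a classic building block for block matching.  The
   helper name is an assumption made up for the example, and it assumes the
   two 8-byte blocks are suitably aligned for an __m64 load.

     static int block_sad8 (const unsigned char *p, const unsigned char *q)
     {
       __m64 a = *(const __m64 *) p;
       __m64 b = *(const __m64 *) q;
       return _mm_cvtsi64_si32 (_mm_sad_pu8 (a, b));
     }
*/
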
6f1a6c5b 1279/* Stores the data in A to the address P without polluting the caches. */
1359ef39 1280extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1281_mm_stream_pi (__m64 *__P, __m64 __A)
1282{
f8ca7923 1283 __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
6f1a6c5b
RH
1284}
1285
1286/* Likewise. The address must be 16-byte aligned. */
1359ef39 1287extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1288_mm_stream_ps (float *__P, __m128 __A)
1289{
1290 __builtin_ia32_movntps (__P, (__v4sf)__A);
1291}
1292
d1f87653 1293/* Guarantees that every preceding store is globally visible before
6f1a6c5b 1294 any subsequent store. */
1359ef39 1295extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
6f1a6c5b
RH
1296_mm_sfence (void)
1297{
1298 __builtin_ia32_sfence ();
1299}
1300
6f1a6c5b
RH
1301/* Transpose the 4x4 matrix composed of row[0-3]. */
1302#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1303do { \
1304 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
be7724ed 1305 __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
3287a040
DJ
1306 __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
1307 __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
be7724ed
EC
1308 __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
1309 (row0) = __builtin_ia32_movlhps (__t0, __t1); \
1310 (row1) = __builtin_ia32_movhlps (__t1, __t0); \
1311 (row2) = __builtin_ia32_movlhps (__t2, __t3); \
1312 (row3) = __builtin_ia32_movhlps (__t3, __t2); \
6f1a6c5b
RH
1313} while (0)
1314
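/* Illustrative usage sketch (not part of the original header): transposing a
   4x4 matrix stored row-major in a 16-float array that is assumed to be
   16-byte aligned.

     void transpose4x4 (float *m)
     {
       __m128 r0 = _mm_load_ps (m + 0);
       __m128 r1 = _mm_load_ps (m + 4);
       __m128 r2 = _mm_load_ps (m + 8);
       __m128 r3 = _mm_load_ps (m + 12);
       _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
       _mm_store_ps (m + 0, r0);
       _mm_store_ps (m + 4, r1);
       _mm_store_ps (m + 8, r2);
       _mm_store_ps (m + 12, r3);
     }
*/
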
/* For backward source compatibility.  */
# include <emmintrin.h>

#ifdef __DISABLE_SSE__
#undef __DISABLE_SSE__
#pragma GCC pop_options
#endif /* __DISABLE_SSE__ */

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}

#endif /* _XMMINTRIN_H_INCLUDED */