/* Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the third bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
#endif

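/* Illustrative sketch only (not part of the original header): one way
   _mm_prefetch can be used ahead of a streaming read, as it might appear
   in user code that includes <xmmintrin.h>.  The helper name, the
   prefetch distance and the _MM_HINT_T0 choice are assumptions made for
   the example; the block is guarded by "#if 0" so the header itself is
   unchanged.  */
#if 0
static void
__example_prefetch_sum (const float *__buf, int __n, float *__out)
{
  float __sum = 0.0f;
  for (int __i = 0; __i < __n; ++__i)
    {
      /* Prefetch data roughly 64 floats ahead of the current position
         into all cache levels (_MM_HINT_T0).  */
      _mm_prefetch (__buf + __i + 64, _MM_HINT_T0);
      __sum += __buf[__i];
    }
  *__out = __sum;
}
#endif
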
#ifndef __SSE__
#pragma GCC push_options
#pragma GCC target("sse")
#define __DISABLE_SSE__
#endif /* __SSE__ */

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

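/* Illustrative sketch only (not part of the original header): the all-ones
   or all-zeros masks produced by the packed comparisons are commonly
   combined with the bit-wise operations above to build a branchless
   per-element select, as in user code that includes <xmmintrin.h>.
   Guarded by "#if 0"; the helper name is an assumption made for the
   example.  */
#if 0
/* For each element: result = (__a < __b) ? __x : __y.  */
static __m128
__example_select_lt (__m128 __a, __m128 __b, __m128 __x, __m128 __y)
{
  __m128 __mask = _mm_cmplt_ps (__a, __b);         /* all ones where __a < __b */
  return _mm_or_ps (_mm_and_ps (__mask, __x),      /* __x where the mask is set */
                    _mm_andnot_ps (__mask, __y));  /* __y elsewhere */
}
#endif
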
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

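/* Illustrative sketch only (not part of the original header): the _mm_cvt*
   forms above honor the current MXCSR rounding mode (round-to-nearest by
   default), while the _mm_cvtt* forms always truncate toward zero.  User
   code sketch, guarded by "#if 0"; the helper name and value are
   assumptions made for the example.  */
#if 0
static void
__example_convert_vs_truncate (void)
{
  __m128 __v = _mm_set_ss (2.75f);
  int __rounded = _mm_cvtss_si32 (__v);    /* 3 under the default rounding mode */
  int __truncated = _mm_cvttss_si32 (__v); /* always 2 */
  (void) __rounded;
  (void) __truncated;
}
#endif
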
/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)                              \
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),         \
                                   (__v4sf)(__m128)(B), (int)(MASK)))
#endif

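/* Illustrative sketch only (not part of the original header): _MM_SHUFFLE
   packs four 2-bit element indices into the immediate that _mm_shuffle_ps
   expects; the two low fields select elements from A, the two high fields
   from B.  Guarded by "#if 0"; the helper name is an assumption made for
   the example.  */
#if 0
static __m128
__example_swap_pairs (__m128 __a)
{
  /* Rearrange __a = {a0,a1,a2,a3} into {a1,a0,a3,a2}.  */
  return _mm_shuffle_ps (__a, __a, _MM_SHUFFLE (2, 3, 0, 1));
}
#endif
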
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

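/* Illustrative sketch only (not part of the original header): a common use
   of _mm_movemask_ps is to branch on the outcome of a packed comparison.
   Guarded by "#if 0"; the helper name is an assumption made for the
   example.  */
#if 0
/* Return nonzero if any element of __v compares less than zero.  */
static int
__example_any_negative (__m128 __v)
{
  return _mm_movemask_ps (_mm_cmplt_ps (__v, _mm_setzero_ps ())) != 0;
}
#endif
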
/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

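/* Illustrative sketch only (not part of the original header): saving the
   MXCSR rounding mode, switching to truncation and restoring it, as it
   might appear in user code that includes <xmmintrin.h>.  Guarded by
   "#if 0"; the helper name is an assumption made for the example.  */
#if 0
static void
__example_round_toward_zero_scope (void)
{
  unsigned int __saved = _MM_GET_ROUNDING_MODE ();
  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
  /* ... SSE arithmetic here rounds toward zero ... */
  _MM_SET_ROUNDING_MODE (__saved);
}
#endif
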
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return *(__m128 *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return *(__m128_u *)__P;
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

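/* Illustrative sketch only (not part of the original header): the aligned
   load/store forms require 16-byte alignment, while the unaligned forms do
   not.  User code sketch, guarded by "#if 0"; the helper name and the
   mention of _mm_malloc are assumptions made for the example.  */
#if 0
static void
__example_scale_four (float *__dst, const float *__src, float __k)
{
  /* __src and __dst have unknown alignment: use the unaligned forms.  */
  __m128 __v = _mm_loadu_ps (__src);
  _mm_storeu_ps (__dst, _mm_mul_ps (__v, _mm_set1_ps (__k)));

  /* A 16-byte aligned buffer (for instance one obtained from
     _mm_malloc (size, 16)) could use _mm_load_ps/_mm_store_ps instead.  */
}
#endif
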
/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
                                     __extension__
                                     (__attribute__((__vector_size__ (16))) int)
                                     {4,1,2,3});
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)  \
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)                                \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),     \
                                        (int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

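/* Illustrative sketch only (not part of the original header): the selector
   for _mm_shuffle_pi16 uses the same 2-bit-per-element encoding as
   _MM_SHUFFLE, here reversing the four 16-bit words.  Guarded by "#if 0";
   the helper name is an assumption made for the example.  */
#if 0
static __m64
__example_reverse_words (__m64 __a)
{
  /* {w0,w1,w2,w3} -> {w3,w2,w1,w0}.  */
  return _mm_shuffle_pi16 (__a, _MM_SHUFFLE (0, 1, 2, 3));
}
#endif
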
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

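/* Illustrative sketch only (not part of the original header): non-temporal
   stores bypass the caches, so a typical pattern is to stream a large,
   16-byte aligned buffer and issue _mm_sfence once at the end.  User code
   sketch, guarded by "#if 0"; the helper name and alignment assumptions
   are made for the example.  */
#if 0
static void
__example_stream_fill (float *__dst, int __n, __m128 __value)
{
  /* __dst is assumed 16-byte aligned and __n a multiple of 4.  */
  for (int __i = 0; __i < __n; __i += 4)
    _mm_stream_ps (__dst + __i, __value);
  _mm_sfence ();  /* make the streamed stores globally visible */
}
#endif
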
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
do {                                                                    \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);                   \
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);                   \
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);                   \
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);                   \
  (row0) = __builtin_ia32_movlhps (__t0, __t1);                         \
  (row1) = __builtin_ia32_movhlps (__t1, __t0);                         \
  (row2) = __builtin_ia32_movlhps (__t2, __t3);                         \
  (row3) = __builtin_ia32_movhlps (__t3, __t2);                         \
} while (0)

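/* Illustrative sketch only (not part of the original header): the macro
   transposes four __m128 rows in place, so afterwards row0 holds the first
   element of each original row, and so on.  User code sketch, guarded by
   "#if 0"; the helper name is an assumption made for the example.  */
#if 0
static void
__example_transpose (float __m[4][4])
{
  __m128 __row0 = _mm_loadu_ps (__m[0]);
  __m128 __row1 = _mm_loadu_ps (__m[1]);
  __m128 __row2 = _mm_loadu_ps (__m[2]);
  __m128 __row3 = _mm_loadu_ps (__m[3]);
  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
  _mm_storeu_ps (__m[0], __row0);
  _mm_storeu_ps (__m[1], __row1);
  _mm_storeu_ps (__m[2], __row2);
  _mm_storeu_ps (__m[3], __row3);
}
#endif
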
/* For backward source compatibility.  */
# include <emmintrin.h>

#ifdef __DISABLE_SSE__
#undef __DISABLE_SSE__
#pragma GCC pop_options
#endif /* __DISABLE_SSE__ */

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}

#endif /* _XMMINTRIN_H_INCLUDED */