1 /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef _XMMINTRIN_H_INCLUDED
28 #define _XMMINTRIN_H_INCLUDED
29
30 /* We need type definitions from the MMX header file. */
31 #include <mmintrin.h>
32
33 /* Get _mm_malloc () and _mm_free (). */
34 #include <mm_malloc.h>
35
36 /* Constants for use with _mm_prefetch. */
37 enum _mm_hint
38 {
39 _MM_HINT_IT0 = 19,
40 _MM_HINT_IT1 = 18,
41   /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
42 _MM_HINT_ET0 = 7,
43 _MM_HINT_ET1 = 6,
44 _MM_HINT_T0 = 3,
45 _MM_HINT_T1 = 2,
46 _MM_HINT_T2 = 1,
47 _MM_HINT_NTA = 0
48 };
49
50 /* Loads one cache line from address P to a location "closer" to the
51 processor. The selector I specifies the type of prefetch operation. */
52 #ifdef __OPTIMIZE__
53 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
54 _mm_prefetch (const void *__P, enum _mm_hint __I)
55 {
56 __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
57 __I & 0x3, (__I & 0x10) >> 4);
58 }
59 #else
60 #define _mm_prefetch(P, I) \
61 __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
62 #endif
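/* Illustrative decoding of the hint bits, matching the builtin call above:
   bit 2 selects a write (ET) prefetch, bits 0-1 give the locality level, and
   bit 4 selects an instruction (IT) prefetch.  For example, _MM_HINT_ET0
   (7 = 0b00111) decodes to write=1, locality=3, instruction=0, while
   _MM_HINT_IT0 (19 = 0b10011) decodes to write=0, locality=3, instruction=1.
   A typical data prefetch (ptr is an illustrative pointer) is simply:
     _mm_prefetch (ptr, _MM_HINT_T0);  */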
63
64 #ifndef __SSE__
65 #pragma GCC push_options
66 #pragma GCC target("sse")
67 #define __DISABLE_SSE__
68 #endif /* __SSE__ */
69
70 /* The Intel API is flexible enough that we must allow aliasing with other
71 vector types, and their scalar components. */
72 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
73
74 /* Unaligned version of the same type. */
75 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
76
77 /* Internal data types for implementing the intrinsics. */
78 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
79
80 /* Create a selector for use with the SHUFPS instruction. */
81 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
82 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
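/* Worked example, for some __m128 value __v: _MM_SHUFFLE (3, 2, 1, 0)
   evaluates to 0xE4, the identity selector, so with both operands equal the
   vector is returned unchanged, while _MM_SHUFFLE (0, 0, 0, 0) broadcasts
   element 0:
     __m128 __same  = _mm_shuffle_ps (__v, __v, _MM_SHUFFLE (3, 2, 1, 0));
     __m128 __splat = _mm_shuffle_ps (__v, __v, _MM_SHUFFLE (0, 0, 0, 0));  */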
83
84 /* Bits in the MXCSR. */
85 #define _MM_EXCEPT_MASK 0x003f
86 #define _MM_EXCEPT_INVALID 0x0001
87 #define _MM_EXCEPT_DENORM 0x0002
88 #define _MM_EXCEPT_DIV_ZERO 0x0004
89 #define _MM_EXCEPT_OVERFLOW 0x0008
90 #define _MM_EXCEPT_UNDERFLOW 0x0010
91 #define _MM_EXCEPT_INEXACT 0x0020
92
93 #define _MM_MASK_MASK 0x1f80
94 #define _MM_MASK_INVALID 0x0080
95 #define _MM_MASK_DENORM 0x0100
96 #define _MM_MASK_DIV_ZERO 0x0200
97 #define _MM_MASK_OVERFLOW 0x0400
98 #define _MM_MASK_UNDERFLOW 0x0800
99 #define _MM_MASK_INEXACT 0x1000
100
101 #define _MM_ROUND_MASK 0x6000
102 #define _MM_ROUND_NEAREST 0x0000
103 #define _MM_ROUND_DOWN 0x2000
104 #define _MM_ROUND_UP 0x4000
105 #define _MM_ROUND_TOWARD_ZERO 0x6000
106
107 #define _MM_FLUSH_ZERO_MASK 0x8000
108 #define _MM_FLUSH_ZERO_ON 0x8000
109 #define _MM_FLUSH_ZERO_OFF 0x0000
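/* Worked example: the processor's default MXCSR value of 0x1f80 decodes with
   these masks to "no exception flags set, all exceptions masked,
   round-to-nearest, flush-to-zero off":
     (0x1f80 & _MM_EXCEPT_MASK)     == 0
     (0x1f80 & _MM_MASK_MASK)       == _MM_MASK_MASK
     (0x1f80 & _MM_ROUND_MASK)      == _MM_ROUND_NEAREST
     (0x1f80 & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_OFF  */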
110
111 /* Create an undefined vector. */
112 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_undefined_ps (void)
114 {
115 #pragma GCC diagnostic push
116 #pragma GCC diagnostic ignored "-Winit-self"
117 __m128 __Y = __Y;
118 #pragma GCC diagnostic pop
119 return __Y;
120 }
121
122 /* Create a vector of zeros. */
123 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124 _mm_setzero_ps (void)
125 {
126 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
127 }
128
129 /* Perform the respective operation on the lower SPFP (single-precision
130 floating-point) values of A and B; the upper three SPFP values are
131 passed through from A. */
132
133 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_add_ss (__m128 __A, __m128 __B)
135 {
136 return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
137 }
138
139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_sub_ss (__m128 __A, __m128 __B)
141 {
142 return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
143 }
144
145 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146 _mm_mul_ss (__m128 __A, __m128 __B)
147 {
148 return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
149 }
150
151 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152 _mm_div_ss (__m128 __A, __m128 __B)
153 {
154 return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
155 }
156
157 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_sqrt_ss (__m128 __A)
159 {
160 return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
161 }
162
163 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
164 _mm_rcp_ss (__m128 __A)
165 {
166 return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
167 }
168
169 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_rsqrt_ss (__m128 __A)
171 {
172 return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
173 }
174
175 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_min_ss (__m128 __A, __m128 __B)
177 {
178 return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
179 }
180
181 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _mm_max_ss (__m128 __A, __m128 __B)
183 {
184 return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
185 }
186
187 /* Perform the respective operation on the four SPFP values in A and B. */
188
189 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190 _mm_add_ps (__m128 __A, __m128 __B)
191 {
192 return (__m128) ((__v4sf)__A + (__v4sf)__B);
193 }
194
195 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_sub_ps (__m128 __A, __m128 __B)
197 {
198 return (__m128) ((__v4sf)__A - (__v4sf)__B);
199 }
200
201 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_mul_ps (__m128 __A, __m128 __B)
203 {
204 return (__m128) ((__v4sf)__A * (__v4sf)__B);
205 }
206
207 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_div_ps (__m128 __A, __m128 __B)
209 {
210 return (__m128) ((__v4sf)__A / (__v4sf)__B);
211 }
212
213 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214 _mm_sqrt_ps (__m128 __A)
215 {
216 return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
217 }
218
219 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
220 _mm_rcp_ps (__m128 __A)
221 {
222 return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
223 }
224
225 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
226 _mm_rsqrt_ps (__m128 __A)
227 {
228 return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
229 }
230
231 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
232 _mm_min_ps (__m128 __A, __m128 __B)
233 {
234 return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
235 }
236
237 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_max_ps (__m128 __A, __m128 __B)
239 {
240 return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
241 }
242
243 /* Perform logical bit-wise operations on 128-bit values. */
244
245 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246 _mm_and_ps (__m128 __A, __m128 __B)
247 {
248 return __builtin_ia32_andps (__A, __B);
249 }
250
251 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252 _mm_andnot_ps (__m128 __A, __m128 __B)
253 {
254 return __builtin_ia32_andnps (__A, __B);
255 }
256
257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258 _mm_or_ps (__m128 __A, __m128 __B)
259 {
260 return __builtin_ia32_orps (__A, __B);
261 }
262
263 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264 _mm_xor_ps (__m128 __A, __m128 __B)
265 {
266 return __builtin_ia32_xorps (__A, __B);
267 }
268
269 /* Perform a comparison on the lower SPFP values of A and B. If the
270 comparison is true, place a mask of all ones in the result, otherwise a
271 mask of zeros. The upper three SPFP values are passed through from A. */
272
273 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
274 _mm_cmpeq_ss (__m128 __A, __m128 __B)
275 {
276 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
277 }
278
279 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
280 _mm_cmplt_ss (__m128 __A, __m128 __B)
281 {
282 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
283 }
284
285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
286 _mm_cmple_ss (__m128 __A, __m128 __B)
287 {
288 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
289 }
290
291 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
292 _mm_cmpgt_ss (__m128 __A, __m128 __B)
293 {
294 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
295 (__v4sf)
296 __builtin_ia32_cmpltss ((__v4sf) __B,
297 (__v4sf)
298 __A));
299 }
300
301 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 _mm_cmpge_ss (__m128 __A, __m128 __B)
303 {
304 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
305 (__v4sf)
306 __builtin_ia32_cmpless ((__v4sf) __B,
307 (__v4sf)
308 __A));
309 }
310
311 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 _mm_cmpneq_ss (__m128 __A, __m128 __B)
313 {
314 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
315 }
316
317 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
319 {
320 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
321 }
322
323 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
324 _mm_cmpnle_ss (__m128 __A, __m128 __B)
325 {
326 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
327 }
328
329 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
330 _mm_cmpngt_ss (__m128 __A, __m128 __B)
331 {
332 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
333 (__v4sf)
334 __builtin_ia32_cmpnltss ((__v4sf) __B,
335 (__v4sf)
336 __A));
337 }
338
339 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340 _mm_cmpnge_ss (__m128 __A, __m128 __B)
341 {
342 return (__m128) __builtin_ia32_movss ((__v4sf) __A,
343 (__v4sf)
344 __builtin_ia32_cmpnless ((__v4sf) __B,
345 (__v4sf)
346 __A));
347 }
348
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_cmpord_ss (__m128 __A, __m128 __B)
351 {
352 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
353 }
354
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_cmpunord_ss (__m128 __A, __m128 __B)
357 {
358 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
359 }
360
361 /* Perform a comparison on the four SPFP values of A and B. For each
362 element, if the comparison is true, place a mask of all ones in the
363 result, otherwise a mask of zeros. */
364
365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
366 _mm_cmpeq_ps (__m128 __A, __m128 __B)
367 {
368 return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
369 }
370
371 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
372 _mm_cmplt_ps (__m128 __A, __m128 __B)
373 {
374 return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
375 }
376
377 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
378 _mm_cmple_ps (__m128 __A, __m128 __B)
379 {
380 return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
381 }
382
383 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
384 _mm_cmpgt_ps (__m128 __A, __m128 __B)
385 {
386 return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
387 }
388
389 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
390 _mm_cmpge_ps (__m128 __A, __m128 __B)
391 {
392 return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
393 }
394
395 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
396 _mm_cmpneq_ps (__m128 __A, __m128 __B)
397 {
398 return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
399 }
400
401 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
402 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
403 {
404 return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
405 }
406
407 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
408 _mm_cmpnle_ps (__m128 __A, __m128 __B)
409 {
410 return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
411 }
412
413 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
414 _mm_cmpngt_ps (__m128 __A, __m128 __B)
415 {
416 return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
417 }
418
419 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420 _mm_cmpnge_ps (__m128 __A, __m128 __B)
421 {
422 return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
423 }
424
425 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426 _mm_cmpord_ps (__m128 __A, __m128 __B)
427 {
428 return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
429 }
430
431 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
432 _mm_cmpunord_ps (__m128 __A, __m128 __B)
433 {
434 return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
435 }
436
437 /* Compare the lower SPFP values of A and B and return 1 if true
438 and 0 if false. */
439
440 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_comieq_ss (__m128 __A, __m128 __B)
442 {
443 return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
444 }
445
446 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 _mm_comilt_ss (__m128 __A, __m128 __B)
448 {
449 return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
450 }
451
452 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453 _mm_comile_ss (__m128 __A, __m128 __B)
454 {
455 return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
456 }
457
458 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm_comigt_ss (__m128 __A, __m128 __B)
460 {
461 return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
462 }
463
464 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_comige_ss (__m128 __A, __m128 __B)
466 {
467 return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
468 }
469
470 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _mm_comineq_ss (__m128 __A, __m128 __B)
472 {
473 return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
474 }
475
476 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477 _mm_ucomieq_ss (__m128 __A, __m128 __B)
478 {
479 return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
480 }
481
482 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_ucomilt_ss (__m128 __A, __m128 __B)
484 {
485 return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
486 }
487
488 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_ucomile_ss (__m128 __A, __m128 __B)
490 {
491 return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
492 }
493
494 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_ucomigt_ss (__m128 __A, __m128 __B)
496 {
497 return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
498 }
499
500 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_ucomige_ss (__m128 __A, __m128 __B)
502 {
503 return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
504 }
505
506 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_ucomineq_ss (__m128 __A, __m128 __B)
508 {
509 return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
510 }
511
512 /* Convert the lower SPFP value to a 32-bit integer according to the current
513 rounding mode. */
514 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515 _mm_cvtss_si32 (__m128 __A)
516 {
517 return __builtin_ia32_cvtss2si ((__v4sf) __A);
518 }
519
520 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521 _mm_cvt_ss2si (__m128 __A)
522 {
523 return _mm_cvtss_si32 (__A);
524 }
525
526 #ifdef __x86_64__
527 /* Convert the lower SPFP value to a 32-bit integer according to the
528 current rounding mode. */
529
530 /* Intel intrinsic. */
531 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 _mm_cvtss_si64 (__m128 __A)
533 {
534 return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
535 }
536
537 /* Microsoft intrinsic. */
538 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539 _mm_cvtss_si64x (__m128 __A)
540 {
541 return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
542 }
543 #endif
544
545 /* Convert the two lower SPFP values to 32-bit integers according to the
546 current rounding mode. Return the integers in packed form. */
547 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
548 _mm_cvtps_pi32 (__m128 __A)
549 {
550 return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
551 }
552
553 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554 _mm_cvt_ps2pi (__m128 __A)
555 {
556 return _mm_cvtps_pi32 (__A);
557 }
558
559 /* Truncate the lower SPFP value to a 32-bit integer. */
560 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
561 _mm_cvttss_si32 (__m128 __A)
562 {
563 return __builtin_ia32_cvttss2si ((__v4sf) __A);
564 }
565
566 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
567 _mm_cvtt_ss2si (__m128 __A)
568 {
569 return _mm_cvttss_si32 (__A);
570 }
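/* Illustrative difference between the rounding and truncating conversions,
   assuming the default round-to-nearest mode:
     _mm_cvtss_si32  (_mm_set_ss (1.5f))   ->  2   (rounds to nearest even)
     _mm_cvttss_si32 (_mm_set_ss (1.5f))   ->  1   (truncates toward zero)
     _mm_cvttss_si32 (_mm_set_ss (-1.5f))  -> -1  */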
571
572 #ifdef __x86_64__
573 /* Truncate the lower SPFP value to a 32-bit integer. */
574
575 /* Intel intrinsic. */
576 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cvttss_si64 (__m128 __A)
578 {
579 return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
580 }
581
582 /* Microsoft intrinsic. */
583 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
584 _mm_cvttss_si64x (__m128 __A)
585 {
586 return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
587 }
588 #endif
589
590 /* Truncate the two lower SPFP values to 32-bit integers. Return the
591 integers in packed form. */
592 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 _mm_cvttps_pi32 (__m128 __A)
594 {
595 return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
596 }
597
598 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _mm_cvtt_ps2pi (__m128 __A)
600 {
601 return _mm_cvttps_pi32 (__A);
602 }
603
604 /* Convert B to a SPFP value and insert it as element zero in A. */
605 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _mm_cvtsi32_ss (__m128 __A, int __B)
607 {
608 return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
609 }
610
611 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
612 _mm_cvt_si2ss (__m128 __A, int __B)
613 {
614 return _mm_cvtsi32_ss (__A, __B);
615 }
616
617 #ifdef __x86_64__
618 /* Convert B to a SPFP value and insert it as element zero in A. */
619
620 /* Intel intrinsic. */
621 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
622 _mm_cvtsi64_ss (__m128 __A, long long __B)
623 {
624 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
625 }
626
627 /* Microsoft intrinsic. */
628 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _mm_cvtsi64x_ss (__m128 __A, long long __B)
630 {
631 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
632 }
633 #endif
634
635 /* Convert the two 32-bit values in B to SPFP form and insert them
636 as the two lower elements in A. */
637 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
638 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
639 {
640 return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
641 }
642
643 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
645 {
646 return _mm_cvtpi32_ps (__A, __B);
647 }
648
649 /* Convert the four signed 16-bit values in A to SPFP form. */
650 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
651 _mm_cvtpi16_ps (__m64 __A)
652 {
653 __v4hi __sign;
654 __v2si __hisi, __losi;
655 __v4sf __zero, __ra, __rb;
656
657 /* This comparison against zero gives us a mask that can be used to
658 fill in the missing sign bits in the unpack operations below, so
659 that we get signed values after unpacking. */
660 __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
661
662 /* Convert the four words to doublewords. */
663 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
664 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
665
666 /* Convert the doublewords to floating point two at a time. */
667 __zero = (__v4sf) _mm_setzero_ps ();
668 __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
669 __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
670
671 return (__m128) __builtin_ia32_movlhps (__ra, __rb);
672 }
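/* Worked example of the sign-extension trick used above: for the input word
   0xfffb (-5), pcmpgtw against zero yields the mask word 0xffff, and the
   unpack pairs the two into the doubleword 0xfffffffb, i.e. -5 as a signed
   32-bit value; a non-negative word such as 0x0003 pairs with 0x0000 and
   stays 3.  */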
673
674 /* Convert the four unsigned 16-bit values in A to SPFP form. */
675 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676 _mm_cvtpu16_ps (__m64 __A)
677 {
678 __v2si __hisi, __losi;
679 __v4sf __zero, __ra, __rb;
680
681 /* Convert the four words to doublewords. */
682 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
683 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
684
685 /* Convert the doublewords to floating point two at a time. */
686 __zero = (__v4sf) _mm_setzero_ps ();
687 __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
688 __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
689
690 return (__m128) __builtin_ia32_movlhps (__ra, __rb);
691 }
692
693 /* Convert the low four signed 8-bit values in A to SPFP form. */
694 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_cvtpi8_ps (__m64 __A)
696 {
697 __v8qi __sign;
698
699 /* This comparison against zero gives us a mask that can be used to
700 fill in the missing sign bits in the unpack operations below, so
701 that we get signed values after unpacking. */
702 __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
703
704 /* Convert the four low bytes to words. */
705 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
706
707 return _mm_cvtpi16_ps(__A);
708 }
709
710 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
711 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
712 _mm_cvtpu8_ps(__m64 __A)
713 {
714 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
715 return _mm_cvtpu16_ps(__A);
716 }
717
718 /* Convert the four signed 32-bit values in A and B to SPFP form. */
719 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
720 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
721 {
722 __v4sf __zero = (__v4sf) _mm_setzero_ps ();
723 __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
724 __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
725 return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
726 }
727
728 /* Convert the four SPFP values in A to four signed 16-bit integers. */
729 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730 _mm_cvtps_pi16(__m128 __A)
731 {
732 __v4sf __hisf = (__v4sf)__A;
733 __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
734 __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
735 __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
736 return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
737 }
738
739 /* Convert the four SPFP values in A to four signed 8-bit integers. */
740 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 _mm_cvtps_pi8(__m128 __A)
742 {
743 __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
744 return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
745 }
746
747 /* Selects four specific SPFP values from A and B based on MASK. */
748 #ifdef __OPTIMIZE__
749 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
750 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
751 {
752 return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
753 }
754 #else
755 #define _mm_shuffle_ps(A, B, MASK) \
756 ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \
757 (__v4sf)(__m128)(B), (int)(MASK)))
758 #endif
759
760 /* Selects and interleaves the upper two SPFP values from A and B. */
761 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
762 _mm_unpackhi_ps (__m128 __A, __m128 __B)
763 {
764 return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
765 }
766
767 /* Selects and interleaves the lower two SPFP values from A and B. */
768 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
769 _mm_unpacklo_ps (__m128 __A, __m128 __B)
770 {
771 return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
772 }
773
774 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
775 the lower two values are passed through from A. */
776 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
777 _mm_loadh_pi (__m128 __A, __m64 const *__P)
778 {
779 return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
780 }
781
782 /* Stores the upper two SPFP values of A into P. */
783 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784 _mm_storeh_pi (__m64 *__P, __m128 __A)
785 {
786 __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
787 }
788
789 /* Moves the upper two values of B into the lower two values of A. */
790 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_movehl_ps (__m128 __A, __m128 __B)
792 {
793 return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
794 }
795
796 /* Moves the lower two values of B into the upper two values of A. */
797 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
798 _mm_movelh_ps (__m128 __A, __m128 __B)
799 {
800 return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
801 }
802
803 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
804 the upper two values are passed through from A. */
805 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_loadl_pi (__m128 __A, __m64 const *__P)
807 {
808 return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
809 }
810
811 /* Stores the lower two SPFP values of A into P. */
812 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_storel_pi (__m64 *__P, __m128 __A)
814 {
815 __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
816 }
817
818 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
819 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
820 _mm_movemask_ps (__m128 __A)
821 {
822 return __builtin_ia32_movmskps ((__v4sf)__A);
823 }
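/* Illustrative example: bit N of the result is the sign bit of element N, so
     _mm_movemask_ps (_mm_set_ps (4.0f, -3.0f, 2.0f, -1.0f))
   returns 0b0101 == 5 (elements 0 and 2 are negative).  */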
824
825 /* Return the contents of the control register. */
826 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 _mm_getcsr (void)
828 {
829 return __builtin_ia32_stmxcsr ();
830 }
831
832 /* Read exception bits from the control register. */
833 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _MM_GET_EXCEPTION_STATE (void)
835 {
836 return _mm_getcsr() & _MM_EXCEPT_MASK;
837 }
838
839 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840 _MM_GET_EXCEPTION_MASK (void)
841 {
842 return _mm_getcsr() & _MM_MASK_MASK;
843 }
844
845 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _MM_GET_ROUNDING_MODE (void)
847 {
848 return _mm_getcsr() & _MM_ROUND_MASK;
849 }
850
851 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
852 _MM_GET_FLUSH_ZERO_MODE (void)
853 {
854 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
855 }
856
857 /* Set the control register to I. */
858 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _mm_setcsr (unsigned int __I)
860 {
861 __builtin_ia32_ldmxcsr (__I);
862 }
863
864 /* Set exception bits in the control register. */
865 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
866 _MM_SET_EXCEPTION_STATE(unsigned int __mask)
867 {
868 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
869 }
870
871 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872 _MM_SET_EXCEPTION_MASK (unsigned int __mask)
873 {
874 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
875 }
876
877 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878 _MM_SET_ROUNDING_MODE (unsigned int __mode)
879 {
880 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
881 }
882
883 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884 _MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
885 {
886 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
887 }
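/* A minimal save/modify/restore sketch using the helpers above; __saved is
   an illustrative local, not part of the API:
     unsigned int __saved = _mm_getcsr ();
     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     ... SSE code that should truncate ...
     _mm_setcsr (__saved);  */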
888
889 /* Create a vector with element 0 as F and the rest zero. */
890 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
891 _mm_set_ss (float __F)
892 {
893 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
894 }
895
896 /* Create a vector with all four elements equal to F. */
897 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
898 _mm_set1_ps (float __F)
899 {
900 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
901 }
902
903 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_set_ps1 (float __F)
905 {
906 return _mm_set1_ps (__F);
907 }
908
909 /* Create a vector with element 0 as *P and the rest zero. */
910 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911 _mm_load_ss (float const *__P)
912 {
913 return _mm_set_ss (*__P);
914 }
915
916 /* Create a vector with all four elements equal to *P. */
917 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
918 _mm_load1_ps (float const *__P)
919 {
920 return _mm_set1_ps (*__P);
921 }
922
923 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
924 _mm_load_ps1 (float const *__P)
925 {
926 return _mm_load1_ps (__P);
927 }
928
929 /* Load four SPFP values from P. The address must be 16-byte aligned. */
930 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
931 _mm_load_ps (float const *__P)
932 {
933 return *(__m128 *)__P;
934 }
935
936 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
937 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_loadu_ps (float const *__P)
939 {
940 return *(__m128_u *)__P;
941 }
942
943 /* Load four SPFP values in reverse order. The address must be aligned. */
944 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_loadr_ps (float const *__P)
946 {
947 __v4sf __tmp = *(__v4sf *)__P;
948 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
949 }
950
951 /* Create the vector [Z Y X W]. */
952 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
953 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
954 {
955 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
956 }
957
958 /* Create the vector [W X Y Z]. */
959 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
961 {
962 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
963 }
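/* Example of the argument order: both of the following build the vector
   whose element 0 (lowest memory address) is 0.0f and element 3 is 3.0f:
     _mm_set_ps  (3.0f, 2.0f, 1.0f, 0.0f);
     _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);  */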
964
965 /* Stores the lower SPFP value. */
966 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_store_ss (float *__P, __m128 __A)
968 {
969 *__P = ((__v4sf)__A)[0];
970 }
971
972 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
973 _mm_cvtss_f32 (__m128 __A)
974 {
975 return ((__v4sf)__A)[0];
976 }
977
978 /* Store four SPFP values. The address must be 16-byte aligned. */
979 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
980 _mm_store_ps (float *__P, __m128 __A)
981 {
982 *(__m128 *)__P = __A;
983 }
984
985 /* Store four SPFP values. The address need not be 16-byte aligned. */
986 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
987 _mm_storeu_ps (float *__P, __m128 __A)
988 {
989 *(__m128_u *)__P = __A;
990 }
991
992 /* Store the lower SPFP value across four words. */
993 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
994 _mm_store1_ps (float *__P, __m128 __A)
995 {
996 __v4sf __va = (__v4sf)__A;
997 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
998 _mm_storeu_ps (__P, __tmp);
999 }
1000
1001 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002 _mm_store_ps1 (float *__P, __m128 __A)
1003 {
1004 _mm_store1_ps (__P, __A);
1005 }
1006
1007 /* Store four SPFP values in reverse order. The address must be aligned. */
1008 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009 _mm_storer_ps (float *__P, __m128 __A)
1010 {
1011 __v4sf __va = (__v4sf)__A;
1012 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
1013 _mm_store_ps (__P, __tmp);
1014 }
1015
1016 /* Sets the low SPFP value of A from the low value of B. */
1017 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1018 _mm_move_ss (__m128 __A, __m128 __B)
1019 {
1020 return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
1021 __extension__
1022 (__attribute__((__vector_size__ (16))) int)
1023 {4,1,2,3});
1024 }
1025
1026 /* Extracts one of the four words of A. The selector N must be immediate. */
1027 #ifdef __OPTIMIZE__
1028 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1029 _mm_extract_pi16 (__m64 const __A, int const __N)
1030 {
1031 return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
1032 }
1033
1034 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035 _m_pextrw (__m64 const __A, int const __N)
1036 {
1037 return _mm_extract_pi16 (__A, __N);
1038 }
1039 #else
1040 #define _mm_extract_pi16(A, N) \
1041 ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))
1042
1043 #define _m_pextrw(A, N) _mm_extract_pi16(A, N)
1044 #endif
1045
1046 /* Inserts word D into one of four words of A. The selector N must be
1047 immediate. */
1048 #ifdef __OPTIMIZE__
1049 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1051 {
1052 return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
1053 }
1054
1055 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1057 {
1058 return _mm_insert_pi16 (__A, __D, __N);
1059 }
1060 #else
1061 #define _mm_insert_pi16(A, D, N) \
1062 ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \
1063 (int)(D), (int)(N)))
1064
1065 #define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
1066 #endif
1067
1068 /* Compute the element-wise maximum of signed 16-bit values. */
1069 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070 _mm_max_pi16 (__m64 __A, __m64 __B)
1071 {
1072 return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
1073 }
1074
1075 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076 _m_pmaxsw (__m64 __A, __m64 __B)
1077 {
1078 return _mm_max_pi16 (__A, __B);
1079 }
1080
1081 /* Compute the element-wise maximum of unsigned 8-bit values. */
1082 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083 _mm_max_pu8 (__m64 __A, __m64 __B)
1084 {
1085 return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
1086 }
1087
1088 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089 _m_pmaxub (__m64 __A, __m64 __B)
1090 {
1091 return _mm_max_pu8 (__A, __B);
1092 }
1093
1094 /* Compute the element-wise minimum of signed 16-bit values. */
1095 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_min_pi16 (__m64 __A, __m64 __B)
1097 {
1098 return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
1099 }
1100
1101 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _m_pminsw (__m64 __A, __m64 __B)
1103 {
1104 return _mm_min_pi16 (__A, __B);
1105 }
1106
1107 /* Compute the element-wise minimum of unsigned 8-bit values. */
1108 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1109 _mm_min_pu8 (__m64 __A, __m64 __B)
1110 {
1111 return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
1112 }
1113
1114 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1115 _m_pminub (__m64 __A, __m64 __B)
1116 {
1117 return _mm_min_pu8 (__A, __B);
1118 }
1119
1120 /* Create an 8-bit mask of the signs of 8-bit values. */
1121 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_movemask_pi8 (__m64 __A)
1123 {
1124 return __builtin_ia32_pmovmskb ((__v8qi)__A);
1125 }
1126
1127 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128 _m_pmovmskb (__m64 __A)
1129 {
1130 return _mm_movemask_pi8 (__A);
1131 }
1132
1133 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1134 in B and produce the high 16 bits of the 32-bit results. */
1135 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1136 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1137 {
1138 return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
1139 }
1140
1141 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _m_pmulhuw (__m64 __A, __m64 __B)
1143 {
1144 return _mm_mulhi_pu16 (__A, __B);
1145 }
1146
1147 /* Return a combination of the four 16-bit values in A. The selector
1148 must be an immediate. */
1149 #ifdef __OPTIMIZE__
1150 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 _mm_shuffle_pi16 (__m64 __A, int const __N)
1152 {
1153 return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
1154 }
1155
1156 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1157 _m_pshufw (__m64 __A, int const __N)
1158 {
1159 return _mm_shuffle_pi16 (__A, __N);
1160 }
1161 #else
1162 #define _mm_shuffle_pi16(A, N) \
1163 ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
1164
1165 #define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
1166 #endif
1167
1168 /* Conditionally store byte elements of A into P. The high bit of each
1169 byte in the selector N determines whether the corresponding byte from
1170 A is stored. */
1171 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1173 {
1174 #ifdef __MMX_WITH_SSE__
1175 /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
1176 64:127 at address __P. */
1177 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
1178 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
1179 /* Zero-extend __A and __N to 128 bits. */
1180 __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
1181 __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
1182
1183 /* Check the alignment of __P. */
1184 __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
1185 if (offset)
1186 {
1187       /* If __P is misaligned by more than 8 bytes, move __P back by
1188 8 bytes; otherwise move __P back by the misalignment.  */
1189 if (offset > 8)
1190 offset = 8;
1191 __P = (char *) (((__SIZE_TYPE__) __P) - offset);
1192
1193 /* Shift __A128 and __N128 to the left by the adjustment. */
1194 switch (offset)
1195 {
1196 case 1:
1197 __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
1198 __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
1199 break;
1200 case 2:
1201 __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
1202 __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
1203 break;
1204 case 3:
1205 __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
1206 __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
1207 break;
1208 case 4:
1209 __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
1210 __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
1211 break;
1212 case 5:
1213 __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
1214 __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
1215 break;
1216 case 6:
1217 __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
1218 __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
1219 break;
1220 case 7:
1221 __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
1222 __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
1223 break;
1224 case 8:
1225 __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
1226 __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
1227 break;
1228 default:
1229 break;
1230 }
1231 }
1232 __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
1233 #else
1234 __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
1235 #endif
1236 }
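/* Illustrative use, assuming an 8-byte buffer __buf; the mask selects the
   low four bytes, so only __buf[0..3] are written:
     __m64 __data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __mask = _mm_set_pi8 (0, 0, 0, 0, -1, -1, -1, -1);
     _mm_maskmove_si64 (__data, __mask, __buf);  */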
1237
1238 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1240 {
1241 _mm_maskmove_si64 (__A, __N, __P);
1242 }
1243
1244 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1245 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1246 _mm_avg_pu8 (__m64 __A, __m64 __B)
1247 {
1248 return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
1249 }
1250
1251 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252 _m_pavgb (__m64 __A, __m64 __B)
1253 {
1254 return _mm_avg_pu8 (__A, __B);
1255 }
1256
1257 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1258 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259 _mm_avg_pu16 (__m64 __A, __m64 __B)
1260 {
1261 return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
1262 }
1263
1264 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _m_pavgw (__m64 __A, __m64 __B)
1266 {
1267 return _mm_avg_pu16 (__A, __B);
1268 }
1269
1270 /* Compute the sum of the absolute differences of the unsigned 8-bit
1271 values in A and B. Return the value in the lower 16-bit word; the
1272 upper words are cleared. */
1273 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_sad_pu8 (__m64 __A, __m64 __B)
1275 {
1276 return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
1277 }
1278
1279 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _m_psadbw (__m64 __A, __m64 __B)
1281 {
1282 return _mm_sad_pu8 (__A, __B);
1283 }
1284
1285 /* Stores the data in A to the address P without polluting the caches. */
1286 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_stream_pi (__m64 *__P, __m64 __A)
1288 {
1289 __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
1290 }
1291
1292 /* Likewise. The address must be 16-byte aligned. */
1293 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1294 _mm_stream_ps (float *__P, __m128 __A)
1295 {
1296 __builtin_ia32_movntps (__P, (__v4sf)__A);
1297 }
1298
1299 /* Guarantees that every preceding store is globally visible before
1300 any subsequent store. */
1301 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1302 _mm_sfence (void)
1303 {
1304 __builtin_ia32_sfence ();
1305 }
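/* Typical pattern: flush a sequence of non-temporal stores with an sfence
   before the data is handed to another agent.  A sketch, assuming SRC and
   DST are 16-byte aligned and N is a multiple of 4:
     for (i = 0; i < N; i += 4)
       _mm_stream_ps (&DST[i], _mm_load_ps (&SRC[i]));
     _mm_sfence ();  */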
1306
1307 /* Transpose the 4x4 matrix composed of row[0-3]. */
1308 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1309 do { \
1310 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1311 __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
1312 __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
1313 __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
1314 __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
1315 (row0) = __builtin_ia32_movlhps (__t0, __t1); \
1316 (row1) = __builtin_ia32_movhlps (__t1, __t0); \
1317 (row2) = __builtin_ia32_movlhps (__t2, __t3); \
1318 (row3) = __builtin_ia32_movhlps (__t3, __t2); \
1319 } while (0)
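/* A minimal usage sketch, assuming a 16-byte-aligned matrix __mat:
     float __mat[4][4] __attribute__ ((aligned (16)));
     __m128 __r0 = _mm_load_ps (__mat[0]), __r1 = _mm_load_ps (__mat[1]);
     __m128 __r2 = _mm_load_ps (__mat[2]), __r3 = _mm_load_ps (__mat[3]);
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
   after which __r0 holds the first column of __mat, __r1 the second, and
   so on.  */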
1320
1321 /* For backward source compatibility. */
1322 # include <emmintrin.h>
1323
1324 #ifdef __DISABLE_SSE__
1325 #undef __DISABLE_SSE__
1326 #pragma GCC pop_options
1327 #endif /* __DISABLE_SSE__ */
1328
1329 /* The execution of the next instruction is delayed by an implementation
1330 specific amount of time. The instruction does not modify the
1331 architectural state. This is after the pop_options pragma because
1332 it does not require SSE support in the processor--the encoding is a
1333 nop on processors that do not support it. */
1334 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335 _mm_pause (void)
1336 {
1337 __builtin_ia32_pause ();
1338 }
1339
1340 #endif /* _XMMINTRIN_H_INCLUDED */