/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in data format and placement of float scalars in the
   vector register: for PowerISA, scalar floats in FPRs (the leftmost
   64 bits of the low 32 VSRs) are kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   APIs.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
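
/* Illustrative sketch (not part of the original header): code that pokes
   the x86 MXCSR directly, e.g. to force round-toward-zero, can usually be
   rewritten with the portable <fenv.h> interfaces, which behave the same
   way on PowerPC:

     #include <fenv.h>

     int saved = fegetround ();       // save the current rounding mode
     fesetround (FE_TOWARDZERO);      // equivalent of MXCSR RC = 11b
     ...                              // rounding-sensitive code
     fesetround (saved);              // restore the previous mode

   fegetround/fesetround are standard C99, so no target-specific register
   access is required.  */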
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
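
/* Illustrative sketch (not part of the original header): _MM_SHUFFLE packs
   four 2-bit element selectors into the 8-bit immediate consumed by
   _mm_shuffle_ps and _mm_shuffle_pi16.  For example:

     __m128 a = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);   // a = {0,1,2,3}
     __m128 r = _mm_shuffle_ps (a, a, _MM_SHUFFLE (0, 1, 2, 3));
     // _MM_SHUFFLE(0,1,2,3) == 0x1B, so r = {3,2,1,0}: result element 0 is
     // a[3], element 1 is a[2], element 2 is a[1], element 3 is a[0].  */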
/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
                                 (defined(__STDC_VERSION__) && \
                                  __STDC_VERSION__ >= 201112L))
#define __APPLE_ALTIVEC__
#endif

#include <altivec.h>
/* assert () is used below to check pointer alignment in the aligned
   load/store intrinsics.  */
#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}
/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}
/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}
/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}
/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __m128 __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}
/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}
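
/* Illustrative sketch (not part of the original header): the _ss forms only
   touch element 0 and copy elements 1-3 from the first operand, e.g.

     __m128 a = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);  // {10,20,30,40}
     __m128 b = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);      // {1,2,3,4}
     __m128 r = _mm_add_ss (a, b);                        // {11,20,30,40}

   The splat-before-operate dance above exists only so that garbage in the
   upper lanes of the VSX register cannot raise spurious FP exceptions while
   producing that single-lane result.  */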
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper double values) we splat the lower double
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}
/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper double values) we splat the lower double
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower double)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper double values) we splat the lower double
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = vec_max (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}
/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
//  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (temp, temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
/* The __mm_ucomi??_ss implementations below are exactly the same as
 * __mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically __mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The __mm_ucomieq_sd et al. should be OK, as is.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}
/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  /* On POWER8 this conversion can be done entirely in vector/scalar
     registers (an xxsldwi %x1,%x2,%x2,3 / xscvspdp %x1,%x1 sequence
     followed by a round and move to GPR); the portable form below
     simply rounds the scalar element.  */
  int res;

  res = __builtin_rint(__A[0]);
  return (res);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  /* On POWER8 this conversion can also be done in vector/scalar
     registers (xxsldwi %x1,%x2,%x2,3 / xscvspdp %x1,%x1, then round
     and move to GPR); the portable form below rounds the scalar.  */
  long long res;

  res = __builtin_llrint(__A[0]);
  return (res);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* The current PowerPC implementation ignores the hint parameter.  */
  __builtin_prefetch (__P);
}
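
/* Illustrative sketch (not part of the original header): on x86 the hint
   selects a cache level; here every hint degenerates to the same
   __builtin_prefetch call, so the two calls below emit identical code:

     _mm_prefetch (ptr, _MM_HINT_T0);   // "prefetch into L1" on x86
     _mm_prefetch (ptr, _MM_HINT_NTA);  // "non-temporal" hint on x86

   Code that relies on non-temporal behaviour for correctness (it should
   not) needs to be reviewed when porting.  */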
/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector __m64 result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector __m64) vec_cts (rounded, 0);

  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}
/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* truncate to 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* truncate to 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* truncate to 64-bit integer and return.  */
  return temp;
}
/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector __m64 result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector __m64) vec_cts (temp, 0);

  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}
/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector __m64)
          { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
  vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
  vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
  vi4 = (__vector unsigned int) vec_vmrghh (vs8,
                                             (__vector unsigned short) zero);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}
/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector __m64 result;

  rounded = vec_rint(__A);
  temp = vec_cts (rounded, 0);
  result = (__vector __m64) vec_pack (temp, temp);

  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
}
/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;
  __m64 result;

  rounded = vec_rint(__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);

  return (result);
}
/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __vector unsigned int t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
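
/* Illustrative sketch (not part of the original header): each 2-bit field of
   __mask picks one 4-byte lane, and the table above turns it into four
   consecutive byte indices for vec_perm.  E.g. with
   __mask = _MM_SHUFFLE (2, 3, 0, 1) in the little-endian case:

     element_selector_10 = 1  ->  bytes 0x07060504 (lane 1 of __A)
     element_selector_32 = 0  ->  bytes 0x03020100 (lane 0 of __A)
     element_selector_54 = 3  ->  bytes 0x0F0E0D0C + 0x10101010 (lane 3 of __B)
     element_selector_76 = 2  ->  bytes 0x0B0A0908 + 0x10101010 (lane 2 of __B)

   so the result is { __A[1], __A[0], __B[3], __B[2] }.  */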
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector __m64 __a = (__vector __m64)__A;
  __vector __m64 __p = vec_splats(*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector __m64 __a = (__vector __m64) __A;

  *__P = __a [1];
}
/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
}
/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector __m64 __a = (__vector __m64)__A;
  __vector __m64 __p = vec_splats(*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector __m64 __a = (__vector __m64) __A;

  *__P = __a [0];
}
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
#ifdef _ARCH_PWR8

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  __vector __m64 result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
        0x00204060, 0x80808080, 0x80808080, 0x80808080
#elif __BIG_ENDIAN__
        0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
                                         (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
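
/* Illustrative sketch (not part of the original header): _mm_movemask_ps
   packs the sign bit of each float lane into bits 0-3 of the result
   (bit 0 = element 0), e.g.

     __m128 v = _mm_set_ps (-4.0f, 3.0f, -2.0f, 1.0f);  // {1,-2,3,-4}
     int m = _mm_movemask_ps (v);                       // m == 0xA (0b1010)

   The perm_mask above feeds vec_vbpermq the bit positions of the four sign
   bits so the gather happens in a single VSX instruction on POWER8.  */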
/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}
/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  const int shiftr = (__N & 3) * 16;

  return ((__A >> shiftr) & 0xffff);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  const int shiftl = (__N & 3) * 16;
  const __m64 shiftD = (const __m64) __D << shiftl;
  const __m64 mask = 0xffffUL << shiftl;
  __m64 result = (__A & (~mask)) | (shiftD & mask);

  return (result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector __bool short)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}
/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector __bool char)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
      ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
          m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector __bool short)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}
/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector __bool char)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
      ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
          m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  unsigned long p = 0x0008101820283038UL; // permute control for sign bits

  return __builtin_bpermd (p, __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b;
  __vector unsigned short c;
  __vector unsigned int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);

  w0 = vec_vmuleuh (a, b);
  w1 = vec_vmulouh (a, b);
  c = (__vector unsigned short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
              0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
              0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector __m64 a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  unsigned short result;

  a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
  b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the right most 32-bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result = vsum[3];
  return (result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__P) : "memory");
  _mm_store_ps (__P, __A);
}
/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PPI setting.  Since we don't know
     what PPI this thread is running at we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile ("    mfppr   %0;"
                    "    or 31,31,31;"
                    "    isync;"
                    "    lwsync;"
                    "    isync;"
                    "    mtppr   %0;"
                    : "=r" (__PPR)
                    :
                    : "memory");
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
do {                                                                    \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
  __v4sf __t0 = vec_vmrghw (__r0, __r1);                                \
  __v4sf __t1 = vec_vmrghw (__r2, __r3);                                \
  __v4sf __t2 = vec_vmrglw (__r0, __r1);                                \
  __v4sf __t3 = vec_vmrglw (__r2, __r3);                                \
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,                \
                               (__vector long long)__t1);               \
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,                \
                               (__vector long long)__t1);               \
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,                \
                               (__vector long long)__t3);               \
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,                \
                               (__vector long long)__t3);               \
} while (0)
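
/* Illustrative usage sketch (not part of the original header):

     __m128 r0 = _mm_setr_ps ( 1.0f,  2.0f,  3.0f,  4.0f);
     __m128 r1 = _mm_setr_ps ( 5.0f,  6.0f,  7.0f,  8.0f);
     __m128 r2 = _mm_setr_ps ( 9.0f, 10.0f, 11.0f, 12.0f);
     __m128 r3 = _mm_setr_ps (13.0f, 14.0f, 15.0f, 16.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // r0 = {1,5,9,13}, r1 = {2,6,10,14}, r2 = {3,7,11,15}, r3 = {4,8,12,16}

   The merge-high/merge-low pairs perform the same 4x4 transpose that the x86
   unpcklps/unpckhps/movlhps sequence does, using VMX/VSX merges instead.  */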
1835 /* For backward source compatibility. */
1836 //# include <emmintrin.h>
1838 #endif /* _XMMINTRIN_H_INCLUDED */