/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in the data format and placement of float scalars in the
   vector register.  On PowerISA, a scalar float in an FPR (the leftmost
   64 bits of the low 32 VSRs) is kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
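
/* Illustrative sketch only (not part of this header): the kind of
   <fenv.h> replacement for direct MXCSR access recommended above.  The
   function name below is hypothetical.

     #include <fenv.h>
     #include <math.h>

     static int
     example_round_toward_zero (double x)
     {
       int save = fegetround ();      // save the current rounding mode
       fesetround (FE_TOWARDZERO);    // equivalent of MXCSR RC = 0b11
       int r = (int) rint (x);        // rint now rounds toward zero
       fesetround (save);             // restore the caller's mode
       return r;
     }
 */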

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
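
/* For example (illustrative only), _MM_SHUFFLE (3, 2, 1, 0) expands to
   ((3 << 6) | (2 << 4) | (1 << 2) | 0) == 0xE4, the identity selector
   accepted by _mm_shuffle_ps and _mm_shuffle_pi16 below.  */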

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
                                 (defined(__STDC_VERSION__) && \
                                  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
                                       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
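
/* Illustrative sketch (hypothetical helper, not part of this header):
   _mm_set_ps lists elements from the highest index down, _mm_setr_ps
   from element 0 up, so the two calls below build the same vector.

     static inline int
     example_set_vs_setr (void)
     {
       __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);   // element 0 == 1.0f
       __m128 b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);  // element 0 == 1.0f
       return a[0] == b[0] && a[3] == b[3];              // both hold
     }
 */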

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}
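
/* Illustrative sketch (hypothetical values, not part of this header):
   only element 0 of the result reflects the scalar operation; elements
   1-3 are passed through unchanged from the first operand.

     static inline int
     example_add_ss_semantics (void)
     {
       __m128 a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
       __m128 b = _mm_setr_ps (10.0f, 20.0f, 30.0f, 40.0f);
       __m128 r = _mm_add_ss (a, b);   // {11.0f, 2.0f, 3.0f, 4.0f}
       return r[0] == 11.0f && r[1] == 2.0f;
     }
 */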

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = vec_max (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}

/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
// return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (temp, temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}
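
/* Illustrative sketch (hypothetical helper, not part of this header):
   a lane compares "unordered" exactly when at least one operand in that
   lane is a NaN, so comparing a vector with itself isolates NaN lanes.

     static inline int
     example_lane0_is_nan (__m128 x)
     {
       __m128 m = _mm_cmpunord_ps (x, x);           // all-ones in NaN lanes
       return ((__vector unsigned int) m)[0] != 0;  // lane 0 is NaN?
     }
 */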

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int res;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}
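
/* Illustrative sketch (hypothetical values, not part of this header):
   the conversion honours the current rounding mode, so halfway cases
   round to even under the default round-to-nearest setting.

     static inline int
     example_cvtss_si32_rounding (void)
     {
       int two = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // 2, not 3
       int four = _mm_cvtss_si32 (_mm_set_ss (3.5f));  // 4
       return two == 2 && four == 4;
     }
 */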

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Currently, PowerPC ignores the hint parameter.  */
  __builtin_prefetch (__P);
}
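
/* Illustrative sketch (hypothetical loop, not part of this header):
   since the hint is ignored here, any _MM_HINT_* value simply becomes a
   plain prefetch of the given address.

     static inline float
     example_prefetch_sum (const float *p, int n)
     {
       float sum = 0.0f;
       for (int i = 0; i < n; i++)
         {
           _mm_prefetch (p + i + 16, _MM_HINT_T0);  // prefetch ahead
           sum += p[i];
         }
       return sum;
     }
 */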

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
          { ((__vector unsigned long long)vf1) [0],
            ((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
  vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
    (vs8, zero);
#else
    (zero, vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
  vi4 = (__vector unsigned int) vec_mergeh (vs8,
                                            (__vector unsigned short) zero);
#else
  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
                                            vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector unsigned long long result;

  rounded = vec_rint(__A);
  temp = vec_cts (rounded, 0);
  result = (__vector unsigned long long) vec_pack (temp, temp);

  return (__m64) ((__vector long long) result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;

  rounded = vec_rint(__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  return (__m64) ((__vector long long) res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
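
/* Illustrative sketch (hypothetical values, not part of this header):
   the low two result elements are selected from __A and the high two
   from __B, each by a 2-bit field of the mask.

     static inline int
     example_shuffle_ps (void)
     {
       __m128 a = _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);
       __m128 b = _mm_setr_ps (4.0f, 5.0f, 6.0f, 7.0f);
       // _MM_SHUFFLE (3, 2, 1, 0) == 0xE4: result is {a[0], a[1], b[2], b[3]}.
       __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 1, 0));
       return r[0] == 0.0f && r[2] == 6.0f;
     }
 */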

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
                              (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
                              (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
#ifdef _ARCH_PWR10
  return vec_extractm ((__vector unsigned int) __A);
#else
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
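
/* Illustrative sketch (hypothetical values, not part of this header,
   and only usable when _ARCH_PWR8 is defined): bit i of the result is
   the sign bit of element i.

     static inline int
     example_movemask_ps (void)
     {
       __m128 v = _mm_setr_ps (-1.0f, 2.0f, -3.0f, 4.0f);
       return _mm_movemask_ps (v) == 0x5;  // bits 0 and 2 set
     }
 */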

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  unsigned int shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  shiftr = 3 - shiftr;
#endif

  return ((__A >> (shiftr * 16)) & 0xffff);
}
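
/* Illustrative sketch (hypothetical value, little-endian layout assumed,
   not part of this header): selector 2 picks the third 16-bit halfword.

     static inline int
     example_extract_pi16 (void)
     {
       __m64 v = 0x0004000300020001ULL;  // halfwords 1, 2, 3, 4 (LE)
       return _mm_extract_pi16 (v, 2) == 3;
     }
 */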
1405
1406extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407_m_pextrw (__m64 const __A, int const __N)
1408{
1409 return _mm_extract_pi16 (__A, __N);
1410}
1411
1412/* Inserts word D into one of four words of A. The selector N must be
1413 immediate. */
1414extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1415_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1416{
1417 const int shiftl = (__N & 3) * 16;
1418 const __m64 shiftD = (const __m64) __D << shiftl;
1419 const __m64 mask = 0xffffUL << shiftl;
1420 __m64 result = (__A & (~mask)) | (shiftD & mask);
1421
1422 return (result);
1423}
1424
1425extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426_m_pinsrw (__m64 const __A, int const __D, int const __N)
1427{
1428 return _mm_insert_pi16 (__A, __D, __N);
1429}
1430
1431/* Compute the element-wise maximum of signed 16-bit values. */
1432extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1433
1434_mm_max_pi16 (__m64 __A, __m64 __B)
1435{
1436#if _ARCH_PWR8
1437 __vector signed short a, b, r;
69c94135 1438 __vector __bool short c;
20253250
SM
1439
1440 a = (__vector signed short)vec_splats (__A);
1441 b = (__vector signed short)vec_splats (__B);
69c94135 1442 c = (__vector __bool short)vec_cmpgt (a, b);
20253250 1443 r = vec_sel (b, a, c);
8505bf12 1444 return (__m64) ((__vector long long) r)[0];
20253250
SM
1445#else
1446 __m64_union m1, m2, res;
1447
1448 m1.as_m64 = __A;
1449 m2.as_m64 = __B;
1450
1451 res.as_short[0] =
1452 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1453 res.as_short[1] =
1454 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1455 res.as_short[2] =
1456 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1457 res.as_short[3] =
1458 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1459
1460 return (__m64) res.as_m64;
1461#endif
1462}
1463
1464extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465_m_pmaxsw (__m64 __A, __m64 __B)
1466{
1467 return _mm_max_pi16 (__A, __B);
1468}
1469
1470/* Compute the element-wise maximum of unsigned 8-bit values. */
1471extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1472_mm_max_pu8 (__m64 __A, __m64 __B)
1473{
1474#if _ARCH_PWR8
1475 __vector unsigned char a, b, r;
69c94135 1476 __vector __bool char c;
20253250
SM
1477
1478 a = (__vector unsigned char)vec_splats (__A);
1479 b = (__vector unsigned char)vec_splats (__B);
69c94135 1480 c = (__vector __bool char)vec_cmpgt (a, b);
20253250 1481 r = vec_sel (b, a, c);
8505bf12 1482 return (__m64) ((__vector long long) r)[0];
20253250
SM
1483#else
1484 __m64_union m1, m2, res;
1485 long i;
1486
1487 m1.as_m64 = __A;
1488 m2.as_m64 = __B;
1489
1490
1491 for (i = 0; i < 8; i++)
1492 res.as_char[i] =
1493 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1494 m1.as_char[i] : m2.as_char[i];
1495
1496 return (__m64) res.as_m64;
1497#endif
1498}
1499
1500extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1501_m_pmaxub (__m64 __A, __m64 __B)
1502{
1503 return _mm_max_pu8 (__A, __B);
1504}
1505
1506/* Compute the element-wise minimum of signed 16-bit values. */
1507extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1508_mm_min_pi16 (__m64 __A, __m64 __B)
1509{
1510#if _ARCH_PWR8
1511 __vector signed short a, b, r;
69c94135 1512 __vector __bool short c;
20253250
SM
1513
1514 a = (__vector signed short)vec_splats (__A);
1515 b = (__vector signed short)vec_splats (__B);
69c94135 1516 c = (__vector __bool short)vec_cmplt (a, b);
20253250 1517 r = vec_sel (b, a, c);
8505bf12 1518 return (__m64) ((__vector long long) r)[0];
20253250
SM
1519#else
1520 __m64_union m1, m2, res;
1521
1522 m1.as_m64 = __A;
1523 m2.as_m64 = __B;
1524
1525 res.as_short[0] =
1526 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1527 res.as_short[1] =
1528 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1529 res.as_short[2] =
1530 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1531 res.as_short[3] =
1532 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1533
1534 return (__m64) res.as_m64;
1535#endif
1536}
1537
1538extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539_m_pminsw (__m64 __A, __m64 __B)
1540{
1541 return _mm_min_pi16 (__A, __B);
1542}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector __bool char)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
      ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
	  m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
#ifdef __powerpc64__
  unsigned long long p =
#ifdef __LITTLE_ENDIAN__
    0x0008101820283038UL; // permute control for sign bits
#else
    0x3830282018100800UL; // permute control for sign bits
#endif
  return __builtin_bpermd (p, __A);
#else
#ifdef __LITTLE_ENDIAN__
  unsigned int mask = 0x20283038UL;
  unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
  unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
#else
  unsigned int mask = 0x38302820UL;
  unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
  unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
#endif
  return (r2 << 4) | r1;
#endif
}
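
/* Each byte of the permute control above is the PowerISA bit number of
   one byte's sign bit within the 64-bit __A, so __builtin_bpermd gathers
   the eight sign bits into the low byte of the result, matching the
   pmovmskb semantics where result bit i reflects byte i of __A.  For
   example (illustrative):

     _mm_movemask_pi8 ((__m64) 0x80000000000000ffUL)

   should return 0x81, because byte 0 (0xff) and byte 7 (0x80) both have
   their most significant bit set.  */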

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b;
  __vector unsigned short c;
  __vector unsigned int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);

  w0 = vec_vmuleuh (a, b);
  w1 = vec_vmulouh (a, b);
  c = (__vector unsigned short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}
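
/* In the sequence above, vec_vmuleuh and vec_vmulouh form the full 32-bit
   products of the even- and odd-numbered halfword pairs, and the
   endian-specific permute control then gathers the high 16 bits of each
   product back into the original element order.  For example
   (illustrative): with every element of both inputs equal to 0xffff, each
   product is 0xfffe0001, so every result element is 0xfffe.  */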

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector unsigned long long a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#else
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__m64) ((__vector long long) r)[0];
}
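
/* The two-bit fields of __N select, from low to high, which source
   element supplies each result element; permute_selectors maps each
   field to the byte pair of the corresponding halfword, and the
   _MM_SHUFFLE macro builds suitable selectors.  For example
   (illustrative; a is a hypothetical __m64 value):

     __m64 rev = _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));

   reverses the four halfwords of a, while _MM_SHUFFLE (0, 0, 0, 0)
   replicates element 0 into all four positions.  */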

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}
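
/* Unlike the x86 maskmovq instruction, this implementation performs a
   full 8-byte read-modify-write of *__P, so all eight destination bytes
   are accessed even when their mask bits are clear.  A usage sketch
   (dst, data and mask are hypothetical):

     char dst[8];
     _mm_maskmove_si64 (data, mask, dst);

   stores only those bytes of data whose mask byte has the high bit set.  */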

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
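
/* Both averages map directly to vec_avg, which computes
   (a + b + 1) >> 1 per element and therefore matches the SSE rounding
   rule.  For example (illustrative), averaging the byte values 1 and 2
   yields 2, not 1.  */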

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  __m64_union result = {0};

  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result.as_short[0] = vsum[3];
  return result.as_m64;
}
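
/* The absolute differences are formed as vec_max - vec_min, vec_sum4s
   adds each group of four byte differences into a word, and vec_sums
   folds those words into a single total; zeroing the unused doubleword
   of the inputs keeps it out of the sum.  For example (illustrative),
   if every byte of A exceeds the corresponding byte of B by 1, the
   result is 8 in the low 16 bits and 0 elsewhere.  */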

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}
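
/* dcbtstt is only a hint that the target cache block is transient and
   about to be stored; the store itself remains an ordinary cacheable,
   coherent store, so no write-combining buffer or extra fence is needed
   for correctness.  A usage sketch (buf is a hypothetical aligned
   buffer):

     float buf[4] __attribute__ ((aligned (16)));
     _mm_stream_ps (buf, _mm_set1_ps (1.0f));
*/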

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
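
/* GCC typically expands the release fence above to an lwsync, which
   orders every pair of accesses except store-load and thus covers the
   store-store guarantee sfence provides.  A typical use (flag and
   payload are hypothetical variables):

     payload = 42;
     _mm_sfence ();    make the payload store visible first
     flag = 1;
*/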

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor; the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't
     know what PRI this thread is running at we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    " mfppr %0;"
    " or 31,31,31;"
    " isync;"
    " lwsync;"
    " isync;"
    " mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
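
/* The usual application is a polite spin-wait loop, for example
   (illustrative; lock is a hypothetical flag):

     while (__atomic_load_n (&lock, __ATOMIC_RELAXED))
       _mm_pause ();

   which lowers the hardware-thread priority during each pause so the
   loop does not starve its SMT siblings.  */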

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
} while (0)
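
/* The merge-high/merge-low pairs interleave the 32-bit elements of the
   row pairs and the 64-bit merges then recombine them, so column i of
   the input becomes row i of the output.  Usage sketch (the rows are
   hypothetical __m128 values):

     __m128 r0 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 r1 = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f, 9.0f, 8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);

   afterwards r0 holds {0, 4, 8, 12}, r1 holds {1, 5, 9, 13}, and so on.  */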

/* For backward source compatibility.  */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */