/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in data format and placement of float scalars in the
   vector register.  For PowerISA, scalar floats in FPRs (the leftmost
   64 bits of the low 32 VSRs) are kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
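/* For example (a sketch of the recommended replacement, not a definition
   provided by this header): code that manipulates the MXCSR rounding mode
   or status bits directly can usually be replaced with the portable
   C99/POSIX <fenv.h> calls:

     #include <fenv.h>
     fesetround (FE_TOWARDZERO);      // select round-toward-zero
     feclearexcept (FE_ALL_EXCEPT);   // clear pending FP exception flags
*/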
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
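/* For reference, the mask packs four 2-bit element selectors, with the
   highest selector in the most significant bits.  Worked out from the
   macro above (illustrative only):

     _MM_SHUFFLE (3, 2, 1, 0) == (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4
*/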

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
    (defined(__STDC_VERSION__) && \
     __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
				       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */
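/* For instance (illustrative values, not a definition): with
   __A = {1.0f, 2.0f, 3.0f, 4.0f} and __B = {10.0f, 20.0f, 30.0f, 40.0f},
   _mm_add_ss (__A, __B) yields {11.0f, 2.0f, 3.0f, 4.0f}.  */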

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = vec_max (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}

/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
  // return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
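/* Such all-ones / all-zeros masks are typically combined with the logical
   operations above to select elements without branching.  A sketch with
   illustrative variable names (not a definition):

     __m128 mask = _mm_cmpgt_ps (a, b);
     __m128 max  = _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
*/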
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (temp, temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  c = vec_nor (c, c);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper elements) we splat the lower float
     before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
   The _mm_ucomi??_ss implementations below are exactly the same as
   _mm_comi??_ss because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_ss et al. should be using the ordered
   compare and signal for QNaNs.
   The _mm_ucomieq_ss et al. should be OK, as is.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int res;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC ignores the hint parameters.  */
  __builtin_prefetch (__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat the two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
  return temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat the two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
          { ((__vector unsigned long long)vf1) [0],
            ((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
  vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
                                (vs8, zero);
#else
                                (zero, vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
  vi4 = (__vector unsigned int) vec_mergeh (vs8,
                                            (__vector unsigned short) zero);
#else
  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
                                            vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector unsigned long long result;

  rounded = vec_rint(__A);
  temp = vec_cts (rounded, 0);
  result = (__vector unsigned long long) vec_pack (temp, temp);

  return (__m64) ((__vector long long) result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;

  rounded = vec_rint(__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  return (__m64) ((__vector long long) res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
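/* For illustration (example values, not a definition): the two low result
   elements are selected from __A and the two high ones from __B, so
   _mm_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 1, 0)) yields
   { a[0], a[1], b[2], b[3] }.  */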
1275 | ||
1276 | /* Selects and interleaves the upper two SPFP values from A and B. */ | |
1277 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1278 | _mm_unpackhi_ps (__m128 __A, __m128 __B) | |
1279 | { | |
1280 | return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); | |
1281 | } | |
1282 | ||
1283 | /* Selects and interleaves the lower two SPFP values from A and B. */ | |
1284 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1285 | _mm_unpacklo_ps (__m128 __A, __m128 __B) | |
1286 | { | |
1287 | return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); | |
1288 | } | |
1289 | ||
1290 | /* Sets the upper two SPFP values with 64-bits of data loaded from P; | |
1291 | the lower two values are passed through from A. */ | |
1292 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1293 | _mm_loadh_pi (__m128 __A, __m64 const *__P) | |
1294 | { | |
5d9c5a96 WS |
1295 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1296 | __vector unsigned long long __p = vec_splats(*__P); | |
20253250 SM |
1297 | __a [1] = __p [1]; |
1298 | ||
1299 | return (__m128)__a; | |
1300 | } | |
1301 | ||
1302 | /* Stores the upper two SPFP values of A into P. */ | |
1303 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1304 | _mm_storeh_pi (__m64 *__P, __m128 __A) | |
1305 | { | |
5d9c5a96 | 1306 | __vector unsigned long long __a = (__vector unsigned long long) __A; |
20253250 SM |
1307 | |
1308 | *__P = __a[1]; | |
1309 | } | |
1310 | ||
1311 | /* Moves the upper two values of B into the lower two values of A. */ | |
1312 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1313 | _mm_movehl_ps (__m128 __A, __m128 __B) | |
1314 | { | |
5d9c5a96 WS |
1315 | return (__m128) vec_mergel ((__vector unsigned long long)__B, |
1316 | (__vector unsigned long long)__A); | |
20253250 SM |
1317 | } |
1318 | ||
1319 | /* Moves the lower two values of B into the upper two values of A. */ | |
1320 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1321 | _mm_movelh_ps (__m128 __A, __m128 __B) | |
1322 | { | |
5d9c5a96 WS |
1323 | return (__m128) vec_mergeh ((__vector unsigned long long)__A, |
1324 | (__vector unsigned long long)__B); | |
20253250 SM |
1325 | } |
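/* Usage sketch for the two move intrinsics above (illustrative only;
   values are hypothetical):

     __m128 a = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);  // a = {0, 1, 2, 3}
     __m128 b = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);  // b = {4, 5, 6, 7}
     __m128 hi = _mm_movehl_ps (a, b);  // {b[2], b[3], a[2], a[3]} = {6, 7, 2, 3}
     __m128 lo = _mm_movelh_ps (a, b);  // {a[0], a[1], b[0], b[1]} = {0, 1, 4, 5}  */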
1326 | ||
1327 | /* Sets the lower two SPFP values with 64-bits of data loaded from P; | |
1328 | the upper two values are passed through from A. */ | |
1329 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1330 | _mm_loadl_pi (__m128 __A, __m64 const *__P) | |
1331 | { | |
5d9c5a96 WS |
1332 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1333 | __vector unsigned long long __p = vec_splats(*__P); | |
20253250 SM |
1334 | __a [0] = __p [0]; |
1335 | ||
1336 | return (__m128)__a; | |
1337 | } | |
1338 | ||
1339 | /* Stores the lower two SPFP values of A into P. */ | |
1340 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1341 | _mm_storel_pi (__m64 *__P, __m128 __A) | |
1342 | { | |
5d9c5a96 | 1343 | __vector unsigned long long __a = (__vector unsigned long long) __A; |
20253250 SM |
1344 | |
1345 | *__P = __a[0]; | |
1346 | } | |
1347 | ||
1348 | #ifdef _ARCH_PWR8 | |
1349 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ | |
1350 | ||
1351 | /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ | |
1352 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1353 | _mm_movemask_ps (__m128 __A) | |
1354 | { | |
85289ba3 | 1355 | #ifdef _ARCH_PWR10 |
325c6163 | 1356 | return vec_extractm ((__vector unsigned int) __A); |
85289ba3 | 1357 | #else |
5d9c5a96 | 1358 | __vector unsigned long long result; |
20253250 SM |
1359 | static const __vector unsigned int perm_mask = |
1360 | { | |
1361 | #ifdef __LITTLE_ENDIAN__ | |
1362 | 0x00204060, 0x80808080, 0x80808080, 0x80808080 | |
60c703ed | 1363 | #else |
20253250 SM |
1364 | 0x80808080, 0x80808080, 0x80808080, 0x00204060 |
1365 | #endif | |
1366 | }; | |
1367 | ||
5d9c5a96 WS |
1368 | result = ((__vector unsigned long long) |
1369 | vec_vbpermq ((__vector unsigned char) __A, | |
1370 | (__vector unsigned char) perm_mask)); | |
20253250 SM |
1371 | |
1372 | #ifdef __LITTLE_ENDIAN__ | |
1373 | return result[1]; | |
60c703ed | 1374 | #else |
20253250 SM |
1375 | return result[0]; |
1376 | #endif | |
85289ba3 | 1377 | #endif /* !_ARCH_PWR10 */ |
20253250 SM |
1378 | } |
1379 | #endif /* _ARCH_PWR8 */ | |
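/* Usage sketch for _mm_movemask_ps (illustrative only): bit i of the
   result is the sign bit of element i.

     __m128 v = _mm_set_ps (-4.0f, 3.0f, -2.0f, 1.0f);  // v = {1, -2, 3, -4}
     int m = _mm_movemask_ps (v);   // m = 0xa, elements 1 and 3 are negative
   On pre-POWER10 targets the vec_vbpermq path above gathers exactly those
   four bits; the 0x00204060 selector encodes the bit indices 0x00, 0x20,
   0x40 and 0x60 (0, 32, 64, 96), one sign bit per 32-bit element.  */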
1380 | ||
1381 | /* Create a vector with all four elements equal to *P. */ | |
1382 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1383 | _mm_load1_ps (float const *__P) | |
1384 | { | |
1385 | return _mm_set1_ps (*__P); | |
1386 | } | |
1387 | ||
1388 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1389 | _mm_load_ps1 (float const *__P) | |
1390 | { | |
1391 | return _mm_load1_ps (__P); | |
1392 | } | |
1393 | ||
1394 | /* Extracts one of the four words of A. The selector N must be immediate. */ | |
1395 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1396 | _mm_extract_pi16 (__m64 const __A, int const __N) | |
1397 | { | |
4fa008a7 PC |
1398 | unsigned int shiftr = __N & 3; |
1399 | #ifdef __BIG_ENDIAN__ | |
1400 | shiftr = 3 - shiftr; | |
1401 | #endif | |
20253250 | 1402 | |
4fa008a7 | 1403 | return ((__A >> (shiftr * 16)) & 0xffff); |
20253250 SM |
1404 | } |
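/* Usage sketch (illustrative only), assuming the _mm_set_pi16 helper
   from <mmintrin.h>:

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);   // words {1, 2, 3, 4}
     int w2 = _mm_extract_pi16 (a, 2);      // w2 = 3, zero-extended  */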
1405 | ||
1406 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1407 | _m_pextrw (__m64 const __A, int const __N) | |
1408 | { | |
1409 | return _mm_extract_pi16 (__A, __N); | |
1410 | } | |
1411 | ||
1412 | /* Inserts word D into one of four words of A. The selector N must be | |
1413 | immediate. */ | |
1414 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1415 | _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) | |
1416 | { | |
1417 | const int shiftl = (__N & 3) * 16; | |
1418 | const __m64 shiftD = (const __m64) __D << shiftl; | |
1419 | const __m64 mask = 0xffffUL << shiftl; | |
1420 | __m64 result = (__A & (~mask)) | (shiftD & mask); | |
1421 | ||
1422 | return (result); | |
1423 | } | |
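/* Usage sketch (illustrative only, same assumptions as the extract
   example above):

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);   // words {1, 2, 3, 4}
     __m64 r = _mm_insert_pi16 (a, 9, 2);   // words {1, 2, 9, 4}  */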
1424 | ||
1425 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1426 | _m_pinsrw (__m64 const __A, int const __D, int const __N) | |
1427 | { | |
1428 | return _mm_insert_pi16 (__A, __D, __N); | |
1429 | } | |
1430 | ||
1431 | /* Compute the element-wise maximum of signed 16-bit values. */ | |
1432 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1433 | ||
1434 | _mm_max_pi16 (__m64 __A, __m64 __B) | |
1435 | { | |
1436 | #if _ARCH_PWR8 | |
1437 | __vector signed short a, b, r; | |
69c94135 | 1438 | __vector __bool short c; |
20253250 SM |
1439 | |
1440 | a = (__vector signed short)vec_splats (__A); | |
1441 | b = (__vector signed short)vec_splats (__B); | |
69c94135 | 1442 | c = (__vector __bool short)vec_cmpgt (a, b); |
20253250 | 1443 | r = vec_sel (b, a, c); |
8505bf12 | 1444 | return (__m64) ((__vector long long) r)[0]; |
20253250 SM |
1445 | #else |
1446 | __m64_union m1, m2, res; | |
1447 | ||
1448 | m1.as_m64 = __A; | |
1449 | m2.as_m64 = __B; | |
1450 | ||
1451 | res.as_short[0] = | |
1452 | (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; | |
1453 | res.as_short[1] = | |
1454 | (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; | |
1455 | res.as_short[2] = | |
1456 | (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; | |
1457 | res.as_short[3] = | |
1458 | (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; | |
1459 | ||
1460 | return (__m64) res.as_m64; | |
1461 | #endif | |
1462 | } | |
1463 | ||
1464 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1465 | _m_pmaxsw (__m64 __A, __m64 __B) | |
1466 | { | |
1467 | return _mm_max_pi16 (__A, __B); | |
1468 | } | |
1469 | ||
1470 | /* Compute the element-wise maximum of unsigned 8-bit values. */ | |
1471 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1472 | _mm_max_pu8 (__m64 __A, __m64 __B) | |
1473 | { | |
1474 | #if _ARCH_PWR8 | |
1475 | __vector unsigned char a, b, r; | |
69c94135 | 1476 | __vector __bool char c; |
20253250 SM |
1477 | |
1478 | a = (__vector unsigned char)vec_splats (__A); | |
1479 | b = (__vector unsigned char)vec_splats (__B); | |
69c94135 | 1480 | c = (__vector __bool char)vec_cmpgt (a, b); |
20253250 | 1481 | r = vec_sel (b, a, c); |
8505bf12 | 1482 | return (__m64) ((__vector long long) r)[0]; |
20253250 SM |
1483 | #else |
1484 | __m64_union m1, m2, res; | |
1485 | long i; | |
1486 | ||
1487 | m1.as_m64 = __A; | |
1488 | m2.as_m64 = __B; | |
1489 | ||
1490 | ||
1491 | for (i = 0; i < 8; i++) | |
1492 | res.as_char[i] = | |
1493 | ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? | |
1494 | m1.as_char[i] : m2.as_char[i]; | |
1495 | ||
1496 | return (__m64) res.as_m64; | |
1497 | #endif | |
1498 | } | |
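/* Usage sketch (illustrative only): the comparison is unsigned, so 0xff
   compares as 255 rather than -1.

     __m64 a = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0x01);
     __m64 b = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0xff);
     __m64 r = _mm_max_pu8 (a, b);   // low byte is 0xff, since 255 > 1  */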
1499 | ||
1500 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1501 | _m_pmaxub (__m64 __A, __m64 __B) | |
1502 | { | |
1503 | return _mm_max_pu8 (__A, __B); | |
1504 | } | |
1505 | ||
1506 | /* Compute the element-wise minimum of signed 16-bit values. */ | |
1507 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1508 | _mm_min_pi16 (__m64 __A, __m64 __B) | |
1509 | { | |
1510 | #if _ARCH_PWR8 | |
1511 | __vector signed short a, b, r; | |
69c94135 | 1512 | __vector __bool short c; |
20253250 SM |
1513 | |
1514 | a = (__vector signed short)vec_splats (__A); | |
1515 | b = (__vector signed short)vec_splats (__B); | |
69c94135 | 1516 | c = (__vector __bool short)vec_cmplt (a, b); |
20253250 | 1517 | r = vec_sel (b, a, c); |
8505bf12 | 1518 | return (__m64) ((__vector long long) r)[0]; |
20253250 SM |
1519 | #else |
1520 | __m64_union m1, m2, res; | |
1521 | ||
1522 | m1.as_m64 = __A; | |
1523 | m2.as_m64 = __B; | |
1524 | ||
1525 | res.as_short[0] = | |
1526 | (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; | |
1527 | res.as_short[1] = | |
1528 | (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; | |
1529 | res.as_short[2] = | |
1530 | (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; | |
1531 | res.as_short[3] = | |
1532 | (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; | |
1533 | ||
1534 | return (__m64) res.as_m64; | |
1535 | #endif | |
1536 | } | |
1537 | ||
1538 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1539 | _m_pminsw (__m64 __A, __m64 __B) | |
1540 | { | |
1541 | return _mm_min_pi16 (__A, __B); | |
1542 | } | |
1543 | ||
1544 | /* Compute the element-wise minimum of unsigned 8-bit values. */ | |
1545 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1546 | _mm_min_pu8 (__m64 __A, __m64 __B) | |
1547 | { | |
1548 | #if _ARCH_PWR8 | |
1549 | __vector unsigned char a, b, r; | |
69c94135 | 1550 | __vector __bool char c; |
20253250 SM |
1551 | |
1552 | a = (__vector unsigned char)vec_splats (__A); | |
1553 | b = (__vector unsigned char)vec_splats (__B); | |
69c94135 | 1554 | c = (__vector __bool char)vec_cmplt (a, b); |
20253250 | 1555 | r = vec_sel (b, a, c); |
8505bf12 | 1556 | return (__m64) ((__vector long long) r)[0]; |
20253250 SM |
1557 | #else |
1558 | __m64_union m1, m2, res; | |
1559 | long i; | |
1560 | ||
1561 | m1.as_m64 = __A; | |
1562 | m2.as_m64 = __B; | |
1563 | ||
1564 | ||
1565 | for (i = 0; i < 8; i++) | |
1566 | res.as_char[i] = | |
1567 | ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? | |
1568 | m1.as_char[i] : m2.as_char[i]; | |
1569 | ||
1570 | return (__m64) res.as_m64; | |
1571 | #endif | |
1572 | } | |
1573 | ||
1574 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1575 | _m_pminub (__m64 __A, __m64 __B) | |
1576 | { | |
1577 | return _mm_min_pu8 (__A, __B); | |
1578 | } | |
1579 | ||
1580 | /* Create an 8-bit mask of the signs of 8-bit values. */ | |
1581 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1582 | _mm_movemask_pi8 (__m64 __A) | |
1583 | { | |
db739d3c | 1584 | #ifdef __powerpc64__ |
60c703ed PC |
1585 | unsigned long long p = |
1586 | #ifdef __LITTLE_ENDIAN__ | |
1587 | 0x0008101820283038UL; // permute control for sign bits | |
1588 | #else | |
1589 | 0x3830282018100800UL; // permute control for sign bits | |
1590 | #endif | |
20253250 | 1591 | return __builtin_bpermd (p, __A); |
db739d3c | 1592 | #else |
8ab1df52 | 1593 | #ifdef __LITTLE_ENDIAN__ |
db739d3c PC |
1594 | unsigned int mask = 0x20283038UL; |
1595 | unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; | |
1596 | unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; | |
8ab1df52 SB |
1597 | #else |
1598 | unsigned int mask = 0x38302820UL; | |
1599 | unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; | |
1600 | unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; | |
1601 | #endif | |
db739d3c PC |
1602 | return (r2 << 4) | r1; |
1603 | #endif | |
20253250 SM |
1604 | } |
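/* Usage sketch (illustrative only): bit i of the result is the most
   significant bit of byte i of the argument.

     __m64 v = _mm_set_pi8 (0, 0, 0, 0, 0x80, 0, 0x80, 0);
     int m = _mm_movemask_pi8 (v);   // m = 0x0a, bytes 1 and 3 have the MSB set
   The 64-bit permute control above simply lists the eight per-byte
   sign-bit positions for __builtin_bpermd to gather.  */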
1605 | ||
1606 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1607 | _m_pmovmskb (__m64 __A) | |
1608 | { | |
1609 | return _mm_movemask_pi8 (__A); | |
1610 | } | |
1611 | ||
1612 | /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values | |
1613 | in B and produce the high 16 bits of the 32-bit results. */ | |
1614 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1615 | _mm_mulhi_pu16 (__m64 __A, __m64 __B) | |
1616 | { | |
1617 | __vector unsigned short a, b; | |
1618 | __vector unsigned short c; | |
1619 | __vector unsigned int w0, w1; | |
1620 | __vector unsigned char xform1 = { | |
60c703ed | 1621 | #ifdef __LITTLE_ENDIAN__ |
20253250 SM |
1622 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, |
1623 | 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F | |
60c703ed PC |
1624 | #else |
1625 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, | |
1626 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 | |
1627 | #endif | |
20253250 SM |
1628 | }; |
1629 | ||
1630 | a = (__vector unsigned short)vec_splats (__A); | |
1631 | b = (__vector unsigned short)vec_splats (__B); | |
1632 | ||
1633 | w0 = vec_vmuleuh (a, b); | |
1634 | w1 = vec_vmulouh (a, b); | |
1635 | c = (__vector unsigned short)vec_perm (w0, w1, xform1); | |
1636 | ||
8505bf12 | 1637 | return (__m64) ((__vector long long) c)[0]; |
20253250 SM |
1638 | } |
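/* Worked example (illustrative only): each lane keeps the high half of
   the full unsigned product, e.g. 0xffff * 0xffff = 0xfffe0001:

     __m64 a = _mm_set_pi16 (1, 2, 3, 0xffff);
     __m64 b = _mm_set_pi16 (1, 2, 3, 0xffff);
     __m64 r = _mm_mulhi_pu16 (a, b);   // words {0xfffe, 0, 0, 0}  */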
1639 | ||
1640 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1641 | _m_pmulhuw (__m64 __A, __m64 __B) | |
1642 | { | |
1643 | return _mm_mulhi_pu16 (__A, __B); | |
1644 | } | |
1645 | ||
1646 | /* Return a combination of the four 16-bit values in A. The selector | |
1647 | must be an immediate. */ | |
1648 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1649 | _mm_shuffle_pi16 (__m64 __A, int const __N) | |
1650 | { | |
1651 | unsigned long element_selector_10 = __N & 0x03; | |
1652 | unsigned long element_selector_32 = (__N >> 2) & 0x03; | |
1653 | unsigned long element_selector_54 = (__N >> 4) & 0x03; | |
1654 | unsigned long element_selector_76 = (__N >> 6) & 0x03; | |
1655 | static const unsigned short permute_selectors[4] = | |
1656 | { | |
1657 | #ifdef __LITTLE_ENDIAN__ | |
1658 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E | |
60c703ed | 1659 | #else |
20253250 SM |
1660 | 0x0607, 0x0405, 0x0203, 0x0001 |
1661 | #endif | |
1662 | }; | |
1663 | __m64_union t; | |
5d9c5a96 | 1664 | __vector unsigned long long a, p, r; |
20253250 SM |
1665 | |
1666 | #ifdef __LITTLE_ENDIAN__ | |
1667 | t.as_short[0] = permute_selectors[element_selector_10]; | |
1668 | t.as_short[1] = permute_selectors[element_selector_32]; | |
1669 | t.as_short[2] = permute_selectors[element_selector_54]; | |
1670 | t.as_short[3] = permute_selectors[element_selector_76]; | |
60c703ed | 1671 | #else |
20253250 SM |
1672 | t.as_short[3] = permute_selectors[element_selector_10]; |
1673 | t.as_short[2] = permute_selectors[element_selector_32]; | |
1674 | t.as_short[1] = permute_selectors[element_selector_54]; | |
1675 | t.as_short[0] = permute_selectors[element_selector_76]; | |
1676 | #endif | |
1677 | p = vec_splats (t.as_m64); | |
1678 | a = vec_splats (__A); | |
1679 | r = vec_perm (a, a, (__vector unsigned char)p); | |
8505bf12 | 1680 | return (__m64) ((__vector long long) r)[0]; |
20253250 SM |
1681 | } |
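/* Usage sketch (illustrative only): the selector reads like the one for
   _mm_shuffle_ps, two bits per destination word.

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);                       // words {1, 2, 3, 4}
     __m64 r = _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));  // words {4, 3, 2, 1}  */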
1682 | ||
1683 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1684 | _m_pshufw (__m64 __A, int const __N) | |
1685 | { | |
1686 | return _mm_shuffle_pi16 (__A, __N); | |
1687 | } | |
1688 | ||
1689 | /* Conditionally store byte elements of A into P. The high bit of each | |
1690 | byte in the selector N determines whether the corresponding byte from | |
1691 | A is stored. */ | |
1692 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1693 | _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) | |
1694 | { | |
1695 | __m64 hibit = 0x8080808080808080UL; | |
1696 | __m64 mask, tmp; | |
1697 | __m64 *p = (__m64*)__P; | |
1698 | ||
1699 | tmp = *p; | |
1700 | mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); | |
1701 | tmp = (tmp & (~mask)) | (__A & mask); | |
1702 | *p = tmp; | |
1703 | } | |
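/* Usage sketch (illustrative only): only elements whose mask byte has
   the high bit set are written; the other destination bytes keep their
   previous contents.

     char buf[8] = { 0 };
     __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 mask = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0x80);
     _mm_maskmove_si64 (data, mask, buf);   // only element 0 (value 1) is stored  */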
1704 | ||
1705 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1706 | _m_maskmovq (__m64 __A, __m64 __N, char *__P) | |
1707 | { | |
1708 | _mm_maskmove_si64 (__A, __N, __P); | |
1709 | } | |
1710 | ||
1711 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ | |
1712 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1713 | _mm_avg_pu8 (__m64 __A, __m64 __B) | |
1714 | { | |
1715 | __vector unsigned char a, b, c; | |
1716 | ||
1717 | a = (__vector unsigned char)vec_splats (__A); | |
1718 | b = (__vector unsigned char)vec_splats (__B); | |
1719 | c = vec_avg (a, b); | |
8505bf12 | 1720 | return (__m64) ((__vector long long) c)[0]; |
20253250 SM |
1721 | } |
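/* Worked example (illustrative only): the average rounds up, i.e.
   (a + b + 1) >> 1 per element, so avg (1, 2) = 2 while avg (4, 4) = 4.  */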
1722 | ||
1723 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1724 | _m_pavgb (__m64 __A, __m64 __B) | |
1725 | { | |
1726 | return _mm_avg_pu8 (__A, __B); | |
1727 | } | |
1728 | ||
1729 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ | |
1730 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1731 | _mm_avg_pu16 (__m64 __A, __m64 __B) | |
1732 | { | |
1733 | __vector unsigned short a, b, c; | |
1734 | ||
1735 | a = (__vector unsigned short)vec_splats (__A); | |
1736 | b = (__vector unsigned short)vec_splats (__B); | |
1737 | c = vec_avg (a, b); | |
8505bf12 | 1738 | return (__m64) ((__vector long long) c)[0]; |
20253250 SM |
1739 | } |
1740 | ||
1741 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1742 | _m_pavgw (__m64 __A, __m64 __B) | |
1743 | { | |
1744 | return _mm_avg_pu16 (__A, __B); | |
1745 | } | |
1746 | ||
1747 | /* Compute the sum of the absolute differences of the unsigned 8-bit | |
1748 | values in A and B. Return the value in the lower 16-bit word; the | |
1749 | upper words are cleared. */ | |
1750 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1751 | _mm_sad_pu8 (__m64 __A, __m64 __B) | |
1752 | { | |
1753 | __vector unsigned char a, b; | |
1754 | __vector unsigned char vmin, vmax, vabsdiff; | |
1755 | __vector signed int vsum; | |
1756 | const __vector unsigned int zero = | |
1757 | { 0, 0, 0, 0 }; | |
60c703ed | 1758 | __m64_union result = {0}; |
20253250 | 1759 | |
dbafa0f5 PC |
1760 | a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; |
1761 | b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; | |
20253250 SM |
1762 | vmin = vec_min (a, b); |
1763 | vmax = vec_max (a, b); | |
1764 | vabsdiff = vec_sub (vmax, vmin); | |
1765 | /* Sum four groups of bytes into integers. */ | |
1766 | vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); | |
1767 | /* Sum across four integers with integer result. */ | |
1768 | vsum = vec_sums (vsum, (__vector signed int) zero); | |
1769 | /* The sum is in the right most 32-bits of the vector result. | |
1770 | Transfer to a GPR and truncate to 16 bits. */ | |
60c703ed PC |
1771 | result.as_short[0] = vsum[3]; |
1772 | return result.as_m64; | |
20253250 SM |
1773 | } |
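/* Worked example (illustrative only), assuming _mm_set_pi8 from
   <mmintrin.h>:

     __m64 a = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 10);
     __m64 b = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 2, 1);
     __m64 r = _mm_sad_pu8 (a, b);   // |10-1| + |0-2| = 11 in the low word  */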
1774 | ||
1775 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1776 | _m_psadbw (__m64 __A, __m64 __B) | |
1777 | { | |
1778 | return _mm_sad_pu8 (__A, __B); | |
1779 | } | |
1780 | ||
1781 | /* Stores the data in A to the address P without polluting the caches. */ | |
1782 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1783 | _mm_stream_pi (__m64 *__P, __m64 __A) | |
1784 | { | |
1785 | /* Use the data cache block touch for store transient. */ | |
1786 | __asm__ ( | |
1787 | " dcbtstt 0,%0" | |
1788 | : | |
1789 | : "b" (__P) | |
1790 | : "memory" | |
1791 | ); | |
1792 | *__P = __A; | |
1793 | } | |
1794 | ||
1795 | /* Likewise. The address must be 16-byte aligned. */ | |
1796 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1797 | _mm_stream_ps (float *__P, __m128 __A) | |
1798 | { | |
1799 | /* Use the data cache block touch for store transient. */ | |
1800 | __asm__ ( | |
1801 | " dcbtstt 0,%0" | |
1802 | : | |
1803 | : "b" (__P) | |
1804 | : "memory" | |
1805 | ); | |
1806 | _mm_store_ps (__P, __A); | |
1807 | } | |
1808 | ||
1809 | /* Guarantees that every preceding store is globally visible before | |
1810 | any subsequent store. */ | |
1811 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1812 | _mm_sfence (void) | |
1813 | { | |
1814 | /* Generate a light weight sync. */ | |
1815 | __atomic_thread_fence (__ATOMIC_RELEASE); | |
1816 | } | |
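/* Usage sketch (illustrative only): pair the streaming store with a
   store fence before publishing a flag that other threads read; out, v
   and flag are hypothetical:

     _mm_stream_ps (out, v);   // cache-transient store of v to out
     _mm_sfence ();            // order the store before the flag update
     *flag = 1;
   On this target the fence maps to a release barrier (lwsync), as
   shown above.  */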
1817 | ||
1818 | /* The execution of the next instruction is delayed by an implementation | |
1819 | specific amount of time. The instruction does not modify the | |
1820 | architectural state. This is after the pop_options pragma because | |
1821 | it does not require SSE support in the processor--the encoding is a | |
1822 | nop on processors that do not support it. */ | |
1823 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1824 | _mm_pause (void) | |
1825 | { | |
1826 | /* There is no exact match with this construct, but the following is | |
1827 | close to the desired effect. */ | |
1828 | #if _ARCH_PWR8 | |
1829 | /* On power8 and later processors we can depend on Program Priority | |
1830 | (PRI) and associated "very low" PRI setting. Since we don't know | |
1831 | what PRI this thread is running at we: 1) save the current PRI | |
1832 | from the PPR SPR into a local GPR, 2) set the PRI to "very low" | |
1833 | via the special or 31,31,31 encoding. 3) issue an "isync" to | |
1834 | ensure the PRI change takes effect before we execute any more | |
1835 | instructions. | |
1836 | Now we can execute a lwsync (release barrier) while we execute | |
1837 | this thread at "very low" PRI. Finally we restore the original | |
1838 | PRI and continue execution. */ | |
1839 | unsigned long __PPR; | |
1840 | ||
1841 | __asm__ volatile ( | |
1842 | " mfppr %0;" | |
1843 | " or 31,31,31;" | |
1844 | " isync;" | |
1845 | " lwsync;" | |
1846 | " isync;" | |
1847 | " mtppr %0;" | |
1848 | : "=r" (__PPR) | |
1849 | : | |
1850 | : "memory" | |
1851 | ); | |
1852 | #else | |
1853 | /* For older processors where we may not even have Program Priority | |
1854 | controls we can only depend on Heavy Weight Sync. */ | |
1855 | __atomic_thread_fence (__ATOMIC_SEQ_CST); | |
1856 | #endif | |
1857 | } | |
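/* Usage sketch (illustrative only): the usual pattern is a spin-wait
   loop that polls a flag while yielding priority; lock is hypothetical:

     while (__atomic_load_n (&lock, __ATOMIC_ACQUIRE) != 0)
       _mm_pause ();  */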
1858 | ||
1859 | /* Transpose the 4x4 matrix composed of row[0-3]. */ | |
1860 | #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ | |
1861 | do { \ | |
1862 | __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ | |
1863 | __v4sf __t0 = vec_vmrghw (__r0, __r1); \ | |
1864 | __v4sf __t1 = vec_vmrghw (__r2, __r3); \ | |
1865 | __v4sf __t2 = vec_vmrglw (__r0, __r1); \ | |
1866 | __v4sf __t3 = vec_vmrglw (__r2, __r3); \ | |
1867 | (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \ | |
1868 | (__vector long long)__t1); \ | |
1869 | (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \ | |
1870 | (__vector long long)__t1); \ | |
1871 | (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \ | |
1872 | (__vector long long)__t3); \ | |
1873 | (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \ | |
1874 | (__vector long long)__t3); \ | |
1875 | } while (0) | |
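/* Usage sketch (illustrative only): the four rows are transposed in
   place, so row0 receives the former column 0, and so on.

     __m128 r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
     // r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}  */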
1876 | ||
1877 | /* For backward source compatibility. */ | |
1878 | //# include <emmintrin.h> | |
1879 | ||
1880 | #endif /* _XMMINTRIN_H_INCLUDED */ |