1 /* Copyright (C) 2003-2018 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD
38 operations. However, scalar double operations in vector (XMM)
39 registers require the POWER8 VSX ISA (2.07) level. There are also
40 important differences in the data format and placement of double
41 scalars in the vector register.
42
43 For PowerISA, scalar double is held in FPRs (the leftmost 64 bits
44 of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits
45 of the XMM register. These differences require extra steps on POWER
46 to match the SSE2 scalar double semantics.
47
48 Most SSE2 scalar double intrinsic operations can be performed more
49 efficiently as C language double scalar operations or optimized to
50 use vector SIMD operations. We recommend this for new applications.
51
52 Another difference is the format and details of the X86_64 MXCSR vs
53 the PowerISA FPSCR / VSCR registers. We recommend applications
54 replace direct access to the MXCSR with the more portable <fenv.h>
55 POSIX APIs (see the illustrative sketch following this warning block). */
56 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
57 #endif
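/* Illustrative sketches only, not part of this header's API.  The
   "example_" names below are hypothetical and exist purely to show
   the portable alternatives recommended in the warning above.

   1) Direct MXCSR manipulation can usually be replaced with the
      portable C99/POSIX <fenv.h> interfaces:

	#include <fenv.h>

	static void
	example_set_round_toward_zero (void)
	{
	  // Instead of _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO):
	  fesetround (FE_TOWARDZERO);
	  // Instead of clearing MXCSR exception flags:
	  feclearexcept (FE_ALL_EXCEPT);
	}

	static int
	example_saw_divide_by_zero (void)
	{
	  // Instead of testing MXCSR status bits:
	  return fetestexcept (FE_DIVBYZERO) != 0;
	}

   2) Much SSE2 code can be rewritten with GNU C vector extensions,
      which GCC maps directly to VMX/VSX:

	typedef double example_v2df __attribute__ ((vector_size (16)));

	static example_v2df
	example_add_pd (example_v2df a, example_v2df b)
	{
	  return a + b;   // typically a single xvadddp on VSX
	}
 */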
58
59 #ifndef EMMINTRIN_H_
60 #define EMMINTRIN_H_
61
62 #include <altivec.h>
63 #include <assert.h>
64
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
67
68 /* SSE2 */
69 typedef __vector double __v2df;
70 typedef __vector long long __v2di;
71 typedef __vector unsigned long long __v2du;
72 typedef __vector int __v4si;
73 typedef __vector unsigned int __v4su;
74 typedef __vector short __v8hi;
75 typedef __vector unsigned short __v8hu;
76 typedef __vector signed char __v16qi;
77 typedef __vector unsigned char __v16qu;
78
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
83
84 /* Unaligned versions of the same types. */
85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
87
88 /* Create a vector with element 0 as F and the rest zero. */
89 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_set_sd (double __F)
91 {
92 return __extension__ (__m128d){ __F, 0.0 };
93 }
94
95 /* Create a vector with both elements equal to F. */
96 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_set1_pd (double __F)
98 {
99 return __extension__ (__m128d){ __F, __F };
100 }
101
102 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_set_pd1 (double __F)
104 {
105 return _mm_set1_pd (__F);
106 }
107
108 /* Create a vector with the lower value X and upper value W. */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_set_pd (double __W, double __X)
111 {
112 return __extension__ (__m128d){ __X, __W };
113 }
114
115 /* Create a vector with the lower value W and upper value X. */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_setr_pd (double __W, double __X)
118 {
119 return __extension__ (__m128d){ __W, __X };
120 }
121
122 /* Create an undefined vector. */
123 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124 _mm_undefined_pd (void)
125 {
126 __m128d __Y = __Y;
127 return __Y;
128 }
129
130 /* Create a vector of zeros. */
131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132 _mm_setzero_pd (void)
133 {
134 return (__m128d) vec_splats (0);
135 }
136
137 /* Sets the low DPFP value of A from the low value of B. */
138 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_move_sd (__m128d __A, __m128d __B)
140 {
141 __v2df result = (__v2df) __A;
142 result [0] = ((__v2df) __B)[0];
143 return (__m128d) result;
144 }
145
146 /* Load two DPFP values from P. The address must be 16-byte aligned. */
147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load_pd (double const *__P)
149 {
150 assert(((unsigned long)__P & 0xfUL) == 0UL);
151 return ((__m128d)vec_ld(0, (__v16qu*)__P));
152 }
153
154 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
155 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156 _mm_loadu_pd (double const *__P)
157 {
158 return (vec_vsx_ld(0, __P));
159 }
160
161 /* Create a vector with both elements equal to *P. */
162 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163 _mm_load1_pd (double const *__P)
164 {
165 return (vec_splats (*__P));
166 }
167
168 /* Create a vector with element 0 as *P and the rest zero. */
169 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_load_sd (double const *__P)
171 {
172 return _mm_set_sd (*__P);
173 }
174
175 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_load_pd1 (double const *__P)
177 {
178 return _mm_load1_pd (__P);
179 }
180
181 /* Load two DPFP values in reverse order. The address must be aligned. */
182 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_loadr_pd (double const *__P)
184 {
185 __v2df __tmp = _mm_load_pd (__P);
186 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
187 }
188
189 /* Store two DPFP values. The address must be 16-byte aligned. */
190 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_store_pd (double *__P, __m128d __A)
192 {
193 assert(((unsigned long)__P & 0xfUL) == 0UL);
194 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
195 }
196
197 /* Store two DPFP values. The address need not be 16-byte aligned. */
198 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 _mm_storeu_pd (double *__P, __m128d __A)
200 {
201 *(__m128d *)__P = __A;
202 }
203
204 /* Stores the lower DPFP value. */
205 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
206 _mm_store_sd (double *__P, __m128d __A)
207 {
208 *__P = ((__v2df)__A)[0];
209 }
210
211 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_cvtsd_f64 (__m128d __A)
213 {
214 return ((__v2df)__A)[0];
215 }
216
217 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
218 _mm_storel_pd (double *__P, __m128d __A)
219 {
220 _mm_store_sd (__P, __A);
221 }
222
223 /* Stores the upper DPFP value. */
224 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
225 _mm_storeh_pd (double *__P, __m128d __A)
226 {
227 *__P = ((__v2df)__A)[1];
228 }
229 /* Store the lower DPFP value across two words.
230 The address must be 16-byte aligned. */
231 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
232 _mm_store1_pd (double *__P, __m128d __A)
233 {
234 _mm_store_pd (__P, vec_splat (__A, 0));
235 }
236
237 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_store_pd1 (double *__P, __m128d __A)
239 {
240 _mm_store1_pd (__P, __A);
241 }
242
243 /* Store two DPFP values in reverse order. The address must be aligned. */
244 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_storer_pd (double *__P, __m128d __A)
246 {
247 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
248 }
249
250 /* Intel intrinsic. */
251 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252 _mm_cvtsi128_si64 (__m128i __A)
253 {
254 return ((__v2di)__A)[0];
255 }
256
257 /* Microsoft intrinsic. */
258 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259 _mm_cvtsi128_si64x (__m128i __A)
260 {
261 return ((__v2di)__A)[0];
262 }
263
264 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_add_pd (__m128d __A, __m128d __B)
266 {
267 return (__m128d) ((__v2df)__A + (__v2df)__B);
268 }
269
270 /* Add the lower double-precision (64-bit) floating-point element in
271 a and b, store the result in the lower element of dst, and copy
272 the upper element from a to the upper element of dst. */
273 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
274 _mm_add_sd (__m128d __A, __m128d __B)
275 {
276 __A[0] = __A[0] + __B[0];
277 return (__A);
278 }
279
280 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _mm_sub_pd (__m128d __A, __m128d __B)
282 {
283 return (__m128d) ((__v2df)__A - (__v2df)__B);
284 }
285
286 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm_sub_sd (__m128d __A, __m128d __B)
288 {
289 __A[0] = __A[0] - __B[0];
290 return (__A);
291 }
292
293 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294 _mm_mul_pd (__m128d __A, __m128d __B)
295 {
296 return (__m128d) ((__v2df)__A * (__v2df)__B);
297 }
298
299 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 _mm_mul_sd (__m128d __A, __m128d __B)
301 {
302 __A[0] = __A[0] * __B[0];
303 return (__A);
304 }
305
306 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307 _mm_div_pd (__m128d __A, __m128d __B)
308 {
309 return (__m128d) ((__v2df)__A / (__v2df)__B);
310 }
311
312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313 _mm_div_sd (__m128d __A, __m128d __B)
314 {
315 __A[0] = __A[0] / __B[0];
316 return (__A);
317 }
318
319 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
320 _mm_sqrt_pd (__m128d __A)
321 {
322 return (vec_sqrt (__A));
323 }
324
325 /* Return pair {sqrt (B[0]), A[1]}. */
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_sqrt_sd (__m128d __A, __m128d __B)
328 {
329 __v2df c;
330 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
331 return (__m128d) _mm_setr_pd (c[0], __A[1]);
332 }
333
334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_min_pd (__m128d __A, __m128d __B)
336 {
337 return (vec_min (__A, __B));
338 }
339
340 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_min_sd (__m128d __A, __m128d __B)
342 {
343 __v2df a, b, c;
344 a = vec_splats (__A[0]);
345 b = vec_splats (__B[0]);
346 c = vec_min (a, b);
347 return (__m128d) _mm_setr_pd (c[0], __A[1]);
348 }
349
350 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_max_pd (__m128d __A, __m128d __B)
352 {
353 return (vec_max (__A, __B));
354 }
355
356 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_max_sd (__m128d __A, __m128d __B)
358 {
359 __v2df a, b, c;
360 a = vec_splats (__A[0]);
361 b = vec_splats (__B[0]);
362 c = vec_max (a, b);
363 return (__m128d) _mm_setr_pd (c[0], __A[1]);
364 }
365
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpeq_pd (__m128d __A, __m128d __B)
368 {
369 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
370 }
371
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmplt_pd (__m128d __A, __m128d __B)
374 {
375 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
376 }
377
378 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_cmple_pd (__m128d __A, __m128d __B)
380 {
381 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
382 }
383
384 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_cmpgt_pd (__m128d __A, __m128d __B)
386 {
387 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
388 }
389
390 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
391 _mm_cmpge_pd (__m128d __A, __m128d __B)
392 {
393 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
394 }
395
396 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
397 _mm_cmpneq_pd (__m128d __A, __m128d __B)
398 {
399 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
400 return ((__m128d)vec_nor (temp, temp));
401 }
402
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
405 {
406 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
407 }
408
409 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410 _mm_cmpnle_pd (__m128d __A, __m128d __B)
411 {
412 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
413 }
414
415 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
416 _mm_cmpngt_pd (__m128d __A, __m128d __B)
417 {
418 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
419 }
420
421 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_cmpnge_pd (__m128d __A, __m128d __B)
423 {
424 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
425 }
426
427 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428 _mm_cmpord_pd (__m128d __A, __m128d __B)
429 {
430 #if _ARCH_PWR8
431 __v2du c, d;
432 /* Compare against self will return false (0's) if NAN. */
433 c = (__v2du)vec_cmpeq (__A, __A);
434 d = (__v2du)vec_cmpeq (__B, __B);
435 #else
436 __v2du a, b;
437 __v2du c, d;
438 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
439 a = (__v2du)vec_abs ((__v2df)__A);
440 b = (__v2du)vec_abs ((__v2df)__B);
441 c = (__v2du)vec_cmpgt (double_exp_mask, a);
442 d = (__v2du)vec_cmpgt (double_exp_mask, b);
443 #endif
444 /* A != NAN and B != NAN. */
445 return ((__m128d)vec_and(c, d));
446 }
447
448 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpunord_pd (__m128d __A, __m128d __B)
450 {
451 #if _ARCH_PWR8
452 __v2du c, d;
453 /* Compare against self will return false (0's) if NAN. */
454 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
455 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
456 /* A == NAN OR B == NAN converts to:
457 NOT(A != NAN) OR NOT(B != NAN). */
458 c = vec_nor (c, c);
459 return ((__m128d)vec_orc(c, d));
460 #else
461 __v2du c, d;
462 /* Compare against self will return false (0's) if NAN. */
463 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
464 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
465 /* Invert so that true ('1's) marks a NAN. */
466 c = vec_nor (c, c);
467 d = vec_nor (d, d);
468 return ((__m128d)vec_or(c, d));
469 #endif
470 }
471
472 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmpeq_sd(__m128d __A, __m128d __B)
474 {
475 __v2df a, b, c;
476 /* PowerISA VSX does not allow partial (for just lower double)
477 results. So to ensure we don't generate spurious exceptions
478 (from the upper double values) we splat the lower double
479 before we do the operation. */
480 a = vec_splats (__A[0]);
481 b = vec_splats (__B[0]);
482 c = (__v2df) vec_cmpeq(a, b);
483 /* Then we merge the lower double result with the original upper
484 double from __A. */
485 return (__m128d) _mm_setr_pd (c[0], __A[1]);
486 }
487
488 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_cmplt_sd (__m128d __A, __m128d __B)
490 {
491 __v2df a, b, c;
492 a = vec_splats (__A[0]);
493 b = vec_splats (__B[0]);
494 c = (__v2df) vec_cmplt(a, b);
495 return (__m128d) _mm_setr_pd (c[0], __A[1]);
496 }
497
498 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
499 _mm_cmple_sd (__m128d __A, __m128d __B)
500 {
501 __v2df a, b, c;
502 a = vec_splats (__A[0]);
503 b = vec_splats (__B[0]);
504 c = (__v2df) vec_cmple(a, b);
505 return (__m128d) _mm_setr_pd (c[0], __A[1]);
506 }
507
508 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
509 _mm_cmpgt_sd (__m128d __A, __m128d __B)
510 {
511 __v2df a, b, c;
512 a = vec_splats (__A[0]);
513 b = vec_splats (__B[0]);
514 c = (__v2df) vec_cmpgt(a, b);
515 return (__m128d) _mm_setr_pd (c[0], __A[1]);
516 }
517
518 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpge_sd (__m128d __A, __m128d __B)
520 {
521 __v2df a, b, c;
522 a = vec_splats (__A[0]);
523 b = vec_splats (__B[0]);
524 c = (__v2df) vec_cmpge(a, b);
525 return (__m128d) _mm_setr_pd (c[0], __A[1]);
526 }
527
528 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpneq_sd (__m128d __A, __m128d __B)
530 {
531 __v2df a, b, c;
532 a = vec_splats (__A[0]);
533 b = vec_splats (__B[0]);
534 c = (__v2df) vec_cmpeq(a, b);
535 c = vec_nor (c, c);
536 return (__m128d) _mm_setr_pd (c[0], __A[1]);
537 }
538
539 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
541 {
542 __v2df a, b, c;
543 a = vec_splats (__A[0]);
544 b = vec_splats (__B[0]);
545 /* Not less than is just greater than or equal. */
546 c = (__v2df) vec_cmpge(a, b);
547 return (__m128d) _mm_setr_pd (c[0], __A[1]);
548 }
549
550 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 _mm_cmpnle_sd (__m128d __A, __m128d __B)
552 {
553 __v2df a, b, c;
554 a = vec_splats (__A[0]);
555 b = vec_splats (__B[0]);
556 /* Not less than or equal is just greater than. */
557 c = (__v2df) vec_cmpgt(a, b);
558 return (__m128d) _mm_setr_pd (c[0], __A[1]);
559 }
560
561 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpngt_sd (__m128d __A, __m128d __B)
563 {
564 __v2df a, b, c;
565 a = vec_splats (__A[0]);
566 b = vec_splats (__B[0]);
567 /* Not greater than is just less than or equal. */
568 c = (__v2df) vec_cmple(a, b);
569 return (__m128d) _mm_setr_pd (c[0], __A[1]);
570 }
571
572 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573 _mm_cmpnge_sd (__m128d __A, __m128d __B)
574 {
575 __v2df a, b, c;
576 a = vec_splats (__A[0]);
577 b = vec_splats (__B[0]);
578 /* Not greater than or equal is just less than. */
579 c = (__v2df) vec_cmplt(a, b);
580 return (__m128d) _mm_setr_pd (c[0], __A[1]);
581 }
582
583 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
584 _mm_cmpord_sd (__m128d __A, __m128d __B)
585 {
586 __v2df r;
587 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
588 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
589 }
590
591 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmpunord_sd (__m128d __A, __m128d __B)
593 {
594 __v2df r;
595 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
596 return (__m128d) _mm_setr_pd (r[0], __A[1]);
597 }
598
599 /* FIXME
600 The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
601 exactly the same because GCC for PowerPC only generates unordered
602 compares (scalar and vector).
603 Technically _mm_comieq_sd et al. should be using the ordered
604 compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
605 be OK (see the illustrative note just below). */
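/* Illustrative note (hypothetical example, not part of this header):
   on x86 the ordered (comi) compares raise the invalid-operation
   exception for QNaN operands, while the unordered (ucomi) compares
   stay quiet:

     double qnan = __builtin_nan ("");
     __m128d x = _mm_set_sd (qnan);
     (void) _mm_comieq_sd (x, x);   // x86: raises FE_INVALID
     (void) _mm_ucomieq_sd (x, x);  // x86: no exception for QNaN

   With the implementations below, neither call raises FE_INVALID on
   PowerPC; both behave like the unordered forms.  */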
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comieq_sd (__m128d __A, __m128d __B)
608 {
609 return (__A[0] == __B[0]);
610 }
611
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comilt_sd (__m128d __A, __m128d __B)
614 {
615 return (__A[0] < __B[0]);
616 }
617
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_comile_sd (__m128d __A, __m128d __B)
620 {
621 return (__A[0] <= __B[0]);
622 }
623
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_comigt_sd (__m128d __A, __m128d __B)
626 {
627 return (__A[0] > __B[0]);
628 }
629
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_comige_sd (__m128d __A, __m128d __B)
632 {
633 return (__A[0] >= __B[0]);
634 }
635
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_comineq_sd (__m128d __A, __m128d __B)
638 {
639 return (__A[0] != __B[0]);
640 }
641
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomieq_sd (__m128d __A, __m128d __B)
644 {
645 return (__A[0] == __B[0]);
646 }
647
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomilt_sd (__m128d __A, __m128d __B)
650 {
651 return (__A[0] < __B[0]);
652 }
653
654 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655 _mm_ucomile_sd (__m128d __A, __m128d __B)
656 {
657 return (__A[0] <= __B[0]);
658 }
659
660 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _mm_ucomigt_sd (__m128d __A, __m128d __B)
662 {
663 return (__A[0] > __B[0]);
664 }
665
666 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_ucomige_sd (__m128d __A, __m128d __B)
668 {
669 return (__A[0] >= __B[0]);
670 }
671
672 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673 _mm_ucomineq_sd (__m128d __A, __m128d __B)
674 {
675 return (__A[0] != __B[0]);
676 }
677
678 /* Create a vector of Qi, where i is the element number. */
679 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi64x (long long __q1, long long __q0)
681 {
682 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
683 }
684
685 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686 _mm_set_epi64 (__m64 __q1, __m64 __q0)
687 {
688 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
689 }
690
691 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
693 {
694 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
695 }
696
697 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
699 short __q3, short __q2, short __q1, short __q0)
700 {
701 return __extension__ (__m128i)(__v8hi){
702 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
703 }
704
705 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
706 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
707 char __q11, char __q10, char __q09, char __q08,
708 char __q07, char __q06, char __q05, char __q04,
709 char __q03, char __q02, char __q01, char __q00)
710 {
711 return __extension__ (__m128i)(__v16qi){
712 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
713 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
714 };
715 }
716
717 /* Set all of the elements of the vector to A. */
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi64x (long long __A)
720 {
721 return _mm_set_epi64x (__A, __A);
722 }
723
724 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_set1_epi64 (__m64 __A)
726 {
727 return _mm_set_epi64 (__A, __A);
728 }
729
730 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_set1_epi32 (int __A)
732 {
733 return _mm_set_epi32 (__A, __A, __A, __A);
734 }
735
736 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_set1_epi16 (short __A)
738 {
739 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
740 }
741
742 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 _mm_set1_epi8 (char __A)
744 {
745 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
746 __A, __A, __A, __A, __A, __A, __A, __A);
747 }
748
749 /* Create a vector of Qi, where i is the element number.
750 The parameter order is reversed from the _mm_set_epi* functions. */
751 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
753 {
754 return _mm_set_epi64 (__q1, __q0);
755 }
756
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
759 {
760 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
761 }
762
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
765 short __q4, short __q5, short __q6, short __q7)
766 {
767 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
768 }
769
770 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
772 char __q04, char __q05, char __q06, char __q07,
773 char __q08, char __q09, char __q10, char __q11,
774 char __q12, char __q13, char __q14, char __q15)
775 {
776 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
777 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
778 }
779
780 /* Load 128 bits of integer data. The address must be 16-byte aligned. */
781 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_load_si128 (__m128i const *__P)
783 {
784 return *__P;
785 }
786
787 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_loadu_si128 (__m128i_u const *__P)
789 {
790 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
791 }
792
793 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_loadl_epi64 (__m128i_u const *__P)
795 {
796 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
797 }
798
799 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_store_si128 (__m128i *__P, __m128i __B)
801 {
802 assert(((unsigned long )__P & 0xfUL) == 0UL);
803 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
804 }
805
806 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
808 {
809 *__P = __B;
810 }
811
812 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
814 {
815 *(long long *)__P = ((__v2di)__B)[0];
816 }
817
818 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819 _mm_movepi64_pi64 (__m128i_u __B)
820 {
821 return (__m64) ((__v2di)__B)[0];
822 }
823
824 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_movpi64_epi64 (__m64 __A)
826 {
827 return _mm_set_epi64 ((__m64)0LL, __A);
828 }
829
830 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
831 _mm_move_epi64 (__m128i __A)
832 {
833 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
834 }
835
836 /* Create an undefined vector. */
837 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_undefined_si128 (void)
839 {
840 __m128i __Y = __Y;
841 return __Y;
842 }
843
844 /* Create a vector of zeros. */
845 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_setzero_si128 (void)
847 {
848 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
849 }
850
851 #ifdef _ARCH_PWR8
852 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_cvtepi32_pd (__m128i __A)
854 {
855 __v2di val;
856 /* For LE we need to generate Vector Unpack Low Signed Word,
857 which is generated from vec_unpackh. */
858 val = (__v2di)vec_unpackh ((__v4si)__A);
859
860 return (__m128d)vec_ctf (val, 0);
861 }
862 #endif
863
864 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_cvtepi32_ps (__m128i __A)
866 {
867 return ((__m128)vec_ctf((__v4si)__A, 0));
868 }
869
870 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cvtpd_epi32 (__m128d __A)
872 {
873 __v2df rounded = vec_rint (__A);
874 __v4si result, temp;
875 const __v4si vzero =
876 { 0, 0, 0, 0 };
877
878 /* VSX Vector truncate Double-Precision to integer and Convert to
879 Signed Integer Word format with Saturate. */
880 __asm__(
881 "xvcvdpsxws %x0,%x1"
882 : "=wa" (temp)
883 : "wa" (rounded)
884 : );
885
886 #ifdef _ARCH_PWR8
887 temp = vec_mergeo (temp, temp);
888 result = (__v4si)vec_vpkudum ((vector long)temp, (vector long)vzero);
889 #else
890 {
891 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
892 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
893 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
894 }
895 #endif
896 return (__m128i) result;
897 }
898
899 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
900 _mm_cvtpd_pi32 (__m128d __A)
901 {
902 __m128i result = _mm_cvtpd_epi32(__A);
903
904 return (__m64) result[0];
905 }
906
907 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
908 _mm_cvtpd_ps (__m128d __A)
909 {
910 __v4sf result;
911 __v4si temp;
912 const __v4si vzero = { 0, 0, 0, 0 };
913
914 __asm__(
915 "xvcvdpsp %x0,%x1"
916 : "=wa" (temp)
917 : "wa" (__A)
918 : );
919
920 #ifdef _ARCH_PWR8
921 temp = vec_mergeo (temp, temp);
922 result = (__v4sf)vec_vpkudum ((vector long)temp, (vector long)vzero);
923 #else
924 {
925 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
927 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
928 }
929 #endif
930 return ((__m128)result);
931 }
932
933 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934 _mm_cvttpd_epi32 (__m128d __A)
935 {
936 __v4si result;
937 __v4si temp;
938 const __v4si vzero = { 0, 0, 0, 0 };
939
940 /* VSX Vector truncate Double-Precision to integer and Convert to
941 Signed Integer Word format with Saturate. */
942 __asm__(
943 "xvcvdpsxws %x0,%x1"
944 : "=wa" (temp)
945 : "wa" (__A)
946 : );
947
948 #ifdef _ARCH_PWR8
949 temp = vec_mergeo (temp, temp);
950 result = (__v4si)vec_vpkudum ((vector long)temp, (vector long)vzero);
951 #else
952 {
953 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
954 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
955 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
956 }
957 #endif
958
959 return ((__m128i) result);
960 }
961
962 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvttpd_pi32 (__m128d __A)
964 {
965 __m128i result = _mm_cvttpd_epi32 (__A);
966
967 return (__m64) result[0];
968 }
969
970 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971 _mm_cvtsi128_si32 (__m128i __A)
972 {
973 return ((__v4si)__A)[0];
974 }
975
976 #ifdef _ARCH_PWR8
977 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978 _mm_cvtpi32_pd (__m64 __A)
979 {
980 __v4si temp;
981 __v2di tmp2;
982 __v2df result;
983
984 temp = (__v4si)vec_splats (__A);
985 tmp2 = (__v2di)vec_unpackl (temp);
986 result = vec_ctf ((__vector signed long)tmp2, 0);
987 return (__m128d)result;
988 }
989 #endif
990
991 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_epi32 (__m128 __A)
993 {
994 __v4sf rounded;
995 __v4si result;
996
997 rounded = vec_rint((__v4sf) __A);
998 result = vec_cts (rounded, 0);
999 return (__m128i) result;
1000 }
1001
1002 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_cvttps_epi32 (__m128 __A)
1004 {
1005 __v4si result;
1006
1007 result = vec_cts ((__v4sf) __A, 0);
1008 return (__m128i) result;
1009 }
1010
1011 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm_cvtps_pd (__m128 __A)
1013 {
1014 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1015 #ifdef vec_doubleh
1016 return (__m128d) vec_doubleh ((__v4sf)__A);
1017 #else
1018 /* Otherwise the compiler is not current and so we need to generate
1019 the equivalent code. */
1020 __v4sf a = (__v4sf)__A;
1021 __v4sf temp;
1022 __v2df result;
1023 #ifdef __LITTLE_ENDIAN__
1024 /* The input float values are in elements {[0], [1]} but the convert
1025 instruction needs them in elements {[1], [3]}, so we use two
1026 shift left double vector word immediates to get the elements
1027 lined up. */
1028 temp = __builtin_vsx_xxsldwi (a, a, 3);
1029 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1030 #elif __BIG_ENDIAN__
1031 /* The input float values are in elements {[0], [1]} but the convert
1032 instruction needs them in elements {[0], [2]}, so we use a
1033 merge-high word operation (vec_vmrghw) to get the elements
1034 lined up. */
1035 temp = vec_vmrghw (a, a);
1036 #endif
1037 __asm__(
1038 " xvcvspdp %x0,%x1"
1039 : "=wa" (result)
1040 : "wa" (temp)
1041 : );
1042 return (__m128d) result;
1043 #endif
1044 }
1045
1046 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _mm_cvtsd_si32 (__m128d __A)
1048 {
1049 __v2df rounded = vec_rint((__v2df) __A);
1050 int result = ((__v2df)rounded)[0];
1051
1052 return result;
1053 }
1054 /* Intel intrinsic. */
1055 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvtsd_si64 (__m128d __A)
1057 {
1058 __v2df rounded = vec_rint ((__v2df) __A );
1059 long long result = ((__v2df) rounded)[0];
1060
1061 return result;
1062 }
1063
1064 /* Microsoft intrinsic. */
1065 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_si64x (__m128d __A)
1067 {
1068 return _mm_cvtsd_si64 ((__v2df)__A);
1069 }
1070
1071 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvttsd_si32 (__m128d __A)
1073 {
1074 int result = ((__v2df)__A)[0];
1075
1076 return result;
1077 }
1078
1079 /* Intel intrinsic. */
1080 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvttsd_si64 (__m128d __A)
1082 {
1083 long long result = ((__v2df)__A)[0];
1084
1085 return result;
1086 }
1087
1088 /* Microsoft intrinsic. */
1089 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_cvttsd_si64x (__m128d __A)
1091 {
1092 return _mm_cvttsd_si64 (__A);
1093 }
1094
1095 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1097 {
1098 __v4sf result = (__v4sf)__A;
1099
1100 #ifdef __LITTLE_ENDIAN__
1101 __v4sf temp_s;
1102 /* Copy double element[0] to element [1] for conversion. */
1103 __v2df temp_b = vec_splat((__v2df)__B, 0);
1104
1105 /* Pre-rotate __A left 3 (logically right 1) elements. */
1106 result = __builtin_vsx_xxsldwi (result, result, 3);
1107 /* Convert double to single float scalar in a vector. */
1108 __asm__(
1109 "xscvdpsp %x0,%x1"
1110 : "=wa" (temp_s)
1111 : "wa" (temp_b)
1112 : );
1113 /* Shift the resulting scalar into vector element [0]. */
1114 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1115 #else
1116 result [0] = ((__v2df)__B)[0];
1117 #endif
1118 return (__m128) result;
1119 }
1120
1121 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_cvtsi32_sd (__m128d __A, int __B)
1123 {
1124 __v2df result = (__v2df)__A;
1125 double db = __B;
1126 result [0] = db;
1127 return (__m128d)result;
1128 }
1129
1130 /* Intel intrinsic. */
1131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1132 _mm_cvtsi64_sd (__m128d __A, long long __B)
1133 {
1134 __v2df result = (__v2df)__A;
1135 double db = __B;
1136 result [0] = db;
1137 return (__m128d)result;
1138 }
1139
1140 /* Microsoft intrinsic. */
1141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1143 {
1144 return _mm_cvtsi64_sd (__A, __B);
1145 }
1146
1147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_cvtss_sd (__m128d __A, __m128 __B)
1149 {
1150 #ifdef __LITTLE_ENDIAN__
1151 /* Use splat to move element [0] into position for the convert. */
1152 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1153 __v2df res;
1154 /* Convert single float scalar to double in a vector. */
1155 __asm__(
1156 "xscvspdp %x0,%x1"
1157 : "=wa" (res)
1158 : "wa" (temp)
1159 : );
1160 return (__m128d) vec_mergel (res, (__v2df)__A);
1161 #else
1162 __v2df res = (__v2df)__A;
1163 res [0] = ((__v4sf)__B) [0];
1164 return (__m128d) res;
1165 #endif
1166 }
1167
1168 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1169 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1170 {
1171 __vector double result;
1172 const int litmsk = __mask & 0x3;
1173
1174 if (litmsk == 0)
1175 result = vec_mergeh (__A, __B);
1176 #if __GNUC__ < 6
1177 else if (litmsk == 1)
1178 result = vec_xxpermdi (__B, __A, 2);
1179 else if (litmsk == 2)
1180 result = vec_xxpermdi (__B, __A, 1);
1181 #else
1182 else if (litmsk == 1)
1183 result = vec_xxpermdi (__A, __B, 2);
1184 else if (litmsk == 2)
1185 result = vec_xxpermdi (__A, __B, 1);
1186 #endif
1187 else
1188 result = vec_mergel (__A, __B);
1189
1190 return result;
1191 }
1192
1193 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1195 {
1196 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1197 }
1198
1199 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1201 {
1202 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1203 }
1204
1205 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_loadh_pd (__m128d __A, double const *__B)
1207 {
1208 __v2df result = (__v2df)__A;
1209 result [1] = *__B;
1210 return (__m128d)result;
1211 }
1212
1213 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_loadl_pd (__m128d __A, double const *__B)
1215 {
1216 __v2df result = (__v2df)__A;
1217 result [0] = *__B;
1218 return (__m128d)result;
1219 }
1220
1221 #ifdef _ARCH_PWR8
1222 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1223
1224 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1225 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226 _mm_movemask_pd (__m128d __A)
1227 {
1228 __vector __m64 result;
1229 static const __vector unsigned int perm_mask =
1230 {
1231 #ifdef __LITTLE_ENDIAN__
1232 0x80800040, 0x80808080, 0x80808080, 0x80808080
1233 #elif __BIG_ENDIAN__
1234 0x80808080, 0x80808080, 0x80808080, 0x80800040
1235 #endif
1236 };
1237
1238 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1239 (__vector unsigned char) perm_mask);
1240
1241 #ifdef __LITTLE_ENDIAN__
1242 return result[1];
1243 #elif __BIG_ENDIAN__
1244 return result[0];
1245 #endif
1246 }
1247 #endif /* _ARCH_PWR8 */
1248
1249 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250 _mm_packs_epi16 (__m128i __A, __m128i __B)
1251 {
1252 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1253 }
1254
1255 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256 _mm_packs_epi32 (__m128i __A, __m128i __B)
1257 {
1258 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1259 }
1260
1261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_packus_epi16 (__m128i __A, __m128i __B)
1263 {
1264 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1265 }
1266
1267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1269 {
1270 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1271 }
1272
1273 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1275 {
1276 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1277 }
1278
1279 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1281 {
1282 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1283 }
1284
1285 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1287 {
1288 return (__m128i) vec_mergel ((__vector long)__A, (__vector long)__B);
1289 }
1290
1291 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1293 {
1294 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1295 }
1296
1297 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1299 {
1300 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1301 }
1302
1303 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1305 {
1306 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1307 }
1308
1309 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1311 {
1312 return (__m128i) vec_mergeh ((__vector long)__A, (__vector long)__B);
1313 }
1314
1315 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316 _mm_add_epi8 (__m128i __A, __m128i __B)
1317 {
1318 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1319 }
1320
1321 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322 _mm_add_epi16 (__m128i __A, __m128i __B)
1323 {
1324 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1325 }
1326
1327 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_add_epi32 (__m128i __A, __m128i __B)
1329 {
1330 return (__m128i) ((__v4su)__A + (__v4su)__B);
1331 }
1332
1333 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334 _mm_add_epi64 (__m128i __A, __m128i __B)
1335 {
1336 return (__m128i) ((__v2du)__A + (__v2du)__B);
1337 }
1338
1339 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340 _mm_adds_epi8 (__m128i __A, __m128i __B)
1341 {
1342 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1343 }
1344
1345 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346 _mm_adds_epi16 (__m128i __A, __m128i __B)
1347 {
1348 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1349 }
1350
1351 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_adds_epu8 (__m128i __A, __m128i __B)
1353 {
1354 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1355 }
1356
1357 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358 _mm_adds_epu16 (__m128i __A, __m128i __B)
1359 {
1360 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1361 }
1362
1363 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _mm_sub_epi8 (__m128i __A, __m128i __B)
1365 {
1366 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1367 }
1368
1369 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_sub_epi16 (__m128i __A, __m128i __B)
1371 {
1372 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1373 }
1374
1375 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm_sub_epi32 (__m128i __A, __m128i __B)
1377 {
1378 return (__m128i) ((__v4su)__A - (__v4su)__B);
1379 }
1380
1381 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_sub_epi64 (__m128i __A, __m128i __B)
1383 {
1384 return (__m128i) ((__v2du)__A - (__v2du)__B);
1385 }
1386
1387 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_subs_epi8 (__m128i __A, __m128i __B)
1389 {
1390 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1391 }
1392
1393 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_subs_epi16 (__m128i __A, __m128i __B)
1395 {
1396 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1397 }
1398
1399 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1400 _mm_subs_epu8 (__m128i __A, __m128i __B)
1401 {
1402 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1403 }
1404
1405 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406 _mm_subs_epu16 (__m128i __A, __m128i __B)
1407 {
1408 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1409 }
1410
1411 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1412 _mm_madd_epi16 (__m128i __A, __m128i __B)
1413 {
1414 __vector signed int zero = {0, 0, 0, 0};
1415
1416 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1417 }
1418
1419 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1421 {
1422 __vector signed int w0, w1;
1423
1424 __vector unsigned char xform1 = {
1425 #ifdef __LITTLE_ENDIAN__
1426 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1427 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1428 #elif __BIG_ENDIAN__
1429 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1430 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1431 #endif
1432 };
1433
1434 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1435 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1436 return (__m128i) vec_perm (w0, w1, xform1);
1437 }
1438
1439 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1441 {
1442 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1443 }
1444
1445 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1446 _mm_mul_su32 (__m64 __A, __m64 __B)
1447 {
1448 unsigned int a = __A;
1449 unsigned int b = __B;
1450
1451 return ((__m64)a * (__m64)b);
1452 }
1453
1454 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_mul_epu32 (__m128i __A, __m128i __B)
1456 {
1457 #if __GNUC__ < 8
1458 __v2du result;
1459
1460 #ifdef __LITTLE_ENDIAN__
1461 /* VMX Vector Multiply Odd Unsigned Word. */
1462 __asm__(
1463 "vmulouw %0,%1,%2"
1464 : "=v" (result)
1465 : "v" (__A), "v" (__B)
1466 : );
1467 #elif __BIG_ENDIAN__
1468 /* VMX Vector Multiply Even Unsigned Word. */
1469 __asm__(
1470 "vmuleuw %0,%1,%2"
1471 : "=v" (result)
1472 : "v" (__A), "v" (__B)
1473 : );
1474 #endif
1475 return (__m128i) result;
1476 #else
1477 #ifdef __LITTLE_ENDIAN__
1478 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1479 #elif __BIG_ENDIAN__
1480 return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
1481 #endif
1482 #endif
1483 }
1484
1485 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1486 _mm_slli_epi16 (__m128i __A, int __B)
1487 {
1488 __v8hu lshift;
1489 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1490
1491 if (__B < 16)
1492 {
1493 if (__builtin_constant_p(__B))
1494 lshift = (__v8hu) vec_splat_s16(__B);
1495 else
1496 lshift = vec_splats ((unsigned short) __B);
1497
1498 result = vec_vslh ((__v8hi) __A, lshift);
1499 }
1500
1501 return (__m128i) result;
1502 }
1503
1504 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1505 _mm_slli_epi32 (__m128i __A, int __B)
1506 {
1507 __v4su lshift;
1508 __v4si result = { 0, 0, 0, 0 };
1509
1510 if (__B < 32)
1511 {
1512 if (__builtin_constant_p(__B))
1513 lshift = (__v4su) vec_splat_s32(__B);
1514 else
1515 lshift = vec_splats ((unsigned int) __B);
1516
1517 result = vec_vslw ((__v4si) __A, lshift);
1518 }
1519
1520 return (__m128i) result;
1521 }
1522
1523 #ifdef _ARCH_PWR8
1524 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1525 _mm_slli_epi64 (__m128i __A, int __B)
1526 {
1527 __v2du lshift;
1528 __v2di result = { 0, 0 };
1529
1530 if (__B < 64)
1531 {
1532 if (__builtin_constant_p(__B))
1533 {
1534 if (__B < 32)
1535 lshift = (__v2du) vec_splat_s32(__B);
1536 else
1537 lshift = (__v2du) vec_splats((unsigned long long)__B);
1538 }
1539 else
1540 lshift = (__v2du) vec_splats ((unsigned int) __B);
1541
1542 result = vec_vsld ((__v2di) __A, lshift);
1543 }
1544
1545 return (__m128i) result;
1546 }
1547 #endif
1548
1549 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1550 _mm_srai_epi16 (__m128i __A, int __B)
1551 {
1552 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1553 __v8hi result;
1554
1555 if (__B < 16)
1556 {
1557 if (__builtin_constant_p(__B))
1558 rshift = (__v8hu) vec_splat_s16(__B);
1559 else
1560 rshift = vec_splats ((unsigned short) __B);
1561 }
1562 result = vec_vsrah ((__v8hi) __A, rshift);
1563
1564 return (__m128i) result;
1565 }
1566
1567 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1568 _mm_srai_epi32 (__m128i __A, int __B)
1569 {
1570 __v4su rshift = { 31, 31, 31, 31 };
1571 __v4si result;
1572
1573 if (__B < 32)
1574 {
1575 if (__builtin_constant_p(__B))
1576 {
1577 if (__B < 16)
1578 rshift = (__v4su) vec_splat_s32(__B);
1579 else
1580 rshift = (__v4su) vec_splats((unsigned int)__B);
1581 }
1582 else
1583 rshift = vec_splats ((unsigned int) __B);
1584 }
1585 result = vec_vsraw ((__v4si) __A, rshift);
1586
1587 return (__m128i) result;
1588 }
1589
1590 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1591 _mm_bslli_si128 (__m128i __A, const int __N)
1592 {
1593 __v16qu result;
1594 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1595
1596 if (__N < 16)
1597 result = vec_sld ((__v16qu) __A, zeros, __N);
1598 else
1599 result = zeros;
1600
1601 return (__m128i) result;
1602 }
1603
1604 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1605 _mm_bsrli_si128 (__m128i __A, const int __N)
1606 {
1607 __v16qu result;
1608 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1609
1610 if (__N < 16)
1611 if (__builtin_constant_p(__N))
1612 /* Would like to use Vector Shift Left Double by Octet
1613 Immediate here to use the immediate form and avoid
1614 load of __N * 8 value into a separate VR. */
1615 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1616 else
1617 {
1618 __v16qu shift = vec_splats((unsigned char)(__N*8));
1619 result = vec_sro ((__v16qu)__A, shift);
1620 }
1621 else
1622 result = zeros;
1623
1624 return (__m128i) result;
1625 }
1626
1627 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1628 _mm_srli_si128 (__m128i __A, const int __N)
1629 {
1630 return _mm_bsrli_si128 (__A, __N);
1631 }
1632
1633 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm_slli_si128 (__m128i __A, const int _imm5)
1635 {
1636 __v16qu result;
1637 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1638
1639 if (_imm5 < 16)
1640 #ifdef __LITTLE_ENDIAN__
1641 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1642 #elif __BIG_ENDIAN__
1643 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1644 #endif
1645 else
1646 result = zeros;
1647
1648 return (__m128i) result;
1649 }
1650
1651 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653 _mm_srli_epi16 (__m128i __A, int __B)
1654 {
1655 __v8hu rshift;
1656 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1657
1658 if (__B < 16)
1659 {
1660 if (__builtin_constant_p(__B))
1661 rshift = (__v8hu) vec_splat_s16(__B);
1662 else
1663 rshift = vec_splats ((unsigned short) __B);
1664
1665 result = vec_vsrh ((__v8hi) __A, rshift);
1666 }
1667
1668 return (__m128i) result;
1669 }
1670
1671 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_srli_epi32 (__m128i __A, int __B)
1673 {
1674 __v4su rshift;
1675 __v4si result = { 0, 0, 0, 0 };
1676
1677 if (__B < 32)
1678 {
1679 if (__builtin_constant_p(__B))
1680 {
1681 if (__B < 16)
1682 rshift = (__v4su) vec_splat_s32(__B);
1683 else
1684 rshift = (__v4su) vec_splats((unsigned int)__B);
1685 }
1686 else
1687 rshift = vec_splats ((unsigned int) __B);
1688
1689 result = vec_vsrw ((__v4si) __A, rshift);
1690 }
1691
1692 return (__m128i) result;
1693 }
1694
1695 #ifdef _ARCH_PWR8
1696 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1697 _mm_srli_epi64 (__m128i __A, int __B)
1698 {
1699 __v2du rshift;
1700 __v2di result = { 0, 0 };
1701
1702 if (__B < 64)
1703 {
1704 if (__builtin_constant_p(__B))
1705 {
1706 if (__B < 16)
1707 rshift = (__v2du) vec_splat_s32(__B);
1708 else
1709 rshift = (__v2du) vec_splats((unsigned long long)__B);
1710 }
1711 else
1712 rshift = (__v2du) vec_splats ((unsigned int) __B);
1713
1714 result = vec_vsrd ((__v2di) __A, rshift);
1715 }
1716
1717 return (__m128i) result;
1718 }
1719 #endif
1720
1721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1722 _mm_sll_epi16 (__m128i __A, __m128i __B)
1723 {
1724 __v8hu lshift, shmask;
1725 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1726 __v8hu result;
1727
1728 #ifdef __LITTLE_ENDIAN__
1729 lshift = vec_splat ((__v8hu)__B, 0);
1730 #elif __BIG_ENDIAN__
1731 lshift = vec_splat ((__v8hu)__B, 3);
1732 #endif
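/* A shift count greater than 15 must yield all zeros.  shmask is all
   ones in lanes where the count is in range and all zeros otherwise,
   so the vec_sel below either keeps the shifted lane or substitutes
   the zero lane from shmask itself.  */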
1733 shmask = lshift <= shmax;
1734 result = vec_vslh ((__v8hu) __A, lshift);
1735 result = vec_sel (shmask, result, shmask);
1736
1737 return (__m128i) result;
1738 }
1739
1740 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1741 _mm_sll_epi32 (__m128i __A, __m128i __B)
1742 {
1743 __v4su lshift, shmask;
1744 const __v4su shmax = { 32, 32, 32, 32 };
1745 __v4su result;
1746 #ifdef __LITTLE_ENDIAN__
1747 lshift = vec_splat ((__v4su)__B, 0);
1748 #elif __BIG_ENDIAN__
1749 lshift = vec_splat ((__v4su)__B, 1);
1750 #endif
1751 shmask = lshift < shmax;
1752 result = vec_vslw ((__v4su) __A, lshift);
1753 result = vec_sel (shmask, result, shmask);
1754
1755 return (__m128i) result;
1756 }
1757
1758 #ifdef _ARCH_PWR8
1759 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1760 _mm_sll_epi64 (__m128i __A, __m128i __B)
1761 {
1762 __v2du lshift, shmask;
1763 const __v2du shmax = { 64, 64 };
1764 __v2du result;
1765
1766 lshift = (__v2du) vec_splat ((__v2du)__B, 0);
1767 shmask = lshift < shmax;
1768 result = vec_vsld ((__v2du) __A, lshift);
1769 result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result,
1770 (__v2df) shmask);
1771
1772 return (__m128i) result;
1773 }
1774 #endif
1775
1776 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777 _mm_sra_epi16 (__m128i __A, __m128i __B)
1778 {
1779 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1780 __v8hu rshift;
1781 __v8hi result;
1782
1783 #ifdef __LITTLE_ENDIAN__
1784 rshift = vec_splat ((__v8hu)__B, 0);
1785 #elif __BIG_ENDIAN__
1786 rshift = vec_splat ((__v8hu)__B, 3);
1787 #endif
1788 rshift = vec_min (rshift, rshmax);
1789 result = vec_vsrah ((__v8hi) __A, rshift);
1790
1791 return (__m128i) result;
1792 }
1793
1794 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1795 _mm_sra_epi32 (__m128i __A, __m128i __B)
1796 {
1797 const __v4su rshmax = { 31, 31, 31, 31 };
1798 __v4su rshift;
1799 __v4si result;
1800
1801 #ifdef __LITTLE_ENDIAN__
1802 rshift = vec_splat ((__v4su)__B, 0);
1803 #elif __BIG_ENDIAN__
1804 rshift = vec_splat ((__v4su)__B, 1);
1805 #endif
1806 rshift = vec_min (rshift, rshmax);
1807 result = vec_vsraw ((__v4si) __A, rshift);
1808
1809 return (__m128i) result;
1810 }
1811
1812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813 _mm_srl_epi16 (__m128i __A, __m128i __B)
1814 {
1815 __v8hu rshift, shmask;
1816 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1817 __v8hu result;
1818
1819 #ifdef __LITTLE_ENDIAN__
1820 rshift = vec_splat ((__v8hu)__B, 0);
1821 #elif __BIG_ENDIAN__
1822 rshift = vec_splat ((__v8hu)__B, 3);
1823 #endif
1824 shmask = rshift <= shmax;
1825 result = vec_vsrh ((__v8hu) __A, rshift);
1826 result = vec_sel (shmask, result, shmask);
1827
1828 return (__m128i) result;
1829 }
1830
1831 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832 _mm_srl_epi32 (__m128i __A, __m128i __B)
1833 {
1834 __v4su rshift, shmask;
1835 const __v4su shmax = { 32, 32, 32, 32 };
1836 __v4su result;
1837
1838 #ifdef __LITTLE_ENDIAN__
1839 rshift = vec_splat ((__v4su)__B, 0);
1840 #elif __BIG_ENDIAN__
1841 rshift = vec_splat ((__v4su)__B, 1);
1842 #endif
1843 shmask = rshift < shmax;
1844 result = vec_vsrw ((__v4su) __A, rshift);
1845 result = vec_sel (shmask, result, shmask);
1846
1847 return (__m128i) result;
1848 }
1849
1850 #ifdef _ARCH_PWR8
1851 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1852 _mm_srl_epi64 (__m128i __A, __m128i __B)
1853 {
1854 __v2du rshift, shmask;
1855 const __v2du shmax = { 64, 64 };
1856 __v2du result;
1857
1858 rshift = (__v2du) vec_splat ((__v2du)__B, 0);
1859 shmask = rshift < shmax;
1860 result = vec_vsrd ((__v2du) __A, rshift);
1861 result = (__v2du)vec_sel ((__v2du)shmask, (__v2du)result, (__v2du)shmask);
1862
1863 return (__m128i) result;
1864 }
1865 #endif
1866
1867 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868 _mm_and_pd (__m128d __A, __m128d __B)
1869 {
1870 return (vec_and ((__v2df) __A, (__v2df) __B));
1871 }
1872
1873 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874 _mm_andnot_pd (__m128d __A, __m128d __B)
1875 {
1876 return (vec_andc ((__v2df) __B, (__v2df) __A));
1877 }
1878
1879 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880 _mm_or_pd (__m128d __A, __m128d __B)
1881 {
1882 return (vec_or ((__v2df) __A, (__v2df) __B));
1883 }
1884
1885 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886 _mm_xor_pd (__m128d __A, __m128d __B)
1887 {
1888 return (vec_xor ((__v2df) __A, (__v2df) __B));
1889 }
1890
1891 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1893 {
1894 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1895 }
1896
1897 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1899 {
1900 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1901 }
1902
1903 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1905 {
1906 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1907 }
1908
1909 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1911 {
1912 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1913 }
1914
1915 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1917 {
1918 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1919 }
1920
1921 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1923 {
1924 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1925 }
1926
1927 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1928 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1929 {
1930 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1931 }
1932
1933 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1934 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1935 {
1936 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1937 }
1938
1939 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1940 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1941 {
1942 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1943 }
1944
1945 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1946 _mm_extract_epi16 (__m128i const __A, int const __N)
1947 {
1948 return (unsigned short) ((__v8hi)__A)[__N & 7];
1949 }
1950
1951 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1952 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1953 {
1954 __v8hi result = (__v8hi)__A;
1955
1956 result [(__N & 7)] = __D;
1957
1958 return (__m128i) result;
1959 }
1960
1961 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1962 _mm_max_epi16 (__m128i __A, __m128i __B)
1963 {
1964 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1965 }
1966
1967 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1968 _mm_max_epu8 (__m128i __A, __m128i __B)
1969 {
1970 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1971 }
1972
1973 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1974 _mm_min_epi16 (__m128i __A, __m128i __B)
1975 {
1976 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1977 }
1978
1979 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1980 _mm_min_epu8 (__m128i __A, __m128i __B)
1981 {
1982 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1983 }
1984
1985
1986 #ifdef _ARCH_PWR8
1987 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1988
1989 /* Creates a 16-bit mask from the most significant bit of each of the 16 bytes in __A. */
1990 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1991 _mm_movemask_epi8 (__m128i __A)
1992 {
1993 __vector __m64 result;
1994 static const __vector unsigned char perm_mask =
1995 {
1996 #ifdef __LITTLE_ENDIAN__
1997 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1998 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
1999 #elif __BIG_ENDIAN__
2000 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
2001 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
2002 #endif
2003 };
2004
2005 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
2006 (__vector unsigned char) perm_mask);
2007
2008 #ifdef __LITTLE_ENDIAN__
2009 return result[1];
2010 #elif __BIG_ENDIAN__
2011 return result[0];
2012 #endif
2013 }
2014 #endif /* _ARCH_PWR8 */
2015
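/* Sketch of the bit gather above (comment only): each control byte of
   vec_vbpermq selects one bit of the 128-bit source by its big-endian
   bit index, and the multiples of 8 in perm_mask land on the most
   significant bit of each byte.  For example:

	__m128i v = _mm_set_epi8 (-1, 0, -1, 0, 0, 0, 0, 0,
				  0, 0, 0, 0, 0, 0, 0, -1);
	int m = _mm_movemask_epi8 (v);

   Per the SSE2 definition m is 0xa001: one mask bit per byte, with
   byte 0 in bit 0.  */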
2016 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2017 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
2018 {
2019 __v4su w0, w1;
2020 __v16qu xform1 = {
2021 #ifdef __LITTLE_ENDIAN__
2022 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2023 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2024 #elif __BIG_ENDIAN__
2025 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2026 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2027 #endif
2028 };
2029
2030 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2031 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2032 return (__m128i) vec_perm (w0, w1, xform1);
2033 }
2034
2035 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036 _mm_shufflehi_epi16 (__m128i __A, const int __mask)
2037 {
2038 unsigned long element_selector_98 = __mask & 0x03;
2039 unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2040 unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2041 unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2042 static const unsigned short permute_selectors[4] =
2043 {
2044 #ifdef __LITTLE_ENDIAN__
2045 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2046 #elif __BIG_ENDIAN__
2047 0x0607, 0x0405, 0x0203, 0x0001
2048 #endif
2049 };
2050 __v2du pmask =
2051 #ifdef __LITTLE_ENDIAN__
2052 { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
2053 #elif __BIG_ENDIAN__
2054 { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
2055 #endif
2056 __m64_union t;
2057 __v2du a, r;
2058
2059 #ifdef __LITTLE_ENDIAN__
2060 t.as_short[0] = permute_selectors[element_selector_98];
2061 t.as_short[1] = permute_selectors[element_selector_BA];
2062 t.as_short[2] = permute_selectors[element_selector_DC];
2063 t.as_short[3] = permute_selectors[element_selector_FE];
2064 #elif __BIG_ENDIAN__
2065 t.as_short[3] = permute_selectors[element_selector_98];
2066 t.as_short[2] = permute_selectors[element_selector_BA];
2067 t.as_short[1] = permute_selectors[element_selector_DC];
2068 t.as_short[0] = permute_selectors[element_selector_FE];
2069 #endif
2070 #ifdef __LITTLE_ENDIAN__
2071 pmask[1] = t.as_m64;
2072 #elif __BIG_ENDIAN__
2073 pmask[0] = t.as_m64;
2074 #endif
2075 a = (__v2du)__A;
2076 r = vec_perm (a, a, (__vector unsigned char)pmask);
2077 return (__m128i) r;
2078 }
2079
2080 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2081 _mm_shufflelo_epi16 (__m128i __A, const int __mask)
2082 {
2083 unsigned long element_selector_10 = __mask & 0x03;
2084 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2085 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2086 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2087 static const unsigned short permute_selectors[4] =
2088 {
2089 #ifdef __LITTLE_ENDIAN__
2090 0x0100, 0x0302, 0x0504, 0x0706
2091 #elif __BIG_ENDIAN__
2092 0x0e0f, 0x0c0d, 0x0a0b, 0x0809
2093 #endif
2094 };
2095 __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
2096 __m64_union t;
2097 __v2du a, r;
2098
2099 #ifdef __LITTLE_ENDIAN__
2100 t.as_short[0] = permute_selectors[element_selector_10];
2101 t.as_short[1] = permute_selectors[element_selector_32];
2102 t.as_short[2] = permute_selectors[element_selector_54];
2103 t.as_short[3] = permute_selectors[element_selector_76];
2104 #elif __BIG_ENDIAN__
2105 t.as_short[3] = permute_selectors[element_selector_10];
2106 t.as_short[2] = permute_selectors[element_selector_32];
2107 t.as_short[1] = permute_selectors[element_selector_54];
2108 t.as_short[0] = permute_selectors[element_selector_76];
2109 #endif
2110 #ifdef __LITTLE_ENDIAN__
2111 pmask[0] = t.as_m64;
2112 #elif __BIG_ENDIAN__
2113 pmask[1] = t.as_m64;
2114 #endif
2115 a = (__v2du)__A;
2116 r = vec_perm (a, a, (__vector unsigned char)pmask);
2117 return (__m128i) r;
2118 }
2119
2120 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2121 _mm_shuffle_epi32 (__m128i __A, const int __mask)
2122 {
2123 unsigned long element_selector_10 = __mask & 0x03;
2124 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2125 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2126 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2127 static const unsigned int permute_selectors[4] =
2128 {
2129 #ifdef __LITTLE_ENDIAN__
2130 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2131 #elif __BIG_ENDIAN__
2132 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
2133 #endif
2134 };
2135 __v4su t;
2136
2137 #ifdef __LITTLE_ENDIAN__
2138 t[0] = permute_selectors[element_selector_10];
2139 t[1] = permute_selectors[element_selector_32];
2140 t[2] = permute_selectors[element_selector_54] + 0x10101010;
2141 t[3] = permute_selectors[element_selector_76] + 0x10101010;
2142 #elif __BIG_ENDIAN__
2143 t[3] = permute_selectors[element_selector_10] + 0x10101010;
2144 t[2] = permute_selectors[element_selector_32] + 0x10101010;
2145 t[1] = permute_selectors[element_selector_54];
2146 t[0] = permute_selectors[element_selector_76];
2147 #endif
2148 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2149 }
2150
2151 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2152 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2153 {
2154 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2155 __v16qu mask, tmp;
2156 __m128i *p = (__m128i*)__C;
2157
2158 tmp = (__v16qu)_mm_loadu_si128(p);
2159 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2160 tmp = vec_sel (tmp, (__v16qu)__A, mask);
2161 _mm_storeu_si128 (p, (__m128i)tmp);
2162 }
2163
2164 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2165 _mm_avg_epu8 (__m128i __A, __m128i __B)
2166 {
2167 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2168 }
2169
2170 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171 _mm_avg_epu16 (__m128i __A, __m128i __B)
2172 {
2173 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2174 }
2175
2176
2177 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2178 _mm_sad_epu8 (__m128i __A, __m128i __B)
2179 {
2180 __v16qu a, b;
2181 __v16qu vmin, vmax, vabsdiff;
2182 __v4si vsum;
2183 const __v4su zero = { 0, 0, 0, 0 };
2184 __v4si result;
2185
2186 a = (__v16qu) __A;
2187 b = (__v16qu) __B;
2188 vmin = vec_min (a, b);
2189 vmax = vec_max (a, b);
2190 vabsdiff = vec_sub (vmax, vmin);
2191 /* Sum four groups of bytes into integers. */
2192 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2193 /* Sum across four integers with two integer results. */
2194 result = vec_sum2s (vsum, (__vector signed int) zero);
2195 /* Rotate the sums into the correct position. */
2196 #ifdef __LITTLE_ENDIAN__
2197 result = vec_sld (result, result, 4);
2198 #elif __BIG_ENDIAN__
2199 result = vec_sld (result, result, 6);
2200 #endif
2202 return (__m128i) result;
2203 }
2204
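/* Worked example for the steps above (comment only): |a - b| is formed
   per byte as max (a, b) - min (a, b) to stay in unsigned arithmetic,
   vec_sum4s adds each group of four byte differences into a word, and
   vec_sum2s folds those words into two sums that are then rotated into
   place.  If bytes 0-7 of __A each exceed __B by 1 and bytes 8-15 are
   equal, the SSE2 result is 8 in bits 15:0 and 0 in bits 79:64.  */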
2205 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2206 _mm_stream_si32 (int *__A, int __B)
2207 {
2208 /* Use the data cache block touch for store transient. */
2209 __asm__ (
2210 "dcbtstt 0,%0"
2211 :
2212 : "b" (__A)
2213 : "memory"
2214 );
2215 *__A = __B;
2216 }
2217
2218 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2219 _mm_stream_si64 (long long int *__A, long long int __B)
2220 {
2221 /* Use the data cache block touch for store transient. */
2222 __asm__ (
2223 " dcbtstt 0,%0"
2224 :
2225 : "b" (__A)
2226 : "memory"
2227 );
2228 *__A = __B;
2229 }
2230
2231 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2232 _mm_stream_si128 (__m128i *__A, __m128i __B)
2233 {
2234 /* Use the data cache block touch for store transient. */
2235 __asm__ (
2236 "dcbtstt 0,%0"
2237 :
2238 : "b" (__A)
2239 : "memory"
2240 );
2241 *__A = __B;
2242 }
2243
2244 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2245 _mm_stream_pd (double *__A, __m128d __B)
2246 {
2247 /* Use the data cache block touch for store transient. */
2248 __asm__ (
2249 "dcbtstt 0,%0"
2250 :
2251 : "b" (__A)
2252 : "memory"
2253 );
2254 *(__m128d*)__A = __B;
2255 }
2256
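/* Usage sketch for the four stream stores above (comment only): the
   dcbtstt hint marks the target block as transient before the plain
   store, approximating the x86 non-temporal stores.  A streaming fill
   might pair them with a fence, as on x86:

	void fill (int *dst, int n, int v)
	{
	  for (int i = 0; i < n; i++)
	    _mm_stream_si32 (&dst[i], v);
	  _mm_mfence ();	// make the stores visible before reuse
	}
 */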
2257 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2258 _mm_clflush (void const *__A)
2259 {
2260 /* Use the data cache block flush. */
2261 __asm__ (
2262 "dcbf 0,%0"
2263 :
2264 : "b" (__A)
2265 : "memory"
2266 );
2267 }
2268
2269 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2270 _mm_lfence (void)
2271 {
2272 /* Use light weight sync for load to load ordering. */
2273 __atomic_thread_fence (__ATOMIC_RELEASE);
2274 }
2275
2276 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2277 _mm_mfence (void)
2278 {
2279 /* Use heavy weight sync for any to any ordering. */
2280 __atomic_thread_fence (__ATOMIC_SEQ_CST);
2281 }
2282
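/* Note (comment only): on POWER GCC typically expands the release fence
   above to lwsync and the sequentially consistent fence to sync (hwsync),
   matching the light weight / heavy weight comments.  A producer can use
   the full barrier to publish data:

	extern int data, ready;
	data = 42;
	_mm_mfence ();		// the flag store below cannot be reordered
				// ahead of the data store
	ready = 1;
 */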
2283 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2284 _mm_cvtsi32_si128 (int __A)
2285 {
2286 return _mm_set_epi32 (0, 0, 0, __A);
2287 }
2288
2289 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2290 _mm_cvtsi64_si128 (long long __A)
2291 {
2292 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2293 }
2294
2295 /* Microsoft intrinsic. */
2296 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2297 _mm_cvtsi64x_si128 (long long __A)
2298 {
2299 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2300 }
2301
2302 /* Casts between various SP, DP, INT vector types. Note that these do no
2303 conversion of values, they just change the type. */
2304 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2305 _mm_castpd_ps(__m128d __A)
2306 {
2307 return (__m128) __A;
2308 }
2309
2310 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2311 _mm_castpd_si128(__m128d __A)
2312 {
2313 return (__m128i) __A;
2314 }
2315
2316 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2317 _mm_castps_pd(__m128 __A)
2318 {
2319 return (__m128d) __A;
2320 }
2321
2322 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2323 _mm_castps_si128(__m128 __A)
2324 {
2325 return (__m128i) __A;
2326 }
2327
2328 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2329 _mm_castsi128_ps(__m128i __A)
2330 {
2331 return (__m128) __A;
2332 }
2333
2334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2335 _mm_castsi128_pd(__m128i __A)
2336 {
2337 return (__m128d) __A;
2338 }
2339
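/* Illustrative contrast (comment only): the casts above only relabel the
   128-bit value, while the _mm_cvt* intrinsics convert element values:

	__m128i i = _mm_set_epi32 (0, 0, 0, 1);
	__m128  a = _mm_castsi128_ps (i);	// same bits: element 0 is the
						// denormal 0x00000001, not 1.0f
	__m128  b = _mm_cvtepi32_ps (i);	// converted: element 0 is 1.0f
 */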
2340 #endif /* EMMINTRIN_H_ */