]>
git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/rs6000/mmintrin.h
1 /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
37 target does not support a native __vector_size__ (8) type. Instead
38 we typedef __m64 to a 64-bit unsigned long long, which is natively
39 supported in 64-bit mode. This works well for the _si64 and some
40 _pi32 operations, but starts to generate long sequences for _pi16
41 and _pi8 operations. For those cases it is better (faster and
42 smaller code) to transfer __m64 data to the PowerPC vector 128-bit
43 unit, perform the operation, and then transfer the result back to
44 the __m64 type. This implies that the direct register move
45 instructions, introduced with power8, are available for efficient
46 implementation of these transfers.
48 Most MMX intrinsic operations can be performed efficiently as
49 C language 64-bit scalar operation or optimized to use the newer
50 128-bit SSE/Altivec operations. We recommend this for new
52 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #ifndef _MMINTRIN_H_INCLUDED
56 #define _MMINTRIN_H_INCLUDED
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  __m64 is carried in a 64-bit
   GPR rather than a native 8-byte vector on PowerPC.  */
typedef __attribute__ ((__aligned__ (8), __may_alias__)) unsigned long long __m64;
64 typedef __attribute__ ((__aligned__ (8)))
69 signed char as_signed_char
[8];
72 long long as_long_long
;
77 /* Empty the multimedia state. */
78 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
81 /* nothing to do on PowerPC. */
84 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
87 /* nothing to do on PowerPC. */
90 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
91 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
92 _mm_cvtsi32_si64 (int __i
)
94 return (__m64
) (unsigned int) __i
;
97 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
100 return _mm_cvtsi32_si64 (__i
);
103 /* Convert the lower 32 bits of the __m64 object into an integer. */
104 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
105 _mm_cvtsi64_si32 (__m64 __i
)
110 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
111 _m_to_int (__m64 __i
)
113 return _mm_cvtsi64_si32 (__i
);
116 /* Convert I to a __m64 object. */
118 /* Intel intrinsic. */
119 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
120 _m_from_int64 (long long __i
)
125 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
126 _mm_cvtsi64_m64 (long long __i
)
131 /* Microsoft intrinsic. */
132 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
133 _mm_cvtsi64x_si64 (long long __i
)
138 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
139 _mm_set_pi64x (long long __i
)
144 /* Convert the __m64 object to a 64bit integer. */
146 /* Intel intrinsic. */
147 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
148 _m_to_int64 (__m64 __i
)
150 return (long long)__i
;
153 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
154 _mm_cvtm64_si64 (__m64 __i
)
156 return (long long) __i
;
159 /* Microsoft intrinsic. */
160 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
161 _mm_cvtsi64_si64x (__m64 __i
)
163 return (long long) __i
;
167 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
168 the result, and the four 16-bit values from M2 into the upper four 8-bit
169 values of the result, all with signed saturation. */
170 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
171 _mm_packs_pi16 (__m64 __m1
, __m64 __m2
)
173 __vector
signed short __vm1
;
174 __vector
signed char __vresult
;
176 __vm1
= (__vector
signed short) (__vector
unsigned long long)
177 #ifdef __LITTLE_ENDIAN__
182 __vresult
= vec_packs (__vm1
, __vm1
);
183 return (__m64
) ((__vector
long long) __vresult
)[0];
186 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
187 _m_packsswb (__m64 __m1
, __m64 __m2
)
189 return _mm_packs_pi16 (__m1
, __m2
);
192 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
193 the result, and the two 32-bit values from M2 into the upper two 16-bit
194 values of the result, all with signed saturation. */
195 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
196 _mm_packs_pi32 (__m64 __m1
, __m64 __m2
)
198 __vector
signed int __vm1
;
199 __vector
signed short __vresult
;
201 __vm1
= (__vector
signed int) (__vector
unsigned long long)
202 #ifdef __LITTLE_ENDIAN__
207 __vresult
= vec_packs (__vm1
, __vm1
);
208 return (__m64
) ((__vector
long long) __vresult
)[0];
211 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
212 _m_packssdw (__m64 __m1
, __m64 __m2
)
214 return _mm_packs_pi32 (__m1
, __m2
);
217 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
218 the result, and the four 16-bit values from M2 into the upper four 8-bit
219 values of the result, all with unsigned saturation. */
220 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
221 _mm_packs_pu16 (__m64 __m1
, __m64 __m2
)
223 __vector
unsigned char __r
;
224 __vector
signed short __vm1
= (__vector
signed short) (__vector
long long)
225 #ifdef __LITTLE_ENDIAN__
230 const __vector
signed short __zero
= { 0 };
231 __vector __bool
short __select
= vec_cmplt (__vm1
, __zero
);
232 __r
= vec_packs ((__vector
unsigned short) __vm1
, (__vector
unsigned short) __vm1
);
233 __vector __bool
char __packsel
= vec_pack (__select
, __select
);
234 __r
= vec_sel (__r
, (const __vector
unsigned char) __zero
, __packsel
);
235 return (__m64
) ((__vector
long long) __r
)[0];
238 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
239 _m_packuswb (__m64 __m1
, __m64 __m2
)
241 return _mm_packs_pu16 (__m1
, __m2
);
243 #endif /* end ARCH_PWR8 */
245 /* Interleave the four 8-bit values from the high half of M1 with the four
246 8-bit values from the high half of M2. */
247 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
248 _mm_unpackhi_pi8 (__m64 __m1
, __m64 __m2
)
251 __vector
unsigned char __a
, __b
, __c
;
253 __a
= (__vector
unsigned char)vec_splats (__m1
);
254 __b
= (__vector
unsigned char)vec_splats (__m2
);
255 __c
= vec_mergel (__a
, __b
);
256 return (__m64
) ((__vector
long long) __c
)[1];
258 __m64_union __mu1
, __mu2
, __res
;
263 __res
.as_char
[0] = __mu1
.as_char
[4];
264 __res
.as_char
[1] = __mu2
.as_char
[4];
265 __res
.as_char
[2] = __mu1
.as_char
[5];
266 __res
.as_char
[3] = __mu2
.as_char
[5];
267 __res
.as_char
[4] = __mu1
.as_char
[6];
268 __res
.as_char
[5] = __mu2
.as_char
[6];
269 __res
.as_char
[6] = __mu1
.as_char
[7];
270 __res
.as_char
[7] = __mu2
.as_char
[7];
272 return (__m64
) __res
.as_m64
;
276 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
277 _m_punpckhbw (__m64 __m1
, __m64 __m2
)
279 return _mm_unpackhi_pi8 (__m1
, __m2
);
282 /* Interleave the two 16-bit values from the high half of M1 with the two
283 16-bit values from the high half of M2. */
284 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
285 _mm_unpackhi_pi16 (__m64 __m1
, __m64 __m2
)
287 __m64_union __mu1
, __mu2
, __res
;
292 __res
.as_short
[0] = __mu1
.as_short
[2];
293 __res
.as_short
[1] = __mu2
.as_short
[2];
294 __res
.as_short
[2] = __mu1
.as_short
[3];
295 __res
.as_short
[3] = __mu2
.as_short
[3];
297 return (__m64
) __res
.as_m64
;
300 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
301 _m_punpckhwd (__m64 __m1
, __m64 __m2
)
303 return _mm_unpackhi_pi16 (__m1
, __m2
);
305 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
306 value from the high half of M2. */
307 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
308 _mm_unpackhi_pi32 (__m64 __m1
, __m64 __m2
)
310 __m64_union __mu1
, __mu2
, __res
;
315 __res
.as_int
[0] = __mu1
.as_int
[1];
316 __res
.as_int
[1] = __mu2
.as_int
[1];
318 return (__m64
) __res
.as_m64
;
321 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
322 _m_punpckhdq (__m64 __m1
, __m64 __m2
)
324 return _mm_unpackhi_pi32 (__m1
, __m2
);
326 /* Interleave the four 8-bit values from the low half of M1 with the four
327 8-bit values from the low half of M2. */
328 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
329 _mm_unpacklo_pi8 (__m64 __m1
, __m64 __m2
)
332 __vector
unsigned char __a
, __b
, __c
;
334 __a
= (__vector
unsigned char)vec_splats (__m1
);
335 __b
= (__vector
unsigned char)vec_splats (__m2
);
336 __c
= vec_mergel (__a
, __b
);
337 return (__m64
) ((__vector
long long) __c
)[0];
339 __m64_union __mu1
, __mu2
, __res
;
344 __res
.as_char
[0] = __mu1
.as_char
[0];
345 __res
.as_char
[1] = __mu2
.as_char
[0];
346 __res
.as_char
[2] = __mu1
.as_char
[1];
347 __res
.as_char
[3] = __mu2
.as_char
[1];
348 __res
.as_char
[4] = __mu1
.as_char
[2];
349 __res
.as_char
[5] = __mu2
.as_char
[2];
350 __res
.as_char
[6] = __mu1
.as_char
[3];
351 __res
.as_char
[7] = __mu2
.as_char
[3];
353 return (__m64
) __res
.as_m64
;
357 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
358 _m_punpcklbw (__m64 __m1
, __m64 __m2
)
360 return _mm_unpacklo_pi8 (__m1
, __m2
);
362 /* Interleave the two 16-bit values from the low half of M1 with the two
363 16-bit values from the low half of M2. */
364 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
365 _mm_unpacklo_pi16 (__m64 __m1
, __m64 __m2
)
367 __m64_union __mu1
, __mu2
, __res
;
372 __res
.as_short
[0] = __mu1
.as_short
[0];
373 __res
.as_short
[1] = __mu2
.as_short
[0];
374 __res
.as_short
[2] = __mu1
.as_short
[1];
375 __res
.as_short
[3] = __mu2
.as_short
[1];
377 return (__m64
) __res
.as_m64
;
380 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
381 _m_punpcklwd (__m64 __m1
, __m64 __m2
)
383 return _mm_unpacklo_pi16 (__m1
, __m2
);
386 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
387 value from the low half of M2. */
388 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
389 _mm_unpacklo_pi32 (__m64 __m1
, __m64 __m2
)
391 __m64_union __mu1
, __mu2
, __res
;
396 __res
.as_int
[0] = __mu1
.as_int
[0];
397 __res
.as_int
[1] = __mu2
.as_int
[0];
399 return (__m64
) __res
.as_m64
;
402 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
403 _m_punpckldq (__m64 __m1
, __m64 __m2
)
405 return _mm_unpacklo_pi32 (__m1
, __m2
);
408 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
409 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
410 _mm_add_pi8 (__m64 __m1
, __m64 __m2
)
413 __vector
signed char __a
, __b
, __c
;
415 __a
= (__vector
signed char)vec_splats (__m1
);
416 __b
= (__vector
signed char)vec_splats (__m2
);
417 __c
= vec_add (__a
, __b
);
418 return (__m64
) ((__vector
long long) __c
)[0];
420 __m64_union __mu1
, __mu2
, __res
;
425 __res
.as_char
[0] = __mu1
.as_char
[0] + __mu2
.as_char
[0];
426 __res
.as_char
[1] = __mu1
.as_char
[1] + __mu2
.as_char
[1];
427 __res
.as_char
[2] = __mu1
.as_char
[2] + __mu2
.as_char
[2];
428 __res
.as_char
[3] = __mu1
.as_char
[3] + __mu2
.as_char
[3];
429 __res
.as_char
[4] = __mu1
.as_char
[4] + __mu2
.as_char
[4];
430 __res
.as_char
[5] = __mu1
.as_char
[5] + __mu2
.as_char
[5];
431 __res
.as_char
[6] = __mu1
.as_char
[6] + __mu2
.as_char
[6];
432 __res
.as_char
[7] = __mu1
.as_char
[7] + __mu2
.as_char
[7];
434 return (__m64
) __res
.as_m64
;
438 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
439 _m_paddb (__m64 __m1
, __m64 __m2
)
441 return _mm_add_pi8 (__m1
, __m2
);
444 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
445 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
446 _mm_add_pi16 (__m64 __m1
, __m64 __m2
)
449 __vector
signed short __a
, __b
, __c
;
451 __a
= (__vector
signed short)vec_splats (__m1
);
452 __b
= (__vector
signed short)vec_splats (__m2
);
453 __c
= vec_add (__a
, __b
);
454 return (__m64
) ((__vector
long long) __c
)[0];
456 __m64_union __mu1
, __mu2
, __res
;
461 __res
.as_short
[0] = __mu1
.as_short
[0] + __mu2
.as_short
[0];
462 __res
.as_short
[1] = __mu1
.as_short
[1] + __mu2
.as_short
[1];
463 __res
.as_short
[2] = __mu1
.as_short
[2] + __mu2
.as_short
[2];
464 __res
.as_short
[3] = __mu1
.as_short
[3] + __mu2
.as_short
[3];
466 return (__m64
) __res
.as_m64
;
470 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
471 _m_paddw (__m64 __m1
, __m64 __m2
)
473 return _mm_add_pi16 (__m1
, __m2
);
476 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
477 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
478 _mm_add_pi32 (__m64 __m1
, __m64 __m2
)
481 __vector
signed int __a
, __b
, __c
;
483 __a
= (__vector
signed int)vec_splats (__m1
);
484 __b
= (__vector
signed int)vec_splats (__m2
);
485 __c
= vec_add (__a
, __b
);
486 return (__m64
) ((__vector
long long) __c
)[0];
488 __m64_union __mu1
, __mu2
, __res
;
493 __res
.as_int
[0] = __mu1
.as_int
[0] + __mu2
.as_int
[0];
494 __res
.as_int
[1] = __mu1
.as_int
[1] + __mu2
.as_int
[1];
496 return (__m64
) __res
.as_m64
;
500 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
501 _m_paddd (__m64 __m1
, __m64 __m2
)
503 return _mm_add_pi32 (__m1
, __m2
);
506 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
507 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
508 _mm_sub_pi8 (__m64 __m1
, __m64 __m2
)
511 __vector
signed char __a
, __b
, __c
;
513 __a
= (__vector
signed char)vec_splats (__m1
);
514 __b
= (__vector
signed char)vec_splats (__m2
);
515 __c
= vec_sub (__a
, __b
);
516 return (__m64
) ((__vector
long long) __c
)[0];
518 __m64_union __mu1
, __mu2
, __res
;
523 __res
.as_char
[0] = __mu1
.as_char
[0] - __mu2
.as_char
[0];
524 __res
.as_char
[1] = __mu1
.as_char
[1] - __mu2
.as_char
[1];
525 __res
.as_char
[2] = __mu1
.as_char
[2] - __mu2
.as_char
[2];
526 __res
.as_char
[3] = __mu1
.as_char
[3] - __mu2
.as_char
[3];
527 __res
.as_char
[4] = __mu1
.as_char
[4] - __mu2
.as_char
[4];
528 __res
.as_char
[5] = __mu1
.as_char
[5] - __mu2
.as_char
[5];
529 __res
.as_char
[6] = __mu1
.as_char
[6] - __mu2
.as_char
[6];
530 __res
.as_char
[7] = __mu1
.as_char
[7] - __mu2
.as_char
[7];
532 return (__m64
) __res
.as_m64
;
536 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
537 _m_psubb (__m64 __m1
, __m64 __m2
)
539 return _mm_sub_pi8 (__m1
, __m2
);
542 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
543 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
544 _mm_sub_pi16 (__m64 __m1
, __m64 __m2
)
547 __vector
signed short __a
, __b
, __c
;
549 __a
= (__vector
signed short)vec_splats (__m1
);
550 __b
= (__vector
signed short)vec_splats (__m2
);
551 __c
= vec_sub (__a
, __b
);
552 return (__m64
) ((__vector
long long) __c
)[0];
554 __m64_union __mu1
, __mu2
, __res
;
559 __res
.as_short
[0] = __mu1
.as_short
[0] - __mu2
.as_short
[0];
560 __res
.as_short
[1] = __mu1
.as_short
[1] - __mu2
.as_short
[1];
561 __res
.as_short
[2] = __mu1
.as_short
[2] - __mu2
.as_short
[2];
562 __res
.as_short
[3] = __mu1
.as_short
[3] - __mu2
.as_short
[3];
564 return (__m64
) __res
.as_m64
;
568 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
569 _m_psubw (__m64 __m1
, __m64 __m2
)
571 return _mm_sub_pi16 (__m1
, __m2
);
574 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
575 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
576 _mm_sub_pi32 (__m64 __m1
, __m64 __m2
)
579 __vector
signed int __a
, __b
, __c
;
581 __a
= (__vector
signed int)vec_splats (__m1
);
582 __b
= (__vector
signed int)vec_splats (__m2
);
583 __c
= vec_sub (__a
, __b
);
584 return (__m64
) ((__vector
long long) __c
)[0];
586 __m64_union __mu1
, __mu2
, __res
;
591 __res
.as_int
[0] = __mu1
.as_int
[0] - __mu2
.as_int
[0];
592 __res
.as_int
[1] = __mu1
.as_int
[1] - __mu2
.as_int
[1];
594 return (__m64
) __res
.as_m64
;
598 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
599 _m_psubd (__m64 __m1
, __m64 __m2
)
601 return _mm_sub_pi32 (__m1
, __m2
);
604 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
605 _mm_add_si64 (__m64 __m1
, __m64 __m2
)
607 return (__m1
+ __m2
);
610 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
611 _mm_sub_si64 (__m64 __m1
, __m64 __m2
)
613 return (__m1
- __m2
);
616 /* Shift the 64-bit value in M left by COUNT. */
617 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
618 _mm_sll_si64 (__m64 __m
, __m64 __count
)
620 return (__m
<< __count
);
623 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
624 _m_psllq (__m64 __m
, __m64 __count
)
626 return _mm_sll_si64 (__m
, __count
);
629 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
630 _mm_slli_si64 (__m64 __m
, const int __count
)
632 return (__m
<< __count
);
635 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
636 _m_psllqi (__m64 __m
, const int __count
)
638 return _mm_slli_si64 (__m
, __count
);
641 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
642 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
643 _mm_srl_si64 (__m64 __m
, __m64 __count
)
645 return (__m
>> __count
);
648 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
649 _m_psrlq (__m64 __m
, __m64 __count
)
651 return _mm_srl_si64 (__m
, __count
);
654 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
655 _mm_srli_si64 (__m64 __m
, const int __count
)
657 return (__m
>> __count
);
660 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
661 _m_psrlqi (__m64 __m
, const int __count
)
663 return _mm_srli_si64 (__m
, __count
);
666 /* Bit-wise AND the 64-bit values in M1 and M2. */
667 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
668 _mm_and_si64 (__m64 __m1
, __m64 __m2
)
670 return (__m1
& __m2
);
673 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
674 _m_pand (__m64 __m1
, __m64 __m2
)
676 return _mm_and_si64 (__m1
, __m2
);
679 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
680 64-bit value in M2. */
681 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
682 _mm_andnot_si64 (__m64 __m1
, __m64 __m2
)
684 return (~__m1
& __m2
);
687 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
688 _m_pandn (__m64 __m1
, __m64 __m2
)
690 return _mm_andnot_si64 (__m1
, __m2
);
693 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
694 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
695 _mm_or_si64 (__m64 __m1
, __m64 __m2
)
697 return (__m1
| __m2
);
700 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
701 _m_por (__m64 __m1
, __m64 __m2
)
703 return _mm_or_si64 (__m1
, __m2
);
706 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
707 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
708 _mm_xor_si64 (__m64 __m1
, __m64 __m2
)
710 return (__m1
^ __m2
);
713 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
714 _m_pxor (__m64 __m1
, __m64 __m2
)
716 return _mm_xor_si64 (__m1
, __m2
);
719 /* Creates a 64-bit zero. */
720 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
721 _mm_setzero_si64 (void)
726 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
727 test is true and zero if false. */
728 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
729 _mm_cmpeq_pi8 (__m64 __m1
, __m64 __m2
)
731 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
741 __m64_union __mu1
, __mu2
, __res
;
746 __res
.as_char
[0] = (__mu1
.as_char
[0] == __mu2
.as_char
[0])? -1: 0;
747 __res
.as_char
[1] = (__mu1
.as_char
[1] == __mu2
.as_char
[1])? -1: 0;
748 __res
.as_char
[2] = (__mu1
.as_char
[2] == __mu2
.as_char
[2])? -1: 0;
749 __res
.as_char
[3] = (__mu1
.as_char
[3] == __mu2
.as_char
[3])? -1: 0;
750 __res
.as_char
[4] = (__mu1
.as_char
[4] == __mu2
.as_char
[4])? -1: 0;
751 __res
.as_char
[5] = (__mu1
.as_char
[5] == __mu2
.as_char
[5])? -1: 0;
752 __res
.as_char
[6] = (__mu1
.as_char
[6] == __mu2
.as_char
[6])? -1: 0;
753 __res
.as_char
[7] = (__mu1
.as_char
[7] == __mu2
.as_char
[7])? -1: 0;
755 return (__m64
) __res
.as_m64
;
759 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
760 _m_pcmpeqb (__m64 __m1
, __m64 __m2
)
762 return _mm_cmpeq_pi8 (__m1
, __m2
);
765 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
766 _mm_cmpgt_pi8 (__m64 __m1
, __m64 __m2
)
769 __vector
signed char __a
, __b
, __c
;
771 __a
= (__vector
signed char)vec_splats (__m1
);
772 __b
= (__vector
signed char)vec_splats (__m2
);
773 __c
= (__vector
signed char)vec_cmpgt (__a
, __b
);
774 return (__m64
) ((__vector
long long) __c
)[0];
776 __m64_union __mu1
, __mu2
, __res
;
781 __res
.as_char
[0] = (__mu1
.as_char
[0] > __mu2
.as_char
[0])? -1: 0;
782 __res
.as_char
[1] = (__mu1
.as_char
[1] > __mu2
.as_char
[1])? -1: 0;
783 __res
.as_char
[2] = (__mu1
.as_char
[2] > __mu2
.as_char
[2])? -1: 0;
784 __res
.as_char
[3] = (__mu1
.as_char
[3] > __mu2
.as_char
[3])? -1: 0;
785 __res
.as_char
[4] = (__mu1
.as_char
[4] > __mu2
.as_char
[4])? -1: 0;
786 __res
.as_char
[5] = (__mu1
.as_char
[5] > __mu2
.as_char
[5])? -1: 0;
787 __res
.as_char
[6] = (__mu1
.as_char
[6] > __mu2
.as_char
[6])? -1: 0;
788 __res
.as_char
[7] = (__mu1
.as_char
[7] > __mu2
.as_char
[7])? -1: 0;
790 return (__m64
) __res
.as_m64
;
794 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
795 _m_pcmpgtb (__m64 __m1
, __m64 __m2
)
797 return _mm_cmpgt_pi8 (__m1
, __m2
);
800 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
801 the test is true and zero if false. */
802 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
803 _mm_cmpeq_pi16 (__m64 __m1
, __m64 __m2
)
806 __vector
signed short __a
, __b
, __c
;
808 __a
= (__vector
signed short)vec_splats (__m1
);
809 __b
= (__vector
signed short)vec_splats (__m2
);
810 __c
= (__vector
signed short)vec_cmpeq (__a
, __b
);
811 return (__m64
) ((__vector
long long) __c
)[0];
813 __m64_union __mu1
, __mu2
, __res
;
818 __res
.as_short
[0] = (__mu1
.as_short
[0] == __mu2
.as_short
[0])? -1: 0;
819 __res
.as_short
[1] = (__mu1
.as_short
[1] == __mu2
.as_short
[1])? -1: 0;
820 __res
.as_short
[2] = (__mu1
.as_short
[2] == __mu2
.as_short
[2])? -1: 0;
821 __res
.as_short
[3] = (__mu1
.as_short
[3] == __mu2
.as_short
[3])? -1: 0;
823 return (__m64
) __res
.as_m64
;
827 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
828 _m_pcmpeqw (__m64 __m1
, __m64 __m2
)
830 return _mm_cmpeq_pi16 (__m1
, __m2
);
833 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
834 _mm_cmpgt_pi16 (__m64 __m1
, __m64 __m2
)
837 __vector
signed short __a
, __b
, __c
;
839 __a
= (__vector
signed short)vec_splats (__m1
);
840 __b
= (__vector
signed short)vec_splats (__m2
);
841 __c
= (__vector
signed short)vec_cmpgt (__a
, __b
);
842 return (__m64
) ((__vector
long long) __c
)[0];
844 __m64_union __mu1
, __mu2
, __res
;
849 __res
.as_short
[0] = (__mu1
.as_short
[0] > __mu2
.as_short
[0])? -1: 0;
850 __res
.as_short
[1] = (__mu1
.as_short
[1] > __mu2
.as_short
[1])? -1: 0;
851 __res
.as_short
[2] = (__mu1
.as_short
[2] > __mu2
.as_short
[2])? -1: 0;
852 __res
.as_short
[3] = (__mu1
.as_short
[3] > __mu2
.as_short
[3])? -1: 0;
854 return (__m64
) __res
.as_m64
;
858 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
859 _m_pcmpgtw (__m64 __m1
, __m64 __m2
)
861 return _mm_cmpgt_pi16 (__m1
, __m2
);
864 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
865 the test is true and zero if false. */
866 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
867 _mm_cmpeq_pi32 (__m64 __m1
, __m64 __m2
)
870 __vector
signed int __a
, __b
, __c
;
872 __a
= (__vector
signed int)vec_splats (__m1
);
873 __b
= (__vector
signed int)vec_splats (__m2
);
874 __c
= (__vector
signed int)vec_cmpeq (__a
, __b
);
875 return (__m64
) ((__vector
long long) __c
)[0];
877 __m64_union __mu1
, __mu2
, __res
;
882 __res
.as_int
[0] = (__mu1
.as_int
[0] == __mu2
.as_int
[0])? -1: 0;
883 __res
.as_int
[1] = (__mu1
.as_int
[1] == __mu2
.as_int
[1])? -1: 0;
885 return (__m64
) __res
.as_m64
;
889 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
890 _m_pcmpeqd (__m64 __m1
, __m64 __m2
)
892 return _mm_cmpeq_pi32 (__m1
, __m2
);
895 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
896 _mm_cmpgt_pi32 (__m64 __m1
, __m64 __m2
)
899 __vector
signed int __a
, __b
, __c
;
901 __a
= (__vector
signed int)vec_splats (__m1
);
902 __b
= (__vector
signed int)vec_splats (__m2
);
903 __c
= (__vector
signed int)vec_cmpgt (__a
, __b
);
904 return (__m64
) ((__vector
long long) __c
)[0];
906 __m64_union __mu1
, __mu2
, __res
;
911 __res
.as_int
[0] = (__mu1
.as_int
[0] > __mu2
.as_int
[0])? -1: 0;
912 __res
.as_int
[1] = (__mu1
.as_int
[1] > __mu2
.as_int
[1])? -1: 0;
914 return (__m64
) __res
.as_m64
;
918 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
919 _m_pcmpgtd (__m64 __m1
, __m64 __m2
)
921 return _mm_cmpgt_pi32 (__m1
, __m2
);
925 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
926 saturated arithmetic. */
927 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
928 _mm_adds_pi8 (__m64 __m1
, __m64 __m2
)
930 __vector
signed char __a
, __b
, __c
;
932 __a
= (__vector
signed char)vec_splats (__m1
);
933 __b
= (__vector
signed char)vec_splats (__m2
);
934 __c
= vec_adds (__a
, __b
);
935 return (__m64
) ((__vector
long long) __c
)[0];
938 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
939 _m_paddsb (__m64 __m1
, __m64 __m2
)
941 return _mm_adds_pi8 (__m1
, __m2
);
943 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
944 saturated arithmetic. */
945 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
946 _mm_adds_pi16 (__m64 __m1
, __m64 __m2
)
948 __vector
signed short __a
, __b
, __c
;
950 __a
= (__vector
signed short)vec_splats (__m1
);
951 __b
= (__vector
signed short)vec_splats (__m2
);
952 __c
= vec_adds (__a
, __b
);
953 return (__m64
) ((__vector
long long) __c
)[0];
956 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
957 _m_paddsw (__m64 __m1
, __m64 __m2
)
959 return _mm_adds_pi16 (__m1
, __m2
);
961 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
962 saturated arithmetic. */
963 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
964 _mm_adds_pu8 (__m64 __m1
, __m64 __m2
)
966 __vector
unsigned char __a
, __b
, __c
;
968 __a
= (__vector
unsigned char)vec_splats (__m1
);
969 __b
= (__vector
unsigned char)vec_splats (__m2
);
970 __c
= vec_adds (__a
, __b
);
971 return (__m64
) ((__vector
long long) __c
)[0];
974 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
975 _m_paddusb (__m64 __m1
, __m64 __m2
)
977 return _mm_adds_pu8 (__m1
, __m2
);
980 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
981 saturated arithmetic. */
982 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
983 _mm_adds_pu16 (__m64 __m1
, __m64 __m2
)
985 __vector
unsigned short __a
, __b
, __c
;
987 __a
= (__vector
unsigned short)vec_splats (__m1
);
988 __b
= (__vector
unsigned short)vec_splats (__m2
);
989 __c
= vec_adds (__a
, __b
);
990 return (__m64
) ((__vector
long long) __c
)[0];
993 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
994 _m_paddusw (__m64 __m1
, __m64 __m2
)
996 return _mm_adds_pu16 (__m1
, __m2
);
999 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
1000 saturating arithmetic. */
1001 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1002 _mm_subs_pi8 (__m64 __m1
, __m64 __m2
)
1004 __vector
signed char __a
, __b
, __c
;
1006 __a
= (__vector
signed char)vec_splats (__m1
);
1007 __b
= (__vector
signed char)vec_splats (__m2
);
1008 __c
= vec_subs (__a
, __b
);
1009 return (__m64
) ((__vector
long long) __c
)[0];
1012 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1013 _m_psubsb (__m64 __m1
, __m64 __m2
)
1015 return _mm_subs_pi8 (__m1
, __m2
);
1018 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1019 signed saturating arithmetic. */
1020 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1021 _mm_subs_pi16 (__m64 __m1
, __m64 __m2
)
1023 __vector
signed short __a
, __b
, __c
;
1025 __a
= (__vector
signed short)vec_splats (__m1
);
1026 __b
= (__vector
signed short)vec_splats (__m2
);
1027 __c
= vec_subs (__a
, __b
);
1028 return (__m64
) ((__vector
long long) __c
)[0];
1031 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1032 _m_psubsw (__m64 __m1
, __m64 __m2
)
1034 return _mm_subs_pi16 (__m1
, __m2
);
1037 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1038 unsigned saturating arithmetic. */
1039 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1040 _mm_subs_pu8 (__m64 __m1
, __m64 __m2
)
1042 __vector
unsigned char __a
, __b
, __c
;
1044 __a
= (__vector
unsigned char)vec_splats (__m1
);
1045 __b
= (__vector
unsigned char)vec_splats (__m2
);
1046 __c
= vec_subs (__a
, __b
);
1047 return (__m64
) ((__vector
long long) __c
)[0];
1050 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1051 _m_psubusb (__m64 __m1
, __m64 __m2
)
1053 return _mm_subs_pu8 (__m1
, __m2
);
1056 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1057 unsigned saturating arithmetic. */
1058 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1059 _mm_subs_pu16 (__m64 __m1
, __m64 __m2
)
1061 __vector
unsigned short __a
, __b
, __c
;
1063 __a
= (__vector
unsigned short)vec_splats (__m1
);
1064 __b
= (__vector
unsigned short)vec_splats (__m2
);
1065 __c
= vec_subs (__a
, __b
);
1066 return (__m64
) ((__vector
long long) __c
)[0];
1069 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1070 _m_psubusw (__m64 __m1
, __m64 __m2
)
1072 return _mm_subs_pu16 (__m1
, __m2
);
1075 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1076 four 32-bit intermediate results, which are then summed by pairs to
1077 produce two 32-bit results. */
1078 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1079 _mm_madd_pi16 (__m64 __m1
, __m64 __m2
)
1081 __vector
signed short __a
, __b
;
1082 __vector
signed int __c
;
1083 __vector
signed int __zero
= {0, 0, 0, 0};
1085 __a
= (__vector
signed short)vec_splats (__m1
);
1086 __b
= (__vector
signed short)vec_splats (__m2
);
1087 __c
= vec_vmsumshm (__a
, __b
, __zero
);
1088 return (__m64
) ((__vector
long long) __c
)[0];
1091 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1092 _m_pmaddwd (__m64 __m1
, __m64 __m2
)
1094 return _mm_madd_pi16 (__m1
, __m2
);
1096 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1097 M2 and produce the high 16 bits of the 32-bit results. */
1098 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1099 _mm_mulhi_pi16 (__m64 __m1
, __m64 __m2
)
1101 __vector
signed short __a
, __b
;
1102 __vector
signed short __c
;
1103 __vector
signed int __w0
, __w1
;
1104 __vector
unsigned char __xform1
= {
1105 #ifdef __LITTLE_ENDIAN__
1106 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1107 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1109 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1110 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1114 __a
= (__vector
signed short)vec_splats (__m1
);
1115 __b
= (__vector
signed short)vec_splats (__m2
);
1117 __w0
= vec_vmulesh (__a
, __b
);
1118 __w1
= vec_vmulosh (__a
, __b
);
1119 __c
= (__vector
signed short)vec_perm (__w0
, __w1
, __xform1
);
1121 return (__m64
) ((__vector
long long) __c
)[0];
1124 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1125 _m_pmulhw (__m64 __m1
, __m64 __m2
)
1127 return _mm_mulhi_pi16 (__m1
, __m2
);
1130 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1131 the low 16 bits of the results. */
1132 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1133 _mm_mullo_pi16 (__m64 __m1
, __m64 __m2
)
1135 __vector
signed short __a
, __b
, __c
;
1137 __a
= (__vector
signed short)vec_splats (__m1
);
1138 __b
= (__vector
signed short)vec_splats (__m2
);
1140 return (__m64
) ((__vector
long long) __c
)[0];
1143 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1144 _m_pmullw (__m64 __m1
, __m64 __m2
)
1146 return _mm_mullo_pi16 (__m1
, __m2
);
1149 /* Shift four 16-bit values in M left by COUNT. */
1150 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1151 _mm_sll_pi16 (__m64 __m
, __m64 __count
)
1153 __vector
signed short __r
;
1154 __vector
unsigned short __c
;
1158 __r
= (__vector
signed short)vec_splats (__m
);
1159 __c
= (__vector
unsigned short)vec_splats ((unsigned short)__count
);
1160 __r
= vec_sl (__r
, (__vector
unsigned short)__c
);
1161 return (__m64
) ((__vector
long long) __r
)[0];
1167 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1168 _m_psllw (__m64 __m
, __m64 __count
)
1170 return _mm_sll_pi16 (__m
, __count
);
1173 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1174 _mm_slli_pi16 (__m64 __m
, int __count
)
1176 /* Promote int to long then invoke mm_sll_pi16. */
1177 return _mm_sll_pi16 (__m
, __count
);
1180 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1181 _m_psllwi (__m64 __m
, int __count
)
1183 return _mm_slli_pi16 (__m
, __count
);
1186 /* Shift two 32-bit values in M left by COUNT. */
1187 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1188 _mm_sll_pi32 (__m64 __m
, __m64 __count
)
1194 __res
.as_int
[0] = __res
.as_int
[0] << __count
;
1195 __res
.as_int
[1] = __res
.as_int
[1] << __count
;
1196 return (__res
.as_m64
);
1199 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1200 _m_pslld (__m64 __m
, __m64 __count
)
1202 return _mm_sll_pi32 (__m
, __count
);
1205 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1206 _mm_slli_pi32 (__m64 __m
, int __count
)
1208 /* Promote int to long then invoke mm_sll_pi32. */
1209 return _mm_sll_pi32 (__m
, __count
);
1212 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1213 _m_pslldi (__m64 __m
, int __count
)
1215 return _mm_slli_pi32 (__m
, __count
);
1218 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1219 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1220 _mm_sra_pi16 (__m64 __m
, __m64 __count
)
1222 __vector
signed short __r
;
1223 __vector
unsigned short __c
;
1227 __r
= (__vector
signed short)vec_splats (__m
);
1228 __c
= (__vector
unsigned short)vec_splats ((unsigned short)__count
);
1229 __r
= vec_sra (__r
, (__vector
unsigned short)__c
);
1230 return (__m64
) ((__vector
long long) __r
)[0];
1236 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1237 _m_psraw (__m64 __m
, __m64 __count
)
1239 return _mm_sra_pi16 (__m
, __count
);
1242 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1243 _mm_srai_pi16 (__m64 __m
, int __count
)
1245 /* Promote int to long then invoke mm_sra_pi32. */
1246 return _mm_sra_pi16 (__m
, __count
);
1249 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1250 _m_psrawi (__m64 __m
, int __count
)
1252 return _mm_srai_pi16 (__m
, __count
);
1255 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1256 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1257 _mm_sra_pi32 (__m64 __m
, __m64 __count
)
1263 __res
.as_int
[0] = __res
.as_int
[0] >> __count
;
1264 __res
.as_int
[1] = __res
.as_int
[1] >> __count
;
1265 return (__res
.as_m64
);
1268 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1269 _m_psrad (__m64 __m
, __m64 __count
)
1271 return _mm_sra_pi32 (__m
, __count
);
1274 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1275 _mm_srai_pi32 (__m64 __m
, int __count
)
1277 /* Promote int to long then invoke mm_sra_pi32. */
1278 return _mm_sra_pi32 (__m
, __count
);
1281 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1282 _m_psradi (__m64 __m
, int __count
)
1284 return _mm_srai_pi32 (__m
, __count
);
1287 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1288 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1289 _mm_srl_pi16 (__m64 __m
, __m64 __count
)
1291 __vector
unsigned short __r
;
1292 __vector
unsigned short __c
;
1296 __r
= (__vector
unsigned short)vec_splats (__m
);
1297 __c
= (__vector
unsigned short)vec_splats ((unsigned short)__count
);
1298 __r
= vec_sr (__r
, (__vector
unsigned short)__c
);
1299 return (__m64
) ((__vector
long long) __r
)[0];
1305 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1306 _m_psrlw (__m64 __m
, __m64 __count
)
1308 return _mm_srl_pi16 (__m
, __count
);
1311 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1312 _mm_srli_pi16 (__m64 __m
, int __count
)
1314 /* Promote int to long then invoke mm_sra_pi32. */
1315 return _mm_srl_pi16 (__m
, __count
);
1318 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1319 _m_psrlwi (__m64 __m
, int __count
)
1321 return _mm_srli_pi16 (__m
, __count
);
1324 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1325 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1326 _mm_srl_pi32 (__m64 __m
, __m64 __count
)
1332 __res
.as_int
[0] = (unsigned int)__res
.as_int
[0] >> __count
;
1333 __res
.as_int
[1] = (unsigned int)__res
.as_int
[1] >> __count
;
1334 return (__res
.as_m64
);
1337 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1338 _m_psrld (__m64 __m
, __m64 __count
)
1340 return _mm_srl_pi32 (__m
, __count
);
1343 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1344 _mm_srli_pi32 (__m64 __m
, int __count
)
1346 /* Promote int to long then invoke mm_srl_pi32. */
1347 return _mm_srl_pi32 (__m
, __count
);
1350 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1351 _m_psrldi (__m64 __m
, int __count
)
1353 return _mm_srli_pi32 (__m
, __count
);
1355 #endif /* _ARCH_PWR8 */
1357 /* Creates a vector of two 32-bit values; I0 is least significant. */
1358 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1359 _mm_set_pi32 (int __i1
, int __i0
)
1363 __res
.as_int
[0] = __i0
;
1364 __res
.as_int
[1] = __i1
;
1365 return (__res
.as_m64
);
1368 /* Creates a vector of four 16-bit values; W0 is least significant. */
1369 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1370 _mm_set_pi16 (short __w3
, short __w2
, short __w1
, short __w0
)
1374 __res
.as_short
[0] = __w0
;
1375 __res
.as_short
[1] = __w1
;
1376 __res
.as_short
[2] = __w2
;
1377 __res
.as_short
[3] = __w3
;
1378 return (__res
.as_m64
);
1381 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1382 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1383 _mm_set_pi8 (char __b7
, char __b6
, char __b5
, char __b4
,
1384 char __b3
, char __b2
, char __b1
, char __b0
)
1388 __res
.as_char
[0] = __b0
;
1389 __res
.as_char
[1] = __b1
;
1390 __res
.as_char
[2] = __b2
;
1391 __res
.as_char
[3] = __b3
;
1392 __res
.as_char
[4] = __b4
;
1393 __res
.as_char
[5] = __b5
;
1394 __res
.as_char
[6] = __b6
;
1395 __res
.as_char
[7] = __b7
;
1396 return (__res
.as_m64
);
1399 /* Similar, but with the arguments in reverse order. */
1400 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1401 _mm_setr_pi32 (int __i0
, int __i1
)
1405 __res
.as_int
[0] = __i0
;
1406 __res
.as_int
[1] = __i1
;
1407 return (__res
.as_m64
);
1410 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1411 _mm_setr_pi16 (short __w0
, short __w1
, short __w2
, short __w3
)
1413 return _mm_set_pi16 (__w3
, __w2
, __w1
, __w0
);
1416 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1417 _mm_setr_pi8 (char __b0
, char __b1
, char __b2
, char __b3
,
1418 char __b4
, char __b5
, char __b6
, char __b7
)
1420 return _mm_set_pi8 (__b7
, __b6
, __b5
, __b4
, __b3
, __b2
, __b1
, __b0
);
1423 /* Creates a vector of two 32-bit values, both elements containing I. */
1424 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1425 _mm_set1_pi32 (int __i
)
1429 __res
.as_int
[0] = __i
;
1430 __res
.as_int
[1] = __i
;
1431 return (__res
.as_m64
);
1434 /* Creates a vector of four 16-bit values, all elements containing W. */
1435 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1436 _mm_set1_pi16 (short __w
)
1439 __vector
signed short w
;
1441 w
= (__vector
signed short)vec_splats (__w
);
1442 return (__m64
) ((__vector
long long) w
)[0];
1446 __res
.as_short
[0] = __w
;
1447 __res
.as_short
[1] = __w
;
1448 __res
.as_short
[2] = __w
;
1449 __res
.as_short
[3] = __w
;
1450 return (__res
.as_m64
);
1454 /* Creates a vector of eight 8-bit values, all elements containing B. */
1455 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1456 _mm_set1_pi8 (signed char __b
)
1459 __vector
signed char __res
;
1461 __res
= (__vector
signed char)vec_splats (__b
);
1462 return (__m64
) ((__vector
long long) __res
)[0];
1466 __res
.as_char
[0] = __b
;
1467 __res
.as_char
[1] = __b
;
1468 __res
.as_char
[2] = __b
;
1469 __res
.as_char
[3] = __b
;
1470 __res
.as_char
[4] = __b
;
1471 __res
.as_char
[5] = __b
;
1472 __res
.as_char
[6] = __b
;
1473 __res
.as_char
[7] = __b
;
1474 return (__res
.as_m64
);
1477 #endif /* _MMINTRIN_H_INCLUDED */