1 /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
37 target does not support a native __vector_size__ (8) type. Instead
38 we typedef __m64 to a 64-bit unsigned long long, which is natively
39 supported in 64-bit mode. This works well for the _si64 and some
40 _pi32 operations, but starts to generate long sequences for _pi16
41 and _pi8 operations. For those cases it is better (faster and
42 smaller code) to transfer __m64 data to the PowerPC vector 128-bit
43 unit, perform the operation, and then transfer the result back to
44 the __m64 type. This relies on the direct register move
45 instructions, introduced with power8, for efficient
46 implementation of these transfers.
47
48 Most MMX intrinsic operations can be performed efficiently as
49 C language 64-bit scalar operations or optimized to use the newer
50 128-bit SSE/Altivec operations. We recommend this for new
51 applications. */
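/* As a minimal illustrative sketch (not part of this header), the
   recommended rewrite in GNU C vector extensions looks like the
   following; the type name v16qi and the helper add_bytes are
   hypothetical names used only for this example:

     typedef unsigned char v16qi __attribute__ ((__vector_size__ (16)));

     static inline v16qi
     add_bytes (v16qi __a, v16qi __b)
     {
       return __a + __b;   // GCC lowers this to a single vector add
                           // (e.g. vaddubm) when VMX/VSX is enabled.
     }

   Written this way, the compiler chooses the best instructions for each
   target and the __m64 transfers described above are not needed.  */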
52 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
53 #endif
54
55 #ifndef _MMINTRIN_H_INCLUDED
56 #define _MMINTRIN_H_INCLUDED
57
58 #include <altivec.h>
59 /* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61 typedef __attribute__ ((__aligned__ (8),
62 __may_alias__)) unsigned long long __m64;
63
64 typedef __attribute__ ((__aligned__ (8)))
65 union
66 {
67 __m64 as_m64;
68 char as_char[8];
69 signed char as_signed_char [8];
70 short as_short[4];
71 int as_int[2];
72 long long as_long_long;
73 float as_float[2];
74 double as_double;
75 } __m64_union;
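
/* Illustrative only: the pattern used throughout this file is to copy an
   __m64 value through __m64_union and then read or write individual
   lanes, for example:

     __m64_union __u;
     __u.as_m64 = __m;                 // __m is some __m64 value
     short __lane0 = __u.as_short[0];  // first 16-bit lane

   __u, __m and __lane0 are hypothetical names used only for illustration.  */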
76
77 /* Empty the multimedia state. */
78 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79 _mm_empty (void)
80 {
81 /* nothing to do on PowerPC. */
82 }
83
84 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _m_empty (void)
86 {
87 /* nothing to do on PowerPC. */
88 }
89
90 /* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
91 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
92 _mm_cvtsi32_si64 (int __i)
93 {
94 return (__m64) (unsigned int) __i;
95 }
96
97 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _m_from_int (int __i)
99 {
100 return _mm_cvtsi32_si64 (__i);
101 }
102
103 /* Convert the lower 32 bits of the __m64 object into an integer. */
104 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_cvtsi64_si32 (__m64 __i)
106 {
107 return ((int) __i);
108 }
109
110 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
111 _m_to_int (__m64 __i)
112 {
113 return _mm_cvtsi64_si32 (__i);
114 }
115
116 /* Convert I to a __m64 object. */
117
118 /* Intel intrinsic. */
119 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _m_from_int64 (long long __i)
121 {
122 return (__m64) __i;
123 }
124
125 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_cvtsi64_m64 (long long __i)
127 {
128 return (__m64) __i;
129 }
130
131 /* Microsoft intrinsic. */
132 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_cvtsi64x_si64 (long long __i)
134 {
135 return (__m64) __i;
136 }
137
138 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_set_pi64x (long long __i)
140 {
141 return (__m64) __i;
142 }
143
144 /* Convert the __m64 object to a 64-bit integer. */
145
146 /* Intel intrinsic. */
147 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _m_to_int64 (__m64 __i)
149 {
150 return (long long)__i;
151 }
152
153 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154 _mm_cvtm64_si64 (__m64 __i)
155 {
156 return (long long) __i;
157 }
158
159 /* Microsoft intrinsic. */
160 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_cvtsi64_si64x (__m64 __i)
162 {
163 return (long long) __i;
164 }
165
166 #ifdef _ARCH_PWR8
167 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
168 the result, and the four 16-bit values from M2 into the upper four 8-bit
169 values of the result, all with signed saturation. */
170 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
171 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
172 {
173 __vector signed short __vm1;
174 __vector signed char __vresult;
175
176 __vm1 = (__vector signed short) (__vector unsigned long long)
177 #ifdef __LITTLE_ENDIAN__
178 { __m1, __m2 };
179 #else
180 { __m2, __m1 };
181 #endif
182 __vresult = vec_packs (__vm1, __vm1);
183 return (__m64) ((__vector long long) __vresult)[0];
184 }
185
186 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
187 _m_packsswb (__m64 __m1, __m64 __m2)
188 {
189 return _mm_packs_pi16 (__m1, __m2);
190 }
191
192 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
193 the result, and the two 32-bit values from M2 into the upper two 16-bit
194 values of the result, all with signed saturation. */
195 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
197 {
198 __vector signed int __vm1;
199 __vector signed short __vresult;
200
201 __vm1 = (__vector signed int) (__vector unsigned long long)
202 #ifdef __LITTLE_ENDIAN__
203 { __m1, __m2 };
204 #else
205 { __m2, __m1 };
206 #endif
207 __vresult = vec_packs (__vm1, __vm1);
208 return (__m64) ((__vector long long) __vresult)[0];
209 }
210
211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _m_packssdw (__m64 __m1, __m64 __m2)
213 {
214 return _mm_packs_pi32 (__m1, __m2);
215 }
216
217 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
218 the result, and the four 16-bit values from M2 into the upper four 8-bit
219 values of the result, all with unsigned saturation. */
220 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
222 {
223 __vector unsigned char __r;
224 __vector signed short __vm1 = (__vector signed short) (__vector long long)
225 #ifdef __LITTLE_ENDIAN__
226 { __m1, __m2 };
227 #else
228 { __m2, __m1 };
229 #endif
230 const __vector signed short __zero = { 0 };
231 __vector __bool short __select = vec_cmplt (__vm1, __zero);
232 __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
233 __vector __bool char __packsel = vec_pack (__select, __select);
234 __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
235 return (__m64) ((__vector long long) __r)[0];
236 }
237
238 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
239 _m_packuswb (__m64 __m1, __m64 __m2)
240 {
241 return _mm_packs_pu16 (__m1, __m2);
242 }
243 #endif /* end ARCH_PWR8 */
244
245 /* Interleave the four 8-bit values from the high half of M1 with the four
246 8-bit values from the high half of M2. */
247 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
249 {
250 #if _ARCH_PWR8
251 __vector unsigned char __a, __b, __c;
252
253 __a = (__vector unsigned char)vec_splats (__m1);
254 __b = (__vector unsigned char)vec_splats (__m2);
255 __c = vec_mergel (__a, __b);
256 return (__m64) ((__vector long long) __c)[1];
257 #else
258 __m64_union __mu1, __mu2, __res;
259
260 __mu1.as_m64 = __m1;
261 __mu2.as_m64 = __m2;
262
263 __res.as_char[0] = __mu1.as_char[4];
264 __res.as_char[1] = __mu2.as_char[4];
265 __res.as_char[2] = __mu1.as_char[5];
266 __res.as_char[3] = __mu2.as_char[5];
267 __res.as_char[4] = __mu1.as_char[6];
268 __res.as_char[5] = __mu2.as_char[6];
269 __res.as_char[6] = __mu1.as_char[7];
270 __res.as_char[7] = __mu2.as_char[7];
271
272 return (__m64) __res.as_m64;
273 #endif
274 }
275
276 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277 _m_punpckhbw (__m64 __m1, __m64 __m2)
278 {
279 return _mm_unpackhi_pi8 (__m1, __m2);
280 }
281
282 /* Interleave the two 16-bit values from the high half of M1 with the two
283 16-bit values from the high half of M2. */
284 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
286 {
287 __m64_union __mu1, __mu2, __res;
288
289 __mu1.as_m64 = __m1;
290 __mu2.as_m64 = __m2;
291
292 __res.as_short[0] = __mu1.as_short[2];
293 __res.as_short[1] = __mu2.as_short[2];
294 __res.as_short[2] = __mu1.as_short[3];
295 __res.as_short[3] = __mu2.as_short[3];
296
297 return (__m64) __res.as_m64;
298 }
299
300 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
301 _m_punpckhwd (__m64 __m1, __m64 __m2)
302 {
303 return _mm_unpackhi_pi16 (__m1, __m2);
304 }
305 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
306 value from the high half of M2. */
307 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
308 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
309 {
310 __m64_union __mu1, __mu2, __res;
311
312 __mu1.as_m64 = __m1;
313 __mu2.as_m64 = __m2;
314
315 __res.as_int[0] = __mu1.as_int[1];
316 __res.as_int[1] = __mu2.as_int[1];
317
318 return (__m64) __res.as_m64;
319 }
320
321 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
322 _m_punpckhdq (__m64 __m1, __m64 __m2)
323 {
324 return _mm_unpackhi_pi32 (__m1, __m2);
325 }
326 /* Interleave the four 8-bit values from the low half of M1 with the four
327 8-bit values from the low half of M2. */
328 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
330 {
331 #if _ARCH_PWR8
332 __vector unsigned char __a, __b, __c;
333
334 __a = (__vector unsigned char)vec_splats (__m1);
335 __b = (__vector unsigned char)vec_splats (__m2);
336 __c = vec_mergel (__a, __b);
337 return (__m64) ((__vector long long) __c)[0];
338 #else
339 __m64_union __mu1, __mu2, __res;
340
341 __mu1.as_m64 = __m1;
342 __mu2.as_m64 = __m2;
343
344 __res.as_char[0] = __mu1.as_char[0];
345 __res.as_char[1] = __mu2.as_char[0];
346 __res.as_char[2] = __mu1.as_char[1];
347 __res.as_char[3] = __mu2.as_char[1];
348 __res.as_char[4] = __mu1.as_char[2];
349 __res.as_char[5] = __mu2.as_char[2];
350 __res.as_char[6] = __mu1.as_char[3];
351 __res.as_char[7] = __mu2.as_char[3];
352
353 return (__m64) __res.as_m64;
354 #endif
355 }
356
357 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
358 _m_punpcklbw (__m64 __m1, __m64 __m2)
359 {
360 return _mm_unpacklo_pi8 (__m1, __m2);
361 }
362 /* Interleave the two 16-bit values from the low half of M1 with the two
363 16-bit values from the low half of M2. */
364 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
366 {
367 __m64_union __mu1, __mu2, __res;
368
369 __mu1.as_m64 = __m1;
370 __mu2.as_m64 = __m2;
371
372 __res.as_short[0] = __mu1.as_short[0];
373 __res.as_short[1] = __mu2.as_short[0];
374 __res.as_short[2] = __mu1.as_short[1];
375 __res.as_short[3] = __mu2.as_short[1];
376
377 return (__m64) __res.as_m64;
378 }
379
380 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381 _m_punpcklwd (__m64 __m1, __m64 __m2)
382 {
383 return _mm_unpacklo_pi16 (__m1, __m2);
384 }
385
386 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
387 value from the low half of M2. */
388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
390 {
391 __m64_union __mu1, __mu2, __res;
392
393 __mu1.as_m64 = __m1;
394 __mu2.as_m64 = __m2;
395
396 __res.as_int[0] = __mu1.as_int[0];
397 __res.as_int[1] = __mu2.as_int[0];
398
399 return (__m64) __res.as_m64;
400 }
401
402 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403 _m_punpckldq (__m64 __m1, __m64 __m2)
404 {
405 return _mm_unpacklo_pi32 (__m1, __m2);
406 }
407
408 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
409 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410 _mm_add_pi8 (__m64 __m1, __m64 __m2)
411 {
412 #if _ARCH_PWR8
413 __vector signed char __a, __b, __c;
414
415 __a = (__vector signed char)vec_splats (__m1);
416 __b = (__vector signed char)vec_splats (__m2);
417 __c = vec_add (__a, __b);
418 return (__m64) ((__vector long long) __c)[0];
419 #else
420 __m64_union __mu1, __mu2, __res;
421
422 __mu1.as_m64 = __m1;
423 __mu2.as_m64 = __m2;
424
425 __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
426 __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
427 __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
428 __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
429 __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
430 __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
431 __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
432 __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
433
434 return (__m64) __res.as_m64;
435 #endif
436 }
437
438 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
439 _m_paddb (__m64 __m1, __m64 __m2)
440 {
441 return _mm_add_pi8 (__m1, __m2);
442 }
443
444 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
445 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446 _mm_add_pi16 (__m64 __m1, __m64 __m2)
447 {
448 #if _ARCH_PWR8
449 __vector signed short __a, __b, __c;
450
451 __a = (__vector signed short)vec_splats (__m1);
452 __b = (__vector signed short)vec_splats (__m2);
453 __c = vec_add (__a, __b);
454 return (__m64) ((__vector long long) __c)[0];
455 #else
456 __m64_union __mu1, __mu2, __res;
457
458 __mu1.as_m64 = __m1;
459 __mu2.as_m64 = __m2;
460
461 __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
462 __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
463 __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
464 __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
465
466 return (__m64) __res.as_m64;
467 #endif
468 }
469
470 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _m_paddw (__m64 __m1, __m64 __m2)
472 {
473 return _mm_add_pi16 (__m1, __m2);
474 }
475
476 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
477 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478 _mm_add_pi32 (__m64 __m1, __m64 __m2)
479 {
480 #if _ARCH_PWR9
481 __vector signed int __a, __b, __c;
482
483 __a = (__vector signed int)vec_splats (__m1);
484 __b = (__vector signed int)vec_splats (__m2);
485 __c = vec_add (__a, __b);
486 return (__m64) ((__vector long long) __c)[0];
487 #else
488 __m64_union __mu1, __mu2, __res;
489
490 __mu1.as_m64 = __m1;
491 __mu2.as_m64 = __m2;
492
493 __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
494 __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
495
496 return (__m64) __res.as_m64;
497 #endif
498 }
499
500 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _m_paddd (__m64 __m1, __m64 __m2)
502 {
503 return _mm_add_pi32 (__m1, __m2);
504 }
505
506 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
507 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
509 {
510 #if _ARCH_PWR8
511 __vector signed char __a, __b, __c;
512
513 __a = (__vector signed char)vec_splats (__m1);
514 __b = (__vector signed char)vec_splats (__m2);
515 __c = vec_sub (__a, __b);
516 return (__m64) ((__vector long long) __c)[0];
517 #else
518 __m64_union __mu1, __mu2, __res;
519
520 __mu1.as_m64 = __m1;
521 __mu2.as_m64 = __m2;
522
523 __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
524 __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
525 __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
526 __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
527 __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
528 __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
529 __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
530 __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
531
532 return (__m64) __res.as_m64;
533 #endif
534 }
535
536 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537 _m_psubb (__m64 __m1, __m64 __m2)
538 {
539 return _mm_sub_pi8 (__m1, __m2);
540 }
541
542 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
543 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
545 {
546 #if _ARCH_PWR8
547 __vector signed short __a, __b, __c;
548
549 __a = (__vector signed short)vec_splats (__m1);
550 __b = (__vector signed short)vec_splats (__m2);
551 __c = vec_sub (__a, __b);
552 return (__m64) ((__vector long long) __c)[0];
553 #else
554 __m64_union __mu1, __mu2, __res;
555
556 __mu1.as_m64 = __m1;
557 __mu2.as_m64 = __m2;
558
559 __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
560 __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
561 __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
562 __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
563
564 return (__m64) __res.as_m64;
565 #endif
566 }
567
568 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
569 _m_psubw (__m64 __m1, __m64 __m2)
570 {
571 return _mm_sub_pi16 (__m1, __m2);
572 }
573
574 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
575 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
576 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
577 {
578 #if _ARCH_PWR9
579 __vector signed int __a, __b, __c;
580
581 __a = (__vector signed int)vec_splats (__m1);
582 __b = (__vector signed int)vec_splats (__m2);
583 __c = vec_sub (__a, __b);
584 return (__m64) ((__vector long long) __c)[0];
585 #else
586 __m64_union __mu1, __mu2, __res;
587
588 __mu1.as_m64 = __m1;
589 __mu2.as_m64 = __m2;
590
591 __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
592 __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
593
594 return (__m64) __res.as_m64;
595 #endif
596 }
597
598 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _m_psubd (__m64 __m1, __m64 __m2)
600 {
601 return _mm_sub_pi32 (__m1, __m2);
602 }
603
604 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 _mm_add_si64 (__m64 __m1, __m64 __m2)
606 {
607 return (__m1 + __m2);
608 }
609
610 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 _mm_sub_si64 (__m64 __m1, __m64 __m2)
612 {
613 return (__m1 - __m2);
614 }
615
616 /* Shift the 64-bit value in M left by COUNT. */
617 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618 _mm_sll_si64 (__m64 __m, __m64 __count)
619 {
620 return (__m << __count);
621 }
622
623 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _m_psllq (__m64 __m, __m64 __count)
625 {
626 return _mm_sll_si64 (__m, __count);
627 }
628
629 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
630 _mm_slli_si64 (__m64 __m, const int __count)
631 {
632 return (__m << __count);
633 }
634
635 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
636 _m_psllqi (__m64 __m, const int __count)
637 {
638 return _mm_slli_si64 (__m, __count);
639 }
640
641 /* Shift the 64-bit value in M right by COUNT; shift in zeros. */
642 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_srl_si64 (__m64 __m, __m64 __count)
644 {
645 return (__m >> __count);
646 }
647
648 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _m_psrlq (__m64 __m, __m64 __count)
650 {
651 return _mm_srl_si64 (__m, __count);
652 }
653
654 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655 _mm_srli_si64 (__m64 __m, const int __count)
656 {
657 return (__m >> __count);
658 }
659
660 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _m_psrlqi (__m64 __m, const int __count)
662 {
663 return _mm_srli_si64 (__m, __count);
664 }
665
666 /* Bit-wise AND the 64-bit values in M1 and M2. */
667 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _mm_and_si64 (__m64 __m1, __m64 __m2)
669 {
670 return (__m1 & __m2);
671 }
672
673 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674 _m_pand (__m64 __m1, __m64 __m2)
675 {
676 return _mm_and_si64 (__m1, __m2);
677 }
678
679 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
680 64-bit value in M2. */
681 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
683 {
684 return (~__m1 & __m2);
685 }
686
687 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _m_pandn (__m64 __m1, __m64 __m2)
689 {
690 return _mm_andnot_si64 (__m1, __m2);
691 }
692
693 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
694 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_or_si64 (__m64 __m1, __m64 __m2)
696 {
697 return (__m1 | __m2);
698 }
699
700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _m_por (__m64 __m1, __m64 __m2)
702 {
703 return _mm_or_si64 (__m1, __m2);
704 }
705
706 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
707 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
708 _mm_xor_si64 (__m64 __m1, __m64 __m2)
709 {
710 return (__m1 ^ __m2);
711 }
712
713 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
714 _m_pxor (__m64 __m1, __m64 __m2)
715 {
716 return _mm_xor_si64 (__m1, __m2);
717 }
718
719 /* Creates a 64-bit zero. */
720 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721 _mm_setzero_si64 (void)
722 {
723 return (__m64) 0;
724 }
725
726 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
727 test is true and zero if false. */
728 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
730 {
731 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
732 __m64 __res;
733 __asm__(
734 "cmpb %0,%1,%2;\n"
735 : "=r" (__res)
736 : "r" (__m1),
737 "r" (__m2)
738 : );
739 return (__res);
740 #else
741 __m64_union __mu1, __mu2, __res;
742
743 __mu1.as_m64 = __m1;
744 __mu2.as_m64 = __m2;
745
746 __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
747 __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
748 __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
749 __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
750 __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
751 __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
752 __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
753 __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;
754
755 return (__m64) __res.as_m64;
756 #endif
757 }
758
759 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760 _m_pcmpeqb (__m64 __m1, __m64 __m2)
761 {
762 return _mm_cmpeq_pi8 (__m1, __m2);
763 }
764
765 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
767 {
768 #if _ARCH_PWR8
769 __vector signed char __a, __b, __c;
770
771 __a = (__vector signed char)vec_splats (__m1);
772 __b = (__vector signed char)vec_splats (__m2);
773 __c = (__vector signed char)vec_cmpgt (__a, __b);
774 return (__m64) ((__vector long long) __c)[0];
775 #else
776 __m64_union __mu1, __mu2, __res;
777
778 __mu1.as_m64 = __m1;
779 __mu2.as_m64 = __m2;
780
781 __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
782 __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
783 __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
784 __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
785 __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
786 __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
787 __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
788 __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;
789
790 return (__m64) __res.as_m64;
791 #endif
792 }
793
794 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795 _m_pcmpgtb (__m64 __m1, __m64 __m2)
796 {
797 return _mm_cmpgt_pi8 (__m1, __m2);
798 }
799
800 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
801 the test is true and zero if false. */
802 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
804 {
805 #if _ARCH_PWR8
806 __vector signed short __a, __b, __c;
807
808 __a = (__vector signed short)vec_splats (__m1);
809 __b = (__vector signed short)vec_splats (__m2);
810 __c = (__vector signed short)vec_cmpeq (__a, __b);
811 return (__m64) ((__vector long long) __c)[0];
812 #else
813 __m64_union __mu1, __mu2, __res;
814
815 __mu1.as_m64 = __m1;
816 __mu2.as_m64 = __m2;
817
818 __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
819 __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
820 __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
821 __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;
822
823 return (__m64) __res.as_m64;
824 #endif
825 }
826
827 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _m_pcmpeqw (__m64 __m1, __m64 __m2)
829 {
830 return _mm_cmpeq_pi16 (__m1, __m2);
831 }
832
833 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
835 {
836 #if _ARCH_PWR8
837 __vector signed short __a, __b, __c;
838
839 __a = (__vector signed short)vec_splats (__m1);
840 __b = (__vector signed short)vec_splats (__m2);
841 __c = (__vector signed short)vec_cmpgt (__a, __b);
842 return (__m64) ((__vector long long) __c)[0];
843 #else
844 __m64_union __mu1, __mu2, __res;
845
846 __mu1.as_m64 = __m1;
847 __mu2.as_m64 = __m2;
848
849 __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
850 __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
851 __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
852 __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;
853
854 return (__m64) __res.as_m64;
855 #endif
856 }
857
858 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _m_pcmpgtw (__m64 __m1, __m64 __m2)
860 {
861 return _mm_cmpgt_pi16 (__m1, __m2);
862 }
863
864 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
865 the test is true and zero if false. */
866 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
868 {
869 #if _ARCH_PWR9
870 __vector signed int __a, __b, __c;
871
872 __a = (__vector signed int)vec_splats (__m1);
873 __b = (__vector signed int)vec_splats (__m2);
874 __c = (__vector signed int)vec_cmpeq (__a, __b);
875 return (__m64) ((__vector long long) __c)[0];
876 #else
877 __m64_union __mu1, __mu2, __res;
878
879 __mu1.as_m64 = __m1;
880 __mu2.as_m64 = __m2;
881
882 __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
883 __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;
884
885 return (__m64) __res.as_m64;
886 #endif
887 }
888
889 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
890 _m_pcmpeqd (__m64 __m1, __m64 __m2)
891 {
892 return _mm_cmpeq_pi32 (__m1, __m2);
893 }
894
895 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
897 {
898 #if _ARCH_PWR9
899 __vector signed int __a, __b, __c;
900
901 __a = (__vector signed int)vec_splats (__m1);
902 __b = (__vector signed int)vec_splats (__m2);
903 __c = (__vector signed int)vec_cmpgt (__a, __b);
904 return (__m64) ((__vector long long) __c)[0];
905 #else
906 __m64_union __mu1, __mu2, __res;
907
908 __mu1.as_m64 = __m1;
909 __mu2.as_m64 = __m2;
910
911 __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
912 __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;
913
914 return (__m64) __res.as_m64;
915 #endif
916 }
917
918 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
919 _m_pcmpgtd (__m64 __m1, __m64 __m2)
920 {
921 return _mm_cmpgt_pi32 (__m1, __m2);
922 }
923
924 #if _ARCH_PWR8
925 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
926 saturated arithmetic. */
927 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
928 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
929 {
930 __vector signed char __a, __b, __c;
931
932 __a = (__vector signed char)vec_splats (__m1);
933 __b = (__vector signed char)vec_splats (__m2);
934 __c = vec_adds (__a, __b);
935 return (__m64) ((__vector long long) __c)[0];
936 }
937
938 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _m_paddsb (__m64 __m1, __m64 __m2)
940 {
941 return _mm_adds_pi8 (__m1, __m2);
942 }
943 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
944 saturated arithmetic. */
945 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
946 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
947 {
948 __vector signed short __a, __b, __c;
949
950 __a = (__vector signed short)vec_splats (__m1);
951 __b = (__vector signed short)vec_splats (__m2);
952 __c = vec_adds (__a, __b);
953 return (__m64) ((__vector long long) __c)[0];
954 }
955
956 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
957 _m_paddsw (__m64 __m1, __m64 __m2)
958 {
959 return _mm_adds_pi16 (__m1, __m2);
960 }
961 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
962 saturated arithmetic. */
963 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
964 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
965 {
966 __vector unsigned char __a, __b, __c;
967
968 __a = (__vector unsigned char)vec_splats (__m1);
969 __b = (__vector unsigned char)vec_splats (__m2);
970 __c = vec_adds (__a, __b);
971 return (__m64) ((__vector long long) __c)[0];
972 }
973
974 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 _m_paddusb (__m64 __m1, __m64 __m2)
976 {
977 return _mm_adds_pu8 (__m1, __m2);
978 }
979
980 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
981 saturated arithmetic. */
982 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
984 {
985 __vector unsigned short __a, __b, __c;
986
987 __a = (__vector unsigned short)vec_splats (__m1);
988 __b = (__vector unsigned short)vec_splats (__m2);
989 __c = vec_adds (__a, __b);
990 return (__m64) ((__vector long long) __c)[0];
991 }
992
993 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
994 _m_paddusw (__m64 __m1, __m64 __m2)
995 {
996 return _mm_adds_pu16 (__m1, __m2);
997 }
998
999 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
1000 saturating arithmetic. */
1001 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
1003 {
1004 __vector signed char __a, __b, __c;
1005
1006 __a = (__vector signed char)vec_splats (__m1);
1007 __b = (__vector signed char)vec_splats (__m2);
1008 __c = vec_subs (__a, __b);
1009 return (__m64) ((__vector long long) __c)[0];
1010 }
1011
1012 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013 _m_psubsb (__m64 __m1, __m64 __m2)
1014 {
1015 return _mm_subs_pi8 (__m1, __m2);
1016 }
1017
1018 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1019 signed saturating arithmetic. */
1020 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1021 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
1022 {
1023 __vector signed short __a, __b, __c;
1024
1025 __a = (__vector signed short)vec_splats (__m1);
1026 __b = (__vector signed short)vec_splats (__m2);
1027 __c = vec_subs (__a, __b);
1028 return (__m64) ((__vector long long) __c)[0];
1029 }
1030
1031 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032 _m_psubsw (__m64 __m1, __m64 __m2)
1033 {
1034 return _mm_subs_pi16 (__m1, __m2);
1035 }
1036
1037 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1038 unsigned saturating arithmetic. */
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
1041 {
1042 __vector unsigned char __a, __b, __c;
1043
1044 __a = (__vector unsigned char)vec_splats (__m1);
1045 __b = (__vector unsigned char)vec_splats (__m2);
1046 __c = vec_subs (__a, __b);
1047 return (__m64) ((__vector long long) __c)[0];
1048 }
1049
1050 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _m_psubusb (__m64 __m1, __m64 __m2)
1052 {
1053 return _mm_subs_pu8 (__m1, __m2);
1054 }
1055
1056 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1057 unsigned saturating arithmetic. */
1058 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
1060 {
1061 __vector unsigned short __a, __b, __c;
1062
1063 __a = (__vector unsigned short)vec_splats (__m1);
1064 __b = (__vector unsigned short)vec_splats (__m2);
1065 __c = vec_subs (__a, __b);
1066 return (__m64) ((__vector long long) __c)[0];
1067 }
1068
1069 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070 _m_psubusw (__m64 __m1, __m64 __m2)
1071 {
1072 return _mm_subs_pu16 (__m1, __m2);
1073 }
1074
1075 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1076 four 32-bit intermediate results, which are then summed by pairs to
1077 produce two 32-bit results. */
1078 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1079 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
1080 {
1081 __vector signed short __a, __b;
1082 __vector signed int __c;
1083 __vector signed int __zero = {0, 0, 0, 0};
1084
1085 __a = (__vector signed short)vec_splats (__m1);
1086 __b = (__vector signed short)vec_splats (__m2);
1087 __c = vec_vmsumshm (__a, __b, __zero);
1088 return (__m64) ((__vector long long) __c)[0];
1089 }
1090
1091 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1092 _m_pmaddwd (__m64 __m1, __m64 __m2)
1093 {
1094 return _mm_madd_pi16 (__m1, __m2);
1095 }
1096 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1097 M2 and produce the high 16 bits of the 32-bit results. */
1098 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1099 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
1100 {
1101 __vector signed short __a, __b;
1102 __vector signed short __c;
1103 __vector signed int __w0, __w1;
1104 __vector unsigned char __xform1 = {
1105 #ifdef __LITTLE_ENDIAN__
1106 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1107 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1108 #else
1109 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1110 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1111 #endif
1112 };
1113
1114 __a = (__vector signed short)vec_splats (__m1);
1115 __b = (__vector signed short)vec_splats (__m2);
1116
1117 __w0 = vec_vmulesh (__a, __b);
1118 __w1 = vec_vmulosh (__a, __b);
1119 __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);
1120
1121 return (__m64) ((__vector long long) __c)[0];
1122 }
1123
1124 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125 _m_pmulhw (__m64 __m1, __m64 __m2)
1126 {
1127 return _mm_mulhi_pi16 (__m1, __m2);
1128 }
1129
1130 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1131 the low 16 bits of the results. */
1132 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
1134 {
1135 __vector signed short __a, __b, __c;
1136
1137 __a = (__vector signed short)vec_splats (__m1);
1138 __b = (__vector signed short)vec_splats (__m2);
1139 __c = __a * __b;
1140 return (__m64) ((__vector long long) __c)[0];
1141 }
1142
1143 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1144 _m_pmullw (__m64 __m1, __m64 __m2)
1145 {
1146 return _mm_mullo_pi16 (__m1, __m2);
1147 }
1148
1149 /* Shift four 16-bit values in M left by COUNT. */
1150 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 _mm_sll_pi16 (__m64 __m, __m64 __count)
1152 {
1153 __vector signed short __r;
1154 __vector unsigned short __c;
1155
1156 if (__count <= 15)
1157 {
1158 __r = (__vector signed short)vec_splats (__m);
1159 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1160 __r = vec_sl (__r, (__vector unsigned short)__c);
1161 return (__m64) ((__vector long long) __r)[0];
1162 }
1163 else
1164 return (0);
1165 }
1166
1167 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _m_psllw (__m64 __m, __m64 __count)
1169 {
1170 return _mm_sll_pi16 (__m, __count);
1171 }
1172
1173 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174 _mm_slli_pi16 (__m64 __m, int __count)
1175 {
1176 /* Promote int to long then invoke mm_sll_pi16. */
1177 return _mm_sll_pi16 (__m, __count);
1178 }
1179
1180 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181 _m_psllwi (__m64 __m, int __count)
1182 {
1183 return _mm_slli_pi16 (__m, __count);
1184 }
1185
1186 /* Shift two 32-bit values in M left by COUNT. */
1187 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_sll_pi32 (__m64 __m, __m64 __count)
1189 {
1190 __m64_union __res;
1191
1192 __res.as_m64 = __m;
1193
1194 __res.as_int[0] = __res.as_int[0] << __count;
1195 __res.as_int[1] = __res.as_int[1] << __count;
1196 return (__res.as_m64);
1197 }
1198
1199 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _m_pslld (__m64 __m, __m64 __count)
1201 {
1202 return _mm_sll_pi32 (__m, __count);
1203 }
1204
1205 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_slli_pi32 (__m64 __m, int __count)
1207 {
1208 /* Promote int to long then invoke mm_sll_pi32. */
1209 return _mm_sll_pi32 (__m, __count);
1210 }
1211
1212 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1213 _m_pslldi (__m64 __m, int __count)
1214 {
1215 return _mm_slli_pi32 (__m, __count);
1216 }
1217
1218 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1219 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220 _mm_sra_pi16 (__m64 __m, __m64 __count)
1221 {
1222 __vector signed short __r;
1223 __vector unsigned short __c;
1224
1225 if (__count <= 15)
1226 {
1227 __r = (__vector signed short)vec_splats (__m);
1228 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1229 __r = vec_sra (__r, (__vector unsigned short)__c);
1230 return (__m64) ((__vector long long) __r)[0];
1231 }
1232 else
1233 return (0);
1234 }
1235
1236 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237 _m_psraw (__m64 __m, __m64 __count)
1238 {
1239 return _mm_sra_pi16 (__m, __count);
1240 }
1241
1242 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243 _mm_srai_pi16 (__m64 __m, int __count)
1244 {
1245 /* Promote int to long then invoke mm_sra_pi16. */
1246 return _mm_sra_pi16 (__m, __count);
1247 }
1248
1249 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250 _m_psrawi (__m64 __m, int __count)
1251 {
1252 return _mm_srai_pi16 (__m, __count);
1253 }
1254
1255 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1256 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_sra_pi32 (__m64 __m, __m64 __count)
1258 {
1259 __m64_union __res;
1260
1261 __res.as_m64 = __m;
1262
1263 __res.as_int[0] = __res.as_int[0] >> __count;
1264 __res.as_int[1] = __res.as_int[1] >> __count;
1265 return (__res.as_m64);
1266 }
1267
1268 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _m_psrad (__m64 __m, __m64 __count)
1270 {
1271 return _mm_sra_pi32 (__m, __count);
1272 }
1273
1274 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_srai_pi32 (__m64 __m, int __count)
1276 {
1277 /* Promote int to long then invoke mm_sra_pi32. */
1278 return _mm_sra_pi32 (__m, __count);
1279 }
1280
1281 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282 _m_psradi (__m64 __m, int __count)
1283 {
1284 return _mm_srai_pi32 (__m, __count);
1285 }
1286
1287 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1288 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_srl_pi16 (__m64 __m, __m64 __count)
1290 {
1291 __vector unsigned short __r;
1292 __vector unsigned short __c;
1293
1294 if (__count <= 15)
1295 {
1296 __r = (__vector unsigned short)vec_splats (__m);
1297 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1298 __r = vec_sr (__r, (__vector unsigned short)__c);
1299 return (__m64) ((__vector long long) __r)[0];
1300 }
1301 else
1302 return (0);
1303 }
1304
1305 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _m_psrlw (__m64 __m, __m64 __count)
1307 {
1308 return _mm_srl_pi16 (__m, __count);
1309 }
1310
1311 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_srli_pi16 (__m64 __m, int __count)
1313 {
1314 /* Promote int to long then invoke mm_srl_pi16. */
1315 return _mm_srl_pi16 (__m, __count);
1316 }
1317
1318 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1319 _m_psrlwi (__m64 __m, int __count)
1320 {
1321 return _mm_srli_pi16 (__m, __count);
1322 }
1323
1324 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1325 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_srl_pi32 (__m64 __m, __m64 __count)
1327 {
1328 __m64_union __res;
1329
1330 __res.as_m64 = __m;
1331
1332 __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1333 __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1334 return (__res.as_m64);
1335 }
1336
1337 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1338 _m_psrld (__m64 __m, __m64 __count)
1339 {
1340 return _mm_srl_pi32 (__m, __count);
1341 }
1342
1343 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1344 _mm_srli_pi32 (__m64 __m, int __count)
1345 {
1346 /* Promote int to long then invoke mm_srl_pi32. */
1347 return _mm_srl_pi32 (__m, __count);
1348 }
1349
1350 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351 _m_psrldi (__m64 __m, int __count)
1352 {
1353 return _mm_srli_pi32 (__m, __count);
1354 }
1355 #endif /* _ARCH_PWR8 */
1356
1357 /* Creates a vector of two 32-bit values; I0 is least significant. */
1358 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_set_pi32 (int __i1, int __i0)
1360 {
1361 __m64_union __res;
1362
1363 __res.as_int[0] = __i0;
1364 __res.as_int[1] = __i1;
1365 return (__res.as_m64);
1366 }
1367
1368 /* Creates a vector of four 16-bit values; W0 is least significant. */
1369 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
1371 {
1372 __m64_union __res;
1373
1374 __res.as_short[0] = __w0;
1375 __res.as_short[1] = __w1;
1376 __res.as_short[2] = __w2;
1377 __res.as_short[3] = __w3;
1378 return (__res.as_m64);
1379 }
1380
1381 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1382 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
1384 char __b3, char __b2, char __b1, char __b0)
1385 {
1386 __m64_union __res;
1387
1388 __res.as_char[0] = __b0;
1389 __res.as_char[1] = __b1;
1390 __res.as_char[2] = __b2;
1391 __res.as_char[3] = __b3;
1392 __res.as_char[4] = __b4;
1393 __res.as_char[5] = __b5;
1394 __res.as_char[6] = __b6;
1395 __res.as_char[7] = __b7;
1396 return (__res.as_m64);
1397 }
1398
1399 /* Similar, but with the arguments in reverse order. */
1400 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401 _mm_setr_pi32 (int __i0, int __i1)
1402 {
1403 __m64_union __res;
1404
1405 __res.as_int[0] = __i0;
1406 __res.as_int[1] = __i1;
1407 return (__res.as_m64);
1408 }
1409
1410 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
1412 {
1413 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
1414 }
1415
1416 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1417 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
1418 char __b4, char __b5, char __b6, char __b7)
1419 {
1420 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1421 }
1422
1423 /* Creates a vector of two 32-bit values, both elements containing I. */
1424 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 _mm_set1_pi32 (int __i)
1426 {
1427 __m64_union __res;
1428
1429 __res.as_int[0] = __i;
1430 __res.as_int[1] = __i;
1431 return (__res.as_m64);
1432 }
1433
1434 /* Creates a vector of four 16-bit values, all elements containing W. */
1435 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _mm_set1_pi16 (short __w)
1437 {
1438 #if _ARCH_PWR9
1439 __vector signed short __res;
1440
1441 __res = (__vector signed short)vec_splats (__w);
1442 return (__m64) ((__vector long long) __res)[0];
1443 #else
1444 __m64_union __res;
1445
1446 __res.as_short[0] = __w;
1447 __res.as_short[1] = __w;
1448 __res.as_short[2] = __w;
1449 __res.as_short[3] = __w;
1450 return (__res.as_m64);
1451 #endif
1452 }
1453
1454 /* Creates a vector of eight 8-bit values, all elements containing B. */
1455 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1456 _mm_set1_pi8 (signed char __b)
1457 {
1458 #if _ARCH_PWR8
1459 __vector signed char __res;
1460
1461 __res = (__vector signed char)vec_splats (__b);
1462 return (__m64) ((__vector long long) __res)[0];
1463 #else
1464 __m64_union __res;
1465
1466 __res.as_char[0] = __b;
1467 __res.as_char[1] = __b;
1468 __res.as_char[2] = __b;
1469 __res.as_char[3] = __b;
1470 __res.as_char[4] = __b;
1471 __res.as_char[5] = __b;
1472 __res.as_char[6] = __b;
1473 __res.as_char[7] = __b;
1474 return (__res.as_m64);
1475 #endif
1476 }
1477 #endif /* _MMINTRIN_H_INCLUDED */