1 /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
37 target does not support a native __vector_size__ (8) type. Instead
38 we typedef __m64 to a 64-bit unsigned long long, which is natively
39 supported in 64-bit mode. This works well for the _si64 and some
40 _pi32 operations, but starts to generate long sequences for _pi16
41 and _pi8 operations. For those cases it is better (faster and
42 smaller code) to transfer __m64 data to the PowerPC vector 128-bit
43 unit, perform the operation, and then transfer the result back to
44 the __m64 type. This relies on the direct register move
45 instructions, introduced with power8, for efficient
46 implementation of these transfers.
47
48 Most MMX intrinsic operations can be performed efficiently as
49 C language 64-bit scalar operations or optimized to use the newer
50 128-bit SSE/Altivec operations. We recommend this for new
51 applications. */
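/* As a minimal illustrative sketch (not part of this header), the
   recommended rewrite in GNU C vector extensions looks like the
   following; the type name v16qi and the helper add_bytes are
   hypothetical names used only for this example:

     typedef unsigned char v16qi __attribute__ ((__vector_size__ (16)));

     static inline v16qi
     add_bytes (v16qi __a, v16qi __b)
     {
       return __a + __b;   // GCC lowers this to a single vector add
                           // (e.g. vaddubm) when VMX/VSX is enabled.
     }

   Written this way, the compiler chooses the best instructions for each
   target and the __m64 transfers described above are not needed.  */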
52 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
53 #endif
54
55 #ifndef _MMINTRIN_H_INCLUDED
56 #define _MMINTRIN_H_INCLUDED
57
58 #include <altivec.h>
59 /* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61 typedef __attribute__ ((__aligned__ (8),
62 __may_alias__)) unsigned long long __m64;
63
64 typedef __attribute__ ((__aligned__ (8)))
65 union
66 {
67 __m64 as_m64;
68 char as_char[8];
69 signed char as_signed_char [8];
70 short as_short[4];
71 int as_int[2];
72 long long as_long_long;
73 float as_float[2];
74 double as_double;
75 } __m64_union;
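
/* Illustrative only: the pattern used throughout this file is to copy an
   __m64 value through __m64_union and then read or write individual
   lanes, for example:

     __m64_union __u;
     __u.as_m64 = __m;                 // __m is some __m64 value
     short __lane0 = __u.as_short[0];  // first 16-bit lane

   __u, __m and __lane0 are hypothetical names used only for illustration.  */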
76
77 /* Empty the multimedia state. */
78 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79 _mm_empty (void)
80 {
81 /* nothing to do on PowerPC. */
82 }
83
84 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _m_empty (void)
86 {
87 /* nothing to do on PowerPC. */
88 }
89
90 /* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
91 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
92 _mm_cvtsi32_si64 (int __i)
93 {
94 return (__m64) (unsigned int) __i;
95 }
96
97 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _m_from_int (int __i)
99 {
100 return _mm_cvtsi32_si64 (__i);
101 }
102
103 /* Convert the lower 32 bits of the __m64 object into an integer. */
104 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_cvtsi64_si32 (__m64 __i)
106 {
107 return ((int) __i);
108 }
109
110 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
111 _m_to_int (__m64 __i)
112 {
113 return _mm_cvtsi64_si32 (__i);
114 }
115
116 /* Convert I to a __m64 object. */
117
118 /* Intel intrinsic. */
119 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _m_from_int64 (long long __i)
121 {
122 return (__m64) __i;
123 }
124
125 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_cvtsi64_m64 (long long __i)
127 {
128 return (__m64) __i;
129 }
130
131 /* Microsoft intrinsic. */
132 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_cvtsi64x_si64 (long long __i)
134 {
135 return (__m64) __i;
136 }
137
138 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_set_pi64x (long long __i)
140 {
141 return (__m64) __i;
142 }
143
144 /* Convert the __m64 object to a 64-bit integer. */
145
146 /* Intel intrinsic. */
147 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _m_to_int64 (__m64 __i)
149 {
150 return (long long)__i;
151 }
152
153 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154 _mm_cvtm64_si64 (__m64 __i)
155 {
156 return (long long) __i;
157 }
158
159 /* Microsoft intrinsic. */
160 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_cvtsi64_si64x (__m64 __i)
162 {
163 return (long long) __i;
164 }
165
166 #ifdef _ARCH_PWR8
167 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
168 the result, and the four 16-bit values from M2 into the upper four 8-bit
169 values of the result, all with signed saturation. */
170 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
171 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
172 {
173 __vector signed short __vm1;
174 __vector signed char __vresult;
175
176 __vm1 = (__vector signed short) (__vector unsigned long long)
177 #ifdef __LITTLE_ENDIAN__
178 { __m1, __m2 };
179 #else
180 { __m2, __m1 };
181 #endif
182 __vresult = vec_packs (__vm1, __vm1);
183 return (__m64) ((__vector long long) __vresult)[0];
184 }
185
186 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
187 _m_packsswb (__m64 __m1, __m64 __m2)
188 {
189 return _mm_packs_pi16 (__m1, __m2);
190 }
191
192 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
193 the result, and the two 32-bit values from M2 into the upper two 16-bit
194 values of the result, all with signed saturation. */
195 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
197 {
198 __vector signed int __vm1;
199 __vector signed short __vresult;
200
201 __vm1 = (__vector signed int) (__vector unsigned long long)
202 #ifdef __LITTLE_ENDIAN__
203 { __m1, __m2 };
204 #else
205 { __m2, __m1 };
206 #endif
207 __vresult = vec_packs (__vm1, __vm1);
208 return (__m64) ((__vector long long) __vresult)[0];
209 }
210
211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _m_packssdw (__m64 __m1, __m64 __m2)
213 {
214 return _mm_packs_pi32 (__m1, __m2);
215 }
216
217 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
218 the result, and the four 16-bit values from M2 into the upper four 8-bit
219 values of the result, all with unsigned saturation. */
220 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
222 {
223 __vector unsigned char __r;
224 __vector signed short __vm1 = (__vector signed short) (__vector long long)
225 #ifdef __LITTLE_ENDIAN__
226 { __m1, __m2 };
227 #else
228 { __m2, __m1 };
229 #endif
230 const __vector signed short __zero = { 0 };
231 __vector __bool short __select = vec_cmplt (__vm1, __zero);
232 __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
233 __vector __bool char __packsel = vec_pack (__select, __select);
234 __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
235 return (__m64) ((__vector long long) __r)[0];
236 }
237
238 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
239 _m_packuswb (__m64 __m1, __m64 __m2)
240 {
241 return _mm_packs_pu16 (__m1, __m2);
242 }
243 #endif /* end ARCH_PWR8 */
244
245 /* Interleave the four 8-bit values from the high half of M1 with the four
246 8-bit values from the high half of M2. */
247 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
249 {
250 #if _ARCH_PWR8
251 __vector unsigned char __a, __b, __c;
252
253 __a = (__vector unsigned char)vec_splats (__m1);
254 __b = (__vector unsigned char)vec_splats (__m2);
255 __c = vec_mergel (__a, __b);
256 return (__m64) ((__vector long long) __c)[1];
257 #else
258 __m64_union __mu1, __mu2, __res;
259
260 __mu1.as_m64 = __m1;
261 __mu2.as_m64 = __m2;
262
263 __res.as_char[0] = __mu1.as_char[4];
264 __res.as_char[1] = __mu2.as_char[4];
265 __res.as_char[2] = __mu1.as_char[5];
266 __res.as_char[3] = __mu2.as_char[5];
267 __res.as_char[4] = __mu1.as_char[6];
268 __res.as_char[5] = __mu2.as_char[6];
269 __res.as_char[6] = __mu1.as_char[7];
270 __res.as_char[7] = __mu2.as_char[7];
271
272 return (__m64) __res.as_m64;
273 #endif
274 }
275
276 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277 _m_punpckhbw (__m64 __m1, __m64 __m2)
278 {
279 return _mm_unpackhi_pi8 (__m1, __m2);
280 }
281
282 /* Interleave the two 16-bit values from the high half of M1 with the two
283 16-bit values from the high half of M2. */
284 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
286 {
287 __m64_union __mu1, __mu2, __res;
288
289 __mu1.as_m64 = __m1;
290 __mu2.as_m64 = __m2;
291
292 __res.as_short[0] = __mu1.as_short[2];
293 __res.as_short[1] = __mu2.as_short[2];
294 __res.as_short[2] = __mu1.as_short[3];
295 __res.as_short[3] = __mu2.as_short[3];
296
297 return (__m64) __res.as_m64;
298 }
299
300 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
301 _m_punpckhwd (__m64 __m1, __m64 __m2)
302 {
303 return _mm_unpackhi_pi16 (__m1, __m2);
304 }
305 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
306 value from the high half of M2. */
307 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
308 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
309 {
310 __m64_union __mu1, __mu2, __res;
311
312 __mu1.as_m64 = __m1;
313 __mu2.as_m64 = __m2;
314
315 __res.as_int[0] = __mu1.as_int[1];
316 __res.as_int[1] = __mu2.as_int[1];
317
318 return (__m64) __res.as_m64;
319 }
320
321 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
322 _m_punpckhdq (__m64 __m1, __m64 __m2)
323 {
324 return _mm_unpackhi_pi32 (__m1, __m2);
325 }
326 /* Interleave the four 8-bit values from the low half of M1 with the four
327 8-bit values from the low half of M2. */
328 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
330 {
331 #if _ARCH_PWR8
332 __vector unsigned char __a, __b, __c;
333
334 __a = (__vector unsigned char)vec_splats (__m1);
335 __b = (__vector unsigned char)vec_splats (__m2);
336 __c = vec_mergel (__a, __b);
337 return (__m64) ((__vector long long) __c)[0];
338 #else
339 __m64_union __mu1, __mu2, __res;
340
341 __mu1.as_m64 = __m1;
342 __mu2.as_m64 = __m2;
343
344 __res.as_char[0] = __mu1.as_char[0];
345 __res.as_char[1] = __mu2.as_char[0];
346 __res.as_char[2] = __mu1.as_char[1];
347 __res.as_char[3] = __mu2.as_char[1];
348 __res.as_char[4] = __mu1.as_char[2];
349 __res.as_char[5] = __mu2.as_char[2];
350 __res.as_char[6] = __mu1.as_char[3];
351 __res.as_char[7] = __mu2.as_char[3];
352
353 return (__m64) __res.as_m64;
354 #endif
355 }
356
357 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
358 _m_punpcklbw (__m64 __m1, __m64 __m2)
359 {
360 return _mm_unpacklo_pi8 (__m1, __m2);
361 }
362 /* Interleave the two 16-bit values from the low half of M1 with the two
363 16-bit values from the low half of M2. */
364 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
366 {
367 __m64_union __mu1, __mu2, __res;
368
369 __mu1.as_m64 = __m1;
370 __mu2.as_m64 = __m2;
371
372 __res.as_short[0] = __mu1.as_short[0];
373 __res.as_short[1] = __mu2.as_short[0];
374 __res.as_short[2] = __mu1.as_short[1];
375 __res.as_short[3] = __mu2.as_short[1];
376
377 return (__m64) __res.as_m64;
378 }
379
380 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381 _m_punpcklwd (__m64 __m1, __m64 __m2)
382 {
383 return _mm_unpacklo_pi16 (__m1, __m2);
384 }
385
386 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
387 value from the low half of M2. */
388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
390 {
391 __m64_union __mu1, __mu2, __res;
392
393 __mu1.as_m64 = __m1;
394 __mu2.as_m64 = __m2;
395
396 __res.as_int[0] = __mu1.as_int[0];
397 __res.as_int[1] = __mu2.as_int[0];
398
399 return (__m64) __res.as_m64;
400 }
401
402 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403 _m_punpckldq (__m64 __m1, __m64 __m2)
404 {
405 return _mm_unpacklo_pi32 (__m1, __m2);
406 }
407
408 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
409 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410 _mm_add_pi8 (__m64 __m1, __m64 __m2)
411 {
412 #if _ARCH_PWR8
413 __vector signed char __a, __b, __c;
414
415 __a = (__vector signed char)vec_splats (__m1);
416 __b = (__vector signed char)vec_splats (__m2);
417 __c = vec_add (__a, __b);
418 return (__m64) ((__vector long long) __c)[0];
419 #else
420 __m64_union __mu1, __mu2, __res;
421
422 __mu1.as_m64 = __m1;
423 __mu2.as_m64 = __m2;
424
425 __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
426 __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
427 __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
428 __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
429 __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
430 __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
431 __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
432 __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
433
434 return (__m64) __res.as_m64;
435 #endif
436 }
437
438 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
439 _m_paddb (__m64 __m1, __m64 __m2)
440 {
441 return _mm_add_pi8 (__m1, __m2);
442 }
443
444 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
445 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446 _mm_add_pi16 (__m64 __m1, __m64 __m2)
447 {
448 #if _ARCH_PWR8
449 __vector signed short __a, __b, __c;
450
451 __a = (__vector signed short)vec_splats (__m1);
452 __b = (__vector signed short)vec_splats (__m2);
453 __c = vec_add (__a, __b);
454 return (__m64) ((__vector long long) __c)[0];
455 #else
456 __m64_union __mu1, __mu2, __res;
457
458 __mu1.as_m64 = __m1;
459 __mu2.as_m64 = __m2;
460
461 __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
462 __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
463 __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
464 __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
465
466 return (__m64) __res.as_m64;
467 #endif
468 }
469
470 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _m_paddw (__m64 __m1, __m64 __m2)
472 {
473 return _mm_add_pi16 (__m1, __m2);
474 }
475
476 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
477 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478 _mm_add_pi32 (__m64 __m1, __m64 __m2)
479 {
480 #if _ARCH_PWR9
481 __vector signed int __a, __b, __c;
482
483 __a = (__vector signed int)vec_splats (__m1);
484 __b = (__vector signed int)vec_splats (__m2);
485 __c = vec_add (__a, __b);
486 return (__m64) ((__vector long long) __c)[0];
487 #else
488 __m64_union __mu1, __mu2, __res;
489
490 __mu1.as_m64 = __m1;
491 __mu2.as_m64 = __m2;
492
493 __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
494 __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
495
496 return (__m64) __res.as_m64;
497 #endif
498 }
499
500 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _m_paddd (__m64 __m1, __m64 __m2)
502 {
503 return _mm_add_pi32 (__m1, __m2);
504 }
505
506 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
507 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
509 {
510 #if _ARCH_PWR8
511 __vector signed char __a, __b, __c;
512
513 __a = (__vector signed char)vec_splats (__m1);
514 __b = (__vector signed char)vec_splats (__m2);
515 __c = vec_sub (__a, __b);
516 return (__m64) ((__vector long long) __c)[0];
517 #else
518 __m64_union __mu1, __mu2, __res;
519
520 __mu1.as_m64 = __m1;
521 __mu2.as_m64 = __m2;
522
523 __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
524 __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
525 __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
526 __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
527 __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
528 __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
529 __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
530 __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
531
532 return (__m64) __res.as_m64;
533 #endif
534 }
535
536 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537 _m_psubb (__m64 __m1, __m64 __m2)
538 {
539 return _mm_sub_pi8 (__m1, __m2);
540 }
541
542 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
543 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
545 {
546 #if _ARCH_PWR8
547 __vector signed short __a, __b, __c;
548
549 __a = (__vector signed short)vec_splats (__m1);
550 __b = (__vector signed short)vec_splats (__m2);
551 __c = vec_sub (__a, __b);
552 return (__m64) ((__vector long long) __c)[0];
553 #else
554 __m64_union __mu1, __mu2, __res;
555
556 __mu1.as_m64 = __m1;
557 __mu2.as_m64 = __m2;
558
559 __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
560 __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
561 __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
562 __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
563
564 return (__m64) __res.as_m64;
565 #endif
566 }
567
568 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
569 _m_psubw (__m64 __m1, __m64 __m2)
570 {
571 return _mm_sub_pi16 (__m1, __m2);
572 }
573
574 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
575 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
576 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
577 {
578 #if _ARCH_PWR9
579 __vector signed int __a, __b, __c;
580
581 __a = (__vector signed int)vec_splats (__m1);
582 __b = (__vector signed int)vec_splats (__m2);
583 __c = vec_sub (__a, __b);
584 return (__m64) ((__vector long long) __c)[0];
585 #else
586 __m64_union __mu1, __mu2, __res;
587
588 __mu1.as_m64 = __m1;
589 __mu2.as_m64 = __m2;
590
591 __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
592 __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
593
594 return (__m64) __res.as_m64;
595 #endif
596 }
597
598 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _m_psubd (__m64 __m1, __m64 __m2)
600 {
601 return _mm_sub_pi32 (__m1, __m2);
602 }
603
604 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 _mm_add_si64 (__m64 __m1, __m64 __m2)
606 {
607 return (__m1 + __m2);
608 }
609
610 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 _mm_sub_si64 (__m64 __m1, __m64 __m2)
612 {
613 return (__m1 - __m2);
614 }
615
616 /* Shift the 64-bit value in M left by COUNT. */
617 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618 _mm_sll_si64 (__m64 __m, __m64 __count)
619 {
620 return (__m << __count);
621 }
622
623 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _m_psllq (__m64 __m, __m64 __count)
625 {
626 return _mm_sll_si64 (__m, __count);
627 }
628
629 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
630 _mm_slli_si64 (__m64 __m, const int __count)
631 {
632 return (__m << __count);
633 }
634
635 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
636 _m_psllqi (__m64 __m, const int __count)
637 {
638 return _mm_slli_si64 (__m, __count);
639 }
640
641 /* Shift the 64-bit value in M right by COUNT; shift in zeros. */
642 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_srl_si64 (__m64 __m, __m64 __count)
644 {
645 return (__m >> __count);
646 }
647
648 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _m_psrlq (__m64 __m, __m64 __count)
650 {
651 return _mm_srl_si64 (__m, __count);
652 }
653
654 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655 _mm_srli_si64 (__m64 __m, const int __count)
656 {
657 return (__m >> __count);
658 }
659
660 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _m_psrlqi (__m64 __m, const int __count)
662 {
663 return _mm_srli_si64 (__m, __count);
664 }
665
666 /* Bit-wise AND the 64-bit values in M1 and M2. */
667 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _mm_and_si64 (__m64 __m1, __m64 __m2)
669 {
670 return (__m1 & __m2);
671 }
672
673 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674 _m_pand (__m64 __m1, __m64 __m2)
675 {
676 return _mm_and_si64 (__m1, __m2);
677 }
678
679 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
680 64-bit value in M2. */
681 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
683 {
684 return (~__m1 & __m2);
685 }
686
687 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _m_pandn (__m64 __m1, __m64 __m2)
689 {
690 return _mm_andnot_si64 (__m1, __m2);
691 }
692
693 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
694 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_or_si64 (__m64 __m1, __m64 __m2)
696 {
697 return (__m1 | __m2);
698 }
699
700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _m_por (__m64 __m1, __m64 __m2)
702 {
703 return _mm_or_si64 (__m1, __m2);
704 }
705
706 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
707 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
708 _mm_xor_si64 (__m64 __m1, __m64 __m2)
709 {
710 return (__m1 ^ __m2);
711 }
712
713 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
714 _m_pxor (__m64 __m1, __m64 __m2)
715 {
716 return _mm_xor_si64 (__m1, __m2);
717 }
718
719 /* Creates a 64-bit zero. */
720 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721 _mm_setzero_si64 (void)
722 {
723 return (__m64) 0;
724 }
725
726 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
727 test is true and zero if false. */
728 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
730 {
731 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
732 __m64 __res;
733 __asm__(
734 "cmpb %0,%1,%2;\n"
735 : "=r" (__res)
736 : "r" (__m1),
737 "r" (__m2)
738 : );
739 return (__res);
740 #else
741 __m64_union __mu1, __mu2, __res;
742
743 __mu1.as_m64 = __m1;
744 __mu2.as_m64 = __m2;
745
746 __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
747 __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
748 __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
749 __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
750 __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
751 __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
752 __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
753 __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;
754
755 return (__m64) __res.as_m64;
756 #endif
757 }
758
759 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760 _m_pcmpeqb (__m64 __m1, __m64 __m2)
761 {
762 return _mm_cmpeq_pi8 (__m1, __m2);
763 }
764
765 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
767 {
768 #if _ARCH_PWR8
769 __vector signed char __a, __b, __c;
770
771 __a = (__vector signed char)vec_splats (__m1);
772 __b = (__vector signed char)vec_splats (__m2);
773 __c = (__vector signed char)vec_cmpgt (__a, __b);
774 return (__m64) ((__vector long long) __c)[0];
775 #else
776 __m64_union __mu1, __mu2, __res;
777
778 __mu1.as_m64 = __m1;
779 __mu2.as_m64 = __m2;
780
781 __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
782 __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
783 __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
784 __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
785 __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
786 __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
787 __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
788 __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;
789
790 return (__m64) __res.as_m64;
791 #endif
792 }
793
794 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795 _m_pcmpgtb (__m64 __m1, __m64 __m2)
796 {
797 return _mm_cmpgt_pi8 (__m1, __m2);
798 }
799
800 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
801 the test is true and zero if false. */
802 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
804 {
805 #if _ARCH_PWR8
806 __vector signed short __a, __b, __c;
807
808 __a = (__vector signed short)vec_splats (__m1);
809 __b = (__vector signed short)vec_splats (__m2);
810 __c = (__vector signed short)vec_cmpeq (__a, __b);
811 return (__m64) ((__vector long long) __c)[0];
812 #else
813 __m64_union __mu1, __mu2, __res;
814
815 __mu1.as_m64 = __m1;
816 __mu2.as_m64 = __m2;
817
818 __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
819 __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
820 __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
821 __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;
822
823 return (__m64) __res.as_m64;
824 #endif
825 }
826
827 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _m_pcmpeqw (__m64 __m1, __m64 __m2)
829 {
830 return _mm_cmpeq_pi16 (__m1, __m2);
831 }
832
833 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
835 {
836 #if _ARCH_PWR8
837 __vector signed short __a, __b, __c;
838
839 __a = (__vector signed short)vec_splats (__m1);
840 __b = (__vector signed short)vec_splats (__m2);
841 __c = (__vector signed short)vec_cmpgt (__a, __b);
842 return (__m64) ((__vector long long) __c)[0];
843 #else
844 __m64_union __mu1, __mu2, __res;
845
846 __mu1.as_m64 = __m1;
847 __mu2.as_m64 = __m2;
848
849 __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
850 __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
851 __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
852 __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;
853
854 return (__m64) __res.as_m64;
855 #endif
856 }
857
858 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _m_pcmpgtw (__m64 __m1, __m64 __m2)
860 {
861 return _mm_cmpgt_pi16 (__m1, __m2);
862 }
863
864 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
865 the test is true and zero if false. */
866 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
868 {
869 #if _ARCH_PWR9
870 __vector signed int __a, __b, __c;
871
872 __a = (__vector signed int)vec_splats (__m1);
873 __b = (__vector signed int)vec_splats (__m2);
874 __c = (__vector signed int)vec_cmpeq (__a, __b);
875 return (__m64) ((__vector long long) __c)[0];
876 #else
877 __m64_union __mu1, __mu2, __res;
878
879 __mu1.as_m64 = __m1;
880 __mu2.as_m64 = __m2;
881
882 __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
883 __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;
884
885 return (__m64) __res.as_m64;
886 #endif
887 }
888
889 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
890 _m_pcmpeqd (__m64 __m1, __m64 __m2)
891 {
892 return _mm_cmpeq_pi32 (__m1, __m2);
893 }
894
895 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
897 {
898 #if _ARCH_PWR9
899 __vector signed int __a, __b, __c;
900
901 __a = (__vector signed int)vec_splats (__m1);
902 __b = (__vector signed int)vec_splats (__m2);
903 __c = (__vector signed int)vec_cmpgt (__a, __b);
904 return (__m64) ((__vector long long) __c)[0];
905 #else
906 __m64_union __mu1, __mu2, __res;
907
908 __mu1.as_m64 = __m1;
909 __mu2.as_m64 = __m2;
910
911 __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
912 __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;
913
914 return (__m64) __res.as_m64;
915 #endif
916 }
917
918 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
919 _m_pcmpgtd (__m64 __m1, __m64 __m2)
920 {
921 return _mm_cmpgt_pi32 (__m1, __m2);
922 }
923
924 #if _ARCH_PWR8
925 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
926 saturated arithmetic. */
927 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
928 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
929 {
930 __vector signed char __a, __b, __c;
931
932 __a = (__vector signed char)vec_splats (__m1);
933 __b = (__vector signed char)vec_splats (__m2);
934 __c = vec_adds (__a, __b);
935 return (__m64) ((__vector long long) __c)[0];
936 }
937
938 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _m_paddsb (__m64 __m1, __m64 __m2)
940 {
941 return _mm_adds_pi8 (__m1, __m2);
942 }
943 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
944 saturated arithmetic. */
945 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
946 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
947 {
948 __vector signed short __a, __b, __c;
949
950 __a = (__vector signed short)vec_splats (__m1);
951 __b = (__vector signed short)vec_splats (__m2);
952 __c = vec_adds (__a, __b);
953 return (__m64) ((__vector long long) __c)[0];
954 }
955
956 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
957 _m_paddsw (__m64 __m1, __m64 __m2)
958 {
959 return _mm_adds_pi16 (__m1, __m2);
960 }
961 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
962 saturated arithmetic. */
963 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
964 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
965 {
966 __vector unsigned char __a, __b, __c;
967
968 __a = (__vector unsigned char)vec_splats (__m1);
969 __b = (__vector unsigned char)vec_splats (__m2);
970 __c = vec_adds (__a, __b);
971 return (__m64) ((__vector long long) __c)[0];
972 }
973
974 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 _m_paddusb (__m64 __m1, __m64 __m2)
976 {
977 return _mm_adds_pu8 (__m1, __m2);
978 }
979
980 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
981 saturated arithmetic. */
982 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
984 {
985 __vector unsigned short __a, __b, __c;
986
987 __a = (__vector unsigned short)vec_splats (__m1);
988 __b = (__vector unsigned short)vec_splats (__m2);
989 __c = vec_adds (__a, __b);
990 return (__m64) ((__vector long long) __c)[0];
991 }
992
993 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
994 _m_paddusw (__m64 __m1, __m64 __m2)
995 {
996 return _mm_adds_pu16 (__m1, __m2);
997 }
998
999 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
1000 saturating arithmetic. */
1001 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
1003 {
1004 __vector signed char __a, __b, __c;
1005
1006 __a = (__vector signed char)vec_splats (__m1);
1007 __b = (__vector signed char)vec_splats (__m2);
1008 __c = vec_subs (__a, __b);
1009 return (__m64) ((__vector long long) __c)[0];
1010 }
1011
1012 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013 _m_psubsb (__m64 __m1, __m64 __m2)
1014 {
1015 return _mm_subs_pi8 (__m1, __m2);
1016 }
1017
1018 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1019 signed saturating arithmetic. */
1020 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1021 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
1022 {
1023 __vector signed short __a, __b, __c;
1024
1025 __a = (__vector signed short)vec_splats (__m1);
1026 __b = (__vector signed short)vec_splats (__m2);
1027 __c = vec_subs (__a, __b);
1028 return (__m64) ((__vector long long) __c)[0];
1029 }
1030
1031 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032 _m_psubsw (__m64 __m1, __m64 __m2)
1033 {
1034 return _mm_subs_pi16 (__m1, __m2);
1035 }
1036
1037 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1038 unsigned saturating arithmetic. */
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
1041 {
1042 __vector unsigned char __a, __b, __c;
1043
1044 __a = (__vector unsigned char)vec_splats (__m1);
1045 __b = (__vector unsigned char)vec_splats (__m2);
1046 __c = vec_subs (__a, __b);
1047 return (__m64) ((__vector long long) __c)[0];
1048 }
1049
1050 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _m_psubusb (__m64 __m1, __m64 __m2)
1052 {
1053 return _mm_subs_pu8 (__m1, __m2);
1054 }
1055
1056 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1057 unsigned saturating arithmetic. */
1058 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
1060 {
1061 __vector unsigned short __a, __b, __c;
1062
1063 __a = (__vector unsigned short)vec_splats (__m1);
1064 __b = (__vector unsigned short)vec_splats (__m2);
1065 __c = vec_subs (__a, __b);
1066 return (__m64) ((__vector long long) __c)[0];
1067 }
1068
1069 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070 _m_psubusw (__m64 __m1, __m64 __m2)
1071 {
1072 return _mm_subs_pu16 (__m1, __m2);
1073 }
1074
1075 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1076 four 32-bit intermediate results, which are then summed by pairs to
1077 produce two 32-bit results. */
1078 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1079 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
1080 {
1081 __vector signed short __a, __b;
1082 __vector signed int __c;
1083 __vector signed int __zero = {0, 0, 0, 0};
1084
1085 __a = (__vector signed short)vec_splats (__m1);
1086 __b = (__vector signed short)vec_splats (__m2);
1087 __c = vec_vmsumshm (__a, __b, __zero);
1088 return (__m64) ((__vector long long) __c)[0];
1089 }
1090
1091 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1092 _m_pmaddwd (__m64 __m1, __m64 __m2)
1093 {
1094 return _mm_madd_pi16 (__m1, __m2);
1095 }
1096 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1097 M2 and produce the high 16 bits of the 32-bit results. */
1098 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1099 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
1100 {
1101 __vector signed short __a, __b;
1102 __vector signed short __c;
1103 __vector signed int __w0, __w1;
1104 __vector unsigned char __xform1 = {
1105 #ifdef __LITTLE_ENDIAN__
1106 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1107 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1108 #else
1109 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1110 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1111 #endif
1112 };
1113
1114 __a = (__vector signed short)vec_splats (__m1);
1115 __b = (__vector signed short)vec_splats (__m2);
1116
1117 __w0 = vec_vmulesh (__a, __b);
1118 __w1 = vec_vmulosh (__a, __b);
1119 __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);
1120
1121 return (__m64) ((__vector long long) __c)[0];
1122 }
1123
1124 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125 _m_pmulhw (__m64 __m1, __m64 __m2)
1126 {
1127 return _mm_mulhi_pi16 (__m1, __m2);
1128 }
1129
1130 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1131 the low 16 bits of the results. */
1132 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
1134 {
1135 __vector signed short __a, __b, __c;
1136
1137 __a = (__vector signed short)vec_splats (__m1);
1138 __b = (__vector signed short)vec_splats (__m2);
1139 __c = __a * __b;
1140 return (__m64) ((__vector long long) __c)[0];
1141 }
1142
1143 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1144 _m_pmullw (__m64 __m1, __m64 __m2)
1145 {
1146 return _mm_mullo_pi16 (__m1, __m2);
1147 }
1148
1149 /* Shift four 16-bit values in M left by COUNT. */
1150 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 _mm_sll_pi16 (__m64 __m, __m64 __count)
1152 {
1153 __vector signed short __r;
1154 __vector unsigned short __c;
1155
1156 if (__count <= 15)
1157 {
1158 __r = (__vector signed short)vec_splats (__m);
1159 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1160 __r = vec_sl (__r, (__vector unsigned short)__c);
1161 return (__m64) ((__vector long long) __r)[0];
1162 }
1163 else
1164 return (0);
1165 }
1166
1167 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _m_psllw (__m64 __m, __m64 __count)
1169 {
1170 return _mm_sll_pi16 (__m, __count);
1171 }
1172
1173 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174 _mm_slli_pi16 (__m64 __m, int __count)
1175 {
1176 /* Promote int to long then invoke mm_sll_pi16. */
1177 return _mm_sll_pi16 (__m, __count);
1178 }
1179
1180 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181 _m_psllwi (__m64 __m, int __count)
1182 {
1183 return _mm_slli_pi16 (__m, __count);
1184 }
1185
1186 /* Shift two 32-bit values in M left by COUNT. */
1187 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_sll_pi32 (__m64 __m, __m64 __count)
1189 {
1190 __m64_union __res;
1191
1192 __res.as_m64 = __m;
1193
1194 __res.as_int[0] = __res.as_int[0] << __count;
1195 __res.as_int[1] = __res.as_int[1] << __count;
1196 return (__res.as_m64);
1197 }
1198
1199 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _m_pslld (__m64 __m, __m64 __count)
1201 {
1202 return _mm_sll_pi32 (__m, __count);
1203 }
1204
1205 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_slli_pi32 (__m64 __m, int __count)
1207 {
1208 /* Promote int to long then invoke mm_sll_pi32. */
1209 return _mm_sll_pi32 (__m, __count);
1210 }
1211
1212 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1213 _m_pslldi (__m64 __m, int __count)
1214 {
1215 return _mm_slli_pi32 (__m, __count);
1216 }
1217
1218 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1219 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220 _mm_sra_pi16 (__m64 __m, __m64 __count)
1221 {
1222 __vector signed short __r;
1223 __vector unsigned short __c;
1224
1225 if (__count <= 15)
1226 {
1227 __r = (__vector signed short)vec_splats (__m);
1228 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1229 __r = vec_sra (__r, (__vector unsigned short)__c);
1230 return (__m64) ((__vector long long) __r)[0];
1231 }
1232 else
1233 return (0);
1234 }
1235
1236 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237 _m_psraw (__m64 __m, __m64 __count)
1238 {
1239 return _mm_sra_pi16 (__m, __count);
1240 }
1241
1242 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243 _mm_srai_pi16 (__m64 __m, int __count)
1244 {
1245 /* Promote int to long then invoke mm_sra_pi16. */
1246 return _mm_sra_pi16 (__m, __count);
1247 }
1248
1249 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250 _m_psrawi (__m64 __m, int __count)
1251 {
1252 return _mm_srai_pi16 (__m, __count);
1253 }
1254
1255 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1256 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_sra_pi32 (__m64 __m, __m64 __count)
1258 {
1259 __m64_union __res;
1260
1261 __res.as_m64 = __m;
1262
1263 __res.as_int[0] = __res.as_int[0] >> __count;
1264 __res.as_int[1] = __res.as_int[1] >> __count;
1265 return (__res.as_m64);
1266 }
1267
1268 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _m_psrad (__m64 __m, __m64 __count)
1270 {
1271 return _mm_sra_pi32 (__m, __count);
1272 }
1273
1274 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_srai_pi32 (__m64 __m, int __count)
1276 {
1277 /* Promote int to long then invoke mm_sra_pi32. */
1278 return _mm_sra_pi32 (__m, __count);
1279 }
1280
1281 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282 _m_psradi (__m64 __m, int __count)
1283 {
1284 return _mm_srai_pi32 (__m, __count);
1285 }
1286
1287 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1288 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_srl_pi16 (__m64 __m, __m64 __count)
1290 {
1291 __vector unsigned short __r;
1292 __vector unsigned short __c;
1293
1294 if (__count <= 15)
1295 {
1296 __r = (__vector unsigned short)vec_splats (__m);
1297 __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1298 __r = vec_sr (__r, (__vector unsigned short)__c);
1299 return (__m64) ((__vector long long) __r)[0];
1300 }
1301 else
1302 return (0);
1303 }
1304
1305 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _m_psrlw (__m64 __m, __m64 __count)
1307 {
1308 return _mm_srl_pi16 (__m, __count);
1309 }
1310
1311 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_srli_pi16 (__m64 __m, int __count)
1313 {
1314 /* Promote int to long then invoke mm_srl_pi16. */
1315 return _mm_srl_pi16 (__m, __count);
1316 }
1317
1318 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1319 _m_psrlwi (__m64 __m, int __count)
1320 {
1321 return _mm_srli_pi16 (__m, __count);
1322 }
1323
1324 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1325 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_srl_pi32 (__m64 __m, __m64 __count)
1327 {
1328 __m64_union __res;
1329
1330 __res.as_m64 = __m;
1331
1332 __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1333 __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1334 return (__res.as_m64);
1335 }
1336
1337 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1338 _m_psrld (__m64 __m, __m64 __count)
1339 {
1340 return _mm_srl_pi32 (__m, __count);
1341 }
1342
1343 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1344 _mm_srli_pi32 (__m64 __m, int __count)
1345 {
1346 /* Promote int to long then invoke mm_srl_pi32. */
1347 return _mm_srl_pi32 (__m, __count);
1348 }
1349
1350 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351 _m_psrldi (__m64 __m, int __count)
1352 {
1353 return _mm_srli_pi32 (__m, __count);
1354 }
1355 #endif /* _ARCH_PWR8 */
1356
1357 /* Creates a vector of two 32-bit values; I0 is least significant. */
1358 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_set_pi32 (int __i1, int __i0)
1360 {
1361 __m64_union __res;
1362
1363 __res.as_int[0] = __i0;
1364 __res.as_int[1] = __i1;
1365 return (__res.as_m64);
1366 }
1367
1368 /* Creates a vector of four 16-bit values; W0 is least significant. */
1369 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
1371 {
1372 __m64_union __res;
1373
1374 __res.as_short[0] = __w0;
1375 __res.as_short[1] = __w1;
1376 __res.as_short[2] = __w2;
1377 __res.as_short[3] = __w3;
1378 return (__res.as_m64);
1379 }
1380
1381 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1382 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
1384 char __b3, char __b2, char __b1, char __b0)
1385 {
1386 __m64_union __res;
1387
1388 __res.as_char[0] = __b0;
1389 __res.as_char[1] = __b1;
1390 __res.as_char[2] = __b2;
1391 __res.as_char[3] = __b3;
1392 __res.as_char[4] = __b4;
1393 __res.as_char[5] = __b5;
1394 __res.as_char[6] = __b6;
1395 __res.as_char[7] = __b7;
1396 return (__res.as_m64);
1397 }
1398
1399 /* Similar, but with the arguments in reverse order. */
1400 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401 _mm_setr_pi32 (int __i0, int __i1)
1402 {
1403 __m64_union __res;
1404
1405 __res.as_int[0] = __i0;
1406 __res.as_int[1] = __i1;
1407 return (__res.as_m64);
1408 }
1409
1410 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
1412 {
1413 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
1414 }
1415
1416 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1417 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
1418 char __b4, char __b5, char __b6, char __b7)
1419 {
1420 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1421 }
1422
1423 /* Creates a vector of two 32-bit values, both elements containing I. */
1424 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 _mm_set1_pi32 (int __i)
1426 {
1427 __m64_union __res;
1428
1429 __res.as_int[0] = __i;
1430 __res.as_int[1] = __i;
1431 return (__res.as_m64);
1432 }
1433
1434 /* Creates a vector of four 16-bit values, all elements containing W. */
1435 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _mm_set1_pi16 (short __w)
1437 {
1438 #if _ARCH_PWR9
1439 __vector signed short __res;
1440
1441 __res = (__vector signed short)vec_splats (__w);
1442 return (__m64) ((__vector long long) __res)[0];
1443 #else
1444 __m64_union __res;
1445
1446 __res.as_short[0] = __w;
1447 __res.as_short[1] = __w;
1448 __res.as_short[2] = __w;
1449 __res.as_short[3] = __w;
1450 return (__res.as_m64);
1451 #endif
1452 }
1453
1454 /* Creates a vector of eight 8-bit values, all elements containing B. */
1455 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1456 _mm_set1_pi8 (signed char __b)
1457 {
1458 #if _ARCH_PWR8
1459 __vector signed char __res;
1460
1461 __res = (__vector signed char)vec_splats (__b);
1462 return (__m64) ((__vector long long) __res)[0];
1463 #else
1464 __m64_union __res;
1465
1466 __res.as_char[0] = __b;
1467 __res.as_char[1] = __b;
1468 __res.as_char[2] = __b;
1469 __res.as_char[3] = __b;
1470 __res.as_char[4] = __b;
1471 __res.as_char[5] = __b;
1472 __res.as_char[6] = __b;
1473 __res.as_char[7] = __b;
1474 return (__res.as_m64);
1475 #endif
1476 }
1477 #endif /* _MMINTRIN_H_INCLUDED */