1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. There are also important
40 differences in data format and placement of float scalars in the
41 vector register. In PowerISA, a scalar float held in an FPR (the
42 leftmost 64 bits of the low 32 VSRs) is kept in double format, while
43 X86_64 SSE uses the rightmost 32 bits of the XMM register. These
44 differences require extra steps on POWER to match SSE scalar float semantics.
45
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
49
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
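/* A minimal sketch of the <fenv.h> recommendation above (illustrative
   only, not part of the Intel API): direct MXCSR rounding-control
   updates can usually be replaced by the standard C99 interfaces, e.g.

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON
     int __old_mode = fegetround ();
     fesetround (FE_TOWARDZERO);        (like MXCSR RC = round toward zero)
     ... rounding-sensitive code ...
     fesetround (__old_mode);

   Exception-mask manipulation additionally needs feholdexcept or the
   GNU feenableexcept/fedisableexcept extensions.  */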
56
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
59
60 #include <altivec.h>
61
62 /* Avoid collisions between altivec.h and strict adherence to C++ and
63 C11 standards. This should eventually be done inside altivec.h itself,
64 but only after testing a full distro build. */
65 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
66 (defined(__STDC_VERSION__) && \
67 __STDC_VERSION__ >= 201112L))
68 #undef vector
69 #undef pixel
70 #undef bool
71 #endif
72
73 #include <assert.h>
74
75 /* We need type definitions from the MMX header file. */
76 #include <mmintrin.h>
77
78 /* Get _mm_malloc () and _mm_free (). */
79 #include <mm_malloc.h>
80
81 /* The Intel API is flexible enough that we must allow aliasing with other
82 vector types, and their scalar components. */
83 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
84
85 /* Internal data types for implementing the intrinsics. */
86 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
87
88 /* Create an undefined vector. */
89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_undefined_ps (void)
91 {
92 __m128 __Y = __Y;
93 return __Y;
94 }
95
96 /* Create a vector of zeros. */
97 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _mm_setzero_ps (void)
99 {
100 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
101 }
102
103 /* Load four SPFP values from P. The address must be 16-byte aligned. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_load_ps (float const *__P)
106 {
107 assert(((unsigned long)__P & 0xfUL) == 0UL);
108 return ((__m128)vec_ld(0, (__v4sf*)__P));
109 }
110
111 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
112 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_loadu_ps (float const *__P)
114 {
115 return (vec_vsx_ld(0, __P));
116 }
117
118 /* Load four SPFP values in reverse order. The address must be aligned. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_loadr_ps (float const *__P)
121 {
122 __v4sf __tmp;
123 __m128 result;
124 static const __vector unsigned char permute_vector =
125 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
126 0x17, 0x10, 0x11, 0x12, 0x13 };
127
128 __tmp = vec_ld (0, (__v4sf *) __P);
129 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
130 return result;
131 }
132
133 /* Create a vector with all four elements equal to F. */
134 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_set1_ps (float __F)
136 {
137 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
138 }
139
140 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_set_ps1 (float __F)
142 {
143 return _mm_set1_ps (__F);
144 }
145
146 /* Create the vector [Z Y X W]. */
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
149 {
150 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
151 }
152
153 /* Create the vector [W X Y Z]. */
154 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
156 {
157 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
158 }
159
160 /* Store four SPFP values. The address must be 16-byte aligned. */
161 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_store_ps (float *__P, __m128 __A)
163 {
164 assert(((unsigned long)__P & 0xfUL) == 0UL);
165 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
166 }
167
168 /* Store four SPFP values. The address need not be 16-byte aligned. */
169 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_storeu_ps (float *__P, __m128 __A)
171 {
172 *(__m128 *)__P = __A;
173 }
174
175 /* Store four SPFP values in reverse order. The address must be aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storer_ps (float *__P, __m128 __A)
178 {
179 __v4sf __tmp;
180 static const __vector unsigned char permute_vector =
181 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
182 0x17, 0x10, 0x11, 0x12, 0x13 };
183
184 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
185
186 _mm_store_ps (__P, __tmp);
187 }
188
189 /* Store the lower SPFP value across four words. */
190 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_store1_ps (float *__P, __m128 __A)
192 {
193 __v4sf __va = vec_splat((__v4sf)__A, 0);
194 _mm_store_ps (__P, __va);
195 }
196
197 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_store_ps1 (float *__P, __m128 __A)
199 {
200 _mm_store1_ps (__P, __A);
201 }
202
203 /* Create a vector with element 0 as F and the rest zero. */
204 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_set_ss (float __F)
206 {
207 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
208 }
209
210 /* Sets the low SPFP value of A from the low value of B. */
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_move_ss (__m128 __A, __m128 __B)
213 {
214 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
215
216 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
217 }
218
219 /* Create a vector with element 0 as *P and the rest zero. */
220 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_load_ss (float const *__P)
222 {
223 return _mm_set_ss (*__P);
224 }
225
226 /* Stores the lower SPFP value. */
227 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_store_ss (float *__P, __m128 __A)
229 {
230 *__P = ((__v4sf)__A)[0];
231 }
232
233 /* Perform the respective operation on the lower SPFP (single-precision
234 floating-point) values of A and B; the upper three SPFP values are
235 passed through from A. */
236
237 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_add_ss (__m128 __A, __m128 __B)
239 {
240 #ifdef _ARCH_PWR7
241 __m128 a, b, c;
242 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
243 /* PowerISA VSX does not allow partial (for just the lower float)
244 results. So to ensure we don't generate spurious exceptions
245 (from the upper float values) we splat the lower float
246 before we do the operation. */
247 a = vec_splat (__A, 0);
248 b = vec_splat (__B, 0);
249 c = a + b;
250 /* Then we merge the lower float result with the original upper
251 float elements from __A. */
252 return (vec_sel (__A, c, mask));
253 #else
254 __A[0] = __A[0] + __B[0];
255 return (__A);
256 #endif
257 }
258
259 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
260 _mm_sub_ss (__m128 __A, __m128 __B)
261 {
262 #ifdef _ARCH_PWR7
263 __m128 a, b, c;
264 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
265 /* PowerISA VSX does not allow partial (for just the lower float)
266 results. So to ensure we don't generate spurious exceptions
267 (from the upper float values) we splat the lower float
268 before we do the operation. */
269 a = vec_splat (__A, 0);
270 b = vec_splat (__B, 0);
271 c = a - b;
272 /* Then we merge the lower float result with the original upper
273 float elements from __A. */
274 return (vec_sel (__A, c, mask));
275 #else
276 __A[0] = __A[0] - __B[0];
277 return (__A);
278 #endif
279 }
280
281 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282 _mm_mul_ss (__m128 __A, __m128 __B)
283 {
284 #ifdef _ARCH_PWR7
285 __m128 a, b, c;
286 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
287 /* PowerISA VSX does not allow partial (for just the lower float)
288 results. So to ensure we don't generate spurious exceptions
289 (from the upper float values) we splat the lower float
290 before we do the operation. */
291 a = vec_splat (__A, 0);
292 b = vec_splat (__B, 0);
293 c = a * b;
294 /* Then we merge the lower float result with the original upper
295 float elements from __A. */
296 return (vec_sel (__A, c, mask));
297 #else
298 __A[0] = __A[0] * __B[0];
299 return (__A);
300 #endif
301 }
302
303 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _mm_div_ss (__m128 __A, __m128 __B)
305 {
306 #ifdef _ARCH_PWR7
307 __m128 a, b, c;
308 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
309 /* PowerISA VSX does not allow partial (for just the lower float)
310 results. So to ensure we don't generate spurious exceptions
311 (from the upper float values) we splat the lower float
312 before we do the operation. */
313 a = vec_splat (__A, 0);
314 b = vec_splat (__B, 0);
315 c = a / b;
316 /* Then we merge the lower float result with the original upper
317 float elements from __A. */
318 return (vec_sel (__A, c, mask));
319 #else
320 __A[0] = __A[0] / __B[0];
321 return (__A);
322 #endif
323 }
324
325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 _mm_sqrt_ss (__m128 __A)
327 {
328 __m128 a, c;
329 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
330 /* PowerISA VSX does not allow partial (for just the lower float)
331 * results. So to ensure we don't generate spurious exceptions
332 * (from the upper float values) we splat the lower float
333 * before we do the operation. */
334 a = vec_splat (__A, 0);
335 c = vec_sqrt (a);
336 /* Then we merge the lower float result with the original upper
337 * float elements from __A. */
338 return (vec_sel (__A, c, mask));
339 }
340
341 /* Perform the respective operation on the four SPFP values in A and B. */
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_add_ps (__m128 __A, __m128 __B)
344 {
345 return (__m128) ((__v4sf)__A + (__v4sf)__B);
346 }
347
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_sub_ps (__m128 __A, __m128 __B)
350 {
351 return (__m128) ((__v4sf)__A - (__v4sf)__B);
352 }
353
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_mul_ps (__m128 __A, __m128 __B)
356 {
357 return (__m128) ((__v4sf)__A * (__v4sf)__B);
358 }
359
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_div_ps (__m128 __A, __m128 __B)
362 {
363 return (__m128) ((__v4sf)__A / (__v4sf)__B);
364 }
365
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_sqrt_ps (__m128 __A)
368 {
369 return (vec_sqrt ((__v4sf)__A));
370 }
371
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_rcp_ps (__m128 __A)
374 {
375 return (vec_re ((__v4sf)__A));
376 }
377
378 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_rsqrt_ps (__m128 __A)
380 {
381 return (vec_rsqrte (__A));
382 }
383
384 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_rcp_ss (__m128 __A)
386 {
387 __m128 a, c;
388 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389 /* PowerISA VSX does not allow partial (for just the lower float)
390 * results. So to ensure we don't generate spurious exceptions
391 * (from the upper float values) we splat the lower float
392 * before we do the operation. */
393 a = vec_splat (__A, 0);
394 c = _mm_rcp_ps (a);
395 /* Then we merge the lower float result with the original upper
396 * float elements from __A. */
397 return (vec_sel (__A, c, mask));
398 }
399
400 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401 _mm_rsqrt_ss (__m128 __A)
402 {
403 __m128 a, c;
404 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405 /* PowerISA VSX does not allow partial (for just the lower float)
406 * results. So to ensure we don't generate spurious exceptions
407 * (from the upper float values) we splat the lower float
408 * before we do the operation. */
409 a = vec_splat (__A, 0);
410 c = vec_rsqrte (a);
411 /* Then we merge the lower float result with the original upper
412 * float elements from __A. */
413 return (vec_sel (__A, c, mask));
414 }
415
416 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417 _mm_min_ss (__m128 __A, __m128 __B)
418 {
419 __v4sf a, b, c;
420 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
421 /* PowerISA VSX does not allow partial (for just lower float)
422 * results. So to ensure we don't generate spurious exceptions
423 * (from the upper float values) we splat the lower float
424 * before we do the operation. */
425 a = vec_splat ((__v4sf)__A, 0);
426 b = vec_splat ((__v4sf)__B, 0);
427 c = vec_min (a, b);
428 /* Then we merge the lower float result with the original upper
429 * float elements from __A. */
430 return (vec_sel ((__v4sf)__A, c, mask));
431 }
432
433 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434 _mm_max_ss (__m128 __A, __m128 __B)
435 {
436 __v4sf a, b, c;
437 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
438 /* PowerISA VSX does not allow partial (for just lower float)
439 * results. So to ensure we don't generate spurious exceptions
440 * (from the upper float values) we splat the lower float
441 * before we do the operation. */
442 a = vec_splat (__A, 0);
443 b = vec_splat (__B, 0);
444 c = vec_max (a, b);
445 /* Then we merge the lower float result with the original upper
446 * float elements from __A. */
447 return (vec_sel ((__v4sf)__A, c, mask));
448 }
449
450 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _mm_min_ps (__m128 __A, __m128 __B)
452 {
453 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A);
454 return vec_sel (__B, __A, m);
455 }
456
457 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_max_ps (__m128 __A, __m128 __B)
459 {
460 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B);
461 return vec_sel (__B, __A, m);
462 }
463
464 /* Perform logical bit-wise operations on 128-bit values. */
465 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 _mm_and_ps (__m128 __A, __m128 __B)
467 {
468 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
469 // return __builtin_ia32_andps (__A, __B);
470 }
471
472 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_andnot_ps (__m128 __A, __m128 __B)
474 {
475 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
476 }
477
478 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
479 _mm_or_ps (__m128 __A, __m128 __B)
480 {
481 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
482 }
483
484 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485 _mm_xor_ps (__m128 __A, __m128 __B)
486 {
487 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
488 }
489
490 /* Perform a comparison on the four SPFP values of A and B. For each
491 element, if the comparison is true, place a mask of all ones in the
492 result, otherwise a mask of zeros. */
493 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
494 _mm_cmpeq_ps (__m128 __A, __m128 __B)
495 {
496 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
497 }
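/* Usage sketch (illustrative, with caller-supplied a and b): the
   all-ones/all-zeros masks produced by the _mm_cmp*_ps family are
   typically consumed by the bitwise operations above or, on POWER8,
   by _mm_movemask_ps defined later in this header.  For example, a
   branch-free per-element select of the larger value:

     __m128 mask = _mm_cmpgt_ps (a, b);
     __m128 hi   = _mm_or_ps (_mm_and_ps (mask, a),
                              _mm_andnot_ps (mask, b));
*/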
498
499 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
500 _mm_cmplt_ps (__m128 __A, __m128 __B)
501 {
502 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
503 }
504
505 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
506 _mm_cmple_ps (__m128 __A, __m128 __B)
507 {
508 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
509 }
510
511 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512 _mm_cmpgt_ps (__m128 __A, __m128 __B)
513 {
514 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
515 }
516
517 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518 _mm_cmpge_ps (__m128 __A, __m128 __B)
519 {
520 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
521 }
522
523 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 _mm_cmpneq_ps (__m128 __A, __m128 __B)
525 {
526 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
527 return ((__m128)vec_nor (temp, temp));
528 }
529
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
532 {
533 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
534 }
535
536 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537 _mm_cmpnle_ps (__m128 __A, __m128 __B)
538 {
539 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
540 }
541
542 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
543 _mm_cmpngt_ps (__m128 __A, __m128 __B)
544 {
545 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
546 }
547
548 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cmpnge_ps (__m128 __A, __m128 __B)
550 {
551 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
552 }
553
554 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
555 _mm_cmpord_ps (__m128 __A, __m128 __B)
556 {
557 __vector unsigned int a, b;
558 __vector unsigned int c, d;
559 static const __vector unsigned int float_exp_mask =
560 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
561
562 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
563 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
564 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
565 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
566 return ((__m128 ) vec_and (c, d));
567 }
568
569 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570 _mm_cmpunord_ps (__m128 __A, __m128 __B)
571 {
572 __vector unsigned int a, b;
573 __vector unsigned int c, d;
574 static const __vector unsigned int float_exp_mask =
575 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
576
577 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
578 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
579 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
580 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
581 return ((__m128 ) vec_or (c, d));
582 }
583
584 /* Perform a comparison on the lower SPFP values of A and B. If the
585 comparison is true, place a mask of all ones in the result, otherwise a
586 mask of zeros. The upper three SPFP values are passed through from A. */
587 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
588 _mm_cmpeq_ss (__m128 __A, __m128 __B)
589 {
590 static const __vector unsigned int mask =
591 { 0xffffffff, 0, 0, 0 };
592 __v4sf a, b, c;
593 /* PowerISA VMX does not allow partial (for just element 0)
594 * results. So to ensure we don't generate spurious exceptions
595 * (from the upper elements) we splat the lower float
596 * before we do the operation. */
597 a = vec_splat ((__v4sf) __A, 0);
598 b = vec_splat ((__v4sf) __B, 0);
599 c = (__v4sf) vec_cmpeq(a, b);
600 /* Then we merge the lower float result with the original upper
601 * float elements from __A. */
602 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
603 }
604
605 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _mm_cmplt_ss (__m128 __A, __m128 __B)
607 {
608 static const __vector unsigned int mask =
609 { 0xffffffff, 0, 0, 0 };
610 __v4sf a, b, c;
611 /* PowerISA VMX does not allow partial (for just element 0)
612 * results. So to ensure we don't generate spurious exceptions
613 * (from the upper elements) we splat the lower float
614 * before we do the operation. */
615 a = vec_splat ((__v4sf) __A, 0);
616 b = vec_splat ((__v4sf) __B, 0);
617 c = (__v4sf) vec_cmplt(a, b);
618 /* Then we merge the lower float result with the original upper
619 * float elements from __A. */
620 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
621 }
622
623 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _mm_cmple_ss (__m128 __A, __m128 __B)
625 {
626 static const __vector unsigned int mask =
627 { 0xffffffff, 0, 0, 0 };
628 __v4sf a, b, c;
629 /* PowerISA VMX does not allow partial (for just element 0)
630 * results. So to ensure we don't generate spurious exceptions
631 * (from the upper elements) we splat the lower float
632 * before we do the operation. */
633 a = vec_splat ((__v4sf) __A, 0);
634 b = vec_splat ((__v4sf) __B, 0);
635 c = (__v4sf) vec_cmple(a, b);
636 /* Then we merge the lower float result with the original upper
637 * float elements from __A. */
638 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
639 }
640
641 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642 _mm_cmpgt_ss (__m128 __A, __m128 __B)
643 {
644 static const __vector unsigned int mask =
645 { 0xffffffff, 0, 0, 0 };
646 __v4sf a, b, c;
647 /* PowerISA VMX does not allow partial (for just element 0)
648 * results. So to ensure we don't generate spurious exceptions
649 * (from the upper elements) we splat the lower float
650 * before we do the operation. */
651 a = vec_splat ((__v4sf) __A, 0);
652 b = vec_splat ((__v4sf) __B, 0);
653 c = (__v4sf) vec_cmpgt(a, b);
654 /* Then we merge the lower float result with the original upper
655 * float elements from __A. */
656 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
657 }
658
659 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_cmpge_ss (__m128 __A, __m128 __B)
661 {
662 static const __vector unsigned int mask =
663 { 0xffffffff, 0, 0, 0 };
664 __v4sf a, b, c;
665 /* PowerISA VMX does not allow partial (for just element 0)
666 * results. So to ensure we don't generate spurious exceptions
667 * (from the upper elements) we splat the lower float
668 * before we do the operation. */
669 a = vec_splat ((__v4sf) __A, 0);
670 b = vec_splat ((__v4sf) __B, 0);
671 c = (__v4sf) vec_cmpge(a, b);
672 /* Then we merge the lower float result with the original upper
673 * float elements from __A. */
674 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
675 }
676
677 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
678 _mm_cmpneq_ss (__m128 __A, __m128 __B)
679 {
680 static const __vector unsigned int mask =
681 { 0xffffffff, 0, 0, 0 };
682 __v4sf a, b, c;
683 /* PowerISA VMX does not allow partial (for just element 0)
684 * results. So to ensure we don't generate spurious exceptions
685 * (from the upper elements) we splat the lower float
686 * before we do the operation. */
687 a = vec_splat ((__v4sf) __A, 0);
688 b = vec_splat ((__v4sf) __B, 0);
689 c = (__v4sf) vec_cmpeq(a, b);
690 c = vec_nor (c, c);
691 /* Then we merge the lower float result with the original upper
692 * float elements from __A. */
693 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
694 }
695
696 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
698 {
699 static const __vector unsigned int mask =
700 { 0xffffffff, 0, 0, 0 };
701 __v4sf a, b, c;
702 /* PowerISA VMX does not allow partial (for just element 0)
703 * results. So to ensure we don't generate spurious exceptions
704 * (from the upper elements) we splat the lower float
705 * before we do the operation. */
706 a = vec_splat ((__v4sf) __A, 0);
707 b = vec_splat ((__v4sf) __B, 0);
708 c = (__v4sf) vec_cmpge(a, b);
709 /* Then we merge the lower float result with the original upper
710 * float elements from __A. */
711 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
712 }
713
714 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715 _mm_cmpnle_ss (__m128 __A, __m128 __B)
716 {
717 static const __vector unsigned int mask =
718 { 0xffffffff, 0, 0, 0 };
719 __v4sf a, b, c;
720 /* PowerISA VMX does not allow partial (for just element 0)
721 * results. So to ensure we don't generate spurious exceptions
722 * (from the upper elements) we splat the lower float
723 * before we do the operation. */
724 a = vec_splat ((__v4sf) __A, 0);
725 b = vec_splat ((__v4sf) __B, 0);
726 c = (__v4sf) vec_cmpgt(a, b);
727 /* Then we merge the lower float result with the original upper
728 * float elements from __A. */
729 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
730 }
731
732 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733 _mm_cmpngt_ss (__m128 __A, __m128 __B)
734 {
735 static const __vector unsigned int mask =
736 { 0xffffffff, 0, 0, 0 };
737 __v4sf a, b, c;
738 /* PowerISA VMX does not allow partial (for just element 0)
739 * results. So to ensure we don't generate spurious exceptions
740 * (from the upper elements) we splat the lower float
741 * before we do the operation. */
742 a = vec_splat ((__v4sf) __A, 0);
743 b = vec_splat ((__v4sf) __B, 0);
744 c = (__v4sf) vec_cmple(a, b);
745 /* Then we merge the lower float result with the original upper
746 * float elements from __A. */
747 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
748 }
749
750 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
751 _mm_cmpnge_ss (__m128 __A, __m128 __B)
752 {
753 static const __vector unsigned int mask =
754 { 0xffffffff, 0, 0, 0 };
755 __v4sf a, b, c;
756 /* PowerISA VMX does not allow partial (for just element 0)
757 * results. So to ensure we don't generate spurious exceptions
758 * (from the upper elements) we splat the lower float
759 * before we do the operation. */
760 a = vec_splat ((__v4sf) __A, 0);
761 b = vec_splat ((__v4sf) __B, 0);
762 c = (__v4sf) vec_cmplt(a, b);
763 /* Then we merge the lower float result with the original upper
764 * float elements from __A. */
765 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
766 }
767
768 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
769 _mm_cmpord_ss (__m128 __A, __m128 __B)
770 {
771 __vector unsigned int a, b;
772 __vector unsigned int c, d;
773 static const __vector unsigned int float_exp_mask =
774 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
775 static const __vector unsigned int mask =
776 { 0xffffffff, 0, 0, 0 };
777
778 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
779 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
780 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
781 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
782 c = vec_and (c, d);
783 /* Then we merge the lower float result with the original upper
784 * float elements from __A. */
785 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
786 }
787
788 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
789 _mm_cmpunord_ss (__m128 __A, __m128 __B)
790 {
791 __vector unsigned int a, b;
792 __vector unsigned int c, d;
793 static const __vector unsigned int float_exp_mask =
794 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
795 static const __vector unsigned int mask =
796 { 0xffffffff, 0, 0, 0 };
797
798 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
799 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
800 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
801 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
802 c = vec_or (c, d);
803 /* Then we merge the lower float result with the original upper
804 * float elements from __A. */
805 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
806 }
807
808 /* Compare the lower SPFP values of A and B and return 1 if true
809 and 0 if false. */
810 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _mm_comieq_ss (__m128 __A, __m128 __B)
812 {
813 return (__A[0] == __B[0]);
814 }
815
816 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817 _mm_comilt_ss (__m128 __A, __m128 __B)
818 {
819 return (__A[0] < __B[0]);
820 }
821
822 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823 _mm_comile_ss (__m128 __A, __m128 __B)
824 {
825 return (__A[0] <= __B[0]);
826 }
827
828 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829 _mm_comigt_ss (__m128 __A, __m128 __B)
830 {
831 return (__A[0] > __B[0]);
832 }
833
834 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835 _mm_comige_ss (__m128 __A, __m128 __B)
836 {
837 return (__A[0] >= __B[0]);
838 }
839
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_comineq_ss (__m128 __A, __m128 __B)
842 {
843 return (__A[0] != __B[0]);
844 }
845
846 /* FIXME
847 * The _mm_ucomi??_ss implementations below are exactly the same as
848 * the _mm_comi??_ss ones because GCC for PowerPC only generates unordered
849 * compares (scalar and vector).
850 * Technically _mm_comieq_ss et al. should be using the ordered
851 * compare and signal for QNaNs.
852 * The _mm_ucomieq_ss et al. should be OK as is.
853 */
854 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
855 _mm_ucomieq_ss (__m128 __A, __m128 __B)
856 {
857 return (__A[0] == __B[0]);
858 }
859
860 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
861 _mm_ucomilt_ss (__m128 __A, __m128 __B)
862 {
863 return (__A[0] < __B[0]);
864 }
865
866 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _mm_ucomile_ss (__m128 __A, __m128 __B)
868 {
869 return (__A[0] <= __B[0]);
870 }
871
872 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _mm_ucomigt_ss (__m128 __A, __m128 __B)
874 {
875 return (__A[0] > __B[0]);
876 }
877
878 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879 _mm_ucomige_ss (__m128 __A, __m128 __B)
880 {
881 return (__A[0] >= __B[0]);
882 }
883
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
885 _mm_ucomineq_ss (__m128 __A, __m128 __B)
886 {
887 return (__A[0] != __B[0]);
888 }
889
890 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
891 _mm_cvtss_f32 (__m128 __A)
892 {
893 return ((__v4sf)__A)[0];
894 }
895
896 /* Convert the lower SPFP value to a 32-bit integer according to the current
897 rounding mode. */
898 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
899 _mm_cvtss_si32 (__m128 __A)
900 {
901 __m64 res = 0;
902 #ifdef _ARCH_PWR8
903 __m128 vtmp;
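/* Shift float element 0 of __A into the scalar (leftmost) position
   (xxsldwi), convert it from single to double precision (xscvspdp),
   round it to a 32-bit integer under the current rounding mode (fctiw),
   and move the result from the VSR to a GPR (mfvsrd).  */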
904 __asm__(
905 "xxsldwi %x1,%x2,%x2,3;\n"
906 "xscvspdp %x1,%x1;\n"
907 "fctiw %1,%1;\n"
908 "mfvsrd %0,%x1;\n"
909 : "=r" (res),
910 "=&wi" (vtmp)
911 : "wa" (__A)
912 : );
913 #else
914 res = __builtin_rint(__A[0]);
915 #endif
916 return (res);
917 }
918
919 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
920 _mm_cvt_ss2si (__m128 __A)
921 {
922 return _mm_cvtss_si32 (__A);
923 }
924
925 /* Convert the lower SPFP value to a 64-bit integer according to the
926 current rounding mode. */
927
928 /* Intel intrinsic. */
929 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
930 _mm_cvtss_si64 (__m128 __A)
931 {
932 __m64 res = 0;
933 #ifdef _ARCH_PWR8
934 __m128 vtmp;
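/* Same sequence as _mm_cvtss_si32 above, but fctid rounds to a 64-bit
   integer under the current rounding mode before the move to a GPR.  */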
935 __asm__(
936 "xxsldwi %x1,%x2,%x2,3;\n"
937 "xscvspdp %x1,%x1;\n"
938 "fctid %1,%1;\n"
939 "mfvsrd %0,%x1;\n"
940 : "=r" (res),
941 "=&wi" (vtmp)
942 : "wa" (__A)
943 : );
944 #else
945 res = __builtin_llrint(__A[0]);
946 #endif
947 return (res);
948 }
949
950 /* Microsoft intrinsic. */
951 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _mm_cvtss_si64x (__m128 __A)
953 {
954 return _mm_cvtss_si64 ((__v4sf) __A);
955 }
956
957 /* Constants for use with _mm_prefetch. */
958 enum _mm_hint
959 {
960 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
961 _MM_HINT_ET0 = 7,
962 _MM_HINT_ET1 = 6,
963 _MM_HINT_T0 = 3,
964 _MM_HINT_T1 = 2,
965 _MM_HINT_T2 = 1,
966 _MM_HINT_NTA = 0
967 };
968
969 /* Loads one cache line from address P to a location "closer" to the
970 processor. The selector I specifies the type of prefetch operation. */
971 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972 _mm_prefetch (const void *__P, enum _mm_hint __I)
973 {
974 /* Current PowerPC implementations ignore the hint parameter. */
975 __builtin_prefetch (__P);
976 }
977
978 /* Convert the two lower SPFP values to 32-bit integers according to the
979 current rounding mode. Return the integers in packed form. */
980 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm_cvtps_pi32 (__m128 __A)
982 {
984 __v4sf temp, rounded;
985 __vector __m64 result;
986
987 /* Splat two lower SPFP values to both halves. */
988 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
989 rounded = vec_rint(temp);
990 result = (__vector __m64) vec_cts (rounded, 0);
991
992 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
993 }
994
995 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_cvt_ps2pi (__m128 __A)
997 {
998 return _mm_cvtps_pi32 (__A);
999 }
1000
1001 /* Truncate the lower SPFP value to a 32-bit integer. */
1002 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_cvttss_si32 (__m128 __A)
1004 {
1005 /* Extract the lower float element. */
1006 float temp = __A[0];
1007 /* truncate to 32-bit integer and return. */
1008 return temp;
1009 }
1010
1011 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm_cvtt_ss2si (__m128 __A)
1013 {
1014 return _mm_cvttss_si32 (__A);
1015 }
1016
1017 /* Intel intrinsic. */
1018 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1019 _mm_cvttss_si64 (__m128 __A)
1020 {
1021 /* Extract the lower float element. */
1022 float temp = __A[0];
1023 /* truncate to 64-bit integer and return. */
1024 return temp;
1025 }
1026
1027 /* Microsoft intrinsic. */
1028 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1029 _mm_cvttss_si64x (__m128 __A)
1030 {
1031 /* Extract the lower float element. */
1032 float temp = __A[0];
1033 /* truncate to 64-bit integer and return. */
1034 return temp;
1035 }
1036
1037 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1038 integers in packed form. */
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvttps_pi32 (__m128 __A)
1041 {
1042 __v4sf temp;
1043 __vector __m64 result;
1044
1045 /* Splat two lower SPFP values to both halves. */
1046 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1047 result = (__vector __m64) vec_cts (temp, 0);
1048
1049 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1050 }
1051
1052 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1053 _mm_cvtt_ps2pi (__m128 __A)
1054 {
1055 return _mm_cvttps_pi32 (__A);
1056 }
1057
1058 /* Convert B to a SPFP value and insert it as element zero in A. */
1059 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvtsi32_ss (__m128 __A, int __B)
1061 {
1062 float temp = __B;
1063 __A[0] = temp;
1064
1065 return __A;
1066 }
1067
1068 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069 _mm_cvt_si2ss (__m128 __A, int __B)
1070 {
1071 return _mm_cvtsi32_ss (__A, __B);
1072 }
1073
1074 /* Convert B to a SPFP value and insert it as element zero in A. */
1075 /* Intel intrinsic. */
1076 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1077 _mm_cvtsi64_ss (__m128 __A, long long __B)
1078 {
1079 float temp = __B;
1080 __A[0] = temp;
1081
1082 return __A;
1083 }
1084
1085 /* Microsoft intrinsic. */
1086 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1088 {
1089 return _mm_cvtsi64_ss (__A, __B);
1090 }
1091
1092 /* Convert the two 32-bit values in B to SPFP form and insert them
1093 as the two lower elements in A. */
1094 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1095 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1096 {
1097 __vector signed int vm1;
1098 __vector float vf1;
1099
1100 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1101 vf1 = (__vector float) vec_ctf (vm1, 0);
1102
1103 return ((__m128) (__vector __m64)
1104 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1105 }
1106
1107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1109 {
1110 return _mm_cvtpi32_ps (__A, __B);
1111 }
1112
1113 /* Convert the four signed 16-bit values in A to SPFP form. */
1114 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1115 _mm_cvtpi16_ps (__m64 __A)
1116 {
1117 __vector signed short vs8;
1118 __vector signed int vi4;
1119 __vector float vf1;
1120
1121 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1122 vi4 = vec_vupklsh (vs8);
1123 vf1 = (__vector float) vec_ctf (vi4, 0);
1124
1125 return (__m128) vf1;
1126 }
1127
1128 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1129 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130 _mm_cvtpu16_ps (__m64 __A)
1131 {
1132 const __vector unsigned short zero =
1133 { 0, 0, 0, 0, 0, 0, 0, 0 };
1134 __vector unsigned short vs8;
1135 __vector unsigned int vi4;
1136 __vector float vf1;
1137
1138 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1139 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1140 vf1 = (__vector float) vec_ctf (vi4, 0);
1141
1142 return (__m128) vf1;
1143 }
1144
1145 /* Convert the low four signed 8-bit values in A to SPFP form. */
1146 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_cvtpi8_ps (__m64 __A)
1148 {
1149 __vector signed char vc16;
1150 __vector signed short vs8;
1151 __vector signed int vi4;
1152 __vector float vf1;
1153
1154 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1155 vs8 = vec_vupkhsb (vc16);
1156 vi4 = vec_vupkhsh (vs8);
1157 vf1 = (__vector float) vec_ctf (vi4, 0);
1158
1159 return (__m128) vf1;
1160 }
1161
1162 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1163 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1164
1165 _mm_cvtpu8_ps (__m64 __A)
1166 {
1167 const __vector unsigned char zero =
1168 { 0, 0, 0, 0, 0, 0, 0, 0 };
1169 __vector unsigned char vc16;
1170 __vector unsigned short vs8;
1171 __vector unsigned int vi4;
1172 __vector float vf1;
1173
1174 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1175 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1176 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1177 (__vector unsigned short) zero);
1178 vf1 = (__vector float) vec_ctf (vi4, 0);
1179
1180 return (__m128) vf1;
1181 }
1182
1183 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1184 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1186 {
1187 __vector signed int vi4;
1188 __vector float vf4;
1189
1190 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1191 vf4 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf4;
1193 }
1194
1195 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1196 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtps_pi16(__m128 __A)
1198 {
1199 __v4sf rounded;
1200 __vector signed int temp;
1201 __vector __m64 result;
1202
1203 rounded = vec_rint(__A);
1204 temp = vec_cts (rounded, 0);
1205 result = (__vector __m64) vec_pack (temp, temp);
1206
1207 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1208 }
1209
1210 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 _mm_cvtps_pi8(__m128 __A)
1213 {
1214 __v4sf rounded;
1215 __vector signed int tmp_i;
1216 static const __vector signed int zero = {0, 0, 0, 0};
1217 __vector signed short tmp_s;
1218 __vector signed char res_v;
1219 __m64 result;
1220
1221 rounded = vec_rint(__A);
1222 tmp_i = vec_cts (rounded, 0);
1223 tmp_s = vec_pack (tmp_i, zero);
1224 res_v = vec_pack (tmp_s, tmp_s);
1225 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1226
1227 return (result);
1228 }
1229
1230 /* Selects four specific SPFP values from A and B based on MASK. */
1231 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232
1233 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1234 {
1235 unsigned long element_selector_10 = __mask & 0x03;
1236 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1237 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1238 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1239 static const unsigned int permute_selectors[4] =
1240 {
1241 #ifdef __LITTLE_ENDIAN__
1242 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1243 #elif __BIG_ENDIAN__
1244 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1245 #endif
1246 };
1247 __vector unsigned int t;
1248
1249 #ifdef __LITTLE_ENDIAN__
1250 t[0] = permute_selectors[element_selector_10];
1251 t[1] = permute_selectors[element_selector_32];
1252 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1253 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1254 #elif __BIG_ENDIAN__
1255 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1256 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1257 t[1] = permute_selectors[element_selector_54];
1258 t[0] = permute_selectors[element_selector_76];
1259 #endif
1260 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1261 }
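/* Illustrative note on the mask encoding (matching the selector logic
   above): bits [1:0] and [3:2] of __mask pick result elements 0 and 1
   from __A; bits [5:4] and [7:6] pick result elements 2 and 3 from __B.
   For caller-supplied a and b,

     __m128 r = _mm_shuffle_ps (a, b, 0x1B);

   yields r = { a[3], a[2], b[1], b[0] }.  */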
1262
1263 /* Selects and interleaves the upper two SPFP values from A and B. */
1264 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1266 {
1267 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1268 }
1269
1270 /* Selects and interleaves the lower two SPFP values from A and B. */
1271 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1273 {
1274 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1275 }
1276
1277 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1278 the lower two values are passed through from A. */
1279 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1281 {
1282 __vector __m64 __a = (__vector __m64)__A;
1283 __vector __m64 __p = vec_splats(*__P);
1284 __a [1] = __p [1];
1285
1286 return (__m128)__a;
1287 }
1288
1289 /* Stores the upper two SPFP values of A into P. */
1290 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1291 _mm_storeh_pi (__m64 *__P, __m128 __A)
1292 {
1293 __vector __m64 __a = (__vector __m64) __A;
1294
1295 *__P = __a[1];
1296 }
1297
1298 /* Moves the upper two values of B into the lower two values of A. */
1299 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_movehl_ps (__m128 __A, __m128 __B)
1301 {
1302 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1303 }
1304
1305 /* Moves the lower two values of B into the upper two values of A. */
1306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _mm_movelh_ps (__m128 __A, __m128 __B)
1308 {
1309 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1310 }
1311
1312 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1313 the upper two values are passed through from A. */
1314 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1316 {
1317 __vector __m64 __a = (__vector __m64)__A;
1318 __vector __m64 __p = vec_splats(*__P);
1319 __a [0] = __p [0];
1320
1321 return (__m128)__a;
1322 }
1323
1324 /* Stores the lower two SPFP values of A into P. */
1325 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_storel_pi (__m64 *__P, __m128 __A)
1327 {
1328 __vector __m64 __a = (__vector __m64) __A;
1329
1330 *__P = __a[0];
1331 }
1332
1333 #ifdef _ARCH_PWR8
1334 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1335
1336 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1337 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1338 _mm_movemask_ps (__m128 __A)
1339 {
1340 __vector __m64 result;
1341 static const __vector unsigned int perm_mask =
1342 {
1343 #ifdef __LITTLE_ENDIAN__
1344 0x00204060, 0x80808080, 0x80808080, 0x80808080
1345 #elif __BIG_ENDIAN__
1346 0x80808080, 0x80808080, 0x80808080, 0x00204060
1347 #endif
1348 };
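/* The permute control directs vec_vbpermq to gather the most-significant
   (sign) bit of each 32-bit element into a contiguous 4-bit field; that
   field is then read out of the appropriate doubleword below.  */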
1349
1350 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1351 (__vector unsigned char) perm_mask);
1352
1353 #ifdef __LITTLE_ENDIAN__
1354 return result[1];
1355 #elif __BIG_ENDIAN__
1356 return result[0];
1357 #endif
1358 }
1359 #endif /* _ARCH_PWR8 */
1360
1361 /* Create a vector with all four elements equal to *P. */
1362 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363 _mm_load1_ps (float const *__P)
1364 {
1365 return _mm_set1_ps (*__P);
1366 }
1367
1368 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_load_ps1 (float const *__P)
1370 {
1371 return _mm_load1_ps (__P);
1372 }
1373
1374 /* Extracts one of the four words of A. The selector N must be immediate. */
1375 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm_extract_pi16 (__m64 const __A, int const __N)
1377 {
1378 const int shiftr = (__N & 3) * 16;
1379
1380 return ((__A >> shiftr) & 0xffff);
1381 }
1382
1383 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384 _m_pextrw (__m64 const __A, int const __N)
1385 {
1386 return _mm_extract_pi16 (__A, __N);
1387 }
1388
1389 /* Inserts word D into one of four words of A. The selector N must be
1390 immediate. */
1391 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1393 {
1394 const int shiftl = (__N & 3) * 16;
1395 const __m64 shiftD = (const __m64) __D << shiftl;
1396 const __m64 mask = 0xffffUL << shiftl;
1397 __m64 result = (__A & (~mask)) | (shiftD & mask);
1398
1399 return (result);
1400 }
1401
1402 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1404 {
1405 return _mm_insert_pi16 (__A, __D, __N);
1406 }
1407
1408 /* Compute the element-wise maximum of signed 16-bit values. */
1409 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1410
1411 _mm_max_pi16 (__m64 __A, __m64 __B)
1412 {
1413 #if _ARCH_PWR8
1414 __vector signed short a, b, r;
1415 __vector __bool short c;
1416
1417 a = (__vector signed short)vec_splats (__A);
1418 b = (__vector signed short)vec_splats (__B);
1419 c = (__vector __bool short)vec_cmpgt (a, b);
1420 r = vec_sel (b, a, c);
1421 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1422 #else
1423 __m64_union m1, m2, res;
1424
1425 m1.as_m64 = __A;
1426 m2.as_m64 = __B;
1427
1428 res.as_short[0] =
1429 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1430 res.as_short[1] =
1431 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1432 res.as_short[2] =
1433 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1434 res.as_short[3] =
1435 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1436
1437 return (__m64) res.as_m64;
1438 #endif
1439 }
1440
1441 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1442 _m_pmaxsw (__m64 __A, __m64 __B)
1443 {
1444 return _mm_max_pi16 (__A, __B);
1445 }
1446
1447 /* Compute the element-wise maximum of unsigned 8-bit values. */
1448 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1449 _mm_max_pu8 (__m64 __A, __m64 __B)
1450 {
1451 #if _ARCH_PWR8
1452 __vector unsigned char a, b, r;
1453 __vector __bool char c;
1454
1455 a = (__vector unsigned char)vec_splats (__A);
1456 b = (__vector unsigned char)vec_splats (__B);
1457 c = (__vector __bool char)vec_cmpgt (a, b);
1458 r = vec_sel (b, a, c);
1459 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1460 #else
1461 __m64_union m1, m2, res;
1462 long i;
1463
1464 m1.as_m64 = __A;
1465 m2.as_m64 = __B;
1466
1467
1468 for (i = 0; i < 8; i++)
1469 res.as_char[i] =
1470 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1471 m1.as_char[i] : m2.as_char[i];
1472
1473 return (__m64) res.as_m64;
1474 #endif
1475 }
1476
1477 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478 _m_pmaxub (__m64 __A, __m64 __B)
1479 {
1480 return _mm_max_pu8 (__A, __B);
1481 }
1482
1483 /* Compute the element-wise minimum of signed 16-bit values. */
1484 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1485 _mm_min_pi16 (__m64 __A, __m64 __B)
1486 {
1487 #if _ARCH_PWR8
1488 __vector signed short a, b, r;
1489 __vector __bool short c;
1490
1491 a = (__vector signed short)vec_splats (__A);
1492 b = (__vector signed short)vec_splats (__B);
1493 c = (__vector __bool short)vec_cmplt (a, b);
1494 r = vec_sel (b, a, c);
1495 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1496 #else
1497 __m64_union m1, m2, res;
1498
1499 m1.as_m64 = __A;
1500 m2.as_m64 = __B;
1501
1502 res.as_short[0] =
1503 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1504 res.as_short[1] =
1505 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1506 res.as_short[2] =
1507 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1508 res.as_short[3] =
1509 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1510
1511 return (__m64) res.as_m64;
1512 #endif
1513 }
1514
1515 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1516 _m_pminsw (__m64 __A, __m64 __B)
1517 {
1518 return _mm_min_pi16 (__A, __B);
1519 }
1520
1521 /* Compute the element-wise minimum of unsigned 8-bit values. */
1522 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1523 _mm_min_pu8 (__m64 __A, __m64 __B)
1524 {
1525 #if _ARCH_PWR8
1526 __vector unsigned char a, b, r;
1527 __vector __bool char c;
1528
1529 a = (__vector unsigned char)vec_splats (__A);
1530 b = (__vector unsigned char)vec_splats (__B);
1531 c = (__vector __bool char)vec_cmplt (a, b);
1532 r = vec_sel (b, a, c);
1533 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1534 #else
1535 __m64_union m1, m2, res;
1536 long i;
1537
1538 m1.as_m64 = __A;
1539 m2.as_m64 = __B;
1540
1541
1542 for (i = 0; i < 8; i++)
1543 res.as_char[i] =
1544 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1545 m1.as_char[i] : m2.as_char[i];
1546
1547 return (__m64) res.as_m64;
1548 #endif
1549 }
1550
1551 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1552 _m_pminub (__m64 __A, __m64 __B)
1553 {
1554 return _mm_min_pu8 (__A, __B);
1555 }
1556
1557 /* Create an 8-bit mask of the signs of 8-bit values. */
1558 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1559 _mm_movemask_pi8 (__m64 __A)
1560 {
1561 unsigned long p = 0x0008101820283038UL; // permute control for sign bits
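/* Each byte of p is a bit index (0x00, 0x08, ..., 0x38) selecting the
   sign bit of one byte of __A; __builtin_bpermd gathers those eight
   bits into the low byte of the result.  */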
1562
1563 return __builtin_bpermd (p, __A);
1564 }
1565
1566 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pmovmskb (__m64 __A)
1568 {
1569 return _mm_movemask_pi8 (__A);
1570 }
1571
1572 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573 in B and produce the high 16 bits of the 32-bit results. */
1574 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1576 {
1577 __vector unsigned short a, b;
1578 __vector unsigned short c;
1579 __vector unsigned int w0, w1;
1580 __vector unsigned char xform1 = {
1581 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1582 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1583 };
1584
1585 a = (__vector unsigned short)vec_splats (__A);
1586 b = (__vector unsigned short)vec_splats (__B);
1587
1588 w0 = vec_vmuleuh (a, b);
1589 w1 = vec_vmulouh (a, b);
1590 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1591
1592 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1593 }
1594
1595 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1596 _m_pmulhuw (__m64 __A, __m64 __B)
1597 {
1598 return _mm_mulhi_pu16 (__A, __B);
1599 }
1600
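/* An illustrative scalar model, not part of the original header, of the
   multiply/permute sequence in _mm_mulhi_pu16 above: each 16-bit result
   element is the high half of the 32-bit product of the corresponding
   unsigned 16-bit elements.  The helper name is hypothetical.

     static inline unsigned short
     __mulhi_u16_reference (unsigned short __x, unsigned short __y)
     {
       // For example, 0xFFFF * 0xFFFF = 0xFFFE0001, so the high half is 0xFFFE.
       return (unsigned short) (((unsigned int) __x * (unsigned int) __y) >> 16);
     }
*/
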
1601 /* Return a combination of the four 16-bit values in A. The selector
1602 must be an immediate. */
1603 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1604 _mm_shuffle_pi16 (__m64 __A, int const __N)
1605 {
1606 unsigned long element_selector_10 = __N & 0x03;
1607 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1608 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1609 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1610 static const unsigned short permute_selectors[4] =
1611 {
1612 #ifdef __LITTLE_ENDIAN__
1613 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1614 #elif __BIG_ENDIAN__
1615 0x0607, 0x0405, 0x0203, 0x0001
1616 #endif
1617 };
1618 __m64_union t;
1619 __vector __m64 a, p, r;
1620
1621 #ifdef __LITTLE_ENDIAN__
1622 t.as_short[0] = permute_selectors[element_selector_10];
1623 t.as_short[1] = permute_selectors[element_selector_32];
1624 t.as_short[2] = permute_selectors[element_selector_54];
1625 t.as_short[3] = permute_selectors[element_selector_76];
1626 #elif __BIG_ENDIAN__
1627 t.as_short[3] = permute_selectors[element_selector_10];
1628 t.as_short[2] = permute_selectors[element_selector_32];
1629 t.as_short[1] = permute_selectors[element_selector_54];
1630 t.as_short[0] = permute_selectors[element_selector_76];
1631 #endif
1632 p = vec_splats (t.as_m64);
1633 a = vec_splats (__A);
1634 r = vec_perm (a, a, (__vector unsigned char)p);
1635 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1636 }
1637
1638 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1639 _m_pshufw (__m64 __A, int const __N)
1640 {
1641 return _mm_shuffle_pi16 (__A, __N);
1642 }
1643
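/* Usage sketch, illustrative only: the selector packs four 2-bit source
   element indices, lowest result element first.  A selector of 0x1B
   (the conventional _MM_SHUFFLE (0, 1, 2, 3) encoding) therefore reverses
   the four 16-bit elements.  The function name is hypothetical.

     static inline __m64
     __reverse_pi16 (__m64 __a)
     {
       return _mm_shuffle_pi16 (__a, 0x1B);
     }
*/
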
1644 /* Conditionally store byte elements of A into P. The high bit of each
1645 byte in the selector N determines whether the corresponding byte from
1646 A is stored. */
1647 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1648 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1649 {
1650 __m64 hibit = 0x8080808080808080UL;
1651 __m64 mask, tmp;
1652 __m64 *p = (__m64*)__P;
1653
1654 tmp = *p;
1655 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1656 tmp = (tmp & (~mask)) | (__A & mask);
1657 *p = tmp;
1658 }
1659
1660 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1661 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1662 {
1663 _mm_maskmove_si64 (__A, __N, __P);
1664 }
1665
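/* Usage sketch, illustrative only; the function name and mask value are
   hypothetical: store only the even-indexed bytes of __a, leaving the
   odd-indexed bytes of the destination unchanged.

     static inline void
     __store_even_bytes (__m64 __a, char *__buf)
     {
       // A mask byte with its high bit set selects the corresponding byte of __a.
       __m64 __mask = 0x0080008000800080UL;
       _mm_maskmove_si64 (__a, __mask, __buf);
     }
*/
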
1666 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1667 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1668 _mm_avg_pu8 (__m64 __A, __m64 __B)
1669 {
1670 __vector unsigned char a, b, c;
1671
1672 a = (__vector unsigned char)vec_splats (__A);
1673 b = (__vector unsigned char)vec_splats (__B);
1674 c = vec_avg (a, b);
1675 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1676 }
1677
1678 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1679 _m_pavgb (__m64 __A, __m64 __B)
1680 {
1681 return _mm_avg_pu8 (__A, __B);
1682 }
1683
1684 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1685 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1686 _mm_avg_pu16 (__m64 __A, __m64 __B)
1687 {
1688 __vector unsigned short a, b, c;
1689
1690 a = (__vector unsigned short)vec_splats (__A);
1691 b = (__vector unsigned short)vec_splats (__B);
1692 c = vec_avg (a, b);
1693 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1694 }
1695
1696 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1697 _m_pavgw (__m64 __A, __m64 __B)
1698 {
1699 return _mm_avg_pu16 (__A, __B);
1700 }
1701
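/* Worked example, illustrative only: vec_avg matches the X86 rounded
   average (a + b + 1) >> 1 on each unsigned element, so averaging the
   byte values 5 and 6 gives (5 + 6 + 1) >> 1 = 6.  The helper name is
   hypothetical.

     static inline unsigned char
     __avg_u8_reference (unsigned char __x, unsigned char __y)
     {
       return (unsigned char) (((unsigned int) __x + (unsigned int) __y + 1) >> 1);
     }
*/
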
1702 /* Compute the sum of the absolute differences of the unsigned 8-bit
1703 values in A and B. Return the value in the lower 16-bit word; the
1704 upper words are cleared. */
1705 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1706 _mm_sad_pu8 (__m64 __A, __m64 __B)
1707 {
1708 __vector unsigned char a, b;
1709 __vector unsigned char vmin, vmax, vabsdiff;
1710 __vector signed int vsum;
1711 const __vector unsigned int zero =
1712 { 0, 0, 0, 0 };
1713 unsigned short result;
1714
1715 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1716 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1717 vmin = vec_min (a, b);
1718 vmax = vec_max (a, b);
1719 vabsdiff = vec_sub (vmax, vmin);
1720 /* Sum four groups of bytes into integers. */
1721 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1722 /* Sum across four integers with integer result. */
1723 vsum = vec_sums (vsum, (__vector signed int) zero);
1724 /* The sum is in the rightmost 32 bits of the vector result.
1725 Transfer to a GPR and truncate to 16 bits. */
1726 result = vsum[3];
1727 return (result);
1728 }
1729
1730 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731 _m_psadbw (__m64 __A, __m64 __B)
1732 {
1733 return _mm_sad_pu8 (__A, __B);
1734 }
1735
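/* Reference model, not part of the original header, of the reduction in
   _mm_sad_pu8 above: sum the absolute differences of the eight unsigned
   byte pairs and return the sum in the low 16 bits, upper bits cleared.
   The helper name is hypothetical.

     static inline unsigned long long
     __sad_u8_reference (const unsigned char *__x, const unsigned char *__y)
     {
       unsigned int __sum = 0;
       int __i;
       for (__i = 0; __i < 8; __i++)
         __sum += (__x[__i] > __y[__i]) ? (unsigned int) (__x[__i] - __y[__i])
                                        : (unsigned int) (__y[__i] - __x[__i]);
       return (unsigned long long) (__sum & 0xffff);
     }
*/
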
1736 /* Stores the data in A to the address P without polluting the caches. */
1737 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_pi (__m64 *__P, __m64 __A)
1739 {
1740 /* Use the data cache block touch for store transient. */
1741 __asm__ (
1742 " dcbtstt 0,%0"
1743 :
1744 : "b" (__P)
1745 : "memory"
1746 );
1747 *__P = __A;
1748 }
1749
1750 /* Likewise. The address must be 16-byte aligned. */
1751 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1752 _mm_stream_ps (float *__P, __m128 __A)
1753 {
1754 /* Use the data cache block touch for store transient. */
1755 __asm__ (
1756 " dcbtstt 0,%0"
1757 :
1758 : "b" (__P)
1759 : "memory"
1760 );
1761 _mm_store_ps (__P, __A);
1762 }
1763
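/* Usage sketch, illustrative only; names are hypothetical: stream a buffer
   of floats four elements at a time.  Both pointers are assumed 16-byte
   aligned, as required by _mm_load_ps and by _mm_stream_ps above.

     static inline void
     __stream_copy (float *__dst, const float *__src, unsigned long __nquads)
     {
       unsigned long __i;
       for (__i = 0; __i < __nquads; __i++)
         _mm_stream_ps (__dst + 4 * __i, _mm_load_ps (__src + 4 * __i));
     }
*/
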
1764 /* Guarantees that every preceding store is globally visible before
1765 any subsequent store. */
1766 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1767 _mm_sfence (void)
1768 {
1769 /* Generate a lightweight sync (lwsync). */
1770 __atomic_thread_fence (__ATOMIC_RELEASE);
1771 }
1772
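/* Usage sketch, illustrative only; names are hypothetical: publish data
   before setting a ready flag, so a consumer that observes the flag also
   observes the data written before the fence.

     static inline void
     __publish (int *__data, volatile int *__flag, int __value)
     {
       *__data = __value;
       _mm_sfence ();   // all preceding stores become visible first
       *__flag = 1;
     }
*/
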
1773 /* The execution of the next instruction is delayed by an
1774 implementation-specific amount of time.  The instruction does not modify
1775 the architectural state.  On X86 this is placed after the pop_options
1776 pragma because it does not require SSE support in the processor--the
1777 encoding is a nop on processors that do not support it.  */
1778 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1779 _mm_pause (void)
1780 {
1781 /* There is no exact match with this construct, but the following is
1782 close to the desired effect. */
1783 #if _ARCH_PWR8
1784 /* On POWER8 and later processors we can depend on Program Priority
1785 (PRI) and the associated "very low" PRI setting.  Since we don't know
1786 what PRI this thread is running at, we: 1) save the current PRI
1787 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1788 via the special or 31,31,31 encoding, and 3) issue an "isync" to
1789 ensure the PRI change takes effect before we execute any more
1790 instructions.
1791 Now we can execute a lwsync (release barrier) while we execute
1792 this thread at "very low" PRI.  Finally we restore the original
1793 PRI and continue execution.  */
1794 unsigned long __PPR;
1795
1796 __asm__ volatile (
1797 " mfppr %0;"
1798 " or 31,31,31;"
1799 " isync;"
1800 " lwsync;"
1801 " isync;"
1802 " mtppr %0;"
1803 : "=r" (__PPR)
1804 :
1805 : "memory"
1806 );
1807 #else
1808 /* For older processors, where we may not even have Program Priority
1809 controls, we can only depend on a heavy weight sync.  */
1810 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1811 #endif
1812 }
1813
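/* Usage sketch, illustrative only; names are hypothetical: a polite
   spin-wait loop that yields execution resources while polling a flag.

     static inline void
     __spin_until_set (volatile int *__flag)
     {
       while (*__flag == 0)
         _mm_pause ();
     }
*/
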
1814 /* Transpose the 4x4 matrix composed of row[0-3]. */
1815 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1816 do { \
1817 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1818 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1819 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1820 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1821 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1822 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1823 (__vector long long)__t1); \
1824 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1825 (__vector long long)__t1); \
1826 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1827 (__vector long long)__t3); \
1828 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1829 (__vector long long)__t3); \
1830 } while (0)
1831
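/* Usage sketch, illustrative only; the function name is hypothetical:
   transpose a 4x4 row-major matrix in place.  The 16-byte alignment
   required by _mm_load_ps and _mm_store_ps is assumed.

     static inline void
     __transpose_4x4 (float *__m)
     {
       __m128 __row0 = _mm_load_ps (__m + 0);
       __m128 __row1 = _mm_load_ps (__m + 4);
       __m128 __row2 = _mm_load_ps (__m + 8);
       __m128 __row3 = _mm_load_ps (__m + 12);
       _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
       _mm_store_ps (__m + 0, __row0);
       _mm_store_ps (__m + 4, __row1);
       _mm_store_ps (__m + 8, __row2);
       _mm_store_ps (__m + 12, __row3);
     }
*/
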
1832 /* For backward source compatibility. */
1833 //# include <emmintrin.h>
1834
1835 #endif /* _XMMINTRIN_H_INCLUDED */