1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. There are also important
40 differences in the data format and placement of float scalars in the
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost
42 64 bits of the low 32 VSRs) are kept in double format, while X86_64
43 SSE uses the rightmost 32 bits of the XMM register. These differences
44 require extra steps on POWER to match the SSE scalar float semantics.
45
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
49
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
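/* Illustrative sketch of the guidance above (not part of this header;
   the helper names are hypothetical).  Instead of _mm_add_ss on element
   0 and direct MXCSR updates, portable C and <fenv.h> can be used:

     #include <fenv.h>

     static inline float
     add_lowest (float a, float b)
     {
       return a + b;
     }

     static inline int
     set_round_toward_zero (void)
     {
       return fesetround (FE_TOWARDZERO);
     }
*/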
56
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
59
60 /* Define a four-value permute mask.  */
61 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
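/* Worked example of the encoding (illustrative): _MM_SHUFFLE (3, 2, 1, 0)
   evaluates to (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity
   selector.  When passed to _mm_shuffle_ps (defined below), the two low
   2-bit fields select elements from the first operand and the two high
   fields select elements from the second operand.  */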
62
63 #include <altivec.h>
64
65 /* Avoid collisions between altivec.h and strict adherence to C++ and
66 C11 standards. This should eventually be done inside altivec.h itself,
67 but only after testing a full distro build. */
68 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
69 (defined(__STDC_VERSION__) && \
70 __STDC_VERSION__ >= 201112L))
71 #undef vector
72 #undef pixel
73 #undef bool
74 #endif
75
76 #include <assert.h>
77
78 /* We need type definitions from the MMX header file. */
79 #include <mmintrin.h>
80
81 /* Get _mm_malloc () and _mm_free (). */
82 #include <mm_malloc.h>
83
84 /* The Intel API is flexible enough that we must allow aliasing with other
85 vector types, and their scalar components. */
86 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
87
88 /* Internal data types for implementing the intrinsics. */
89 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
90
91 /* Create an undefined vector. */
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_undefined_ps (void)
94 {
95 __m128 __Y = __Y;
96 return __Y;
97 }
98
99 /* Create a vector of zeros. */
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _mm_setzero_ps (void)
102 {
103 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
104 }
105
106 /* Load four SPFP values from P. The address must be 16-byte aligned. */
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108 _mm_load_ps (float const *__P)
109 {
110 assert(((unsigned long)__P & 0xfUL) == 0UL);
111 return ((__m128)vec_ld(0, (__v4sf*)__P));
112 }
113
114 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116 _mm_loadu_ps (float const *__P)
117 {
118 return (vec_vsx_ld(0, __P));
119 }
120
121 /* Load four SPFP values in reverse order. The address must be aligned. */
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_loadr_ps (float const *__P)
124 {
125 __v4sf __tmp;
126 __m128 result;
127 static const __vector unsigned char permute_vector =
128 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
129 0x17, 0x10, 0x11, 0x12, 0x13 };
130
131 __tmp = vec_ld (0, (__v4sf *) __P);
132 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
133 return result;
134 }
135
136 /* Create a vector with all four elements equal to F. */
137 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
138 _mm_set1_ps (float __F)
139 {
140 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
141 }
142
143 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
144 _mm_set_ps1 (float __F)
145 {
146 return _mm_set1_ps (__F);
147 }
148
149 /* Create the vector [Z Y X W]. */
150 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
152 {
153 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
154 }
155
156 /* Create the vector [W X Y Z]. */
157 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
159 {
160 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
161 }
162
163 /* Store four SPFP values. The address must be 16-byte aligned. */
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 _mm_store_ps (float *__P, __m128 __A)
166 {
167 assert(((unsigned long)__P & 0xfUL) == 0UL);
168 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
169 }
170
171 /* Store four SPFP values. The address need not be 16-byte aligned. */
172 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_storeu_ps (float *__P, __m128 __A)
174 {
175 *(__m128 *)__P = __A;
176 }
177
178 /* Store four SPFP values in reverse order. The address must be aligned. */
179 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
180 _mm_storer_ps (float *__P, __m128 __A)
181 {
182 __v4sf __tmp;
183 static const __vector unsigned char permute_vector =
184 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
185 0x17, 0x10, 0x11, 0x12, 0x13 };
186
187 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
188
189 _mm_store_ps (__P, __tmp);
190 }
191
192 /* Store the lower SPFP value across four words. */
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_store1_ps (float *__P, __m128 __A)
195 {
196 __v4sf __va = vec_splat((__v4sf)__A, 0);
197 _mm_store_ps (__P, __va);
198 }
199
200 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_store_ps1 (float *__P, __m128 __A)
202 {
203 _mm_store1_ps (__P, __A);
204 }
205
206 /* Create a vector with element 0 as F and the rest zero. */
207 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_set_ss (float __F)
209 {
210 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
211 }
212
213 /* Sets the low SPFP value of A from the low value of B. */
214 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 _mm_move_ss (__m128 __A, __m128 __B)
216 {
217 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
218
219 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
220 }
221
222 /* Create a vector with element 0 as *P and the rest zero. */
223 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224 _mm_load_ss (float const *__P)
225 {
226 return _mm_set_ss (*__P);
227 }
228
229 /* Stores the lower SPFP value. */
230 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
231 _mm_store_ss (float *__P, __m128 __A)
232 {
233 *__P = ((__v4sf)__A)[0];
234 }
235
236 /* Perform the respective operation on the lower SPFP (single-precision
237 floating-point) values of A and B; the upper three SPFP values are
238 passed through from A. */
239
240 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_add_ss (__m128 __A, __m128 __B)
242 {
243 #ifdef _ARCH_PWR7
244 __m128 a, b, c;
245 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
246 /* PowerISA VSX does not allow partial (for just the lower float)
247 results. So to ensure we don't generate spurious exceptions
248 (from the upper float values) we splat the lower float
249 before we do the operation. */
250 a = vec_splat (__A, 0);
251 b = vec_splat (__B, 0);
252 c = a + b;
253 /* Then we merge the lower float result with the original upper
254 float elements from __A. */
255 return (vec_sel (__A, c, mask));
256 #else
257 __A[0] = __A[0] + __B[0];
258 return (__A);
259 #endif
260 }
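/* Illustrative example of the scalar-merge semantics (not part of the
   header): given
     __m128 x = { 1.0f, 2.0f, 3.0f, 4.0f };
     __m128 y = { 10.0f, 20.0f, 30.0f, 40.0f };
   _mm_add_ss (x, y) yields { 11.0f, 2.0f, 3.0f, 4.0f }; only element 0
   is computed and elements 1-3 are passed through from the first
   operand.  The splat-then-select sequence above preserves this while
   avoiding spurious exceptions from the upper elements.  */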
261
262 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm_sub_ss (__m128 __A, __m128 __B)
264 {
265 #ifdef _ARCH_PWR7
266 __m128 a, b, c;
267 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
268 /* PowerISA VSX does not allow partial (for just the lower float)
269 results. So to ensure we don't generate spurious exceptions
270 (from the upper float values) we splat the lower float
271 before we do the operation. */
272 a = vec_splat (__A, 0);
273 b = vec_splat (__B, 0);
274 c = a - b;
275 /* Then we merge the lower float result with the original upper
276 float elements from __A. */
277 return (vec_sel (__A, c, mask));
278 #else
279 __A[0] = __A[0] - __B[0];
280 return (__A);
281 #endif
282 }
283
284 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285 _mm_mul_ss (__m128 __A, __m128 __B)
286 {
287 #ifdef _ARCH_PWR7
288 __m128 a, b, c;
289 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
290 /* PowerISA VSX does not allow partial (for just the lower float)
291 results. So to ensure we don't generate spurious exceptions
292 (from the upper float values) we splat the lower float
293 before we do the operation. */
294 a = vec_splat (__A, 0);
295 b = vec_splat (__B, 0);
296 c = a * b;
297 /* Then we merge the lower float result with the original upper
298 float elements from __A. */
299 return (vec_sel (__A, c, mask));
300 #else
301 __A[0] = __A[0] * __B[0];
302 return (__A);
303 #endif
304 }
305
306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307 _mm_div_ss (__m128 __A, __m128 __B)
308 {
309 #ifdef _ARCH_PWR7
310 __m128 a, b, c;
311 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
312 /* PowerISA VSX does not allow partial (for just the lower float)
313 results. So to ensure we don't generate spurious exceptions
314 (from the upper float values) we splat the lower float
315 before we do the operation. */
316 a = vec_splat (__A, 0);
317 b = vec_splat (__B, 0);
318 c = a / b;
319 /* Then we merge the lower float result with the original upper
320 float elements from __A. */
321 return (vec_sel (__A, c, mask));
322 #else
323 __A[0] = __A[0] / __B[0];
324 return (__A);
325 #endif
326 }
327
328 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_sqrt_ss (__m128 __A)
330 {
331 __m128 a, c;
332 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
333 /* PowerISA VSX does not allow partial (for just the lower float)
334 * results. So to ensure we don't generate spurious exceptions
335 * (from the upper float values) we splat the lower float
336 * before we do the operation. */
337 a = vec_splat (__A, 0);
338 c = vec_sqrt (a);
339 /* Then we merge the lower float result with the original upper
340 * float elements from __A. */
341 return (vec_sel (__A, c, mask));
342 }
343
344 /* Perform the respective operation on the four SPFP values in A and B. */
345 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
346 _mm_add_ps (__m128 __A, __m128 __B)
347 {
348 return (__m128) ((__v4sf)__A + (__v4sf)__B);
349 }
350
351 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
352 _mm_sub_ps (__m128 __A, __m128 __B)
353 {
354 return (__m128) ((__v4sf)__A - (__v4sf)__B);
355 }
356
357 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
358 _mm_mul_ps (__m128 __A, __m128 __B)
359 {
360 return (__m128) ((__v4sf)__A * (__v4sf)__B);
361 }
362
363 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
364 _mm_div_ps (__m128 __A, __m128 __B)
365 {
366 return (__m128) ((__v4sf)__A / (__v4sf)__B);
367 }
368
369 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
370 _mm_sqrt_ps (__m128 __A)
371 {
372 return (vec_sqrt ((__v4sf)__A));
373 }
374
375 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
376 _mm_rcp_ps (__m128 __A)
377 {
378 return (vec_re ((__v4sf)__A));
379 }
380
381 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382 _mm_rsqrt_ps (__m128 __A)
383 {
384 return (vec_rsqrte (__A));
385 }
386
387 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_rcp_ss (__m128 __A)
389 {
390 __m128 a, c;
391 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
392 /* PowerISA VSX does not allow partial (for just the lower float)
393 * results. So to ensure we don't generate spurious exceptions
394 * (from the upper float values) we splat the lower float
395 * before we do the operation. */
396 a = vec_splat (__A, 0);
397 c = _mm_rcp_ps (a);
398 /* Then we merge the lower float result with the original upper
399 * float elements from __A. */
400 return (vec_sel (__A, c, mask));
401 }
402
403 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_rsqrt_ss (__m128 __A)
405 {
406 __m128 a, c;
407 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
408 /* PowerISA VSX does not allow partial (for just the lower float)
409 * results. So to ensure we don't generate spurious exceptions
410 * (from the upper float values) we splat the lower float
411 * before we do the operation. */
412 a = vec_splat (__A, 0);
413 c = vec_rsqrte (a);
414 /* Then we merge the lower float result with the original upper
415 * float elements from __A. */
416 return (vec_sel (__A, c, mask));
417 }
418
419 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420 _mm_min_ss (__m128 __A, __m128 __B)
421 {
422 __v4sf a, b, c;
423 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
424 /* PowerISA VSX does not allow partial (for just the lower float)
425 * results. So to ensure we don't generate spurious exceptions
426 * (from the upper float values) we splat the lower float
427 * before we do the operation. */
428 a = vec_splat ((__v4sf)__A, 0);
429 b = vec_splat ((__v4sf)__B, 0);
430 c = vec_min (a, b);
431 /* Then we merge the lower float result with the original upper
432 * float elements from __A. */
433 return (vec_sel ((__v4sf)__A, c, mask));
434 }
435
436 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437 _mm_max_ss (__m128 __A, __m128 __B)
438 {
439 __v4sf a, b, c;
440 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
441 /* PowerISA VSX does not allow partial (for just the lower float)
442 * results. So to ensure we don't generate spurious exceptions
443 * (from the upper float values) we splat the lower float
444 * before we do the operation. */
445 a = vec_splat (__A, 0);
446 b = vec_splat (__B, 0);
447 c = vec_max (a, b);
448 /* Then we merge the lower float result with the original upper
449 * float elements from __A. */
450 return (vec_sel ((__v4sf)__A, c, mask));
451 }
452
453 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454 _mm_min_ps (__m128 __A, __m128 __B)
455 {
456 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A);
457 return vec_sel (__B, __A, m);
458 }
459
460 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
461 _mm_max_ps (__m128 __A, __m128 __B)
462 {
463 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B);
464 return vec_sel (__B, __A, m);
465 }
466
467 /* Perform logical bit-wise operations on 128-bit values. */
468 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469 _mm_and_ps (__m128 __A, __m128 __B)
470 {
471 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
472 // return __builtin_ia32_andps (__A, __B);
473 }
474
475 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
476 _mm_andnot_ps (__m128 __A, __m128 __B)
477 {
478 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
479 }
480
481 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482 _mm_or_ps (__m128 __A, __m128 __B)
483 {
484 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
485 }
486
487 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_xor_ps (__m128 __A, __m128 __B)
489 {
490 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
491 }
492
493 /* Perform a comparison on the four SPFP values of A and B. For each
494 element, if the comparison is true, place a mask of all ones in the
495 result, otherwise a mask of zeros. */
496 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497 _mm_cmpeq_ps (__m128 __A, __m128 __B)
498 {
499 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
500 }
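/* Illustrative use of these masks (a sketch, not part of the header):
   because each true element is all ones and each false element is all
   zeros, the result can drive a branchless select, e.g.

     __m128 mask = _mm_cmpgt_ps (a, b);
     __m128 vmax = _mm_or_ps (_mm_and_ps (mask, a),
                              _mm_andnot_ps (mask, b));

   picks a[i] where a[i] > b[i] and b[i] elsewhere; a and b stand for
   hypothetical __m128 operands.  */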
501
502 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503 _mm_cmplt_ps (__m128 __A, __m128 __B)
504 {
505 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
506 }
507
508 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
509 _mm_cmple_ps (__m128 __A, __m128 __B)
510 {
511 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
512 }
513
514 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515 _mm_cmpgt_ps (__m128 __A, __m128 __B)
516 {
517 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
518 }
519
520 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521 _mm_cmpge_ps (__m128 __A, __m128 __B)
522 {
523 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
524 }
525
526 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpneq_ps (__m128 __A, __m128 __B)
528 {
529 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
530 return ((__m128)vec_nor (temp, temp));
531 }
532
533 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
534 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
535 {
536 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
537 }
538
539 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpnle_ps (__m128 __A, __m128 __B)
541 {
542 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
543 }
544
545 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
546 _mm_cmpngt_ps (__m128 __A, __m128 __B)
547 {
548 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
549 }
550
551 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
552 _mm_cmpnge_ps (__m128 __A, __m128 __B)
553 {
554 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
555 }
556
557 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cmpord_ps (__m128 __A, __m128 __B)
559 {
560 __vector unsigned int a, b;
561 __vector unsigned int c, d;
562 static const __vector unsigned int float_exp_mask =
563 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
564
565 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
566 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
567 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
568 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
569 return ((__m128 ) vec_and (c, d));
570 }
571
572 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573 _mm_cmpunord_ps (__m128 __A, __m128 __B)
574 {
575 __vector unsigned int a, b;
576 __vector unsigned int c, d;
577 static const __vector unsigned int float_exp_mask =
578 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
579
580 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
581 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
582 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
583 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
584 return ((__m128 ) vec_or (c, d));
585 }
586
587 /* Perform a comparison on the lower SPFP values of A and B. If the
588 comparison is true, place a mask of all ones in the result, otherwise a
589 mask of zeros. The upper three SPFP values are passed through from A. */
590 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_cmpeq_ss (__m128 __A, __m128 __B)
592 {
593 static const __vector unsigned int mask =
594 { 0xffffffff, 0, 0, 0 };
595 __v4sf a, b, c;
596 /* PowerISA VMX does not allow partial (for just element 0)
597 * results. So to ensure we don't generate spurious exceptions
598 * (from the upper elements) we splat the lower float
599 * before we do the operation. */
600 a = vec_splat ((__v4sf) __A, 0);
601 b = vec_splat ((__v4sf) __B, 0);
602 c = (__v4sf) vec_cmpeq(a, b);
603 /* Then we merge the lower float result with the original upper
604 * float elements from __A. */
605 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
606 }
607
608 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609 _mm_cmplt_ss (__m128 __A, __m128 __B)
610 {
611 static const __vector unsigned int mask =
612 { 0xffffffff, 0, 0, 0 };
613 __v4sf a, b, c;
614 /* PowerISA VMX does not allow partial (for just element 0)
615 * results. So to ensure we don't generate spurious exceptions
616 * (from the upper elements) we splat the lower float
617 * before we do the operation. */
618 a = vec_splat ((__v4sf) __A, 0);
619 b = vec_splat ((__v4sf) __B, 0);
620 c = (__v4sf) vec_cmplt(a, b);
621 /* Then we merge the lower float result with the original upper
622 * float elements from __A. */
623 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
624 }
625
626 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627 _mm_cmple_ss (__m128 __A, __m128 __B)
628 {
629 static const __vector unsigned int mask =
630 { 0xffffffff, 0, 0, 0 };
631 __v4sf a, b, c;
632 /* PowerISA VMX does not allow partial (for just element 0)
633 * results. So to ensure we don't generate spurious exceptions
634 * (from the upper elements) we splat the lower float
635 * before we do the operation. */
636 a = vec_splat ((__v4sf) __A, 0);
637 b = vec_splat ((__v4sf) __B, 0);
638 c = (__v4sf) vec_cmple(a, b);
639 /* Then we merge the lower float result with the original upper
640 * float elements from __A. */
641 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
642 }
643
644 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645 _mm_cmpgt_ss (__m128 __A, __m128 __B)
646 {
647 static const __vector unsigned int mask =
648 { 0xffffffff, 0, 0, 0 };
649 __v4sf a, b, c;
650 /* PowerISA VMX does not allow partial (for just element 0)
651 * results. So to ensure we don't generate spurious exceptions
652 * (from the upper elements) we splat the lower float
653 * before we do the operation. */
654 a = vec_splat ((__v4sf) __A, 0);
655 b = vec_splat ((__v4sf) __B, 0);
656 c = (__v4sf) vec_cmpgt(a, b);
657 /* Then we merge the lower float result with the original upper
658 * float elements from __A. */
659 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
660 }
661
662 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
663 _mm_cmpge_ss (__m128 __A, __m128 __B)
664 {
665 static const __vector unsigned int mask =
666 { 0xffffffff, 0, 0, 0 };
667 __v4sf a, b, c;
668 /* PowerISA VMX does not allow partial (for just element 0)
669 * results. So to ensure we don't generate spurious exceptions
670 * (from the upper elements) we splat the lower float
671 * before we do the operation. */
672 a = vec_splat ((__v4sf) __A, 0);
673 b = vec_splat ((__v4sf) __B, 0);
674 c = (__v4sf) vec_cmpge(a, b);
675 /* Then we merge the lower float result with the original upper
676 * float elements from __A. */
677 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
678 }
679
680 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _mm_cmpneq_ss (__m128 __A, __m128 __B)
682 {
683 static const __vector unsigned int mask =
684 { 0xffffffff, 0, 0, 0 };
685 __v4sf a, b, c;
686 /* PowerISA VMX does not allow partial (for just element 0)
687 * results. So to ensure we don't generate spurious exceptions
688 * (from the upper elements) we splat the lower float
689 * before we do the operation. */
690 a = vec_splat ((__v4sf) __A, 0);
691 b = vec_splat ((__v4sf) __B, 0);
692 c = (__v4sf) vec_cmpeq(a, b);
693 c = vec_nor (c, c);
694 /* Then we merge the lower float result with the original upper
695 * float elements from __A. */
696 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
697 }
698
699 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
700 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
701 {
702 static const __vector unsigned int mask =
703 { 0xffffffff, 0, 0, 0 };
704 __v4sf a, b, c;
705 /* PowerISA VMX does not allow partial (for just element 0)
706 * results. So to ensure we don't generate spurious exceptions
707 * (from the upper elements) we splat the lower float
708 * before we do the operation. */
709 a = vec_splat ((__v4sf) __A, 0);
710 b = vec_splat ((__v4sf) __B, 0);
711 c = (__v4sf) vec_cmpge(a, b);
712 /* Then we merge the lower float result with the original upper
713 * float elements from __A. */
714 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
715 }
716
717 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
718 _mm_cmpnle_ss (__m128 __A, __m128 __B)
719 {
720 static const __vector unsigned int mask =
721 { 0xffffffff, 0, 0, 0 };
722 __v4sf a, b, c;
723 /* PowerISA VMX does not allow partial (for just element 0)
724 * results. So to ensure we don't generate spurious exceptions
725 * (from the upper elements) we splat the lower float
726 * before we do the operation. */
727 a = vec_splat ((__v4sf) __A, 0);
728 b = vec_splat ((__v4sf) __B, 0);
729 c = (__v4sf) vec_cmpgt(a, b);
730 /* Then we merge the lower float result with the original upper
731 * float elements from __A. */
732 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
733 }
734
735 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736 _mm_cmpngt_ss (__m128 __A, __m128 __B)
737 {
738 static const __vector unsigned int mask =
739 { 0xffffffff, 0, 0, 0 };
740 __v4sf a, b, c;
741 /* PowerISA VMX does not allow partial (for just element 0)
742 * results. So to ensure we don't generate spurious exceptions
743 * (from the upper elements) we splat the lower float
744 * before we do the operation. */
745 a = vec_splat ((__v4sf) __A, 0);
746 b = vec_splat ((__v4sf) __B, 0);
747 c = (__v4sf) vec_cmple(a, b);
748 /* Then we merge the lower float result with the original upper
749 * float elements from __A. */
750 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
751 }
752
753 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
754 _mm_cmpnge_ss (__m128 __A, __m128 __B)
755 {
756 static const __vector unsigned int mask =
757 { 0xffffffff, 0, 0, 0 };
758 __v4sf a, b, c;
759 /* PowerISA VMX does not allow partial (for just element 0)
760 * results. So to ensure we don't generate spurious exceptions
761 * (from the upper elements) we splat the lower float
762 * before we do the operation. */
763 a = vec_splat ((__v4sf) __A, 0);
764 b = vec_splat ((__v4sf) __B, 0);
765 c = (__v4sf) vec_cmplt(a, b);
766 /* Then we merge the lower float result with the original upper
767 * float elements from __A. */
768 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
769 }
770
771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _mm_cmpord_ss (__m128 __A, __m128 __B)
773 {
774 __vector unsigned int a, b;
775 __vector unsigned int c, d;
776 static const __vector unsigned int float_exp_mask =
777 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
778 static const __vector unsigned int mask =
779 { 0xffffffff, 0, 0, 0 };
780
781 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
782 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
783 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
784 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
785 c = vec_and (c, d);
786 /* Then we merge the lower float result with the original upper
787 * float elements from __A. */
788 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
789 }
790
791 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
792 _mm_cmpunord_ss (__m128 __A, __m128 __B)
793 {
794 __vector unsigned int a, b;
795 __vector unsigned int c, d;
796 static const __vector unsigned int float_exp_mask =
797 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
798 static const __vector unsigned int mask =
799 { 0xffffffff, 0, 0, 0 };
800
801 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
802 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
803 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
804 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
805 c = vec_or (c, d);
806 /* Then we merge the lower float result with the original upper
807 * float elements from __A. */
808 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
809 }
810
811 /* Compare the lower SPFP values of A and B and return 1 if true
812 and 0 if false. */
813 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
814 _mm_comieq_ss (__m128 __A, __m128 __B)
815 {
816 return (__A[0] == __B[0]);
817 }
818
819 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
820 _mm_comilt_ss (__m128 __A, __m128 __B)
821 {
822 return (__A[0] < __B[0]);
823 }
824
825 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
826 _mm_comile_ss (__m128 __A, __m128 __B)
827 {
828 return (__A[0] <= __B[0]);
829 }
830
831 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
832 _mm_comigt_ss (__m128 __A, __m128 __B)
833 {
834 return (__A[0] > __B[0]);
835 }
836
837 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_comige_ss (__m128 __A, __m128 __B)
839 {
840 return (__A[0] >= __B[0]);
841 }
842
843 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 _mm_comineq_ss (__m128 __A, __m128 __B)
845 {
846 return (__A[0] != __B[0]);
847 }
848
849 /* FIXME
850 * The _mm_ucomi??_ss implementations below are exactly the same as
851 * the _mm_comi??_ss ones because GCC for PowerPC only generates
852 * unordered compares (scalar and vector).
853 * Technically _mm_comieq_ss et al. should use the ordered compare
854 * and signal for QNaNs.
855 * The _mm_ucomieq_ss et al. should be OK as is.
856 */
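/* Illustrative note on the FIXME above (a sketch of the semantic gap,
   not a statement of additional behavior):

     float qnan = __builtin_nanf ("");
     __m128 x = _mm_set_ss (qnan);
     int r1 = _mm_comieq_ss (x, x);
     int r2 = _mm_ucomieq_ss (x, x);

   Both r1 and r2 are 0 here; a fully Intel-compatible _mm_comieq_ss
   would additionally raise the invalid-operation exception for the
   quiet NaN, which the plain C comparisons above do not do.  */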
857 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
858 _mm_ucomieq_ss (__m128 __A, __m128 __B)
859 {
860 return (__A[0] == __B[0]);
861 }
862
863 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
864 _mm_ucomilt_ss (__m128 __A, __m128 __B)
865 {
866 return (__A[0] < __B[0]);
867 }
868
869 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
870 _mm_ucomile_ss (__m128 __A, __m128 __B)
871 {
872 return (__A[0] <= __B[0]);
873 }
874
875 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_ucomigt_ss (__m128 __A, __m128 __B)
877 {
878 return (__A[0] > __B[0]);
879 }
880
881 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
882 _mm_ucomige_ss (__m128 __A, __m128 __B)
883 {
884 return (__A[0] >= __B[0]);
885 }
886
887 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
888 _mm_ucomineq_ss (__m128 __A, __m128 __B)
889 {
890 return (__A[0] != __B[0]);
891 }
892
893 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _mm_cvtss_f32 (__m128 __A)
895 {
896 return ((__v4sf)__A)[0];
897 }
898
899 /* Convert the lower SPFP value to a 32-bit integer according to the current
900 rounding mode. */
901 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
902 _mm_cvtss_si32 (__m128 __A)
903 {
904 __m64 res = 0;
905 #ifdef _ARCH_PWR8
906 __m128 vtmp;
907 __asm__(
908 "xxsldwi %x1,%x2,%x2,3;\n"
909 "xscvspdp %x1,%x1;\n"
910 "fctiw %1,%1;\n"
911 "mfvsrd %0,%x1;\n"
912 : "=r" (res),
913 "=&wi" (vtmp)
914 : "wa" (__A)
915 : );
916 #else
917 res = __builtin_rint(__A[0]);
918 #endif
919 return (res);
920 }
921
922 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923 _mm_cvt_ss2si (__m128 __A)
924 {
925 return _mm_cvtss_si32 (__A);
926 }
927
928 /* Convert the lower SPFP value to a 64-bit integer according to the
929 current rounding mode. */
930
931 /* Intel intrinsic. */
932 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
933 _mm_cvtss_si64 (__m128 __A)
934 {
935 __m64 res = 0;
936 #ifdef _ARCH_PWR8
937 __m128 vtmp;
938 __asm__(
939 "xxsldwi %x1,%x2,%x2,3;\n"
940 "xscvspdp %x1,%x1;\n"
941 "fctid %1,%1;\n"
942 "mfvsrd %0,%x1;\n"
943 : "=r" (res),
944 "=&wi" (vtmp)
945 : "wa" (__A)
946 : );
947 #else
948 res = __builtin_llrint(__A[0]);
949 #endif
950 return (res);
951 }
952
953 /* Microsoft intrinsic. */
954 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
955 _mm_cvtss_si64x (__m128 __A)
956 {
957 return _mm_cvtss_si64 ((__v4sf) __A);
958 }
959
960 /* Constants for use with _mm_prefetch. */
961 enum _mm_hint
962 {
963 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
964 _MM_HINT_ET0 = 7,
965 _MM_HINT_ET1 = 6,
966 _MM_HINT_T0 = 3,
967 _MM_HINT_T1 = 2,
968 _MM_HINT_T2 = 1,
969 _MM_HINT_NTA = 0
970 };
971
972 /* Loads one cache line from address P to a location "closer" to the
973 processor. The selector I specifies the type of prefetch operation. */
974 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 _mm_prefetch (const void *__P, enum _mm_hint __I)
976 {
977 /* Currently PowerPC ignores the hint parameters. */
978 __builtin_prefetch (__P);
979 }
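/* Illustrative usage (the buffer and index are hypothetical): since the
   hint is currently ignored on PowerPC, any _MM_HINT_* value simply
   requests a prefetch of the cache line containing the address, e.g.

     _mm_prefetch (&buf[i + 64], _MM_HINT_T0);
*/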
980
981 /* Convert the two lower SPFP values to 32-bit integers according to the
982 current rounding mode. Return the integers in packed form. */
983 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984 _mm_cvtps_pi32 (__m128 __A)
985 {
987 __v4sf temp, rounded;
988 __vector __m64 result;
989
990 /* Splat two lower SPFP values to both halves. */
991 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
992 rounded = vec_rint(temp);
993 result = (__vector __m64) vec_cts (rounded, 0);
994
995 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
996 }
997
998 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999 _mm_cvt_ps2pi (__m128 __A)
1000 {
1001 return _mm_cvtps_pi32 (__A);
1002 }
1003
1004 /* Truncate the lower SPFP value to a 32-bit integer. */
1005 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1006 _mm_cvttss_si32 (__m128 __A)
1007 {
1008 /* Extract the lower float element. */
1009 float temp = __A[0];
1010 /* truncate to 32-bit integer and return. */
1011 return temp;
1012 }
1013
1014 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvtt_ss2si (__m128 __A)
1016 {
1017 return _mm_cvttss_si32 (__A);
1018 }
1019
1020 /* Intel intrinsic. */
1021 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1022 _mm_cvttss_si64 (__m128 __A)
1023 {
1024 /* Extract the lower float element. */
1025 float temp = __A[0];
1026 /* Truncate to 64-bit integer and return. */
1027 return temp;
1028 }
1029
1030 /* Microsoft intrinsic. */
1031 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032 _mm_cvttss_si64x (__m128 __A)
1033 {
1034 /* Extract the lower float element. */
1035 float temp = __A[0];
1036 /* Truncate to 64-bit integer and return. */
1037 return temp;
1038 }
1039
1040 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1041 integers in packed form. */
1042 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1043 _mm_cvttps_pi32 (__m128 __A)
1044 {
1045 __v4sf temp;
1046 __vector __m64 result;
1047
1048 /* Splat two lower SPFP values to both halves. */
1049 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1050 result = (__vector __m64) vec_cts (temp, 0);
1051
1052 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1053 }
1054
1055 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvtt_ps2pi (__m128 __A)
1057 {
1058 return _mm_cvttps_pi32 (__A);
1059 }
1060
1061 /* Convert B to a SPFP value and insert it as element zero in A. */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi32_ss (__m128 __A, int __B)
1064 {
1065 float temp = __B;
1066 __A[0] = temp;
1067
1068 return __A;
1069 }
1070
1071 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvt_si2ss (__m128 __A, int __B)
1073 {
1074 return _mm_cvtsi32_ss (__A, __B);
1075 }
1076
1077 /* Convert B to a SPFP value and insert it as element zero in A. */
1078 /* Intel intrinsic. */
1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvtsi64_ss (__m128 __A, long long __B)
1081 {
1082 float temp = __B;
1083 __A[0] = temp;
1084
1085 return __A;
1086 }
1087
1088 /* Microsoft intrinsic. */
1089 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1091 {
1092 return _mm_cvtsi64_ss (__A, __B);
1093 }
1094
1095 /* Convert the two 32-bit values in B to SPFP form and insert them
1096 as the two lower elements in A. */
1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1099 {
1100 __vector signed int vm1;
1101 __vector float vf1;
1102
1103 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1104 vf1 = (__vector float) vec_ctf (vm1, 0);
1105
1106 return ((__m128) (__vector __m64)
1107 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1108 }
1109
1110 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1111 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1112 {
1113 return _mm_cvtpi32_ps (__A, __B);
1114 }
1115
1116 /* Convert the four signed 16-bit values in A to SPFP form. */
1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm_cvtpi16_ps (__m64 __A)
1119 {
1120 __vector signed short vs8;
1121 __vector signed int vi4;
1122 __vector float vf1;
1123
1124 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1125 vi4 = vec_vupklsh (vs8);
1126 vf1 = (__vector float) vec_ctf (vi4, 0);
1127
1128 return (__m128) vf1;
1129 }
1130
1131 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_cvtpu16_ps (__m64 __A)
1134 {
1135 const __vector unsigned short zero =
1136 { 0, 0, 0, 0, 0, 0, 0, 0 };
1137 __vector unsigned short vs8;
1138 __vector unsigned int vi4;
1139 __vector float vf1;
1140
1141 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1142 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1143 vf1 = (__vector float) vec_ctf (vi4, 0);
1144
1145 return (__m128) vf1;
1146 }
1147
1148 /* Convert the low four signed 8-bit values in A to SPFP form. */
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150 _mm_cvtpi8_ps (__m64 __A)
1151 {
1152 __vector signed char vc16;
1153 __vector signed short vs8;
1154 __vector signed int vi4;
1155 __vector float vf1;
1156
1157 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1158 vs8 = vec_vupkhsb (vc16);
1159 vi4 = vec_vupkhsh (vs8);
1160 vf1 = (__vector float) vec_ctf (vi4, 0);
1161
1162 return (__m128) vf1;
1163 }
1164
1165 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1166 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm_cvtpu8_ps (__m64 __A)
1169 {
1170 const __vector unsigned char zero =
1171 { 0, 0, 0, 0, 0, 0, 0, 0 };
1172 __vector unsigned char vc16;
1173 __vector unsigned short vs8;
1174 __vector unsigned int vi4;
1175 __vector float vf1;
1176
1177 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1178 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1179 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1180 (__vector unsigned short) zero);
1181 vf1 = (__vector float) vec_ctf (vi4, 0);
1182
1183 return (__m128) vf1;
1184 }
1185
1186 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1187 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1189 {
1190 __vector signed int vi4;
1191 __vector float vf4;
1192
1193 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1194 vf4 = (__vector float) vec_ctf (vi4, 0);
1195 return (__m128) vf4;
1196 }
1197
1198 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1199 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_cvtps_pi16(__m128 __A)
1201 {
1202 __v4sf rounded;
1203 __vector signed int temp;
1204 __vector __m64 result;
1205
1206 rounded = vec_rint(__A);
1207 temp = vec_cts (rounded, 0);
1208 result = (__vector __m64) vec_pack (temp, temp);
1209
1210 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1211 }
1212
1213 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1214 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _mm_cvtps_pi8(__m128 __A)
1216 {
1217 __v4sf rounded;
1218 __vector signed int tmp_i;
1219 static const __vector signed int zero = {0, 0, 0, 0};
1220 __vector signed short tmp_s;
1221 __vector signed char res_v;
1222 __m64 result;
1223
1224 rounded = vec_rint(__A);
1225 tmp_i = vec_cts (rounded, 0);
1226 tmp_s = vec_pack (tmp_i, zero);
1227 res_v = vec_pack (tmp_s, tmp_s);
1228 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1229
1230 return (result);
1231 }
1232
1233 /* Selects four specific SPFP values from A and B based on MASK. */
1234 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1236 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1237 {
1238 unsigned long element_selector_10 = __mask & 0x03;
1239 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1240 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1241 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1242 static const unsigned int permute_selectors[4] =
1243 {
1244 #ifdef __LITTLE_ENDIAN__
1245 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1246 #elif __BIG_ENDIAN__
1247 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1248 #endif
1249 };
1250 __vector unsigned int t;
1251
1252 #ifdef __LITTLE_ENDIAN__
1253 t[0] = permute_selectors[element_selector_10];
1254 t[1] = permute_selectors[element_selector_32];
1255 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1256 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1257 #elif __BIG_ENDIAN__
1258 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1259 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1260 t[1] = permute_selectors[element_selector_54];
1261 t[0] = permute_selectors[element_selector_76];
1262 #endif
1263 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1264 }
1265
1266 /* Selects and interleaves the upper two SPFP values from A and B. */
1267 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1269 {
1270 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1271 }
1272
1273 /* Selects and interleaves the lower two SPFP values from A and B. */
1274 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1276 {
1277 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1278 }
1279
1280 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1281 the lower two values are passed through from A. */
1282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1284 {
1285 __vector __m64 __a = (__vector __m64)__A;
1286 __vector __m64 __p = vec_splats(*__P);
1287 __a [1] = __p [1];
1288
1289 return (__m128)__a;
1290 }
1291
1292 /* Stores the upper two SPFP values of A into P. */
1293 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1294 _mm_storeh_pi (__m64 *__P, __m128 __A)
1295 {
1296 __vector __m64 __a = (__vector __m64) __A;
1297
1298 *__P = __a[1];
1299 }
1300
1301 /* Moves the upper two values of B into the lower two values of A. */
1302 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm_movehl_ps (__m128 __A, __m128 __B)
1304 {
1305 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1306 }
1307
1308 /* Moves the lower two values of B into the upper two values of A. */
1309 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310 _mm_movelh_ps (__m128 __A, __m128 __B)
1311 {
1312 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1313 }
1314
1315 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1316 the upper two values are passed through from A. */
1317 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1319 {
1320 __vector __m64 __a = (__vector __m64)__A;
1321 __vector __m64 __p = vec_splats(*__P);
1322 __a [0] = __p [0];
1323
1324 return (__m128)__a;
1325 }
1326
1327 /* Stores the lower two SPFP values of A into P. */
1328 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_storel_pi (__m64 *__P, __m128 __A)
1330 {
1331 __vector __m64 __a = (__vector __m64) __A;
1332
1333 *__P = __a[0];
1334 }
1335
1336 #ifdef _ARCH_PWR8
1337 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1338
1339 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1340 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_movemask_ps (__m128 __A)
1342 {
1343 __vector __m64 result;
1344 static const __vector unsigned int perm_mask =
1345 {
1346 #ifdef __LITTLE_ENDIAN__
1347 0x00204060, 0x80808080, 0x80808080, 0x80808080
1348 #elif __BIG_ENDIAN__
1349 0x80808080, 0x80808080, 0x80808080, 0x00204060
1350 #endif
1351 };
1352
1353 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1354 (__vector unsigned char) perm_mask);
1355
1356 #ifdef __LITTLE_ENDIAN__
1357 return result[1];
1358 #elif __BIG_ENDIAN__
1359 return result[0];
1360 #endif
1361 }
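/* Illustrative example (not part of the header): given
     __m128 x = { -1.0f, 2.0f, -3.0f, 4.0f };
   _mm_movemask_ps (x) is expected to return 0x5, since bit i of the
   result is the sign bit of element i and only elements 0 and 2 are
   negative.  */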
1362 #endif /* _ARCH_PWR8 */
1363
1364 /* Create a vector with all four elements equal to *P. */
1365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1366 _mm_load1_ps (float const *__P)
1367 {
1368 return _mm_set1_ps (*__P);
1369 }
1370
1371 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372 _mm_load_ps1 (float const *__P)
1373 {
1374 return _mm_load1_ps (__P);
1375 }
1376
1377 /* Extracts one of the four words of A. The selector N must be immediate. */
1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_extract_pi16 (__m64 const __A, int const __N)
1380 {
1381 const int shiftr = (__N & 3) * 16;
1382
1383 return ((__A >> shiftr) & 0xffff);
1384 }
1385
1386 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _m_pextrw (__m64 const __A, int const __N)
1388 {
1389 return _mm_extract_pi16 (__A, __N);
1390 }
1391
1392 /* Inserts word D into one of four words of A. The selector N must be
1393 immediate. */
1394 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1396 {
1397 const int shiftl = (__N & 3) * 16;
1398 const __m64 shiftD = (const __m64) __D << shiftl;
1399 const __m64 mask = 0xffffUL << shiftl;
1400 __m64 result = (__A & (~mask)) | (shiftD & mask);
1401
1402 return (result);
1403 }
1404
1405 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1407 {
1408 return _mm_insert_pi16 (__A, __D, __N);
1409 }
1410
1411 /* Compute the element-wise maximum of signed 16-bit values. */
1412 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1414 _mm_max_pi16 (__m64 __A, __m64 __B)
1415 {
1416 #if _ARCH_PWR8
1417 __vector signed short a, b, r;
1418 __vector __bool short c;
1419
1420 a = (__vector signed short)vec_splats (__A);
1421 b = (__vector signed short)vec_splats (__B);
1422 c = (__vector __bool short)vec_cmpgt (a, b);
1423 r = vec_sel (b, a, c);
1424 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1425 #else
1426 __m64_union m1, m2, res;
1427
1428 m1.as_m64 = __A;
1429 m2.as_m64 = __B;
1430
1431 res.as_short[0] =
1432 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1433 res.as_short[1] =
1434 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1435 res.as_short[2] =
1436 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1437 res.as_short[3] =
1438 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1439
1440 return (__m64) res.as_m64;
1441 #endif
1442 }
1443
1444 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445 _m_pmaxsw (__m64 __A, __m64 __B)
1446 {
1447 return _mm_max_pi16 (__A, __B);
1448 }
1449
1450 /* Compute the element-wise maximum of unsigned 8-bit values. */
1451 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1452 _mm_max_pu8 (__m64 __A, __m64 __B)
1453 {
1454 #if _ARCH_PWR8
1455 __vector unsigned char a, b, r;
1456 __vector __bool char c;
1457
1458 a = (__vector unsigned char)vec_splats (__A);
1459 b = (__vector unsigned char)vec_splats (__B);
1460 c = (__vector __bool char)vec_cmpgt (a, b);
1461 r = vec_sel (b, a, c);
1462 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1463 #else
1464 __m64_union m1, m2, res;
1465 long i;
1466
1467 m1.as_m64 = __A;
1468 m2.as_m64 = __B;
1469
1470
1471 for (i = 0; i < 8; i++)
1472 res.as_char[i] =
1473 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1474 m1.as_char[i] : m2.as_char[i];
1475
1476 return (__m64) res.as_m64;
1477 #endif
1478 }
1479
1480 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481 _m_pmaxub (__m64 __A, __m64 __B)
1482 {
1483 return _mm_max_pu8 (__A, __B);
1484 }
1485
1486 /* Compute the element-wise minimum of signed 16-bit values. */
1487 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1488 _mm_min_pi16 (__m64 __A, __m64 __B)
1489 {
1490 #if _ARCH_PWR8
1491 __vector signed short a, b, r;
1492 __vector __bool short c;
1493
1494 a = (__vector signed short)vec_splats (__A);
1495 b = (__vector signed short)vec_splats (__B);
1496 c = (__vector __bool short)vec_cmplt (a, b);
1497 r = vec_sel (b, a, c);
1498 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1499 #else
1500 __m64_union m1, m2, res;
1501
1502 m1.as_m64 = __A;
1503 m2.as_m64 = __B;
1504
1505 res.as_short[0] =
1506 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1507 res.as_short[1] =
1508 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1509 res.as_short[2] =
1510 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1511 res.as_short[3] =
1512 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1513
1514 return (__m64) res.as_m64;
1515 #endif
1516 }
1517
1518 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519 _m_pminsw (__m64 __A, __m64 __B)
1520 {
1521 return _mm_min_pi16 (__A, __B);
1522 }
1523
1524 /* Compute the element-wise minimum of unsigned 8-bit values. */
1525 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1526 _mm_min_pu8 (__m64 __A, __m64 __B)
1527 {
1528 #if _ARCH_PWR8
1529 __vector unsigned char a, b, r;
1530 __vector __bool char c;
1531
1532 a = (__vector unsigned char)vec_splats (__A);
1533 b = (__vector unsigned char)vec_splats (__B);
1534 c = (__vector __bool char)vec_cmplt (a, b);
1535 r = vec_sel (b, a, c);
1536 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1537 #else
1538 __m64_union m1, m2, res;
1539 long i;
1540
1541 m1.as_m64 = __A;
1542 m2.as_m64 = __B;
1543
1544
1545 for (i = 0; i < 8; i++)
1546 res.as_char[i] =
1547 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1548 m1.as_char[i] : m2.as_char[i];
1549
1550 return (__m64) res.as_m64;
1551 #endif
1552 }
1553
1554 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555 _m_pminub (__m64 __A, __m64 __B)
1556 {
1557 return _mm_min_pu8 (__A, __B);
1558 }
1559
1560 /* Create an 8-bit mask of the signs of 8-bit values. */
1561 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1562 _mm_movemask_pi8 (__m64 __A)
1563 {
1564 unsigned long p = 0x0008101820283038UL; /* Permute control for sign bits.  */
1565
1566 return __builtin_bpermd (p, __A);
1567 }
1568
1569 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1570 _m_pmovmskb (__m64 __A)
1571 {
1572 return _mm_movemask_pi8 (__A);
1573 }
1574
1575 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1576 in B and produce the high 16 bits of the 32-bit results. */
1577 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1578 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1579 {
1580 __vector unsigned short a, b;
1581 __vector unsigned short c;
1582 __vector unsigned int w0, w1;
1583 __vector unsigned char xform1 = {
1584 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1585 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1586 };
1587
1588 a = (__vector unsigned short)vec_splats (__A);
1589 b = (__vector unsigned short)vec_splats (__B);
1590
1591 w0 = vec_vmuleuh (a, b);
1592 w1 = vec_vmulouh (a, b);
1593 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1594
1595 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1596 }
1597
1598 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1599 _m_pmulhuw (__m64 __A, __m64 __B)
1600 {
1601 return _mm_mulhi_pu16 (__A, __B);
1602 }
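
/* Illustrative sketch only, not part of the original header; the helper
   name __example_mulhi_pu16 is hypothetical.  Each lane forms the full
   32-bit unsigned product and keeps its high halfword, so
   0xFFFF * 0xFFFF = 0xFFFE0001 leaves 0xFFFE in every lane.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_mulhi_pu16 (void)
{
  __m64 a = 0xFFFFFFFFFFFFFFFFUL;   /* Four lanes of 0xFFFF.  */
  return _mm_mulhi_pu16 (a, a);     /* 0xFFFEFFFEFFFEFFFE.  */
}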
1603
1604 /* Return a combination of the four 16-bit values in A. The selector
1605 must be an immediate. */
1606 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1607 _mm_shuffle_pi16 (__m64 __A, int const __N)
1608 {
1609 unsigned long element_selector_10 = __N & 0x03;
1610 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1611 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1612 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1613 static const unsigned short permute_selectors[4] =
1614 {
1615 #ifdef __LITTLE_ENDIAN__
1616 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1617 #elif __BIG_ENDIAN__
1618 0x0607, 0x0405, 0x0203, 0x0001
1619 #endif
1620 };
1621 __m64_union t;
1622 __vector __m64 a, p, r;
1623
1624 #ifdef __LITTLE_ENDIAN__
1625 t.as_short[0] = permute_selectors[element_selector_10];
1626 t.as_short[1] = permute_selectors[element_selector_32];
1627 t.as_short[2] = permute_selectors[element_selector_54];
1628 t.as_short[3] = permute_selectors[element_selector_76];
1629 #elif __BIG_ENDIAN__
1630 t.as_short[3] = permute_selectors[element_selector_10];
1631 t.as_short[2] = permute_selectors[element_selector_32];
1632 t.as_short[1] = permute_selectors[element_selector_54];
1633 t.as_short[0] = permute_selectors[element_selector_76];
1634 #endif
1635 p = vec_splats (t.as_m64);
1636 a = vec_splats (__A);
1637 r = vec_perm (a, a, (__vector unsigned char)p);
1638 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1639 }
1640
1641 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642 _m_pshufw (__m64 __A, int const __N)
1643 {
1644 return _mm_shuffle_pi16 (__A, __N);
1645 }
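
/* Illustrative sketch only, not part of the original header; the helper
   name __example_reverse_pi16 is hypothetical.  The low two selector
   bits pick the source element for result element 0, the next two bits
   pick result element 1, and so on; _MM_SHUFFLE (3, 2, 1, 0) is the
   identity, while _MM_SHUFFLE (0, 1, 2, 3) below reverses the four
   16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_reverse_pi16 (__m64 __A)
{
  return _mm_shuffle_pi16 (__A, _MM_SHUFFLE (0, 1, 2, 3));
}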
1646
1647 /* Conditionally store byte elements of A into P. The high bit of each
1648 byte in the selector N determines whether the corresponding byte from
1649 A is stored. */
1650 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1651 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1652 {
1653 __m64 hibit = 0x8080808080808080UL;
1654 __m64 mask, tmp;
1655 __m64 *p = (__m64*)__P;
1656
1657 tmp = *p;
1658 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1659 tmp = (tmp & (~mask)) | (__A & mask);
1660 *p = tmp;
1661 }
1662
1663 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1665 {
1666 _mm_maskmove_si64 (__A, __N, __P);
1667 }
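
/* Illustrative sketch only, not part of the original header; the helper
   name __example_merge_low_half is hypothetical.  The mask below has
   the high bit set only in its four low-order bytes, so
   _mm_maskmove_si64 overwrites just those four bytes of the destination
   and leaves the other four unchanged.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_merge_low_half (__m64 __A, __m64 *__P)
{
  __m64 mask = 0x0000000080808080UL;
  _mm_maskmove_si64 (__A, mask, (char *) __P);
}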
1668
1669 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1670 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1671 _mm_avg_pu8 (__m64 __A, __m64 __B)
1672 {
1673 __vector unsigned char a, b, c;
1674
1675 a = (__vector unsigned char)vec_splats (__A);
1676 b = (__vector unsigned char)vec_splats (__B);
1677 c = vec_avg (a, b);
1678 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1679 }
1680
1681 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1682 _m_pavgb (__m64 __A, __m64 __B)
1683 {
1684 return _mm_avg_pu8 (__A, __B);
1685 }
1686
1687 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1688 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1689 _mm_avg_pu16 (__m64 __A, __m64 __B)
1690 {
1691 __vector unsigned short a, b, c;
1692
1693 a = (__vector unsigned short)vec_splats (__A);
1694 b = (__vector unsigned short)vec_splats (__B);
1695 c = vec_avg (a, b);
1696 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1697 }
1698
1699 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700 _m_pavgw (__m64 __A, __m64 __B)
1701 {
1702 return _mm_avg_pu16 (__A, __B);
1703 }
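
/* Illustrative sketch only, not part of the original header; the helper
   name __example_avg_pu8_rounds_up is hypothetical.  Each lane computes
   (a + b + 1) >> 1, so the average of 1 and 2 is 2, not 1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_avg_pu8_rounds_up (void)
{
  __m64 a = 0x0101010101010101UL;   /* Bytes of 1.  */
  __m64 b = 0x0202020202020202UL;   /* Bytes of 2.  */
  return _mm_avg_pu8 (a, b);        /* Bytes of 2 in every lane.  */
}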
1704
1705 /* Compute the sum of the absolute differences of the unsigned 8-bit
1706 values in A and B. Return the value in the lower 16-bit word; the
1707 upper words are cleared. */
1708 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1709 _mm_sad_pu8 (__m64 __A, __m64 __B)
1710 {
1711 __vector unsigned char a, b;
1712 __vector unsigned char vmin, vmax, vabsdiff;
1713 __vector signed int vsum;
1714 const __vector unsigned int zero =
1715 { 0, 0, 0, 0 };
1716 unsigned short result;
1717
1718 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1719 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1720 vmin = vec_min (a, b);
1721 vmax = vec_max (a, b);
1722 vabsdiff = vec_sub (vmax, vmin);
1723 /* Sum four groups of bytes into integers. */
1724 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1725 /* Sum across four integers with integer result. */
1726 vsum = vec_sums (vsum, (__vector signed int) zero);
1727 /* The sum is in the rightmost 32 bits of the vector result.
1728 Transfer it to a GPR and truncate to 16 bits. */
1729 result = vsum[3];
1730 return (result);
1731 }
1732
1733 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1734 _m_psadbw (__m64 __A, __m64 __B)
1735 {
1736 return _mm_sad_pu8 (__A, __B);
1737 }
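
/* Illustrative sketch only, not part of the original header; the helper
   name __example_sad_pu8 is hypothetical.  The eight byte lanes below
   each differ by 1, so the sum of absolute differences is 8, returned
   in the low 16 bits with the upper bits cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_sad_pu8 (void)
{
  __m64 a = 0x0302030203020302UL;
  __m64 b = 0x0203020302030203UL;
  return _mm_sad_pu8 (a, b);        /* 8.  */
}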
1738
1739 /* Stores the data in A to the address P without polluting the caches. */
1740 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1741 _mm_stream_pi (__m64 *__P, __m64 __A)
1742 {
1743 /* Use the data cache block touch for store transient. */
1744 __asm__ (
1745 " dcbtstt 0,%0"
1746 :
1747 : "b" (__P)
1748 : "memory"
1749 );
1750 *__P = __A;
1751 }
1752
1753 /* Likewise. The address must be 16-byte aligned. */
1754 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1755 _mm_stream_ps (float *__P, __m128 __A)
1756 {
1757 /* Use the data cache block touch for store transient. */
1758 __asm__ (
1759 " dcbtstt 0,%0"
1760 :
1761 : "b" (__P)
1762 : "memory"
1763 );
1764 _mm_store_ps (__P, __A);
1765 }
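
/* Illustrative sketch only, not part of the original header; the helper
   name __example_stream_fill is hypothetical and __dst is assumed to be
   16-byte aligned.  It stores __n groups of four floats with the
   store-transient hint so the written data is less likely to displace
   useful cache lines.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_stream_fill (float *__dst, __m128 __value, unsigned long __n)
{
  unsigned long i;
  for (i = 0; i < __n; i++)
    _mm_stream_ps (__dst + 4 * i, __value);
}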
1766
1767 /* Guarantees that every preceding store is globally visible before
1768 any subsequent store. */
1769 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1770 _mm_sfence (void)
1771 {
1772 /* Generate a lightweight sync. */
1773 __atomic_thread_fence (__ATOMIC_RELEASE);
1774 }
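
/* Illustrative sketch only, not part of the original header; the helper
   name __example_publish is hypothetical.  A producer stores its
   payload, issues _mm_sfence so the payload is visible to other
   threads, and only then sets the ready flag that a consumer polls.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_publish (long *__payload, long __value, volatile int *__ready)
{
  *__payload = __value;
  _mm_sfence ();        /* Order the payload store before the flag store.  */
  *__ready = 1;
}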
1775
1776 /* The execution of the next instruction is delayed by an implementation
1777 specific amount of time. The instruction does not modify the
1778 architectural state. This is after the pop_options pragma because
1779 it does not require SSE support in the processor--the encoding is a
1780 nop on processors that do not support it. */
1781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1782 _mm_pause (void)
1783 {
1784 /* There is no exact match with this construct, but the following is
1785 close to the desired effect. */
1786 #if _ARCH_PWR8
1787 /* On power8 and later processors we can depend on the Program
1788 Priority Register (PPR) and its "very low" priority (PRI) setting.
1789 Since we don't know what PRI this thread is running at, we:
1790 1) save the current PRI from the PPR SPR into a local GPR,
1791 2) set the PRI to "very low" via the special or 31,31,31 encoding,
1792 and 3) issue an "isync" to ensure the PRI change takes effect
1793 before we execute any more instructions.
1794 Now we can execute an lwsync (release barrier) while this thread
1795 runs at "very low" PRI. Finally we restore the original PRI and
1796 continue execution. */
1797 unsigned long __PPR;
1798
1799 __asm__ volatile (
1800 " mfppr %0;"
1801 " or 31,31,31;"
1802 " isync;"
1803 " lwsync;"
1804 " isync;"
1805 " mtppr %0;"
1806 : "=r" (__PPR)
1807 :
1808 : "memory"
1809 );
1810 #else
1811 /* For older processors, where we may not even have Program Priority
1812 controls, we can only depend on a heavyweight sync. */
1813 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1814 #endif
1815 }
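
/* Illustrative sketch only, not part of the original header; the helper
   name __example_spin_wait is hypothetical.  _mm_pause is typically
   placed in the body of a spin loop so the waiting hardware thread
   runs at reduced priority until another thread sets the flag.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_spin_wait (volatile int *__flag)
{
  while (*__flag == 0)
    _mm_pause ();
}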
1816
1817 /* Transpose the 4x4 matrix composed of row[0-3]. */
1818 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1819 do { \
1820 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1821 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1822 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1823 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1824 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1825 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1826 (__vector long long)__t1); \
1827 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1828 (__vector long long)__t1); \
1829 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1830 (__vector long long)__t3); \
1831 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1832 (__vector long long)__t3); \
1833 } while (0)
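
/* Illustrative sketch only, not part of the original header; the helper
   name __example_transpose is hypothetical.  _MM_TRANSPOSE4_PS reads
   and then overwrites its four row arguments in place, so after the
   call __mat[0] holds the first column of the original matrix,
   __mat[1] the second, and so on.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_transpose (__m128 __mat[4])
{
  _MM_TRANSPOSE4_PS (__mat[0], __mat[1], __mat[2], __mat[3]);
}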
1834
1835 /* For backward source compatibility. */
1836 //# include <emmintrin.h>
1837
1838 #endif /* _XMMINTRIN_H_INCLUDED */