gcc/config/i386/mmintrin.h

   1 /* Copyright (C) 2002-2024 Free Software Foundation, Inc.
   2
   3    This file is part of GCC.
   4
   5    GCC is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    GCC is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    Under Section 7 of GPL version 3, you are granted additional
  16    permissions described in the GCC Runtime Library Exception, version
  17    3.1, as published by the Free Software Foundation.
  18
  19    You should have received a copy of the GNU General Public License and
  20    a copy of the GCC Runtime Library Exception along with this program;
  21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  22    <http://www.gnu.org/licenses/>.  */
  23
  24 /* Implemented from the specification included in the Intel C++ Compiler
  25    User Guide and Reference, version 9.0.  */
  26
  27 #ifndef _MMINTRIN_H_INCLUDED
  28 #define _MMINTRIN_H_INCLUDED
  29
  30 #if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
  31 #pragma GCC push_options
  32 #ifdef __MMX_WITH_SSE__
  33 #pragma GCC target("sse2")
  34 #elif defined __x86_64__
  35 #pragma GCC target("sse,mmx")
  36 #else
  37 #pragma GCC target("mmx")
  38 #endif
  39 #define __DISABLE_MMX__
  40 #endif /* __MMX__ */
  41
  42 /* The Intel API is flexible enough that we must allow aliasing with other
  43    vector types, and their scalar components.  */
  44 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  45 typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
  46 typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));
  47
  48 /* Unaligned version of the same type  */
  49 typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));
  50 typedef int __m32_u __attribute__ ((__vector_size__ (4), \
  51                                     __may_alias__, __aligned__ (1)));
  52 typedef short __m16_u __attribute__ ((__vector_size__ (2), \
  53                                       __may_alias__, __aligned__ (1)));
  54
  55 /* Internal data types for implementing the intrinsics.  */
  56 typedef int __v2si __attribute__ ((__vector_size__ (8)));
  57 typedef short __v4hi __attribute__ ((__vector_size__ (8)));
  58 typedef char __v8qi __attribute__ ((__vector_size__ (8)));
  59 typedef long long __v1di __attribute__ ((__vector_size__ (8)));
  60 typedef float __v2sf __attribute__ ((__vector_size__ (8)));
  61
  62 /* Empty the multimedia state.  */
  63 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  64 _mm_empty (void)
  65 {
  66   __builtin_ia32_emms ();
  67 }
  68
  69 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  70 _m_empty (void)
  71 {
  72   _mm_empty ();
  73 }
  74
  75 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  76 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  77 _mm_cvtsi32_si64 (int __i)
  78 {
  79   return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
  80 }
  81
  82 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  83 _m_from_int (int __i)
  84 {
  85   return _mm_cvtsi32_si64 (__i);
  86 }
  87
  88 #ifdef __x86_64__
  89 /* Convert I to a __m64 object.  */
  90
  91 /* Intel intrinsic.  */
  92 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  93 _m_from_int64 (long long __i)
  94 {
  95   return (__m64) __i;
  96 }
  97
  98 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  99 _mm_cvtsi64_m64 (long long __i)
 100 {
 101   return (__m64) __i;
 102 }
 103
 104 /* Microsoft intrinsic.  */
 105 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 106 _mm_cvtsi64x_si64 (long long __i)
 107 {
 108   return (__m64) __i;
 109 }
 110
 111 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 112 _mm_set_pi64x (long long __i)
 113 {
 114   return (__m64) __i;
 115 }
 116 #endif
 117
 118 /* Convert the lower 32 bits of the __m64 object into an integer.  */
 119 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 120 _mm_cvtsi64_si32 (__m64 __i)
 121 {
 122   return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
 123 }
 124
 125 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 126 _m_to_int (__m64 __i)
 127 {
 128   return _mm_cvtsi64_si32 (__i);
 129 }
 130
 131 #ifdef __x86_64__
 132 /* Convert the __m64 object to a 64bit integer.  */
 133
 134 /* Intel intrinsic.  */
 135 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 136 _m_to_int64 (__m64 __i)
 137 {
 138   return (long long)__i;
 139 }
 140
 141 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 142 _mm_cvtm64_si64 (__m64 __i)
 143 {
 144   return (long long)__i;
 145 }
 146
 147 /* Microsoft intrinsic.  */
 148 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 149 _mm_cvtsi64_si64x (__m64 __i)
 150 {
 151   return (long long)__i;
 152 }
 153 #endif
 154
 155 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 156    the result, and the four 16-bit values from M2 into the upper four 8-bit
 157    values of the result, all with signed saturation.  */
 158 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 159 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
 160 {
 161   return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
 162 }
 163
 164 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 165 _m_packsswb (__m64 __m1, __m64 __m2)
 166 {
 167   return _mm_packs_pi16 (__m1, __m2);
 168 }
 169
 170 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
 171    the result, and the two 32-bit values from M2 into the upper two 16-bit
 172    values of the result, all with signed saturation.  */
 173 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 174 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
 175 {
 176   return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
 177 }
 178
 179 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 180 _m_packssdw (__m64 __m1, __m64 __m2)
 181 {
 182   return _mm_packs_pi32 (__m1, __m2);
 183 }
 184
 185 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 186    the result, and the four 16-bit values from M2 into the upper four 8-bit
 187    values of the result, all with unsigned saturation.  */
 188 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 189 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 190 {
 191   return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
 192 }
 193
 194 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 195 _m_packuswb (__m64 __m1, __m64 __m2)
 196 {
 197   return _mm_packs_pu16 (__m1, __m2);
 198 }
 199
 200 /* Interleave the four 8-bit values from the high half of M1 with the four
 201    8-bit values from the high half of M2.  */
 202 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 203 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
 204 {
 205   return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
 206 }
 207
 208 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 209 _m_punpckhbw (__m64 __m1, __m64 __m2)
 210 {
 211   return _mm_unpackhi_pi8 (__m1, __m2);
 212 }
 213
 214 /* Interleave the two 16-bit values from the high half of M1 with the two
 215    16-bit values from the high half of M2.  */
 216 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 217 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
 218 {
 219   return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
 220 }
 221
 222 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 223 _m_punpckhwd (__m64 __m1, __m64 __m2)
 224 {
 225   return _mm_unpackhi_pi16 (__m1, __m2);
 226 }
 227
 228 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 229    value from the high half of M2.  */
 230 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 231 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
 232 {
 233   return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
 234 }
 235
 236 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 237 _m_punpckhdq (__m64 __m1, __m64 __m2)
 238 {
 239   return _mm_unpackhi_pi32 (__m1, __m2);
 240 }
 241
 242 /* Interleave the four 8-bit values from the low half of M1 with the four
 243    8-bit values from the low half of M2.  */
 244 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 245 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
 246 {
 247   return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
 248 }
 249
 250 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 251 _m_punpcklbw (__m64 __m1, __m64 __m2)
 252 {
 253   return _mm_unpacklo_pi8 (__m1, __m2);
 254 }
 255
 256 /* Interleave the two 16-bit values from the low half of M1 with the two
 257    16-bit values from the low half of M2.  */
 258 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 259 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
 260 {
 261   return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
 262 }
 263
 264 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 265 _m_punpcklwd (__m64 __m1, __m64 __m2)
 266 {
 267   return _mm_unpacklo_pi16 (__m1, __m2);
 268 }
 269
 270 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 271    value from the low half of M2.  */
 272 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 273 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
 274 {
 275   return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
 276 }
 277
 278 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 279 _m_punpckldq (__m64 __m1, __m64 __m2)
 280 {
 281   return _mm_unpacklo_pi32 (__m1, __m2);
 282 }
 283
 284 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 285 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 286 _mm_add_pi8 (__m64 __m1, __m64 __m2)
 287 {
 288   return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
 289 }
 290
 291 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 292 _m_paddb (__m64 __m1, __m64 __m2)
 293 {
 294   return _mm_add_pi8 (__m1, __m2);
 295 }
 296
 297 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 298 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 299 _mm_add_pi16 (__m64 __m1, __m64 __m2)
 300 {
 301   return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
 302 }
 303
 304 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 305 _m_paddw (__m64 __m1, __m64 __m2)
 306 {
 307   return _mm_add_pi16 (__m1, __m2);
 308 }
 309
 310 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 311 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 312 _mm_add_pi32 (__m64 __m1, __m64 __m2)
 313 {
 314   return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
 315 }
 316
 317 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 318 _m_paddd (__m64 __m1, __m64 __m2)
 319 {
 320   return _mm_add_pi32 (__m1, __m2);
 321 }
 322
 323 /* Add the 64-bit values in M1 to the 64-bit values in M2.  */
 324 #ifndef __SSE2__
 325 #pragma GCC push_options
 326 #ifdef __MMX_WITH_SSE__
 327 #pragma GCC target("sse2")
 328 #else
 329 #pragma GCC target("sse2,mmx")
 330 #endif
 331 #define __DISABLE_SSE2__
 332 #endif /* __SSE2__ */
 333
 334 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 335 _mm_add_si64 (__m64 __m1, __m64 __m2)
 336 {
 337   return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
 338 }
 339 #ifdef __DISABLE_SSE2__
 340 #undef __DISABLE_SSE2__
 341 #pragma GCC pop_options
 342 #endif /* __DISABLE_SSE2__ */
 343
 344 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 345    saturated arithmetic.  */
 346 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 347 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
 348 {
 349   return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
 350 }
 351
 352 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 353 _m_paddsb (__m64 __m1, __m64 __m2)
 354 {
 355   return _mm_adds_pi8 (__m1, __m2);
 356 }
 357
 358 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 359    saturated arithmetic.  */
 360 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 361 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
 362 {
 363   return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
 364 }
 365
 366 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 367 _m_paddsw (__m64 __m1, __m64 __m2)
 368 {
 369   return _mm_adds_pi16 (__m1, __m2);
 370 }
 371
 372 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 373    saturated arithmetic.  */
 374 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 375 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 376 {
 377   return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
 378 }
 379
 380 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 381 _m_paddusb (__m64 __m1, __m64 __m2)
 382 {
 383   return _mm_adds_pu8 (__m1, __m2);
 384 }
 385
 386 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 387    saturated arithmetic.  */
 388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 389 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 390 {
 391   return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
 392 }
 393
 394 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 395 _m_paddusw (__m64 __m1, __m64 __m2)
 396 {
 397   return _mm_adds_pu16 (__m1, __m2);
 398 }
 399
 400 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 401 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 402 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
 403 {
 404   return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
 405 }
 406
 407 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 408 _m_psubb (__m64 __m1, __m64 __m2)
 409 {
 410   return _mm_sub_pi8 (__m1, __m2);
 411 }
 412
 413 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 414 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 415 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
 416 {
 417   return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
 418 }
 419
 420 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 421 _m_psubw (__m64 __m1, __m64 __m2)
 422 {
 423   return _mm_sub_pi16 (__m1, __m2);
 424 }
 425
 426 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 427 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 428 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
 429 {
 430   return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
 431 }
 432
 433 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 434 _m_psubd (__m64 __m1, __m64 __m2)
 435 {
 436   return _mm_sub_pi32 (__m1, __m2);
 437 }
 438
/* Subtract the 64-bit values in M2 from the 64-bit values in M1.  */
 440 #ifndef __SSE2__
 441 #pragma GCC push_options
 442 #ifdef __MMX_WITH_SSE__
 443 #pragma GCC target("sse2")
 444 #else
 445 #pragma GCC target("sse2,mmx")
 446 #endif
 447 #define __DISABLE_SSE2__
 448 #endif /* __SSE2__ */
 449
 450 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 451 _mm_sub_si64 (__m64 __m1, __m64 __m2)
 452 {
 453   return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
 454 }
 455 #ifdef __DISABLE_SSE2__
 456 #undef __DISABLE_SSE2__
 457 #pragma GCC pop_options
 458 #endif /* __DISABLE_SSE2__ */
 459
 460 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 461    saturating arithmetic.  */
 462 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 463 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
 464 {
 465   return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
 466 }
 467
 468 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 469 _m_psubsb (__m64 __m1, __m64 __m2)
 470 {
 471   return _mm_subs_pi8 (__m1, __m2);
 472 }
 473
 474 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 475    signed saturating arithmetic.  */
 476 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 477 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
 478 {
 479   return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
 480 }
 481
 482 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 483 _m_psubsw (__m64 __m1, __m64 __m2)
 484 {
 485   return _mm_subs_pi16 (__m1, __m2);
 486 }
 487
 488 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
 489    unsigned saturating arithmetic.  */
 490 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 491 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
 492 {
 493   return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
 494 }
 495
 496 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 497 _m_psubusb (__m64 __m1, __m64 __m2)
 498 {
 499   return _mm_subs_pu8 (__m1, __m2);
 500 }
 501
 502 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 503    unsigned saturating arithmetic.  */
 504 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 505 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
 506 {
 507   return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
 508 }
 509
 510 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 511 _m_psubusw (__m64 __m1, __m64 __m2)
 512 {
 513   return _mm_subs_pu16 (__m1, __m2);
 514 }
 515
 516 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
 517    four 32-bit intermediate results, which are then summed by pairs to
 518    produce two 32-bit results.  */
 519 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 520 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
 521 {
 522   return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
 523 }
 524
 525 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 526 _m_pmaddwd (__m64 __m1, __m64 __m2)
 527 {
 528   return _mm_madd_pi16 (__m1, __m2);
 529 }
 530
 531 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
 532    M2 and produce the high 16 bits of the 32-bit results.  */
 533 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 534 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
 535 {
 536   return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
 537 }
 538
 539 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 540 _m_pmulhw (__m64 __m1, __m64 __m2)
 541 {
 542   return _mm_mulhi_pi16 (__m1, __m2);
 543 }
 544
 545 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
 546    the low 16 bits of the results.  */
 547 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 548 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
 549 {
 550   return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
 551 }
 552
 553 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 554 _m_pmullw (__m64 __m1, __m64 __m2)
 555 {
 556   return _mm_mullo_pi16 (__m1, __m2);
 557 }
 558
 559 /* Shift four 16-bit values in M left by COUNT.  */
 560 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 561 _mm_sll_pi16 (__m64 __m, __m64 __count)
 562 {
 563   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
 564 }
 565
 566 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 567 _m_psllw (__m64 __m, __m64 __count)
 568 {
 569   return _mm_sll_pi16 (__m, __count);
 570 }
 571
 572 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 573 _mm_slli_pi16 (__m64 __m, int __count)
 574 {
 575   return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
 576 }
 577
 578 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 579 _m_psllwi (__m64 __m, int __count)
 580 {
 581   return _mm_slli_pi16 (__m, __count);
 582 }
 583
 584 /* Shift two 32-bit values in M left by COUNT.  */
 585 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 586 _mm_sll_pi32 (__m64 __m, __m64 __count)
 587 {
 588   return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
 589 }
 590
 591 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 592 _m_pslld (__m64 __m, __m64 __count)
 593 {
 594   return _mm_sll_pi32 (__m, __count);
 595 }
 596
 597 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 598 _mm_slli_pi32 (__m64 __m, int __count)
 599 {
 600   return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
 601 }
 602
 603 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 604 _m_pslldi (__m64 __m, int __count)
 605 {
 606   return _mm_slli_pi32 (__m, __count);
 607 }
 608
 609 /* Shift the 64-bit value in M left by COUNT.  */
 610 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 611 _mm_sll_si64 (__m64 __m, __m64 __count)
 612 {
 613   return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
 614 }
 615
 616 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 617 _m_psllq (__m64 __m, __m64 __count)
 618 {
 619   return _mm_sll_si64 (__m, __count);
 620 }
 621
 622 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 623 _mm_slli_si64 (__m64 __m, int __count)
 624 {
 625   return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
 626 }
 627
 628 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 629 _m_psllqi (__m64 __m, int __count)
 630 {
 631   return _mm_slli_si64 (__m, __count);
 632 }
 633
 634 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
 635 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 636 _mm_sra_pi16 (__m64 __m, __m64 __count)
 637 {
 638   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
 639 }
 640
 641 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 642 _m_psraw (__m64 __m, __m64 __count)
 643 {
 644   return _mm_sra_pi16 (__m, __count);
 645 }
 646
 647 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 648 _mm_srai_pi16 (__m64 __m, int __count)
 649 {
 650   return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
 651 }
 652
 653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 654 _m_psrawi (__m64 __m, int __count)
 655 {
 656   return _mm_srai_pi16 (__m, __count);
 657 }
 658
 659 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
 660 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 661 _mm_sra_pi32 (__m64 __m, __m64 __count)
 662 {
 663   return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
 664 }
 665
 666 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 667 _m_psrad (__m64 __m, __m64 __count)
 668 {
 669   return _mm_sra_pi32 (__m, __count);
 670 }
 671
 672 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 673 _mm_srai_pi32 (__m64 __m, int __count)
 674 {
 675   return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
 676 }
 677
 678 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 679 _m_psradi (__m64 __m, int __count)
 680 {
 681   return _mm_srai_pi32 (__m, __count);
 682 }
 683
 684 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
 685 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 686 _mm_srl_pi16 (__m64 __m, __m64 __count)
 687 {
 688   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
 689 }
 690
 691 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 692 _m_psrlw (__m64 __m, __m64 __count)
 693 {
 694   return _mm_srl_pi16 (__m, __count);
 695 }
 696
 697 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 698 _mm_srli_pi16 (__m64 __m, int __count)
 699 {
 700   return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
 701 }
 702
 703 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 704 _m_psrlwi (__m64 __m, int __count)
 705 {
 706   return _mm_srli_pi16 (__m, __count);
 707 }
 708
 709 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
 710 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 711 _mm_srl_pi32 (__m64 __m, __m64 __count)
 712 {
 713   return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
 714 }
 715
 716 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 717 _m_psrld (__m64 __m, __m64 __count)
 718 {
 719   return _mm_srl_pi32 (__m, __count);
 720 }
 721
 722 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 723 _mm_srli_pi32 (__m64 __m, int __count)
 724 {
 725   return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
 726 }
 727
 728 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 729 _m_psrldi (__m64 __m, int __count)
 730 {
 731   return _mm_srli_pi32 (__m, __count);
 732 }
 733
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
 735 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 736 _mm_srl_si64 (__m64 __m, __m64 __count)
 737 {
 738   return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
 739 }
 740
 741 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 742 _m_psrlq (__m64 __m, __m64 __count)
 743 {
 744   return _mm_srl_si64 (__m, __count);
 745 }
 746
 747 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 748 _mm_srli_si64 (__m64 __m, int __count)
 749 {
 750   return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
 751 }
 752
 753 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 754 _m_psrlqi (__m64 __m, int __count)
 755 {
 756   return _mm_srli_si64 (__m, __count);
 757 }
 758
 759 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 760 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 761 _mm_and_si64 (__m64 __m1, __m64 __m2)
 762 {
 763   return __builtin_ia32_pand (__m1, __m2);
 764 }
 765
 766 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 767 _m_pand (__m64 __m1, __m64 __m2)
 768 {
 769   return _mm_and_si64 (__m1, __m2);
 770 }
 771
 772 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 773    64-bit value in M2.  */
 774 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 775 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 776 {
 777   return __builtin_ia32_pandn (__m1, __m2);
 778 }
 779
 780 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 781 _m_pandn (__m64 __m1, __m64 __m2)
 782 {
 783   return _mm_andnot_si64 (__m1, __m2);
 784 }
 785
 786 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 787 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 788 _mm_or_si64 (__m64 __m1, __m64 __m2)
 789 {
 790   return __builtin_ia32_por (__m1, __m2);
 791 }
 792
 793 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 794 _m_por (__m64 __m1, __m64 __m2)
 795 {
 796   return _mm_or_si64 (__m1, __m2);
 797 }
 798
 799 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 800 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 801 _mm_xor_si64 (__m64 __m1, __m64 __m2)
 802 {
 803   return __builtin_ia32_pxor (__m1, __m2);
 804 }
 805
 806 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 807 _m_pxor (__m64 __m1, __m64 __m2)
 808 {
 809   return _mm_xor_si64 (__m1, __m2);
 810 }
 811
 812 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 813    test is true and zero if false.  */
 814 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 815 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 816 {
 817   return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
 818 }
 819
 820 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 821 _m_pcmpeqb (__m64 __m1, __m64 __m2)
 822 {
 823   return _mm_cmpeq_pi8 (__m1, __m2);
 824 }
 825
 826 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 827 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
 828 {
 829   return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
 830 }
 831
 832 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 833 _m_pcmpgtb (__m64 __m1, __m64 __m2)
 834 {
 835   return _mm_cmpgt_pi8 (__m1, __m2);
 836 }
 837
 838 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 839    the test is true and zero if false.  */
 840 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 841 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
 842 {
 843   return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
 844 }
 845
 846 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 847 _m_pcmpeqw (__m64 __m1, __m64 __m2)
 848 {
 849   return _mm_cmpeq_pi16 (__m1, __m2);
 850 }
 851
 852 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 853 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
 854 {
 855   return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
 856 }
 857
 858 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 859 _m_pcmpgtw (__m64 __m1, __m64 __m2)
 860 {
 861   return _mm_cmpgt_pi16 (__m1, __m2);
 862 }
 863
 864 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 865    the test is true and zero if false.  */
 866 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 867 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 868 {
 869   return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
 870 }
 871
 872 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 873 _m_pcmpeqd (__m64 __m1, __m64 __m2)
 874 {
 875   return _mm_cmpeq_pi32 (__m1, __m2);
 876 }
 877
 878 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 879 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
 880 {
 881   return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
 882 }
 883
 884 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 885 _m_pcmpgtd (__m64 __m1, __m64 __m2)
 886 {
 887   return _mm_cmpgt_pi32 (__m1, __m2);
 888 }
 889
 890 /* Creates a 64-bit zero.  */
 891 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 892 _mm_setzero_si64 (void)
 893 {
 894   return (__m64)0LL;
 895 }
 896
 897 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 898 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 899 _mm_set_pi32 (int __i1, int __i0)
 900 {
 901   return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
 902 }
 903
 904 /* Creates a vector of four 16-bit values; W0 is least significant.  */
 905 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 906 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
 907 {
 908   return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
 909 }
 910
 911 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
 912 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 913 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
 914              char __b3, char __b2, char __b1, char __b0)
 915 {
 916   return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
 917                                                __b4, __b5, __b6, __b7);
 918 }
 919
 920 /* Similar, but with the arguments in reverse order.  */
 921 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 922 _mm_setr_pi32 (int __i0, int __i1)
 923 {
 924   return _mm_set_pi32 (__i1, __i0);
 925 }
 926
 927 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 928 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
 929 {
 930   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
 931 }
 932
 933 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 934 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
 935               char __b4, char __b5, char __b6, char __b7)
 936 {
 937   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 938 }
 939
 940 /* Creates a vector of two 32-bit values, both elements containing I.  */
 941 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 942 _mm_set1_pi32 (int __i)
 943 {
 944   return _mm_set_pi32 (__i, __i);
 945 }
 946
 947 /* Creates a vector of four 16-bit values, all elements containing W.  */
 948 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 949 _mm_set1_pi16 (short __w)
 950 {
 951   return _mm_set_pi16 (__w, __w, __w, __w);
 952 }
 953
 954 /* Creates a vector of eight 8-bit values, all elements containing B.  */
 955 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 956 _mm_set1_pi8 (char __b)
 957 {
 958   return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
 959 }
 /* If the top of this header pushed the target options (and defined
    __DISABLE_MMX__ to record that), drop the marker and restore them.  */
 #if defined __DISABLE_MMX__
 #undef __DISABLE_MMX__
 #pragma GCC pop_options
 #endif /* __DISABLE_MMX__ */
 964
 965 #endif /* _MMINTRIN_H_INCLUDED */