libstdc++-v3/include/experimental/bits/simd_neon.h

   1 // Simd NEON specific implementations -*- C++ -*-
   2
   3 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
  26 #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
  27
  28 #if __cplusplus >= 201703L
  29
  30 #if !_GLIBCXX_SIMD_HAVE_NEON
  31 #error "simd_neon.h may only be included when NEON on ARM is available"
  32 #endif
  33
  34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
  35
  36 // _CommonImplNeon {{{
  37 struct _CommonImplNeon : _CommonImplBuiltin
  38 {
  39   // _S_store {{{
  40   using _CommonImplBuiltin::_S_store;
  41
  42   // }}}
  43 };
  44
  45 // }}}
  46 // _SimdImplNeon {{{
  47 template <typename _Abi, typename>
  48   struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  49   {
  50     using _Base = _SimdImplBuiltin<_Abi>;
  51
  52     template <typename _Tp>
  53       using _MaskMember = typename _Base::template _MaskMember<_Tp>;
  54
  55     template <typename _Tp>
  56       static constexpr size_t _S_max_store_size = 16;
  57
  58     // _S_masked_load {{{
  59     template <typename _Tp, size_t _Np, typename _Up>
  60       static inline _SimdWrapper<_Tp, _Np>
  61       _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
  62                      const _Up* __mem) noexcept
  63       {
  64         __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
  65           if (__k[__i] != 0)
  66             __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
  67         });
  68         return __merge;
  69       }
  70
  71     // }}}
  72     // _S_masked_store_nocvt {{{
  73     template <typename _Tp, size_t _Np>
  74       _GLIBCXX_SIMD_INTRINSIC static void
  75       _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
  76                             _MaskMember<_Tp> __k)
  77       {
  78         __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
  79           if (__k[__i] != 0)
  80             __mem[__i] = __v[__i];
  81         });
  82       }
  83
  84     // }}}
  85     // _S_reduce {{{
  86     template <typename _Tp, typename _BinaryOperation>
  87       _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
  88       _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
  89       {
  90         if (not __builtin_is_constant_evaluated())
  91           {
  92             constexpr size_t _Np = __x.size();
  93             if constexpr (sizeof(__x) == 16 && _Np >= 4
  94                             && !_Abi::template _S_is_partial<_Tp>)
  95               {
  96                 const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
  97                 const auto __y = __binary_op(__halves[0], __halves[1]);
  98                 return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
  99                          __y, static_cast<_BinaryOperation&&>(__binary_op));
 100               }
 101             else if constexpr (_Np == 8)
 102               {
 103                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 104                                          __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(__x._M_data)));
 105                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 106                                          __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(__x._M_data)));
 107                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 108                                          __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(__x._M_data)));
 109                 return __x[0];
 110               }
 111             else if constexpr (_Np == 4)
 112               {
 113                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 114                                          __vector_permute<1, 0, 3, 2>(__x._M_data)));
 115                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 116                                          __vector_permute<3, 2, 1, 0>(__x._M_data)));
 117                 return __x[0];
 118               }
 119             else if constexpr (_Np == 2)
 120               {
 121                 __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
 122                                          __vector_permute<1, 0>(__x._M_data)));
 123                 return __x[0];
 124               }
 125           }
 126         return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op));
 127       }
 128
 129     // }}}
 130     // math {{{
 131     // _S_sqrt {{{
 132     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
 133       _GLIBCXX_SIMD_INTRINSIC static _Tp
 134       _S_sqrt(_Tp __x)
 135       {
 136         if constexpr (__have_neon_a64)
 137           {
 138             const auto __intrin = __to_intrin(__x);
 139             if constexpr (_TVT::template _S_is<float, 2>)
 140               return vsqrt_f32(__intrin);
 141             else if constexpr (_TVT::template _S_is<float, 4>)
 142               return vsqrtq_f32(__intrin);
 143             else if constexpr (_TVT::template _S_is<double, 1>)
 144               return vsqrt_f64(__intrin);
 145             else if constexpr (_TVT::template _S_is<double, 2>)
 146               return vsqrtq_f64(__intrin);
 147             else
 148               __assert_unreachable<_Tp>();
 149           }
 150         else
 151           return _Base::_S_sqrt(__x);
 152       }
 153
 154     // }}}
 155     // _S_trunc {{{
 156     template <typename _TW, typename _TVT = _VectorTraits<_TW>>
 157       _GLIBCXX_SIMD_INTRINSIC static _TW
 158       _S_trunc(_TW __x)
 159       {
 160         using _Tp = typename _TVT::value_type;
 161         if constexpr (__have_neon_a32)
 162           {
 163             const auto __intrin = __to_intrin(__x);
 164             if constexpr (_TVT::template _S_is<float, 2>)
 165               return vrnd_f32(__intrin);
 166             else if constexpr (_TVT::template _S_is<float, 4>)
 167               return vrndq_f32(__intrin);
 168             else if constexpr (_TVT::template _S_is<double, 1>)
 169               return vrnd_f64(__intrin);
 170             else if constexpr (_TVT::template _S_is<double, 2>)
 171               return vrndq_f64(__intrin);
 172             else
 173               __assert_unreachable<_Tp>();
 174           }
 175         else if constexpr (is_same_v<_Tp, float>)
 176           {
 177             auto __intrin = __to_intrin(__x);
 178             if constexpr (sizeof(__x) == 16)
 179               __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
 180             else
 181               __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
 182             return _Base::_S_abs(__x)._M_data < 0x1p23f
 183                      ? __vector_bitcast<float>(__intrin)
 184                      : __x._M_data;
 185           }
 186         else
 187           return _Base::_S_trunc(__x);
 188       }
 189
 190     // }}}
 191     // _S_round {{{
 192     template <typename _Tp, size_t _Np>
 193       _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
 194       _S_round(_SimdWrapper<_Tp, _Np> __x)
 195       {
 196         if constexpr (__have_neon_a32)
 197           {
 198             const auto __intrin = __to_intrin(__x);
 199             if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
 200               return vrnda_f32(__intrin);
 201             else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
 202               return vrndaq_f32(__intrin);
 203             else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
 204               return vrnda_f64(__intrin);
 205             else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
 206               return vrndaq_f64(__intrin);
 207             else
 208               __assert_unreachable<_Tp>();
 209           }
 210         else
 211           return _Base::_S_round(__x);
 212       }
 213
 214     // }}}
 215     // _S_floor {{{
 216     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
 217       _GLIBCXX_SIMD_INTRINSIC static _Tp
 218       _S_floor(_Tp __x)
 219       {
 220         if constexpr (__have_neon_a32)
 221           {
 222             const auto __intrin = __to_intrin(__x);
 223             if constexpr (_TVT::template _S_is<float, 2>)
 224               return vrndm_f32(__intrin);
 225             else if constexpr (_TVT::template _S_is<float, 4>)
 226               return vrndmq_f32(__intrin);
 227             else if constexpr (_TVT::template _S_is<double, 1>)
 228               return vrndm_f64(__intrin);
 229             else if constexpr (_TVT::template _S_is<double, 2>)
 230               return vrndmq_f64(__intrin);
 231             else
 232               __assert_unreachable<_Tp>();
 233           }
 234         else
 235           return _Base::_S_floor(__x);
 236       }
 237
 238     // }}}
 239     // _S_ceil {{{
 240     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
 241       _GLIBCXX_SIMD_INTRINSIC static _Tp
 242       _S_ceil(_Tp __x)
 243       {
 244         if constexpr (__have_neon_a32)
 245           {
 246             const auto __intrin = __to_intrin(__x);
 247             if constexpr (_TVT::template _S_is<float, 2>)
 248               return vrndp_f32(__intrin);
 249             else if constexpr (_TVT::template _S_is<float, 4>)
 250               return vrndpq_f32(__intrin);
 251             else if constexpr (_TVT::template _S_is<double, 1>)
 252               return vrndp_f64(__intrin);
 253             else if constexpr (_TVT::template _S_is<double, 2>)
 254               return vrndpq_f64(__intrin);
 255             else
 256               __assert_unreachable<_Tp>();
 257           }
 258         else
 259           return _Base::_S_ceil(__x);
 260       }
 261
 262     //}}} }}}
 263   }; // }}}
 264 // _MaskImplNeonMixin {{{
 265 struct _MaskImplNeonMixin
 266 {
 267   using _Base = _MaskImplBuiltinMixin;
 268
 269   template <typename _Tp, size_t _Np>
 270     _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
 271     _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
 272     {
 273       if (__builtin_is_constant_evaluated())
 274         return _Base::_S_to_bits(__x);
 275
 276       using _I = __int_for_sizeof_t<_Tp>;
 277       if constexpr (sizeof(__x) == 16)
 278         {
 279           auto __asint = __vector_bitcast<_I>(__x);
 280 #ifdef __aarch64__
 281           [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
 282 #else
 283           [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
 284 #endif
 285           if constexpr (sizeof(_Tp) == 1)
 286             {
 287               constexpr auto __bitsel
 288                 = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
 289                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 290                     return static_cast<_I>(
 291                       __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
 292                   });
 293               __asint &= __bitsel;
 294 #ifdef __aarch64__
 295               return __vector_bitcast<_UShort>(
 296                 vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
 297                           __zero))[0];
 298 #else
 299               return __vector_bitcast<_UShort>(
 300                 vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
 301                                   __zero),
 302                          __zero))[0];
 303 #endif
 304             }
 305           else if constexpr (sizeof(_Tp) == 2)
 306             {
 307               constexpr auto __bitsel
 308                 = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
 309                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 310                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 311                   });
 312               __asint &= __bitsel;
 313 #ifdef __aarch64__
 314               return vaddvq_s16(__asint);
 315 #else
 316               return vpadd_s16(
 317                 vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
 318                 __zero)[0];
 319 #endif
 320             }
 321           else if constexpr (sizeof(_Tp) == 4)
 322             {
 323               constexpr auto __bitsel
 324                 = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
 325                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 326                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 327                   });
 328               __asint &= __bitsel;
 329 #ifdef __aarch64__
 330               return vaddvq_s32(__asint);
 331 #else
 332               return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
 333                                __zero)[0];
 334 #endif
 335             }
 336           else if constexpr (sizeof(_Tp) == 8)
 337             return (__asint[0] & 1) | (__asint[1] & 2);
 338           else
 339             __assert_unreachable<_Tp>();
 340         }
 341       else if constexpr (sizeof(__x) == 8)
 342         {
 343           auto __asint = __vector_bitcast<_I>(__x);
 344           [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
 345           if constexpr (sizeof(_Tp) == 1)
 346             {
 347               constexpr auto __bitsel
 348                 = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
 349                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 350                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 351                   });
 352               __asint &= __bitsel;
 353 #ifdef __aarch64__
 354               return vaddv_s8(__asint);
 355 #else
 356               return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
 357                               __zero)[0];
 358 #endif
 359             }
 360           else if constexpr (sizeof(_Tp) == 2)
 361             {
 362               constexpr auto __bitsel
 363                 = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
 364                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 365                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 366                   });
 367               __asint &= __bitsel;
 368 #ifdef __aarch64__
 369               return vaddv_s16(__asint);
 370 #else
 371               return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
 372 #endif
 373             }
 374           else if constexpr (sizeof(_Tp) == 4)
 375             {
 376               __asint &= __make_vector<_I>(0x1, 0x2);
 377 #ifdef __aarch64__
 378               return vaddv_s32(__asint);
 379 #else
 380               return vpadd_s32(__asint, __zero)[0];
 381 #endif
 382             }
 383           else
 384             __assert_unreachable<_Tp>();
 385         }
 386       else
 387         return _Base::_S_to_bits(__x);
 388     }
 389 };
 390
 391 // }}}
 392 // _MaskImplNeon {{{
 393 template <typename _Abi, typename>
 394   struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
 395   {
 396     using _MaskImplBuiltinMixin::_S_to_maskvector;
 397     using _MaskImplNeonMixin::_S_to_bits;
 398     using _Base = _MaskImplBuiltin<_Abi>;
 399     using _Base::_S_convert;
 400
 401     // _S_all_of {{{
 402     template <typename _Tp>
 403       _GLIBCXX_SIMD_INTRINSIC static bool
 404       _S_all_of(simd_mask<_Tp, _Abi> __k)
 405       {
 406         const auto __kk
 407           = __vector_bitcast<char>(__k._M_data)
 408             | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
 409         if constexpr (sizeof(__k) == 16)
 410           {
 411             const auto __x = __vector_bitcast<long long>(__kk);
 412             return __x[0] + __x[1] == -2;
 413           }
 414         else if constexpr (sizeof(__k) <= 8)
 415           return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
 416         else
 417           __assert_unreachable<_Tp>();
 418       }
 419
 420     // }}}
 421     // _S_any_of {{{
 422     template <typename _Tp>
 423       _GLIBCXX_SIMD_INTRINSIC static bool
 424       _S_any_of(simd_mask<_Tp, _Abi> __k)
 425       {
 426         const auto __kk
 427           = __vector_bitcast<char>(__k._M_data)
 428             | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
 429         if constexpr (sizeof(__k) == 16)
 430           {
 431             const auto __x = __vector_bitcast<long long>(__kk);
 432             return (__x[0] | __x[1]) != 0;
 433           }
 434         else if constexpr (sizeof(__k) <= 8)
 435           return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
 436         else
 437           __assert_unreachable<_Tp>();
 438       }
 439
 440     // }}}
 441     // _S_none_of {{{
 442     template <typename _Tp>
 443       _GLIBCXX_SIMD_INTRINSIC static bool
 444       _S_none_of(simd_mask<_Tp, _Abi> __k)
 445       {
 446         const auto __kk = _Abi::_S_masked(__k._M_data);
 447         if constexpr (sizeof(__k) == 16)
 448           {
 449             const auto __x = __vector_bitcast<long long>(__kk);
 450             return (__x[0] | __x[1]) == 0;
 451           }
 452         else if constexpr (sizeof(__k) <= 8)
 453           return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
 454         else
 455           __assert_unreachable<_Tp>();
 456       }
 457
 458     // }}}
 459     // _S_some_of {{{
 460     template <typename _Tp>
 461       _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
 462       {
 463         if constexpr (sizeof(__k) <= 8)
 464           {
 465             const auto __kk = __vector_bitcast<char>(__k._M_data)
 466                               | ~__vector_bitcast<char>(
 467                                 _Abi::template _S_implicit_mask<_Tp>());
 468             using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
 469             return __bit_cast<_Up>(__kk) + 1 > 1;
 470           }
 471         else
 472           return _Base::_S_some_of(__k);
 473       }
 474
 475     // }}}
 476     // _S_popcount {{{
 477     template <typename _Tp>
 478       _GLIBCXX_SIMD_INTRINSIC static int
 479       _S_popcount(simd_mask<_Tp, _Abi> __k)
 480       {
 481         if constexpr (sizeof(_Tp) == 1)
 482           {
 483             const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
 484             int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
 485             return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
 486                              int8x8_t())[0];
 487           }
 488         else if constexpr (sizeof(_Tp) == 2)
 489           {
 490             const auto __s16 = __vector_bitcast<short>(__k._M_data);
 491             int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
 492             return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
 493           }
 494         else if constexpr (sizeof(_Tp) == 4)
 495           {
 496             const auto __s32 = __vector_bitcast<int>(__k._M_data);
 497             int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
 498             return -vpadd_s32(__tmp, int32x2_t())[0];
 499           }
 500         else if constexpr (sizeof(_Tp) == 8)
 501           {
 502             static_assert(sizeof(__k) == 16);
 503             const auto __s64 = __vector_bitcast<long>(__k._M_data);
 504             return -(__s64[0] + __s64[1]);
 505           }
 506       }
 507
 508     // }}}
 509     // _S_find_first_set {{{
 510     template <typename _Tp>
 511       _GLIBCXX_SIMD_INTRINSIC static int
 512       _S_find_first_set(simd_mask<_Tp, _Abi> __k)
 513       {
 514         // TODO: the _Base implementation is not optimal for NEON
 515         return _Base::_S_find_first_set(__k);
 516       }
 517
 518     // }}}
 519     // _S_find_last_set {{{
 520     template <typename _Tp>
 521       _GLIBCXX_SIMD_INTRINSIC static int
 522       _S_find_last_set(simd_mask<_Tp, _Abi> __k)
 523       {
 524         // TODO: the _Base implementation is not optimal for NEON
 525         return _Base::_S_find_last_set(__k);
 526       }
 527
 528     // }}}
 529   }; // }}}
 530
 531 _GLIBCXX_SIMD_END_NAMESPACE
 532 #endif // __cplusplus >= 201703L
 533 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
 534 // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80