// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;
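    // (16 is the size of a NEON Q register in bytes; the _Base
    // implementation uses this limit to split larger stores into
    // register-sized pieces.)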

    // _S_masked_load {{{
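    // NEON has no masked-load instruction, so the load is done element by
    // element, overwriting __merge only where the mask __k is set.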
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
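    // Strategy: a 16-byte register is reduced to 8 bytes by splitting it
    // into two halves and combining them with __binary_op; the 8-byte case
    // then finishes in log2(_Np) permute-and-combine steps. Worked example
    // (illustrative sketch only) for _Np == 4 with plus<>:
    //   step 1: {a,b,c,d} + permute<1,0,3,2> = {a+b, a+b, c+d, c+d}
    //   step 2: ... + permute<3,2,1,0>       = a+b+c+d in every lane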
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                      && !_Abi::template _S_is_partial<_Tp>)
          {
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                  __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                  __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                  __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
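    // The vsqrt intrinsics exist only in the A64 instruction set; on 32-bit
    // ARM the generic _Base fallback is used.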
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
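    // vrnd (round toward zero) requires ARMv8 (A32). The pre-ARMv8 float
    // fallback truncates via a round trip through int32. That conversion is
    // valid only for |x| < 2^23: every float of magnitude >= 2^23 is already
    // integral (and could overflow int32), so those lanes keep __x unchanged.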
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
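    // vrnda rounds to nearest with ties away from zero, the semantics
    // required for std::round.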
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
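    // vrndm rounds toward negative infinity, i.e. floor semantics.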
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
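    // vrndp rounds toward positive infinity, i.e. ceil semantics.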
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
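
      // Conversion idea: every true mask lane is all-ones, so ANDing with
      // __bitsel (lane __i holds 1 << __i, wrapping at bit 8 for 8-bit
      // lanes) leaves exactly the lane's own bit set. A chain of pairwise
      // adds (vpadd) then accumulates all lanes into lane 0, assembling the
      // final bitmask.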
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                  [&](auto __i) {
                    return static_cast<_I>(
                      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero),
                                           __zero),
                                __zero)[0];
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
              return vpadd_s32(__asint, __zero)[0];
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
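    // ORing with the complement of the implicit mask forces all padding
    // lanes to all-ones, so only active lanes can fail the test. Since mask
    // bytes are 0x00 or 0xff, both 64-bit halves equal -1 exactly when
    // their sum is -2.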
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
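    // Trick below: interpreting the (full) mask as one unsigned integer,
    // "none true" is 0 and "all true" is ~0; adding 1 maps these to 1 and 0
    // respectively, so only mixed patterns compare greater than 1.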
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                              | ~__vector_bitcast<char>(
                                _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
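    // True lanes read as -1 in signed integer lanes, so the popcount is the
    // negated sum over all lanes: fold the high 64 bits onto the low 64
    // (__hi64z yields a zero vector when the mask has no high half), then
    // pairwise-add down to a single lane.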
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80