${std_srcdir}/semaphore \
${std_srcdir}/set \
${std_srcdir}/shared_mutex \
+ ${std_srcdir}/simd \
${std_srcdir}/spanstream \
${std_srcdir}/sstream \
${std_srcdir}/syncstream \
${bits_srcdir}/shared_ptr.h \
${bits_srcdir}/shared_ptr_atomic.h \
${bits_srcdir}/shared_ptr_base.h \
+ ${bits_srcdir}/simd_alg.h \
+ ${bits_srcdir}/simd_details.h \
+ ${bits_srcdir}/simd_flags.h \
+ ${bits_srcdir}/simd_iterator.h \
+ ${bits_srcdir}/simd_loadstore.h \
+ ${bits_srcdir}/simd_mask.h \
+ ${bits_srcdir}/simd_mask_reductions.h \
+ ${bits_srcdir}/simd_reductions.h \
+ ${bits_srcdir}/simd_vec.h \
+ ${bits_srcdir}/simd_x86.h \
${bits_srcdir}/slice_array.h \
${bits_srcdir}/specfun.h \
${bits_srcdir}/sstream.tcc \
${bits_srcdir}/valarray_array.tcc \
${bits_srcdir}/valarray_before.h \
${bits_srcdir}/valarray_after.h \
+ ${bits_srcdir}/vec_ops.h \
${bits_srcdir}/vector.tcc
endif GLIBCXX_HOSTED
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/semaphore \
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/set \
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/shared_mutex \
+@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/simd \
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/spanstream \
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/sstream \
@GLIBCXX_HOSTED_TRUE@ ${std_srcdir}/syncstream \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/shared_ptr.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/shared_ptr_atomic.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/shared_ptr_base.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_alg.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_details.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_flags.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_iterator.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_loadstore.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_mask.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_mask_reductions.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_reductions.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_vec.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/simd_x86.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/slice_array.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/specfun.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/sstream.tcc \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/valarray_array.tcc \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/valarray_before.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/valarray_after.h \
+@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/vec_ops.h \
@GLIBCXX_HOSTED_TRUE@ ${bits_srcdir}/vector.tcc
bits_host_headers = \
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_ALG_H
+#define _GLIBCXX_SIMD_ALG_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_vec.h"
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// [simd.alg] -----------------------------------------------------------------
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template<typename _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec<_Tp, _Ap>
+ min(const basic_vec<_Tp, _Ap>& __a, const basic_vec<_Tp, _Ap>& __b) noexcept
+ { return __select_impl(__a < __b, __a, __b); }
+
+ template<typename _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec<_Tp, _Ap>
+ max(const basic_vec<_Tp, _Ap>& __a, const basic_vec<_Tp, _Ap>& __b) noexcept
+ { return __select_impl(__a < __b, __b, __a); }
+
+ template<typename _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr pair<basic_vec<_Tp, _Ap>, basic_vec<_Tp, _Ap>>
+ minmax(const basic_vec<_Tp, _Ap>& __a, const basic_vec<_Tp, _Ap>& __b) noexcept
+ { return {min(__a, __b), max(__a, __b)}; }
+
+ template<typename _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec<_Tp, _Ap>
+ clamp(const basic_vec<_Tp, _Ap>& __v, const basic_vec<_Tp, _Ap>& __lo,
+ const basic_vec<_Tp, _Ap>& __hi)
+ {
+ __glibcxx_simd_precondition(none_of(__lo > __hi), "lower bound is larger than upper bound");
+ return max(__lo, min(__hi, __v));
+ }
+
+ template<typename _Tp, typename _Up>
+ constexpr auto
+ select(bool __c, const _Tp& __a, const _Up& __b)
+ -> remove_cvref_t<decltype(__c ? __a : __b)>
+ { return __c ? __a : __b; }
+
+ template<size_t _Bytes, typename _Ap, typename _Tp, typename _Up>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ select(const basic_mask<_Bytes, _Ap>& __c, const _Tp& __a, const _Up& __b)
+ noexcept -> decltype(__select_impl(__c, __a, __b))
+ { return __select_impl(__c, __a, __b); }
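+
+  // Illustrative usage (a sketch only; __x and __y stand for hypothetical
+  // vec<int> values):
+  //   auto __lo  = simd::min(__x, __y);                    // element-wise minimum
+  //   auto __cl  = simd::clamp(__x, vec<int>(0), vec<int>(9));
+  //   auto __sel = simd::select(__x < __y, __x, __y);      // equivalent to min(__x, __y)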
+} // namespace simd
+
+ using simd::min;
+ using simd::max;
+ using simd::minmax;
+ using simd::clamp;
+
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_ALG_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_DETAILS_H
+#define _GLIBCXX_SIMD_DETAILS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include <bit>
+#include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32
+#include <bits/stl_function.h> // plus, minus, multiplies, ...
+#include <bits/utility.h> // integer_sequence, etc.
+#include <cmath> // for math_errhandling :(
+#include <concepts>
+#include <cstdint>
+#include <limits>
+#include <span> // for dynamic_extent
+
+#if __CHAR_BIT__ != 8
+// There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.
+// Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).
+#error "<simd> is not supported for CHAR_BIT != 8"
+#endif
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+#if defined __x86_64__ || defined __i386__
+#define _GLIBCXX_X86 1
+#else
+#define _GLIBCXX_X86 0
+#endif
+
+#ifndef _GLIBCXX_SIMD_NOEXCEPT
+/** @internal
+ * For unit-testing preconditions, use this macro to remove noexcept.
+ */
+#define _GLIBCXX_SIMD_NOEXCEPT noexcept
+#endif
+
+#define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
+#define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)
+
+// This is used for unit-testing precondition checking
+#define __glibcxx_simd_precondition(expr, msg, ...) \
+ __glibcxx_assert(expr)
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+
+namespace simd
+{
+ template <typename _Tp>
+ inline constexpr _Tp
+ __iota = [] { static_assert(false, "invalid __iota specialization"); }();
+
+ // [simd.general] vectorizable types
+
+ template <typename _Tp>
+ concept __vectorizable_scalar
+ = same_as<remove_cv_t<_Tp>, _Tp>
+#ifdef __STDCPP_BFLOAT16_T__
+ && !same_as<_Tp, __gnu_cxx::__bfloat16_t>
+#endif
+ && ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>)
+ || (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double)));
+
+ // [simd.general] p2
+ template <typename _Tp>
+ concept __vectorizable = __vectorizable_scalar<_Tp>;
+
+ /** @internal
+ * Describes variants of _Abi.
+ */
+ enum class _AbiVariant : unsigned long long
+ {
+ _BitMask = 0x01, // AVX512 bit-masks
+ _MaskVariants = 0x0f, // vector masks if bits [0:3] are 0
+ };
+
+ /** @internal
+ * Return @p __in with only bits set that are set in any of @p __to_keep.
+ */
+ consteval _AbiVariant
+ __filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep)
+ {
+ using _Up = underlying_type_t<_AbiVariant>;
+ return static_cast<_AbiVariant>(static_cast<_Up>(__in) & (static_cast<_Up>(__to_keep) | ...));
+ }
+
+ /** @internal
+ * Type used whenever no valid integer/value type exists.
+ */
+ struct _InvalidInteger
+ {};
+
+ /** @internal
+ * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ template <size_t _Bytes>
+ using __integer_from
+ = decltype([] consteval {
+ if constexpr (sizeof(signed char) == _Bytes)
+ return static_cast<signed char>(0);
+ else if constexpr (sizeof(signed short) == _Bytes)
+ return static_cast<signed short>(0);
+ else if constexpr (sizeof(signed int) == _Bytes)
+ return static_cast<signed int>(0);
+ else if constexpr (sizeof(signed long long) == _Bytes)
+ return static_cast<signed long long>(0);
+ else
+ return _InvalidInteger();
+ }());
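+
+  // For illustration (assuming a typical LP64 target): __integer_from<4> is
+  // 'signed int', __integer_from<8> is 'signed long long', and a width with no
+  // matching integer type (e.g. __integer_from<16>) yields _InvalidInteger.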
+
+ /** @internal
+ * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
+ */
+ template <size_t _Bytes>
+ using _UInt = make_unsigned_t<__integer_from<_Bytes>>;
+
+ /** @internal
+ * Divide @p __x by @p __y while rounding up instead of down.
+ *
+ * Preconditions: __x >= 0 && __y > 0.
+ */
+ template <typename _Tp>
+ consteval _Tp
+ __div_ceil(_Tp __x, _Tp __y)
+ { return (__x + __y - 1) / __y; }
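+
+  // Example: __div_ceil(10, 4) == 3, whereas 10 / 4 == 2.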
+
+ /** @internal
+ * Alias for an unsigned integer type that can store at least @p _NBits bits.
+ */
+ template <int _NBits>
+ requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits)
+ using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;
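+
+  // For illustration (assuming 16-bit short): _Bitmask<5> is 'unsigned char'
+  // and _Bitmask<12> is 'unsigned short'.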
+
+ /** @internal
+ * Map a given type @p _Tp to an equivalent type.
+ *
+   * This helps with reducing the necessary branches and casts in the implementation as well as
+ * reducing the number of template instantiations.
+ */
+ template <typename _Tp>
+ struct __canonical_vec_type
+ { using type = _Tp; };
+
+ template <typename _Tp>
+ using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
+
+#if __SIZEOF_INT__ == __SIZEOF_LONG__
+ template <>
+ struct __canonical_vec_type<long>
+ { using type = int; };
+
+ template <>
+ struct __canonical_vec_type<unsigned long>
+ { using type = unsigned int; };
+#elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__
+ template <>
+ struct __canonical_vec_type<long>
+ { using type = long long; };
+
+ template <>
+ struct __canonical_vec_type<unsigned long>
+ { using type = unsigned long long; };
+#endif
+
+ template <typename _Tp>
+ requires std::is_enum_v<_Tp>
+ struct __canonical_vec_type<_Tp>
+ { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };
+
+ template <>
+ struct __canonical_vec_type<char>
+#if __CHAR_UNSIGNED__
+ { using type = unsigned char; };
+#else
+ { using type = signed char; };
+#endif
+
+ template <>
+ struct __canonical_vec_type<char8_t>
+ { using type = unsigned char; };
+
+ template <>
+ struct __canonical_vec_type<char16_t>
+ { using type = uint_least16_t; };
+
+ template <>
+ struct __canonical_vec_type<char32_t>
+ { using type = uint_least32_t; };
+
+ template <>
+ struct __canonical_vec_type<wchar_t>
+ {
+ using type = std::__conditional_t<std::is_signed_v<wchar_t>,
+ simd::__integer_from<sizeof(wchar_t)>,
+ simd::_UInt<sizeof(wchar_t)>>;
+ };
+
+#if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
+ template <>
+ struct __canonical_vec_type<_Float64>
+ { using type = double; };
+#endif
+
+#if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
+ template <>
+ struct __canonical_vec_type<_Float32>
+ { using type = float; };
+#endif
+
+ /** @internal
+ * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
+   * objects that store one bool data member per element.
+ *
+ * @tparam _Np The number of elements, which also matches the number of data members in
+ * basic_vec and basic_mask.
+ */
+ template <int _Np = 1>
+ struct _ScalarAbi
+ {
+ static constexpr int _S_size = _Np;
+
+ static constexpr int _S_nreg = _Np;
+
+ static constexpr _AbiVariant _S_variant = {};
+
+ template <typename _Tp>
+ using _DataType = __canonical_vec_type_t<_Tp>;
+
+ static constexpr bool _S_is_vecmask = false;
+
+ // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask
+ static constexpr bool _S_is_bitmask = false;
+
+ template <size_t>
+ using _MaskDataType = bool;
+
+ template <int _N2, int _Nreg2 = _N2>
+ static consteval _ScalarAbi<_N2>
+ _S_resize()
+ {
+ static_assert(_N2 == _Nreg2);
+ return {};
+ }
+ };
+
+ /** @internal
+ * This ABI tag describes basic_vec objects that store one or more objects declared with the
+ * [[gnu::vector_size(N)]] attribute.
+ * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
+ * or bit-mask objects. Which one is used is determined via @p _Var.
+ *
+ * @tparam _Np The number of elements.
+ * @tparam _Nreg The number of registers needed to store @p _Np elements.
+   * @tparam _Var Determines how complex value-types are laid out and whether mask types use
+ * bit-masks or vector-masks.
+ */
+ template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var>
+ struct _Abi
+ {
+ static constexpr int _S_size = _Np;
+
+ /** @internal
+ * The number of registers needed to represent one basic_vec for the element type that was
+     * used for ABI deduction.
+ *
+ * Examples:
+ * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
+ * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
+ * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
+ * - '_Abi<10, 1>' for 'int' is 1x 512-bit
+ * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
+ */
+ static constexpr int _S_nreg = _Nreg;
+
+ static_assert(_S_size > 0);
+ static_assert(_S_nreg > 0);
+
+ static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);
+
+ static constexpr bool _S_is_bitmask
+ = __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask;
+
+ static constexpr bool _S_is_vecmask = !_S_is_bitmask;
+
+ template <typename _Tp>
+ using _DataType = decltype([] {
+ static_assert(_S_nreg == 1);
+ if constexpr (_S_size == 1)
+ return __canonical_vec_type_t<_Tp>();
+ else
+ {
+ constexpr int __n = __bit_ceil(unsigned(_S_size));
+ using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
+ = __canonical_vec_type_t<_Tp>;
+ return _Vp();
+ }
+ }());
+
+ template <size_t _Bytes>
+ using _MaskDataType
+ = decltype([] {
+ static_assert(_S_nreg == 1);
+ if constexpr (_S_size == 1)
+ return bool();
+ else if constexpr (_S_is_vecmask)
+ {
+ constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
+ using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
+ return _Vp();
+ }
+ else if constexpr (_Nreg > 1)
+ return _InvalidInteger();
+ else
+ return _Bitmask<_S_size>();
+ }());
+
+ template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
+ static consteval auto
+ _S_resize()
+ {
+ if constexpr (_N2 == 1)
+ return _Abi<1, 1, _Var>();
+ else
+ return _Abi<_N2, _Nreg2, _Var>();
+ }
+ };
+
+ /** @internal
+ * Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer
+ * value.
+ *
+ * Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an
+ * alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the
+ * template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int,
+ * std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot
+ * longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As
+ * soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans
+ * anyway.
+ */
+ template <int _Np, int _Nreg, _AbiVariant... _Vs>
+ using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>;
+
+ /** @internal
+ * This type is used whenever ABI tag deduction can't give a useful answer.
+ */
+ struct _InvalidAbi
+ { static constexpr int _S_size = 0; };
+
+ /** @internal
+ * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
+ * for an enabled basic_vec/basic_mask specialization.
+ */
+ template <typename _Tp>
+ concept __abi_tag
+ = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
+ && (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
+ && requires(_Tp __x) {
+ { __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
+ };
+
+ template <typename _Tp>
+ concept __scalar_abi_tag
+ = same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;
+
+ // Determine if math functions must *raise* floating-point exceptions.
+ // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
+ // need to be considered. A conforming C library must define math_errhandling, but in case it
+ // isn't defined we simply use the fallback.
+#ifdef math_errhandling
+ template <int = 0>
+ requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
+ consteval bool
+ __handle_fpexcept_impl(int)
+ { return 0 != (math_errhandling & MATH_ERREXCEPT); }
+#endif
+
+ // Fallback if math_errhandling doesn't work: implement correct exception behavior.
+ consteval bool
+ __handle_fpexcept_impl(float)
+ { return true; }
+
+ /** @internal
+ * This type can be used as a template parameter for avoiding ODR violations, where code needs to
+ * differ depending on optimization flags (mostly fp-math related).
+ */
+ struct _OptTraits
+ {
+ consteval bool
+ _M_test(int __bit) const
+ { return ((_M_build_flags >> __bit) & 1) == 1; }
+
+ // true iff floating-point operations can signal an exception (allow non-default handler)
+ consteval bool
+ _M_fp_may_signal() const
+ { return _M_test(0); }
+
+ // true iff floating-point operations can raise an exception flag
+ consteval bool
+ _M_fp_may_raise() const
+ { return _M_test(12); }
+
+ consteval bool
+ _M_fast_math() const
+ { return _M_test(1); }
+
+ consteval bool
+ _M_finite_math_only() const
+ { return _M_test(2); }
+
+ consteval bool
+ _M_no_signed_zeros() const
+ { return _M_test(3); }
+
+ consteval bool
+ _M_signed_zeros() const
+ { return !_M_test(3); }
+
+ consteval bool
+ _M_reciprocal_math() const
+ { return _M_test(4); }
+
+ consteval bool
+ _M_no_math_errno() const
+ { return _M_test(5); }
+
+ consteval bool
+ _M_math_errno() const
+ { return !_M_test(5); }
+
+ consteval bool
+ _M_associative_math() const
+ { return _M_test(6); }
+
+ consteval bool
+ _M_conforming_to_STDC_annex_G() const
+ { return _M_test(10) && !_M_finite_math_only(); }
+
+ consteval bool
+ _M_support_snan() const
+ { return _M_test(11); }
+
+ __UINT64_TYPE__ _M_build_flags
+ = 0
+#if !__NO_TRAPPING_MATH__
+ + (1 << 0)
+#endif
+ + (__handle_fpexcept_impl(0) << 12)
+#if __FAST_MATH__
+ + (1 << 1)
+#endif
+#if __FINITE_MATH_ONLY__
+ + (1 << 2)
+#endif
+#if __NO_SIGNED_ZEROS__
+ + (1 << 3)
+#endif
+#if __RECIPROCAL_MATH__
+ + (1 << 4)
+#endif
+#if __NO_MATH_ERRNO__
+ + (1 << 5)
+#endif
+#if __ASSOCIATIVE_MATH__
+ + (1 << 6)
+#endif
+ // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
+#if __FLT_EVAL_METHOD__ == 1
+ + (1 << 7)
+#elif __FLT_EVAL_METHOD__ == 2
+ + (2 << 7)
+#elif __FLT_EVAL_METHOD__ != 0
+ + (3 << 7)
+#endif
+
+ // C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
+ // __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
+ // will do so as well. However, Clang never defines the macro.
+#if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG
+ + (1 << 10)
+#endif
+#if __SUPPORT_SNAN__
+ + (1 << 11)
+#endif
+ ;
+ };
+
+ /** @internal
+ * Return true iff @p __s equals "1".
+ */
+ consteval bool
+ __streq_to_1(const char* __s)
+ { return __s != nullptr && __s[0] == '1' && __s[1] == '\0'; }
+
+ /** @internal
+ * If the macro given as @p feat is defined to 1, expands to a bit set at position @p off.
+ * Otherwise, expand to zero.
+ */
+#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat) \
+ (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) << off)
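+
+  // Illustrative expansion: if the compiler predefines __AVX2__ to 1,
+  // _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__) stringifies the expanded value to "1"
+  // and contributes 1ull << 13; if __AVX2__ is not defined, the stringified
+  // token is not "1" and the flag contributes 0.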
+
+#if _GLIBCXX_X86
+
+#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT { \
+ _GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__) \
+ }
+ // Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
+ // no ODR issue? The same could be said about several other flags above that are not checked
+ // anywhere.
+
+ struct _ArchTraits
+ {
+ __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;
+
+ consteval bool
+ _M_test(int __bit) const
+ { return ((_M_flags >> __bit) & 1) == 1; }
+
+ consteval bool
+ _M_have_mmx() const
+ { return _M_test(0); }
+
+ consteval bool
+ _M_have_sse() const
+ { return _M_test(1); }
+
+ consteval bool
+ _M_have_sse2() const
+ { return _M_test(2); }
+
+ consteval bool
+ _M_have_sse3() const
+ { return _M_test(3); }
+
+ consteval bool
+ _M_have_ssse3() const
+ { return _M_test(4); }
+
+ consteval bool
+ _M_have_sse4_1() const
+ { return _M_test(5); }
+
+ consteval bool
+ _M_have_sse4_2() const
+ { return _M_test(6); }
+
+ consteval bool
+ _M_have_popcnt() const
+ { return _M_test(7); }
+
+ consteval bool
+ _M_have_avx() const
+ { return _M_test(8); }
+
+ consteval bool
+ _M_have_f16c() const
+ { return _M_test(9); }
+
+ consteval bool
+ _M_have_bmi() const
+ { return _M_test(10); }
+
+ consteval bool
+ _M_have_bmi2() const
+ { return _M_test(11); }
+
+ consteval bool
+ _M_have_lzcnt() const
+ { return _M_test(12); }
+
+ consteval bool
+ _M_have_avx2() const
+ { return _M_test(13); }
+
+ consteval bool
+ _M_have_fma() const
+ { return _M_test(14); }
+
+ consteval bool
+ _M_have_avx512f() const
+ { return _M_test(15); }
+
+ consteval bool
+ _M_have_avx512cd() const
+ { return _M_test(16); }
+
+ consteval bool
+ _M_have_avx512dq() const
+ { return _M_test(17); }
+
+ consteval bool
+ _M_have_avx512bw() const
+ { return _M_test(18); }
+
+ consteval bool
+ _M_have_avx512vl() const
+ { return _M_test(19); }
+
+ consteval bool
+ _M_have_avx512bitalg() const
+ { return _M_test(20); }
+
+ consteval bool
+ _M_have_avx512vbmi() const
+ { return _M_test(21); }
+
+ consteval bool
+ _M_have_avx512vbmi2() const
+ { return _M_test(22); }
+
+ consteval bool
+ _M_have_avx512ifma() const
+ { return _M_test(23); }
+
+ consteval bool
+ _M_have_avx512vnni() const
+ { return _M_test(24); }
+
+ consteval bool
+ _M_have_avx512vpopcntdq() const
+ { return _M_test(25); }
+
+ consteval bool
+ _M_have_avx512fp16() const
+ { return _M_test(26); }
+
+ consteval bool
+ _M_have_avx512bf16() const
+ { return _M_test(27); }
+
+ consteval bool
+ _M_have_avxifma() const
+ { return _M_test(28); }
+
+ consteval bool
+ _M_have_avxneconvert() const
+ { return _M_test(29); }
+
+ consteval bool
+ _M_have_avxvnni() const
+ { return _M_test(30); }
+
+ consteval bool
+ _M_have_avxvnniint8() const
+ { return _M_test(31); }
+
+ consteval bool
+ _M_have_avxvnniint16() const
+ { return _M_test(32); }
+
+ consteval bool
+ _M_have_avx10_1() const
+ { return _M_test(33); }
+
+ consteval bool
+ _M_have_avx10_2() const
+ { return _M_test(34); }
+
+ consteval bool
+ _M_have_avx512vp2intersect() const
+ { return _M_test(35); }
+
+ consteval bool
+ _M_have_sse4a() const
+ { return _M_test(36); }
+
+ consteval bool
+ _M_have_fma4() const
+ { return _M_test(37); }
+
+ consteval bool
+ _M_have_xop() const
+ { return _M_test(38); }
+
+ template <typename _Tp>
+ consteval bool
+ _M_eval_as_f32() const
+ { return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); }
+ };
+
+ template <typename _Tp, _ArchTraits _Traits = {}>
+ consteval auto
+ __native_abi()
+ {
+ constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
+ if constexpr (!__vectorizable<_Tp>)
+ return _InvalidAbi();
+ else if constexpr (_Traits._M_have_avx512fp16())
+ return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
+ else if constexpr (_Traits._M_have_avx512f())
+ return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
+ else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c())
+ return _ScalarAbi<1>();
+ else if constexpr (_Traits._M_have_avx2())
+ return _Abi_t<32 / __adj_sizeof, 1>();
+ else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>)
+ return _Abi_t<32 / __adj_sizeof, 1>();
+ else if constexpr (_Traits._M_have_sse2())
+ return _Abi_t<16 / __adj_sizeof, 1>();
+ else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp>
+ && sizeof(_Tp) == sizeof(float))
+ return _Abi_t<16 / __adj_sizeof, 1>();
+ // no MMX: we can't emit EMMS where it would be necessary
+ else
+ return _ScalarAbi<1>();
+ }
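+
+  // For illustration (not exhaustive): when compiled with -mavx2 and no AVX-512,
+  // __native_abi<int>() yields _Abi<8, 1, 0> (8 ints in one 256-bit register);
+  // with -mavx512f it yields _Abi_t<16, 1, _AbiVariant::_BitMask> instead.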
+
+#else
+
+ // scalar fallback
+ struct _ArchTraits
+ {
+ __UINT64_TYPE__ _M_flags = 0;
+
+ constexpr bool
+ _M_test(int __bit) const
+ { return ((_M_flags >> __bit) & 1) == 1; }
+ };
+
+ template <typename _Tp>
+ consteval auto
+ __native_abi()
+ {
+ if constexpr (!__vectorizable<_Tp>)
+ return _InvalidAbi();
+ else
+ return _ScalarAbi<1>();
+ }
+
+#endif
+
+ /** @internal
+ * You must use this type as template argument to function templates that are not declared
+ * always_inline (to avoid issues when linking code compiled with different compiler flags).
+ */
+ struct _TargetTraits
+ : _ArchTraits, _OptTraits
+ {};
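+
+  // A sketch of the intended use (the helper name below is hypothetical):
+  //   template <typename _Tp, _TargetTraits _Target = {}>
+  //     _Tp __some_out_of_line_helper(_Tp);
+  // Passing the traits as a non-type template argument bakes the active
+  // target/optimization flags into the mangled name, so translation units built
+  // with different flags get distinct instantiations instead of an ODR conflict.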
+
+ /** @internal
+   * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
+ * optimal width.
+ *
+ * @tparam _Tp A vectorizable type.
+ *
+ * C++26 [simd.expos.abi]
+ */
+ template <typename _Tp>
+ using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());
+
+ template <typename _Tp, int _Np, _TargetTraits _Target = {}>
+ consteval auto
+ __deduce_abi()
+ {
+ constexpr auto __native = std::simd::__native_abi<_Tp>();
+ if constexpr (0 == __native._S_size || _Np <= 0)
+ return _InvalidAbi();
+ else if constexpr (_Np == __native._S_size)
+ return __native;
+ else
+ return __native.template _S_resize<_Np>();
+ }
+
+ /** @internal
+ * Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
+ *
+ * C++26 [simd.expos.abi]
+ */
+ template <typename _Tp, int _Np>
+ using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());
+
+ /** @internal
+   * @c rebind implementation detail for basic_vec, and for basic_mask when the destination
+   * value-type is known.
+ */
+ template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
+ consteval auto
+ __abi_rebind()
+ {
+ if constexpr (_Np <= 0 || !__vectorizable<_Tp>)
+ return _InvalidAbi();
+
+ else if constexpr (__scalar_abi_tag<_A0>)
+ return _A0::template _S_resize<_Np>();
+
+ else
+ {
+ using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>;
+ static_assert(0 != _Native::_S_size);
+ constexpr int __nreg = __div_ceil(_Np, _Native::_S_size);
+
+ if constexpr (__scalar_abi_tag<_Native>)
+ return _Native::template _S_resize<_Np>();
+ else
+ return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant,
+ _AbiVariant::_MaskVariants)
+ >::template _S_resize<_Np, __nreg>();
+ }
+ }
+
+ /** @internal
+ * @c rebind implementation detail for basic_mask.
+ *
+ * The important difference here is that we have no information about the actual value-type other
+ * than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t.
+ * E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x
+ * vector(2) long long`.
+ * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
+ * value-type doesn't change.
+ */
+ template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
+ consteval auto
+ __abi_rebind()
+ {
+ if constexpr (_Bytes == 0 || _Np <= 0)
+ return _InvalidAbi();
+
+ else if constexpr (__scalar_abi_tag<_A0>)
+ return _A0::template _S_resize<_Np>();
+
+#if _GLIBCXX_X86
+ // AVX w/o AVX2:
+ // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
+ // We determine whether _A0 identifies an AVX vector by looking at the size of a native
+ // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
+ else if constexpr (_IsOnlyResize
+ && _Traits._M_have_avx() && !_Traits._M_have_avx2()
+ && __bit_ceil(__div_ceil<unsigned>(
+ _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
+ {
+ if constexpr (_Bytes == sizeof(double))
+ return __abi_rebind<double, _Np, _A0>();
+ else if constexpr (_Bytes == sizeof(float))
+ return __abi_rebind<float, _Np, _A0>();
+ else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16))
+ return __abi_rebind<_Float16, _Np, _A0>();
+ else // impossible
+ static_assert(false);
+ }
+#endif
+
+ else
+ return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
+ }
+
+ /** @internal
+ * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
+ *
+ * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
+ * compile on basically every other target, though. This is due to the difference in ABI tag:
+ * _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers).
+   * I know how to define this function for libstdc++ to avoid interconvertible masks. The question
+ * is whether we can specify this in general for C++29.
+ *
+ * Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as
+ * deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this
+ * makes the conversion behavior dependent on compiler flags. Probably not what we want.
+ */
+ template <typename _To, typename _From>
+ consteval bool
+ __is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1)
+ {
+ constexpr int __n = _To::_S_size;
+ static_assert(__n == _From::_S_size);
+#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
+ /// C++26 [simd.mask.ctor] uses unconditional explicit
+ return true;
+#else
+ if (__b0 != __b1)
+ return true;
+
+ // everything is better than _ScalarAbi, except when converting to a single bool
+ if constexpr (__scalar_abi_tag<_To>)
+ return __n > 1;
+ else if constexpr (__scalar_abi_tag<_From>)
+ return true;
+
+ // converting to a bit-mask is better
+ else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask)
+ return _To::_S_is_vecmask; // to vector-mask is explicit
+
+ // with vec-masks, fewer registers is better
+ else if constexpr (_From::_S_nreg != _To::_S_nreg)
+ return _From::_S_nreg < _To::_S_nreg;
+
+ else
+ __builtin_unreachable();
+#endif
+ }
+
+ /** @internal
+ * An alias for a signed integer type.
+ *
+ * libstdc++ unconditionally uses @c int here, since it matches the return type of
+ * 'Bit Operation Builtins' in GCC.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ using __simd_size_type = int;
+
+ // integral_constant shortcut
+ template <__simd_size_type _Xp>
+ inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c = {};
+
+ // [simd.syn]
+ template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>
+ class basic_vec;
+
+ template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
+ using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
+
+ template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>
+ class basic_mask;
+
+ template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
+ using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
+
+ // [simd.ctor] load constructor constraints
+ template <typename _Tp, size_t _Np = -1uz>
+ concept __static_sized_range
+ = ranges::sized_range<_Tp> && requires(_Tp&& __r) {
+ typename integral_constant<size_t, ranges::size(__r)>;
+ requires (_Np == -1uz || ranges::size(__r) == _Np);
+ };
+
+ template <typename _Rg>
+ consteval size_t
+ __static_range_size(_Rg& __r)
+ {
+ if constexpr (requires { typename integral_constant<size_t, ranges::size(__r)>; })
+ return ranges::size(__r);
+ else
+ return dynamic_extent;
+ }
+
+ // [simd.general] value-preserving
+ template <typename _From, typename _To>
+ concept __arithmetic_only_value_preserving_convertible_to
+ = convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>
+ && !(is_signed_v<_From> && is_unsigned_v<_To>)
+ && numeric_limits<_From>::digits <= numeric_limits<_To>::digits
+ && numeric_limits<_From>::max() <= numeric_limits<_To>::max()
+ && numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();
+
+ /** @internal
+ * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
+ *
+ * C++26 [simd.general]
+ */
+ template <typename _From, typename _To>
+ concept __value_preserving_convertible_to
+ = __arithmetic_only_value_preserving_convertible_to<_From, _To>;
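+
+  // For illustration: 'unsigned char' -> 'int' and 'float' -> 'double' are
+  // value-preserving; 'int' -> 'unsigned int' (sign change) and
+  // 'double' -> 'float' (loses digits) are not.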
+
+ // LWG4420
+ template <typename _From, typename _To>
+ concept __explicitly_convertible_to = requires {
+ static_cast<_To>(declval<_From>());
+ };
+
+ /** @internal
+ * C++26 [simd.expos]
+ */
+ template<typename _Tp>
+ concept __constexpr_wrapper_like
+ = convertible_to<_Tp, decltype(_Tp::value)>
+ && equality_comparable_with<_Tp, decltype(_Tp::value)>
+ && bool_constant<_Tp() == _Tp::value>::value
+ && bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;
+
+ // [simd.ctor] explicit(...) of broadcast ctor
+ template <auto _From, typename _To>
+ concept __non_narrowing_constexpr_conversion
+ = is_arithmetic_v<decltype(_From)>
+ && static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From
+ && !(unsigned_integral<_To> && _From < decltype(_From)())
+ && _From <= std::numeric_limits<_To>::max()
+ && _From >= std::numeric_limits<_To>::lowest();
+
+ // [simd.ctor] p4
+ // This implements LWG4436 (submitted on 2025-10-28)
+ template <typename _From, typename _To>
+ concept __broadcast_constructible
+ = ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>>
+ && !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
+ || __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
+ || (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3
+ && __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value),
+ _To>));
+
+ // __higher_floating_point_rank_than<_Tp, U> (_Tp has higher or equal floating point rank than U)
+ template <typename _From, typename _To>
+ consteval bool
+ __higher_floating_point_rank_than()
+ {
+ return floating_point<_From> && floating_point<_To>
+ && is_same_v<common_type_t<_From, _To>, _From> && !is_same_v<_From, _To>;
+ }
+
+ // __higher_integer_rank_than<_Tp, U> (_Tp has higher or equal integer rank than U)
+ template <typename _From, typename _To>
+ consteval bool
+ __higher_integer_rank_than()
+ {
+ return integral<_From> && integral<_To>
+ && (sizeof(_From) > sizeof(_To) || is_same_v<common_type_t<_From, _To>, _From>)
+ && !is_same_v<_From, _To>;
+ }
+
+ template <typename _From, typename _To>
+ concept __higher_rank_than
+ = __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>();
+
+ struct __convert_flag;
+
+ template <typename _From, typename _To, typename... _Flags>
+ concept __loadstore_convertible_to
+ = same_as<_From, _To>
+ || (__vectorizable<_From> && __vectorizable<_To>
+ && (__value_preserving_convertible_to<_From, _To>
+ || (__explicitly_convertible_to<_From, _To>
+ && (std::is_same_v<_Flags, __convert_flag> || ...))));
+
+ template <typename _From, typename _To>
+ concept __simd_generator_convertible_to
+ = std::convertible_to<_From, _To>
+ && (!is_arithmetic_v<_From> || __value_preserving_convertible_to<_From, _To>);
+
+ template <typename _Fp, typename _Tp, __simd_size_type... _Is>
+ requires (__simd_generator_convertible_to<
+ decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...)
+ constexpr void
+ __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
+
+ template <typename _Fp, typename _Tp, __simd_size_type _Np>
+ concept __simd_generator_invokable = requires {
+ __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
+ };
+
+ template <typename _Fp>
+ concept __index_permutation_function_sized = requires(_Fp const& __f)
+ {
+ { __f(0, 0) } -> std::integral;
+ };
+
+ template <typename _Fp, typename _Simd>
+ concept __index_permutation_function
+ = __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) {
+ { __f(0) } -> std::integral;
+ };
+
+ /** @internal
+ * The value of the @c _Bytes template argument to a @c basic_mask specialization.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ template <typename _Tp>
+ constexpr size_t __mask_element_size = 0;
+
+ template <size_t _Bytes, __abi_tag _Ap>
+ constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes;
+
+ // [simd.expos]
+ template <typename _Vp>
+ concept __simd_vec_type
+ = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
+ && is_default_constructible_v<_Vp>;
+
+ template <typename _Vp>
+ concept __simd_mask_type
+ = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
+ && is_default_constructible_v<_Vp>;
+
+ /** @internal
+   * Satisfied if @p _Vp is a data-parallel type.
+ */
+ template <typename _Vp>
+ concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>;
+
+ template <typename _Vp>
+ concept __simd_floating_point
+ = __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>;
+
+ template <typename _Vp>
+ concept __simd_integral
+ = __simd_vec_type<_Vp> && integral<typename _Vp::value_type>;
+
+ template <typename _Tp>
+ concept __converts_to_vec
+ = __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>;
+
+ template <__converts_to_vec _Tp>
+ using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+
+ template <typename _Vp, typename _Tp>
+ using __make_compatible_simd_t
+ = decltype([] {
+ using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+ if constexpr (__simd_vec_type<_Up>)
+ return _Up();
+ else
+ return vec<_Up, _Vp::size()>();
+ }());
+
+ template <typename _Tp>
+ concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>;
+
+ template <typename _BinaryOperation, typename _Tp>
+ concept __reduction_binary_operation
+ = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
+ { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
+ };
+
+ /** @internal
+ * Returns the highest index @c i where `(__bits >> i) & 1` equals @c 1.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ __highest_bit(std::unsigned_integral auto __bits)
+ {
+ using __gnu_cxx::__int_traits;
+ constexpr auto _Nd = __int_traits<decltype(__bits)>::__digits;
+ return _Nd - 1 - __countl_zero(__bits);
+ }
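+
+  // Example: __highest_bit(0b10'1000u) == 5.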
+
+ template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
+ using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+ // Allow _Tp to be _InvalidInteger for __integer_from<16>
+ template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
+ using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+ // LWG4470 [simd.expos]
+ template <size_t _Bytes, typename _Ap>
+ using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;
+
+#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // used for unit tests (also see P3844)
+ class __bad_value_preserving_cast
+ {};
+
+#define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast
+#else
+ void __bad_value_preserving_cast(); // not defined
+
+#define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast
+#endif
+
+ template <typename _To, typename _From>
+#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844
+ [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions
+#endif
+ consteval _To
+ __value_preserving_cast(const _From& __x)
+ {
+ static_assert(is_arithmetic_v<_From>);
+ if constexpr (!__value_preserving_convertible_to<_From, _To>)
+ {
+ using _Up = typename __make_unsigned<_From>::__type;
+ if (static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x))
+ __glibcxx_on_bad_value_preserving_cast();
+ else if constexpr (is_signed_v<_From> && is_unsigned_v<_To>)
+ {
+ if (__x < _From())
+ __glibcxx_on_bad_value_preserving_cast();
+ }
+ else if constexpr (unsigned_integral<_From> && signed_integral<_To>)
+ {
+ if (__x > numeric_limits<_To>::max())
+ __glibcxx_on_bad_value_preserving_cast();
+ }
+ }
+ return static_cast<_To>(__x);
+ }
+
+ template <typename _From, typename _To>
+ concept __simd_vec_bcast_consteval
+ = __explicitly_convertible_to<_From, _To>
+ && is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To>
+ && !__value_preserving_convertible_to<remove_cvref_t<_From>, _To>
+ && (is_same_v<common_type_t<_From, _To>, _To>
+ || (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>)
+ || (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>));
+
+ /** @internal
+   * std::pair is not trivially copyable; this one is.
+ */
+ template <typename _T0, typename _T1>
+ struct __trivial_pair
+ {
+ _T0 _M_first;
+ _T1 _M_second;
+ };
+
+ template <typename _From, typename _To>
+ concept __converts_trivially = convertible_to<_From, _To>
+ && sizeof(_From) == sizeof(_To)
+ && is_integral_v<_From> == is_integral_v<_To>
+ && is_floating_point_v<_From> == is_floating_point_v<_To>;
+
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ __bit_foreach(unsigned_integral auto __bits, auto&& __fun)
+ {
+ static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int
+ while (__bits)
+ {
+ __fun(__countr_zero(__bits));
+ __bits &= (__bits - 1);
+ }
+ }
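+
+  // Example: __bit_foreach(0b1010'0001u, __f) invokes __f(0), __f(5), __f(7),
+  // visiting the set bits from least to most significant.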
+
+ /** @internal
+ * Optimized @c memcpy for use in partial loads and stores.
+ *
+ * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
+ * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
+ * calls.
+ *
+   * @tparam _Chunk The chunk size; @p __n chunks of @p _Chunk bytes each are copied.
+ * @tparam _Max Copy no more than @p _Max bytes.
+ *
+ * @param __dst The destination pointer.
+ * @param __src The source pointer.
+   * @param __n The number of chunks that need to be copied.
+ */
+ template <size_t _Chunk, size_t _Max>
+ inline void
+ __memcpy_chunks(byte* __restrict__ __dst, const byte* __restrict__ __src,
+ size_t __n)
+ {
+ static_assert(_Max <= 64);
+ static_assert(__has_single_bit(_Chunk) && _Chunk <= 8);
+ size_t __bytes = _Chunk * __n;
+ if (__builtin_constant_p(__bytes))
+      { // If __n is known via constant propagation, use a single memcpy call. Since this is still
+ // a fixed-size memcpy to the compiler, this leaves more room for optimization.
+ __builtin_memcpy(__dst, __src, __bytes);
+ }
+ else if (__bytes > 32 && _Max > 32)
+ {
+ __builtin_memcpy(__dst, __src, 32);
+ __bytes -= 32;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
+ }
+ else if (__bytes > 16 && _Max > 16)
+ {
+ __builtin_memcpy(__dst, __src, 16);
+ if constexpr (_Chunk == 8)
+ {
+ __bytes -= 8;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
+ }
+ else
+ {
+ __bytes -= 16;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
+ }
+ }
+ else if (__bytes > 8 && _Max > 8)
+ {
+ __builtin_memcpy(__dst, __src, 8);
+ if constexpr (_Chunk == 4)
+ {
+ __bytes -= 4;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
+ }
+ else if constexpr (_Chunk < 4)
+ {
+ __bytes -= 8;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
+ }
+ }
+ else if (__bytes > 4 && _Max > 4)
+ {
+ __builtin_memcpy(__dst, __src, 4);
+ if constexpr (_Chunk == 2)
+ {
+ __bytes -= 2;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
+ }
+ else if constexpr (_Chunk == 1)
+ {
+ __bytes -= 4;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
+ }
+ }
+ else if (__bytes >= 2)
+ {
+ __builtin_memcpy(__dst, __src, 2);
+ if constexpr (_Chunk == 2)
+ {
+ __bytes -= 2;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
+ }
+ else if constexpr (_Chunk == 1)
+ {
+ __bytes -= 1;
+ __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
+ }
+ }
+ else if (__bytes == 1)
+ __builtin_memcpy(__dst, __src, 1);
+ }
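+
+  // For illustration: __memcpy_chunks<1, 16>(__dst, __src, 13) copies bytes
+  // [0, 8) and then the overlapping range [5, 13), covering all 13 bytes with
+  // two fixed-size 8-byte copies instead of a variable-length memcpy.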
+
+ // [simd.reductions] identity_element = *see below*
+ template <typename _Tp, typename _BinaryOperation>
+ requires __is_one_of<_BinaryOperation,
+ plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value
+ consteval _Tp
+ __default_identity_element()
+ {
+ if constexpr (same_as<_BinaryOperation, multiplies<>>)
+ return _Tp(1);
+ else if constexpr (same_as<_BinaryOperation, bit_and<>>)
+ return _Tp(~_Tp());
+ else
+ return _Tp(0);
+ }
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_DETAILS_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_FLAGS_H
+#define _GLIBCXX_SIMD_FLAGS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_details.h"
+#include <bits/align.h> // assume_aligned
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ // [simd.traits]
+ // --- alignment ---
+ template <typename _Tp, typename _Up = typename _Tp::value_type>
+ struct alignment
+ {};
+
+ template <typename _Tp, typename _Ap, __vectorizable _Up>
+ struct alignment<basic_vec<_Tp, _Ap>, _Up>
+ : integral_constant<size_t, alignof(basic_vec<_Tp, _Ap>)>
+ {};
+
+ template <typename _Tp, typename _Up = typename _Tp::value_type>
+ constexpr size_t alignment_v = alignment<_Tp, _Up>::value;
+
+ // [simd.flags] -------------------------------------------------------------
+ struct _LoadStoreTag
+ {};
+
+ /** @internal
+ * `struct convert-flag`
+ *
+ * C++26 [simd.expos] / [simd.flags]
+ */
+ struct __convert_flag
+ : _LoadStoreTag
+ {};
+
+ /** @internal
+ * `struct aligned-flag`
+ *
+ * C++26 [simd.expos] / [simd.flags]
+ */
+ struct __aligned_flag
+ : _LoadStoreTag
+ {
+ template <typename _Tp, typename _Up>
+ [[__gnu__::__always_inline__]]
+ static constexpr _Up*
+ _S_adjust_pointer(_Up* __ptr)
+ { return assume_aligned<simd::alignment_v<_Tp, remove_cv_t<_Up>>>(__ptr); }
+ };
+
+ /** @internal
+ * `template<size_t N> struct overaligned-flag`
+ *
+ * @tparam _Np alignment in bytes
+ *
+ * C++26 [simd.expos] / [simd.flags]
+ */
+ template <size_t _Np>
+ struct __overaligned_flag
+ : _LoadStoreTag
+ {
+ static_assert(__has_single_bit(_Np));
+
+ template <typename, typename _Up>
+ [[__gnu__::__always_inline__]]
+ static constexpr _Up*
+ _S_adjust_pointer(_Up* __ptr)
+ { return assume_aligned<_Np>(__ptr); }
+ };
+
+ struct __partial_loadstore_flag
+ : _LoadStoreTag
+ {};
+
+
+ template <typename _Tp>
+ concept __loadstore_tag = is_base_of_v<_LoadStoreTag, _Tp>;
+
+ template <typename...>
+ struct flags;
+
+ template <typename... _Flags>
+ requires (__loadstore_tag<_Flags> && ...)
+ struct flags<_Flags...>
+ {
+ /** @internal
+ * Returns @c true if the given argument is part of this specialization, otherwise returns @c
+ * false.
+ */
+ template <typename _F0>
+ static consteval bool
+ _S_test(flags<_F0>)
+ { return (is_same_v<_Flags, _F0> || ...); }
+
+ friend consteval flags
+ operator|(flags, flags<>)
+ { return flags{}; }
+
+ template <typename _T0, typename... _More>
+ friend consteval auto
+ operator|(flags, flags<_T0, _More...>)
+ {
+ if constexpr ((same_as<_Flags, _T0> || ...))
+ return flags<_Flags...>{} | flags<_More...>{};
+ else
+ return flags<_Flags..., _T0>{} | flags<_More...>{};
+ }
+
+ /** @internal
+ * Adjusts a pointer according to the alignment requirements of the flags.
+ *
+ * This function iterates over all flags in the pack and applies each flag's
+ * `_S_adjust_pointer` method to the input pointer. Flags that don't provide
+ * this method are ignored.
+ *
+ * @tparam _Tp A basic_vec type for which a load/store pointer is adjusted
+ * @tparam _Up The value-type of the input/output range
+ * @param __ptr The pointer to the range
+ * @return The adjusted pointer
+ */
+ template <typename _Tp, typename _Up>
+ static constexpr _Up*
+ _S_adjust_pointer(_Up* __ptr)
+ {
+ template for ([[maybe_unused]] constexpr auto __f : {_Flags()...})
+ {
+ if constexpr (requires {__f.template _S_adjust_pointer<_Tp>(__ptr); })
+ __ptr = __f.template _S_adjust_pointer<_Tp>(__ptr);
+ }
+ return __ptr;
+ }
+ };
+
+ inline constexpr flags<> flag_default {};
+
+ inline constexpr flags<__convert_flag> flag_convert {};
+
+ inline constexpr flags<__aligned_flag> flag_aligned {};
+
+ template <size_t _Np>
+ requires(__has_single_bit(_Np))
+ inline constexpr flags<__overaligned_flag<_Np>> flag_overaligned {};
+
+ /** @internal
+ * Pass to unchecked_load or unchecked_store to make it behave like partial_load / partial_store.
+ */
+ inline constexpr flags<__partial_loadstore_flag> __allow_partial_loadstore {};
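+
+  // Illustrative usage: 'flag_aligned | flag_convert' has type
+  // flags<__aligned_flag, __convert_flag>, and duplicates collapse, so
+  // 'flag_aligned | flag_aligned' is still flags<__aligned_flag>. A load or
+  // store can pass its pointer through _S_adjust_pointer, where flag_aligned
+  // applies assume_aligned<alignment_v<_Tp, _Up>> and flag_overaligned<_Np>
+  // applies assume_aligned<_Np>.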
+
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#endif // C++26
+#endif // _GLIBCXX_SIMD_FLAGS_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_ITERATOR_H
+#define _GLIBCXX_SIMD_ITERATOR_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_details.h"
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ /** @internal
+ * Iterator type for basic_vec and basic_mask.
+ *
+ * C++26 [simd.iterator]
+ */
+ template <typename _Vp>
+ class __iterator
+ {
+ friend class __iterator<const _Vp>;
+
+ template <typename, typename>
+ friend class _VecBase;
+
+ template <size_t, typename>
+ friend class _MaskBase;
+
+ _Vp* _M_data = nullptr;
+
+ __simd_size_type _M_offset = 0;
+
+ constexpr
+ __iterator(_Vp& __d, __simd_size_type __off)
+ : _M_data(&__d), _M_offset(__off)
+ {}
+
+ public:
+ using value_type = typename _Vp::value_type;
+
+ using iterator_category = input_iterator_tag;
+
+ using iterator_concept = random_access_iterator_tag;
+
+ using difference_type = __simd_size_type;
+
+ constexpr __iterator() = default;
+
+ constexpr
+ __iterator(const __iterator &) = default;
+
+ constexpr __iterator&
+ operator=(const __iterator &) = default;
+
+ constexpr
+ __iterator(const __iterator<remove_const_t<_Vp>> &__i) requires is_const_v<_Vp>
+ : _M_data(__i._M_data), _M_offset(__i._M_offset)
+ {}
+
+ constexpr value_type
+ operator*() const
+ { return (*_M_data)[_M_offset]; } // checked in operator[]
+
+ constexpr __iterator&
+ operator++()
+ {
+ ++_M_offset;
+ return *this;
+ }
+
+ constexpr __iterator
+ operator++(int)
+ {
+ __iterator r = *this;
+ ++_M_offset;
+ return r;
+ }
+
+ constexpr __iterator&
+ operator--()
+ {
+ --_M_offset;
+ return *this;
+ }
+
+ constexpr __iterator
+ operator--(int)
+ {
+ __iterator r = *this;
+ --_M_offset;
+ return r;
+ }
+
+ constexpr __iterator&
+ operator+=(difference_type __x)
+ {
+ _M_offset += __x;
+ return *this;
+ }
+
+ constexpr __iterator&
+ operator-=(difference_type __x)
+ {
+ _M_offset -= __x;
+ return *this;
+ }
+
+ constexpr value_type
+ operator[](difference_type __i) const
+ { return (*_M_data)[_M_offset + __i]; } // checked in operator[]
+
+ constexpr friend bool operator==(__iterator __a, __iterator __b) = default;
+
+ constexpr friend bool operator==(__iterator __a, std::default_sentinel_t) noexcept
+ { return __a._M_offset == _Vp::size.value; }
+
+ constexpr friend auto operator<=>(__iterator __a, __iterator __b)
+ { return __a._M_offset <=> __b._M_offset; }
+
+ constexpr friend __iterator
+ operator+(const __iterator& __it, difference_type __x)
+ { return __iterator(*__it._M_data, __it._M_offset + __x); }
+
+ constexpr friend __iterator
+ operator+(difference_type __x, const __iterator& __it)
+ { return __iterator(*__it._M_data, __it._M_offset + __x); }
+
+ constexpr friend __iterator
+ operator-(const __iterator& __it, difference_type __x)
+ { return __iterator(*__it._M_data, __it._M_offset - __x); }
+
+ constexpr friend difference_type
+ operator-(__iterator __a, __iterator __b)
+ { return __a._M_offset - __b._M_offset; }
+
+ constexpr friend difference_type
+ operator-(__iterator __it, std::default_sentinel_t) noexcept
+ { return __it._M_offset - difference_type(_Vp::size.value); }
+
+ constexpr friend difference_type
+ operator-(std::default_sentinel_t, __iterator __it) noexcept
+ { return difference_type(_Vp::size.value) - __it._M_offset; }
+ };
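+
+  // Illustrative usage sketch (an assumption-laden example, not part of the patch: it relies
+  // on the vec alias from [simd.syn] and the generator constructor from [simd.ctor]).
+  // basic_vec and basic_mask hand out this iterator from begin() together with
+  // std::default_sentinel from end(), so a plain range-for reads the elements:
+  //
+  //   std::simd::vec<float, 4> v([](int i) { return float(i); });  // {0, 1, 2, 3}
+  //   float sum = 0;
+  //   for (float x : v)   // dereferences go through __iterator::operator*
+  //     sum += x;         // sum == 6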
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#endif // C++26
+#endif // _GLIBCXX_SIMD_ITERATOR_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_LOADSTORE_H
+#define _GLIBCXX_SIMD_LOADSTORE_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_vec.h"
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// [simd.loadstore] ----------------------------------------------------------
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template <typename _Vp, typename _Tp>
+ struct __vec_load_return
+ { using type = _Vp; };
+
+ template <typename _Tp>
+ struct __vec_load_return<void, _Tp>
+ { using type = basic_vec<_Tp>; };
+
+ template <typename _Vp, typename _Tp>
+ using __vec_load_return_t = typename __vec_load_return<_Vp, _Tp>::type;
+
+ template <typename _Vp, typename _Tp>
+ using __load_mask_type_t = typename __vec_load_return_t<_Vp, _Tp>::mask_type;
+
+ template <typename _Tp>
+ concept __sized_contiguous_range
+ = ranges::contiguous_range<_Tp> && ranges::sized_range<_Tp>;
+
+ template <typename _Vp = void, __sized_contiguous_range _Rg, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, ranges::range_value_t<_Rg>>
+ unchecked_load(_Rg&& __r, flags<_Flags...> __f = {})
+ {
+ using _Tp = ranges::range_value_t<_Rg>;
+ using _RV = __vec_load_return_t<_Vp, _Tp>;
+ using _Rp = typename _RV::value_type;
+ static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, _Rp, _Flags...>,
+ "'flag_convert' must be used for conversions that are not value-preserving");
+
+ constexpr bool __allow_out_of_bounds = __f._S_test(__allow_partial_loadstore);
+ constexpr size_t __static_size = __static_range_size(__r);
+
+ if constexpr (!__allow_out_of_bounds && __static_sized_range<_Rg>)
+ static_assert(ranges::size(__r) >= _RV::size(), "given range must have sufficient size");
+
+ const auto* __ptr = __f.template _S_adjust_pointer<_RV>(ranges::data(__r));
+ const auto __rg_size = std::ranges::size(__r);
+ if constexpr (!__allow_out_of_bounds)
+ __glibcxx_simd_precondition(
+ std::ranges::size(__r) >= _RV::size(),
+ "Input range is too small. Did you mean to use 'partial_load'?");
+
+ if consteval
+ {
+ return _RV([&](size_t __i) -> _Rp {
+ if (__i >= __rg_size)
+ return _Rp();
+ else
+ return static_cast<_Rp>(__r[__i]);
+ });
+ }
+ else
+ {
+ if constexpr ((__static_size != dynamic_extent && __static_size >= size_t(_RV::size()))
+ || !__allow_out_of_bounds)
+ return _RV(_LoadCtorTag(), __ptr);
+ else
+ return _RV::_S_partial_load(__ptr, __rg_size);
+ }
+ }
+
+ template <typename _Vp = void, __sized_contiguous_range _Rg, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, ranges::range_value_t<_Rg>>
+ unchecked_load(_Rg&& __r, const __load_mask_type_t<_Vp, ranges::range_value_t<_Rg>>& __mask,
+ flags<_Flags...> __f = {})
+ {
+ using _Tp = ranges::range_value_t<_Rg>;
+ using _RV = __vec_load_return_t<_Vp, _Tp>;
+ using _Rp = typename _RV::value_type;
+ static_assert(__vectorizable<_Tp>);
+ static_assert(__explicitly_convertible_to<_Tp, _Rp>);
+ static_assert(__loadstore_convertible_to<_Tp, _Rp, _Flags...>,
+ "'flag_convert' must be used for conversions that are not value-preserving");
+
+ constexpr bool __allow_out_of_bounds = __f._S_test(__allow_partial_loadstore);
+ constexpr auto __static_size = __static_range_size(__r);
+
+ if constexpr (!__allow_out_of_bounds && __static_sized_range<_Rg>)
+ static_assert(ranges::size(__r) >= _RV::size(), "given range must have sufficient size");
+
+ const auto* __ptr = __f.template _S_adjust_pointer<_RV>(ranges::data(__r));
+
+ if constexpr (!__allow_out_of_bounds)
+ __glibcxx_simd_precondition(
+ ranges::size(__r) >= size_t(_RV::size()),
+ "Input range is too small. Did you mean to use 'partial_load'?");
+
+ const size_t __rg_size = ranges::size(__r);
+ if consteval
+ {
+ return _RV([&](size_t __i) -> _Rp {
+ if (__i >= __rg_size || !__mask[int(__i)])
+ return _Rp();
+ else
+ return static_cast<_Rp>(__r[__i]);
+ });
+ }
+ else
+ {
+ constexpr bool __no_size_check
+ = !__allow_out_of_bounds
+ || (__static_size != dynamic_extent
+ && __static_size >= size_t(_RV::size.value));
+ if constexpr (_RV::size() == 1)
+ return __mask[0] && (__no_size_check || __rg_size > 0) ? _RV(_LoadCtorTag(), __ptr)
+ : _RV();
+ else if constexpr (__no_size_check)
+ return _RV::_S_masked_load(__ptr, __mask);
+ else if (__rg_size >= size_t(_RV::size()))
+ return _RV::_S_masked_load(__ptr, __mask);
+ else if (__rg_size > 0)
+ return _RV::_S_masked_load(
+ __ptr, __mask && _RV::mask_type::_S_partial_mask_of_n(int(__rg_size)));
+ else
+ return _RV();
+ }
+ }
+
+ template <typename _Vp = void, contiguous_iterator _It, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ unchecked_load(_It __first, iter_difference_t<_It> __n, flags<_Flags...> __f = {})
+ { return simd::unchecked_load<_Vp>(span<const iter_value_t<_It>>(__first, __n), __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ unchecked_load(_It __first, iter_difference_t<_It> __n,
+ const __load_mask_type_t<_Vp, iter_value_t<_It>>& __mask,
+ flags<_Flags...> __f = {})
+ { return simd::unchecked_load<_Vp>(span<const iter_value_t<_It>>(__first, __n), __mask, __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ unchecked_load(_It __first, _Sp __last, flags<_Flags...> __f = {})
+ { return simd::unchecked_load<_Vp>(span<const iter_value_t<_It>>(__first, __last), __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ unchecked_load(_It __first, _Sp __last,
+ const __load_mask_type_t<_Vp, iter_value_t<_It>>& __mask,
+ flags<_Flags...> __f = {})
+ {
+ return simd::unchecked_load<_Vp>(span<const iter_value_t<_It>>(__first, __last), __mask, __f);
+ }
+
+ template <typename _Vp = void, __sized_contiguous_range _Rg, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, ranges::range_value_t<_Rg>>
+ partial_load(_Rg&& __r, flags<_Flags...> __f = {})
+ { return simd::unchecked_load<_Vp>(__r, __f | __allow_partial_loadstore); }
+
+ template <typename _Vp = void, __sized_contiguous_range _Rg, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, ranges::range_value_t<_Rg>>
+ partial_load(_Rg&& __r, const __load_mask_type_t<_Vp, ranges::range_value_t<_Rg>>& __mask,
+ flags<_Flags...> __f = {})
+ { return simd::unchecked_load<_Vp>(__r, __mask, __f | __allow_partial_loadstore); }
+
+ template <typename _Vp = void, contiguous_iterator _It, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ partial_load(_It __first, iter_difference_t<_It> __n, flags<_Flags...> __f = {})
+ { return partial_load<_Vp>(span<const iter_value_t<_It>>(__first, __n), __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ partial_load(_It __first, iter_difference_t<_It> __n,
+ const __load_mask_type_t<_Vp, iter_value_t<_It>>& __mask,
+ flags<_Flags...> __f = {})
+ { return partial_load<_Vp>(span<const iter_value_t<_It>>(__first, __n), __mask, __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ partial_load(_It __first, _Sp __last, flags<_Flags...> __f = {})
+ { return partial_load<_Vp>(span<const iter_value_t<_It>>(__first, __last), __f); }
+
+ template <typename _Vp = void, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_load_return_t<_Vp, iter_value_t<_It>>
+ partial_load(_It __first, _Sp __last, const __load_mask_type_t<_Vp, iter_value_t<_It>>& __mask,
+ flags<_Flags...> __f = {})
+ { return partial_load<_Vp>(span<const iter_value_t<_It>>(__first, __last), __mask, __f); }
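+
+  // Illustrative usage sketch (hedged; 'data' is a hypothetical std::vector and the vec alias
+  // from [simd.syn] is assumed):
+  //
+  //   std::vector<float> data(100, 1.0f);
+  //   using V8 = std::simd::vec<float, 8>;
+  //   V8 a = std::simd::unchecked_load<V8>(data);                    // precondition: data.size() >= 8
+  //   V8 b = std::simd::partial_load<V8>(std::span(data).first(3));  // elements 3..7 are zero
+  //   V8 c = std::simd::unchecked_load<V8>(data.begin(), 8);         // iterator + count overload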
+
+ template <typename _Tp, typename _Ap, __sized_contiguous_range _Rg, typename... _Flags>
+ requires indirectly_writable<ranges::iterator_t<_Rg>, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _Rg&& __r, flags<_Flags...> __f = {})
+ {
+ using _TV = basic_vec<_Tp, _Ap>;
+ static_assert(destructible<_TV>);
+ static_assert(__loadstore_convertible_to<_Tp, ranges::range_value_t<_Rg>, _Flags...>,
+ "'flag_convert' must be used for conversions that are not value-preserving");
+
+ constexpr bool __allow_out_of_bounds = __f._S_test(__allow_partial_loadstore);
+ if constexpr (!__allow_out_of_bounds && __static_sized_range<_Rg>)
+ static_assert(ranges::size(__r) >= _TV::size(), "given range must have sufficient size");
+
+ auto* __ptr = __f.template _S_adjust_pointer<_TV>(ranges::data(__r));
+ const auto __rg_size = ranges::size(__r);
+ if constexpr (!__allow_out_of_bounds)
+ __glibcxx_simd_precondition(
+ ranges::size(__r) >= _TV::size(),
+ "output range is too small. Did you mean to use 'partial_store'?");
+
+ if consteval
+ {
+ for (unsigned __i = 0; __i < __rg_size && __i < _TV::size(); ++__i)
+ __ptr[__i] = static_cast<ranges::range_value_t<_Rg>>(__v[__i]);
+ }
+ else
+ {
+ if constexpr (!__allow_out_of_bounds)
+ __v._M_store(__ptr);
+ else
+ _TV::_S_partial_store(__v, __ptr, __rg_size);
+ }
+ }
+
+ template <typename _Tp, typename _Ap, __sized_contiguous_range _Rg, typename... _Flags>
+ requires indirectly_writable<ranges::iterator_t<_Rg>, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _Rg&& __r,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask,
+ flags<_Flags...> __f = {})
+ {
+ using _TV = basic_vec<_Tp, _Ap>;
+ static_assert(__loadstore_convertible_to<_Tp, ranges::range_value_t<_Rg>, _Flags...>,
+ "'flag_convert' must be used for conversions that are not value-preserving");
+
+ constexpr bool __allow_out_of_bounds = __f._S_test(__allow_partial_loadstore);
+ if constexpr (!__allow_out_of_bounds && __static_sized_range<_Rg>)
+ static_assert(ranges::size(__r) >= _TV::size(), "given range must have sufficient size");
+
+ auto* __ptr = __f.template _S_adjust_pointer<_TV>(ranges::data(__r));
+
+ if constexpr (!__allow_out_of_bounds)
+ __glibcxx_simd_precondition(
+ ranges::size(__r) >= size_t(_TV::size()),
+ "output range is too small. Did you mean to use 'partial_store'?");
+
+ const size_t __rg_size = ranges::size(__r);
+ if consteval
+ {
+ for (int __i = 0; __i < _TV::size(); ++__i)
+ {
+ if (__mask[__i] && (!__allow_out_of_bounds || size_t(__i) < __rg_size))
+ __ptr[__i] = static_cast<ranges::range_value_t<_Rg>>(__v[__i]);
+ }
+ }
+ else
+ {
+ if (__allow_out_of_bounds && __rg_size < size_t(_TV::size()))
+ _TV::_S_masked_store(__v, __ptr,
+ __mask && _TV::mask_type::_S_partial_mask_of_n(int(__rg_size)));
+ else
+ _TV::_S_masked_store(__v, __ptr, __mask);
+ }
+ }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _It __first,
+ iter_difference_t<_It> __n, flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, std::span<iter_value_t<_It>>(__first, __n), __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _It __first, iter_difference_t<_It> __n,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask,
+ flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, std::span<iter_value_t<_It>>(__first, __n), __mask, __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _It __first, _Sp __last,
+ flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, std::span<iter_value_t<_It>>(__first, __last), __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ unchecked_store(const basic_vec<_Tp, _Ap>& __v, _It __first, _Sp __last,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask,
+ flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, std::span<iter_value_t<_It>>(__first, __last), __mask, __f); }
+
+ template <typename _Tp, typename _Ap, __sized_contiguous_range _Rg, typename... _Flags>
+ requires indirectly_writable<ranges::iterator_t<_Rg>, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _Rg&& __r, flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, __r, __f | __allow_partial_loadstore); }
+
+ template <typename _Tp, typename _Ap, __sized_contiguous_range _Rg, typename... _Flags>
+ requires indirectly_writable<ranges::iterator_t<_Rg>, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _Rg&& __r,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask,
+ flags<_Flags...> __f = {})
+ { simd::unchecked_store(__v, __r, __mask, __f | __allow_partial_loadstore); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _It __first, iter_difference_t<_It> __n,
+ flags<_Flags...> __f = {})
+ { partial_store(__v, span(__first, __n), __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _It __first, iter_difference_t<_It> __n,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask, flags<_Flags...> __f = {})
+ { partial_store(__v, span(__first, __n), __mask, __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _It __first, _Sp __last,
+ flags<_Flags...> __f = {})
+ { partial_store(__v, span(__first, __last), __f); }
+
+ template <typename _Tp, typename _Ap, contiguous_iterator _It, sized_sentinel_for<_It> _Sp,
+ typename... _Flags>
+ requires indirectly_writable<_It, _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ partial_store(const basic_vec<_Tp, _Ap>& __v, _It __first, _Sp __last,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask, flags<_Flags...> __f = {})
+ { partial_store(__v, span(__first, __last), __mask, __f); }
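+
+  // Illustrative usage sketch (counterpart to the load example above; 'out' and 'v' are
+  // hypothetical):
+  //
+  //   std::array<float, 8> out{};
+  //   std::simd::vec<float, 8> v([](int i) { return float(i); });
+  //   std::simd::unchecked_store(v, out);                    // out must hold at least v.size() elements
+  //   std::simd::partial_store(v, std::span(out).first(3));  // writes only out[0], out[1], out[2]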
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_LOADSTORE_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_MASK_H
+#define _GLIBCXX_SIMD_MASK_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_iterator.h"
+#include "vec_ops.h"
+#if _GLIBCXX_X86
+#include "simd_x86.h"
+#endif
+
+#include <bit>
+#include <bitset>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template <unsigned _Np>
+ struct _SwapNeighbors
+ {
+ consteval unsigned
+ operator()(unsigned __i, unsigned __size) const
+ {
+ if (__size % (2 * _Np) != 0)
+ __builtin_abort(); // swap_neighbors<N> permutation requires a multiple of 2N elements
+ else if (std::has_single_bit(_Np))
+ return __i ^ _Np;
+ else if (__i % (2 * _Np) >= _Np)
+ return __i - _Np;
+ else
+ return __i + _Np;
+ }
+ };
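+
+  // Illustrative: _SwapNeighbors<1>() maps indices {0, 1, 2, 3} to {1, 0, 3, 2};
+  // _SwapNeighbors<2>() maps {0, 1, 2, 3, 4, 5, 6, 7} to {2, 3, 0, 1, 6, 7, 4, 5}.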
+
+ template <size_t _Np, size_t _Mp>
+ constexpr auto
+ __bitset_split(const bitset<_Mp>& __b)
+ {
+ constexpr auto __bits_per_word = __CHAR_BIT__ * __SIZEOF_LONG__;
+ if constexpr (_Np % __bits_per_word == 0)
+ {
+ struct _Tmp
+ {
+ bitset<_Np> _M_lo;
+ bitset<_Mp - _Np> _M_hi;
+ };
+ return __builtin_bit_cast(_Tmp, __b);
+ }
+ else
+ {
+ constexpr auto __bits_per_ullong = __CHAR_BIT__ * __SIZEOF_LONG_LONG__;
+ static_assert(_Mp <= __bits_per_ullong);
+ using _Lo = _Bitmask<_Np>;
+ using _Hi = _Bitmask<_Mp - _Np>;
+ struct _Tmp
+ {
+ _Lo _M_lo;
+ _Hi _M_hi;
+ };
+ return _Tmp {static_cast<_Lo>(__b.to_ullong()), static_cast<_Hi>(__b.to_ullong() >> _Np)};
+ }
+ }
+
+ static_assert(__bitset_split<64>(bitset<128>(1))._M_lo == bitset<64>(1));
+ static_assert(__bitset_split<64>(bitset<128>(1))._M_hi == bitset<64>(0));
+
+ // [simd.traits]
+ // --- rebind ---
+ template <typename _Tp, typename _Vp, _ArchTraits _Traits = {}>
+ struct rebind
+ {};
+
+ /**
+ * Computes a member @c type `basic_vec<_Tp, Abi>`, where @c Abi is chosen such that the
+ * number of elements is equal to `_Vp::size()` and features of the ABI tag (such as the
+ * internal representation of masks, or storage order of complex components) are preserved.
+ */
+ template <__vectorizable _Tp, __simd_vec_type _Vp, _ArchTraits _Traits>
+ //requires requires { typename __deduce_abi_t<_Tp, _Vp::size()>; }
+ struct rebind<_Tp, _Vp, _Traits>
+ { using type = __similar_vec<_Tp, _Vp::size(), typename _Vp::abi_type>; };
+
+ /**
+ * As above, except for @c basic_mask.
+ */
+ template <__vectorizable _Tp, __simd_mask_type _Mp, _ArchTraits _Traits>
+ //requires requires { typename __deduce_abi_t<_Tp, _Mp::size()>; }
+ struct rebind<_Tp, _Mp, _Traits>
+ { using type = __similar_mask<_Tp, _Mp::size(), typename _Mp::abi_type>; };
+
+ template <typename _Tp, typename _Vp>
+ using rebind_t = typename rebind<_Tp, _Vp>::type;
+
+ // --- resize ---
+ template <__simd_size_type _Np, typename _Vp, _ArchTraits _Traits = {}>
+ struct resize
+ {};
+
+ template <__simd_size_type _Np, __simd_vec_type _Vp, _ArchTraits _Traits>
+ requires (_Np >= 1)
+ //requires requires { typename __deduce_abi_t<typename _Vp::value_type, _Np>; }
+ struct resize<_Np, _Vp, _Traits>
+ { using type = __similar_vec<typename _Vp::value_type, _Np, typename _Vp::abi_type>; };
+
+ template <__simd_size_type _Np, __simd_mask_type _Mp, _ArchTraits _Traits>
+ requires (_Np >= 1)
+ //requires requires { typename __deduce_abi_t<typename _Mp::value_type, _Np>; }
+ struct resize<_Np, _Mp, _Traits>
+ {
+ using _A1 = decltype(__abi_rebind<__mask_element_size<_Mp>, _Np, typename _Mp::abi_type,
+ true>());
+
+ static_assert(__abi_tag<_A1>);
+
+ static_assert(_Mp::abi_type::_S_variant == _A1::_S_variant || __scalar_abi_tag<_A1>
+ || __scalar_abi_tag<typename _Mp::abi_type>);
+
+ using type = basic_mask<__mask_element_size<_Mp>, _A1>;
+ };
+
+ template <__simd_size_type _Np, typename _Vp>
+ using resize_t = typename resize<_Np, _Vp>::type;
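+
+  // Illustrative sketch of the two traits above (expected results, assuming the vec alias from
+  // [simd.syn]):
+  //
+  //   using V4f = std::simd::vec<float, 4>;
+  //   static_assert(std::is_same_v<std::simd::rebind_t<int, V4f>, std::simd::vec<int, 4>>);
+  //   static_assert(std::is_same_v<std::simd::resize_t<8, V4f>, std::simd::vec<float, 8>>);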
+
+ // [simd.syn]
+ inline constexpr __simd_size_type zero_element = numeric_limits<int>::min();
+
+ inline constexpr __simd_size_type uninit_element = zero_element + 1;
+
+ // [simd.permute.static]
+ template<__simd_size_type _Np = 0, __simd_vec_or_mask_type _Vp,
+ __index_permutation_function<_Vp> _IdxMap>
+ [[__gnu__::__always_inline__]]
+ constexpr resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>
+ permute(const _Vp& __v, _IdxMap&& __idxmap)
+ { return resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>::_S_static_permute(__v, __idxmap); }
+
+ // [simd.permute.dynamic]
+ template<__simd_vec_or_mask_type _Vp, __simd_integral _Ip>
+ [[__gnu__::__always_inline__]]
+ constexpr resize_t<_Ip::size(), _Vp>
+ permute(const _Vp& __v, const _Ip& __indices)
+ { return __v[__indices]; }
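+
+  // Illustrative sketch (hypothetical values): reversing a vector with the static permute:
+  //
+  //   std::simd::vec<int, 4> x([](int i) { return i; });   // {0, 1, 2, 3}
+  //   auto r = std::simd::permute(x, [](int i, int size) { return size - 1 - i; });  // {3, 2, 1, 0}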
+
+ // [simd.creation] ----------------------------------------------------------
+ template<__simd_vec_type _Vp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ chunk(const basic_vec<typename _Vp::value_type, _Ap>& __x) noexcept
+ { return __x.template _M_chunk<_Vp>(); }
+
+ template<__simd_mask_type _Mp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ chunk(const basic_mask<__mask_element_size<_Mp>, _Ap>& __x) noexcept
+ { return __x.template _M_chunk<_Mp>(); }
+
+ template<__simd_size_type _Np, typename _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ chunk(const basic_vec<_Tp, _Ap>& __x) noexcept
+ -> decltype(chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x))
+ { return chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x); }
+
+ template<__simd_size_type _Np, size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ chunk(const basic_mask<_Bytes, _Ap>& __x) noexcept
+ -> decltype(chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x))
+ { return chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x); }
+
+ // LWG???? (reported 2025-11-25)
+ template<typename _Tp, typename _A0, typename... _Abis>
+ constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>>
+ cat(const basic_vec<_Tp, _A0>& __x0, const basic_vec<_Tp, _Abis>&... __xs) noexcept
+ {
+ return resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>>
+ ::_S_concat(__x0, __xs...);
+ }
+
+ // LWG???? (reported 2025-11-25)
+ template<size_t _Bytes, typename _A0, typename... _Abis>
+ constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_mask<_Bytes, _A0>>
+ cat(const basic_mask<_Bytes, _A0>& __x0, const basic_mask<_Bytes, _Abis>&... __xs) noexcept
+ {
+ return resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_mask<_Bytes, _A0>>
+ ::_S_concat(__x0, __xs...);
+ }
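+
+  // Illustrative sketch: chunk splits a vector into equally sized pieces (an std::array when
+  // the size divides evenly) and cat concatenates them again:
+  //
+  //   std::simd::vec<float, 8> x([](int i) { return float(i); });
+  //   auto halves = std::simd::chunk<4>(x);                 // std::array of two 4-element vecs
+  //   auto whole  = std::simd::cat(halves[0], halves[1]);   // elementwise equal to x again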
+
+ // implementation helper for chunk and cat
+ consteval int
+ __packs_to_skip_at_front(int __offset, initializer_list<int> __sizes)
+ {
+ int __i = 0;
+ int __n = 0;
+ for (int __s : __sizes)
+ {
+ __n += __s;
+ if (__n > __offset)
+ return __i;
+ ++__i;
+ }
+ __builtin_trap(); // called out of contract
+ }
+
+ consteval int
+ __packs_to_skip_at_back(int __offset, int __max, initializer_list<int> __sizes)
+ {
+ int __i = 0;
+ int __n = -__offset;
+ for (int __s : __sizes)
+ {
+ ++__i;
+ __n += __s;
+ if (__n >= __max)
+ return int(__sizes.size()) - __i;
+ }
+ return 0;
+ }
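+
+  // Illustrative: for inputs of sizes {4, 4}, __packs_to_skip_at_front(4, {4, 4}) == 1 (the
+  // first pack can be dropped) and __packs_to_skip_at_back(0, 4, {4, 4}) == 1 (the last pack is
+  // not needed to fill a 4-element destination starting at offset 0).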
+
+ // in principle, this overload allows conversions to _Dst - and it wouldn't be wrong - but the
+ // general overload below is still a better candidate in overload resolution
+ template <typename _Dst>
+ [[__gnu__::__always_inline__]]
+ constexpr _Dst
+ __extract_simd_at(auto _Offset, const _Dst& __r, const auto&...)
+ requires(_Offset.value == 0)
+ { return __r; }
+
+ template <typename _Dst, typename _V0>
+ [[__gnu__::__always_inline__]]
+ constexpr _Dst
+ __extract_simd_at(auto _Offset, const _V0&, const _Dst& __r, const auto&...)
+ requires(_Offset.value == _V0::size.value)
+ { return __r; }
+
+ template <typename _Dst, typename... _Vs>
+ [[__gnu__::__always_inline__]]
+ constexpr _Dst
+ __extract_simd_at(auto _Offset, const _Vs&... __xs)
+ {
+ using _Adst = typename _Dst::abi_type;
+ if constexpr (_Adst::_S_nreg >= 2)
+ {
+ using _Dst0 = remove_cvref_t<decltype(declval<_Dst>()._M_get_low())>;
+ using _Dst1 = remove_cvref_t<decltype(declval<_Dst>()._M_get_high())>;
+ return _Dst::_S_init(__extract_simd_at<_Dst0>(_Offset, __xs...),
+ __extract_simd_at<_Dst1>(_Offset + _Dst0::size, __xs...));
+ }
+ else
+ {
+ using _Ret = remove_cvref_t<decltype(declval<_Dst>()._M_get())>;
+ constexpr bool __use_bitmask = __simd_mask_type<_Dst> && _Adst::_S_is_bitmask;
+ constexpr int __dst_full_size = __bit_ceil(unsigned(_Adst::_S_size));
+ constexpr int __nargs = sizeof...(__xs);
+ using _Afirst = typename _Vs...[0]::abi_type;
+ using _Alast = typename _Vs...[__nargs - 1]::abi_type;
+ const auto& __x0 = __xs...[0];
+ const auto& __xlast = __xs...[__nargs - 1];
+ constexpr int __ninputs = (_Vs::size.value + ...);
+ if constexpr (_Offset.value >= _Afirst::_S_size
+ || __ninputs - _Offset.value - _Alast::_S_size >= _Adst::_S_size)
+ { // can drop inputs at the front and/or back of the pack
+ constexpr int __skip_front = __packs_to_skip_at_front(_Offset.value,
+ {_Vs::size.value...});
+ constexpr int __skip_back = __packs_to_skip_at_back(_Offset.value, _Adst::_S_size,
+ {_Vs::size.value...});
+ static_assert(__skip_front > 0 || __skip_back > 0);
+ constexpr auto [...__skip] = _IotaArray<__skip_front>;
+ constexpr auto [...__is] = _IotaArray<__nargs - __skip_front - __skip_back>;
+ constexpr int __new_offset = _Offset.value - (0 + ... + _Vs...[__skip]::size.value);
+ return __extract_simd_at<_Dst>(cw<__new_offset>, __xs...[__is + __skip_front]...);
+ }
+ else if constexpr (_Adst::_S_size == 1)
+ { // trivial conversion to one value_type
+ return _Dst(__x0[_Offset.value]);
+ }
+ else if constexpr (_Afirst::_S_nreg >= 2 || _Alast::_S_nreg >= 2)
+ { // flatten first and/or last multi-register argument
+ constexpr bool __flatten_first = _Afirst::_S_nreg >= 2;
+ constexpr bool __flatten_last = __nargs > 1 && _Alast::_S_nreg >= 2;
+ constexpr auto [...__is] = _IotaArray<__nargs - __flatten_first - __flatten_last>;
+ if constexpr (__flatten_first && __flatten_last)
+ return __extract_simd_at<_Dst>(
+ _Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]...,
+ __xlast._M_get_low(), __xlast._M_get_high());
+ else if constexpr (__flatten_first)
+ return __extract_simd_at<_Dst>(
+ _Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]...);
+ else
+ return __extract_simd_at<_Dst>(
+ _Offset, __xs...[__is]..., __xlast._M_get_low(), __xlast._M_get_high());
+ }
+ else if constexpr (__simd_mask_type<_Dst>
+ && ((_Adst::_S_variant != _Vs::abi_type::_S_variant
+ && !__scalar_abi_tag<typename _Vs::abi_type>) || ...))
+ { // convert ABI tag if incompatible
+ return __extract_simd_at<_Dst>(
+ _Offset, static_cast<const resize_t<_Vs::size.value, _Dst>&>(__xs)...);
+ }
+
+ // at this point __xs should be as small as possible; there may be some corner cases left
+
+ else if constexpr (__nargs == 1)
+ { // simple and optimal
+ if constexpr (__use_bitmask)
+ return _Dst(_Ret(__x0._M_to_uint() >> _Offset.value));
+ else
+ return _VecOps<_Ret>::_S_extract(__x0._M_concat_data(false), _Offset);
+ }
+ else if constexpr (__use_bitmask)
+ { // fairly simple and optimal bit shifting solution
+ static_assert(_Afirst::_S_nreg == 1);
+ static_assert(_Offset.value < _Afirst::_S_size);
+ int __offset = -_Offset.value;
+ _Ret __r;
+ template for (const auto& __x : {__xs...})
+ {
+ if (__offset <= 0)
+ __r = _Ret(__x._M_to_uint() >> -__offset);
+ else if (__offset < _Adst::_S_size)
+ __r |= _Ret(_Ret(__x._M_to_uint()) << __offset);
+ __offset += __x.size.value;
+ }
+ return _Dst(__r);
+ }
+ else if constexpr (__nargs == 2 && _Offset == 0 && _Adst::_S_nreg == 1
+ && _Afirst::_S_size >= _Alast::_S_size
+ && __has_single_bit(unsigned(_Afirst::_S_size)))
+ { // simple __vec_concat
+ if constexpr (_Afirst::_S_size == 1)
+ // even simpler init from two values
+ return _Ret{__x0._M_concat_data()[0], __xlast._M_concat_data()[0]};
+ else
+ {
+ const auto __v0 = __x0._M_concat_data();
+ const auto __v1 = __vec_zero_pad_to<sizeof(__v0)>(__xlast._M_concat_data());
+ return __vec_concat(__v0, __v1);
+ }
+ }
+ else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0
+ && _Afirst::_S_nreg == 1 && _Alast::_S_size == 1)
+ { // optimize insertion of one element at the end
+ _Ret __r = __vec_zero_pad_to<sizeof(_Ret)>(__x0._M_get());
+ __vec_set(__r, _Afirst::_S_size, __xlast._M_concat_data()[0]);
+ return __r;
+ }
+ else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0
+ && _Afirst::_S_nreg == 1 && _Alast::_S_size == 2)
+ { // optimize insertion of two elements at the end
+ _Ret __r = __vec_zero_pad_to<sizeof(_Ret)>(__x0._M_concat_data());
+ const auto __x1 = __xlast._M_concat_data();
+ if constexpr (sizeof(__x1) <= sizeof(double) && (_Afirst::_S_size & 1) == 0)
+ { // can use a single insert instruction
+ using _Up = __conditional_t<
+ is_floating_point_v<__vec_value_type<_Ret>>,
+ __conditional_t<sizeof(__x1) == sizeof(double), double, float>,
+ __integer_from<sizeof(__x1)>>;
+ auto __r2 = __vec_bit_cast<_Up>(__r);
+ __vec_set(__r2, _Afirst::_S_size / 2, __vec_bit_cast<_Up>(__x1)[0]);
+ __r = reinterpret_cast<_Ret>(__r2);
+ }
+ else
+ {
+ __vec_set(__r, _Afirst::_S_size, __x1[0]);
+ __vec_set(__r, _Afirst::_S_size + 1, __x1[1]);
+ }
+ return __r;
+ }
+ else if constexpr (__nargs == 2 && _Afirst::_S_nreg == 1 && _Alast::_S_nreg == 1)
+ { // optimize concat of two input vectors (e.g. using palignr)
+ constexpr auto [...__is] = _IotaArray<__dst_full_size>;
+ constexpr int __v2_offset = __width_of<decltype(__x0._M_concat_data())>;
+ return __builtin_shufflevector(
+ __x0._M_concat_data(), __xlast._M_concat_data(), [](int __i) consteval {
+ if (__i < _Afirst::_S_size)
+ return __i;
+ __i -= _Afirst::_S_size;
+ if (__i < _Alast::_S_size)
+ return __i + __v2_offset;
+ else
+ return -1;
+ }(__is + _Offset.value)...);
+ }
+ else if (__is_const_known(__xs...) || __ninputs == _Adst::_S_size)
+ { // hard to optimize for the compiler, but necessary in constant expressions
+ return _VecOps<_Ret>::_S_extract(
+ __vec_concat_sized<__xs.size.value...>(__xs._M_concat_data(false)...),
+ _Offset);
+ }
+ else
+ { // fallback to concatenation in memory => load the result
+ alignas(_Ret) __vec_value_type<_Ret>
+ __tmp[std::max(__ninputs, _Offset.value + __dst_full_size)] = {};
+ int __offset = 0;
+ template for (const auto& __x : {__xs...})
+ {
+ if constexpr (__simd_mask_type<_Dst>)
+ (-__x)._M_store(__tmp + __offset);
+ else
+ __x._M_store(__tmp + __offset);
+ __offset += __x.size.value;
+ }
+ _Ret __r;
+ __builtin_memcpy(&__r, __tmp + _Offset.value, sizeof(_Ret));
+ return __r;
+ }
+ }
+ }
+
+ // [simd.mask] --------------------------------------------------------------
+ template <size_t _Bytes, typename _Ap>
+ class basic_mask
+ {
+ public:
+ using value_type = bool;
+
+ using abi_type = _Ap;
+
+#define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \
+ "of template arguments to basic_mask."
+
+ basic_mask() = delete(_GLIBCXX_DELETE_SIMD);
+
+ ~basic_mask() = delete(_GLIBCXX_DELETE_SIMD);
+
+ basic_mask(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD);
+
+ basic_mask& operator=(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD);
+
+#undef _GLIBCXX_DELETE_SIMD
+ };
+
+ template <size_t _Bytes, typename _Ap>
+ class _MaskBase
+ {
+ using _Mp = basic_mask<_Bytes, _Ap>;
+
+ protected:
+ using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
+
+ static_assert(destructible<_VecType> || _Bytes > sizeof(0ull));
+
+ public:
+ using iterator = __iterator<_Mp>;
+
+ using const_iterator = __iterator<const _Mp>;
+
+ constexpr iterator
+ begin() noexcept
+ { return {static_cast<_Mp&>(*this), 0}; }
+
+ constexpr const_iterator
+ begin() const noexcept
+ { return cbegin(); }
+
+ constexpr const_iterator
+ cbegin() const noexcept
+ { return {static_cast<const _Mp&>(*this), 0}; }
+
+ constexpr default_sentinel_t
+ end() const noexcept
+ { return {}; }
+
+ constexpr default_sentinel_t
+ cend() const noexcept
+ { return {}; }
+
+ static constexpr auto size = __simd_size_c<_Ap::_S_size>;
+
+ _MaskBase() = default;
+
+ // LWG issue from 2026-03-04 / P4042R0
+ template <size_t _UBytes, typename _UAbi>
+ requires (_Ap::_S_size != _UAbi::_S_size)
+ explicit
+ _MaskBase(const basic_mask<_UBytes, _UAbi>&) = delete("size mismatch");
+
+ template <typename _Up, typename _UAbi>
+ explicit
+ _MaskBase(const basic_vec<_Up, _UAbi>&)
+ = delete("use operator! or a comparison to convert a vec into a mask");
+
+ template <typename _Up, typename _UAbi>
+ requires (_Ap::_S_size != _UAbi::_S_size)
+ operator basic_vec<_Up, _UAbi>() const
+ = delete("size mismatch");
+ };
+
+ template <size_t _Bytes, __abi_tag _Ap>
+ requires (_Ap::_S_nreg == 1)
+ class basic_mask<_Bytes, _Ap>
+ : public _MaskBase<_Bytes, _Ap>
+ {
+ using _Base = _MaskBase<_Bytes, _Ap>;
+
+ using _VecType = _Base::_VecType;
+
+ template <size_t, typename>
+ friend class basic_mask;
+
+ template <typename, typename>
+ friend class basic_vec;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
+ using _DataType = typename _Ap::template _MaskDataType<_Bytes>;
+
+ static constexpr bool _S_has_bool_member = is_same_v<_DataType, bool>;
+
+ static constexpr bool _S_is_scalar = _S_has_bool_member;
+
+ static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask;
+
+ static constexpr int _S_full_size = [] {
+ if constexpr (_S_is_scalar)
+ return _S_size;
+ else if constexpr (_S_use_bitmask && _S_size < __CHAR_BIT__)
+ return __CHAR_BIT__;
+ else
+ return __bit_ceil(unsigned(_S_size));
+ }();
+
+ static constexpr bool _S_is_partial = _S_size != _S_full_size;
+
+ static constexpr _DataType _S_implicit_mask = [] {
+ if constexpr (_S_is_scalar)
+ return true;
+ else if (!_S_is_partial)
+ return _DataType(~_DataType());
+ else if constexpr (_S_use_bitmask)
+ return _DataType((_DataType(1) << _S_size) - 1);
+ else
+ {
+ constexpr auto [...__is] = _IotaArray<_S_full_size>;
+ return _DataType{ (__is < _S_size ? -1 : 0)... };
+ }
+ }();
+
+ // Actual padding bytes, not padding elements.
+ // => _S_padding_bytes is 0 even if _S_is_partial is true.
+ static constexpr size_t _S_padding_bytes = 0;
+
+ _DataType _M_data;
+
+ public:
+ using value_type = bool;
+
+ using abi_type = _Ap;
+
+ using iterator = _Base::iterator;
+
+ using const_iterator = _Base::const_iterator;
+
+ // internal but public API ----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(_DataType __x)
+ {
+ basic_mask __r;
+ __r._M_data = __x;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(unsigned_integral auto __bits)
+ { return basic_mask(__bits); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _DataType&
+ _M_get() const
+ { return _M_data; }
+
+ /** @internal
+ * Bit-cast the given object @p __x to basic_mask.
+ *
+ * This is necessary for _S_nreg > 1 where the last element can be bool or when the sizeof
+ * doesn't match because of different alignment requirements of the sub-masks.
+ */
+ template <size_t _UBytes, typename _UAbi>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+ { return __builtin_bit_cast(basic_mask, __x._M_concat_data()); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data(bool __do_sanitize = _S_is_partial) const
+ {
+ if constexpr (_S_is_scalar)
+ return __vec_builtin_type<__integer_from<_Bytes>, 1>{__integer_from<_Bytes>(-_M_data)};
+ else
+ {
+ if constexpr (_S_is_partial)
+ if (__do_sanitize)
+ return _DataType(_M_data & _S_implicit_mask);
+ return _M_data;
+ }
+ }
+
+ /** @internal
+ * Returns a mask where the first @p __n elements are true. All remaining elements are false.
+ *
+ * @pre @p __n > 0 && @p __n < _S_size
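+       *
+       * Example: for _S_size == 8, _S_partial_mask_of_n(3) yields
+       * {true, true, true, false, false, false, false, false}.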
+ */
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_partial_mask_of_n(int __n)
+ {
+ static_assert(!_S_is_scalar);
+ if constexpr (!_S_use_bitmask)
+ {
+ using _Ip = __integer_from<_Bytes>;
+ __glibcxx_simd_precondition(__n >= 0 && __n <= numeric_limits<_Ip>::max(),
+ "_S_partial_mask_of_n without _S_use_bitmask requires "
+ "positive __n that does not overflow.");
+ constexpr _DataType __0123
+ = __builtin_bit_cast(_DataType, _IotaArray<_Ip(_S_full_size)>);
+ return basic_mask(__0123 < _Ip(__n));
+ }
+ else
+ {
+ __glibcxx_simd_precondition(__n >= 0 && __n <= 255,
+ "The x86 BZHI instruction requires __n to "
+ "only use bits 0:7");
+#if __has_builtin(__builtin_ia32_bzhi_si)
+ if constexpr (_S_size <= 32 && _Traits._M_have_bmi2())
+ return _S_init(_Bitmask<_S_size>(
+ __builtin_ia32_bzhi_si(~0u >> (32 - _S_size), unsigned(__n))));
+#endif
+#if __has_builtin(__builtin_ia32_bzhi_di)
+ else if constexpr (_S_size <= 64 && _Traits._M_have_bmi2())
+ return _S_init(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n)));
+#endif
+ if constexpr (_S_size <= 32)
+ {
+ __glibcxx_simd_precondition(__n < 32, "invalid shift");
+ return _S_init(_Bitmask<_S_size>((1u << unsigned(__n)) - 1));
+ }
+ else if constexpr (_S_size <= 64)
+ {
+ __glibcxx_simd_precondition(__n < 64, "invalid shift");
+ return _S_init((1ull << unsigned(__n)) - 1);
+ }
+ else
+ static_assert(false);
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_and_neighbors()
+ {
+ if constexpr (_S_use_bitmask)
+ _M_data &= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+ | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+ else
+ _M_data &= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_or_neighbors()
+ {
+ if constexpr (_S_use_bitmask)
+ _M_data |= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+ | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+ else
+ _M_data |= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+ return *this;
+ }
+
+ template <typename _Mp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Mp::_S_size;
+ constexpr int __rem = _S_size % _Mp::_S_size;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ if constexpr (__rem == 0)
+ return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)...};
+ else
+ {
+ using _Rest = resize_t<__rem, _Mp>;
+ return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)...,
+ __extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, *this));
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr const basic_mask&
+ _S_concat(const basic_mask& __x0) noexcept
+ { return __x0; }
+
+ template <typename... _As>
+ requires (sizeof...(_As) > 1)
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept
+ {
+ static_assert(_S_size == (_As::_S_size + ...));
+ return __extract_simd_at<basic_mask>(cw<0>, __xs...);
+ }
+
+ // [simd.mask.overview] default constructor -----------------------------
+ basic_mask() = default;
+
+ // [simd.mask.overview] conversion extensions ---------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(_DataType __x) requires(!_S_is_scalar && !_S_use_bitmask)
+ : _M_data(__x)
+ {}
+
+ [[__gnu__::__always_inline__]]
+ constexpr
+ operator _DataType() requires(!_S_is_scalar && !_S_use_bitmask)
+ { return _M_data; }
+
+ // [simd.mask.ctor] broadcast constructor -------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(same_as<bool> auto __x) noexcept // LWG 4382.
+ : _M_data(__x ? _S_implicit_mask : _DataType())
+ {}
+
+ // [simd.mask.ctor] conversion constructor ------------------------------
+ template <size_t _UBytes, typename _UAbi>
+ requires (_S_size == _UAbi::_S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+ basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ using _UV = basic_mask<_UBytes, _UAbi>;
+ // bool to bool
+ if constexpr (_S_is_scalar)
+ return __x[0];
+
+ // converting from an "array of bool"
+ else if constexpr (_UV::_S_is_scalar)
+ {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ if constexpr (_S_use_bitmask)
+ return ((_DataType(__x[__is]) << __is) | ...);
+ else
+ return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
+ }
+
+ // vec-/bit-mask to bit-mask | bit-mask to vec-mask
+ else if constexpr (_S_use_bitmask || _UV::_S_use_bitmask)
+ return basic_mask(__x.to_bitset())._M_data;
+
+ // vec-mask to vec-mask
+ else if constexpr (_Bytes == _UBytes)
+ return _S_recursive_bit_cast(__x)._M_data;
+
+ else
+ {
+#if _GLIBCXX_X86
+ // TODO: turn this into a __vec_mask_cast overload in simd_x86.h
+ if constexpr (_Bytes == 1 && _UBytes == 2)
+ if (!__is_const_known(__x))
+ {
+ if constexpr (_UAbi::_S_nreg == 1)
+ return __x86_cvt_vecmask<_DataType>(__x._M_data);
+ else if constexpr (_UAbi::_S_nreg == 2)
+ {
+ auto __lo = __x._M_data0._M_data;
+ auto __hi = __vec_zero_pad_to<sizeof(__lo)>(
+ __x._M_data1._M_concat_data());
+ return __x86_cvt_vecmask<_DataType>(__lo, __hi);
+ }
+ }
+#endif
+ return __vec_mask_cast<_DataType>(__x._M_concat_data());
+ }
+ }())
+ {}
+
+ using _Base::_MaskBase;
+
+ // [simd.mask.ctor] generator constructor -------------------------------
+ template <__simd_generator_invokable<bool, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Fp&& __gen)
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ if constexpr (_S_is_scalar)
+ return __gen(__simd_size_c<0>);
+ else if constexpr (_S_use_bitmask)
+ return _DataType(((_DataType(__gen(__simd_size_c<__is>)) << __is)
+ | ...));
+ else
+ return _DataType{__vec_value_type<_DataType>(
+ __gen(__simd_size_c<__is>) ? -1 : 0)...};
+ }())
+ {}
+
+ // [simd.mask.ctor] bitset constructor ----------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(const same_as<bitset<_S_size>> auto& __b) noexcept // LWG 4382.
+ : basic_mask(static_cast<_Bitmask<_S_size>>(__b.to_ullong()))
+ {
+ // more than 64 elements in one register? not yet.
+ static_assert(_S_size <= numeric_limits<unsigned long long>::digits);
+ }
+
+ // [simd.mask.ctor] uint constructor ------------------------------------
+ template <unsigned_integral _Tp>
+ requires (!same_as<_Tp, bool>) // LWG 4382.
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Tp __val) noexcept
+ : _M_data([&] [[__gnu__::__always_inline__]] () {
+ if constexpr (_S_use_bitmask)
+ return __val;
+ else if constexpr (_S_is_scalar)
+ return bool(__val & 1);
+ else if (__is_const_known(__val))
+ {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ return _DataType {__vec_value_type<_DataType>((__val & (1ull << __is)) == 0
+ ? 0 : -1)...};
+ }
+ else
+ {
+ using _Ip = typename _VecType::value_type;
+ _VecType __v0 = _Ip(__val);
+ constexpr int __bits_per_element = sizeof(_Ip) * __CHAR_BIT__;
+ constexpr _VecType __pow2 = _VecType(1) << (__iota<_VecType> % __bits_per_element);
+ if constexpr (_S_size < __bits_per_element)
+ return ((__v0 & __pow2) > 0)._M_concat_data();
+ else if constexpr (_S_size == __bits_per_element)
+ return ((__v0 & __pow2) != 0)._M_concat_data();
+ else
+ {
+ static_assert(_Bytes == 1);
+ static_assert(sizeof(_Ip) == 1);
+ _Bitmask<_S_size> __bits = __val;
+ static_assert(sizeof(_VecType) % sizeof(__bits) == 0);
+ if constexpr (sizeof(_DataType) == 32)
+ {
+ __vec_builtin_type<_UInt<8>, 4> __v1 = {
+ 0xffu & (__bits >> (0 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (1 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (2 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (3 * __CHAR_BIT__)),
+ };
+ __v1 *= 0x0101'0101'0101'0101ull;
+ __v0 = __builtin_bit_cast(_VecType, __v1);
+ return ((__v0 & __pow2) != 0)._M_data;
+ }
+ else
+ {
+ using _V1 = vec<_Ip, sizeof(__bits)>;
+ _V1 __v1 = __builtin_bit_cast(_V1, __bits);
+ __v0 = _VecType::_S_static_permute(__v1, [](int __i) {
+ return __i / __CHAR_BIT__;
+ });
+ return ((__v0 & __pow2) != 0)._M_data;
+ }
+ }
+ }
+ }())
+ {}
+
+    // Effects (spec wording for the preceding unsigned-integer constructor): Initializes the
+    // first M elements to the corresponding bit values in val, where M is the smaller of size()
+    // and the number of bits in the value representation ([basic.types.general]) of the type of
+    // val. If M is less than size(), the remaining elements are initialized to zero.
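+
+    // Illustrative sketch (hypothetical values): for a 4-element mask, basic_mask(0b0101u)
+    // yields the elements {true, false, true, false}, i.e. bit i of the argument initializes
+    // element i.
+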
+ // [simd.mask.subscr] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ return bool((_M_data >> __i) & 1);
+ else
+ return _M_data[__i] & 1;
+ }
+
+ // [simd.mask.unary] ----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask
+ operator!() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _S_init(!_M_data);
+ else
+ return _S_init(~_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator+() const noexcept requires destructible<_VecType>
+ { return operator _VecType(); }
+
+ constexpr _VecType
+ operator+() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator-() const noexcept requires destructible<_VecType>
+ {
+ using _Ip = typename _VecType::value_type;
+ if constexpr (_S_is_scalar)
+ return _Ip(-int(_M_data));
+ else if constexpr (_S_use_bitmask)
+ return __select_impl(*this, _Ip(-1), _Ip());
+ else
+ {
+ static_assert(sizeof(_VecType) == sizeof(_M_data));
+ return __builtin_bit_cast(_VecType, _M_data);
+ }
+ }
+
+ constexpr _VecType
+ operator-() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator~() const noexcept requires destructible<_VecType>
+ {
+ using _Ip = typename _VecType::value_type;
+ if constexpr (_S_is_scalar)
+ return _Ip(~int(_M_data));
+ else if constexpr (_S_use_bitmask)
+ return __select_impl(*this, _Ip(-2), _Ip(-1));
+ else
+ {
+ static_assert(sizeof(_VecType) == sizeof(_M_data));
+ return __builtin_bit_cast(_VecType, _M_data) - _Ip(1);
+ }
+ }
+
+ constexpr _VecType
+ operator~() const noexcept = delete;
+
+ // [simd.mask.conv] -----------------------------------------------------
+ template <typename _Up, typename _UAbi>
+ requires (_UAbi::_S_size == _S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(sizeof(_Up) != _Bytes)
+ operator basic_vec<_Up, _UAbi>() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _Up(_M_data);
+ else
+ {
+ using _UV = basic_vec<_Up, _UAbi>;
+ return __select_impl(static_cast<_UV::mask_type>(*this), _UV(1), _UV(0));
+ }
+ }
+
+ using _Base::operator basic_vec;
+
+ // [simd.mask.namedconv] ------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bitset<_S_size>
+ to_bitset() const noexcept
+ {
+ // more than 64 elements in one register? not yet.
+ static_assert(_S_size <= numeric_limits<unsigned long long>::digits);
+ return to_ullong();
+ }
+
+ /** @internal
+ * Return the mask as the smallest possible unsigned integer (up to 64 bits).
+ *
+ * @tparam _Offset Adjust the return type & value to start at bit @p _Offset.
+ */
+ template <int _Offset = 0, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr _Bitmask<_S_size + _Offset>
+ _M_to_uint() const
+ {
+ constexpr int __nbits = _S_size;
+ static_assert(__nbits + _Offset <= numeric_limits<unsigned long long>::digits);
+ // before shifting
+ using _U0 = _Bitmask<__nbits>;
+ // potentially wider type needed for shift by _Offset
+ using _Ur = _Bitmask<__nbits + _Offset>;
+ if constexpr (_S_is_scalar || _S_use_bitmask)
+ {
+ auto __bits = _M_data;
+ if constexpr (_S_is_partial)
+ __bits &= _S_implicit_mask;
+ return _Ur(__bits) << _Offset;
+ }
+ else
+ {
+#if _GLIBCXX_X86
+ if (!__is_const_known(*this))
+ {
+ _U0 __uint;
+ if constexpr (_Bytes != 2) // movmskb would duplicate each bit
+ __uint = _U0(__x86_movmsk(_M_data));
+ else if constexpr (_Bytes == 2 && _Traits._M_have_bmi2())
+ __uint = __bit_extract_even<__nbits>(__x86_movmsk(_M_data));
+ else if constexpr (_Bytes == 2)
+ return __similar_mask<char, __nbits, _Ap>(*this).template _M_to_uint<_Offset>();
+ else
+ static_assert(false);
+ // TODO: with AVX512 use __builtin_ia32_cvt[bwdq]2mask(128|256|512)
+ // TODO: Ask for compiler builtin to do the best of the above. This should also
+ // combine with a preceding vector-mask compare to produce a bit-mask compare (on
+ // AVX512)
+ if constexpr (_S_is_partial)
+ __uint &= (_U0(1) << _S_size) - 1;
+ return _Ur(__uint) << _Offset;
+ }
+#endif
+ using _IV = _VecType;
+ static_assert(destructible<_IV>);
+ const typename _IV::mask_type& __k = [&] [[__gnu__::__always_inline__]] () {
+ if constexpr (is_same_v<typename _IV::mask_type, basic_mask>)
+ return *this;
+ else
+ return typename _IV::mask_type(*this);
+ }();
+ constexpr int __n = _IV::size();
+ if constexpr (_Bytes * __CHAR_BIT__ >= __n) // '1 << __iota' cannot overflow
+ { // reduce(select(k, powers_of_2, 0))
+ constexpr _IV __pow2 = _IV(1) << __iota<_IV>;
+ return _Ur(_U0(__select_impl(__k, __pow2, _IV())
+ ._M_reduce(bit_or<>()))) << _Offset;
+ }
+ else if constexpr (__n % __CHAR_BIT__ != 0)
+ { // recurse after splitting in two
+ constexpr int __n_lo = __n - __n % __CHAR_BIT__;
+ const auto [__lo, __hi] = chunk<__n_lo>(__k);
+ _Ur __bits = __hi.template _M_to_uint<_Offset + __n_lo>();
+ return __bits | __lo.template _M_to_uint<_Offset>();
+ }
+ else
+ { // limit powers_of_2 to 1, 2, 4, ..., 128
+ constexpr _IV __pow2 = _IV(1) << (__iota<_IV> % _IV(__CHAR_BIT__));
+ _IV __x = __select_impl(__k, __pow2, _IV());
+ // partial reductions of 8 neighboring elements
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<4>());
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<2>());
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<1>());
+ // permute partial reduction results to the front
+ __x = _IV::_S_static_permute(__x, [](int __i) {
+ return __i * 8 < __n ? __i * 8 : uninit_element;
+ });
+ // extract front as scalar unsigned
+ _U0 __bits = __builtin_bit_cast(
+ __similar_vec<_U0, __n * _Bytes / sizeof(_U0), _Ap>, __x)[0];
+ // mask off unused bits
+ if constexpr (!__has_single_bit(unsigned(__nbits)))
+ __bits &= (_U0(1) << __nbits) - 1;
+ return _Ur(__bits) << _Offset;
+ }
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr unsigned long long
+ to_ullong() const
+ { return _M_to_uint(); }
+
+ // [simd.mask.binary] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data & __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data | __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data & __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data | __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data ^ __y._M_data); }
+
+ // [simd.mask.cassign] --------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data &= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data |= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data ^= __y._M_data;
+ return __x;
+ }
+
+ // [simd.mask.comparison] -----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !(__x ^ __y); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x ^ __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x || !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x || __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x && !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x && __y; }
+
+ // [simd.mask.cond] -----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+ {
+ if constexpr (!_S_use_bitmask)
+ {
+#if _GLIBCXX_X86
+ // this works around bad code-gen when the compiler can't see that __k is a vector-mask.
+	    // This pattern is recognized to match the x86 blend instructions, which only consider
+ // the sign bit of the mask register. Also, without SSE4, if the compiler knows that __k
+ // is a vector-mask, then the '< 0' is elided.
+ return __k._M_data < 0 ? __t._M_data : __f._M_data;
+#endif
+ return __k._M_data ? __t._M_data : __f._M_data;
+ }
+ else
+ return (__k._M_data & __t._M_data) | (~__k._M_data & __f._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+ {
+ if (__t == __f)
+ return basic_mask(__t);
+ else
+ return __t ? __k : !__k;
+ }
+
+ template <__vectorizable _T0, same_as<_T0> _T1>
+ requires (sizeof(_T0) == _Bytes)
+ [[__gnu__::__always_inline__]]
+ friend constexpr vec<_T0, _S_size>
+ __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return __k._M_data ? __t : __f;
+ else
+ {
+ using _Vp = vec<_T0, _S_size>;
+ using _Mp = typename _Vp::mask_type;
+ return __select_impl(_Mp(__k), _Vp(__t), _Vp(__f));
+ }
+ }
+
+ // [simd.mask.reductions] implementation --------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_all_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) == _S_implicit_mask;
+ else
+ return _M_data == _S_implicit_mask;
+ }
+#if _GLIBCXX_X86
+ else if (!__is_const_known(_M_data))
+ return __x86_vecmask_all<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_all_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_any_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) != 0;
+ else
+ return _M_data != 0;
+ }
+#if _GLIBCXX_X86
+ else if (!__is_const_known(_M_data))
+ return __x86_vecmask_any<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_any_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_none_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return !_M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) == 0;
+ else
+ return _M_data == 0;
+ }
+#if _GLIBCXX_X86
+ else if (!__is_const_known(_M_data))
+ return __x86_vecmask_none<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_none_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_count() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return int(_M_data);
+ else if constexpr (_S_size <= numeric_limits<unsigned>::digits)
+ return __builtin_popcount(_M_to_uint());
+ else
+ return __builtin_popcountll(to_ullong());
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_min_index() const
+ {
+ const auto __bits = _M_to_uint();
+ __glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index.");
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __countr_zero(__bits);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_max_index() const
+ {
+ const auto __bits = _M_to_uint();
+ __glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index.");
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __highest_bit(__bits);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr bool
+ __is_const_known(const basic_mask& __x)
+ { return __builtin_constant_p(__x._M_data); }
+ };
+
+ template <size_t _Bytes, __abi_tag _Ap>
+ requires (_Ap::_S_nreg > 1)
+ class basic_mask<_Bytes, _Ap>
+ : public _MaskBase<_Bytes, _Ap>
+ {
+ using _Base = _MaskBase<_Bytes, _Ap>;
+
+ using _VecType = _Base::_VecType;
+
+ template <size_t, typename>
+ friend class basic_mask;
+
+ template <typename, typename>
+ friend class basic_vec;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
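+ // Split the mask into two parts: _N0 is half of the power-of-two-padded size, so the low
+ // part is never partial; the high part holds the remaining _N1 elements (_N0 >= _N1).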
+ static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
+
+ static constexpr int _N1 = _S_size - _N0;
+
+ static constexpr int _Nreg0 = __bit_ceil(unsigned(_Ap::_S_nreg)) / 2;
+
+ static constexpr int _Nreg1 = _Ap::_S_nreg - _Nreg0;
+
+ // Explicitly request _Nreg0 rather than using __abi_rebind. This way _Float16 can use half
+ // of the native registers (since they convert to full float32 registers).
+ using _Abi0 = decltype(_Ap::template _S_resize<_N0, _Nreg0>());
+
+ using _Abi1 = decltype(_Ap::template _S_resize<_N1, _Nreg1>());
+
+ using _Mask0 = basic_mask<_Bytes, _Abi0>;
+
+ // the implementation (and users) depend on elements being contiguous in memory
+ static_assert(_Mask0::_S_padding_bytes == 0 && !_Mask0::_S_is_partial);
+
+ using _Mask1 = basic_mask<_Bytes, _Abi1>;
+
+ static constexpr bool _S_is_partial = _Mask1::_S_is_partial;
+
+ // _Ap::_S_nreg determines how deep the recursion goes. E.g. basic_mask<4, _Abi<8, 4>> cannot
+ // use basic_mask<4, _Abi<4, 1>> as _Mask0/1 types.
+ static_assert(_Mask0::abi_type::_S_nreg + _Mask1::abi_type::_S_nreg == _Ap::_S_nreg);
+
+ static constexpr bool _S_use_bitmask = _Mask0::_S_use_bitmask;
+
+ static constexpr bool _S_is_scalar = _Mask0::_S_is_scalar;
+
+ _Mask0 _M_data0;
+
+ _Mask1 _M_data1;
+
+ static constexpr bool _S_has_bool_member = _Mask1::_S_has_bool_member;
+
+ // by construction _N0 >= _N1
+ // => sizeof(_Mask0) >= sizeof(_Mask1)
+ // and __alignof__(_Mask0) >= __alignof__(_Mask1)
+ static constexpr size_t _S_padding_bytes
+ = (__alignof__(_Mask0) == __alignof__(_Mask1)
+ ? 0 : __alignof__(_Mask0) - (sizeof(_Mask1) % __alignof__(_Mask0)))
+ + _Mask1::_S_padding_bytes;
+
+ public:
+ using value_type = bool;
+
+ using abi_type = _Ap;
+
+ using iterator = _Base::iterator;
+
+ using const_iterator = _Base::const_iterator;
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(const _Mask0& __x, const _Mask1& __y)
+ {
+ basic_mask __r;
+ __r._M_data0 = __x;
+ __r._M_data1 = __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(unsigned_integral auto __bits)
+ { return basic_mask(__bits); }
+
+ template <typename _U0, typename _U1>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(const __trivial_pair<_U0, _U1>& __bits)
+ {
+ if constexpr (is_unsigned_v<_U0>)
+ {
+ static_assert(is_unsigned_v<_U1>);
+ return _S_init(_Mask0(__bits._M_first), _Mask1(__bits._M_second));
+ }
+ else if constexpr (is_unsigned_v<_U1>)
+ return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1(__bits._M_second));
+ else
+ return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1::_S_init(__bits._M_second));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _Mask0&
+ _M_get_low() const
+ { return _M_data0; }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _Mask1&
+ _M_get_high() const
+ { return _M_data1; }
+
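+ // Reinterpret a mask with the same element count as basic_mask: a plain bit_cast where the
+ // object representations allow it, otherwise recurse into the two halves.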
+ template <size_t _UBytes, typename _UAbi>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+ {
+ using _Mp = basic_mask<_UBytes, _UAbi>;
+ if constexpr (_Mp::_S_has_bool_member || sizeof(basic_mask) > sizeof(__x)
+ || _Mp::_S_padding_bytes != 0)
+ return _S_init(__builtin_bit_cast(_Mask0, __x._M_data0),
+ _Mask1::_S_recursive_bit_cast(__x._M_data1));
+ else if constexpr (sizeof(basic_mask) == sizeof(__x))
+ return __builtin_bit_cast(basic_mask, __x);
+ else
+ { // e.g. on IvyBridge (different alignment => different sizeof)
+ struct _Tmp { alignas(_Mp) basic_mask _M_data; };
+ return __builtin_bit_cast(_Tmp, __x)._M_data;
+ }
+ }
+
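+ // Concatenate the data of both halves into a single bitmask or vector builtin;
+ // __do_sanitize requests that the (possibly partial) high half zero its padding.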
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data(bool __do_sanitize = _S_is_partial) const
+ {
+ if constexpr (_S_use_bitmask)
+ {
+ static_assert(_S_size <= numeric_limits<unsigned long long>::digits,
+ "cannot concat more than 64 bits");
+ using _Up = _Bitmask<_S_size>;
+ return _Up(_M_data0._M_concat_data() | (_Up(_M_data1._M_concat_data(__do_sanitize)) << _N0));
+ }
+ else
+ {
+ auto __lo = _M_data0._M_concat_data();
+ auto __hi = __vec_zero_pad_to<sizeof(__lo)>(_M_data1._M_concat_data(__do_sanitize));
+ return __vec_concat(__lo, __hi);
+ }
+ }
+
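+ // Build a mask in which the first __n elements are true and all following elements are false.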
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_partial_mask_of_n(int __n)
+ {
+#if __has_builtin(__builtin_ia32_bzhi_di)
+ if constexpr (_S_use_bitmask && _S_size <= 64 && _Traits._M_have_bmi2())
+ return basic_mask(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n)));
+#endif
+ if constexpr (_N0 == 1)
+ {
+ static_assert(_S_size == 2); // => __n == 1
+ return _S_init(_Mask0(true), _Mask1(false));
+ }
+ else if (__n < _N0)
+ return _S_init(_Mask0::_S_partial_mask_of_n(__n), _Mask1(false));
+ else if (__n == _N0 || _N1 == 1)
+ return _S_init(_Mask0(true), _Mask1(false));
+ else if constexpr (_N1 != 1)
+ return _S_init(_Mask0(true), _Mask1::_S_partial_mask_of_n(__n - _N0));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_and_neighbors()
+ {
+ _M_data0._M_and_neighbors();
+ _M_data1._M_and_neighbors();
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_or_neighbors()
+ {
+ _M_data0._M_or_neighbors();
+ _M_data1._M_or_neighbors();
+ return *this;
+ }
+
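+ // Split this mask into masks of type _Mp: an array if _Mp::_S_size divides _S_size,
+ // otherwise a tuple whose last element covers the remaining elements.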
+ template <typename _Mp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Mp::_S_size;
+ constexpr int __rem = _S_size % _Mp::_S_size;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ if constexpr (__rem == 0)
+ return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>,
+ _M_data0, _M_data1)...};
+ else
+ {
+ using _Rest = resize_t<__rem, _Mp>;
+ return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, _M_data0, _M_data1)...,
+ __extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, _M_data0, _M_data1));
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_concat(const basic_mask& __x0) noexcept
+ { return __x0; }
+
+ template <typename... _As>
+ requires (sizeof...(_As) >= 2)
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept
+ {
+ static_assert(_S_size == (_As::_S_size + ...));
+ return _S_init(__extract_simd_at<_Mask0>(cw<0>, __xs...),
+ __extract_simd_at<_Mask1>(cw<_N0>, __xs...));
+ }
+
+ // [simd.mask.overview] default constructor -----------------------------
+ basic_mask() = default;
+
+ // [simd.mask.overview] conversion extensions ---------------------------
+ // TODO: any?
+
+ // [simd.mask.ctor] broadcast constructor -------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(same_as<bool> auto __x) noexcept // LWG 4382.
+ : _M_data0(__x), _M_data1(__x)
+ {}
+
+ // [simd.mask.ctor] conversion constructor ------------------------------
+ template <size_t _UBytes, typename _UAbi>
+ requires (_S_size == _UAbi::_S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+ basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+ : _M_data0([&] {
+ if constexpr (_UAbi::_S_nreg > 1)
+ {
+ return __x._M_data0;
+ }
+ else if constexpr (_N0 == 1)
+ return _Mask0(__x[0]);
+ else
+ return get<0>(chunk<_N0>(__x));
+ }()),
+ _M_data1([&] {
+ if constexpr (_UAbi::_S_nreg > 1)
+ {
+ return __x._M_data1;
+ }
+ else if constexpr (_N1 == 1)
+ return _Mask1(__x[_N0]);
+ else
+ return get<1>(chunk<_N0>(__x));
+ }())
+ {}
+
+ using _Base::_MaskBase;
+
+ // [simd.mask.ctor] generator constructor -------------------------------
+ template <__simd_generator_invokable<bool, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Fp&& __gen)
+ : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
+ return __gen(__simd_size_c<__i + _N0>);
+ })
+ {}
+
+ // [simd.mask.ctor] bitset constructor ----------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(const same_as<bitset<_S_size>> auto& __b) noexcept // LWG 4382.
+ : _M_data0(__bitset_split<_N0>(__b)._M_lo), _M_data1(__bitset_split<_N0>(__b)._M_hi)
+ {}
+
+ // [simd.mask.ctor] uint constructor ------------------------------------
+ template <unsigned_integral _Tp>
+ requires (!same_as<_Tp, bool>) // LWG 4382.
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Tp __val) noexcept
+ : _M_data0(static_cast<_Bitmask<_N0>>(__val)),
+ _M_data1(sizeof(_Tp) * __CHAR_BIT__ > _N0
+ ? static_cast<_Bitmask<_N1>>(__val >> _N0) : _Bitmask<_N1>())
+ {}
+
+ // [simd.mask.subscr] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
+ if (__is_const_known(__i))
+ return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
+ else if constexpr (_M_data1._S_has_bool_member)
+ // in some cases the last element can be 'bool' instead of bit-/vector-mask;
+ // e.g. mask<short, 17> is {mask<short, 16>, mask<short, 1>}, where the latter uses
+ // _ScalarAbi<1>, which is stored as 'bool'
+ return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
+ else if constexpr (abi_type::_S_is_bitmask)
+ {
+ using _AliasingByte [[__gnu__::__may_alias__]] = unsigned char;
+ return bool((reinterpret_cast<const _AliasingByte*>(this)
+ [__i / __CHAR_BIT__] >> (__i % __CHAR_BIT__)) & 1);
+ }
+ else
+ {
+ using _AliasingInt [[__gnu__::__may_alias__]] = __integer_from<_Bytes>;
+ return reinterpret_cast<const _AliasingInt*>(this)[__i] != 0;
+ }
+ }
+
+ // [simd.mask.unary] ----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask
+ operator!() const noexcept
+ { return _S_init(!_M_data0, !_M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator+() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(+_M_data0, +_M_data1); }
+
+ constexpr _VecType
+ operator+() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator-() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(-_M_data0, -_M_data1); }
+
+ constexpr _VecType
+ operator-() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator~() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(~_M_data0, ~_M_data1); }
+
+ constexpr _VecType
+ operator~() const noexcept = delete;
+
+ // [simd.mask.conv] -----------------------------------------------------
+ template <typename _Up, typename _UAbi>
+ requires (_UAbi::_S_size == _S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(sizeof(_Up) != _Bytes)
+ operator basic_vec<_Up, _UAbi>() const noexcept
+ {
+ using _Rp = basic_vec<_Up, _UAbi>;
+ return _Rp::_S_init(static_cast<_Rp::_DataType0>(_M_data0),
+ static_cast<_Rp::_DataType1>(_M_data1));
+ }
+
+ using _Base::operator basic_vec;
+
+ // [simd.mask.namedconv] ------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bitset<_S_size>
+ to_bitset() const noexcept
+ {
+ if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
+ return to_ullong();
+ else
+ {
+ static_assert(_N0 % numeric_limits<unsigned long long>::digits == 0);
+ struct _Tmp
+ {
+ bitset<_N0> _M_lo;
+ bitset<_N1> _M_hi;
+ } __tmp = {_M_data0.to_bitset(), _M_data1.to_bitset()};
+ return __builtin_bit_cast(bitset<_S_size>, __tmp);
+ }
+ }
+
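+ // Collect the mask bits into an unsigned integer with element 0 at bit _Offset. If the low
+ // half alone needs 64 or more bits, return a __trivial_pair of both halves' bits instead.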
+ template <int _Offset = 0, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_to_uint() const
+ {
+ constexpr int _N0x = _N0;
+ if constexpr (_N0x >= numeric_limits<unsigned long long>::digits)
+ {
+ static_assert(_Offset == 0);
+ return __trivial_pair {
+ _M_data0.template _M_to_uint<0>(),
+ _M_data1.template _M_to_uint<0>()
+ };
+ }
+ else
+ {
+#if _GLIBCXX_X86
+ if constexpr (_Bytes == 2 && !_Traits._M_have_bmi2() && _Ap::_S_nreg == 2
+ && !_S_use_bitmask)
+ return __similar_mask<char, _S_size, _Ap>(*this).template _M_to_uint<_Offset>();
+#endif
+ auto __uint = _M_data1.template _M_to_uint<_N0x + _Offset>();
+ __uint |= _M_data0.template _M_to_uint<_Offset>();
+ return __uint;
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr unsigned long long
+ to_ullong() const
+ {
+ if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
+ return _M_to_uint();
+ else
+ {
+ __glibcxx_simd_precondition(_M_data1.to_ullong() == 0,
+ "to_ullong called on mask with 'true' elements at indices"
+ "higher than representable in a ullong");
+ return _M_data0.to_ullong();
+ }
+ }
+
+ // [simd.mask.binary] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 && __y._M_data0, __x._M_data1 && __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 || __y._M_data0, __x._M_data1 || __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 & __y._M_data0, __x._M_data1 & __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 | __y._M_data0, __x._M_data1 | __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 ^ __y._M_data0, __x._M_data1 ^ __y._M_data1); }
+
+ // [simd.mask.cassign] --------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 &= __y._M_data0;
+ __x._M_data1 &= __y._M_data1;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 |= __y._M_data0;
+ __x._M_data1 |= __y._M_data1;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 ^= __y._M_data0;
+ __x._M_data1 ^= __y._M_data1;
+ return __x;
+ }
+
+ // [simd.mask.comparison] -----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !(__x ^ __y); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x ^ __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x || !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x || __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x && !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x && __y; }
+
+ // [simd.mask.cond] -----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+ {
+ return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
+ __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+ {
+ if (__t == __f)
+ return basic_mask(__t);
+ else
+ return __t ? __k : !__k;
+ }
+
+ template <__vectorizable _T0, same_as<_T0> _T1>
+ requires (sizeof(_T0) == _Bytes)
+ [[__gnu__::__always_inline__]]
+ friend constexpr vec<_T0, _S_size>
+ __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+ {
+ using _Vp = vec<_T0, _S_size>;
+ if constexpr (!is_same_v<basic_mask, typename _Vp::mask_type>)
+ return __select_impl(static_cast<_Vp::mask_type>(__k), __t, __f);
+ else
+ return _Vp::_S_init(__select_impl(__k._M_data0, __t, __f),
+ __select_impl(__k._M_data1, __t, __f));
+ }
+
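+ // [simd.mask.reductions] implementation --------------------------------
+ // With equally sized halves, combine them first so only one reduction is needed;
+ // otherwise reduce each half separately.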
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_all_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 && _M_data1)._M_all_of();
+ else
+ return _M_data0._M_all_of() && _M_data1._M_all_of();
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_any_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 || _M_data1)._M_any_of();
+ else
+ return _M_data0._M_any_of() || _M_data1._M_any_of();
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_none_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 || _M_data1)._M_none_of();
+ else
+ return _M_data0._M_none_of() && _M_data1._M_none_of();
+ }
+
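+ // Index of the first true element: use the combined bit representation while the mask fits
+ // into 64 bits, otherwise recurse into the half that contains the first true element.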
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_min_index() const
+ {
+ if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
+ {
+ const auto __bits = _M_to_uint();
+ __glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index.");
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __countr_zero(__bits);
+ }
+ else if (_M_data0._M_none_of())
+ return _M_data1._M_reduce_min_index() + _N0;
+ else
+ return _M_data0._M_reduce_min_index();
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_max_index() const
+ {
+ if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
+ {
+ const auto __bits = _M_to_uint();
+ __glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index.");
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __highest_bit(__bits);
+ }
+ else if (_M_data1._M_none_of())
+ return _M_data0._M_reduce_max_index();
+ else
+ return _M_data1._M_reduce_max_index() + _N0;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr bool
+ __is_const_known(const basic_mask& __x)
+ { return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); }
+ };
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_MASK_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_MASK_REDUCTIONS_H
+#define _GLIBCXX_SIMD_MASK_REDUCTIONS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_mask.h"
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// [simd.mask.reductions] -----------------------------------------------------
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ all_of(const basic_mask<_Bytes, _Ap>& __k) noexcept
+ { return __k._M_all_of(); }
+
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ any_of(const basic_mask<_Bytes, _Ap>& __k) noexcept
+ { return __k._M_any_of(); }
+
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ none_of(const basic_mask<_Bytes, _Ap>& __k) noexcept
+ { return __k._M_none_of(); }
+
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ reduce_count(const basic_mask<_Bytes, _Ap>& __k) noexcept
+ {
+ if constexpr (_Ap::_S_size == 1)
+ return +__k[0];
+ else if constexpr (_Ap::_S_is_vecmask)
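+ // unary minus on the mask yields -1 per true element; the sum is therefore -count and
+ // the leading minus restores the positive count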
+ return -reduce(-__k);
+ else
+ return __k._M_reduce_count();
+ }
+
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ reduce_min_index(const basic_mask<_Bytes, _Ap>& __k)
+ { return __k._M_reduce_min_index(); }
+
+ template <size_t _Bytes, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ reduce_max_index(const basic_mask<_Bytes, _Ap>& __k)
+ { return __k._M_reduce_max_index(); }
+
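+ // overloads for plain bool arguments, treated like a mask with a single element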
+ constexpr bool
+ all_of(same_as<bool> auto __x) noexcept
+ { return __x; }
+
+ constexpr bool
+ any_of(same_as<bool> auto __x) noexcept
+ { return __x; }
+
+ constexpr bool
+ none_of(same_as<bool> auto __x) noexcept
+ { return !__x; }
+
+ constexpr __simd_size_type
+ reduce_count(same_as<bool> auto __x) noexcept
+ { return __x; }
+
+ constexpr __simd_size_type
+ reduce_min_index(same_as<bool> auto __x)
+ { return 0; }
+
+ constexpr __simd_size_type
+ reduce_max_index(same_as<bool> auto __x)
+ { return 0; }
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_MASK_REDUCTIONS_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_REDUCTIONS_H
+#define _GLIBCXX_SIMD_REDUCTIONS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_vec.h"
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// [simd.reductions] ----------------------------------------------------------
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template <typename _Tp, typename _Ap, __reduction_binary_operation<_Tp> _BinaryOperation = plus<>>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce(const basic_vec<_Tp, _Ap>& __x, _BinaryOperation __binary_op = {})
+ { return __x._M_reduce(__binary_op); }
+
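+ // masked reduction: elements where __mask is false are replaced by __identity_element
+ // (by default the identity element of _BinaryOperation) before reducing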
+ template <typename _Tp, typename _Ap, __reduction_binary_operation<_Tp> _BinaryOperation = plus<>>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce(const basic_vec<_Tp, _Ap>& __x, const typename basic_vec<_Tp, _Ap>::mask_type& __mask,
+ _BinaryOperation __binary_op = {}, type_identity_t<_Tp> __identity_element
+ = __default_identity_element<_Tp, _BinaryOperation>())
+ { return reduce(__select_impl(__mask, __x, __identity_element), __binary_op); }
+
+ template <totally_ordered _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce_min(const basic_vec<_Tp, _Ap>& __x) noexcept
+ {
+ return reduce(__x, []<typename _UV>(const _UV& __a, const _UV& __b) {
+ return __select_impl(__a < __b, __a, __b);
+ });
+ }
+
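+ // masked minimum: masked-out elements are replaced by numeric_limits<_Tp>::max() so they
+ // cannot become the result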
+ template <totally_ordered _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce_min(const basic_vec<_Tp, _Ap>& __x,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask) noexcept
+ {
+ return reduce(__select_impl(__mask, __x, numeric_limits<_Tp>::max()),
+ []<typename _UV>(const _UV& __a, const _UV& __b) {
+ return __select_impl(__a < __b, __a, __b);
+ });
+ }
+
+ template <totally_ordered _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce_max(const basic_vec<_Tp, _Ap>& __x) noexcept
+ {
+ return reduce(__x, []<typename _UV>(const _UV& __a, const _UV& __b) {
+ return __select_impl(__a < __b, __b, __a);
+ });
+ }
+
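+ // masked maximum: masked-out elements are replaced by numeric_limits<_Tp>::lowest() so they
+ // cannot become the result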
+ template <totally_ordered _Tp, typename _Ap>
+ [[__gnu__::__always_inline__]]
+ constexpr _Tp
+ reduce_max(const basic_vec<_Tp, _Ap>& __x,
+ const typename basic_vec<_Tp, _Ap>::mask_type& __mask) noexcept
+ {
+ return reduce(__select_impl(__mask, __x, numeric_limits<_Tp>::lowest()),
+ []<typename _UV>(const _UV& __a, const _UV& __b) {
+ return __select_impl(__a < __b, __b, __a);
+ });
+ }
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_REDUCTIONS_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_VEC_H
+#define _GLIBCXX_SIMD_VEC_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_mask.h"
+#include "simd_flags.h"
+
+#include <bits/utility.h>
+#include <bits/stl_function.h>
+#include <cmath>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ // disabled basic_vec
+ template <typename _Tp, typename _Ap>
+ class basic_vec
+ {
+ public:
+ using value_type = _Tp;
+
+ using abi_type = _Ap;
+
+ using mask_type = basic_mask<0, void>; // disabled
+
+#define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \
+ "of template arguments to basic_vec."
+
+ basic_vec() = delete(_GLIBCXX_DELETE_SIMD);
+
+ ~basic_vec() = delete(_GLIBCXX_DELETE_SIMD);
+
+ basic_vec(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);
+
+ basic_vec& operator=(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);
+
+#undef _GLIBCXX_DELETE_SIMD
+ };
+
+ template <typename _Tp, typename _Ap>
+ class _VecBase
+ {
+ using _Vp = basic_vec<_Tp, _Ap>;
+
+ public:
+ using value_type = _Tp;
+
+ using abi_type = _Ap;
+
+ using mask_type = basic_mask<sizeof(_Tp), abi_type>;
+
+ using iterator = __iterator<_Vp>;
+
+ using const_iterator = __iterator<const _Vp>;
+
+ constexpr iterator
+ begin() noexcept
+ { return {static_cast<_Vp&>(*this), 0}; }
+
+ constexpr const_iterator
+ begin() const noexcept
+ { return cbegin(); }
+
+ constexpr const_iterator
+ cbegin() const noexcept
+ { return {static_cast<const _Vp&>(*this), 0}; }
+
+ constexpr default_sentinel_t
+ end() const noexcept
+ { return {}; }
+
+ constexpr default_sentinel_t
+ cend() const noexcept
+ { return {}; }
+
+ static constexpr auto size = __simd_size_c<_Ap::_S_size>;
+
+ _VecBase() = default;
+
+ // LWG issue from 2026-03-04 / P4042R0
+ template <typename _Up, typename _UAbi>
+ requires (_Ap::_S_size != _UAbi::_S_size)
+ _VecBase(const basic_vec<_Up, _UAbi>&) = delete("size mismatch");
+
+ template <typename _Up, typename _UAbi>
+ requires (_Ap::_S_size == _UAbi::_S_size) && (!__explicitly_convertible_to<_Up, _Tp>)
+ explicit
+ _VecBase(const basic_vec<_Up, _UAbi>&)
+ = delete("the value types are not convertible");
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator+(const _Vp& __x, const _Vp& __y) noexcept
+ {
+ _Vp __r = __x;
+ __r += __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator-(const _Vp& __x, const _Vp& __y) noexcept
+ {
+ _Vp __r = __x;
+ __r -= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator*(const _Vp& __x, const _Vp& __y) noexcept
+ {
+ _Vp __r = __x;
+ __r *= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator/(const _Vp& __x, const _Vp& __y) noexcept
+ {
+ _Vp __r = __x;
+ __r /= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator%(const _Vp& __x, const _Vp& __y) noexcept
+ requires requires (_Tp __a) { __a % __a; }
+ {
+ _Vp __r = __x;
+ __r %= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator&(const _Vp& __x, const _Vp& __y) noexcept
+ requires requires (_Tp __a) { __a & __a; }
+ {
+ _Vp __r = __x;
+ __r &= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator|(const _Vp& __x, const _Vp& __y) noexcept
+ requires requires (_Tp __a) { __a | __a; }
+ {
+ _Vp __r = __x;
+ __r |= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator^(const _Vp& __x, const _Vp& __y) noexcept
+ requires requires (_Tp __a) { __a ^ __a; }
+ {
+ _Vp __r = __x;
+ __r ^= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator<<(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires (_Tp __a) { __a << __a; }
+ {
+ _Vp __r = __x;
+ __r <<= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator<<(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires (_Tp __a, __simd_size_type __b) { __a << __b; }
+ {
+ _Vp __r = __x;
+ __r <<= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator>>(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires (_Tp __a) { __a >> __a; }
+ {
+ _Vp __r = __x;
+ __r >>= __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr _Vp
+ operator>>(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires (_Tp __a, __simd_size_type __b) { __a >> __b; }
+ {
+ _Vp __r = __x;
+ __r >>= __y;
+ return __r;
+ }
+ };
+
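+ // tag type selecting the internal load-from-memory constructor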
+ struct _LoadCtorTag
+ {};
+
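+ // number of bits in _Tp after integral promotion; shift amounts must be smaller than this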
+ template <integral _Tp>
+ inline constexpr _Tp __max_shift
+ = (sizeof(_Tp) < sizeof(int) ? sizeof(int) : sizeof(_Tp)) * __CHAR_BIT__;
+
+ template <__vectorizable _Tp, __abi_tag _Ap>
+ requires (_Ap::_S_nreg == 1)
+ class basic_vec<_Tp, _Ap>
+ : public _VecBase<_Tp, _Ap>
+ {
+ template <typename, typename>
+ friend class basic_vec;
+
+ template <size_t, typename>
+ friend class basic_mask;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
+ static constexpr int _S_full_size = __bit_ceil(unsigned(_S_size));
+
+ static constexpr bool _S_is_scalar = _S_size == 1;
+
+ static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask && !_S_is_scalar;
+
+ using _DataType = typename _Ap::template _DataType<_Tp>;
+
+ /** @internal
+ * @brief Underlying vector data storage.
+ *
+ * This member holds the vector object using a GNU vector type or a platform-specific vector
+ * type determined by the ABI tag. For size 1 vectors, this is a single value (_Tp).
+ */
+ _DataType _M_data;
+
+ static constexpr bool _S_is_partial = sizeof(_M_data) > sizeof(_Tp) * _S_size;
+
+ using __canon_value_type = __canonical_vec_type_t<_Tp>;
+
+ public:
+ using value_type = _Tp;
+
+ using mask_type = _VecBase<_Tp, _Ap>::mask_type;
+
+ // internal but public API ----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_init(_DataType __x)
+ {
+ basic_vec __r;
+ __r._M_data = __x;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _DataType&
+ _M_get() const
+ { return _M_data; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr bool
+ __is_const_known(const basic_vec& __x)
+ { return __builtin_constant_p(__x._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
+ {
+ if constexpr (_S_is_scalar)
+ return __vec_builtin_type<__canon_value_type, 1>{_M_data};
+ else
+ return _M_data;
+ }
+
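+ // Permute elements of __x according to the constexpr index map __idxmap, producing _Size
+ // elements starting at _Offset; __idxmap may also return simd::zero_element or
+ // simd::uninit_element.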
+ template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
+ {
+ using _Xp = basic_vec<value_type, _A0>;
+ basic_vec __r;
+ if constexpr (_S_is_scalar)
+ {
+ constexpr __simd_size_type __j = [&] consteval {
+ if constexpr (__index_permutation_function_sized<_Fp>)
+ return __idxmap(_Offset, _Size);
+ else
+ return __idxmap(_Offset);
+ }();
+ if constexpr (__j == simd::zero_element || __j == simd::uninit_element)
+ return basic_vec();
+ else
+ static_assert(__j >= 0 && __j < _Xp::_S_size);
+ __r._M_data = __x[__j];
+ }
+ else
+ {
+ auto __idxmap2 = [=](auto __i) consteval {
+ if constexpr (int(__i + _Offset) >= _Size) // _S_full_size > _Size
+ return __simd_size_c<simd::uninit_element>;
+ else if constexpr (__index_permutation_function_sized<_Fp>)
+ return __simd_size_c<__idxmap(__i + _Offset, _Size)>;
+ else
+ return __simd_size_c<__idxmap(__i + _Offset)>;
+ };
+ constexpr auto __adj_idx = [](auto __i) {
+ constexpr int __j = __i;
+ if constexpr (__j == simd::zero_element)
+ return __simd_size_c<__bit_ceil(unsigned(_Xp::_S_size))>;
+ else if constexpr (__j == simd::uninit_element)
+ return __simd_size_c<-1>;
+ else
+ {
+ static_assert(__j >= 0 && __j < _Xp::_S_size);
+ return __simd_size_c<__j>;
+ }
+ };
+ constexpr auto [...__is0] = _IotaArray<_S_size>;
+ constexpr bool __needs_zero_element
+ = ((__idxmap2(__simd_size_c<__is0>).value == simd::zero_element) || ...);
+ constexpr auto [...__is_full] = _IotaArray<_S_full_size>;
+ if constexpr (_A0::_S_nreg == 2 && !__needs_zero_element)
+ {
+ __r._M_data = __builtin_shufflevector(
+ __x._M_data0._M_data, __x._M_data1._M_data,
+ __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
+ }
+ else
+ {
+ __r._M_data = __builtin_shufflevector(
+ __x._M_concat_data(), decltype(__x._M_concat_data())(),
+ __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
+ }
+ }
+ return __r;
+ }
+
+ template <typename _Vp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Vp::_S_size;
+ constexpr int __rem = _S_size % _Vp::_S_size;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ if constexpr (__rem == 0)
+ return array<_Vp, __n> {__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...};
+ else
+ {
+ using _Rest = resize_t<__rem, _Vp>;
+ return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...,
+ __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, *this));
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_concat(const basic_vec& __x0) noexcept
+ { return __x0; }
+
+ template <typename... _As>
+ requires (sizeof...(_As) > 1)
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
+ {
+ static_assert(_S_size == (_As::_S_size + ...));
+ return __extract_simd_at<basic_vec>(cw<0>, __xs...);
+ }
+
+ /** @internal
+ * Shifts elements to the front by @p _Shift positions (or to the back for negative @p
+ * _Shift).
+ *
+ * This function moves elements towards lower indices (front of the vector).
+ * Elements that would shift beyond the vector bounds are replaced with zero. Negative shift
+ * values shift in the opposite direction.
+ *
+ * @warning The naming can be confusing due to little-endian byte order:
+ * - Despite the name "shifted_to_front", the underlying hardware instruction
+ * shifts bits to the right (psrl...)
+ * - The function name refers to element indices, not bit positions
+ *
+ * @tparam _Shift Number of positions to shift elements towards the front.
+ * Must be -size() < _Shift < size().
+ *
+ * @return A new vector with elements shifted to front or back.
+ *
+ * Example:
+ * @code
+ * __iota<vec<int, 4>>._M_elements_shifted_to_front<2>(); // {2, 3, 0, 0}
+ * __iota<vec<int, 4>>._M_elements_shifted_to_front<-2>(); // {0, 0, 0, 1}
+ * @endcode
+ */
+ template <int _Shift, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ _M_elements_shifted_to_front() const
+ {
+ static_assert(_Shift < _S_size && -_Shift < _S_size);
+ if constexpr (_Shift == 0)
+ return *this;
+#ifdef __SSE2__
+ else if (!__is_const_known(*this))
+ {
+ if constexpr (sizeof(_M_data) == 16 && _Shift > 0)
+ return reinterpret_cast<_DataType>(
+ __builtin_ia32_psrldqi128(__vec_bit_cast<long long>(_M_data),
+ _Shift * sizeof(value_type) * 8));
+ else if constexpr (sizeof(_M_data) == 16 && _Shift < 0)
+ return reinterpret_cast<_DataType>(
+ __builtin_ia32_pslldqi128(__vec_bit_cast<long long>(_M_data),
+ -_Shift * sizeof(value_type) * 8));
+ else if constexpr (sizeof(_M_data) < 16)
+ {
+ auto __x = reinterpret_cast<__vec_builtin_type_bytes<long long, 16>>(
+ __vec_zero_pad_to_16(_M_data));
+ if constexpr (_Shift > 0)
+ __x = __builtin_ia32_psrldqi128(__x, _Shift * sizeof(value_type) * 8);
+ else
+ __x = __builtin_ia32_pslldqi128(__x, -_Shift * sizeof(value_type) * 8);
+ return _VecOps<_DataType>::_S_extract(__vec_bit_cast<__canon_value_type>(__x));
+ }
+ }
+#endif
+ return _S_static_permute(*this, [](int __i) consteval {
+ int __off = __i + _Shift;
+ return __off >= _S_size || __off < 0 ? zero_element : __off;
+ });
+ }
+
+ /** @internal
+ * @brief Set padding elements to @p __id; add more padding elements if necessary.
+ *
+ * @note This function can rearrange the element order since the result is only used for
+ * reductions.
+ */
+ template <typename _Vp, __canon_value_type __id>
+ [[__gnu__::__always_inline__]]
+ constexpr _Vp
+ _M_pad_to_T_with_value() const noexcept
+ {
+ static_assert(!_Vp::_S_is_partial);
+ static_assert(_Ap::_S_nreg == 1);
+ if constexpr (sizeof(_Vp) == 32)
+ { // when we need to reduce from a 512-bit register
+ static_assert(sizeof(_M_data) == 32);
+ constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
+ return __select_impl(__k, _Vp::_S_init(_M_data), __id);
+ }
+ else
+ {
+ static_assert(sizeof(_Vp) <= 16); // => max. 7 Bytes need to be zeroed
+ static_assert(sizeof(_M_data) <= sizeof(_Vp));
+ _Vp __v1 = __vec_zero_pad_to<sizeof(_Vp)>(_M_data);
+ if constexpr (__id == 0 && _S_is_partial)
+ // cheapest solution: shift values to the back while shifting in zeros
+ // This is valid because we shift out padding elements and use all elements in a
+ // subsequent reduction.
+ __v1 = __v1.template _M_elements_shifted_to_front<-(_Vp::_S_size - _S_size)>();
+ else if constexpr (_Vp::_S_size - _S_size == 1)
+ // if a single element needs to be changed, use an insert instruction
+ __vec_set(__v1._M_data, _Vp::_S_size - 1, __id);
+ else if constexpr (__has_single_bit(unsigned(_Vp::_S_size - _S_size)))
+ { // if 2^n elements need to be changed, use a single insert instruction
+ constexpr int __n = _Vp::_S_size - _S_size;
+ using _Ip = __integer_from<__n * sizeof(__canon_value_type)>;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ constexpr __canon_value_type __idn[__n] = {((void)__is, __id)...};
+ auto __vn = __vec_bit_cast<_Ip>(__v1._M_data);
+ __vec_set(__vn, _Vp::_S_size / __n - 1, __builtin_bit_cast(_Ip, __idn));
+ __v1._M_data = reinterpret_cast<typename _Vp::_DataType>(__vn);
+ }
+ else if constexpr (__id != 0 && !_S_is_partial)
+ { // if __vec_zero_pad_to added zeros in all the places where we need __id, a
+ // bitwise or is sufficient (needs a vector constant for the __id vector, which
+ // isn't optimal)
+ constexpr _Vp __idn([](int __i) {
+ return __i >= _S_size ? __id : __canon_value_type();
+ });
+ __v1._M_data = __vec_or(__v1._M_data, __idn._M_data);
+ }
+ else if constexpr (__id != 0 || _S_is_partial)
+ { // fallback
+ constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
+ __v1 = __select_impl(__k, __v1, __id);
+ }
+ return __v1;
+ }
+ }
+
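+ // Split into two equal-sized halves and combine them with __binary_op (requires a
+ // power-of-two element count).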
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_reduce_to_half(auto __binary_op) const
+ {
+ static_assert(__has_single_bit(unsigned(_S_size)));
+ auto [__a, __b] = chunk<_S_size / 2>(*this);
+ return __binary_op(__a, __b);
+ }
+
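+ // Continue the reduction of *this together with the trailing chunk __rest, which may be
+ // smaller or larger than *this.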
+ template <typename _Rest, typename _BinaryOp>
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ _M_reduce_tail(const _Rest& __rest, _BinaryOp __binary_op) const
+ {
+ if constexpr (_S_is_scalar)
+ return __binary_op(*this, __rest)._M_data;
+ else if constexpr (_Rest::_S_size == _S_size)
+ return __binary_op(*this, __rest)._M_reduce(__binary_op);
+ else if constexpr (_Rest::_S_size > _S_size)
+ {
+ auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
+ return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
+ }
+ else if constexpr (_Rest::_S_size == 1)
+ return __binary_op(_Rest(_M_reduce(__binary_op)), __rest)[0];
+ else if constexpr (sizeof(_M_data) <= 16
+ && requires { __default_identity_element<__canon_value_type, _BinaryOp>(); })
+ { // extend __rest with identity element for more parallelism
+ constexpr __canon_value_type __id
+ = __default_identity_element<__canon_value_type, _BinaryOp>();
+ return __binary_op(_M_data, __rest.template _M_pad_to_T_with_value<basic_vec, __id>())
+ ._M_reduce(__binary_op);
+ }
+ else
+ return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
+ }
+
+ /** @internal
+ * @brief Reduction over @p __binary_op of all (non-padding) elements.
+ *
+ * @note The implementation assumes it is most efficient to first reduce to one 128-bit SIMD
+ * register and then shuffle elements while sticking to 128-bit registers.
+ */
+ template <typename _BinaryOp, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ _M_reduce(_BinaryOp __binary_op) const
+ {
+ constexpr bool __have_id_elem
+ = requires { __default_identity_element<__canon_value_type, _BinaryOp>(); };
+ if constexpr (_S_size == 1)
+ return operator[](0);
+ else if constexpr (_Traits.template _M_eval_as_f32<value_type>()
+ && (is_same_v<_BinaryOp, plus<>>
+ || is_same_v<_BinaryOp, multiplies<>>))
+ return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
+#ifdef __SSE2__
+ else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
+ && is_same_v<decltype(__binary_op), multiplies<>>)
+ {
+ // convert to unsigned short because of missing 8-bit mul instruction
+ // we don't need to preserve the order of elements
+ //
+ // The left columns under Latency and Throughput show the bit-cast to ushort with shift
+ // by 8; the right columns use the alternative in the else branch.
+ // Benchmark on Intel Ultra 7 165U (AVX2):
+ // TYPE       Latency [cycles/call]   Throughput [cycles/call]
+ // schar,  2    9.11  7.73              3.17  3.21
+ // schar,  4   31.6  34.9               5.11  6.97
+ // schar,  8   35.7  41.5               7.77  7.17
+ // schar, 16   36.7  44.1               6.66  8.96
+ // schar, 32   42.2  61.1               8.82 10.1
+ if constexpr (!_S_is_partial)
+ { // If all elements participate in the reduction we can take this shortcut
+ using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
+ auto __a = __builtin_bit_cast(_V16, *this);
+ return __binary_op(__a, __a >> 8)._M_reduce(__binary_op);
+ }
+ else
+ {
+ using _V16 = rebind_t<unsigned short, basic_vec>;
+ return _V16(*this)._M_reduce(__binary_op);
+ }
+ }
+#endif
+ else if constexpr (__has_single_bit(unsigned(_S_size)))
+ {
+ if constexpr (sizeof(_M_data) > 16)
+ return _M_reduce_to_half(__binary_op)._M_reduce(__binary_op);
+ else if constexpr (_S_size == 2)
+ return _M_reduce_to_half(__binary_op)[0];
+ else
+ {
+ static_assert(_S_size <= 16);
+ auto __x = *this;
+#ifdef __SSE2__
+ if constexpr (sizeof(_M_data) <= 16 && is_integral_v<value_type>)
+ {
+ if constexpr (_S_size > 8)
+ __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<8>());
+ if constexpr (_S_size > 4)
+ __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<4>());
+ if constexpr (_S_size > 2)
+ __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<2>());
+ // We could also call __binary_op with vec<T, 1> arguments. However,
+ // micro-benchmarking on Intel Ultra 7 165U showed this to be more efficient:
+ return __binary_op(__x, __x.template _M_elements_shifted_to_front<1>())[0];
+ }
+#endif
+ if constexpr (_S_size > 8)
+ __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<8>()));
+ if constexpr (_S_size > 4)
+ __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<4>()));
+#ifdef __SSE2__
+ // avoid pshufb by "promoting" to int
+ if constexpr (is_integral_v<value_type> && sizeof(value_type) <= 1)
+ return value_type(resize_t<4, rebind_t<int, basic_vec>>(chunk<4>(__x)[0])
+ ._M_reduce(__binary_op));
+#endif
+ if constexpr (_S_size > 2)
+ __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<2>()));
+ if constexpr (is_integral_v<value_type> && sizeof(value_type) == 2)
+ return __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<1>()))[0];
+ else
+ return __binary_op(vec<value_type, 1>(__x[0]), vec<value_type, 1>(__x[1]))[0];
+ }
+ }
+ else if constexpr (sizeof(_M_data) == 32)
+ {
+ const auto [__lo, __hi] = chunk<__bit_floor(unsigned(_S_size))>(*this);
+ return __lo._M_reduce_tail(__hi, __binary_op);
+ }
+ else if constexpr (sizeof(_M_data) == 64)
+ {
+ // e.g. _S_size = 16 + 16 + 15 (vec<char, 47>)
+ // -> 8 + 8 + 7 -> 4 + 4 + 3 -> 2 + 2 + 1 -> 1
+ auto __chunked = chunk<__bit_floor(unsigned(_S_size)) / 2>(*this);
+ using _Cp = decltype(__chunked);
+ if constexpr (tuple_size_v<_Cp> == 4)
+ {
+ const auto& [__a, __b, __c, __rest] = __chunked;
+ constexpr bool __amd_cpu = _Traits._M_have_sse4a();
+ if constexpr (__have_id_elem && __rest._S_size > 1 && __amd_cpu)
+ { // do one 256-bit op -> one 128-bit op
+ // 4 cycles on Zen4/5 until _M_reduce (short, 26, plus<>)
+ // 9 cycles on Skylake-AVX512 until _M_reduce
+ // 9 cycles on Zen4/5 until _M_reduce (short, 27, multiplies<>)
+ // 17 cycles on Skylake-AVX512 until _M_reduce (short, 27, multiplies<>)
+ const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
+ using _Vp = remove_cvref_t<decltype(__a)>;
+ constexpr __canon_value_type __id
+ = __default_identity_element<__canon_value_type, _BinaryOp>();
+ const _Vp __b = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
+ return __binary_op(__a, __b)._M_reduce(__binary_op);
+ }
+ else if constexpr (__have_id_elem && __rest._S_size > 1)
+ { // do two 128-bit ops -> one 128-bit op
+ // 5 cycles on Zen4/5 until _M_reduce (short, 26, plus<>)
+ // 7 cycles on Skylake-AVX512 until _M_reduce (short, 26, plus<>)
+ // 9 cycles on Zen4/5 until _M_reduce (short, 27, multiplies<>)
+ // 16 cycles on Skylake-AVX512 until _M_reduce (short, 27, multiplies<>)
+ using _Vp = remove_cvref_t<decltype(__a)>;
+ constexpr __canon_value_type __id
+ = __default_identity_element<__canon_value_type, _BinaryOp>();
+ const _Vp __d = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
+ return __binary_op(__binary_op(__a, __b), __binary_op(__c, __d))
+ ._M_reduce(__binary_op);
+ }
+ else
+ return __binary_op(__binary_op(__a, __b), __c)
+ ._M_reduce_tail(__rest, __binary_op);
+ }
+ else if constexpr (tuple_size_v<_Cp> == 3)
+ {
+ const auto& [__a, __b, __rest] = __chunked;
+ return __binary_op(__a, __b)._M_reduce_tail(__rest, __binary_op);
+ }
+ else
+ static_assert(false);
+ }
+ else if constexpr (__have_id_elem)
+ {
+ constexpr __canon_value_type __id
+ = __default_identity_element<__canon_value_type, _BinaryOp>();
+ using _Vp = resize_t<__bit_ceil(unsigned(_S_size)), basic_vec>;
+ return _M_pad_to_T_with_value<_Vp, __id>()._M_reduce(__binary_op);
+ }
+ else
+ {
+ const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
+ return __a._M_reduce_tail(__rest, __binary_op);
+ }
+ }
+
+ // [simd.math] ----------------------------------------------------------
+ //
+ // ISO/IEC 60559 on the classification operations (5.7.2 General Operations):
+ // "They are never exceptional, even for signaling NaNs."
+ //
+ template <_OptTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isnan() const requires is_floating_point_v<value_type>
+ {
+ if constexpr (_Traits._M_finite_math_only())
+ return mask_type(false);
+ else if constexpr (_S_is_scalar)
+ return mask_type(std::isnan(_M_data));
+ else if constexpr (_S_use_bitmask)
+ return _M_isunordered(*this);
+ else if constexpr (!_Traits._M_support_snan())
+ return !(*this == *this);
+ else if (__is_const_known(_M_data))
+ return mask_type([&](int __i) { return std::isnan(_M_data[__i]); });
+ else
+ {
+ // 60559: NaN is represented as Inf + non-zero mantissa bits
+ using _Ip = __integer_from<sizeof(value_type)>;
+ return __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity())
+ < __builtin_bit_cast(rebind_t<_Ip, basic_vec>, _M_fabs());
+ }
+ }
+
+ template <_TargetTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isinf() const requires is_floating_point_v<value_type>
+ {
+ if constexpr (_Traits._M_finite_math_only())
+ return mask_type(false);
+ else if constexpr (_S_is_scalar)
+ return mask_type(std::isinf(_M_data));
+ else if (__is_const_known(_M_data))
+ return mask_type([&](int __i) { return std::isinf(_M_data[__i]); });
+#ifdef _GLIBCXX_X86
+ else if constexpr (_S_use_bitmask)
+ return mask_type::_S_init(__x86_bitmask_isinf(_M_data));
+ else if constexpr (_Traits._M_have_avx512dq())
+ return __x86_bit_to_vecmask<typename mask_type::_DataType>(
+ __x86_bitmask_isinf(_M_data));
+#endif
+ else
+ {
+ using _Ip = __integer_from<sizeof(value_type)>;
+ return __vec_bit_cast<_Ip>(_M_fabs()._M_data)
+ == __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity());
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ _M_abs() const requires signed_integral<value_type>
+ { return _M_data < 0 ? -_M_data : _M_data; }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ _M_fabs() const requires floating_point<value_type>
+ {
+ if constexpr (_S_is_scalar)
+ return std::fabs(_M_data);
+ else
+ return __vec_and(__vec_not(_S_signmask<_DataType>), _M_data);
+ }
+
+ template <_TargetTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
+ {
+ if constexpr (_Traits._M_finite_math_only())
+ return mask_type(false);
+ else if constexpr (_S_is_scalar)
+ return mask_type(std::isunordered(_M_data, __y._M_data));
+#ifdef _GLIBCXX_X86
+ else if constexpr (_S_use_bitmask)
+ return _M_bitmask_cmp<_X86Cmp::_Unord>(__y._M_data);
+#endif
+ else
+ return mask_type([&](int __i) {
+ return std::isunordered(_M_data[__i], __y._M_data[__i]);
+ });
+ }
+
+ /** @internal
+ * Implementation of @ref partial_load.
+ *
+ * @param __mem A pointer to an array of @p __n values. Can be complex or real.
+ * @param __n Read no more than @p __n values from memory. However, depending on the
+ * alignment of @p __mem, benign out-of-bounds reads may occur.
+ */
+ template <typename _Up, _ArchTraits _Traits = {}>
+ static inline basic_vec
+ _S_partial_load(const _Up* __mem, size_t __n)
+ {
+ if constexpr (_S_is_scalar)
+ return __n == 0 ? basic_vec() : basic_vec(static_cast<value_type>(*__mem));
+ else if (__is_const_known_equal_to(__n >= size_t(_S_size), true))
+ return basic_vec(_LoadCtorTag(), __mem);
+ else if constexpr (!__converts_trivially<_Up, value_type>)
+ return static_cast<basic_vec>(rebind_t<_Up, basic_vec>::_S_partial_load(__mem, __n));
+ else
+ {
+#if _GLIBCXX_X86
+ if constexpr (_Traits._M_have_avx512f()
+ || (_Traits._M_have_avx() && sizeof(_Up) >= 4))
+ {
+ const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
+ : mask_type(true);
+ return _S_masked_load(__mem, __k);
+ }
+#endif
+ if (__n >= size_t(_S_size)) [[unlikely]]
+ return basic_vec(_LoadCtorTag(), __mem);
+#if _GLIBCXX_X86 // TODO: where else is this "safe"?
+ // allow out-of-bounds read when it cannot lead to a #GP
+ else if (__is_const_known_equal_to(
+ is_sufficiently_aligned<sizeof(_Up) * _S_full_size>(__mem), true))
+ return __select_impl(mask_type::_S_partial_mask_of_n(int(__n)),
+ basic_vec(_LoadCtorTag(), __mem), basic_vec());
+#endif
+ else if constexpr (_S_size > 4)
+ {
+ alignas(_DataType) byte __dst[sizeof(_DataType)] = {};
+ const byte* __src = reinterpret_cast<const byte*>(__mem);
+ __memcpy_chunks<sizeof(_Up), sizeof(_DataType)>(__dst, __src, __n);
+ return __builtin_bit_cast(_DataType, __dst);
+ }
+ else if (__n == 0) [[unlikely]]
+ return basic_vec();
+ else if constexpr (_S_size == 2)
+ return _DataType {static_cast<value_type>(__mem[0]), 0};
+ else
+ {
+ constexpr auto [...__is] = _IotaArray<_S_size - 2>;
+ return _DataType{
+ static_cast<value_type>(__mem[0]),
+ static_cast<value_type>(__is + 1 < __n ? __mem[__is + 1] : 0)...
+ };
+ }
+ }
+ }
+
+ /** @internal
+ * Loads elements from @p __mem according to mask @p __k.
+ *
+ * @param __mem Pointer (in)to array.
+ * @param __k Mask controlling which elements to load. For each bit i in the mask:
+ * - If bit i is 1: copy __mem[i] into result[i]
+ * - If bit i is 0: result[i] is value-initialized (i.e. zero)
+ *
+ * @note This function assumes it's called after determining that no other method
+ * (like full load) is more appropriate. Calling with all mask bits set to 1
+ * is suboptimal for performance but still correct.
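+ *
+ * Example (illustrative): for a 4-element vector and a mask with bits 0 and 2 set, __mem[0]
+ * and __mem[2] are loaded; result[1] and result[3] are value-initialized.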
+ */
+ template <typename _Up, _ArchTraits _Traits = {}>
+ static inline basic_vec
+ _S_masked_load(const _Up* __mem, mask_type __k)
+ {
+ if constexpr (_S_size == 1)
+ return __k[0] ? static_cast<value_type>(__mem[0]) : value_type();
+#if _GLIBCXX_X86
+ else if constexpr (_Traits._M_have_avx512f())
+ return __x86_masked_load<_DataType>(__mem, __k._M_data);
+ else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
+ {
+ if constexpr (__converts_trivially<_Up, value_type>)
+ return __x86_masked_load<_DataType>(__mem, __k._M_data);
+ else
+ {
+ using _UV = rebind_t<_Up, basic_vec>;
+ return basic_vec(_UV::_S_masked_load(__mem, typename _UV::mask_type(__k)));
+ }
+ }
+#endif
+ else if (__k._M_none_of()) [[unlikely]]
+ return basic_vec();
+ else if constexpr (_S_is_scalar)
+ return basic_vec(static_cast<value_type>(*__mem));
+ else
+ {
+ // Use at least 4-byte __bits in __bit_foreach for better code-gen
+ _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
+ [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
+ if constexpr (__converts_trivially<_Up, value_type>)
+ {
+ _DataType __r = {};
+ __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
+ __r[__i] = __mem[__i];
+ });
+ return __r;
+ }
+ else
+ {
+ using _UV = rebind_t<_Up, basic_vec>;
+ alignas(_UV) _Up __tmp[sizeof(_UV) / sizeof(_Up)] = {};
+ __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
+ __tmp[__i] = __mem[__i];
+ });
+ return basic_vec(__builtin_bit_cast(_UV, __tmp));
+ }
+ }
+ }
+
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ inline void
+ _M_store(_Up* __mem) const
+ {
+ if constexpr (__converts_trivially<value_type, _Up>)
+ __builtin_memcpy(__mem, &_M_data, sizeof(_Up) * _S_size);
+ else
+ rebind_t<_Up, basic_vec>(*this)._M_store(__mem);
+ }
+
+ /** @internal
+ * Implementation of @ref partial_store.
+ *
+ * @note This is a static function to allow passing @p __v via register in case the function
+ * is not inlined.
+ *
+ * @note The function is not marked @c __always_inline__ since code-gen can become fairly
+ * long.
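+ *
+ * Example (illustrative): with a 4-element vector and @p __n == 3, __v[0..2] are written to
+ * __mem[0..2] and __mem[3] is left untouched.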
+ */
+ template <typename _Up, _ArchTraits _Traits = {}>
+ static inline void
+ _S_partial_store(const basic_vec __v, _Up* __mem, size_t __n)
+ {
+ if (__is_const_known_equal_to(__n >= _S_size, true))
+ __v._M_store(__mem);
+#if _GLIBCXX_X86
+ else if constexpr (_Traits._M_have_avx512f() && !_S_is_scalar)
+ {
+ const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
+ : mask_type(true);
+ return _S_masked_store(__v, __mem, __k);
+ }
+#endif
+ else if (__n >= _S_size) [[unlikely]]
+ __v._M_store(__mem);
+ else if (__n == 0) [[unlikely]]
+ return;
+ else if constexpr (__converts_trivially<value_type, _Up>)
+ {
+ byte* __dst = reinterpret_cast<byte*>(__mem);
+ const byte* __src = reinterpret_cast<const byte*>(&__v._M_data);
+ __memcpy_chunks<sizeof(_Up), sizeof(_M_data)>(__dst, __src, __n);
+ }
+ else
+ {
+ using _UV = rebind_t<_Up, basic_vec>;
+ _UV::_S_partial_store(_UV(__v), __mem, __n);
+ }
+ }
+
+ /** @internal
+ * Stores elements of @p __v to @p __mem according to mask @p __k.
+ *
+ * @param __v Values to store to @p __mem.
+ * @param __mem Pointer (in)to array.
+ * @param __k Mask controlling which elements to store. For each bit i in the mask:
+ * - If bit i is 1: store __v[i] to __mem[i]
+ * - If bit i is 0: __mem[i] is left unchanged
+ *
+ * @note This function assumes it's called after determining that no other method
+ * (like full store) is more appropriate. Calling with all mask bits set to 1
+ * is suboptimal for performance but still correct.
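+ *
+ * Example (illustrative): for a 4-element vector and a mask with only bit 1 set, only
+ * __mem[1] is written; all other memory locations are left unchanged.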
+ */
+ template <typename _Up, _ArchTraits _Traits = {}>
+ //[[__gnu__::__always_inline__]]
+ static inline void
+ _S_masked_store(const basic_vec __v, _Up* __mem, const mask_type __k)
+ {
+#if _GLIBCXX_X86
+ if constexpr (_Traits._M_have_avx512f())
+ {
+ __x86_masked_store(__v._M_data, __mem, __k._M_data);
+ return;
+ }
+ else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
+ {
+ if constexpr (__converts_trivially<value_type, _Up>)
+ __x86_masked_store(__v._M_data, __mem, __k._M_data);
+ else
+ {
+ using _UV = rebind_t<_Up, basic_vec>;
+ _UV::_S_masked_store(_UV(__v), __mem, typename _UV::mask_type(__k));
+ }
+ return;
+ }
+#endif
+ if (__k._M_none_of()) [[unlikely]]
+ return;
+ else if constexpr (_S_is_scalar)
+ __mem[0] = __v._M_data;
+ else
+ {
+ // Use at least 4-byte __bits in __bit_foreach for better code-gen
+ _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
+ [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
+ if constexpr (__converts_trivially<value_type, _Up>)
+ {
+ __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
+ __mem[__i] = __v[__i];
+ });
+ }
+ else
+ {
+ const rebind_t<_Up, basic_vec> __cvted(__v);
+ __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
+ __mem[__i] = __cvted[__i];
+ });
+ }
+ }
+ }
+
+ // [simd.overview] default constructor ----------------------------------
+ basic_vec() = default;
+
+ // [simd.overview] p2 impl-def conversions ------------------------------
+ using _NativeVecType = decltype([] {
+ if constexpr (_S_is_scalar)
+ return __vec_builtin_type<__canon_value_type, 1>();
+ else
+ return _DataType();
+ }());
+ /**
+ * @brief Converting constructor from GCC vector builtins.
+ *
+ * This constructor enables direct construction from GCC vector builtins
+ * (`[[gnu::vector_size(N)]]`).
+ *
+ * @param __x GCC vector builtin to convert from.
+ *
+ * @note This constructor is not available when size() equals 1.
+ *
+ * @see operator _NativeVecType() for the reverse conversion.
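+ *
+ * Example (illustrative sketch, assuming the `vec` alias and that the native vector width
+ * matches):
+ * @code
+ * using __v4sf [[__gnu__::__vector_size__(16)]] = float;
+ * __v4sf __raw = {1.f, 2.f, 3.f, 4.f};
+ * std::simd::vec<float, 4> __v = __raw; // element-wise copy of __raw
+ * @endcode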
+ */
+ constexpr
+ basic_vec(_NativeVecType __x)
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ if constexpr (_S_is_scalar)
+ return __x[0];
+ else
+ return __x;
+ }())
+ {}
+
+ /**
+ * @brief Conversion operator to GCC vector builtins.
+ *
+ * This operator enables implicit conversion from basic_vec to GCC vector builtins.
+ *
+ * @note This operator is not available when size() equals 1.
+ *
+ * @see basic_vec(_NativeVecType) for the reverse conversion.
+ */
+ constexpr
+ operator _NativeVecType() const
+ {
+ if constexpr (_S_is_scalar)
+ return _NativeVecType{_M_data};
+ else
+ return _M_data;
+ }
+
+#if _GLIBCXX_X86
+ /**
+ * @brief Converting constructor from Intel Intrinsics (__m128, __m128i, ...).
+ */
+ template <__vec_builtin _IV>
+ requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
+ && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
+ && !is_same_v<_IV, _DataType>)
+ constexpr
+ basic_vec(_IV __x)
+ : _M_data(reinterpret_cast<_DataType>(__x))
+ {}
+
+ /**
+ * @brief Conversion operator to Intel Intrinsics (__m128, __m128i, ...).
+ */
+ template <__vec_builtin _IV>
+ requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
+ && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
+ && !is_same_v<_IV, _DataType>)
+ constexpr
+ operator _IV() const
+ { return reinterpret_cast<_IV>(_M_data); }
+#endif
+
+ // [simd.ctor] broadcast constructor ------------------------------------
+ /**
+ * @brief Broadcast constructor from scalar value.
+ *
+ * Constructs a vector where all elements are initialized to the same scalar value.
+ * The scalar value is converted to the vector's element type.
+ *
+ * @param __x Scalar value to broadcast to all vector elements.
+ * @tparam _Up Type of scalar value (must be explicitly convertible to value_type).
+ *
+ * @note The constructor is implicit if the conversion (if any) is value-preserving.
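+ *
+ * Example (illustrative): vec<float, 4>(2.5f) yields a vector with all four elements equal
+ * to 2.5f. Broadcasting a value whose conversion is not value-preserving requires the
+ * explicit form.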
+ */
+ template <__explicitly_convertible_to<value_type> _Up>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(!__broadcast_constructible<_Up, value_type>)
+ basic_vec(_Up&& __x) noexcept
+ : _M_data(_DataType() == _DataType() ? static_cast<value_type>(__x) : value_type())
+ {}
+
+ template <__simd_vec_bcast_consteval<value_type> _Up>
+ consteval
+ basic_vec(_Up&& __x)
+ : _M_data(_DataType() == _DataType()
+ ? __value_preserving_cast<value_type>(__x) : value_type())
+ {}
+
+ // [simd.ctor] conversion constructor -----------------------------------
+ template <typename _Up, typename _UAbi, _TargetTraits _Traits = {}>
+ requires (_S_size == _UAbi::_S_size)
+ && __explicitly_convertible_to<_Up, value_type>
+ [[__gnu__::__always_inline__]]
+ constexpr
+ explicit(!__value_preserving_convertible_to<_Up, value_type>
+ || __higher_rank_than<_Up, value_type>)
+ basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ if constexpr (_S_is_scalar)
+ return static_cast<value_type>(__x[0]);
+ else if constexpr (_UAbi::_S_nreg >= 2)
+ // __builtin_convertvector (__vec_cast) is inefficient for over-sized inputs.
+ // Also e.g. vec<float, 12> -> vec<char, 12> (with SSE2) would otherwise emit 4
+ // vcvttps2dq instructions, where only 3 are needed
+ return _S_concat(resize_t<__x._N0, basic_vec>(__x._M_data0),
+ resize_t<__x._N1, basic_vec>(__x._M_data1))._M_data;
+ else
+ return __vec_cast<_DataType>(__x._M_concat_data());
+ }())
+ {}
+
+ using _VecBase<_Tp, _Ap>::_VecBase;
+
+ // [simd.ctor] generator constructor ------------------------------------
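+ // The generator is invoked with an integral_constant for every element index i, i.e. element
+ // i is initialized to __gen(__simd_size_c<i>). For example (illustrative),
+ // basic_vec([](auto __i) { return __i * 2; }) yields {0, 2, 4, ...}.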
+ template <__simd_generator_invokable<value_type, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_vec(_Fp&& __gen)
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ return _DataType{static_cast<value_type>(__gen(__simd_size_c<__is>))...};
+ }())
+ {}
+
+ // [simd.ctor] load constructor -----------------------------------------
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_vec(_LoadCtorTag, const _Up* __ptr)
+ : _M_data()
+ {
+ if constexpr (_S_is_scalar)
+ _M_data = static_cast<value_type>(__ptr[0]);
+ else if consteval
+ {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ _M_data = _DataType{static_cast<value_type>(__ptr[__is])...};
+ }
+ else
+ {
+ if constexpr (__converts_trivially<_Up, value_type>)
+ // This assumes std::floatN_t to be bitwise equal to float/double
+ __builtin_memcpy(&_M_data, __ptr, sizeof(value_type) * _S_size);
+ else
+ {
+ __vec_builtin_type<_Up, _S_full_size> __tmp = {};
+ __builtin_memcpy(&__tmp, __ptr, sizeof(_Up) * _S_size);
+ _M_data = __vec_cast<_DataType>(__tmp);
+ }
+ }
+ }
+
+ template <ranges::contiguous_range _Rg, typename... _Flags>
+ requires __static_sized_range<_Rg, _S_size>
+ && __vectorizable<ranges::range_value_t<_Rg>>
+ && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
+ : basic_vec(_LoadCtorTag(), __flags.template _S_adjust_pointer<basic_vec>(
+ ranges::data(__range)))
+ {
+ static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
+ _Flags...>);
+ }
+
+ // [simd.subscr] --------------------------------------------------------
+ /**
+ * @brief Return the value of the element at index @p __i.
+ *
+ * @pre __i >= 0 && __i < size().
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else
+ return _M_data[__i];
+ }
+
+ // [simd.unary] unary operators -----------------------------------------
+ // increment and decrement are implemented in terms of operator+=/-= which avoids UB on
+ // padding elements while not breaking UBsan
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec&
+ operator++() noexcept requires requires(value_type __a) { ++__a; }
+ { return *this += value_type(1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator++(int) noexcept requires requires(value_type __a) { __a++; }
+ {
+ basic_vec __r = *this;
+ *this += value_type(1);
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec&
+ operator--() noexcept requires requires(value_type __a) { --__a; }
+ { return *this -= value_type(1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator--(int) noexcept requires requires(value_type __a) { __a--; }
+ {
+ basic_vec __r = *this;
+ *this -= value_type(1);
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ operator!() const noexcept requires requires(value_type __a) { !__a; }
+ { return *this == value_type(); }
+
+ /**
+ * @brief Unary plus operator (no-op).
+ *
+ * Returns an unchanged copy of the object.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator+() const noexcept requires requires(value_type __a) { +__a; }
+ { return *this; }
+
+ /**
+ * @brief Unary negation operator.
+ *
+ * Returns a new SIMD vector after element-wise negation.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator-() const noexcept requires requires(value_type __a) { -__a; }
+ { return _S_init(-_M_data); }
+
+ /**
+ * @brief Bitwise NOT / complement operator.
+ *
+ * Returns a new SIMD vector after element-wise complement.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator~() const noexcept requires requires(value_type __a) { ~__a; }
+ { return _S_init(~_M_data); }
+
+ // [simd.cassign] compound assignment operators --------------------------
+ /**
+ * @brief Bitwise AND compound assignment operator.
+ *
+ * Performs element-wise AND of @p __y into @p __x and returns a reference to @p __x.
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator&=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a & __a; }
+ {
+ __x._M_data &= __y._M_data;
+ return __x;
+ }
+
+ /**
+ * @brief Bitwise OR compound assignment operator.
+ *
+ * Performs element-wise OR of @p __y into @p __x and returns a reference to @p __x.
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator|=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a | __a; }
+ {
+ __x._M_data |= __y._M_data;
+ return __x;
+ }
+
+ /**
+ * @brief Bitwise XOR compound assignment operator.
+ *
+ * Performs element-wise XOR of @p __y into @p __x and returns a reference to @p __x.
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator^=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a ^ __a; }
+ {
+ __x._M_data ^= __y._M_data;
+ return __x;
+ }
+
+ /**
+ * @brief Applies the compound assignment operator element-wise.
+ *
+ * @pre If @c value_type is a signed integral type, the result must be representable by
+ * @c value_type. (This does not apply to padding elements the implementation might add for
+ * non-power-of-2 widths.) On overflow, UBsan only observes a call to @c __builtin_unreachable().
+ *
+ * @note The overflow detection code is discarded unless UBsan is active.
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator+=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a + __a; }
+ {
+ if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
+ { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
+ // remove UB of the active elements (so that UBsan can still do its job).
+ //
+ // This check is essentially free (at runtime) because DCE removes everything except
+ // the final change to _M_data. The overflow check is only emitted if UBsan is active.
+ //
+ // The alternative would be to always zero padding elements after operations that can
+ // produce non-zero values. However, right now:
+ // - auto f(simd::mask<int, 3> k) { return +k; } is a single VPABSD and would additionally
+ // have to sanitize the padding
+ // - bit_cast to basic_vec with non-zero padding elements is fine
+ // - conversion from intrinsics can create non-zero padding elements
+ // - shuffles are allowed to put whatever they want into padding elements for
+ // optimization purposes (e.g. for better instruction selection)
+ using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+ const _DataType __result
+ = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+ + reinterpret_cast<_UV>(__y._M_data));
+ const auto __positive = __y > value_type();
+ const auto __overflow = __positive != (__result > __x);
+ if (__overflow._M_any_of())
+ __builtin_unreachable(); // trigger UBsan
+ __x._M_data = __result;
+ }
+ else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+ __x = basic_vec(rebind_t<float, basic_vec>(__x) + __y);
+ else
+ __x._M_data += __y._M_data;
+ return __x;
+ }
+
+ /** @copydoc operator+=
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator-=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a - __a; }
+ {
+ if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
+ { // see comment on operator+=
+ using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+ const _DataType __result
+ = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+ - reinterpret_cast<_UV>(__y._M_data));
+ const auto __positive = __y > value_type();
+ const auto __overflow = __positive != (__result < __x);
+ if (__overflow._M_any_of())
+ __builtin_unreachable(); // trigger UBsan
+ __x._M_data = __result;
+ }
+ else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+ __x = basic_vec(rebind_t<float, basic_vec>(__x) - __y);
+ else
+ __x._M_data -= __y._M_data;
+ return __x;
+ }
+
+ /** @copydoc operator+=
+ */
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator*=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a * __a; }
+ {
+ if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
+ { // see comment on operator+=
+ for (int __i = 0; __i < _S_size; ++__i)
+ {
+ if (__builtin_mul_overflow_p(__x._M_data[__i], __y._M_data[__i], value_type()))
+ __builtin_unreachable();
+ }
+ using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+ __x._M_data = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+ * reinterpret_cast<_UV>(__y._M_data));
+ }
+
+ // 'uint16 * uint16' promotes to int and can therefore lead to UB. The standard does not
+ // require us to avoid this undefined behavior, but it is unnecessary and easy to avoid. It
+ // is also unexpected because there's no UB on the vector types (which don't promote).
+ else if constexpr (_S_is_scalar && is_unsigned_v<value_type>
+ && is_signed_v<decltype(value_type() * value_type())>)
+ __x._M_data = unsigned(__x._M_data) * unsigned(__y._M_data);
+
+ else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+ __x = basic_vec(rebind_t<float, basic_vec>(__x) * __y);
+
+ else
+ __x._M_data *= __y._M_data;
+ return __x;
+ }
+
+ template <_TargetTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator/=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a / __a; }
+ {
+ const basic_vec __result([&](int __i) -> value_type { return __x[__i] / __y[__i]; });
+ if (__is_const_known(__result))
+ // the optimizer already knows the values of the result
+ return __x = __result;
+
+#ifdef __SSE2__
+ // x86 doesn't have integral SIMD division instructions, so divide via floating-point.
+ // While the floating-point division itself is faster, the required conversions are still a
+ // problem: see PR121274, PR121284, and PR121296 for missed optimizations wrt. conversions.
+ //
+ // With only 1 or 2 divisions, the conversion to and from fp is too expensive.
+ if constexpr (is_integral_v<value_type> && _S_size > 2
+ && __value_preserving_convertible_to<value_type, double>)
+ {
+ // If the denominator (y) is known to the optimizer, don't convert to fp because the
+ // integral division can be translated into shifts/multiplications.
+ if (!__is_const_known(__y))
+ {
+ // With AVX512FP16 use vdivph for 8-bit integers
+ if constexpr (_Traits._M_have_avx512fp16()
+ && __value_preserving_convertible_to<value_type, _Float16>)
+ return __x = basic_vec(rebind_t<_Float16, basic_vec>(__x) / __y);
+ else if constexpr (__value_preserving_convertible_to<value_type, float>)
+ return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
+ else
+ return __x = basic_vec(rebind_t<double, basic_vec>(__x) / __y);
+ }
+ }
+#endif
+ if constexpr (_Traits._M_eval_as_f32<value_type>())
+ return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
+
+ basic_vec __y1 = __y;
+ if constexpr (_S_is_partial)
+ {
+ if constexpr (is_integral_v<value_type>)
+ {
+ // Assume integral division doesn't have SIMD instructions and must be done per
+ // element anyway. Partial vectors should skip their padding elements.
+ for (int __i = 0; __i < _S_size; ++__i)
+ __x._M_data[__i] /= __y._M_data[__i];
+ return __x;
+ }
+ else
+ __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
+ __y, basic_vec(value_type(1)));
+ }
+ __x._M_data /= __y1._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator%=(basic_vec& __x, const basic_vec& __y) noexcept
+ requires requires(value_type __a) { __a % __a; }
+ {
+ static_assert(is_integral_v<value_type>);
+ if constexpr (_S_is_partial)
+ {
+ const basic_vec __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
+ __y, basic_vec(value_type(1)));
+ if (__is_const_known(__y1))
+ __x._M_data %= __y1._M_data;
+ else
+ {
+ // Assume integral division doesn't have SIMD instructions and must be done per
+ // element anyway. Partial vectors should skip their padding elements.
+ for (int __i = 0; __i < _S_size; ++__i)
+ __x._M_data[__i] %= __y._M_data[__i];
+ }
+ }
+ else
+ __x._M_data %= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator<<=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a) { __a << __a; }
+ {
+ __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
+ "negative shift is undefined behavior");
+ __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
+ "too large shift invokes undefined behavior");
+ __x._M_data <<= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator>>=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a) { __a >> __a; }
+ {
+ __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
+ "negative shift is undefined behavior");
+ __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
+ "too large shift invokes undefined behavior");
+ __x._M_data >>= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
+ {
+ __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
+ __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
+ "too large shift invokes undefined behavior");
+ __x._M_data <<= __y;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
+ {
+ __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
+ __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
+ "too large shift invokes undefined behavior");
+ __x._M_data >>= __y;
+ return __x;
+ }
+
+ // [simd.comparison] ----------------------------------------------------
+#if _GLIBCXX_X86
+ template <_X86Cmp _Cmp>
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_bitmask_cmp(_DataType __y) const
+ {
+ static_assert(_S_use_bitmask);
+ if (__is_const_known(_M_data, __y))
+ {
+ constexpr auto [...__is] = _IotaArray<_S_size>;
+ constexpr auto __cmp_op = [] [[__gnu__::__always_inline__]]
+ (value_type __a, value_type __b) {
+ if constexpr (_Cmp == _X86Cmp::_Eq)
+ return __a == __b;
+ else if constexpr (_Cmp == _X86Cmp::_Lt)
+ return __a < __b;
+ else if constexpr (_Cmp == _X86Cmp::_Le)
+ return __a <= __b;
+ else if constexpr (_Cmp == _X86Cmp::_Unord)
+ return std::isunordered(__a, __b);
+ else if constexpr (_Cmp == _X86Cmp::_Neq)
+ return __a != __b;
+ else if constexpr (_Cmp == _X86Cmp::_Nlt)
+ return !(__a < __b);
+ else if constexpr (_Cmp == _X86Cmp::_Nle)
+ return !(__a <= __b);
+ else
+ static_assert(false);
+ };
+ const _Bitmask<_S_size> __bits
+ = ((__cmp_op(__vec_get(_M_data, __is), __vec_get(__y, __is))
+ ? (1ULL << __is) : 0) | ...);
+ return mask_type::_S_init(__bits);
+ }
+ else
+ return mask_type::_S_init(__x86_bitmask_cmp<_Cmp>(_M_data, __y));
+ }
+#endif
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator==(const basic_vec& __x, const basic_vec& __y) noexcept
+ {
+#if _GLIBCXX_X86
+ if constexpr (_S_use_bitmask)
+ return __x._M_bitmask_cmp<_X86Cmp::_Eq>(__y._M_data);
+ else
+#endif
+ return mask_type::_S_init(__x._M_data == __y._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
+ {
+#if _GLIBCXX_X86
+ if constexpr (_S_use_bitmask)
+ return __x._M_bitmask_cmp<_X86Cmp::_Neq>(__y._M_data);
+ else
+#endif
+ return mask_type::_S_init(__x._M_data != __y._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator<(const basic_vec& __x, const basic_vec& __y) noexcept
+ {
+#if _GLIBCXX_X86
+ if constexpr (_S_use_bitmask)
+ return __x._M_bitmask_cmp<_X86Cmp::_Lt>(__y._M_data);
+ else
+#endif
+ return mask_type::_S_init(__x._M_data < __y._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
+ {
+#if _GLIBCXX_X86
+ if constexpr (_S_use_bitmask)
+ return __x._M_bitmask_cmp<_X86Cmp::_Le>(__y._M_data);
+ else
+#endif
+ return mask_type::_S_init(__x._M_data <= __y._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator>(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return __y < __x; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return __y <= __x; }
+
+ // [simd.cond] ---------------------------------------------------------
+ template <_TargetTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec
+ __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
+ {
+ if constexpr (_S_size == 1)
+ return __k[0] ? __t : __f;
+ else if constexpr (_S_use_bitmask)
+ {
+#if _GLIBCXX_X86
+ if (__is_const_known(__k, __t, __f))
+ return basic_vec([&](int __i) { return __k[__i] ? __t[__i] : __f[__i]; });
+ else
+ return __x86_bitmask_blend(__k._M_data, __t._M_data, __f._M_data);
+#else
+ static_assert(false, "TODO");
+#endif
+ }
+ else if consteval
+ {
+ return __k._M_data ? __t._M_data : __f._M_data;
+ }
+ else
+ {
+ constexpr bool __uses_simd_register = sizeof(_M_data) >= 8;
+ using _VO = _VecOps<_DataType>;
+ if (_VO::_S_is_const_known_equal_to(__f._M_data, 0))
+ {
+ if (is_integral_v<value_type> && __uses_simd_register
+ && _VO::_S_is_const_known_equal_to(__t._M_data, 1))
+ // This is equivalent to converting the mask into a vec of 0s and 1s. So +__k.
+ // However, basic_mask::operator+ arrives here; returning +__k would be
+ // recursive. Instead we use -__k (which is a no-op for vector-masks) and then
+ // flip all -1 elements to +1 by taking the absolute value.
+ return basic_vec((-__k)._M_abs());
+ else
+ return __vec_and(reinterpret_cast<_DataType>(__k._M_data), __t._M_data);
+ }
+ else if (_VecOps<_DataType>::_S_is_const_known_equal_to(__t._M_data, 0))
+ {
+ if (is_integral_v<value_type> && __uses_simd_register
+ && _VO::_S_is_const_known_equal_to(__f._M_data, 1))
+ return value_type(1) + basic_vec(-__k);
+ else
+ return __vec_and(reinterpret_cast<_DataType>(__vec_not(__k._M_data)), __f._M_data);
+ }
+ else
+ {
+#if _GLIBCXX_X86
+ // This works around bad code-gen when the compiler can't see that __k is a vector-mask.
+ // The pattern is recognized to match the x86 blend instructions, which only consider
+ // the sign bit of the mask register. Also, without SSE4, if the compiler knows that __k
+ // is a vector-mask, then the '< 0' is elided.
+ return __k._M_data < 0 ? __t._M_data : __f._M_data;
+#endif
+ return __k._M_data ? __t._M_data : __f._M_data;
+ }
+ }
+ }
+ };
+
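+ // Partial specialization for ABIs spanning more than one register: the vector is split into
+ // a power-of-two low part (_M_data0, _N0 elements) and a high part (_M_data1, the remaining
+ // _N1 elements); all operations recurse into these two halves.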
+ template <__vectorizable _Tp, __abi_tag _Ap>
+ requires (_Ap::_S_nreg > 1)
+ class basic_vec<_Tp, _Ap>
+ : public _VecBase<_Tp, _Ap>
+ {
+ template <typename, typename>
+ friend class basic_vec;
+
+ template <size_t, typename>
+ friend class basic_mask;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
+ static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
+
+ static constexpr int _N1 = _S_size - _N0;
+
+ using _DataType0 = __similar_vec<_Tp, _N0, _Ap>;
+
+ // the implementation (and users) depend on elements being contiguous in memory
+ static_assert(_N0 * sizeof(_Tp) == sizeof(_DataType0));
+
+ using _DataType1 = __similar_vec<_Tp, _N1, _Ap>;
+
+ static_assert(_DataType0::abi_type::_S_nreg + _DataType1::abi_type::_S_nreg == _Ap::_S_nreg);
+
+ static constexpr bool _S_is_scalar = _DataType0::_S_is_scalar;
+
+ _DataType0 _M_data0;
+
+ _DataType1 _M_data1;
+
+ static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask;
+
+ static constexpr bool _S_is_partial = _DataType1::_S_is_partial;
+
+ public:
+ using value_type = _Tp;
+
+ using mask_type = _VecBase<_Tp, _Ap>::mask_type;
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_init(const _DataType0& __x, const _DataType1& __y)
+ {
+ basic_vec __r;
+ __r._M_data0 = __x;
+ __r._M_data1 = __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _DataType0&
+ _M_get_low() const
+ { return _M_data0; }
+
+ [[__gnu__::__always_inline__]]
+ constexpr const _DataType1&
+ _M_get_high() const
+ { return _M_data1; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr bool
+ __is_const_known(const basic_vec& __x)
+ { return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
+ {
+ return __vec_concat(_M_data0._M_concat_data(false),
+ __vec_zero_pad_to<sizeof(_M_data0)>(
+ _M_data1._M_concat_data(__do_sanitize)));
+ }
+
+ template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
+ {
+ return _S_init(
+ _DataType0::template _S_static_permute<_Size, _Offset>(__x, __idxmap),
+ _DataType1::template _S_static_permute<_Size, _Offset + _N0>(__x, __idxmap));
+ }
+
+ template <typename _Vp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Vp::_S_size;
+ constexpr int __rem = _S_size % _Vp::_S_size;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ if constexpr (__rem == 0)
+ return array<_Vp, __n>{__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>,
+ _M_data0, _M_data1)...};
+ else
+ {
+ using _Rest = resize_t<__rem, _Vp>;
+ return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, _M_data0, _M_data1)...,
+ __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, _M_data0, _M_data1));
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr const basic_vec&
+ _S_concat(const basic_vec& __x0) noexcept
+ { return __x0; }
+
+ template <typename... _As>
+ requires (sizeof...(_As) >= 2)
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_vec
+ _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
+ {
+ static_assert(_S_size == (_As::_S_size + ...));
+ return _S_init(__extract_simd_at<_DataType0>(cw<0>, __xs...),
+ __extract_simd_at<_DataType1>(cw<_N0>, __xs...));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_reduce_to_half(auto __binary_op) const requires (_N0 == _N1)
+ { return __binary_op(_M_data0, _M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ _M_reduce_tail(const auto& __rest, auto __binary_op) const
+ {
+ if constexpr (__rest.size() > _S_size)
+ {
+ auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
+ return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
+ }
+ else if constexpr (__rest.size() == _S_size)
+ return __binary_op(*this, __rest)._M_reduce(__binary_op);
+ else
+ return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
+ }
+
+ template <typename _BinaryOp, _TargetTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ _M_reduce(_BinaryOp __binary_op) const
+ {
+ if constexpr (_Traits.template _M_eval_as_f32<value_type>()
+ && (is_same_v<_BinaryOp, plus<>>
+ || is_same_v<_BinaryOp, multiplies<>>))
+ return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
+#ifdef __SSE2__
+ else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
+ && is_same_v<decltype(__binary_op), multiplies<>>)
+ {
+ // convert to unsigned short because of missing 8-bit mul instruction
+ // we don't need to preserve the order of elements
+ //
+ // The left columns under Latency and Throughput show bit-cast to ushort with shift by
+ // 8. The right column uses the alternative in the else branch.
+ // Benchmark on Intel Ultra 7 165U (AVX2)
+ //   TYPE          Latency [cycles/call]   Throughput [cycles/call]
+ //   schar,  64    59.9    70.7            10.5   13.3
+ //   schar, 128    81.4    97.2            12.2   21
+ //   schar, 256    92.4   129              17.2   35.2
+ if constexpr (_DataType1::_S_is_scalar)
+ return __binary_op(_DataType1(_M_data0._M_reduce(__binary_op)), _M_data1)[0];
+ // TODO: optimize trailing scalar (e.g. (8+8)+(8+1))
+ else if constexpr (_S_size % 2 == 0)
+ { // If all elements participate in the reduction we can take this shortcut
+ using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
+ auto __a = __builtin_bit_cast(_V16, *this);
+ return __binary_op(__a, __a >> __CHAR_BIT__)._M_reduce(__binary_op);
+ }
+ else
+ {
+ using _V16 = rebind_t<unsigned short, basic_vec>;
+ return _V16(*this)._M_reduce(__binary_op);
+ }
+ }
+#endif
+ else
+ return _M_data0._M_reduce_tail(_M_data1, __binary_op);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isnan() const requires is_floating_point_v<value_type>
+ { return mask_type::_S_init(_M_data0._M_isnan(), _M_data1._M_isnan()); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isinf() const requires is_floating_point_v<value_type>
+ { return mask_type::_S_init(_M_data0._M_isinf(), _M_data1._M_isinf()); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
+ {
+ return mask_type::_S_init(_M_data0._M_isunordered(__y._M_data0),
+ _M_data1._M_isunordered(__y._M_data1));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ _M_abs() const requires signed_integral<value_type>
+ { return _S_init(_M_data0._M_abs(), _M_data1._M_abs()); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ _M_fabs() const requires floating_point<value_type>
+ { return _S_init(_M_data0._M_fabs(), _M_data1._M_fabs()); }
+
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ static inline basic_vec
+ _S_partial_load(const _Up* __mem, size_t __n)
+ {
+ if (__n >= _N0)
+ return _S_init(_DataType0(_LoadCtorTag(), __mem),
+ _DataType1::_S_partial_load(__mem + _N0, __n - _N0));
+ else
+ return _S_init(_DataType0::_S_partial_load(__mem, __n),
+ _DataType1());
+ }
+
+ template <typename _Up, _ArchTraits _Traits = {}>
+ static inline basic_vec
+ _S_masked_load(const _Up* __mem, mask_type __k)
+ {
+ return _S_init(_DataType0::_S_masked_load(__mem, __k._M_data0),
+ _DataType1::_S_masked_load(__mem + _N0, __k._M_data1));
+ }
+
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ inline void
+ _M_store(_Up* __mem) const
+ {
+ _M_data0._M_store(__mem);
+ _M_data1._M_store(__mem + _N0);
+ }
+
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ static inline void
+ _S_partial_store(const basic_vec& __v, _Up* __mem, size_t __n)
+ {
+ if (__n >= _N0)
+ {
+ __v._M_data0._M_store(__mem);
+ _DataType1::_S_partial_store(__v._M_data1, __mem + _N0, __n - _N0);
+ }
+ else
+ {
+ _DataType0::_S_partial_store(__v._M_data0, __mem, __n);
+ }
+ }
+
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ static inline void
+ _S_masked_store(const basic_vec& __v, _Up* __mem, const mask_type& __k)
+ {
+ _DataType0::_S_masked_store(__v._M_data0, __mem, __k._M_data0);
+ _DataType1::_S_masked_store(__v._M_data1, __mem + _N0, __k._M_data1);
+ }
+
+ basic_vec() = default;
+
+ // [simd.overview] p2 impl-def conversions ------------------------------
+ using _NativeVecType = __vec_builtin_type<value_type, __bit_ceil(unsigned(_S_size))>;
+
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_vec(const _NativeVecType& __x)
+ : _M_data0(_VecOps<__vec_builtin_type<value_type, _N0>>::_S_extract(__x)),
+ _M_data1(_VecOps<__vec_builtin_type<value_type, __bit_ceil(unsigned(_N1))>>
+ ::_S_extract(__x, integral_constant<int, _N0>()))
+ {}
+
+ [[__gnu__::__always_inline__]]
+ constexpr
+ operator _NativeVecType() const
+ { return _M_concat_data(); }
+
+ // [simd.ctor] broadcast constructor ------------------------------------
+ template <__explicitly_convertible_to<value_type> _Up>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(!__broadcast_constructible<_Up, value_type>)
+ basic_vec(_Up&& __x) noexcept
+ : _M_data0(static_cast<value_type>(__x)), _M_data1(static_cast<value_type>(__x))
+ {}
+
+ template <__simd_vec_bcast_consteval<value_type> _Up>
+ consteval
+ basic_vec(_Up&& __x)
+ : _M_data0(__value_preserving_cast<value_type>(__x)),
+ _M_data1(__value_preserving_cast<value_type>(__x))
+ {}
+
+ // [simd.ctor] conversion constructor -----------------------------------
+ template <typename _Up, typename _UAbi>
+ requires (_S_size == _UAbi::_S_size)
+ && __explicitly_convertible_to<_Up, value_type>
+ [[__gnu__::__always_inline__]]
+ constexpr
+ explicit(!__value_preserving_convertible_to<_Up, value_type>
+ || __higher_rank_than<_Up, value_type>)
+ basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
+ : _M_data0(get<0>(chunk<_N0>(__x))),
+ _M_data1(get<1>(chunk<_N0>(__x)))
+ {}
+
+ using _VecBase<_Tp, _Ap>::_VecBase;
+
+ // [simd.ctor] generator constructor ------------------------------------
+ template <__simd_generator_invokable<value_type, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_vec(_Fp&& __gen)
+ : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
+ return __gen(__simd_size_c<__i + _N0>);
+ })
+ {}
+
+ // [simd.ctor] load constructor -----------------------------------------
+ template <typename _Up>
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_vec(_LoadCtorTag, const _Up* __ptr)
+ : _M_data0(_LoadCtorTag(), __ptr),
+ _M_data1(_LoadCtorTag(), __ptr + _N0)
+ {}
+
+ template <ranges::contiguous_range _Rg, typename... _Flags>
+ requires __static_sized_range<_Rg, _S_size>
+ && __vectorizable<ranges::range_value_t<_Rg>>
+ && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
+ constexpr
+ basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
+ : basic_vec(_LoadCtorTag(),
+ __flags.template _S_adjust_pointer<basic_vec>(ranges::data(__range)))
+ {
+ static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
+ _Flags...>);
+ }
+
+ // [simd.subscr] --------------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
+ if (__is_const_known(__i))
+ return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
+ else
+ {
+ using _AliasingT [[__gnu__::__may_alias__]] = value_type;
+ return reinterpret_cast<const _AliasingT*>(this)[__i];
+ }
+ }
+
+ // [simd.unary] unary operators -----------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec&
+ operator++() noexcept requires requires(value_type __a) { ++__a; }
+ {
+ ++_M_data0;
+ ++_M_data1;
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator++(int) noexcept requires requires(value_type __a) { __a++; }
+ {
+ basic_vec __r = *this;
+ ++_M_data0;
+ ++_M_data1;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec&
+ operator--() noexcept requires requires(value_type __a) { --__a; }
+ {
+ --_M_data0;
+ --_M_data1;
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator--(int) noexcept requires requires(value_type __a) { __a--; }
+ {
+ basic_vec __r = *this;
+ --_M_data0;
+ --_M_data1;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr mask_type
+ operator!() const noexcept requires requires(value_type __a) { !__a; }
+ { return mask_type::_S_init(!_M_data0, !_M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator+() const noexcept requires requires(value_type __a) { +__a; }
+ { return *this; }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator-() const noexcept requires requires(value_type __a) { -__a; }
+ { return _S_init(-_M_data0, -_M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_vec
+ operator~() const noexcept requires requires(value_type __a) { ~__a; }
+ { return _S_init(~_M_data0, ~_M_data1); }
+
+ // [simd.cassign] -------------------------------------------------------
+#define _GLIBCXX_SIMD_DEFINE_OP(sym) \
+ [[__gnu__::__always_inline__]] \
+ friend constexpr basic_vec& \
+ operator sym##=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT \
+ { \
+ __x._M_data0 sym##= __y._M_data0; \
+ __x._M_data1 sym##= __y._M_data1; \
+ return __x; \
+ }
+
+ _GLIBCXX_SIMD_DEFINE_OP(+)
+ _GLIBCXX_SIMD_DEFINE_OP(-)
+ _GLIBCXX_SIMD_DEFINE_OP(*)
+ _GLIBCXX_SIMD_DEFINE_OP(/)
+ _GLIBCXX_SIMD_DEFINE_OP(%)
+ _GLIBCXX_SIMD_DEFINE_OP(&)
+ _GLIBCXX_SIMD_DEFINE_OP(|)
+ _GLIBCXX_SIMD_DEFINE_OP(^)
+ _GLIBCXX_SIMD_DEFINE_OP(<<)
+ _GLIBCXX_SIMD_DEFINE_OP(>>)
+
+#undef _GLIBCXX_SIMD_DEFINE_OP
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
+ {
+ __x._M_data0 <<= __y;
+ __x._M_data1 <<= __y;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec&
+ operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+ requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
+ {
+ __x._M_data0 >>= __y;
+ __x._M_data1 >>= __y;
+ return __x;
+ }
+
+ // [simd.comparison] ----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator==(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 == __y._M_data0, __x._M_data1 == __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 != __y._M_data0, __x._M_data1 != __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator<(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 < __y._M_data0, __x._M_data1 < __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 <= __y._M_data0, __x._M_data1 <= __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator>(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 > __y._M_data0, __x._M_data1 > __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr mask_type
+ operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
+ { return mask_type::_S_init(__x._M_data0 >= __y._M_data0, __x._M_data1 >= __y._M_data1); }
+
+ // [simd.cond] ---------------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_vec
+ __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
+ {
+ return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
+ __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
+ }
+ };
+
+ // [simd.overview] deduction guide ------------------------------------------
+ template <ranges::contiguous_range _Rg, typename... _Ts>
+ requires __static_sized_range<_Rg>
+ basic_vec(_Rg&& __r, _Ts...)
+ -> basic_vec<ranges::range_value_t<_Rg>,
+ __deduce_abi_t<ranges::range_value_t<_Rg>,
+#if 0 // PR117849
+ ranges::size(__r)>>;
+#else
+ decltype(std::span(__r))::extent>>;
+#endif
+
+ template <size_t _Bytes, typename _Ap>
+ basic_vec(basic_mask<_Bytes, _Ap>)
+ -> basic_vec<__integer_from<_Bytes>,
+ decltype(__abi_rebind<__integer_from<_Bytes>, basic_mask<_Bytes, _Ap>::size.value,
+ _Ap>())>;
+
+ // [P3319R5] ----------------------------------------------------------------
+ template <__vectorizable _Tp>
+ requires is_arithmetic_v<_Tp>
+ inline constexpr _Tp
+ __iota<_Tp> = _Tp();
+
+ template <typename _Tp, typename _Ap>
+ inline constexpr basic_vec<_Tp, _Ap>
+ __iota<basic_vec<_Tp, _Ap>> = basic_vec<_Tp, _Ap>([](_Tp __i) -> _Tp {
+ static_assert(_Ap::_S_size - 1 <= numeric_limits<_Tp>::max(),
+ "iota object would overflow");
+ return __i;
+ });
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_VEC_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_SIMD_X86_H
+#define _GLIBCXX_SIMD_X86_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "vec_ops.h"
+
+#if !_GLIBCXX_X86
+#error "wrong include for this target"
+#endif
+
+#pragma GCC push_options
+// ensure GCC knows about the __builtin_ia32_* calls
+#pragma GCC target("avx2,bmi,bmi2,avx512vl,avx512bw,avx512dq,avx10.2")
+#pragma GCC pop_options
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ static constexpr size_t __x86_max_general_register_size
+#ifdef __x86_64__
+ = 8;
+#else
+ = 4;
+#endif
+
+ /** @internal
+ * Return a bit-mask for the given vector-mask.
+ *
+ * Caveats:
+ * 1. The bit-mask of 2-Byte vector-masks has duplicated entries (because a dedicated
+ * instruction is missing)
+ * 2. The return type internally is 'int', but that fails on conversion to uint64 if the MSB of a
+ * YMM 1/2-Byte vector-mask is set (sign extension). Therefore these helper functions return
+ * unsigned instead.
+ * 3. ZMM inputs are not supported.
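+ *
+ * Example (illustrative): for a 16-byte vector-mask of 4-byte elements where only element 3
+ * is set (all bits one), the returned bit-mask is 0b1000.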
+ */
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 16> __x)
+ { return __builtin_ia32_movmskpd(__vec_bit_cast<double>(__x)); }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 32> __x)
+ { return __builtin_ia32_movmskpd256(__vec_bit_cast<double>(__x)); }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 16> __x)
+ { return __builtin_ia32_movmskps(__vec_bit_cast<float>(__x)); }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _Bitmask<8>
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 8> __x)
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ if constexpr (_Traits._M_have_bmi2())
+ return _Bitmask<8>(__builtin_ia32_pext_di(
+ __builtin_bit_cast(unsigned long long, __x),
+ 0x80000000'80000000ULL));
+#endif
+ return _Bitmask<8>(__x86_movmsk(__vec_zero_pad_to_16(__x)));
+ }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 32> __x)
+ { return __builtin_ia32_movmskps256(__vec_bit_cast<float>(__x)); }
+
+ template <__vec_builtin _TV, auto _Traits = _ArchTraits()>
+ requires (sizeof(__vec_value_type<_TV>) <= 2)
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(_TV __x)
+ {
+ static_assert(__width_of<_TV> > 1);
+ if constexpr (sizeof(__x) == 32)
+ return __builtin_ia32_pmovmskb256(__vec_bit_cast<char>(__x));
+ else if constexpr (sizeof(__x) == 16)
+ return __builtin_ia32_pmovmskb128(__vec_bit_cast<char>(__x));
+ else if constexpr (sizeof(__x) == 8)
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
+ 0x8080'8080'8080'8080ULL);
+#endif
+ return __x86_movmsk(__vec_zero_pad_to_16(__x));
+ }
+ else if constexpr (sizeof(__x) == 4)
+ {
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__builtin_bit_cast(unsigned int, __x), 0x80808080u);
+#endif
+ return __x86_movmsk(__vec_zero_pad_to_16(__x));
+ }
+ else if constexpr (sizeof(__x) == 2)
+ {
+ auto __bits = __builtin_bit_cast(unsigned short, __x);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__bits, 0x00008080u);
+#endif
+ return ((__bits >> 7) & 1) | ((__bits & 0x8000) >> 14);
+ }
+ else
+ static_assert(false);
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vec_is_zero(_TV __a)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp>);
+ if constexpr (sizeof(_TV) <= __x86_max_general_register_size)
+ return __builtin_bit_cast(__integer_from<sizeof(_TV)>, __a) == 0;
+ else if constexpr (_Traits._M_have_avx())
+ {
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+ else
+ static_assert(false);
+ }
+ else if constexpr (_Traits._M_have_sse4_1())
+ {
+ if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+ else
+ static_assert(false);
+ }
+ else
+ return __x86_movmsk(__a) == 0;
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline int
+ __x86_vec_testz(_TV __a, _TV __b)
+ {
+ static_assert(sizeof(_TV) == 16 || sizeof(_TV) == 32);
+ static_assert(_Traits._M_have_sse4_1());
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ else
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline int
+ __x86_vec_testc(_TV __a, _TV __b)
+ {
+ static_assert(sizeof(_TV) == 16 || sizeof(_TV) == 32);
+ static_assert(_Traits._M_have_sse4_1());
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestc256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ else
+ return __builtin_ia32_ptestc128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_all(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ {
+ if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ return __builtin_bit_cast(_Ip, __k) == ~_Ip();
+ }
+ else if constexpr (!_Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return __x86_movmsk(__k) == __valid_bits;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testc(__k, ~_TV());
+ }
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == __valid_bits;
+ }
+ else if constexpr (!_Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) == __valid_bits;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testc(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_any(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ return !__x86_vec_is_zero(__k);
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) != _Ip();
+ }
+ else if constexpr (!_Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) != 0;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_any<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 == __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_none(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ return __x86_vec_is_zero(__k);
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == _Ip();
+ }
+ else if constexpr (!_Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) == 0;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_none<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
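+  // Comparison predicates matching the immediate encoding of the x86 (V)CMPPS/(V)CMPPD and
+  // AVX-512 VCMP/VPCMP instructions; passed as the predicate operand of the
+  // __builtin_ia32_*cmp*_mask builtins below.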
+ enum class _X86Cmp
+ {
+ _Eq = 0,
+ _Lt = 1,
+ _Le = 2,
+ _Unord = 3,
+ _Neq = 4,
+ _Nlt = 5,
+ _Nle = 6,
+ };
+
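+  // Element-wise comparison of __x and __y using predicate _Cmp, returning an AVX-512 bit-mask
+  // (one result bit per element).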
+ template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_floating_point_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_cmp(_TV __x, _TV __y)
+ {
+ constexpr int __c = int(_Cmp);
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd512_mask(__x, __y, __c, -1, 4);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps512_mask(__x, __y, __c, -1, 4);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd128_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps128_mask(__x, __y, __c, -1);
+ else if constexpr (is_same_v<_Tp, _Float16>)
+ {
+ if constexpr (sizeof(_TV) == 64 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph512_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph128_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) < 16 && _Traits._M_have_avx512fp16())
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else
+ {
+ // without AVX512_FP16, float16_t size needs to match float32_t size
+ // (cf. __native_abi())
+ static_assert(sizeof(_TV) <= 32);
+ return __x86_bitmask_cmp<_Cmp>(__vec_cast<float>(__x), __vec_cast<float>(__y));
+ }
+ }
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else
+ static_assert(false);
+ }
+
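+  // Map _Tp to the element type used in the signatures of the x86 builtins: __x86_intrin_int is
+  // plain char for 1-byte types and otherwise the equally sized signed integer; __x86_intrin_type
+  // additionally keeps the canonical vector element type for 4- and 8-byte floating-point types.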
+ template <typename _Tp>
+ using __x86_intrin_int
+ = decltype([] {
+ if constexpr (sizeof(_Tp) == 1)
+ return char();
+ else
+ return __integer_from<sizeof(_Tp)>();
+ }());
+
+ template <typename _Tp>
+ using __x86_intrin_type
+ = decltype([] {
+ if constexpr (is_integral_v<_Tp> || sizeof(_Tp) <= 2)
+ return __x86_intrin_int<_Tp>();
+ else
+ return __canonical_vec_type_t<_Tp>();
+ }());
+
+ template <typename _Tp>
+ using __x86_intel_intrin_value_type
+ = decltype([] {
+ if constexpr (is_integral_v<_Tp>)
+ return 0ll;
+ else if constexpr (sizeof(_Tp) == 8)
+ return 0.;
+ else if constexpr (sizeof(_Tp) == 4)
+ return 0.f;
+ else if constexpr (sizeof(_Tp) == 2)
+ return 0.f16;
+ }());
+
+#if !_GLIBCXX_CLANG
+  // overload __vec_andnot from vec_ops.h
+ template <__vec_builtin _TV>
+ requires (sizeof(_TV) >= 16)
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_andnot(_TV __a, _TV __b)
+ {
+ constexpr _TargetTraits _Traits = {};
+ using _Tp = __vec_value_type<_TV>;
+ using _UV = __vec_builtin_type<_UInt<sizeof(_Tp)>, __width_of<_TV>>;
+ if (__builtin_is_constant_evaluated()
+ || (__builtin_constant_p(__a) && __builtin_constant_p(__b)))
+ return reinterpret_cast<_TV>(~reinterpret_cast<_UV>(__a) & reinterpret_cast<_UV>(__b));
+ else if constexpr (is_same_v<_Tp, _Float16>)
+ return reinterpret_cast<_TV>(__vec_andnot(__vec_bit_cast<float>(__a),
+ __vec_bit_cast<float>(__b)));
+ else if constexpr (sizeof(_TV) == 16 && is_same_v<_Tp, float>)
+ return __builtin_ia32_andnps(__a, __b);
+ else if constexpr (sizeof(_TV) == 16 && is_same_v<_Tp, double>)
+ return __builtin_ia32_andnpd(__a, __b);
+ else if constexpr (sizeof(_TV) == 32 && is_same_v<_Tp, float>)
+ return __builtin_ia32_andnps256(__a, __b);
+ else if constexpr (sizeof(_TV) == 32 && is_same_v<_Tp, double>)
+ return __builtin_ia32_andnpd256(__a, __b);
+ else if constexpr (sizeof(_TV) == 64 && is_same_v<_Tp, float> && _Traits._M_have_avx512dq())
+ return __builtin_ia32_andnps512_mask(__a, __b, _TV{}, -1);
+ else if constexpr (sizeof(_TV) == 64 && is_same_v<_Tp, double> && _Traits._M_have_avx512dq())
+ return __builtin_ia32_andnpd512_mask(__a, __b, _TV{}, -1);
+ else
+ {
+ auto __all = __vec_bit_cast<long long>(__a);
+ auto __bll = __vec_bit_cast<long long>(__b);
+ if constexpr (sizeof(_TV) == 16 && is_integral_v<_Tp>)
+ return reinterpret_cast<_TV>(__builtin_ia32_pandn128(__all, __bll));
+ else if constexpr (sizeof(_TV) == 32 && is_integral_v<_Tp> && _Traits._M_have_avx2())
+ return reinterpret_cast<_TV>(__builtin_ia32_andnotsi256(__all, __bll));
+ else if constexpr (sizeof(_TV) == 32 && is_integral_v<_Tp>)
+ return reinterpret_cast<_TV>(__builtin_ia32_andnpd256(__vec_bit_cast<double>(__a),
+ __vec_bit_cast<double>(__b)));
+ else if constexpr (sizeof(_TV) == 64)
+ {
+ auto __ai = __vec_bit_cast<int>(__a);
+ auto __bi = __vec_bit_cast<int>(__b);
+ return reinterpret_cast<_TV>(
+ __builtin_ia32_pandnd512_mask(__ai, __bi, decltype(__ai)(), -1));
+ }
+ }
+ }
+#endif // not Clang
+
+ template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_integral_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_cmp(_TV __x, _TV __y)
+ {
+ constexpr int __c = int(_Cmp);
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else if constexpr (is_signed_v<_Tp>)
+ {
+ const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+ const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+ if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb128_mask(__xi, __yi, __c, -1);
+ else
+ static_assert(false);
+ }
+ else
+ {
+ const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+ const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+ if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb128_mask(__xi, __yi, __c, -1);
+ else
+ static_assert(false);
+ }
+ }
+
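+  // Returns an AVX-512 bit-mask with bit i set iff __x[i] is +inf or -inf (VFPCLASS with
+  // immediate 0x18 = positive infinity | negative infinity).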
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_isinf(_TV __x)
+ {
+ static_assert(_Traits._M_have_avx512dq());
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_floating_point_v<_Tp>);
+ if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_Tp) == 2 && !_Traits._M_have_avx512fp16())
+ return __x86_bitmask_isinf(__vec_cast<float>(__x));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_isinf(__vec_zero_pad_to_16(__x));
+ else
+ static_assert(false);
+ }
+
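+  // Convert a bit-mask to a vector mask: element i of the result is -1 if bit i of __bits is set
+  // and 0 otherwise (AVX-512 vpmovm2b/w/d/q).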
+ template <__vec_builtin _KV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _KV
+ __x86_bit_to_vecmask(std::integral auto __bits)
+ {
+ using _Kp = __vec_value_type<_KV>;
+ static_assert((sizeof(__bits) * __CHAR_BIT__ == __width_of<_KV>)
+ || (sizeof(__bits) == 1 && __CHAR_BIT__ > __width_of<_KV>));
+
+ if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2b512(__bits);
+ else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2b256(__bits);
+ else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2b128(__bits);
+ else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) <= 8)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2b128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2w512(__bits);
+ else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2w256(__bits);
+ else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2w128(__bits);
+ else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) <= 8)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2w128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2d512(__bits);
+ else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2d256(__bits);
+ else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) <= 16)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2d128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2q512(__bits);
+ else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2q256(__bits);
+ else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2q128(__bits);
+
+ else
+ static_assert(false);
+ }
+
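+  // Blend using an AVX-512 bit-mask: element i of the result is __t[i] where bit i of __k is set
+  // and __f[i] otherwise (vpblendm*/vblendm*).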
+ template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_integral_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ constexpr inline _TV
+ __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _Ip = __x86_intrin_int<_Tp>;
+ if constexpr (!is_same_v<_Ip, _Tp>)
+ return reinterpret_cast<_TV>(__x86_bitmask_blend(__k, __vec_bit_cast<_Ip>(__t),
+ __vec_bit_cast<_Ip>(__f)));
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) < 16)
+ return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+ __vec_zero_pad_to_16(__f)));
+ else
+ static_assert(false);
+ }
+
+ template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_floating_point_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ constexpr inline _TV
+ __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_128_mask (__f, __t, __k);
+ else if constexpr (is_same_v<_Tp, _Float16>)
+ {
+ using _Up = __integer_from<sizeof(_Tp)>;
+ return __vec_bit_cast<_Float16>(__x86_bitmask_blend(__k, __vec_bit_cast<_Up>(__t),
+ __vec_bit_cast<_Up>(__f)));
+ }
+ else if constexpr (sizeof(_TV) < 16)
+ return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+ __vec_zero_pad_to_16(__f)));
+ else
+ static_assert(false);
+ }
+
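+  // Compress the even-numbered bits (positions 0, 2, 4, ...) of __x into the low _OutputBits bits
+  // of the result, e.g. 0b01'00'11 -> 0b101. Uses PEXT when BMI2 is available. This undoes
+  // __duplicate_each_bit (below).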
+ template <int _OutputBits = 4, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<1>
+ __bit_extract_even(_UInt<1> __x)
+ {
+ static_assert(_OutputBits <= 4);
+ constexpr _UInt<1> __mask = 0x55u >> ((4 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x33u;
+ __x |= __x >> 2;
+ __x &= 0x0Fu;
+ return __x;
+ }
+
+ template <int _OutputBits = 8, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<1>
+ __bit_extract_even(_UInt<2> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 8);
+ constexpr _UInt<2> __mask = 0x5555u >> ((8 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333u;
+ __x |= __x >> 2;
+ __x &= 0x0F0Fu;
+ __x |= __x >> 4;
+ return __x;
+ }
+ }
+
+ template <int _OutputBits = 16, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<_OutputBits>
+ __bit_extract_even(_UInt<4> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else if constexpr (_OutputBits <= 8)
+ return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 16);
+ constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333'3333u;
+ __x |= __x >> 2;
+ __x &= 0x0F0F'0F0Fu;
+ __x |= __x >> 4;
+ __x &= 0x00FF'00FFu;
+ __x |= __x >> 8;
+ return __x;
+ }
+ }
+
+ template <int _OutputBits = 32, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<_OutputBits>
+ __bit_extract_even(_UInt<8> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else if constexpr (_OutputBits <= 8)
+ return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+ else if constexpr (_OutputBits <= 16)
+ return __bit_extract_even<_OutputBits>(_UInt<4>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 32);
+ constexpr _UInt<8> __mask = 0x5555'5555'5555'5555ull >> ((32 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ return __builtin_ia32_pext_di(__x, __mask);
+#else
+ return __builtin_ia32_pext_si(__x, static_cast<unsigned>(__mask))
+ | (__builtin_ia32_pext_si(__x >> 32, __mask >> 32) << 16);
+#endif
+ }
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333'3333'3333'3333ull;
+ __x |= __x >> 2;
+ __x &= 0x0F0F'0F0F'0F0F'0F0Full;
+ __x |= __x >> 4;
+ __x &= 0x00FF'00FF'00FF'00FFull;
+ __x |= __x >> 8;
+ __x &= 0x0000'FFFF'0000'FFFFull;
+ __x |= __x >> 16;
+ return __x;
+ }
+ }
+
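+  // Duplicate each bit of __x: bit i of the input appears at bits 2i and 2i+1 of the result,
+  // e.g. 0b101 -> 0b11'00'11. Uses PDEP when BMI2 is available.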
+  // precondition: all bits of the input at positions >= _InputBits must be 0
+ template <int _InputBits = -1, _ArchTraits _Traits = {}>
+ constexpr auto
+ __duplicate_each_bit(unsigned_integral auto __x)
+ {
+ constexpr int __input_bits = _InputBits == -1 ? sizeof(__x) * __CHAR_BIT__ : _InputBits;
+ static_assert(__input_bits >= 1);
+ static_assert(sizeof(__x) * __CHAR_BIT__ >= __input_bits);
+ if constexpr (__input_bits <= 8)
+ {
+ constexpr _UInt<2> __mask = 0x5555u >> ((8 - __input_bits) * 2);
+ if constexpr (__input_bits == 1)
+ return _UInt<1>(__x * 3u);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ else if constexpr (_Traits._M_have_bmi2())
+ return _Bitmask<__input_bits * 2>(3u * __builtin_ia32_pdep_si(__x, __mask));
+#endif
+ else if constexpr (__input_bits == 2) // 0000'00BA
+ return _UInt<1>(((__x + 0b0010u) & 0b0101u) * 3u); // 0B?A -> 0B0A -> BBAA
+ else if constexpr (__input_bits <= 4) // 0000'DCBA
+ {
+ __x = ((__x << 2) | __x ) & 0b0011'0011u; // 00DC'??BA -> 00DC'00BA
+ return _UInt<1>(((__x + 0b0010'0010u) & __mask) * 3u); // -> DDCC'BBAA
+ }
+ else
+ { // HGFE'DCBA
+ _UInt<2> __y = ((__x << 4) | __x) & 0x0F0Fu; // HGFE'0000'DCBA
+ __y |= __y << 2; // 00HG'??FE'00DC'??BA
+ __y &= 0x3333u; // 00HG'00FE'00DC'00BA
+ __y += 0x2222u; // 0H?G'0F?E'0D?C'0B?A
+ return _UInt<2>((__y & __mask) * 3u); // HHGG'FFEE'DDCC'BBAA
+ }
+ }
+ else if constexpr (__input_bits <= 16)
+ {
+ constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return 3u * __builtin_ia32_pdep_si(__x, __mask);
+#endif
+ _UInt<4> __y = ((__x << 8) | __x) & 0x00FF00FFu;
+ __y |= __y << 4;
+ __y &= 0x0F0F'0F0Fu;
+ __y |= __y << 2;
+ __y &= 0x3333'3333u;
+ return ((__y + 0x2222'2222u) & __mask) * 3;
+ }
+ else if constexpr (__input_bits <= 32)
+ {
+ constexpr _UInt<8> __mask = 0x5555'5555'5555'5555u >> ((32 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ if constexpr (_Traits._M_have_bmi2())
+ {
+#if __has_builtin(__builtin_ia32_pdep_di)
+ return 3ull * __builtin_ia32_pdep_di(__x, __mask);
+#else
+ const _UInt<8> __hi = 3 * __builtin_ia32_pdep_si(__x >> 16, __mask >> 32);
+ return (3u * __builtin_ia32_pdep_si(__x, static_cast<unsigned>(__mask))) | __hi << 32;
+#endif
+ }
+#endif
+ _UInt<8> __y = ((__x & 0xFFFF'0000ull) << 16) | (__x & 0x0000'FFFFu);
+ __y |= __y << 8;
+ __y &= 0x00FF'00FF'00FF'00FFull;
+ __y |= __y << 4;
+ __y &= 0x0F0F'0F0F'0F0F'0F0Full;
+ __y |= __y << 2;
+ __y &= 0x3333'3333'3333'3333ull;
+ return ((__y + 0x2222'2222'2222'2222ull) & __mask) * 3;
+ }
+ else
+ return __trivial_pair { __duplicate_each_bit(_UInt<4>(__x)),
+ __duplicate_each_bit<__input_bits - 32>(
+ _Bitmask<__input_bits - 32>(__x >> 32)) };
+ }
+
+ template <int _InputBits = -1, typename _U0, typename _U1>
+ constexpr auto
+ __duplicate_each_bit(const __trivial_pair<_U0, _U1>& __x)
+ {
+ static_assert(_InputBits != -1 || is_unsigned_v<_U1>);
+ constexpr int __input_bits = _InputBits == -1 ? (sizeof(_U0) + sizeof(_U1)) * __CHAR_BIT__
+ : _InputBits;
+ constexpr int __in0 = min(int(sizeof(_U0)) * __CHAR_BIT__, __input_bits);
+ constexpr int __in1 = __input_bits - __in0;
+ if constexpr (__in1 == 0)
+ return __duplicate_each_bit<__in0>(__x._M_first);
+ else
+ return __trivial_pair { __duplicate_each_bit<__in0>(__x._M_first),
+ __duplicate_each_bit<__in1>(__x._M_second) };
+ }
+
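+  // Multiply interleaved complex numbers: __x and __y hold (real, imag) pairs in adjacent
+  // even/odd lanes. Computed as fmaddsub(dup_even(__x), __y, dup_odd(__x) * swap_neighbors(__y)),
+  // which subtracts in the even (real) lanes and adds in the odd (imaginary) lanes, yielding
+  // (ac - bd, ad + bc) for (a, b) * (c, d).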
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_complex_multiplies(_TV __x, _TV __y)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _VO = _VecOps<_TV>;
+
+ static_assert(_Traits._M_have_fma());
+ static_assert(is_floating_point_v<_Tp>);
+
+ if constexpr (!_Traits._M_have_avx512fp16() && sizeof(_Tp) == 2)
+ return __vec_cast<_Tp>(__x86_complex_multiplies(__vec_cast<float>(__x),
+ __vec_cast<float>(__y)));
+ else if constexpr (sizeof(_TV) < 16)
+ return _VO::_S_extract(__x86_complex_multiplies(__vec_zero_pad_to_16(__x),
+ __vec_zero_pad_to_16(__y)));
+
+ else
+ {
+ _TV __x_real = _VO::_S_dup_even(__x);
+ _TV __x_imag = _VO::_S_dup_odd(__x);
+ _TV __y_swapped = _VO::_S_swap_neighbors(__y);
+
+ if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph128_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+ else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph256_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+ else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps256(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd256(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else
+ static_assert(false);
+ }
+ }
+
+ // FIXME: Work around PR121688
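+  // Converts between _Float16 and other floating-point element types via float, using the F16C
+  // vcvtph2ps/vcvtps2ph instructions.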
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ inline _UV
+ __x86_cvt_f16c(_TV __v)
+ {
+ constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
+ constexpr bool __to_f16 = !__from_f16;
+ if constexpr (__to_f16 && !is_same_v<__vec_value_type<_TV>, float>)
+ return __x86_cvt_f16c<_UV>(__vec_cast<float>(__v));
+ else if constexpr (__from_f16 && !is_same_v<__vec_value_type<_UV>, float>)
+ return __vec_cast<_UV>(__x86_cvt_f16c<__vec_builtin_type<float, __width_of<_TV>>>(__v));
+ else if constexpr (__from_f16)
+ {
+ const auto __vi = __vec_bit_cast<__x86_intrin_int<_Float16>>(__v);
+ if constexpr (sizeof(_TV) == 4)
+ return __vec_split_lo(__builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi)));
+ else if constexpr (sizeof(_TV) == 8)
+ return __builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi));
+ else if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_vcvtph2ps256(__vi);
+ else if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_vcvtph2ps512_mask(__vi, __vec_builtin_type<float, 16>(), -1, 4);
+ else if constexpr (sizeof(_TV) >= 64)
+ return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+ __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_TV) == 8)
+ return reinterpret_cast<_UV>(
+ __vec_split_lo(__vec_split_lo(__builtin_ia32_vcvtps2ph(
+ __vec_zero_pad_to_16(__v), 4))));
+ else if constexpr (sizeof(_TV) == 16)
+ return reinterpret_cast<_UV>(__vec_split_lo(__builtin_ia32_vcvtps2ph(__v, 4)));
+ else if constexpr (sizeof(_TV) == 32)
+ return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph256(__v, 4));
+ else if constexpr (sizeof(_TV) == 64)
+ return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph512_mask(
+ __v, 4, __vec_builtin_type<short, 16>(), -1));
+ else if constexpr (sizeof(_TV) >= 128)
+ return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+ __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+ else
+ static_assert(false);
+ }
+
+ /** @internal
+ * AVX instructions typically operate within 128-bit lanes. Horizontal operations (such as the
+ * pack instructions) therefore produce vectors whose two middle 64-bit chunks are swapped
+ * relative to the full-width element order. This function serves as the fix-up step.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_swizzle4x64_acbd(_TV __x)
+ {
+ static_assert(sizeof(_TV) == 32);
+ using _UV = __vec_builtin_type_bytes<long long, 32>;
+ return reinterpret_cast<_TV>(__builtin_shufflevector(reinterpret_cast<_UV>(__x), _UV(),
+ 0, 2, 1, 3));
+ }
+
+ /** @internal
+ * Like __builtin_convertvector but with a precondition that input values are either 0 or -1.
+ */
+ template <__vec_builtin _To, __vec_builtin _From>
+ [[__gnu__::__always_inline__]]
+ inline _To
+ __x86_cvt_vecmask(_From __k)
+ {
+ using _T0 = __vec_value_type<_From>;
+ using _T1 = __vec_value_type<_To>;
+ if constexpr (sizeof(_From) > sizeof(_To) && sizeof(_From) < 16)
+ {
+ using _ToPadded = __vec_builtin_type_bytes<_T1, sizeof(_To) * 16 / sizeof(_From)>;
+ return _VecOps<_To>::_S_extract(__x86_cvt_vecmask<_ToPadded>(__vec_zero_pad_to_16(__k)));
+ }
+ else if constexpr (sizeof(_T0) == 2 && sizeof(_T1) == 1) // -> packsswb
+ {
+ if constexpr (sizeof(__k) == 16)
+ return reinterpret_cast<_To>(__vec_split_lo(__builtin_ia32_packsswb128(__k, __k)));
+ else if constexpr (sizeof(__k) == 32)
+ return reinterpret_cast<_To>(
+ __vec_split_lo(__x86_swizzle4x64_acbd(
+ __builtin_ia32_packsswb256(__k, __k))));
+ else
+ static_assert(false);
+ }
+ else
+ static_assert(false, "TODO");
+ }
+
+ /** @internal
+ * Overload that concatenates @p __k0 and @p __k1 while converting.
+ */
+ template <__vec_builtin _To, __vec_builtin _From>
+ [[__gnu__::__always_inline__]]
+ inline _To
+ __x86_cvt_vecmask(_From __k0, _From __k1)
+ {
+ using _T0 = __vec_value_type<_From>;
+ using _T1 = __vec_value_type<_To>;
+ static_assert(sizeof(_From) >= 16);
+ if constexpr (sizeof(_T0) == 2 && sizeof(_T1) == 1) // -> packsswb
+ {
+ if constexpr (sizeof(__k0) == 16)
+ return reinterpret_cast<_To>(__builtin_ia32_packsswb128(__k0, __k1));
+ else if constexpr (sizeof(__k0) == 32)
+ return reinterpret_cast<_To>(__x86_swizzle4x64_acbd(
+ __builtin_ia32_packsswb256(__k0, __k1)));
+ else
+ static_assert(false);
+ }
+ else
+ static_assert(false, "TODO");
+ }
+
+ /** @internal
+ * AVX512 masked (converting) loads
+ *
+ * @note AVX512VL and AVX512BW are required
+ */
+ template <__vec_builtin _TV, typename _Up, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_masked_load(const _Up* __mem, unsigned_integral auto __k)
+ {
+ static_assert(_Traits._M_have_avx512vl() && _Traits._M_have_avx512bw());
+ using _Tp = __vec_value_type<_TV>;
+ constexpr int __n = __width_of<_TV>;
+ if constexpr (!__converts_trivially<_Up, _Tp>)
+ {
+ const auto __uvec
+ = __x86_masked_load<__vec_builtin_type<__canonical_vec_type_t<_Up>, __n>>(__mem, __k);
+ return __vec_cast<_TV>(__uvec);
+ }
+ else if constexpr (sizeof(_TV) < 16)
+ {
+ return _VecOps<_TV>::_S_extract(
+ __x86_masked_load<__vec_builtin_type_bytes<_Tp, 16>>(__mem, __k));
+ }
+ else if constexpr (sizeof(_TV) > 64)
+ {
+ return __vec_concat(
+ __x86_masked_load<__vec_builtin_type<_Tp, __n / 2>>(__mem, __k),
+ __x86_masked_load<__vec_builtin_type<_Tp, __n / 2>>(__mem + __n / 2, __k >> __n / 2)
+ );
+ }
+ else if constexpr (sizeof(_TV) == 64)
+ {
+ const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
+ const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 64> __z = {};
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ return __builtin_ia32_loadups512_mask(__src, __z, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ return __builtin_ia32_loadupd512_mask(__src, __z, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi512_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 2)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi512_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 4)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi512_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 8)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi512_mask(__src, __z, __k));
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_TV) == 32)
+ {
+ const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
+ const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 32> __z = {};
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ return __builtin_ia32_loadups256_mask(__src, __z, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ return __builtin_ia32_loadupd256_mask(__src, __z, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi256_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 2)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi256_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 4)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi256_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 8)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi256_mask(__src, __z, __k));
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_TV) == 16)
+ {
+ const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
+ const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 16> __z = {};
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ return __builtin_ia32_loadups128_mask(__src, __z, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ return __builtin_ia32_loadupd128_mask(__src, __z, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi128_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 2)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi128_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 4)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi128_mask(__src, __z, __k));
+ else if constexpr (sizeof(_Tp) == 8)
+ return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi128_mask(__src, __z, __k));
+ else
+ static_assert(false);
+ }
+ else
+ static_assert(false);
+ }
+
+ /** @internal
+ * AVX(2) masked loads (only trivial conversions)
+ */
+ template <__vec_builtin _TV, typename _Up, __vec_builtin _KV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_masked_load(const _Up* __mem, const _KV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(_Traits._M_have_avx() && __converts_trivially<_Up, _Tp> && sizeof(_Up) >= 4);
+ constexpr int __n = __width_of<_TV>;
+ using _IV = __vec_builtin_type<__x86_intrin_int<_Tp>, __n>;
+ const auto __vk = reinterpret_cast<_IV>(__k);
+ if constexpr (sizeof(_TV) < 16)
+ return _VecOps<_TV>::_S_extract(__x86_masked_load<__vec_builtin_type_bytes<_Tp, 16>>(
+ __mem, __vec_zero_pad_to_16(__k)));
+ else if constexpr (_Traits._M_have_avx2() && is_integral_v<_Up>)
+ {
+ const auto* __src
+ = reinterpret_cast<const __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
+ if constexpr (sizeof(_Up) == 4 && sizeof(_TV) == 32)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadd256(__src, __vk));
+ else if constexpr (sizeof(_Up) == 4 && sizeof(_TV) == 16)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadd(__src, __vk));
+ else if constexpr (sizeof(_Up) == 8 && sizeof(_TV) == 32)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadq256(__src, __vk));
+ else if constexpr (sizeof(_Up) == 8 && sizeof(_TV) == 16)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadq(__src, __vk));
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_Up) == 4)
+ {
+ const auto* __src = reinterpret_cast<const __vec_builtin_type<float, __n>*>(__mem);
+ if constexpr (sizeof(_TV) == 32)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadps256(__src, __vk));
+ else if constexpr (sizeof(_TV) == 16)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadps(__src, __vk));
+ else
+ static_assert(false);
+ }
+ else
+ {
+ const auto* __src = reinterpret_cast<const __vec_builtin_type<double, __n>*>(__mem);
+ if constexpr (sizeof(_TV) == 32)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadpd256(__src, __vk));
+ else if constexpr (sizeof(_TV) == 16)
+ return reinterpret_cast<_TV>(__builtin_ia32_maskloadpd(__src, __vk));
+ else
+ static_assert(false);
+ }
+ }
+
+ /** @internal
+ * AVX512 masked stores
+ *
+ * @note AVX512VL is required
+ */
+ template <__vec_builtin _TV, typename _Up>
+ [[__gnu__::__always_inline__]]
+ inline void
+ __x86_masked_store(const _TV __v, _Up* __mem, unsigned_integral auto __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ constexpr int __n = __width_of<_TV>;
+ [[maybe_unused]] const auto __w = __vec_bit_cast<__x86_intrin_type<_Tp>>(__v);
+ if constexpr (sizeof(_TV) == 64)
+ {
+ if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
+ {
+ auto* __dst = reinterpret_cast<
+ __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
+ if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_pmovwb512mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovdb512mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovdw512mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovqb512mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovqw512mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
+ __builtin_ia32_pmovqd512mem_mask(__dst, __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (__converts_trivially<_Tp, _Up>)
+ {
+ auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ __builtin_ia32_storeups512_mask(__dst, __w, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ __builtin_ia32_storeupd512_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ __builtin_ia32_storedquqi512_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_storedquhi512_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4)
+ __builtin_ia32_storedqusi512_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8)
+ __builtin_ia32_storedqudi512_mask(__dst, __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_Tp) >= sizeof(_Up))
+ {
+ if constexpr (is_floating_point_v<_Tp> && is_integral_v<_Up>
+ && sizeof(_Tp) > sizeof(_Up))
+ __x86_masked_store(__vec_cast<__integer_from<sizeof(_Tp)>>(__v), __mem, __k);
+ else
+ __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
+ }
+ else
+ {
+ __x86_masked_store(__vec_split_lo(__v), __mem, _Bitmask<__n / 2>(__k));
+ __x86_masked_store(__vec_split_hi(__v), __mem + __n / 2,
+ _Bitmask<__n / 2>(__k >> (__n / 2)));
+ }
+ }
+ else if constexpr (sizeof(_TV) == 32)
+ {
+ if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
+ {
+ auto* __dst = reinterpret_cast<
+ __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
+ if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_pmovwb256mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovdb256mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovdw256mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovqb256mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovqw256mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
+ __builtin_ia32_pmovqd256mem_mask(__dst, __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (__converts_trivially<_Tp, _Up>)
+ {
+ auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ __builtin_ia32_storeups256_mask(__dst, __w, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ __builtin_ia32_storeupd256_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ __builtin_ia32_storedquqi256_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_storedquhi256_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4)
+ __builtin_ia32_storedqusi256_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8)
+ __builtin_ia32_storedqudi256_mask(__dst, __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (2 * sizeof(_Tp) >= sizeof(_Up))
+ {
+ __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
+ }
+ else
+ {
+ __x86_masked_store(__vec_split_lo(__v), __mem, _Bitmask<__n / 2>(__k));
+ __x86_masked_store(__vec_split_hi(__v), __mem + __n / 2,
+ _Bitmask<__n / 2>(__k >> (__n / 2)));
+ }
+ }
+ else if constexpr (sizeof(_TV) == 16)
+ {
+ if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
+ {
+ auto* __dst = reinterpret_cast<
+ __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
+ if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_pmovwb128mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovdb128mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovdw128mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
+ __builtin_ia32_pmovqb128mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
+ __builtin_ia32_pmovqw128mem_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
+ __builtin_ia32_pmovqd128mem_mask(reinterpret_cast<unsigned long long*>(__mem),
+ __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (__converts_trivially<_Tp, _Up>)
+ {
+ auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
+ if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
+ __builtin_ia32_storeups128_mask(__dst, __w, __k);
+ else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
+ __builtin_ia32_storeupd128_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 1)
+ __builtin_ia32_storedquqi128_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 2)
+ __builtin_ia32_storedquhi128_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 4)
+ __builtin_ia32_storedqusi128_mask(__dst, __w, __k);
+ else if constexpr (sizeof(_Tp) == 8)
+ __builtin_ia32_storedqudi128_mask(__dst, __w, __k);
+ else
+ static_assert(false);
+ }
+ else if constexpr (4 * sizeof(_Tp) >= sizeof(_Up))
+ {
+ __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
+ }
+ else
+ {
+ __x86_masked_store(__vec_cast<_Up>(__vec_split_lo(__v)), __mem,
+ _Bitmask<__n / 2>(__k));
+ __x86_masked_store(__vec_cast<_Up>(__vec_split_hi(__v)), __mem + __n / 2,
+ _Bitmask<__n / 2>(__k >> (__n / 2)));
+ }
+ }
+ else
+ __x86_masked_store(__vec_zero_pad_to_16(__v), __mem, __k);
+ }
+
+ /** @internal
+ * AVX(2) masked stores
+ */
+ template <__vec_builtin _TV, typename _Up, __vec_builtin _KV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline void
+ __x86_masked_store(const _TV __v, _Up* __mem, const _KV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ constexpr int __n = __width_of<_TV>;
+ static_assert(sizeof(_Tp) == 4 || sizeof(_Tp) == 8);
+ auto* __dst = reinterpret_cast<
+ __vec_builtin_type<__x86_intrin_type<_Up>, __n>*>(__mem);
+ [[maybe_unused]] const auto __w = __vec_bit_cast<__x86_intrin_type<_Tp>>(__v);
+ if constexpr (sizeof(_TV) < 16)
+ __x86_masked_store(__vec_zero_pad_to_16(__v), __mem, __vec_zero_pad_to_16(__k));
+ else if constexpr (_Traits._M_have_avx2() && is_integral_v<_Tp>)
+ {
+ if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ __builtin_ia32_maskstored256(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ __builtin_ia32_maskstored(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ __builtin_ia32_maskstoreq256(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ __builtin_ia32_maskstoreq(__dst, __k, __w);
+ else
+ static_assert(false);
+ }
+ else
+ {
+ if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
+ __builtin_ia32_maskstoreps256(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
+ __builtin_ia32_maskstoreps(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
+ __builtin_ia32_maskstorepd256(__dst, __k, __w);
+ else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
+ __builtin_ia32_maskstorepd(__dst, __k, __w);
+ else
+ static_assert(false);
+ }
+ }
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_X86_H
--- /dev/null
+// Implementation of <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_VEC_OPS_H
+#define _GLIBCXX_VEC_OPS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_details.h"
+
+#include <bit>
+#include <bits/utility.h>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+namespace simd
+{
+ template <std::signed_integral _Tp>
+ constexpr bool
+ __signed_has_single_bit(_Tp __x)
+ { return __has_single_bit(make_unsigned_t<_Tp>(__x)); }
+
+ /**
+ * Alias for a vector builtin with given value type and total sizeof.
+ */
+ template <__vectorizable _Tp, size_t _Bytes>
+ requires (__has_single_bit(_Bytes))
+ using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
+
+ /**
+ * Alias for a vector builtin with given value type @p _Tp and @p _Width.
+ */
+ template <__vectorizable _Tp, __simd_size_type _Width>
+ requires (__signed_has_single_bit(_Width))
+ using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>;
+
+ /**
+ * Constrain to any vector builtin with given value type and optional width.
+ */
+ template <typename _Tp, typename _ValueType,
+ __simd_size_type _Width = sizeof(_Tp) / sizeof(_ValueType)>
+ concept __vec_builtin_of
+ = !is_class_v<_Tp> && !is_pointer_v<_Tp> && !is_arithmetic_v<_Tp>
+ && __vectorizable<_ValueType>
+ && _Width >= 1 && sizeof(_Tp) / sizeof(_ValueType) == _Width
+ && same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp>
+ && requires(_Tp& __v, _ValueType __x) { __v[0] = __x; };
+
+ /**
+ * Constrain to any vector builtin.
+ */
+ template <typename _Tp>
+ concept __vec_builtin
+ = __vec_builtin_of<_Tp, remove_cvref_t<decltype(declval<const _Tp>()[0])>>;
+
+ /**
+ * Alias for the value type of the given __vec_builtin type @p _Tp.
+ */
+ template <__vec_builtin _Tp>
+ using __vec_value_type = remove_cvref_t<decltype(declval<const _Tp>()[0])>;
+
+ /**
+ * The width (number of value_type elements) of the given vector builtin or arithmetic type.
+ */
+ template <typename _Tp>
+ inline constexpr __simd_size_type __width_of = 1;
+
+ template <typename _Tp>
+ requires __vec_builtin<_Tp>
+ inline constexpr __simd_size_type __width_of<_Tp> = sizeof(_Tp) / sizeof(__vec_value_type<_Tp>);
+
+ /**
+ * Alias for a vector builtin with equal value type and new width @p _Np.
+ */
+ template <__simd_size_type _Np, __vec_builtin _TV>
+ using __resize_vec_builtin_t = __vec_builtin_type<__vec_value_type<_TV>, _Np>;
+
+ template <__vec_builtin _TV>
+ requires (__width_of<_TV> > 1)
+ using __half_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> / 2, _TV>;
+
+ template <__vec_builtin _TV>
+ using __double_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> * 2, _TV>;
+
+ template <typename _Up, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type_bytes<_Up, sizeof(_TV)>
+ __vec_bit_cast(_TV __v)
+ { return reinterpret_cast<__vec_builtin_type_bytes<_Up, sizeof(_TV)>>(__v); }
+
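+  /** @internal
+   * Vector mask constant with the first @p _Np elements set to -1 and the remaining elements
+   * set to 0.
+   */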
+ template <int _Np, __vec_builtin _TV>
+ requires signed_integral<__vec_value_type<_TV>>
+ static constexpr _TV _S_vec_implicit_mask = []<int... _Is> (integer_sequence<int, _Is...>) {
+ return _TV{ (_Is < _Np ? -1 : 0)... };
+ } (make_integer_sequence<int, __width_of<_TV>>());
+
+ /**
+ * Helper function to work around Clang not allowing v[i] in constant expressions.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_value_type<_TV>
+ __vec_get(_TV __v, int __i)
+ {
+#ifdef _GLIBCXX_CLANG
+ if consteval
+ {
+ return __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v)[__i];
+ }
+ else
+#endif
+ {
+ return __v[__i];
+ }
+ }
+
+ /**
+ * Helper function to work around Clang and GCC not allowing assignment to v[i] in constant
+ * expressions.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ __vec_set(_TV& __v, int __i, __vec_value_type<_TV> __x)
+ {
+ if consteval
+ {
+#ifdef _GLIBCXX_CLANG
+ auto __arr = __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v);
+ __arr[__i] = __x;
+ __v = __builtin_bit_cast(_TV, __arr);
+#else
+ constexpr auto [...__j] = _IotaArray<__width_of<_TV>>;
+ __v = _TV{(__i == __j ? __x : __v[__j])...};
+#endif
+ }
+ else
+ {
+ __v[__i] = __x;
+ }
+ }
+
+ /** @internal
+ * Return a vector builtin containing all elements of @p __a followed by all elements of @p __b.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV>, __width_of<_TV> * 2>
+ __vec_concat(_TV __a, _TV __b)
+ {
+ constexpr auto [...__is] = _IotaArray<__width_of<_TV> * 2>;
+ return __builtin_shufflevector(__a, __b, __is...);
+ }
+
+ /** @internal
+ * Concatenate the first @p _N0 elements of @p __a, the first @p _N1 elements of @p __b, and the
+ * elements obtained by applying this function recursively to @p __rest.
+ *
+ * @pre _N0 <= __width_of<_TV0> && _N1 <= __width_of<_TV1> && _Ns <= __width_of<_TVs> && ...
+ *
+ * Strategy: Aim for a power-of-2 tree concat. E.g.
+ * - cat(2, 2, 2, 2) -> cat(4, 2, 2) -> cat(4, 4)
+ * - cat(2, 2, 2, 2, 8) -> cat(4, 2, 2, 8) -> cat(4, 4, 8) -> cat(8, 8)
+ */
+ template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
+ __vec_builtin... _TVs>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV0>,
+ __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
+ __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest);
+
+ template <int _N0, int _N1, int _N2, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
+ __vec_builtin _TV2, __vec_builtin... _TVs>
+ requires (__has_single_bit(unsigned(_N0))) && (_N0 >= (_N1 + _N2))
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV0>,
+ __bit_ceil(unsigned(_N0 + _N1 + (_N2 + ... + _Ns)))>
+ __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TV2& __c, const _TVs&... __rest)
+ {
+ return __vec_concat_sized<_N0, _N1 + _N2, _Ns...>(
+ __a, __vec_concat_sized<_N1, _N2>(__b, __c), __rest...);
+ }
+
+ template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
+ __vec_builtin... _TVs>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV0>,
+ __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
+ __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest)
+ {
+ // __is is rounded up because we need to generate a power-of-2 vector:
+ constexpr auto [...__is] = _IotaArray<__bit_ceil(unsigned(_N0 + _N1)), int>;
+ const auto __ab = __builtin_shufflevector(__a, __b, [](int __i) consteval {
+ if (__i < _N0) // copy from __a
+ return __i;
+ else if (__i < _N0 + _N1) // copy from __b
+ return __i - _N0 + __width_of<_TV0>; // _N0 <= __width_of<_TV0>
+ else // can't index into __rest
+ return -1; // don't care
+ }(__is)...);
+ if constexpr (sizeof...(__rest) == 0)
+ return __ab;
+ else
+ return __vec_concat_sized<_N0 + _N1, _Ns...>(__ab, __rest...);
+ }
+
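+  /** @internal
+   * Return the lower half of @p __v (its first __width_of<_TV> / 2 elements).
+   */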
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __half_vec_builtin_t<_TV>
+ __vec_split_lo(_TV __v)
+ {
+ constexpr int __n = __width_of<_TV> / 2;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__v, __v, __is...);
+ }
+
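+  /** @internal
+   * Return the upper half of @p __v (its last __width_of<_TV> / 2 elements).
+   */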
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __half_vec_builtin_t<_TV>
+ __vec_split_hi(_TV __v)
+ {
+ constexpr int __n = __width_of<_TV> / 2;
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__v, __v, (__n + __is)...);
+ }
+
+ /** @internal
+ * Return @p __x zero-padded to @p _Bytes bytes.
+ *
+ * Use this function when you need two objects of the same size (e.g. for __vec_concat).
+ */
+ template <size_t _Bytes, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ __vec_zero_pad_to(_TV __x)
+ {
+ if constexpr (sizeof(_TV) == _Bytes)
+ return __x;
+ else if constexpr (sizeof(_TV) <= sizeof(0ull))
+ {
+ using _Up = _UInt<sizeof(_TV)>;
+ __vec_builtin_type_bytes<_Up, _Bytes> __tmp = {__builtin_bit_cast(_Up, __x)};
+ return __builtin_bit_cast(__vec_builtin_type_bytes<__vec_value_type<_TV>, _Bytes>, __tmp);
+ }
+ else if constexpr (sizeof(_TV) < _Bytes)
+ return __vec_zero_pad_to<_Bytes>(__vec_concat(__x, _TV()));
+ else
+ static_assert(false);
+ }
+
+ /** @internal
+ * Return @p __x zero-padded to 16 bytes. The input must be smaller than 16 bytes.
+ *
+ * Use this function instead of the above when you need to pad an argument for a SIMD builtin.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ __vec_zero_pad_to_16(_TV __x)
+ {
+ static_assert(sizeof(_TV) < 16);
+ return __vec_zero_pad_to<16>(__x);
+ }
+
+ // work around __builtin_constant_p returning false unless passed a variable
+ // (__builtin_constant_p(x[0]) is false while __is_const_known(x[0]) is true)
+ template <typename _Tp>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_const_known(const _Tp& __x)
+ {
+ return __builtin_constant_p(__x);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_const_known(const auto&... __xs) requires(sizeof...(__xs) >= 2)
+ {
+ if consteval
+ {
+ return true;
+ }
+ else
+ {
+ return (__is_const_known(__xs) && ...);
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_const_known_equal_to(const auto& __x, const auto& __expect)
+ { return __is_const_known(__x == __expect) && __x == __expect; }
+
+#if _GLIBCXX_X86
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ inline _UV
+ __x86_cvt_f16c(_TV __v);
+#endif
+
+
+ /** @internal
+ * Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
+ *
+ * Works around GCC failing to use the F16C/AVX512F cvtps2ph/cvtph2ps instructions.
+ */
+ template <__vec_builtin _UV, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr _UV
+ __vec_cast(_TV __v)
+ {
+ static_assert(__width_of<_UV> == __width_of<_TV>);
+#if _GLIBCXX_X86
+ using _Up = __vec_value_type<_UV>;
+ using _Tp = __vec_value_type<_TV>;
+ constexpr bool __to_f16 = is_same_v<_Up, _Float16>;
+ constexpr bool __from_f16 = is_same_v<_Tp, _Float16>;
+ constexpr bool __needs_f16c = _Traits._M_have_f16c() && !_Traits._M_have_avx512fp16()
+ && (__to_f16 || __from_f16);
+ if (__needs_f16c && !__is_const_known(__v))
+ { // Work around PR121688
+ if constexpr (__needs_f16c)
+ return __x86_cvt_f16c<_UV>(__v);
+ }
+ if constexpr (is_floating_point_v<_Tp> && is_integral_v<_Up>
+ && sizeof(_UV) < sizeof(_TV) && sizeof(_Up) < sizeof(int))
+ {
+ using _Ip = __integer_from<std::min(sizeof(int), sizeof(_Tp))>;
+ using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
+ return __vec_cast<_UV>(__vec_cast<_IV>(__v));
+ }
+#endif
+ return __builtin_convertvector(__v, _UV);
+ }
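+
+  // e.g. __vec_cast<__vec_builtin_type<int, 4>>(__x) converts a vector of four floats __x into
+  // a vector of four ints, element-wise (like four static_casts to int).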
+
+ /** @internal
+   * Overload of the above cast function that determines the destination vector type from a given
+   * element type @p _Up and the `__width_of` of the argument type.
+ *
+ * Calls the above overload.
+ */
+ template <__vectorizable _Up, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<_Up, __width_of<_TV>>
+ __vec_cast(_TV __v)
+ { return __vec_cast<__vec_builtin_type<_Up, __width_of<_TV>>>(__v); }
+
+ /** @internal
+   * As above, but with an additional precondition on the possible values of the argument.
+ *
+ * Precondition: __k[i] is either 0 or -1 for all i.
+ */
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _UV
+ __vec_mask_cast(_TV __k)
+ {
+ static_assert(signed_integral<__vec_value_type<_UV>>);
+ static_assert(signed_integral<__vec_value_type<_TV>>);
+      // TODO: __builtin_convertvector cannot be optimal here because it doesn't take into
+      // account that both input and output can only be 0 or -1.
+ return __builtin_convertvector(__k, _UV);
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_xor(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) ^ __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a ^ __b;
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_or(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) | __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a | __b;
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_and(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a & __b;
+ }
+
+ /** @internal
+   * Returns the bit-wise AND of the complement of @p __a with @p __b.
+ *
+ * Use __vec_and(__vec_not(__a), __b) unless an andnot instruction is necessary for optimization.
+ *
+ * @see __vec_andnot in simd_x86.h
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_andnot(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, ~__builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_not(_TV __a)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _UV = __vec_builtin_type_bytes<__integer_from<sizeof(_Tp)>, sizeof(_TV)>;
+ if constexpr (is_floating_point_v<__vec_value_type<_TV>>)
+ return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_UV, __a));
+ else
+ return ~__a;
+ }
+
+ /**
+   * An object of the given type in which only the sign bits are set.
+ */
+ template <__vec_builtin _V>
+ requires std::floating_point<__vec_value_type<_V>>
+ constexpr _V _S_signmask = __vec_xor(_V() + 1, _V() - 1);
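+
+  // e.g. for a vector of float, every element of _S_signmask has only the sign bit set,
+  // i.e. the bit pattern of -0.f.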
+
+ template <__vec_builtin _TV, int _Np = __width_of<_TV>,
+ typename = make_integer_sequence<int, _Np>>
+ struct _VecOps;
+
+ template <__vec_builtin _TV, int _Np, int... _Is>
+ struct _VecOps<_TV, _Np, integer_sequence<int, _Is...>>
+ {
+ static_assert(_Np <= __width_of<_TV>);
+
+ using _Tp = __vec_value_type<_TV>;
+
+ using _HV = __half_vec_builtin_t<__conditional_t<_Np >= 2, _TV, __double_vec_builtin_t<_TV>>>;
+
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_broadcast_to_even(_Tp __init)
+ { return _TV {((_Is & 1) == 0 ? __init : _Tp())...}; }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_broadcast_to_odd(_Tp __init)
+ { return _TV {((_Is & 1) == 1 ? __init : _Tp())...}; }
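+
+    // e.g. _S_broadcast_to_even(__init) yields {__init, 0, __init, 0, ...} and
+    // _S_broadcast_to_odd(__init) yields {0, __init, 0, __init, ...}.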
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_all_of(_TV __k) noexcept
+ { return (... && (__k[_Is] != 0)); }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_any_of(_TV __k) noexcept
+ { return (... || (__k[_Is] != 0)); }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_none_of(_TV __k) noexcept
+ { return (... && (__k[_Is] == 0)); }
+
+ template <typename _Offset = integral_constant<int, 0>>
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_extract(__vec_builtin auto __x, _Offset = {})
+ {
+ static_assert(is_same_v<__vec_value_type<_TV>, __vec_value_type<decltype(__x)>>);
+ return __builtin_shufflevector(__x, decltype(__x)(), (_Is + _Offset::value)...);
+ }
+
+ // swap neighboring elements
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_swap_neighbors(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is ^ 1)...); }
+
+ // duplicate even indexed elements, dropping the odd ones
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_dup_even(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is & ~1)...); }
+
+ // duplicate odd indexed elements, dropping the even ones
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_dup_odd(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is | 1)...); }
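+
+    // e.g. for {a, b, c, d}: _S_swap_neighbors -> {b, a, d, c},
+    // _S_dup_even -> {a, a, c, c}, _S_dup_odd -> {b, b, d, d}.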
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_even_elements(_TV& __x, _HV __y) requires (_Np > 1)
+ {
+ constexpr __simd_size_type __n = __width_of<_TV>;
+ __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+ __vec_concat(__y, __y),
+#else
+ __y,
+#endif
+ ((_Is & 1) == 0 ? __n + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_even_elements(_TV& __xl, _TV& __xh, _TV __y)
+ {
+ constexpr __simd_size_type __nl = __width_of<_TV>;
+ constexpr __simd_size_type __nh = __nl * 3 / 2;
+ __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 0 ? __nl + _Is / 2 : _Is)...);
+ __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 0 ? __nh + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_odd_elements(_TV& __x, _HV __y) requires (_Np > 1)
+ {
+ constexpr __simd_size_type __n = __width_of<_TV>;
+ __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+ __vec_concat(__y, __y),
+#else
+ __y,
+#endif
+ ((_Is & 1) == 1 ? __n + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_odd_elements(_TV& __xl, _TV& __xh, _TV __y)
+ {
+ constexpr __simd_size_type __nl = __width_of<_TV>;
+ constexpr __simd_size_type __nh = __nl * 3 / 2;
+ __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 1 ? __nl + _Is / 2 : _Is)...);
+ __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 1 ? __nh + _Is / 2 : _Is)...);
+ }
+
+    // true if all elements are known to be equal to __ref at compile time
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_is_const_known_equal_to(_TV __x, _Tp __ref)
+ { return (__is_const_known_equal_to(__x[_Is], __ref) && ...); }
+
+ };
+} // namespace simd
+_GLIBCXX_END_NAMESPACE_VERSION
+} // namespace std
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_VEC_OPS_H
};
};
+ftms = {
+ name = simd;
+ values = {
+ no_stdname = true; // TODO: change once complete
+ v = 202506;
+ cxxmin = 26;
+ extra_cond = "__cpp_structured_bindings >= 202411L "
+ "&& __cpp_expansion_statements >= 202411L "
+ "&& (__x86_64__ || __i386__)"; // TODO: lift initial restriction to x86
+ hosted = yes;
+ };
+};
+
// Standard test specifications.
stds[97] = ">= 199711L";
stds[03] = ">= 199711L";
#endif /* !defined(__cpp_lib_contracts) */
#undef __glibcxx_want_contracts
+#if !defined(__cpp_lib_simd)
+# if (__cplusplus > 202302L) && _GLIBCXX_HOSTED && (__cpp_structured_bindings >= 202411L && __cpp_expansion_statements >= 202411L && (__x86_64__ || __i386__))
+# define __glibcxx_simd 202506L
+# if defined(__glibcxx_want_all) || defined(__glibcxx_want_simd)
+# endif
+# endif
+#endif /* !defined(__cpp_lib_simd) */
+#undef __glibcxx_want_simd
+
#undef __glibcxx_want_all
--- /dev/null
+// <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+/** @file simd
+ * This is a Standard C++ Library header.
+ */
+
+#ifndef _GLIBCXX_SIMD
+#define _GLIBCXX_SIMD 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#define __glibcxx_want_simd
+#include <bits/version.h>
+
+#ifdef __glibcxx_simd
+
+#include "bits/simd_vec.h"
+#include "bits/simd_loadstore.h"
+#include "bits/simd_mask_reductions.h"
+#include "bits/simd_reductions.h"
+#include "bits/simd_alg.h"
+
+#endif
+#endif
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+static constexpr bool is_iec559 =
+#ifdef __GCC_IEC_559
+ __GCC_IEC_559 >= 2;
+#elif defined __STDC_IEC_559__
+ __STDC_IEC_559__ == 1;
+#else
+ false;
+#endif
+
+#if VIR_NEXT_PATCH
+template <typename V>
+ requires complex_like<typename V::value_type>
+ struct Tests<V>
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+ using Real = typename T::value_type;
+ using RealV = simd::rebind_t<Real, V>;
+
+ static_assert(std::is_floating_point_v<Real>);
+
+ static constexpr T min = std::numeric_limits<Real>::lowest();
+ static constexpr T norm_min = std::numeric_limits<Real>::min();
+ static constexpr T denorm_min = std::numeric_limits<Real>::denorm_min();
+ static constexpr T max = std::numeric_limits<Real>::max();
+ static constexpr T inf = std::numeric_limits<Real>::infinity();
+
+ ADD_TEST(plus_minus) {
+ std::tuple {V(), init_vec<V, C(1, 1), C(2, 2), C(3, 3)>},
+ [](auto& t, V x, V y) {
+ t.verify_equal(x + x, x);
+ t.verify_equal(x - x, x);
+ t.verify_equal(x + y, y);
+ t.verify_equal(y + x, y);
+ t.verify_equal(x - y, -y);
+ t.verify_equal(y - x, y);
+ t.verify_equal(x += T(1, -2), T(1, -2));
+ t.verify_equal(x = x + x, T(2, -4));
+ t.verify_equal(x = x - y, init_vec<V, C(1, -5), C(0, -6), C(-1, -7)>);
+ t.verify_equal(x, init_vec<V, C(1, -5), C(0, -6), C(-1, -7)>);
+ }
+ };
+
+    // Complex multiplication & division have an edge case due to '-0. - -0.'. If we interpret
+    // negative zero as representing a value between -denorm_min and 0 (exclusive), then we
+    // cannot know whether the resulting zero is negative or positive. ISO/IEC 60559 simply
+    // defines the result to be positive zero, but that throws away half of the truth.
+ //
+ // Consider (https://compiler-explorer.com/z/61cYhrE48):
+ // sqrt(x * complex{1.}) -> {0, +/-1}.
+ // The sign of the imaginary part depends on whether x is double{-1} or complex{-1.}. This is
+ // due to the type of the operand influencing the formula used for multiplication:
+ //
+ // 1. 'x * (u+iv)' is implemented as 'xu + i(xv)'
+ //
+ // 2. '(x+iy) * (u+iv)' is implemented as '(xu-yv) + i(xv+yu)'
+ //
+ // 'xv' is equal to -0 and 'yu' is equal to +0. Consequently the imaginary part in (1.) is -0
+ // and in (2.) it is (-0 + 0) which is +0. The example above then uses that difference to hit
+ // the branch cut on sqrt.
+
+ // (x+iy)(u+iv) = (xu-yv)+i(xv+yu)
+ // depending on FMA contraction or FLT_EVAL_METHOD 'inf - inf' can be 0, inf, -inf, or NaN (no
+ // contraction).
+ //
+ // Because of all these issues, verify_equal is implemented to interpret "an infinity" as equal
+ // to another infinity according to the interpretation of C23 Annex G.3.
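+    //
+    // Worked example with x = -1. and (u+iv) = (1. + i0.), following the two formulas above:
+    //   formula (1.): xu + i(xv)         = -1 + i(-0)  -> sqrt gives 0 - i1
+    //   formula (2.): (xu-yv) + i(xv+yu) = -1 + i(+0)  -> sqrt gives 0 + i1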
+
+ ADD_TEST(multiplication_corner_cases) {
+ std::array {min, norm_min, denorm_min, max, inf},
+ [](auto& t, V x) {
+ t.verify_equal(x * x, x[0] * x[0]);
+ const V y = x * T(1, 1);
+ t.verify_equal(y * y, y[0] * y[0])(y);
+ x *= T(0, 1);
+ t.verify_equal(x * x, x[0] * x[0]);
+ x *= T(1, 1);
+ t.verify_equal(x * x, x[0] * x[0])(x);
+ x *= T(1, Real(.5));
+ t.verify_equal(x * x, x[0] * x[0])(x);
+ }
+ };
+
+ ADD_TEST(multiplication) {
+ std::tuple {V(), V(RealV(1), RealV()), V(RealV(), RealV(1)), init_vec<V, C(0, 2), C(2, 0), C(-1, 2)>},
+ [](auto& t, V x, V one, V I, V z) {
+ t.verify_equal(x * x, x);
+ t.verify_equal(x * z, x);
+ t.verify_equal(z * x, x);
+ t.verify_equal(one * one, one);
+ t.verify_equal(one * z, z);
+ t.verify_equal(z * one, z);
+
+ // Notes:
+ // inf + -inf -> NaN
+ // 0. + -0. -> 0. (this is arbitrary, why not NaN: indeterminable sign?)
+ // complex(0.) * -complex(2., 2.) -> (0, -0)
+ // 0. * -complex(2., 2.) -> (-0, -0)
+ // => the *type* of the operand determines the sign of the zero, which is *impossible*
+ // to implement with vec<complex>!
+ // complex(DBL_MAX, DBL_MAX) * complex(2., 2.) -> (-nan, inf) => θ got lost
+ // complex(1.) / complex(0., 0.) -> (inf, -nan) => θ got lost
+ // complex(1.) / complex(-0., 0.) -> (inf, -nan) => θ got lost
+ // complex(1.) / complex(0., -0.) -> (inf, -nan) => θ got lost
+ // complex(1.) / complex(-DBL_INF, 0.) -> (-0, -0) => θ is wrong
+
+ t.verify_bit_equal(one * I, I);
+
+ // (0+i0) * (-0-i0) -> (-0 + 0) + i(-0 + -0) -> 0-i0
+ t.verify_bit_equal(x * -x, T() * -T());
+ t.verify_bit_equal(-x * x, -T() * T());
+
+ t.verify_bit_equal(x * conj(x), T() * conj(T()));
+ t.verify_bit_equal(x * -conj(x), T() * -conj(T()));
+
+ // real * complex has extra overloads on complex but not on vec<complex>
+ // for vec<complex> the result therefore needs to be "bit equal" only to
+ // complex * complex
+ t.verify_equal(x.real() * -x, T().real() * -T());
+ t.verify_bit_equal(x.real() * -x, T() * -T());
+
+ t.verify_bit_equal(I * one, I);
+ t.verify_bit_equal(I * I, T(-1, 0));
+ t.verify_bit_equal(z * I, init_vec<V, C(-2, 0), C(0., 2.), C(-2, -1)>);
+ t.verify_bit_equal(std::complex{-0., 0.} * std::complex{0., 1.}, std::complex{-0., 0.});
+ t.verify_bit_equal(std::complex{-0., -1.} * std::complex{0., 0.}, std::complex{0., -0.});
+ t.verify_bit_equal(0. + -0., 0.);
+ }
+ };
+ };
+#endif
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static constexpr T min = std::numeric_limits<T>::lowest();
+ static constexpr T norm_min = std::numeric_limits<T>::min();
+ static constexpr T max = std::numeric_limits<T>::max();
+
+ ADD_TEST(plus0, requires(T x) { x + x; }) {
+ std::tuple{V(), init_vec<V, 1, 2, 3, 4, 5, 6, 7>},
+ [](auto& t, V x, V y) {
+ t.verify_equal(x + x, x);
+ t.verify_equal(x = x + T(1), T(1));
+ t.verify_equal(x + x, T(2));
+ t.verify_equal(x = x + y, init_vec<V, 2, 3, 4, 5, 6, 7, 8>);
+ t.verify_equal(x = x + -y, T(1));
+ t.verify_equal(x += y, init_vec<V, 2, 3, 4, 5, 6, 7, 8>);
+ t.verify_equal(x, init_vec<V, 2, 3, 4, 5, 6, 7, 8>);
+ t.verify_equal(x += -y, T(1));
+ t.verify_equal(x, T(1));
+ }
+ };
+
+ ADD_TEST(plus1, requires(T x) { x + x; }) {
+ std::tuple{test_iota<V>},
+ [](auto& t, V x) {
+ t.verify_equal(x + std::cw<0>, x);
+ t.verify_equal(std::cw<0> + x, x);
+ t.verify_equal(x + T(), x);
+ t.verify_equal(T() + x, x);
+ t.verify_equal(x + -x, V());
+ t.verify_equal(-x + x, V());
+ }
+ };
+
+ ADD_TEST(minus0, requires(T x) { x - x; }) {
+ std::tuple{T(1), T(0), init_vec<V, 1, 2, 3, 4, 5, 6, 7>},
+ [](auto& t, V x, V y, V z) {
+ t.verify_equal(x - y, x);
+ t.verify_equal(x - T(1), y);
+ t.verify_equal(y, x - T(1));
+ t.verify_equal(x - x, y);
+ t.verify_equal(x = z - x, init_vec<V, 0, 1, 2, 3, 4, 5, 6>);
+ t.verify_equal(x = z - x, V(1));
+ t.verify_equal(z -= x, init_vec<V, 0, 1, 2, 3, 4, 5, 6>);
+ t.verify_equal(z, init_vec<V, 0, 1, 2, 3, 4, 5, 6>);
+ t.verify_equal(z -= z, V(0));
+ t.verify_equal(z, V(0));
+ }
+ };
+
+ ADD_TEST(minus1, requires(T x) { x - x; }) {
+ std::tuple{test_iota<V>},
+ [](auto& t, V x) {
+ t.verify_equal(x - x, V());
+ t.verify_equal(x - std::cw<0>, x);
+ t.verify_equal(std::cw<0> - x, -x);
+ t.verify_equal(x - T(), x);
+ t.verify_equal(T() - x, -x);
+ }
+ };
+
+ ADD_TEST(times0, requires(T x) { x * x; }) {
+ std::tuple{T(0), T(1), T(2)},
+ [](auto& t, T v0, T v1, T v2) {
+ V x = v1;
+ V y = v0;
+ t.verify_equal(x * y, y);
+ t.verify_equal(x = x * T(2), T(2));
+ t.verify_equal(x * x, T(4));
+ y = init_vec<V, 1, 2, 3, 4, 5, 6, 7>;
+ t.verify_equal(x = x * y, init_vec<V, 2, 4, 6, 8, 10, 12, 14>);
+ y = v2;
+ // don't test norm_min/2*2 in the following. There's no guarantee, in
+ // general, that the result isn't flushed to zero (e.g. NEON without
+ // subnormals)
+ for (T n : {T(max - T(1)), std::is_floating_point_v<T> ? T(norm_min * T(3)) : min})
+ {
+ x = T(n / 2);
+ t.verify_equal(x * y, V(n));
+ }
+ if (std::is_integral<T>::value && std::is_unsigned<T>::value)
+ {
+            // test modulo arithmetic
+ T n = max;
+ x = n;
+ for (T m : {T(2), T(7), T(max / 127), max})
+ {
+ y = m;
+              // if T is of lower rank than int, the operands of `n * m` are promoted
+              // to int before the multiplication is executed. An overflow is then UB
+              // (and ubsan warns about it). The solution is to cast to unsigned in
+              // that case.
+ using U
+ = std::conditional_t<(sizeof(T) < sizeof(int)), unsigned, T>;
+ t.verify_equal(x * y, V(T(U(n) * U(m))));
+ }
+ }
+ x = v2;
+ t.verify_equal(x *= init_vec<V, 1, 2, 3>, init_vec<V, 2, 4, 6>);
+ t.verify_equal(x, init_vec<V, 2, 4, 6>);
+ }
+ };
+
+ ADD_TEST(times1, requires(T x) { x * x; }) {
+ std::tuple{test_iota<V, 0, 11>},
+ [](auto& t, V x) {
+ t.verify_equal(x * x, V([](int i) { return T(T(i % 12) * T(i % 12)); }));
+ t.verify_equal(x * std::cw<1>, x);
+ t.verify_equal(std::cw<1> * x, x);
+ t.verify_equal(x * T(1), x);
+ t.verify_equal(T(1) * x, x);
+ t.verify_equal(x * T(-1), -x);
+ t.verify_equal(T(-1) * x, -x);
+ }
+ };
+
+ // avoid testing subnormals and expect minor deltas for non-IEC559 float
+ ADD_TEST(divide0, std::is_floating_point_v<T> && !is_iec559) {
+ std::tuple{T(2), init_vec<V, 1, 2, 3, 4, 5, 6, 7>},
+ [](auto& t, V x, V y) {
+ t.verify_equal_to_ulp(x / x, V(T(1)), 1);
+ t.verify_equal_to_ulp(T(3) / x, V(T(3) / T(2)), 1);
+ t.verify_equal_to_ulp(x / T(3), V(T(2) / T(3)), 1);
+ t.verify_equal_to_ulp(y / x, init_vec<V, .5, 1, 1.5, 2, 2.5, 3, 3.5>, 1);
+ }
+ };
+
+ // avoid testing subnormals and expect minor deltas for non-IEC559 float
+ ADD_TEST(divide1, std::is_floating_point_v<T> && !is_iec559) {
+ std::array{T{norm_min * 1024}, T{1}, T{}, T{-1}, T{max / 1024}, T{max / T(4.1)}, max, min},
+ [](auto& t, V a) {
+ V b = std::cw<2>;
+ V ref([&](int i) { return a[i] / 2; });
+ t.verify_equal_to_ulp(a / b, ref, 1);
+ a = select(a == std::cw<0>, T(1), a);
+ // -freciprocal-math together with flush-to-zero makes
+ // the following range restriction necessary (i.e.
+ // 1/|a| must be >= min). Intel vrcpps and vrcp14ps
+ // need some extra slack (use 1.1 instead of 1).
+ a = select(fabs(a) >= T(1.1) / norm_min, T(1), a);
+ t.verify_equal_to_ulp(a / a, V(1), 1)("\na = ", a);
+ ref = V([&](int i) { return 2 / a[i]; });
+ t.verify_equal_to_ulp(b / a, ref, 1)("\na = ", a);
+ t.verify_equal_to_ulp(b /= a, ref, 1);
+ t.verify_equal_to_ulp(b, ref, 1);
+ }
+ };
+
+ ADD_TEST(divide2, (is_iec559 || !std::is_floating_point_v<T>) && requires(T x) { x / x; }) {
+ std::tuple{T(2), init_vec<V, 1, 2, 3, 4, 5, 6, 7>, init_vec<V, T(max), T(norm_min)>,
+ init_vec<V, T(norm_min), T(max)>, init_vec<V, T(max), T(norm_min) + 1>},
+ [](auto& t, V x, V y, V z, V a, V b) {
+ t.verify_equal(x / x, V(1));
+ t.verify_equal(T(3) / x, V(T(3) / T(2)));
+ t.verify_equal(x / T(3), V(T(2) / T(3)));
+ t.verify_equal(y / x, init_vec<V, .5, 1, 1.5, 2, 2.5, 3, 3.5>);
+ V ref = init_vec<V, T(max / 2), T(norm_min / 2)>;
+ t.verify_equal(z / x, ref);
+ ref = init_vec<V, T(norm_min / 2), T(max / 2)>;
+ t.verify_equal(a / x, ref);
+ t.verify_equal(b / b, V(1));
+ ref = init_vec<V, T(2 / max), T(2 / (norm_min + 1))>;
+ t.verify_equal(x / b, ref);
+ t.verify_equal(x /= b, ref);
+ t.verify_equal(x, ref);
+ }
+ };
+
+ static constexpr V from0 = test_iota<V, 0, 63>;
+ static constexpr V from1 = test_iota<V, 1, 64>;
+ static constexpr V from2 = test_iota<V, 2, 65>;
+
+ ADD_TEST(incdec, requires(T x) { ++x; x++; --x; x--; }) {
+ std::tuple{from0},
+ [](auto& t, V x) {
+ t.verify_equal(x++, from0);
+ t.verify_equal(x, from1);
+ t.verify_equal(++x, from2);
+ t.verify_equal(x, from2);
+
+ t.verify_equal(x--, from2);
+ t.verify_equal(x, from1);
+ t.verify_equal(--x, from0);
+ t.verify_equal(x, from0);
+ }
+ };
+ };
+
+#include "create_tests.h"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "arithmetic.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+#include <stdfloat>
+
+void create_tests()
+{
+ template for (auto t : {char(), short(), unsigned(), 0l, 0ull, float(), double()})
+ {
+ using T = decltype(t);
+#ifndef EXPENSIVE_TESTS
+ [[maybe_unused]] Tests<simd::vec<T>> test;
+#else
+ [[maybe_unused]] Tests<simd::vec<T, simd::vec<T>::size() + 3>> test0;
+ [[maybe_unused]] Tests<simd::vec<T, 1>> test1;
+#endif
+ }
+}
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ ADD_TEST(VecCatChunk) {
+ std::tuple{test_iota<V>, test_iota<V, 1>},
+ [](auto& t, const V v0, const V v1) {
+ auto c = cat(v0, v1);
+ t.verify_equal(c.size(), V::size() * 2);
+ for (int i = 0; i < V::size(); ++i)
+ t.verify_equal(c[i], v0[i])(i);
+ for (int i = 0; i < V::size(); ++i)
+ t.verify_equal(c[i + V::size()], v1[i])(i);
+ const auto [c0, c1] = simd::chunk<V>(c);
+ t.verify_equal(c0, v0);
+ t.verify_equal(c1, v1);
+ if constexpr (V::size() <= 35)
+ {
+ auto d = cat(v1, c, v0);
+ for (int i = 0; i < V::size(); ++i)
+ {
+ t.verify_equal(d[i], v1[i])(i);
+ t.verify_equal(d[i + V::size()], v0[i])(i);
+ t.verify_equal(d[i + 2 * V::size()], v1[i])(i);
+ t.verify_equal(d[i + 3 * V::size()], v0[i])(i);
+ }
+ const auto [...chunked] = simd::chunk<3>(d);
+ t.verify_equal(cat(chunked...), d);
+ }
+ }
+ };
+
+ ADD_TEST(MaskCatChunk) {
+ std::tuple{M([](int i) { return 1 == (i & 1); }), M([](int i) { return 1 == (i % 3); })},
+ [](auto& t, const M k0, const M k1) {
+ auto c = cat(k0, k1);
+ t.verify_equal(c.size(), V::size() * 2);
+ for (int i = 0; i < V::size(); ++i)
+ t.verify_equal(c[i], k0[i])(i);
+ for (int i = 0; i < V::size(); ++i)
+ t.verify_equal(c[i + V::size()], k1[i])(i);
+ const auto [c0, c1] = simd::chunk<M>(c);
+ t.verify_equal(c0, k0);
+ t.verify_equal(c1, k1);
+ if constexpr (V::size() <= 35)
+ {
+ auto d = cat(k1, c, k0);
+ for (int i = 0; i < V::size(); ++i)
+ {
+ t.verify_equal(d[i], k1[i])(i);
+ t.verify_equal(d[i + V::size()], k0[i])(i);
+ t.verify_equal(d[i + 2 * V::size()], k1[i])(i);
+ t.verify_equal(d[i + 3 * V::size()], k0[i])(i);
+ }
+ const auto [...chunked] = simd::chunk<3>(d);
+ t.verify_equal(cat(chunked...), d);
+ }
+ }
+ };
+ };
+
+#include "create_tests.h" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "creation.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+#include <numeric>
+
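+// std::array with a caller-specified (over-)alignment.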
+template <typename T, std::size_t N, std::size_t Alignment>
+ class alignas(Alignment) aligned_array
+ : public std::array<T, N>
+ {};
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static_assert(simd::alignment_v<V> <= 256);
+
+ ADD_TEST(load_zeros) {
+ std::tuple {aligned_array<T, V::size * 2, 256> {}, aligned_array<int, V::size * 2, 256> {}},
+ [](auto& t, auto mem, auto ints) {
+ t.verify_equal(simd::unchecked_load<V>(mem), V());
+ t.verify_equal(simd::partial_load<V>(mem), V());
+
+ t.verify_equal(simd::unchecked_load<V>(mem, simd::flag_aligned), V());
+ t.verify_equal(simd::partial_load<V>(mem, simd::flag_aligned), V());
+
+ t.verify_equal(simd::unchecked_load<V>(mem, simd::flag_overaligned<256>), V());
+ t.verify_equal(simd::partial_load<V>(mem, simd::flag_overaligned<256>), V());
+
+ t.verify_equal(simd::unchecked_load<V>(mem.begin() + 1, mem.end()), V());
+ t.verify_equal(simd::partial_load<V>(mem.begin() + 1, mem.end()), V());
+ t.verify_equal(simd::partial_load<V>(mem.begin() + 1, mem.begin() + 1), V());
+ t.verify_equal(simd::partial_load<V>(mem.begin() + 1, mem.begin() + 2), V());
+
+ t.verify_equal(simd::unchecked_load<V>(ints, simd::flag_convert), V());
+ t.verify_equal(simd::partial_load<V>(ints, simd::flag_convert), V());
+
+ t.verify_equal(simd::unchecked_load<V>(mem, M(true)), V());
+ t.verify_equal(simd::unchecked_load<V>(mem, M(false)), V());
+ t.verify_equal(simd::partial_load<V>(mem, M(true)), V());
+ t.verify_equal(simd::partial_load<V>(mem, M(false)), V());
+ }
+ };
+
+ static constexpr V ref = test_iota<V, 1, 0>;
+ static constexpr V ref1 = V([](int i) { return i == 0 ? T(1): T(); });
+
+ template <typename U>
+ static constexpr auto
+ make_iota_array()
+ {
+ aligned_array<U, V::size * 2, simd::alignment_v<V, U>> arr = {};
+ U init = 0;
+ for (auto& x : arr) x = (init += U(1));
+ return arr;
+ }
+
+ ADD_TEST(load_iotas, requires {T() + T(1);}) {
+ std::tuple {make_iota_array<T>(), make_iota_array<int>()},
+ [](auto& t, auto mem, auto ints) {
+ t.verify_equal(simd::unchecked_load<V>(mem), ref);
+ t.verify_equal(simd::partial_load<V>(mem), ref);
+
+ t.verify_equal(simd::unchecked_load<V>(mem.begin() + 1, mem.end()), ref + T(1));
+ t.verify_equal(simd::partial_load<V>(mem.begin() + 1, mem.end()), ref + T(1));
+ t.verify_equal(simd::partial_load<V>(mem.begin(), mem.begin() + 1), ref1);
+
+ t.verify_equal(simd::unchecked_load<V>(mem, simd::flag_aligned), ref);
+ t.verify_equal(simd::partial_load<V>(mem, simd::flag_aligned), ref);
+
+ t.verify_equal(simd::unchecked_load<V>(ints, simd::flag_convert), ref);
+ t.verify_equal(simd::partial_load<V>(ints, simd::flag_convert), ref);
+ t.verify_equal(simd::partial_load<V>(
+ ints.begin(), ints.begin(), simd::flag_convert), V());
+ t.verify_equal(simd::partial_load<V>(
+ ints.begin(), ints.begin() + 1, simd::flag_convert), ref1);
+
+ t.verify_equal(simd::unchecked_load<V>(mem, M(true)), ref);
+ t.verify_equal(simd::unchecked_load<V>(mem, M(false)), V());
+ t.verify_equal(simd::partial_load<V>(mem, M(true)), ref);
+ t.verify_equal(simd::partial_load<V>(mem, M(false)), V());
+ }
+ };
+
+ static constexpr M alternating = M([](int i) { return 1 == (i & 1); });
+ static constexpr V ref_k = select(alternating, ref, T());
+ static constexpr V ref_2 = select(M([](int i) { return i < 2; }), ref, T());
+ static constexpr V ref_k_2 = select(M([](int i) { return i < 2; }), ref_k, T());
+
+ ADD_TEST(masked_loads) {
+ std::tuple {make_iota_array<T>(), make_iota_array<int>(), alternating, M(true), M(false)},
+ [](auto& t, auto mem, auto ints, M k, M tr, M fa) {
+ t.verify_equal(simd::unchecked_load<V>(mem, tr), ref);
+ t.verify_equal(simd::unchecked_load<V>(mem, fa), V());
+ t.verify_equal(simd::unchecked_load<V>(mem, k), ref_k);
+
+ t.verify_equal(simd::unchecked_load<V>(ints, tr, simd::flag_convert), ref);
+ t.verify_equal(simd::unchecked_load<V>(ints, fa, simd::flag_convert), V());
+ t.verify_equal(simd::unchecked_load<V>(ints, k, simd::flag_convert), ref_k);
+
+ t.verify_equal(simd::partial_load<V>(mem, tr), ref);
+ t.verify_equal(simd::partial_load<V>(mem, fa), V());
+ t.verify_equal(simd::partial_load<V>(mem, k), ref_k);
+
+ t.verify_equal(simd::partial_load<V>(mem.begin(), mem.begin() + 2, tr), ref_2);
+ t.verify_equal(simd::partial_load<V>(mem.begin(), mem.begin() + 2, fa), V());
+ t.verify_equal(simd::partial_load<V>(mem.begin(), mem.begin() + 2, k), ref_k_2);
+
+ t.verify_equal(simd::partial_load<V>(ints.begin(), ints.begin() + 2, tr,
+ simd::flag_convert), ref_2);
+ t.verify_equal(simd::partial_load<V>(ints.begin(), ints.begin() + 2, fa,
+ simd::flag_convert), V());
+ t.verify_equal(simd::partial_load<V>(ints.begin(), ints.begin() + 2, k,
+ simd::flag_convert), ref_k_2);
+ }
+ };
+ };
+
+#include "create_tests.h"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "loads.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+#include <utility>
+
+namespace simd = std::simd;
+
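+// Deduces the element size in bytes (the B template argument) from a basic_mask<B, A> object.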
+template <std::size_t B, typename A>
+ consteval std::size_t
+ element_size(const simd::basic_mask<B, A>&)
+ { return B; }
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ ADD_TEST(Sanity) {
+ std::tuple{M([](int i) { return 1 == (i & 1); })},
+ [](auto& t, const M k) {
+ t.verify_equal(element_size(k), sizeof(T));
+ for (int i = 0; i < k.size(); i += 2)
+ t.verify_equal(k[i], false)(k);
+ for (int i = 1; i < k.size(); i += 2)
+ t.verify_equal(k[i], true)(k);
+ }
+ };
+
+ ADD_TEST(Reductions) {
+ std::tuple{M([](int i) { return 1 == (i & 1); }), M(true), M(false)},
+ [](auto& t, const M k, const M tr, const M fa) {
+ t.verify(!all_of(k))(k);
+ if constexpr (V::size() > 1)
+ {
+ t.verify(any_of(k))(k);
+ t.verify(!none_of(k))(k);
+ }
+
+ t.verify(all_of(tr));
+ t.verify(any_of(tr));
+ t.verify(!none_of(tr));
+
+ t.verify(!all_of(fa));
+ t.verify(!any_of(fa));
+ t.verify(none_of(fa));
+ }
+ };
+
+ ADD_TEST(CvtToInt, (sizeof(T) <= sizeof(0ull))) {
+ std::tuple{M([](int i) { return 1 == (i & 1); }), M(true), M(false), M([](int i) {
+ return i % 13 == 0 || i % 7 == 0;
+ })},
+ [](auto& t, const M k, const M tr, const M fa, const M k2) {
+ t.verify_equal(V(+tr), V(1));
+ t.verify_equal(V(+fa), V());
+ t.verify_equal(V(+k), init_vec<V, 0, 1>);
+
+ if constexpr (std::is_integral_v<T>)
+ {
+ t.verify_equal(V(~tr), ~V(1));
+ t.verify_equal(V(~fa), ~V(0));
+ t.verify_equal(V(~k), ~init_vec<V, 0, 1>);
+ }
+
+ t.verify(all_of(simd::rebind_t<char, M>(tr)));
+ t.verify(!all_of(simd::rebind_t<char, M>(fa)));
+ t.verify(!all_of(simd::rebind_t<char, M>(k)));
+
+ t.verify_equal(fa.to_ullong(), 0ull);
+ t.verify_equal(fa.to_bitset(), std::bitset<V::size()>());
+
+ // test whether 'M -> bitset -> M' is an identity transformation
+ t.verify_equal(M(fa.to_bitset()), fa)(fa.to_bitset());
+ t.verify_equal(M(tr.to_bitset()), tr)(tr.to_bitset());
+ t.verify_equal(M(k.to_bitset()), k)(k.to_bitset());
+ t.verify_equal(M(k2.to_bitset()), k2)(k2.to_bitset());
+
+ static_assert(sizeof(0ull) * CHAR_BIT == 64);
+ if constexpr (V::size() <= 64)
+ {
+ constexpr unsigned long long full = -1ull >> (64 - V::size());
+ t.verify_equal(tr.to_ullong(), full)(std::hex, tr.to_ullong(), '^', full, "->",
+ tr.to_ullong() ^ full);
+ t.verify_equal(tr.to_bitset(), full);
+
+ constexpr unsigned long long alternating = 0xaaaa'aaaa'aaaa'aaaaULL & full;
+ t.verify_equal(k.to_ullong(), alternating)(std::hex, k.to_ullong(), '^', alternating,
+ "->", k.to_ullong() ^ alternating);
+ t.verify_equal(k.to_bitset(), alternating);
+
+ // 0, 7, 13, 14, 21, 26, 28, 35, 39, 42, 49, 52, 56, 63, 65, ...
+ constexpr unsigned long long bits7_13 = 0x8112'0488'1420'6081ULL & full;
+ t.verify_equal(k2.to_ullong(), bits7_13)(std::hex, k2.to_ullong());
+ }
+ else
+ {
+ constexpr unsigned long long full = -1ull;
+ constexpr unsigned long long alternating = 0xaaaa'aaaa'aaaa'aaaaULL;
+ int shift = M::size() - 64;
+ t.verify_equal((tr.to_bitset() >> shift).to_ullong(), full);
+ t.verify_equal((k.to_bitset() >> shift).to_ullong(), alternating);
+ }
+
+ t.verify_equal(+tr, -(-tr));
+ t.verify_equal(-+tr, -tr);
+ }
+ };
+ };
+
+#include "create_tests.h" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+#include <utility>
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static constexpr M alternating = M([](int i) { return 1 == (i & 1); });
+ static constexpr M k010 = M([](int i) { return 1 == (i % 3); });
+ static constexpr M k00111 = M([](int i) { return 2 < (i % 5); });
+
+ ADD_TEST(mask_conversion) {
+ std::array {alternating, k010, k00111},
+ [](auto& t, M k) {
+ template for (auto tmp : {char(), short(), int(), double()})
+ {
+ using U = decltype(tmp);
+ using M2 = simd::rebind_t<U, M>;
+ using M3 = simd::mask<U, V::size()>;
+ const M2 ref2 = M2([&](int i) { return k[i]; });
+ t.verify_equal(M2(k), ref2);
+ t.verify_equal(M(M2(k)), k);
+ if constexpr (!std::is_same_v<M2, M3>)
+ {
+ const M3 ref3 = M3([&](int i) { return k[i]; });
+ t.verify_equal(M3(k), ref3);
+ t.verify_equal(M(M3(k)), k);
+ t.verify_equal(M2(M3(k)), ref2);
+ t.verify_equal(M3(M2(k)), ref3);
+ }
+ }
+ }
+ };
+
+ ADD_TEST(mask_reductions_sanity) {
+ std::tuple {M(true)},
+ [](auto& t, M x) {
+ t.verify_equal(std::simd::reduce_min_index(x), 0);
+ t.verify_equal(std::simd::reduce_max_index(x), V::size - 1);
+ t.verify_precondition_failure("An empty mask does not have a min_index.", [&] {
+ std::simd::reduce_min_index(!x);
+ });
+ t.verify_precondition_failure("An empty mask does not have a max_index.", [&] {
+ std::simd::reduce_max_index(!x);
+ });
+ }
+ };
+
+ ADD_TEST(mask_reductions) {
+ std::tuple{test_iota<V>, test_iota<V> == T(0)},
+ [](auto& t, V v, M k0) {
+ // Caveat:
+ // k0[n0 * (test_iota_max<V> + 1)] is true if it exists
+ // k[n * (test_iota_max<V> + 1) + i] is true if it exists
+ // none_of(k) is true if i > test_iota_max<V>
+ // by construction of test_iota_max:
+ static_assert(test_iota_max<V> < V::size());
+ for (int i = 0; i < int(test_iota_max<V>) + 1; ++i)
+ {
+ M k = v == T(i);
+
+ const int nk = 1 + (V::size() - i - 1) / (test_iota_max<V> + 1);
+ const int maxk = (nk - 1) * (test_iota_max<V> + 1) + i;
+ t.verify(maxk < V::size());
+
+ const int nk0 = 1 + (V::size() - 1) / (test_iota_max<V> + 1);
+ const int maxk0 = (nk0 - 1) * (test_iota_max<V> + 1);
+ t.verify(maxk0 < V::size());
+
+ const int maxkork0 = std::max(maxk, maxk0);
+
+ t.verify_equal(k[i], true);
+ t.verify_equal(std::as_const(k)[i], true);
+ t.verify_equal(std::simd::reduce_min_index(k), i)(k);
+ t.verify_equal(std::simd::reduce_max_index(k), maxk)(k);
+ t.verify_equal(std::simd::reduce_min_index(k || k0), 0);
+ t.verify_equal(std::simd::reduce_max_index(k || k0), maxkork0);
+ t.verify_equal(k, k);
+ t.verify_not_equal(!k, k);
+ t.verify_equal(k | k, k);
+ t.verify_equal(k & k, k);
+ t.verify(none_of(k ^ k));
+ t.verify_equal(std::simd::reduce_count(k), nk);
+ if constexpr (sizeof(T) <= sizeof(0ULL))
+ t.verify_equal(-std::simd::reduce(-k), nk)(k)(-k);
+ t.verify_equal(std::simd::reduce_count(!k), V::size - nk)(!k);
+ if constexpr (V::size <= 128 && sizeof(T) <= sizeof(0ULL))
+ t.verify_equal(-std::simd::reduce(-!k), V::size - nk)(-!k);
+ t.verify(any_of(k));
+ t.verify(bool(any_of(k & k0) ^ (i != 0)));
+ k = M([&](int j) { return j == 0 ? true : k[j]; });
+ t.verify_equal(k[i], true);
+ t.verify_equal(std::as_const(k)[i], true);
+ t.verify_equal(k[0], true);
+ t.verify_equal(std::as_const(k)[0], true);
+ t.verify_equal(std::simd::reduce_min_index(k), 0)(k);
+ t.verify_equal(std::simd::reduce_max_index(k), maxk)(k);
+ }
+ }
+ };
+ };
+
+#include "create_tests.h" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-require-effective-target run_expensive_tests }
+// { dg-timeout-factor 2 }
+
+#define EXPENSIVE_TESTS 1
+#include "mask2.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "mask.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+template <typename T, std::size_t N, std::size_t Alignment>
+ class alignas(Alignment) aligned_array
+ : public std::array<T, N>
+ {};
+
+inline constexpr std::multiplies<> mul;
+inline constexpr std::bit_and<> bit_and;
+inline constexpr std::bit_or<> bit_or;
+inline constexpr std::bit_xor<> bit_xor;
+
+inline constexpr auto my_add = [](auto a, auto b) { return a + b; };
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static_assert(simd::alignment_v<V> <= 256);
+
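+    // Broadcasts x to a V. If sizeof(V) is larger than V::size() elements (i.e. the vector
+    // has padding), the padding elements are set to 7 instead of 0 ("poisoned").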
+ static consteval V
+ poisoned(T x)
+ {
+ if constexpr (sizeof(V) == sizeof(T) * V::size())
+ return V(x);
+ else
+ {
+ using P = simd::resize_t<sizeof(V) / sizeof(T), V>;
+ static_assert(P::size() > V::size());
+ constexpr auto [...is] = std::_IotaArray<P::size()>;
+ const T arr[P::size()] = {(is < V::size() ? x : T(7))...};
+ return std::bit_cast<V>(P(arr));
+ }
+ }
+
+ ADD_TEST(Sum) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0), T(0));
+ t.verify_equal(simd::reduce(v1), T(V::size()));
+ }
+ };
+
+ ADD_TEST(Product) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0, mul), T(0));
+ t.verify_equal(simd::reduce(v1, mul), T(1));
+ }
+ };
+
+ ADD_TEST(UnknownSum) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0, my_add), T(0));
+ t.verify_equal(simd::reduce(v1, my_add), T(V::size()));
+ }
+ };
+
+ ADD_TEST(And, std::is_integral_v<T>) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0, bit_and), T(0));
+ t.verify_equal(simd::reduce(v1, bit_and), T(1));
+ }
+ };
+
+ ADD_TEST(Or, std::is_integral_v<T>) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0, bit_or), T(0));
+ t.verify_equal(simd::reduce(v1, bit_or), T(1));
+ }
+ };
+
+ ADD_TEST(Xor, std::is_integral_v<T>) {
+ std::tuple {poisoned(0), poisoned(1)},
+ [](auto& t, V v0, V v1) {
+ t.verify_equal(simd::reduce(v0, bit_xor), T(0));
+ t.verify_equal(simd::reduce(v1, bit_xor), T(V::size() & 1));
+ }
+ };
+ };
+
+#include "create_tests.h"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "reductions.cc"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+template <typename V>
+ requires (V::size() * sizeof(typename V::value_type) <= 70 * 4) // avoid exploding RAM usage
+ struct Tests<V>
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static constexpr int max = sizeof(T) == 8 ? 64 : 32;
+
+ ADD_TEST_N(known_shift, 4, std::is_integral_v<T>) {
+ std::tuple {test_iota<V, 0, 0>},
+ []<int N>(auto& t, const V x) {
+ constexpr int shift = max * (N + 1) / 4 - 1;
+ constexpr V vshift = T(shift);
+ const V vshiftx = vshift ^ (x & std::cw<1>);
+ V ref([](T i) -> T { return i << shift; });
+ V refx([](T i) -> T { return i << (shift ^ (i & 1)); });
+ t.verify_equal(x << shift, ref)("{:d} << {:d}", x, shift);
+ t.verify_equal(x << vshift, ref)("{:d} << {:d}", x, vshift);
+ t.verify_equal(x << vshiftx, refx)("{:d} << {:d}", x, vshiftx);
+ const auto y = ~x;
+ ref = V([](T i) -> T { return T(~i) << shift; });
+ refx = V([](T i) -> T { return T(~i) << (shift ^ (i & 1)); });
+ t.verify_equal(y << shift, ref)("{:d} << {:d}", y, shift);
+ t.verify_equal(y << vshift, ref)("{:d} << {:d}", y, vshift);
+ t.verify_equal(y << vshiftx, refx)("{:d} << {:d}", y, vshiftx);
+ }
+ };
+
+ ADD_TEST(unknown_shift, std::is_integral_v<T>) {
+ std::tuple {test_iota<V, 0, 0>},
+ [](auto& t, const V x) {
+ if !consteval
+ {
+ for (int shift = 0; shift < max; ++shift)
+ {
+ const auto y = ~x;
+ shift = make_value_unknown(shift);
+ const V vshift = T(shift);
+ V ref([=](T i) -> T { return i << shift; });
+              t.verify_equal(x << shift, ref)("{:d} << {:d}", x, shift);
+              t.verify_equal(x << vshift, ref)("{:d} << {:d}", x, vshift);
+ ref = V([=](T i) -> T { return T(~i) << shift; });
+ t.verify_equal(y << shift, ref)("{:d} << {:d}", y, shift);
+ t.verify_equal(y << vshift, ref)("{:d} << {:d}", y, vshift);
+ }
+ }
+ }
+ };
+ };
+
+template <typename V>
+ struct Tests
+ {};
+
+void create_tests()
+{
+ template for (auto t : {char(), short(), unsigned(), 0l, 0ull})
+ [[maybe_unused]] Tests<simd::vec<decltype(t)>> test;
+ template for (constexpr int n : {1, 3, 17})
+ [[maybe_unused]] Tests<simd::vec<int, n>> test;
+}
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "shift_left.cc"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+template <typename V>
+ requires (V::size() * sizeof(typename V::value_type) <= 70 * 4) // avoid exploding RAM usage
+ struct Tests<V>
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static constexpr int max = sizeof(T) == 8 ? 64 : 32;
+
+ ADD_TEST_N(known_shift, 4, std::is_integral_v<T>) {
+ std::tuple {test_iota<V>},
+ []<int N>(auto& t, const V x) {
+ constexpr int shift = max * (N + 1) / 4 - 1;
+ constexpr T tmax = std::numeric_limits<T>::max();
+ constexpr V vshift = T(shift);
+ const V vshiftx = vshift ^ (x & std::cw<1>);
+ t.verify(__is_const_known(vshift));
+
+ V ref([&](int i) -> T { return x[i] >> shift; });
+ V refx([&](int i) -> T { return x[i] >> (shift ^ (i & 1)); });
+ t.verify_equal(x >> shift, ref)("{:d} >> {:d}", x, shift);
+ t.verify_equal(x >> vshift, ref)("{:d} >> {:d}", x, vshift);
+ t.verify_equal(x >> vshiftx, refx)("{:d} >> {:d}", x, vshiftx);
+
+ const V y = ~x;
+ ref = V([&](int i) -> T { return T(~x[i]) >> shift; });
+ refx = V([&](int i) -> T { return T(~x[i]) >> (shift ^ (i & 1)); });
+ t.verify_equal(y >> shift, ref)("{:d} >> {:d}", y, shift);
+ t.verify_equal(y >> vshift, ref)("{:d} >> {:d}", y, vshift);
+ t.verify_equal(y >> vshiftx, refx)("{:d} >> {:d}", y, vshiftx);
+
+ const V z = tmax - x;
+ ref = V([&](int i) -> T { return T(tmax - x[i]) >> shift; });
+ refx = V([&](int i) -> T { return T(tmax - x[i]) >> (shift ^ (i & 1)); });
+ t.verify_equal(z >> shift, ref)("{:d} >> {:d}", z, shift);
+ t.verify_equal(z >> vshift, ref)("{:d} >> {:d}", z, vshift);
+ t.verify_equal(z >> vshiftx, refx)("{:d} >> {:d}", z, vshiftx);
+ }
+ };
+
+ ADD_TEST(unknown_shift, std::is_integral_v<T>) {
+ std::tuple {test_iota<V>},
+ [](auto& t, const V x) {
+ for (int shift = 0; shift < max; ++shift)
+ {
+ constexpr T tmax = std::numeric_limits<T>::max();
+ const V vshift = T(shift);
+ const V vshiftx = vshift ^ (x & std::cw<1>);
+ t.verify(std::is_constant_evaluated()
+                       || (!__is_const_known(vshift) && !__is_const_known(shift)));
+
+ V ref([&](int i) -> T { return x[i] >> shift; });
+ V refx([&](int i) -> T { return x[i] >> (shift ^ (i & 1)); });
+ t.verify_equal(x >> shift, ref)("{:d} >> {:d}", x, shift);
+ t.verify_equal(x >> vshift, ref)("{:d} >> {:d}", x, vshift);
+ t.verify_equal(x >> vshiftx, refx)("{:d} >> {:d}", x, vshiftx);
+
+ const V y = ~x;
+ ref = V([&](int i) -> T { return T(~x[i]) >> shift; });
+ refx = V([&](int i) -> T { return T(~x[i]) >> (shift ^ (i & 1)); });
+ t.verify_equal(y >> shift, ref)("{:d} >> {:d}", y, shift);
+ t.verify_equal(y >> vshift, ref)("{:d} >> {:d}", y, vshift);
+ t.verify_equal(y >> vshiftx, refx)("{:d} >> {:d}", y, vshiftx);
+
+ const V z = tmax - x;
+ ref = V([&](int i) -> T { return T(tmax - x[i]) >> shift; });
+ refx = V([&](int i) -> T { return T(tmax - x[i]) >> (shift ^ (i & 1)); });
+ t.verify_equal(z >> shift, ref)("{:d} >> {:d}", z, shift);
+ t.verify_equal(z >> vshift, ref)("{:d} >> {:d}", z, vshift);
+ t.verify_equal(z >> vshiftx, refx)("{:d} >> {:d}", z, vshiftx);
+ }
+ }
+ };
+ };
+
+template <typename V>
+ struct Tests
+ {};
+
+void create_tests()
+{
+ template for (auto t : {char(), short(), unsigned(), 0l, 0ull})
+ [[maybe_unused]] Tests<simd::vec<decltype(t)>> test;
+ template for (constexpr int n : {1, 3, 17})
+ [[maybe_unused]] Tests<simd::vec<int, n>> test;
+}
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "shift_right.cc"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+#include <utility>
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+
+ using M = typename V::mask_type;
+
+ using pair = std::pair<V, V>;
+ static constexpr std::conditional_t<std::is_floating_point_v<T>, short, T> x_max
+ = test_iota_max<V, 1>;
+ static constexpr int x_max_int = static_cast<int>(x_max);
+
+ static constexpr V
+ reverse_iota(const V x)
+ {
+ if constexpr (std::is_enum_v<T>)
+ {
+ using Vu = simd::rebind_t<std::underlying_type_t<T>, V>;
+ return static_cast<V>(std::to_underlying(x_max) - static_cast<Vu>(x));
+ }
+ else
+ return x_max - x;
+ }
+
+ ADD_TEST(Select) {
+ std::tuple{test_iota<V, 0, 63>, test_iota<V, 1, 64>, T(2),
+ M([](int i) { return 1 == (i & 1); }),
+ M([](int i) { return 1 == (i % 3); })},
+ [](auto& t, const V x, const V y, const T z, const M k, const M k3) {
+ t.verify_equal(select(M(true), x, y), x);
+ t.verify_equal(select(M(false), x, y), y);
+ t.verify_equal(select(M(true), y, x), y);
+ t.verify_equal(select(M(false), y, x), x);
+ t.verify_equal(select(k, x, T()),
+ V([](int i) { return (1 == (i & 1)) ? T(i & 63) : T(); }));
+
+ t.verify_equal(select(M(true), z, T()), z);
+ t.verify_equal(select(M(true), T(), z), V());
+ t.verify_equal(select(k, z, T()), V([](int i) { return (1 == (i & 1)) ? T(2) : T(); }));
+ t.verify_equal(select(k3, z, T()), V([](int i) { return (1 == (i % 3)) ? T(2) : T(); }));
+ }
+ };
+
+ ADD_TEST(Min, std::totally_ordered<T>) {
+ std::tuple{test_iota<V, 0, -1>, reverse_iota(test_iota<V, 0, -1>), test_iota<V, 1>},
+ [](auto& t, const V x, const V y, const V x1) {
+ t.verify_equal(min(x, x), x);
+ t.verify_equal(min(V(), x), V());
+ t.verify_equal(min(x, V()), V());
+ if constexpr (std::is_signed_v<T>)
+ {
+ t.verify_equal(min(-x, x), -x);
+ t.verify_equal(min(x, -x), -x);
+ }
+ t.verify_equal(min(x1, x), x);
+ t.verify_equal(min(x, x1), x);
+ t.verify_equal(min(x, y), min(y, x));
+ t.verify_equal(min(x, y), V([](int i) {
+ i %= x_max_int;
+ return std::min(T(x_max_int - i), T(i));
+ }));
+ }
+ };
+
+ ADD_TEST(Max, std::totally_ordered<T>) {
+ std::tuple{test_iota<V, 0, -1>, reverse_iota(test_iota<V, 0, -1>), test_iota<V, 1>},
+ [](auto& t, const V x, const V y, const V x1) {
+ t.verify_equal(max(x, x), x);
+ t.verify_equal(max(V(), x), x);
+ t.verify_equal(max(x, V()), x);
+ if constexpr (std::is_signed_v<T>)
+ {
+ t.verify_equal(max(-x, x), x);
+ t.verify_equal(max(x, -x), x);
+ }
+ t.verify_equal(max(x1, x), x1);
+ t.verify_equal(max(x, x1), x1);
+ t.verify_equal(max(x, y), max(y, x));
+ t.verify_equal(max(x, y), V([](int i) {
+ i %= x_max_int;
+ return std::max(T(x_max_int - i), T(i));
+ }));
+ }
+ };
+
+ ADD_TEST(Minmax, std::totally_ordered<T>) {
+ std::tuple{test_iota<V, 0, -1>, reverse_iota(test_iota<V, 0, -1>), test_iota<V, 1>},
+ [](auto& t, const V x, const V y, const V x1) {
+ t.verify_equal(minmax(x, x), pair{x, x});
+ t.verify_equal(minmax(V(), x), pair{V(), x});
+ t.verify_equal(minmax(x, V()), pair{V(), x});
+ if constexpr (std::is_signed_v<T>)
+ {
+ t.verify_equal(minmax(-x, x), pair{-x, x});
+ t.verify_equal(minmax(x, -x), pair{-x, x});
+ }
+ t.verify_equal(minmax(x1, x), pair{x, x1});
+ t.verify_equal(minmax(x, x1), pair{x, x1});
+ t.verify_equal(minmax(x, y), minmax(y, x));
+ t.verify_equal(minmax(x, y),
+ pair{V([](int i) {
+ i %= x_max_int;
+ return std::min(T(x_max_int - i), T(i));
+ }),
+ V([](int i) {
+ i %= x_max_int;
+ return std::max(T(x_max_int - i), T(i));
+ })});
+ }
+ };
+
+ ADD_TEST(Clamp, std::totally_ordered<T>) {
+ std::tuple{test_iota<V>, reverse_iota(test_iota<V>)},
+ [](auto& t, const V x, const V y) {
+ t.verify_equal(clamp(x, V(), x), x);
+ t.verify_equal(clamp(x, x, x), x);
+ t.verify_equal(clamp(V(), x, x), x);
+ t.verify_equal(clamp(V(), V(), x), V());
+ t.verify_equal(clamp(x, V(), V()), V());
+ t.verify_equal(clamp(x, V(), y), min(x, y));
+ t.verify_equal(clamp(y, V(), x), min(x, y));
+ if constexpr (std::is_signed_v<T>)
+ {
+ t.verify_equal(clamp(V(T(-test_iota_max<V>)), -x, x), -x);
+ t.verify_equal(clamp(V(T(test_iota_max<V>)), -x, x), x);
+ }
+ }
+ };
+ };
+
+#include "create_tests.h"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "simd_alg.cc" // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+#ifdef __SSE__
+#include <x86intrin.h>
+#endif
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ ADD_TEST(misc, !simd::__scalar_abi_tag<typename V::abi_type>) {
+ std::tuple{init_vec<V, 0, 100, 2, 54, 3>},
+ [](auto& t, V x) {
+ t.verify_equal(x, x);
+ if !consteval
+ {
+#ifdef __SSE__
+ V r = x;
+ if constexpr (sizeof(x) == 16 && std::is_same_v<T, float>)
+ t.verify_equal(r = _mm_and_ps(x, x), x);
+#endif
+#ifdef __SSE2__
+ if constexpr (sizeof(x) == 16 && std::is_integral_v<T>)
+ t.verify_equal(r = _mm_and_si128(x, x), x);
+ if constexpr (sizeof(x) == 16 && std::is_same_v<T, double>)
+ t.verify_equal(r = _mm_and_pd(x, x), x);
+#endif
+ }
+ }
+ };
+ };
+
+void create_tests()
+{
+ template for (auto t : {char(), short(), unsigned(), 0l, 0ull, float(), double()})
+ [[maybe_unused]] Tests<simd::vec<decltype(t), 16 / sizeof(t)>> test;
+}
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include "test_setup.h"
+
+template <typename V>
+ struct Tests
+ {
+ using T = typename V::value_type;
+ using M = typename V::mask_type;
+
+ static_assert(simd::alignment_v<V> <= 256);
+
+ ADD_TEST(stores, requires {T() + T(1);}) {
+ std::tuple {test_iota<V, 1, 0>, std::array<T, V::size * 2> {}, std::array<int, V::size * 2> {}},
+ [](auto& t, const V v, const auto& mem_init, const auto& ints_init) {
+ alignas(256) std::array<T, V::size * 2> mem = mem_init;
+ alignas(256) std::array<int, V::size * 2> ints = ints_init;
+
+ simd::unchecked_store(v, mem, simd::flag_aligned);
+ simd::unchecked_store(v, mem.begin() + V::size(), mem.end());
+ for (int i = 0; i < V::size; ++i)
+ {
+ t.verify_equal(mem[i], T(i + 1));
+ t.verify_equal(mem[V::size + i], T(i + 1));
+ }
+#if VIR_NEXT_PATCH
+ if constexpr (complex_like<T>)
+ {
+ }
+ else
+#endif
+ {
+ simd::unchecked_store(v, ints, simd::flag_convert);
+ simd::partial_store(v, ints.begin() + V::size() + 1, ints.end(),
+ simd::flag_convert | simd::flag_overaligned<alignof(int)>);
+ for (int i = 0; i < V::size; ++i)
+ {
+ t.verify_equal(ints[i], int(T(i + 1)));
+ t.verify_equal(ints[V::size + i], int(T(i)));
+ }
+
+ simd::unchecked_store(V(), ints.begin(), V::size(), simd::flag_convert);
+ simd::unchecked_store(V(), ints.begin() + V::size(), V::size(), simd::flag_convert);
+ for (int i = 0; i < 2 * V::size; ++i)
+ t.verify_equal(ints[i], 0)("i =", i);
+
+ if constexpr (V::size() > 1)
+ {
+ simd::partial_store(v, ints.begin() + 1, V::size() - 2, simd::flag_convert);
+ for (int i = 0; i < V::size - 2; ++i)
+ t.verify_equal(ints[i], int(T(i)));
+ t.verify_equal(ints[V::size - 1], 0);
+ t.verify_equal(ints[V::size], 0);
+ }
+ else
+ {
+ simd::partial_store(v, ints.begin() + 1, 0, simd::flag_convert);
+ t.verify_equal(ints[0], 0);
+ t.verify_equal(ints[1], 0);
+ }
+ }
+ }
+ };
+ };
+
+#include "create_tests.h"
--- /dev/null
+// { dg-do run { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+// { dg-require-effective-target run_expensive_tests }
+
+#define EXPENSIVE_TESTS 1
+#include "stores.cc"
--- /dev/null
+// Test framework for <simd> -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef SIMD_TEST_SETUP_H
+#define SIMD_TEST_SETUP_H
+
+#include <bits/simd_details.h>
+#include <string_view>
+
+namespace test
+{
+ struct precondition_failure
+ {
+ std::string_view file;
+ int line;
+ std::string_view expr;
+ std::string_view msg;
+ };
+
+#undef __glibcxx_simd_precondition
+
+#define __glibcxx_simd_precondition(expr, msg, ...) \
+ do { \
+ if (__builtin_expect(!bool(expr), false)) \
+ throw test::precondition_failure{__FILE__, __LINE__, #expr, msg}; \
+ } while(false)
+}
+
+#undef _GLIBCXX_SIMD_NOEXCEPT
+#define _GLIBCXX_SIMD_NOEXCEPT noexcept(false)
+
+#include <simd>
+
+#include <source_location>
+#include <iostream>
+#include <concepts>
+#include <cfenv>
+#include <vector>
+#include <cstdint>
+#include <climits>
+
+// global objects
+static std::vector<void(*)()> test_functions = {};
+
+static std::int64_t passed_tests = 0;
+
+static std::int64_t failed_tests = 0;
+
+static std::string_view test_name = "unknown";
+
+// ------------------------------------------------
+
+namespace simd = std::simd;
+
+template <typename T>
+ struct is_character_type
+ : std::bool_constant<false>
+ {};
+
+template <typename T>
+ inline constexpr bool is_character_type_v = is_character_type<T>::value;
+
+template <typename T>
+ struct is_character_type<const T>
+ : is_character_type<T>
+ {};
+
+template <typename T>
+ struct is_character_type<T&>
+ : is_character_type<T>
+ {};
+
+template <> struct is_character_type<char> : std::bool_constant<true> {};
+template <> struct is_character_type<wchar_t> : std::bool_constant<true> {};
+template <> struct is_character_type<char8_t> : std::bool_constant<true> {};
+template <> struct is_character_type<char16_t> : std::bool_constant<true> {};
+template <> struct is_character_type<char32_t> : std::bool_constant<true> {};
+
+std::ostream& operator<<(std::ostream& s, std::byte b)
+{ return s << std::hex << static_cast<unsigned>(b) << std::dec; }
+
+template <typename T, typename Abi>
+std::ostream& operator<<(std::ostream& s, std::simd::basic_vec<T, Abi> const& v)
+{
+ if constexpr (std::is_arithmetic_v<T>)
+ {
+ using U = std::conditional_t<
+ sizeof(T) == 1, int, std::conditional_t<
+ is_character_type_v<T>,
+ std::simd::_UInt<sizeof(T)>, T>>;
+ s << '[' << U(v[0]);
+ for (int i = 1; i < v.size(); ++i)
+ s << ", " << U(v[i]);
+ }
+ else
+ {
+ s << '[' << v[0];
+ for (int i = 1; i < v.size(); ++i)
+ s << ", " << v[i];
+ }
+ return s << ']';
+}
+
+template <std::size_t B, typename Abi>
+std::ostream& operator<<(std::ostream& s, std::simd::basic_mask<B, Abi> const& v)
+{
+ s << '<';
+ for (int i = 0; i < v.size(); ++i)
+ s << int(v[i]);
+ return s << '>';
+}
+
+template <std::simd::__vec_builtin V>
+ std::ostream& operator<<(std::ostream& s, V v)
+ { return s << std::simd::vec<std::simd::__vec_value_type<V>, std::simd::__width_of<V>>(v); }
+
+template <typename T, typename U>
+ std::ostream& operator<<(std::ostream& s, const std::pair<T, U>& x)
+ { return s << '{' << x.first << ", " << x.second << '}'; }
+
+template <typename T>
+ concept is_string_type
+ = is_character_type_v<std::ranges::range_value_t<T>>
+ && std::is_convertible_v<T, std::basic_string_view<std::ranges::range_value_t<T>>>;
+
+template <std::ranges::range R>
+ requires (!is_string_type<R>)
+ std::ostream& operator<<(std::ostream& s, R&& x)
+ {
+ s << '[';
+ auto it = std::ranges::begin(x);
+ if (it != std::ranges::end(x))
+ {
+ s << *it;
+ while (++it != std::ranges::end(x))
+ s << ',' << *it;
+ }
+ return s << ']';
+ }
+
+struct additional_info
+{
+ const bool failed = false;
+
+ additional_info
+ operator()(auto const& value0, auto const&... more)
+ {
+ if (failed)
+ [&] {
+ std::cout << " " << value0;
+ ((std::cout << ' ' << more), ...);
+ std::cout << std::endl;
+ }();
+ return *this;
+ }
+};
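+
+// additional_info is what the runtime_verifier checks below return; invoking it appends
+// extra context that is printed only when the preceding check failed, e.g.
+//   t.verify_equal(x, y)("while testing i =", i);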
+
+struct log_novalue {};
+
+template <typename T>
+ struct unwrap_value_types
+ { using type = T; };
+
+template <typename T>
+ requires requires { typename T::value_type; }
+ struct unwrap_value_types<T>
+ { using type = typename unwrap_value_types<typename T::value_type>::type; };
+
+template <typename T>
+ using value_type_t = typename unwrap_value_types<std::remove_cvref_t<T>>::type;
+
+template <typename T>
+ struct as_unsigned;
+
+template <typename T>
+ using as_unsigned_t = typename as_unsigned<T>::type;
+
+template <typename T>
+ requires (sizeof(T) == sizeof(unsigned char))
+ struct as_unsigned<T>
+ { using type = unsigned char; };
+
+template <typename T>
+ requires (sizeof(T) == sizeof(unsigned short))
+ struct as_unsigned<T>
+ { using type = unsigned short; };
+
+template <typename T>
+ requires (sizeof(T) == sizeof(unsigned int))
+ struct as_unsigned<T>
+ { using type = unsigned int; };
+
+template <typename T>
+ requires (sizeof(T) == sizeof(unsigned long long))
+ struct as_unsigned<T>
+ { using type = unsigned long long; };
+
+template <typename T, typename Abi>
+ struct as_unsigned<std::simd::basic_vec<T, Abi>>
+ { using type = std::simd::rebind_t<as_unsigned_t<T>, std::simd::basic_vec<T, Abi>>; };
+
+template <typename T0, typename T1>
+ constexpr T0
+ ulp_distance_signed(T0 val0, const T1& ref1)
+ {
+ if constexpr (std::is_floating_point_v<T1>)
+ return ulp_distance_signed(val0, std::simd::rebind_t<T1, T0>(ref1));
+ else if constexpr (std::is_floating_point_v<value_type_t<T0>>)
+ {
+ int fp_exceptions = 0;
+ if !consteval
+ {
+ fp_exceptions = std::fetestexcept(FE_ALL_EXCEPT);
+ }
+ using std::isnan;
+ using std::abs;
+ using T = value_type_t<T0>;
+ using L = std::numeric_limits<T>;
+ constexpr T0 signexp_mask = -L::infinity();
+ T0 ref0(ref1);
+ T1 val1(val0);
+      const auto subnormal = abs(ref1) < L::min();
+ using I = as_unsigned_t<T1>;
+ const T1 eps1 = select(subnormal, L::denorm_min(),
+ L::epsilon() * std::bit_cast<T0>(
+ std::bit_cast<I>(ref1)
+ & std::bit_cast<I>(signexp_mask)));
+ const T0 ulp = select(val0 == ref0 || (isnan(val0) && isnan(ref0)),
+ T0(), T0((ref1 - val1) / eps1));
+ if !consteval
+ {
+ std::feclearexcept(FE_ALL_EXCEPT ^ fp_exceptions);
+ }
+ return ulp;
+ }
+ else
+ return ref1 - val0;
+ }
+
+template <typename T0, typename T1>
+ constexpr T0
+ ulp_distance(const T0& val, const T1& ref)
+ {
+ auto ulp = ulp_distance_signed(val, ref);
+ using T = value_type_t<decltype(ulp)>;
+ if constexpr (std::is_unsigned_v<T>)
+ return ulp;
+ else
+ {
+ using std::abs;
+        return abs(ulp);
+ }
+ }
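+
+// For example, for a float vec v equal to 1.0f,
+// ulp_distance(v + std::numeric_limits<float>::epsilon(), v) is 1 and ulp_distance(v, v) is 0;
+// for integral types the distance is simply |ref - val|.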
+
+template <typename T>
+ constexpr bool
+ bit_equal(const T& a, const T& b)
+ {
+ using std::simd::_UInt;
+ if constexpr (sizeof(T) <= sizeof(0ull))
+ return std::bit_cast<_UInt<sizeof(T)>>(a) == std::bit_cast<_UInt<sizeof(T)>>(b);
+ else if constexpr (std::simd::__simd_vec_or_mask_type<T>)
+ {
+ using TT = typename T::value_type;
+ if constexpr (std::is_integral_v<TT>)
+ return all_of(a == b);
+ else
+ {
+ constexpr size_t uint_size = std::min(size_t(8), sizeof(TT));
+ struct B
+ {
+ alignas(T) simd::rebind_t<_UInt<uint_size>,
+ simd::resize_t<T::size() * sizeof(TT) / uint_size, T>> data;
+ };
+ if constexpr (sizeof(B) == sizeof(a))
+ return all_of(std::bit_cast<B>(a).data == std::bit_cast<B>(b).data);
+ else
+ {
+ auto [a0, a1] = chunk<std::bit_ceil(unsigned(T::size())) / 2>(a);
+ auto [b0, b1] = chunk<std::bit_ceil(unsigned(T::size())) / 2>(b);
+ return bit_equal(a0, b0) && bit_equal(a1, b1);
+ }
+ }
+ }
+ else
+ static_assert(false);
+ }
+
+// Treat a and b as equal if either:
+// - operator== yields true for all elements, or
+// - for floating-point types, the corresponding elements of a and b are both NaN.
+template <typename V>
+ constexpr bool
+ equal_with_nan_and_inf_fixup(const V& a, const V& b)
+ {
+ auto eq = a == b;
+ if (std::simd::all_of(eq))
+ return true;
+ else if constexpr (std::simd::__simd_vec_type<V>)
+ {
+ using M = typename V::mask_type;
+ using T = typename V::value_type;
+ if constexpr (std::is_floating_point_v<T>)
+ { // fix up nan == nan results
+ eq |= a._M_isnan() && b._M_isnan();
+ }
+ else
+ return false;
+ return std::simd::all_of(eq);
+ }
+ else if constexpr (std::is_floating_point_v<V>)
+ return std::isnan(a) && std::isnan(b);
+ else
+ return false;
+ }
+
+struct constexpr_verifier
+{
+ struct ignore_the_rest
+ {
+ constexpr ignore_the_rest
+ operator()(auto const&, auto const&...)
+ { return *this; }
+ };
+
+ bool okay = true;
+
+ constexpr ignore_the_rest
+ verify_precondition_failure(std::string_view expected_msg, auto&& f) &
+ {
+ try
+ {
+ f();
+ okay = false;
+ }
+ catch (const test::precondition_failure& failure)
+ {
+ okay = okay && failure.msg == expected_msg;
+ }
+ catch (...)
+ {
+ okay = false;
+ }
+ return {};
+ }
+
+ constexpr ignore_the_rest
+ verify(const auto& k) &
+ {
+ okay = okay && std::simd::all_of(k);
+ return {};
+ }
+
+ constexpr ignore_the_rest
+ verify_equal(const auto& v, const auto& ref) &
+ {
+ using V = decltype(std::simd::select(v == ref, v, ref));
+ okay = okay && equal_with_nan_and_inf_fixup<V>(v, ref);
+ return {};
+ }
+
+ constexpr ignore_the_rest
+ verify_bit_equal(const auto& v, const auto& ref) &
+ {
+ using V = decltype(std::simd::select(v == ref, v, ref));
+ okay = okay && bit_equal<V>(v, ref);
+ return {};
+ }
+
+ template <typename T, typename U>
+ constexpr ignore_the_rest
+ verify_equal(const std::pair<T, U>& x, const std::pair<T, U>& y) &
+ {
+ verify_equal(x.first, y.first);
+ verify_equal(x.second, y.second);
+ return {};
+ }
+
+ constexpr ignore_the_rest
+ verify_not_equal(const auto& v, const auto& ref) &
+ {
+ okay = okay && std::simd::all_of(v != ref);
+ return {};
+ }
+
+ constexpr ignore_the_rest
+ verify_equal_to_ulp(const auto& x, const auto& y, float allowed_distance) &
+ {
+ okay = okay && std::simd::all_of(ulp_distance(x, y) <= allowed_distance);
+ return {};
+ }
+
+ constexpr_verifier() = default;
+
+ constexpr_verifier(const constexpr_verifier&) = delete;
+
+ constexpr_verifier(constexpr_verifier&&) = delete;
+};
+
+template <int... is>
+ [[nodiscard]]
+ consteval bool
+ constexpr_test(auto&& fun, auto&&... args)
+ {
+ constexpr_verifier t;
+ try
+ {
+ fun.template operator()<is...>(t, args...);
+ }
+ catch(const test::precondition_failure& fail)
+ {
+ return false;
+ }
+ return t.okay;
+ }
+
+template <typename T>
+ T
+ make_value_unknown(const T& x)
+ { return *std::start_lifetime_as<T>(&x); }
+
+template <typename T>
+ concept pair_specialization
+ = std::same_as<std::remove_cvref_t<T>, std::pair<typename std::remove_cvref_t<T>::first_type,
+ typename std::remove_cvref_t<T>::second_type>>;
+
+struct runtime_verifier
+{
+ const std::string_view test_kind;
+
+ template <typename X, typename Y>
+ additional_info
+ log_failure(const X& x, const Y& y, std::source_location loc, std::string_view s)
+ {
+ ++failed_tests;
+ std::cout << loc.file_name() << ':' << loc.line() << ':' << loc.column() << ": in "
+ << test_kind << " test of '" << test_name
+ << "' " << s << " failed";
+ if constexpr (!std::is_same_v<X, log_novalue>)
+ {
+ std::cout << ":\n result: " << std::boolalpha;
+ if constexpr (is_character_type_v<X>)
+ std::cout << int(x);
+ else
+ std::cout << x;
+ if constexpr (!std::is_same_v<decltype(y), const log_novalue&>)
+ {
+ std::cout << "\n expected: ";
+ if constexpr (is_character_type_v<Y>)
+ std::cout << int(y);
+ else
+ std::cout << y;
+ }
+ }
+ std::cout << std::endl;
+ return additional_info {true};
+ }
+
+ [[gnu::always_inline]]
+ additional_info
+ verify_precondition_failure(std::string_view expected_msg, auto&& f,
+ std::source_location loc = std::source_location::current()) &
+ {
+ try
+ {
+ f();
+ return log_failure(log_novalue(), log_novalue(), loc, "precondition failure not detected");
+ }
+ catch (const test::precondition_failure& failure)
+ {
+ if (failure.msg != expected_msg)
+ {
+ return log_failure(failure.msg, expected_msg, loc, "unexpected exception");
+ }
+ else
+ {
+ ++passed_tests;
+ return {};
+ }
+ }
+ catch (...)
+ {
+ return log_failure(log_novalue(), log_novalue(), loc, "unexpected exception");
+ }
+ }
+
+ [[gnu::always_inline]]
+ additional_info
+ verify(auto&& k, std::source_location loc = std::source_location::current())
+ {
+ if (std::simd::all_of(k))
+ {
+ ++passed_tests;
+ return {};
+ }
+ else
+ return log_failure(log_novalue(), log_novalue(), loc, "verify");
+ }
+
+ [[gnu::always_inline]]
+ additional_info
+ verify_equal(auto&& x, auto&& y,
+ std::source_location loc = std::source_location::current())
+ {
+ bool ok;
+ if constexpr (pair_specialization<decltype(x)> && pair_specialization<decltype(y)>)
+ ok = std::simd::all_of(x.first == y.first) && std::simd::all_of(x.second == y.second);
+ else
+ ok = equal_with_nan_and_inf_fixup<decltype(std::simd::select(x == y, x, y))>(x, y);
+ if (ok)
+ {
+ ++passed_tests;
+ return {};
+ }
+ else
+ return log_failure(x, y, loc, "verify_equal");
+ }
+
+ [[gnu::always_inline]]
+ additional_info
+ verify_bit_equal(auto&& x, auto&& y,
+ std::source_location loc = std::source_location::current())
+ {
+ using V = decltype(std::simd::select(x == y, x, y));
+ if (bit_equal<V>(x, y))
+ {
+ ++passed_tests;
+ return {};
+ }
+ else
+ return log_failure(x, y, loc, "verify_bit_equal");
+ }
+
+ [[gnu::always_inline]]
+ additional_info
+ verify_not_equal(auto&& x, auto&& y,
+ std::source_location loc = std::source_location::current())
+ {
+ if (std::simd::all_of(x != y))
+ {
+ ++passed_tests;
+ return {};
+ }
+ else
+ return log_failure(x, y, loc, "verify_not_equal");
+ }
+
+ // ulp_distance_signed can raise FP exceptions and thus must be conditionally executed
+ [[gnu::always_inline]]
+ additional_info
+ verify_equal_to_ulp(auto&& x, auto&& y, float allowed_distance,
+ std::source_location loc = std::source_location::current())
+ {
+ const bool success = std::simd::all_of(ulp_distance(x, y) <= allowed_distance);
+ if (success)
+ {
+ ++passed_tests;
+ return {};
+ }
+ else
+ return log_failure(x, y, loc, "verify_equal_to_ulp")
+ ("distance:", ulp_distance_signed(x, y),
+ "\n allowed:", allowed_distance);
+ }
+};
+
+template <int... is>
+ [[gnu::noinline, gnu::noipa]]
+ void
+ runtime_test(auto&& fun, auto&&... args)
+ {
+ runtime_verifier t {"runtime"};
+ fun.template operator()<is...>(t, make_value_unknown(args)...);
+ }
+
+template <typename T>
+ concept constant_value = requires {
+ typename std::integral_constant<std::remove_cvref_t<decltype(T::value)>, T::value>;
+ };
+
+template <typename T>
+ [[gnu::always_inline]] inline bool
+ is_const_known(const T& x)
+ { return constant_value<T> || __builtin_constant_p(x); }
+
+template <typename T, typename Abi>
+ [[gnu::always_inline]] inline bool
+ is_const_known(const std::simd::basic_vec<T, Abi>& x)
+ { return __is_const_known(x); }
+
+template <std::size_t B, typename Abi>
+ [[gnu::always_inline]] inline bool
+ is_const_known(const std::simd::basic_mask<B, Abi>& x)
+ { return __is_const_known(x); }
+
+template <std::ranges::sized_range R>
+ [[gnu::always_inline]] inline bool
+ is_const_known(const R& arr)
+ {
+ constexpr std::size_t N = std::ranges::size(arr);
+ constexpr auto [...is] = std::_IotaArray<N>;
+ return (is_const_known(arr[is]) && ...);
+ }
+
+template <int... is>
+ [[gnu::always_inline, gnu::flatten]]
+ inline void
+ constprop_test(auto&& fun, auto... args)
+ {
+ runtime_verifier t{"constprop"};
+#ifndef __clang__
+ t.verify((is_const_known(args) && ...))("=> Some argument(s) failed to constant-propagate.");
+#endif
+ fun.template operator()<is...>(t, args...);
+ }
+
+/**
+ * The value of the largest element in test_iota<V, Init>.
+ */
+template <typename V, int Init = 0, int Max = V::size() + Init - 1>
+ constexpr value_type_t<V> test_iota_max
+ = sizeof(value_type_t<V>) < sizeof(int)
+ ? std::min(int(std::numeric_limits<value_type_t<V>>::max()),
+ Max < 0 ? std::min(V::size() + Init - 1,
+ int(std::numeric_limits<value_type_t<V>>::max()) + Max)
+ : Max)
+ : V::size() + Init - 1;
+
+template <typename T, typename Abi, int Init, int Max>
+ requires std::is_enum_v<T>
+ constexpr T test_iota_max<simd::basic_vec<T, Abi>, Init, Max>
+ = static_cast<T>(test_iota_max<simd::basic_vec<std::underlying_type_t<T>, Abi>, Init, Max>);
+
+/**
+ * Starts the iota sequence at Init.
+ *
+ * With `Max == 0`: wrap around on overflow.
+ * With `Max < 0`: subtract from numeric_limits::max (to leave room for arithmetic ops).
+ * Otherwise: [Init..Max, Init..Max, ...] (inclusive).
+ *
+ * Use simd::__iota instead if a non-monotonic (wrapping) sequence would be a bug.
+ */
+template <typename V, int Init = 0, int MaxArg = int(test_iota_max<V, Init>)>
+ constexpr V test_iota = V([](int i) {
+ constexpr int Max = MaxArg < 0 ? int(test_iota_max<V, Init, MaxArg>) : MaxArg;
+ static_assert(Max == 0 || Max > Init || V::size() == 1);
+ i += Init;
+ if constexpr (Max > Init)
+ {
+ while (i > Max)
+ i -= Max - Init + 1;
+ }
+ using T = value_type_t<V>;
+ return static_cast<T>(i);
+ });
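+
+// Examples: test_iota<simd::vec<int, 4>>       is {0, 1, 2, 3},
+//           test_iota<simd::vec<int, 4>, 1>    is {1, 2, 3, 4},
+//           test_iota<simd::vec<int, 4>, 0, 1> is {0, 1, 0, 1} (restarts at Init after Max).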
+
+/**
+ * A data-parallel object initialized with {values..., values..., ...}
+ */
+template <typename V, auto... values>
+ constexpr V init_vec = [] {
+ using T = typename V::value_type;
+ constexpr std::array<T, sizeof...(values)> arr = {T(values)...};
+ return V([&](size_t i) { return arr[i % arr.size()]; });
+ }();
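+
+// For example, init_vec<simd::vec<int, 4>, 1, 2> is {1, 2, 1, 2}.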
+
+template <typename V>
+ struct Tests;
+
+template <typename T>
+ concept array_specialization
+ = std::same_as<T, std::array<typename T::value_type, std::tuple_size_v<T>>>;
+
+template <typename Args = void, typename Fun = void>
+ struct add_test
+ {
+ alignas(std::bit_floor(sizeof(Args))) Args args;
+ Fun fun;
+ };
+
+struct dummy_test
+{
+ static constexpr std::array<int, 0> args = {};
+ static constexpr auto fun = [](auto&, auto...) {};
+};
+
+template <auto test_ref, int... is, std::size_t... arg_idx>
+ void
+ invoke_test_impl(std::index_sequence<arg_idx...>)
+ {
+ constexpr auto fun = test_ref->fun;
+ [[maybe_unused]] constexpr auto args = test_ref->args;
+#ifdef EXPENSIVE_TESTS
+ constprop_test<is...>(fun, std::get<arg_idx>(args)...);
+ constexpr bool passed = constexpr_test<is...>(fun, std::get<arg_idx>(args)...);
+ if (passed)
+ ++passed_tests;
+ else
+ {
+ ++failed_tests;
+ std::cout << "=> constexpr test of '" << test_name << "' failed.\n";
+ }
+#endif
+ runtime_test<is...>(fun, std::get<arg_idx>(args)...);
+ }
+
+template <auto test_ref, int... is>
+ void
+ invoke_test(std::string_view name)
+ {
+ test_name = name;
+ constexpr auto args = test_ref->args;
+ using A = std::remove_const_t<decltype(args)>;
+ if constexpr (array_specialization<A>)
+ { // call for each element
+ template for (constexpr std::size_t I : std::_IotaArray<args.size()>)
+ {
+ std::string tmp_name = std::string(name) + '|' + std::to_string(I);
+ test_name = tmp_name;
+ ((std::cout << "Testing '" << test_name) << ... << (' ' + std::to_string(is)))
+ << ' ' << args[I] << "'\n";
+ invoke_test_impl<test_ref, is...>(std::index_sequence<I>());
+ }
+ }
+ else
+ {
+ ((std::cout << "Testing '" << test_name) << ... << (' ' + std::to_string(is))) << "'\n";
+ invoke_test_impl<test_ref, is...>(std::make_index_sequence<std::tuple_size_v<A>>());
+ }
+ }
+
+#define ADD_TEST(name, ...) \
+ template <int> \
+ static constexpr auto name##_tmpl = dummy_test {}; \
+ \
+ const int init_##name = [] { \
+ test_functions.push_back([] { invoke_test<&name##_tmpl<0>>(#name); }); \
+ return 0; \
+ }(); \
+ \
+ template <int Tmp> \
+ requires (Tmp == 0) __VA_OPT__(&& (__VA_ARGS__)) \
+ static constexpr auto name##_tmpl<Tmp> = add_test
+
+#define ADD_TEST_N(name, N, ...) \
+ template <int> \
+ static constexpr auto name##_tmpl = dummy_test {}; \
+ \
+ static void \
+ name() \
+ { \
+ template for (constexpr int i : std::_IotaArray<N, int>) \
+ invoke_test<&name##_tmpl<0>, i>(#name); \
+ } \
+ \
+ const int init_##name = [] { \
+ test_functions.push_back(name); \
+ return 0; \
+ }(); \
+ \
+ template <int Tmp> \
+ requires (Tmp == 0) __VA_OPT__(&& (__VA_ARGS__)) \
+ static constexpr auto name##_tmpl<Tmp> = add_test
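+
+// Usage sketch (illustrative): test files register cases by completing the add_test aggregate
+// behind one of the macros above, roughly like
+//
+//   ADD_TEST(plus) {
+//     std::tuple{simd::vec<int, 4>(1)},
+//     [](auto& t, auto v) { t.verify_equal(v + v, 2 * v); }
+//   };
+//
+// The first member is the argument tuple, the second the callable that is invoked with a
+// verifier and the arguments at runtime (and additionally in constexpr and constant-propagation
+// variants when EXPENSIVE_TESTS is defined).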
+
+void create_tests();
+
+int main()
+{
+ create_tests();
+ try
+ {
+ for (auto f : test_functions)
+ f();
+ }
+ catch(const test::precondition_failure& fail)
+ {
+ std::cout << fail.file << ':' << fail.line << ": Error: precondition '" << fail.expr
+ << "' does not hold: " << fail.msg << '\n';
+ return EXIT_FAILURE;
+ }
+ std::cout << "Passed tests: " << passed_tests << "\nFailed tests: " << failed_tests << '\n';
+ return failed_tests != 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#endif // SIMD_TEST_SETUP_H
--- /dev/null
+// { dg-do compile { target c++26 } }
+// { dg-require-effective-target x86 }
+// { dg-timeout-factor 2 }
+
+#include <simd>
+#include <stdfloat>
+
+namespace simd = std::simd;
+
+// test that instantiation of the complete class is well-formed
+template class simd::basic_vec<int, typename simd::vec<int, 1>::abi_type>;
+template class simd::basic_vec<int, typename simd::vec<int, 5>::abi_type>;
+template class simd::basic_vec<int, typename simd::vec<int, 8>::abi_type>;
+template class simd::basic_vec<int, typename simd::vec<int, 13>::abi_type>;
+template class simd::basic_vec<float, typename simd::vec<float, 8>::abi_type>;
+template class simd::basic_vec<float, typename simd::vec<float, 13>::abi_type>;
+
+constexpr auto default_mask_abi_variant =
+#ifdef __AVX512F__
+ simd::_AbiVariant::_BitMask;
+#else
+ simd::_AbiVariant();
+#endif
+
+namespace test01
+{
+ using std::same_as;
+
+ using Abi1 = simd::_Abi_t<1, 1, default_mask_abi_variant>;
+ static_assert(same_as<simd::vec<int, 1>::abi_type, Abi1>);
+ static_assert(same_as<simd::vec<float, 1>::abi_type, Abi1>);
+
+#if defined __SSE__ && !defined __AVX__
+ static_assert(same_as<simd::vec<float>::abi_type, simd::_Abi_t<4, 1>>);
+ static_assert(same_as<simd::vec<float, 3>::abi_type, simd::_Abi_t<3, 1>>);
+ static_assert(same_as<simd::vec<float, 7>::abi_type, simd::_Abi_t<7, 2>>);
+
+ static_assert(simd::vec<float>::size > 1);
+ static_assert(alignof(simd::vec<float>) > alignof(float));
+ static_assert(alignof(simd::vec<float, 4>) > alignof(float));
+ static_assert(alignof(simd::vec<float, 3>) > alignof(float));
+ static_assert(sizeof(simd::vec<float, 7>) == 2 * sizeof(simd::vec<float>));
+ static_assert(alignof(simd::vec<float, 7>) == alignof(simd::vec<float>));
+#endif
+}
+
+namespace test02
+{
+ using namespace std;
+ using namespace std::simd;
+
+ static_assert(!destructible<simd::basic_mask<7>>);
+
+ static_assert(same_as<simd::vec<int>::mask_type, simd::mask<int>>);
+ static_assert(same_as<simd::vec<float>::mask_type, simd::mask<float>>);
+ static_assert(same_as<simd::vec<float, 1>::mask_type, simd::mask<float, 1>>);
+
+ // ensure 'true ? int : vec<float>' doesn't work
+ template <typename T>
+ concept has_type_member = requires { typename T::type; };
+ static_assert(has_type_member<common_type<int, simd::vec<float>>>);
+}
+
+#if defined __AVX__ && !defined __AVX2__
+static_assert(alignof(simd::mask<int, 8>) == 16);
+static_assert(alignof(simd::mask<float, 8>) == 32);
+static_assert(alignof(simd::mask<int, 16>) == 16);
+static_assert(alignof(simd::mask<float, 16>) == 32);
+static_assert(alignof(simd::mask<long long, 4>) == 16);
+static_assert(alignof(simd::mask<double, 4>) == 32);
+static_assert(alignof(simd::mask<long long, 8>) == 16);
+static_assert(alignof(simd::mask<double, 8>) == 32);
+static_assert(std::same_as<decltype(+simd::mask<float, 8>()), simd::vec<int, 8>>);
+#endif
+
+#if defined __SSE__ && !defined __F16C__ && defined __STDCPP_FLOAT16_T__
+static_assert(simd::vec<std::float16_t>::size() == 1);
+static_assert(simd::mask<std::float16_t>::size() == 1);
+static_assert(alignof(simd::vec<std::float16_t, 8>) == alignof(std::float16_t));
+static_assert(alignof(simd::rebind_t<std::float16_t, simd::vec<float>>) == alignof(std::float16_t));
+static_assert(simd::rebind_t<std::float16_t, simd::mask<float>>::abi_type::_S_nreg
+ == simd::vec<float>::size());
+#endif
+
+template <auto X>
+ using Ic = std::integral_constant<std::remove_const_t<decltype(X)>, X>;
+
+static_assert( std::convertible_to<Ic<1>, simd::vec<float>>);
+static_assert(!std::convertible_to<Ic<1.1>, simd::vec<float>>);
+static_assert(!std::convertible_to<simd::vec<int, 4>, simd::vec<float, 4>>);
+static_assert(!std::convertible_to<simd::vec<float, 4>, simd::vec<int, 4>>);
+static_assert( std::convertible_to<int, simd::vec<float>>);
+static_assert( std::convertible_to<simd::vec<int, 4>, simd::vec<double, 4>>);
+
+template <typename V>
+ concept has_static_size = requires {
+ { V::size } -> std::convertible_to<int>;
+ { V::size() } -> std::signed_integral;
+ { auto(V::size.value) } -> std::signed_integral;
+ };
+
+template <typename V, typename T = typename V::value_type>
+ concept usable_vec_or_mask
+ = std::destructible<V>
+ && std::is_nothrow_move_constructible_v<V>
+ && std::is_nothrow_move_assignable_v<V>
+ && std::is_nothrow_default_constructible_v<V>
+ && std::is_trivially_copyable_v<V>
+ && std::is_standard_layout_v<V>
+ && std::ranges::random_access_range<V&>
+ && !std::ranges::output_range<V&, T>
+ && std::constructible_from<V, T> // broadcast
+ && has_static_size<V>
+ && simd::__simd_vec_or_mask_type<V>
+ ;
+
+template <typename V, typename T = typename V::value_type>
+ concept usable_vec
+ = usable_vec_or_mask<V, T>
+ && !std::convertible_to<V, std::array<T, V::size()>>
+ && std::convertible_to<std::array<T, V::size()>, V>
+ && std::constructible_from<V, simd::rebind_t<int, V>>
+ && std::constructible_from<V, simd::rebind_t<float, V>>
+ && !std::constructible_from<V, simd::resize_t<V::size() + 1, V>>
+ && !std::constructible_from<V, simd::resize_t<V::size() + 1, typename V::mask_type>>
+ && !std::constructible_from<typename V::mask_type, V>
+ ;
+
+template <typename M, typename T = typename M::value_type>
+ concept usable_mask
+ = std::is_same_v<T, bool>
+ && usable_vec_or_mask<M, T>
+ && std::convertible_to<std::bitset<M::size()>, M>
+ && std::constructible_from<M, unsigned long long>
+ && std::constructible_from<M, unsigned char>
+ && std::constructible_from<M, simd::rebind_t<int, M>>
+ && std::constructible_from<M, simd::rebind_t<float, M>>
+ && !std::constructible_from<M, simd::resize_t<M::size() + 1, M>>
+ && !std::convertible_to<unsigned long long, M>
+ && !std::convertible_to<unsigned char, M>
+ && !std::convertible_to<bool, M>
+ && !std::constructible_from<M, std::bitset<M::size() + 1>>
+ && !std::constructible_from<M, std::bitset<M::size() - 1>>
+ && !std::constructible_from<M, int>
+ && !std::constructible_from<M, float>
+ ;
+
+template <typename T>
+ struct test_usable_simd
+ {
+ static_assert(!usable_vec<simd::vec<T, 0>>);
+ static_assert(!has_static_size<simd::vec<T, 0>>);
+ static_assert(usable_vec<simd::vec<T, 1>>);
+ static_assert(usable_vec<simd::vec<T, 2>>);
+ static_assert(usable_vec<simd::vec<T, 3>>);
+ static_assert(usable_vec<simd::vec<T, 4>>);
+ static_assert(usable_vec<simd::vec<T, 7>>);
+ static_assert(usable_vec<simd::vec<T, 8>>);
+ static_assert(usable_vec<simd::vec<T, 16>>);
+ static_assert(usable_vec<simd::vec<T, 32>>);
+ static_assert(usable_vec<simd::vec<T, 63>>);
+ static_assert(usable_vec<simd::vec<T, 64>>);
+
+ static_assert(!usable_mask<simd::mask<T, 0>>);
+ static_assert(!has_static_size<simd::mask<T, 0>>);
+ static_assert(usable_mask<simd::mask<T, 1>>);
+ static_assert(usable_mask<simd::mask<T, 2>>);
+ static_assert(usable_mask<simd::mask<T, 3>>);
+ static_assert(usable_mask<simd::mask<T, 4>>);
+ static_assert(usable_mask<simd::mask<T, 7>>);
+ static_assert(usable_mask<simd::mask<T, 8>>);
+ static_assert(usable_mask<simd::mask<T, 16>>);
+ static_assert(usable_mask<simd::mask<T, 32>>);
+ static_assert(usable_mask<simd::mask<T, 63>>);
+ static_assert(usable_mask<simd::mask<T, 64>>);
+ };
+
+template <template <typename> class Tpl>
+ struct instantiate_all_vectorizable
+ {
+ Tpl<float> a;
+ Tpl<double> b;
+ Tpl<char> c;
+ Tpl<char8_t> c8;
+ Tpl<char16_t> d;
+ Tpl<char32_t> e;
+ Tpl<wchar_t> f;
+ Tpl<signed char> g;
+ Tpl<unsigned char> h;
+ Tpl<short> i;
+ Tpl<unsigned short> j;
+ Tpl<int> k;
+ Tpl<unsigned int> l;
+ Tpl<long> m;
+ Tpl<unsigned long> n;
+ Tpl<long long> o;
+ Tpl<unsigned long long> p;
+#ifdef __STDCPP_FLOAT16_T__
+ Tpl<std::float16_t> q;
+#endif
+#ifdef __STDCPP_FLOAT32_T__
+ Tpl<std::float32_t> r;
+#endif
+#ifdef __STDCPP_FLOAT64_T__
+ Tpl<std::float64_t> s;
+#endif
+ };
+
+template struct instantiate_all_vectorizable<test_usable_simd>;
+
+// vec generator ctor ///////////////
+
+namespace test_generator
+{
+ struct udt_convertible_to_float
+ { operator float() const; };
+
+ static_assert( std::constructible_from<simd::vec<float>, float (&)(int)>);
+ static_assert(!std::convertible_to<float (&)(int), simd::vec<float>>);
+ static_assert(!std::constructible_from<simd::vec<float>, int (&)(int)>);
+ static_assert(!std::constructible_from<simd::vec<float>, double (&)(int)>);
+ static_assert( std::constructible_from<simd::vec<float>, short (&)(int)>);
+ static_assert(!std::constructible_from<simd::vec<float>, long double (&)(int)>);
+ static_assert( std::constructible_from<simd::vec<float>, udt_convertible_to_float (&)(int)>);
+}
+
+// mask generator ctor ///////////////
+
+static_assert(
+ all_of(simd::mask<float, 4>([](int) { return true; }) == simd::mask<float, 4>(true)));
+static_assert(
+ all_of(simd::mask<float, 4>([](int) { return false; }) == simd::mask<float, 4>(false)));
+static_assert(
+ all_of(simd::mask<float, 4>([](int i) { return i < 2; })
+ == simd::mask<float, 4>([](int i) {
+ return std::array{true, true, false, false}[i];
+ })));
+
+static_assert(all_of((simd::vec<int, 4>([](int i) { return i << 10; }) >> 10)
+ == simd::__iota<simd::vec<int, 4>>));
+
+// vec iterators /////////////////////
+
+#if SIMD_IS_A_RANGE
+static_assert([] { simd::vec<float> x = {}; return x.begin() == x.begin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.begin() == x.cbegin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.cbegin() == x.begin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.cbegin() == x.cbegin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.begin() + x.size() == x.end(); }());
+static_assert([] { simd::vec<float> x = {}; return x.end() == x.begin() + x.size(); }());
+static_assert([] { simd::vec<float> x = {}; return x.begin() < x.end(); }());
+static_assert([] { simd::vec<float> x = {}; return x.begin() <= x.end(); }());
+static_assert(![] { simd::vec<float> x = {}; return x.begin() > x.end(); }());
+static_assert(![] { simd::vec<float> x = {}; return x.begin() >= x.end(); }());
+static_assert(![] { simd::vec<float> x = {}; return x.end() < x.begin(); }());
+static_assert(![] { simd::vec<float> x = {}; return x.end() <= x.begin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.end() > x.begin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.end() >= x.begin(); }());
+static_assert([] { simd::vec<float> x = {}; return x.end() - x.begin(); }() == simd::vec<float>::size());
+static_assert([] { simd::vec<float> x = {}; return x.begin() - x.end(); }() == -simd::vec<float>::size());
+static_assert([] { simd::vec<float> x = {}; return x.begin() - x.begin(); }() == 0);
+static_assert([] { simd::vec<float> x = {}; return x.begin() + 1 - x.begin(); }() == 1);
+static_assert([] { simd::vec<float> x = {}; return x.begin() + 1 - x.cbegin(); }() == 1);
+#endif
+
+// mask to vec ///////////////////////
+
+// Clang rejects many of the following as non-constant expressions without a usable explanation;
+// skip them until the root cause is understood.
+#ifdef __clang__
+#define AVOID_BROKEN_CLANG_FAILURES 1
+#endif
+
+#ifndef AVOID_BROKEN_CLANG_FAILURES
+
+static_assert([] constexpr {
+ constexpr simd::mask<float, 7> a([](int i) -> bool { return i < 3; });
+ constexpr simd::basic_vec b = -a;
+ static_assert(b[0] == -(0 < 3));
+ static_assert(b[1] == -(1 < 3));
+ static_assert(b[2] == -(2 < 3));
+ static_assert(b[3] == -(3 < 3));
+ return all_of(b == simd::vec<int, 7>([](int i) { return -int(i < 3); }));
+}());
+
+static_assert([] constexpr {
+ constexpr simd::mask<float, 7> a([](int i) -> bool { return i < 3; });
+ constexpr simd::basic_vec b = ~a;
+ static_assert(b[0] == ~int(0 < 3));
+ static_assert(b[1] == ~int(1 < 3));
+ static_assert(b[2] == ~int(2 < 3));
+ static_assert(b[3] == ~int(3 < 3));
+ return all_of(b == simd::vec<int, 7>([](int i) { return ~int(i < 3); }));
+}());
+
+static_assert([] constexpr {
+ constexpr simd::mask<float, 4> a([](int i) -> bool { return i < 2; });
+ constexpr simd::basic_vec b = a;
+ static_assert(b[0] == 1);
+ static_assert(b[1] == 1);
+ static_assert(b[2] == 0);
+ return b[3] == 0;
+}());
+
+static_assert([] constexpr {
+ // Corner case on AVX w/o AVX2 systems. <float, 5> is an AVX register;
+ // <int, 5> is deduced as SSE + scalar.
+ constexpr simd::mask<float, 5> a([](int i) -> bool { return i >= 2; });
+ constexpr simd::basic_vec b = a;
+ static_assert(b[0] == 0);
+ static_assert(b[1] == 0);
+ static_assert(b[2] == 1);
+ static_assert(b[3] == 1);
+ static_assert(b[4] == 1);
+#if defined __AVX2__ || !defined __AVX__
+ static_assert(all_of((b == 1) == a));
+#endif
+ constexpr simd::mask<float, 8> a8([](int i) -> bool { return i <= 4; });
+ constexpr simd::basic_vec b8 = a8;
+ static_assert(b8[0] == 1);
+ static_assert(b8[1] == 1);
+ static_assert(b8[2] == 1);
+ static_assert(b8[3] == 1);
+ static_assert(b8[4] == 1);
+ static_assert(b8[5] == 0);
+ static_assert(b8[6] == 0);
+ static_assert(b8[7] == 0);
+#if SIMD_MASK_IMPLICIT_CONVERSIONS || defined __AVX2__ || !defined __AVX__
+ static_assert(all_of((b8 == 1) == a8));
+#endif
+ constexpr simd::mask<float, 15> a15([](int i) -> bool { return i <= 4; });
+ constexpr simd::basic_vec b15 = a15;
+ static_assert(b15[0] == 1);
+ static_assert(b15[4] == 1);
+ static_assert(b15[5] == 0);
+ static_assert(b15[8] == 0);
+ static_assert(b15[14] == 0);
+ static_assert(all_of((b15 == 1) == a15));
+ return true;
+}());
+
+static_assert([] constexpr {
+ constexpr simd::mask<float, 4> a([](int i) -> bool { return i < 2; });
+ constexpr simd::basic_vec b = ~a;
+ constexpr simd::basic_vec c = a;
+ static_assert(c[0] == int(a[0]));
+ static_assert(c[1] == int(a[1]));
+ static_assert(c[2] == int(a[2]));
+ static_assert(c[3] == int(a[3]));
+ static_assert(b[0] == ~int(0 < 2));
+ static_assert(b[1] == ~int(1 < 2));
+ static_assert(b[2] == ~int(2 < 2));
+ static_assert(b[3] == ~int(3 < 2));
+ return all_of(b == simd::vec<int, 4>([](int i) { return ~int(i < 2); }));
+}());
+#endif
+
+// mask conversions //////////////////
+namespace mask_conversion_tests
+{
+ using simd::mask;
+
+ struct TestResult
+ {
+ int state;
+ unsigned long long a, b;
+ };
+
+ template <auto Res>
+ consteval void
+ check()
+ {
+ if constexpr (Res.state != 0 && Res.a != Res.b)
+ static_assert(Res.a == Res.b);
+ else
+ static_assert(Res.state == 0);
+ }
+
+ template <typename U>
+ consteval TestResult
+ do_test(const auto& k)
+ {
+ using M = simd::mask<U, k.size()>;
+ if constexpr (std::is_destructible_v<M>)
+ {
+ if (!std::ranges::equal(M(k), k))
+ {
+ if constexpr (k.size() <= 64)
+ return {1, M(k).to_ullong(), k.to_ullong()};
+ else
+ return {1, 0, 0};
+ }
+ else
+ return {0, 0, 0};
+ }
+ else
+ return {0, 0, 0};
+ }
+
+ template <typename T, int N, int P = 0>
+ consteval void
+ do_test()
+ {
+ if constexpr (std::is_destructible_v<simd::mask<T, N>>)
+ {
+ constexpr simd::mask<T, N> k([](int i) {
+ if constexpr (P == 2)
+ return std::has_single_bit(unsigned(i));
+ else if constexpr (P == 3)
+ return !std::has_single_bit(unsigned(i));
+ else
+ return (i & 1) == P;
+ });
+ check<do_test<char>( k)>();
+ check<do_test<char>(!k)>();
+ check<do_test<short>( k)>();
+ check<do_test<short>(!k)>();
+ check<do_test<int>( k)>();
+ check<do_test<int>(!k)>();
+ check<do_test<double>( k)>();
+ check<do_test<double>(!k)>();
+#ifdef __STDCPP_FLOAT16_T__
+ check<do_test<std::float16_t>( k)>();
+ check<do_test<std::float16_t>(!k)>();
+#endif
+ if constexpr (P <= 2)
+ do_test<T, N, P + 1>();
+ }
+ }
+
+ template <typename T>
+ consteval bool
+ test()
+ {
+ using V = simd::mask<T>;
+ do_test<T, 1>();
+ do_test<T, V::size()>();
+ do_test<T, 2 * V::size()>();
+ do_test<T, 4 * V::size()>();
+ do_test<T, 5 * V::size()>();
+ do_test<T, 2 * V::size() + 1>();
+ do_test<T, 2 * V::size() - 1>();
+ do_test<T, V::size() / 2>();
+ do_test<T, V::size() / 3>();
+ do_test<T, V::size() / 5>();
+ return true;
+ }
+
+ static_assert(test<char>());
+ static_assert(test<short>());
+ static_assert(test<float>());
+ static_assert(test<double>());
+#ifdef __STDCPP_FLOAT16_T__
+ static_assert(test<std::float16_t>());
+#endif
+}
+
+// vec reductions ///////////////////
+
+namespace simd_reduction_tests
+{
+ static_assert(reduce(simd::vec<int, 7>(1)) == 7);
+ static_assert(reduce(simd::vec<int, 7>(2), std::multiplies<>()) == 128);
+ static_assert(reduce(simd::vec<int, 8>(2), std::bit_and<>()) == 2);
+ static_assert(reduce(simd::vec<int, 8>(2), std::bit_or<>()) == 2);
+ static_assert(reduce(simd::vec<int, 8>(2), std::bit_xor<>()) == 0);
+ static_assert(reduce(simd::vec<int, 3>(2), std::bit_and<>()) == 2);
+ static_assert(reduce(simd::vec<int, 6>(2), std::bit_and<>()) == 2);
+ static_assert(reduce(simd::vec<int, 7>(2), std::bit_and<>()) == 2);
+ static_assert(reduce(simd::vec<int, 7>(2), std::bit_or<>()) == 2);
+ static_assert(reduce(simd::vec<int, 7>(2), std::bit_xor<>()) == 2);
+#ifndef AVOID_BROKEN_CLANG_FAILURES
+ static_assert(reduce(simd::vec<int, 4>(2), simd::mask<int, 4>(false)) == 0);
+ static_assert(reduce(simd::vec<int, 4>(2), simd::mask<int, 4>(false), std::multiplies<>()) == 1);
+ static_assert(reduce(simd::vec<int, 4>(2), simd::mask<int, 4>(false), std::bit_and<>()) == ~0);
+ static_assert(reduce(simd::vec<int, 4>(2), simd::mask<int, 4>(false), [](auto a, auto b) {
+ return select(a < b, a, b);
+ }, __INT_MAX__) == __INT_MAX__);
+#endif
+
+ template <typename BinaryOperation>
+ concept masked_reduce_works = requires(simd::vec<int, 4> a, simd::vec<int, 4> b) {
+ reduce(a, a < b, BinaryOperation());
+ };
+
+ static_assert(!masked_reduce_works<std::minus<>>);
+}
+
+// mask reductions ///////////////////
+
+static_assert(all_of(simd::vec<float>() == simd::vec<float>()));
+static_assert(any_of(simd::vec<float>() == simd::vec<float>()));
+static_assert(!none_of(simd::vec<float>() == simd::vec<float>()));
+static_assert(reduce_count(simd::vec<float>() == simd::vec<float>()) == simd::vec<float>::size);
+static_assert(reduce_min_index(simd::vec<float>() == simd::vec<float>()) == 0);
+static_assert(reduce_max_index(simd::vec<float>() == simd::vec<float>()) == simd::vec<float>::size - 1);
+
+// chunk ////////////////////////
+
+static_assert([] {
+ constexpr auto a = simd::vec<int, 8>([] (int i) { return i; });
+ auto a4 = chunk<simd::vec<int, 4>>(a);
+ auto a3 = chunk<simd::vec<int, 3>>(a);
+ auto a3_ = chunk<3>(a);
+ return a4.size() == 2 && std::same_as<decltype(a4), std::array<simd::vec<int, 4>, 2>>
+ && std::tuple_size_v<decltype(a3)> == 3
+ && all_of(std::get<0>(a3) == simd::vec<int, 3>([] (int i) { return i; }))
+ && all_of(std::get<1>(a3) == simd::vec<int, 3>([] (int i) { return i + 3; }))
+ && all_of(std::get<2>(a3) == simd::vec<int, 2>([] (int i) { return i + 6; }))
+ && std::same_as<decltype(a3), decltype(a3_)>
+ && all_of(std::get<0>(a3) == std::get<0>(a3_));
+}());
+
+static_assert([] {
+ constexpr simd::mask<int, 8> a([] (int i) -> bool { return i & 1; });
+ auto a4 = chunk<simd::mask<int, 4>>(a);
+ auto a3 = chunk<simd::mask<int, 3>>(a);
+ auto a3_ = chunk<3>(a);
+ return a4.size() == 2 && std::same_as<decltype(a4), std::array<simd::mask<int, 4>, 2>>
+ && std::tuple_size_v<decltype(a3)> == 3
+ && all_of(std::get<0>(a3) == simd::mask<int, 3>(
+ [] (int i) -> bool { return i & 1; }))
+ && all_of(std::get<1>(a3) == simd::mask<int, 3>(
+ [] (int i) -> bool { return (i + 3) & 1; }))
+ && all_of(std::get<2>(a3) == simd::mask<int, 2>(
+ [] (int i) -> bool { return (i + 6) & 1; }))
+ && std::same_as<decltype(a3), decltype(a3_)>
+ && all_of(std::get<0>(a3) == std::get<0>(a3_));
+}());
+
+// cat ///////////////////////////
+
+static_assert(all_of(simd::cat(simd::__iota<simd::vec<int, 3>>, simd::vec<int, 1>(3))
+ == simd::__iota<simd::vec<int, 4>>));
+
+static_assert(all_of(simd::cat(simd::__iota<simd::vec<int, 4>>, simd::__iota<simd::vec<int, 4>> + 4)
+ == simd::__iota<simd::vec<int, 8>>));
+
+static_assert(all_of(simd::cat(simd::__iota<simd::vec<double, 4>>, simd::__iota<simd::vec<double, 2>> + 4)
+ == simd::__iota<simd::vec<double, 6>>));
+
+static_assert(all_of(simd::cat(simd::__iota<simd::vec<double, 4>>, simd::__iota<simd::vec<double, 4>> + 4)
+ == simd::__iota<simd::vec<double, 8>>));
+
+// select ////////////////////////
+
+#ifndef AVOID_BROKEN_CLANG_FAILURES
+static_assert(all_of(simd::vec<long long, 8>(std::array{0, 0, 0, 0, 4, 4, 4, 4})
+ == select(simd::__iota<simd::vec<double, 8>> < 4, 0ll, 4ll)));
+
+static_assert(all_of(simd::vec<int, 8>(std::array{0, 0, 0, 0, 4, 4, 4, 4})
+ == select(simd::__iota<simd::vec<float, 8>> < 4.f, 0, 4)));
+#endif
+
+// permute ////////////////////////
+
+namespace permutations
+{
+ struct _DuplicateEven
+ {
+ consteval unsigned
+ operator()(unsigned __i) const
+ { return __i & ~1u; }
+ };
+
+ inline constexpr _DuplicateEven duplicate_even {};
+
+ struct _DuplicateOdd
+ {
+ consteval unsigned
+ operator()(unsigned __i) const
+ { return __i | 1u; }
+ };
+
+ inline constexpr _DuplicateOdd duplicate_odd {};
+
+ template <unsigned _Np>
+ struct _SwapNeighbors
+ {
+ consteval unsigned
+ operator()(unsigned __i, unsigned __size) const
+ {
+ if (__size % (2 * _Np) != 0)
+          __builtin_abort(); // swap_neighbors<N> permutation requires a multiple of 2N elements
+ else if (std::has_single_bit(_Np))
+ return __i ^ _Np;
+ else if (__i % (2 * _Np) >= _Np)
+ return __i - _Np;
+ else
+ return __i + _Np;
+ }
+ };
+
+ template <unsigned _Np = 1u>
+ inline constexpr _SwapNeighbors<_Np> swap_neighbors {};
+
+ template <int _Position>
+ struct _Broadcast
+ {
+ consteval int
+ operator()(int, int __size) const
+ { return _Position < 0 ? __size + _Position : _Position; }
+ };
+
+ template <int _Position>
+ inline constexpr _Broadcast<_Position> broadcast {};
+
+ inline constexpr _Broadcast<0> broadcast_first {};
+
+ inline constexpr _Broadcast<-1> broadcast_last {};
+
+ struct _Reverse
+ {
+ consteval int
+ operator()(int __i, int __size) const
+ { return __size - 1 - __i; }
+ };
+
+ inline constexpr _Reverse reverse {};
+
+ template <int _Offset>
+ struct _Rotate
+ {
+ consteval int
+ operator()(int __i, int __size) const
+ {
+ __i += _Offset;
+ __i %= __size;
+ if (__i < 0)
+ __i += __size;
+ return __i;
+ }
+ };
+
+ template <int _Offset>
+ inline constexpr _Rotate<_Offset> rotate {};
+
+ template <int _Offset>
+ struct _Shift
+ {
+ consteval int
+ operator()(int __i, int __size) const
+ {
+ const int __j = __i + _Offset;
+ if (__j >= __size || -__j > __size)
+ return simd::zero_element;
+ else if (__j < 0)
+ return __size + __j;
+ else
+ return __j;
+ }
+ };
+
+ template <int _Offset>
+ inline constexpr _Shift<_Offset> shift {};
+}
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::duplicate_even)
+ == simd::__iota<simd::vec<int>> / 2 * 2));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::duplicate_odd)
+ == simd::__iota<simd::vec<int>> / 2 * 2 + 1));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::swap_neighbors<1>)
+ == simd::vec<int>([](int i) { return i ^ 1; })));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int, 8>>,
+ permutations::swap_neighbors<2>)
+ == simd::vec<int, 8>(std::array{2, 3, 0, 1, 6, 7, 4, 5})));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int, 12>>,
+ permutations::swap_neighbors<3>)
+ == simd::vec<int, 12>(
+ std::array{3, 4, 5, 0, 1, 2, 9, 10, 11, 6, 7, 8})));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::broadcast<1>)
+ == simd::vec<int>(1)));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::broadcast_first)
+ == simd::vec<int>(0)));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::broadcast_last)
+ == simd::vec<int>(int(simd::vec<int>::size() - 1))));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::reverse)
+ == simd::vec<int>([](int i) { return int(simd::vec<int>::size()) - 1 - i; })));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::rotate<1>)
+ == (simd::__iota<simd::vec<int>> + 1) % int(simd::vec<int>::size())));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int>>, permutations::rotate<2>)
+ == (simd::__iota<simd::vec<int>> + 2) % int(simd::vec<int>::size())));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int, 7>>, permutations::rotate<2>)
+ == simd::vec<int, 7>(std::array {2, 3, 4, 5, 6, 0, 1})));
+
+static_assert(
+ all_of(simd::permute(simd::__iota<simd::vec<int, 7>>, permutations::rotate<-2>)
+ == simd::vec<int, 7>(std::array {5, 6, 0, 1, 2, 3, 4}))); // { dg-prune-output "Wpsabi" }
--- /dev/null
+// { dg-do compile { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#define _GLIBCXX_SIMD_THROW_ON_BAD_VALUE 1
+
+#include <bits/simd_details.h>
+#include <bits/simd_flags.h>
+#include <stdfloat>
+
+namespace simd = std::simd;
+
+using std::float16_t;
+using std::float32_t;
+using std::float64_t;
+
+using namespace std::simd;
+
+void test()
+{
+ template for (auto t : {float(), double(), float16_t(), float32_t(), float64_t()})
+ {
+ using T = decltype(t);
+ static_assert(__vectorizable<T>);
+ }
+
+ static_assert(!__vectorizable<const float>);
+ static_assert(!__vectorizable<float&>);
+ static_assert(!__vectorizable<std::bfloat16_t>);
+
+ template for (constexpr int N : {1, 2, 4, 8})
+ {
+ static_assert(std::signed_integral<__integer_from<N>>);
+ static_assert(sizeof(__integer_from<N>) == N);
+ static_assert(__vectorizable<__integer_from<N>>);
+ }
+
+ static_assert(__div_ceil(5, 3) == 2);
+
+ static_assert(sizeof(_Bitmask<3>) == 1);
+ static_assert(sizeof(_Bitmask<30>) == 4);
+
+ static_assert(__scalar_abi_tag<_ScalarAbi<1>>);
+ static_assert(__scalar_abi_tag<_ScalarAbi<2>>);
+ static_assert(!__scalar_abi_tag<_Abi_t<1, 1>>);
+
+ static_assert(__abi_tag<_ScalarAbi<1>>);
+ static_assert(__abi_tag<_ScalarAbi<2>>);
+
+ using AN = decltype(__native_abi<float>());
+ using A1 = decltype(__native_abi<float>()._S_resize<1>());
+ static_assert(A1::_S_size == 1);
+ static_assert(A1::_S_nreg == 1);
+ static_assert(A1::_S_variant == AN::_S_variant);
+ static_assert(__scalar_abi_tag<A1> == __scalar_abi_tag<AN>);
+ static_assert(std::is_same_v<decltype(__abi_rebind<float, AN::_S_size, A1>()), AN>);
+ if constexpr (AN::_S_size >= 2) // the target has SIMD support for float
+ {
+ {
+ using A2 = decltype(__abi_rebind<float, 2, AN>());
+ static_assert(A2::_S_size == 2);
+ static_assert(A2::_S_nreg == 1);
+ static_assert(A2::_S_variant == AN::_S_variant);
+ using A2x = decltype(__abi_rebind<float, 2, decltype(__abi_rebind<float, 1, A2>())>());
+ static_assert(std::is_same_v<A2, A2x>);
+ }
+ using A4 = decltype(__abi_rebind<float, 4, AN>());
+ static_assert(A4::_S_size == 4);
+ }
+
+ static_assert(__streq_to_1("1"));
+ static_assert(!__streq_to_1(""));
+ static_assert(!__streq_to_1(nullptr));
+ static_assert(!__streq_to_1("0"));
+ static_assert(!__streq_to_1("1 "));
+
+ static_assert(__static_sized_range<int[4]>);
+ static_assert(__static_sized_range<int[4], 4>);
+ static_assert(__static_sized_range<std::array<int, 4>, 4>);
+
+ static_assert( __value_preserving_convertible_to<int, double>);
+ static_assert(!__value_preserving_convertible_to<int, float>);
+ static_assert( __value_preserving_convertible_to<float, double>);
+ static_assert(!__value_preserving_convertible_to<double, float>);
+
+ static_assert(__explicitly_convertible_to<float, float16_t>);
+ static_assert(__explicitly_convertible_to<long, float16_t>);
+
+ static_assert(__constexpr_wrapper_like<std::constant_wrapper<2>>);
+ static_assert(__constexpr_wrapper_like<std::integral_constant<int, 1>>);
+
+ static_assert(!__broadcast_constructible<int, float>);
+ static_assert(!__broadcast_constructible<int&, float>);
+ static_assert(!__broadcast_constructible<int&&, float>);
+ static_assert(!__broadcast_constructible<const int&, float>);
+ static_assert(!__broadcast_constructible<const int, float>);
+
+ static_assert(__broadcast_constructible<decltype(std::cw<2>), float>);
+ static_assert(__broadcast_constructible<decltype(std::cw<0.f>), std::float16_t>);
+
+
+ static_assert(__higher_rank_than<long, int>);
+ static_assert(__higher_rank_than<long long, long>);
+ static_assert(__higher_rank_than<int, short>);
+ static_assert(__higher_rank_than<short, char>);
+
+ static_assert(!__higher_rank_than<char, signed char>);
+ static_assert(!__higher_rank_than<signed char, char>);
+ static_assert(!__higher_rank_than<char, unsigned char>);
+ static_assert(!__higher_rank_than<unsigned char, char>);
+
+ static_assert(__higher_rank_than<unsigned int, short>);
+ static_assert(__higher_rank_than<unsigned long, int>);
+ static_assert(__higher_rank_than<unsigned long long, long>);
+
+ static_assert(__higher_rank_than<float, float16_t>);
+ static_assert(__higher_rank_than<float32_t, float>);
+ static_assert(__higher_rank_than<double, float32_t>);
+ static_assert(__higher_rank_than<double, float>);
+ static_assert(__higher_rank_than<float64_t, float32_t>);
+ static_assert(__higher_rank_than<float64_t, float>);
+ static_assert(__higher_rank_than<float64_t, double>);
+
+ static_assert(__loadstore_convertible_to<float, double>);
+ static_assert(__loadstore_convertible_to<int, double>);
+ static_assert(!__loadstore_convertible_to<int, float>);
+ static_assert(!__loadstore_convertible_to<int, float, __aligned_flag>);
+ static_assert(__loadstore_convertible_to<int, float, __convert_flag>);
+ static_assert(__loadstore_convertible_to<int, float, __aligned_flag, __convert_flag>);
+
+ static_assert(__mask_element_size<basic_mask<4>> == 4);
+
+ static_assert(__highest_bit(0b1000u) == 3);
+ static_assert(__highest_bit(0b10000001000ull) == 10);
+}
+
+consteval bool
+throws(auto f)
+{
+ try { f(); }
+ catch (...) { return true; }
+ return false;
+}
+
+static_assert(!throws([] { __value_preserving_cast<float>(1); }));
+static_assert(!throws([] { __value_preserving_cast<float>(1.5); }));
+static_assert(throws([] { __value_preserving_cast<float>(0x5EAF00D); }));
+static_assert(throws([] { __value_preserving_cast<unsigned>(-1); }));
+static_assert(!throws([] { __value_preserving_cast<unsigned short>(0xffff); }));
+static_assert(throws([] { __value_preserving_cast<unsigned short>(0x10000); }));
+
+static_assert(__converts_trivially<int, unsigned>);
+#if __SIZEOF_LONG__ == __SIZEOF_LONG_LONG__
+static_assert(__converts_trivially<long long, long>);
+#elif __SIZEOF_INT__ == __SIZEOF_LONG__
+static_assert(__converts_trivially<int, long>);
+#endif
+static_assert(__converts_trivially<float, float32_t>);
+
+static_assert([] {
+ bool to_find[10] = {0, 1, 1, 1, 0, 1, 0, 0, 1};
+ __bit_foreach(0b100101110u, [&](int i) {
+ if (!to_find[i]) throw false;
+ to_find[i] = false;
+ });
+ for (bool b : to_find)
+ if (b)
+ return false;
+ return true;
+}());
+
+// flags ////////////////////////
+static_assert(std::is_same_v<decltype(flag_default | flag_default), flags<>>);
+static_assert(std::is_same_v<decltype(flag_convert | flag_default), flags<__convert_flag>>);
+static_assert(std::is_same_v<decltype(flag_convert | flag_convert), flags<__convert_flag>>);
+static_assert(std::is_same_v<decltype(flag_aligned | flag_convert),
+ flags<__aligned_flag, __convert_flag>>);
+static_assert(std::is_same_v<decltype(flag_aligned | flag_convert | flag_aligned),
+ flags<__aligned_flag, __convert_flag>>);
+static_assert(std::is_same_v<decltype(flag_aligned | (flag_convert | flag_aligned)),
+ flags<__aligned_flag, __convert_flag>>);
+
+static_assert(!flag_default._S_test(flag_convert));
+static_assert(flag_convert._S_test(flag_convert));
+static_assert(!flag_convert._S_test(flag_aligned));
+static_assert((flag_overaligned<32> | flag_convert | flag_aligned)._S_test(flag_convert));
--- /dev/null
+// { dg-do compile { target c++26 } }
+// { dg-require-effective-target x86 }
+
+#include <simd>
+#include <stdfloat>
+
+namespace simd = std::simd;
+
+// vec.math ///////////////////////////////////////
+
+namespace math_tests
+{
+ using simd::__deduced_vec_t;
+ using simd::__math_floating_point;
+ using std::is_same_v;
+
+ using vf2 = simd::vec<float, 2>;
+ using vf4 = simd::vec<float, 4>;
+
+ template <typename T0, typename T1>
+ concept has_common_type = requires { typename std::common_type<T0, T1>::type; };
+
+ template <typename T>
+ concept has_deduced_vec = requires { typename simd::__deduced_vec_t<T>; };
+
+ static_assert(!has_common_type<vf2, vf4>);
+ static_assert( has_common_type<int, vf2>);
+
+ template <typename T, bool Strict = false>
+ struct holder
+ {
+ T value;
+
+ constexpr
+ operator const T&() const
+ { return value; }
+
+ template <typename U>
+ requires (!std::same_as<T, U>) && Strict
+ operator U() const = delete;
+ };
+
+  // The next one always has a common_type because the UDT satisfies convertible_to<float> and
+  // is not an arithmetic type:
+ static_assert( has_common_type<holder<int>, vf2>);
+
+ // It's up to the UDT to constrain itself better:
+ static_assert(!has_common_type<holder<int, true>, vf2>);
+
+ // However, a strict UDT can still work
+ static_assert( has_common_type<holder<float, true>, vf2>);
+
+  // Except when any kind of conversion is required, even a value-preserving one. Again, the
+  // semantics are whatever the UDT defines.
+ static_assert(!has_common_type<holder<short, true>, vf2>);
+
+ static_assert(!has_deduced_vec<int>);
+ static_assert(!__math_floating_point<int>);
+ static_assert(!__math_floating_point<float>);
+ static_assert(!__math_floating_point<simd::vec<int>>);
+ static_assert( __math_floating_point<simd::vec<float>>);
+}