// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2024 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.
25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
26 #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
28 #if __cplusplus >= 201703L
30 #if !_GLIBCXX_SIMD_HAVE_NEON
31 #error "simd_neon.h may only be included when NEON on ARM is available"
34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
36 // _CommonImplNeon {{{
37 struct _CommonImplNeon
: _CommonImplBuiltin
40 using _CommonImplBuiltin::_S_store
;
47 template <typename _Abi
, typename
>
48 struct _SimdImplNeon
: _SimdImplBuiltin
<_Abi
>
50 using _Base
= _SimdImplBuiltin
<_Abi
>;
52 template <typename _Tp
>
53 using _MaskMember
= typename
_Base::template _MaskMember
<_Tp
>;
55 template <typename _Tp
>
56 static constexpr size_t _S_max_store_size
= 16;
59 template <typename _Tp
, size_t _Np
, typename _Up
>
60 static inline _SimdWrapper
<_Tp
, _Np
>
61 _S_masked_load(_SimdWrapper
<_Tp
, _Np
> __merge
, _MaskMember
<_Tp
> __k
,
62 const _Up
* __mem
) noexcept
64 __execute_n_times
<_Np
>([&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
66 __merge
._M_set(__i
, static_cast<_Tp
>(__mem
[__i
]));
72 // _S_masked_store_nocvt {{{
73 template <typename _Tp
, size_t _Np
>
74 _GLIBCXX_SIMD_INTRINSIC
static void
75 _S_masked_store_nocvt(_SimdWrapper
<_Tp
, _Np
> __v
, _Tp
* __mem
,
78 __execute_n_times
<_Np
>([&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
80 __mem
[__i
] = __v
[__i
];
86 template <typename _Tp
, typename _BinaryOperation
>
87 _GLIBCXX_SIMD_INTRINSIC
static constexpr _Tp
88 _S_reduce(simd
<_Tp
, _Abi
> __x
, _BinaryOperation
&& __binary_op
)
90 if (not __builtin_is_constant_evaluated())
92 constexpr size_t _Np
= __x
.size();
93 if constexpr (sizeof(__x
) == 16 && _Np
>= 4
94 && !_Abi::template _S_is_partial
<_Tp
>)
96 const auto __halves
= split
<simd
<_Tp
, simd_abi::_Neon
<8>>>(__x
);
97 const auto __y
= __binary_op(__halves
[0], __halves
[1]);
98 return _SimdImplNeon
<simd_abi::_Neon
<8>>::_S_reduce(
99 __y
, static_cast<_BinaryOperation
&&>(__binary_op
));
101 else if constexpr (_Np
== 8)
103 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
104 __vector_permute
<1, 0, 3, 2, 5, 4, 7, 6>(__x
._M_data
)));
105 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
106 __vector_permute
<3, 2, 1, 0, 7, 6, 5, 4>(__x
._M_data
)));
107 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
108 __vector_permute
<7, 6, 5, 4, 3, 2, 1, 0>(__x
._M_data
)));
111 else if constexpr (_Np
== 4)
113 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
114 __vector_permute
<1, 0, 3, 2>(__x
._M_data
)));
115 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
116 __vector_permute
<3, 2, 1, 0>(__x
._M_data
)));
119 else if constexpr (_Np
== 2)
121 __x
= __binary_op(__x
, _Base::template _M_make_simd
<_Tp
, _Np
>(
122 __vector_permute
<1, 0>(__x
._M_data
)));
126 return _Base::_S_reduce(__x
, static_cast<_BinaryOperation
&&>(__binary_op
));
132 template <typename _Tp
, typename _TVT
= _VectorTraits
<_Tp
>>
133 _GLIBCXX_SIMD_INTRINSIC
static _Tp
136 if constexpr (__have_neon_a64
)
138 const auto __intrin
= __to_intrin(__x
);
139 if constexpr (_TVT::template _S_is
<float, 2>)
140 return vsqrt_f32(__intrin
);
141 else if constexpr (_TVT::template _S_is
<float, 4>)
142 return vsqrtq_f32(__intrin
);
143 else if constexpr (_TVT::template _S_is
<double, 1>)
144 return vsqrt_f64(__intrin
);
145 else if constexpr (_TVT::template _S_is
<double, 2>)
146 return vsqrtq_f64(__intrin
);
148 __assert_unreachable
<_Tp
>();
151 return _Base::_S_sqrt(__x
);
156 template <typename _TW
, typename _TVT
= _VectorTraits
<_TW
>>
157 _GLIBCXX_SIMD_INTRINSIC
static _TW
160 using _Tp
= typename
_TVT::value_type
;
161 if constexpr (__have_neon_a32
)
163 const auto __intrin
= __to_intrin(__x
);
164 if constexpr (_TVT::template _S_is
<float, 2>)
165 return vrnd_f32(__intrin
);
166 else if constexpr (_TVT::template _S_is
<float, 4>)
167 return vrndq_f32(__intrin
);
168 else if constexpr (_TVT::template _S_is
<double, 1>)
169 return vrnd_f64(__intrin
);
170 else if constexpr (_TVT::template _S_is
<double, 2>)
171 return vrndq_f64(__intrin
);
173 __assert_unreachable
<_Tp
>();
175 else if constexpr (is_same_v
<_Tp
, float>)
177 auto __intrin
= __to_intrin(__x
);
178 if constexpr (sizeof(__x
) == 16)
179 __intrin
= vcvtq_f32_s32(vcvtq_s32_f32(__intrin
));
181 __intrin
= vcvt_f32_s32(vcvt_s32_f32(__intrin
));
182 return _Base::_S_abs(__x
)._M_data
< 0x1p
23f
183 ? __vector_bitcast
<float>(__intrin
)
187 return _Base::_S_trunc(__x
);
192 template <typename _Tp
, size_t _Np
>
193 _GLIBCXX_SIMD_INTRINSIC
static _SimdWrapper
<_Tp
, _Np
>
194 _S_round(_SimdWrapper
<_Tp
, _Np
> __x
)
196 if constexpr (__have_neon_a32
)
198 const auto __intrin
= __to_intrin(__x
);
199 if constexpr (sizeof(_Tp
) == 4 && sizeof(__x
) == 8)
200 return vrnda_f32(__intrin
);
201 else if constexpr (sizeof(_Tp
) == 4 && sizeof(__x
) == 16)
202 return vrndaq_f32(__intrin
);
203 else if constexpr (sizeof(_Tp
) == 8 && sizeof(__x
) == 8)
204 return vrnda_f64(__intrin
);
205 else if constexpr (sizeof(_Tp
) == 8 && sizeof(__x
) == 16)
206 return vrndaq_f64(__intrin
);
208 __assert_unreachable
<_Tp
>();
211 return _Base::_S_round(__x
);
216 template <typename _Tp
, typename _TVT
= _VectorTraits
<_Tp
>>
217 _GLIBCXX_SIMD_INTRINSIC
static _Tp
220 if constexpr (__have_neon_a32
)
222 const auto __intrin
= __to_intrin(__x
);
223 if constexpr (_TVT::template _S_is
<float, 2>)
224 return vrndm_f32(__intrin
);
225 else if constexpr (_TVT::template _S_is
<float, 4>)
226 return vrndmq_f32(__intrin
);
227 else if constexpr (_TVT::template _S_is
<double, 1>)
228 return vrndm_f64(__intrin
);
229 else if constexpr (_TVT::template _S_is
<double, 2>)
230 return vrndmq_f64(__intrin
);
232 __assert_unreachable
<_Tp
>();
235 return _Base::_S_floor(__x
);
240 template <typename _Tp
, typename _TVT
= _VectorTraits
<_Tp
>>
241 _GLIBCXX_SIMD_INTRINSIC
static _Tp
244 if constexpr (__have_neon_a32
)
246 const auto __intrin
= __to_intrin(__x
);
247 if constexpr (_TVT::template _S_is
<float, 2>)
248 return vrndp_f32(__intrin
);
249 else if constexpr (_TVT::template _S_is
<float, 4>)
250 return vrndpq_f32(__intrin
);
251 else if constexpr (_TVT::template _S_is
<double, 1>)
252 return vrndp_f64(__intrin
);
253 else if constexpr (_TVT::template _S_is
<double, 2>)
254 return vrndpq_f64(__intrin
);
256 __assert_unreachable
<_Tp
>();
259 return _Base::_S_ceil(__x
);
264 // _MaskImplNeonMixin {{{
265 struct _MaskImplNeonMixin
267 using _Base
= _MaskImplBuiltinMixin
;
269 template <typename _Tp
, size_t _Np
>
270 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SanitizedBitMask
<_Np
>
271 _S_to_bits(_SimdWrapper
<_Tp
, _Np
> __x
)
273 if (__builtin_is_constant_evaluated())
274 return _Base::_S_to_bits(__x
);
276 using _I
= __int_for_sizeof_t
<_Tp
>;
277 if constexpr (sizeof(__x
) == 16)
279 auto __asint
= __vector_bitcast
<_I
>(__x
);
281 [[maybe_unused
]] constexpr auto __zero
= decltype(__asint
)();
283 [[maybe_unused
]] constexpr auto __zero
= decltype(__lo64(__asint
))();
285 if constexpr (sizeof(_Tp
) == 1)
287 constexpr auto __bitsel
288 = __generate_from_n_evaluations
<16, __vector_type_t
<_I
, 16>>(
289 [&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
290 return static_cast<_I
>(
291 __i
< _Np
? (__i
< 8 ? 1 << __i
: 1 << (__i
- 8)) : 0);
295 return __vector_bitcast
<_UShort
>(
296 vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint
, __zero
), __zero
),
299 return __vector_bitcast
<_UShort
>(
300 vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint
), __hi64(__asint
)),
305 else if constexpr (sizeof(_Tp
) == 2)
307 constexpr auto __bitsel
308 = __generate_from_n_evaluations
<8, __vector_type_t
<_I
, 8>>(
309 [&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
310 return static_cast<_I
>(__i
< _Np
? 1 << __i
: 0);
314 return vaddvq_s16(__asint
);
317 vpadd_s16(vpadd_s16(__lo64(__asint
), __hi64(__asint
)), __zero
),
321 else if constexpr (sizeof(_Tp
) == 4)
323 constexpr auto __bitsel
324 = __generate_from_n_evaluations
<4, __vector_type_t
<_I
, 4>>(
325 [&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
326 return static_cast<_I
>(__i
< _Np
? 1 << __i
: 0);
330 return vaddvq_s32(__asint
);
332 return vpadd_s32(vpadd_s32(__lo64(__asint
), __hi64(__asint
)),
336 else if constexpr (sizeof(_Tp
) == 8)
337 return (__asint
[0] & 1) | (__asint
[1] & 2);
339 __assert_unreachable
<_Tp
>();
341 else if constexpr (sizeof(__x
) == 8)
343 auto __asint
= __vector_bitcast
<_I
>(__x
);
344 [[maybe_unused
]] constexpr auto __zero
= decltype(__asint
)();
345 if constexpr (sizeof(_Tp
) == 1)
347 constexpr auto __bitsel
348 = __generate_from_n_evaluations
<8, __vector_type_t
<_I
, 8>>(
349 [&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
350 return static_cast<_I
>(__i
< _Np
? 1 << __i
: 0);
354 return vaddv_s8(__asint
);
356 return vpadd_s8(vpadd_s8(vpadd_s8(__asint
, __zero
), __zero
),
360 else if constexpr (sizeof(_Tp
) == 2)
362 constexpr auto __bitsel
363 = __generate_from_n_evaluations
<4, __vector_type_t
<_I
, 4>>(
364 [&](auto __i
) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
365 return static_cast<_I
>(__i
< _Np
? 1 << __i
: 0);
369 return vaddv_s16(__asint
);
371 return vpadd_s16(vpadd_s16(__asint
, __zero
), __zero
)[0];
374 else if constexpr (sizeof(_Tp
) == 4)
376 __asint
&= __make_vector
<_I
>(0x1, 0x2);
378 return vaddv_s32(__asint
);
380 return vpadd_s32(__asint
, __zero
)[0];
384 __assert_unreachable
<_Tp
>();
387 return _Base::_S_to_bits(__x
);
393 template <typename _Abi
, typename
>
394 struct _MaskImplNeon
: _MaskImplNeonMixin
, _MaskImplBuiltin
<_Abi
>
396 using _MaskImplBuiltinMixin::_S_to_maskvector
;
397 using _MaskImplNeonMixin::_S_to_bits
;
398 using _Base
= _MaskImplBuiltin
<_Abi
>;
399 using _Base::_S_convert
;
402 template <typename _Tp
>
403 _GLIBCXX_SIMD_INTRINSIC
static bool
404 _S_all_of(simd_mask
<_Tp
, _Abi
> __k
)
407 = __vector_bitcast
<char>(__k
._M_data
)
408 | ~__vector_bitcast
<char>(_Abi::template _S_implicit_mask
<_Tp
>());
409 if constexpr (sizeof(__k
) == 16)
411 const auto __x
= __vector_bitcast
<long long>(__kk
);
412 return __x
[0] + __x
[1] == -2;
414 else if constexpr (sizeof(__k
) <= 8)
415 return __bit_cast
<__int_for_sizeof_t
<decltype(__kk
)>>(__kk
) == -1;
417 __assert_unreachable
<_Tp
>();
422 template <typename _Tp
>
423 _GLIBCXX_SIMD_INTRINSIC
static bool
424 _S_any_of(simd_mask
<_Tp
, _Abi
> __k
)
427 = __vector_bitcast
<char>(__k
._M_data
)
428 | ~__vector_bitcast
<char>(_Abi::template _S_implicit_mask
<_Tp
>());
429 if constexpr (sizeof(__k
) == 16)
431 const auto __x
= __vector_bitcast
<long long>(__kk
);
432 return (__x
[0] | __x
[1]) != 0;
434 else if constexpr (sizeof(__k
) <= 8)
435 return __bit_cast
<__int_for_sizeof_t
<decltype(__kk
)>>(__kk
) != 0;
437 __assert_unreachable
<_Tp
>();
442 template <typename _Tp
>
443 _GLIBCXX_SIMD_INTRINSIC
static bool
444 _S_none_of(simd_mask
<_Tp
, _Abi
> __k
)
446 const auto __kk
= _Abi::_S_masked(__k
._M_data
);
447 if constexpr (sizeof(__k
) == 16)
449 const auto __x
= __vector_bitcast
<long long>(__kk
);
450 return (__x
[0] | __x
[1]) == 0;
452 else if constexpr (sizeof(__k
) <= 8)
453 return __bit_cast
<__int_for_sizeof_t
<decltype(__kk
)>>(__kk
) == 0;
455 __assert_unreachable
<_Tp
>();
460 template <typename _Tp
>
461 _GLIBCXX_SIMD_INTRINSIC
static bool _S_some_of(simd_mask
<_Tp
, _Abi
> __k
)
463 if constexpr (sizeof(__k
) <= 8)
465 const auto __kk
= __vector_bitcast
<char>(__k
._M_data
)
466 | ~__vector_bitcast
<char>(
467 _Abi::template _S_implicit_mask
<_Tp
>());
468 using _Up
= make_unsigned_t
<__int_for_sizeof_t
<decltype(__kk
)>>;
469 return __bit_cast
<_Up
>(__kk
) + 1 > 1;
472 return _Base::_S_some_of(__k
);
477 template <typename _Tp
>
478 _GLIBCXX_SIMD_INTRINSIC
static int
479 _S_popcount(simd_mask
<_Tp
, _Abi
> __k
)
481 if constexpr (sizeof(_Tp
) == 1)
483 const auto __s8
= __vector_bitcast
<_SChar
>(__k
._M_data
);
484 int8x8_t __tmp
= __lo64(__s8
) + __hi64z(__s8
);
485 return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp
, int8x8_t()), int8x8_t()),
488 else if constexpr (sizeof(_Tp
) == 2)
490 const auto __s16
= __vector_bitcast
<short>(__k
._M_data
);
491 int16x4_t __tmp
= __lo64(__s16
) + __hi64z(__s16
);
492 return -vpadd_s16(vpadd_s16(__tmp
, int16x4_t()), int16x4_t())[0];
494 else if constexpr (sizeof(_Tp
) == 4)
496 const auto __s32
= __vector_bitcast
<int>(__k
._M_data
);
497 int32x2_t __tmp
= __lo64(__s32
) + __hi64z(__s32
);
498 return -vpadd_s32(__tmp
, int32x2_t())[0];
500 else if constexpr (sizeof(_Tp
) == 8)
502 static_assert(sizeof(__k
) == 16);
503 const auto __s64
= __vector_bitcast
<long>(__k
._M_data
);
504 return -(__s64
[0] + __s64
[1]);
509 // _S_find_first_set {{{
510 template <typename _Tp
>
511 _GLIBCXX_SIMD_INTRINSIC
static int
512 _S_find_first_set(simd_mask
<_Tp
, _Abi
> __k
)
514 // TODO: the _Base implementation is not optimal for NEON
515 return _Base::_S_find_first_set(__k
);
519 // _S_find_last_set {{{
520 template <typename _Tp
>
521 _GLIBCXX_SIMD_INTRINSIC
static int
522 _S_find_last_set(simd_mask
<_Tp
, _Abi
> __k
)
524 // TODO: the _Base implementation is not optimal for NEON
525 return _Base::_S_find_last_set(__k
);
531 _GLIBCXX_SIMD_END_NAMESPACE
532 #endif // __cplusplus >= 201703L
533 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
534 // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80