// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;
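    // (16 is the size of a NEON Q register in bytes; the _Base
    // implementation uses this limit to split larger stores into
    // register-sized pieces.)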

    // _S_masked_load {{{
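    // NEON has no masked-load instruction, so the load is done element by
    // element, overwriting __merge only where the mask __k is set.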
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
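    // Strategy: a 16-byte register is reduced to 8 bytes by splitting it
    // into two halves and combining them with __binary_op; the 8-byte case
    // then finishes in log2(_Np) permute-and-combine steps. Worked example
    // (illustrative sketch only) for _Np == 4 with plus<>:
    //   step 1: {a,b,c,d} + permute<1,0,3,2> = {a+b, a+b, c+d, c+d}
    //   step 2: ... + permute<3,2,1,0>       = a+b+c+d in every lane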
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                      && !_Abi::template _S_is_partial<_Tp>)
          {
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                  __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                  __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                  __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
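    // The vsqrt intrinsics exist only in the A64 instruction set; on 32-bit
    // ARM the generic _Base fallback is used.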
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
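    // vrnd (round toward zero) requires ARMv8 (A32). The pre-ARMv8 float
    // fallback truncates via a round trip through int32. That conversion is
    // valid only for |x| < 2^23: every float of magnitude >= 2^23 is already
    // integral (and could overflow int32), so those lanes keep __x unchanged.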
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
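    // vrnda rounds to nearest with ties away from zero, the semantics
    // required for std::round.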
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
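    // vrndm rounds toward negative infinity, i.e. floor semantics.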
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
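    // vrndp rounds toward positive infinity, i.e. ceil semantics.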
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
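
      // Conversion idea: every true mask lane is all-ones, so ANDing with
      // __bitsel (lane __i holds 1 << __i, wrapping at bit 8 for 8-bit
      // lanes) leaves exactly the lane's own bit set. A chain of pairwise
      // adds (vpadd) then accumulates all lanes into lane 0, assembling the
      // final bitmask.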
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                  [&](auto __i) {
                    return static_cast<_I>(
                      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero),
                                           __zero),
                                __zero)[0];
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
              return vpadd_s32(__asint, __zero)[0];
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
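    // ORing with the complement of the implicit mask forces all padding
    // lanes to all-ones, so only active lanes can fail the test. Since mask
    // bytes are 0x00 or 0xff, both 64-bit halves equal -1 exactly when
    // their sum is -2.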
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
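    // Trick below: interpreting the (full) mask as one unsigned integer,
    // "none true" is 0 and "all true" is ~0; adding 1 maps these to 1 and 0
    // respectively, so only mixed patterns compare greater than 1.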
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                              | ~__vector_bitcast<char>(
                                _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
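    // True lanes read as -1 in signed integer lanes, so the popcount is the
    // negated sum over all lanes: fold the high 64 bits onto the low 64
    // (__hi64z yields a zero vector when the mask has no high half), then
    // pairwise-add down to a single lane.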
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80