]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/aarch64/fpu/cosf_sve.c
577cbd864e2ecf5bae0c616c5140d98a19ee3132
[thirdparty/glibc.git] / sysdeps / aarch64 / fpu / cosf_sve.c
1 /* Single-precision vector (SVE) cos function.
2
3 Copyright (C) 2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 #include "sv_math.h"
21
22 static const struct data
23 {
24 float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
25 } data = {
26 /* Polynomial coefficients are hard-wired in FTMAD instructions. */
27 .neg_pio2_1 = -0x1.921fb6p+0f,
28 .neg_pio2_2 = 0x1.777a5cp-25f,
29 .neg_pio2_3 = 0x1.ee59dap-50f,
30 .inv_pio2 = 0x1.45f306p-1f,
31 /* Original shift used in AdvSIMD cosf,
32 plus a contribution to set the bit #0 of q
33 as expected by trigonometric instructions. */
34 .shift = 0x1.800002p+23f
35 };
36
37 #define RangeVal 0x49800000 /* asuint32(0x1p20f). */
38
39 static svfloat32_t NOINLINE
40 special_case (svfloat32_t x, svfloat32_t y, svbool_t out_of_bounds)
41 {
42 return sv_call_f32 (cosf, x, y, out_of_bounds);
43 }
44
45 /* A fast SVE implementation of cosf based on trigonometric
46 instructions (FTMAD, FTSSEL, FTSMUL).
47 Maximum measured error: 2.06 ULPs.
48 SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
49 want 0x1.fffe76p-6. */
50 svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
51 {
52 const struct data *d = ptr_barrier (&data);
53
54 svfloat32_t r = svabs_f32_x (pg, x);
55 svbool_t out_of_bounds
56 = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (r), RangeVal);
57
58 /* Load some constants in quad-word chunks to minimise memory access. */
59 svfloat32_t negpio2_and_invpio2
60 = svld1rq_f32 (svptrue_b32 (), &d->neg_pio2_1);
61
62 /* n = rint(|x|/(pi/2)). */
63 svfloat32_t q
64 = svmla_lane_f32 (sv_f32 (d->shift), r, negpio2_and_invpio2, 3);
65 svfloat32_t n = svsub_n_f32_x (pg, q, d->shift);
66
67 /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
68 r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 0);
69 r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 1);
70 r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 2);
71
72 /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
73 svfloat32_t f = svtssel_f32 (r, svreinterpret_u32_f32 (q));
74
75 /* cos(r) poly approx. */
76 svfloat32_t r2 = svtsmul_f32 (r, svreinterpret_u32_f32 (q));
77 svfloat32_t y = sv_f32 (0.0f);
78 y = svtmad_f32 (y, r2, 4);
79 y = svtmad_f32 (y, r2, 3);
80 y = svtmad_f32 (y, r2, 2);
81 y = svtmad_f32 (y, r2, 1);
82 y = svtmad_f32 (y, r2, 0);
83
84 /* Apply factor. */
85 y = svmul_f32_x (pg, f, y);
86
87 if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
88 return special_case (x, y, out_of_bounds);
89 return y;
90 }