1 /* Single-precision vector (SVE) cos function.
3 Copyright (C) 2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 static const struct data
24 float neg_pio2_1
, neg_pio2_2
, neg_pio2_3
, inv_pio2
, shift
;
26 /* Polynomial coefficients are hard-wired in FTMAD instructions. */
27 .neg_pio2_1
= -0x1.921fb6p
+0f
,
28 .neg_pio2_2
= 0x1.777a5cp
-25f
,
29 .neg_pio2_3
= 0x1.ee59dap
-50f
,
30 .inv_pio2
= 0x1.45f306p
-1f
,
31 /* Original shift used in AdvSIMD cosf,
32 plus a contribution to set the bit #0 of q
33 as expected by trigonometric instructions. */
34 .shift
= 0x1.800002p
+23f
37 #define RangeVal 0x49800000 /* asuint32(0x1p20f). */
39 static svfloat32_t NOINLINE
40 special_case (svfloat32_t x
, svfloat32_t y
, svbool_t out_of_bounds
)
42 return sv_call_f32 (cosf
, x
, y
, out_of_bounds
);
45 /* A fast SVE implementation of cosf based on trigonometric
46 instructions (FTMAD, FTSSEL, FTSMUL).
47 Maximum measured error: 2.06 ULPs.
48 SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
49 want 0x1.fffe76p-6. */
50 svfloat32_t
SV_NAME_F1 (cos
) (svfloat32_t x
, const svbool_t pg
)
52 const struct data
*d
= ptr_barrier (&data
);
54 svfloat32_t r
= svabs_f32_x (pg
, x
);
55 svbool_t out_of_bounds
56 = svcmpge_n_u32 (pg
, svreinterpret_u32_f32 (r
), RangeVal
);
58 /* Load some constants in quad-word chunks to minimise memory access. */
59 svfloat32_t negpio2_and_invpio2
60 = svld1rq_f32 (svptrue_b32 (), &d
->neg_pio2_1
);
62 /* n = rint(|x|/(pi/2)). */
64 = svmla_lane_f32 (sv_f32 (d
->shift
), r
, negpio2_and_invpio2
, 3);
65 svfloat32_t n
= svsub_n_f32_x (pg
, q
, d
->shift
);
67 /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
68 r
= svmla_lane_f32 (r
, n
, negpio2_and_invpio2
, 0);
69 r
= svmla_lane_f32 (r
, n
, negpio2_and_invpio2
, 1);
70 r
= svmla_lane_f32 (r
, n
, negpio2_and_invpio2
, 2);
72 /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
73 svfloat32_t f
= svtssel_f32 (r
, svreinterpret_u32_f32 (q
));
75 /* cos(r) poly approx. */
76 svfloat32_t r2
= svtsmul_f32 (r
, svreinterpret_u32_f32 (q
));
77 svfloat32_t y
= sv_f32 (0.0f
);
78 y
= svtmad_f32 (y
, r2
, 4);
79 y
= svtmad_f32 (y
, r2
, 3);
80 y
= svtmad_f32 (y
, r2
, 2);
81 y
= svtmad_f32 (y
, r2
, 1);
82 y
= svtmad_f32 (y
, r2
, 0);
85 y
= svmul_f32_x (pg
, f
, y
);
87 if (__glibc_unlikely (svptest_any (pg
, out_of_bounds
)))
88 return special_case (x
, y
, out_of_bounds
);