1 /* Double-precision vector (Advanced SIMD) cos function.
3 Copyright (C) 2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 static const struct data
25 float64x2_t range_val
, shift
, inv_pi
, half_pi
, pi_1
, pi_2
, pi_3
;
27 /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
28 .poly
= { V2 (-0x1.555555555547bp
-3), V2 (0x1.1111111108a4dp
-7),
29 V2 (-0x1.a01a019936f27p
-13), V2 (0x1.71de37a97d93ep
-19),
30 V2 (-0x1.ae633919987c6p
-26), V2 (0x1.60e277ae07cecp
-33),
31 V2 (-0x1.9e9540300a1p
-41) },
32 .inv_pi
= V2 (0x1.45f306dc9c883p
-2),
33 .half_pi
= V2 (0x1.921fb54442d18p
+0),
34 .pi_1
= V2 (0x1.921fb54442d18p
+1),
35 .pi_2
= V2 (0x1.1a62633145c06p
-53),
36 .pi_3
= V2 (0x1.c1cd129024e09p
-106),
37 .shift
= V2 (0x1.8p52
),
38 .range_val
= V2 (0x1p
23)
41 #define C(i) d->poly[i]
43 static float64x2_t VPCS_ATTR NOINLINE
44 special_case (float64x2_t x
, float64x2_t y
, uint64x2_t odd
, uint64x2_t cmp
)
46 y
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y
), odd
));
47 return v_call_f64 (cos
, x
, y
, cmp
);
50 float64x2_t VPCS_ATTR
V_NAME_D1 (cos
) (float64x2_t x
)
52 const struct data
*d
= ptr_barrier (&data
);
53 float64x2_t n
, r
, r2
, r3
, r4
, t1
, t2
, t3
, y
;
58 cmp
= vcgeq_u64 (vreinterpretq_u64_f64 (r
),
59 vreinterpretq_u64_f64 (d
->range_val
));
60 if (__glibc_unlikely (v_any_u64 (cmp
)))
61 /* If fenv exceptions are to be triggered correctly, set any special lanes
62 to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
63 special-case handler later. */
64 r
= vbslq_f64 (cmp
, v_f64 (1.0), r
);
66 cmp
= vcageq_f64 (d
->range_val
, x
);
67 cmp
= vceqzq_u64 (cmp
); /* cmp = ~cmp. */
71 /* n = rint((|x|+pi/2)/pi) - 0.5. */
72 n
= vfmaq_f64 (d
->shift
, d
->inv_pi
, vaddq_f64 (r
, d
->half_pi
));
73 odd
= vshlq_n_u64 (vreinterpretq_u64_f64 (n
), 63);
74 n
= vsubq_f64 (n
, d
->shift
);
75 n
= vsubq_f64 (n
, v_f64 (0.5));
77 /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
78 r
= vfmsq_f64 (r
, d
->pi_1
, n
);
79 r
= vfmsq_f64 (r
, d
->pi_2
, n
);
80 r
= vfmsq_f64 (r
, d
->pi_3
, n
);
82 /* sin(r) poly approx. */
83 r2
= vmulq_f64 (r
, r
);
84 r3
= vmulq_f64 (r2
, r
);
85 r4
= vmulq_f64 (r2
, r2
);
87 t1
= vfmaq_f64 (C (4), C (5), r2
);
88 t2
= vfmaq_f64 (C (2), C (3), r2
);
89 t3
= vfmaq_f64 (C (0), C (1), r2
);
91 y
= vfmaq_f64 (t1
, C (6), r4
);
92 y
= vfmaq_f64 (t2
, y
, r4
);
93 y
= vfmaq_f64 (t3
, y
, r4
);
94 y
= vfmaq_f64 (r
, y
, r3
);
96 if (__glibc_unlikely (v_any_u64 (cmp
)))
97 return special_case (x
, y
, odd
, cmp
);
98 return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y
), odd
));