]>
git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/aarch64/fpu/atan_advsimd.c
1 /* Double-precision AdvSIMD inverse tan
3 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 #include "poly_advsimd_f64.h"
23 static const struct data
25 float64x2_t pi_over_2
;
28 /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
30 .poly
= { V2 (-0x1.5555555555555p
-2), V2 (0x1.99999999996c1p
-3),
31 V2 (-0x1.2492492478f88p
-3), V2 (0x1.c71c71bc3951cp
-4),
32 V2 (-0x1.745d160a7e368p
-4), V2 (0x1.3b139b6a88ba1p
-4),
33 V2 (-0x1.11100ee084227p
-4), V2 (0x1.e1d0f9696f63bp
-5),
34 V2 (-0x1.aebfe7b418581p
-5), V2 (0x1.842dbe9b0d916p
-5),
35 V2 (-0x1.5d30140ae5e99p
-5), V2 (0x1.338e31eb2fbbcp
-5),
36 V2 (-0x1.00e6eece7de8p
-5), V2 (0x1.860897b29e5efp
-6),
37 V2 (-0x1.0051381722a59p
-6), V2 (0x1.14e9dc19a4a4ep
-7),
38 V2 (-0x1.d0062b42fe3bfp
-9), V2 (0x1.17739e210171ap
-10),
39 V2 (-0x1.ab24da7be7402p
-13), V2 (0x1.358851160a528p
-16), },
40 .pi_over_2
= V2 (0x1.921fb54442d18p
+0),
43 #define SignMask v_u64 (0x8000000000000000)
44 #define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
45 #define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
47 /* Fast implementation of vector atan.
48 Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
49 z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
50 _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
51 want 0x1.9225645bdd7c3p-1. */
52 float64x2_t VPCS_ATTR
V_NAME_D1 (atan
) (float64x2_t x
)
54 const struct data
*d
= ptr_barrier (&data
);
56 /* Small cases, infs and nans are supported by our approximation technique,
57 but do not set fenv flags correctly. Only trigger special case if we need
59 uint64x2_t ix
= vreinterpretq_u64_f64 (x
);
60 uint64x2_t sign
= vandq_u64 (ix
, SignMask
);
63 uint64x2_t ia12
= vandq_u64 (ix
, v_u64 (0x7ff0000000000000));
64 uint64x2_t special
= vcgtq_u64 (vsubq_u64 (ia12
, v_u64 (TinyBound
)),
65 v_u64 (BigBound
- TinyBound
));
66 /* If any lane is special, fall back to the scalar routine for all lanes. */
67 if (__glibc_unlikely (v_any_u64 (special
)))
68 return v_call_f64 (atan
, x
, v_f64 (0), v_u64 (-1));
71 /* Argument reduction:
72 y := arctan(x) for x < 1
73 y := pi/2 + arctan(-1/x) for x > 1
74 Hence, use z=-1/a if x>=1, otherwise z=a. */
75 uint64x2_t red
= vcagtq_f64 (x
, v_f64 (1.0));
76 /* Avoid dependency in abs(x) in division (and comparison). */
77 float64x2_t z
= vbslq_f64 (red
, vdivq_f64 (v_f64 (1.0), x
), x
);
78 float64x2_t shift
= vreinterpretq_f64_u64 (
79 vandq_u64 (red
, vreinterpretq_u64_f64 (d
->pi_over_2
)));
80 /* Use absolute value only when needed (odd powers of z). */
81 float64x2_t az
= vbslq_f64 (
82 SignMask
, vreinterpretq_f64_u64 (vandq_u64 (SignMask
, red
)), z
);
84 /* Calculate the polynomial approximation.
85 Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
86 full scheme to avoid underflow in x^16.
87 The order 19 polynomial P approximates
88 (atan(sqrt(x))-sqrt(x))/x^(3/2). */
89 float64x2_t z2
= vmulq_f64 (z
, z
);
90 float64x2_t x2
= vmulq_f64 (z2
, z2
);
91 float64x2_t x4
= vmulq_f64 (x2
, x2
);
92 float64x2_t x8
= vmulq_f64 (x4
, x4
);
94 = vfmaq_f64 (v_estrin_7_f64 (z2
, x2
, x4
, d
->poly
),
95 v_estrin_11_f64 (z2
, x2
, x4
, x8
, d
->poly
+ 8), x8
);
97 /* Finalize. y = shift + z + z^3 * P(z^2). */
98 y
= vfmaq_f64 (az
, y
, vmulq_f64 (z2
, az
));
99 y
= vaddq_f64 (y
, shift
);
101 /* y = atan(x) if x>0, -atan(-x) otherwise. */
102 y
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y
), sign
));