sysdeps/aarch64/fpu/cosf_sve.c

   1 /* Single-precision vector (SVE) cos function.
   2
   3    Copyright (C) 2023 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include "sv_math.h"
  21
  22 static const struct data
  23 {
  24   float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
  25 } data = {
  26   /* Polynomial coefficients are hard-wired in FTMAD instructions.  */
  27   .neg_pio2_1 = -0x1.921fb6p+0f,
  28   .neg_pio2_2 = 0x1.777a5cp-25f,
  29   .neg_pio2_3 = 0x1.ee59dap-50f,
  30   .inv_pio2 = 0x1.45f306p-1f,
  31   /* Original shift used in AdvSIMD cosf,
  32      plus a contribution to set the bit #0 of q
  33      as expected by trigonometric instructions.  */
  34   .shift = 0x1.800002p+23f
  35 };
  36
  37 #define RangeVal 0x49800000 /* asuint32(0x1p20f).  */
  38
  39 static svfloat32_t NOINLINE
  40 special_case (svfloat32_t x, svfloat32_t y, svbool_t out_of_bounds)
  41 {
  42   return sv_call_f32 (cosf, x, y, out_of_bounds);
  43 }
  44
  45 /* A fast SVE implementation of cosf based on trigonometric
  46    instructions (FTMAD, FTSSEL, FTSMUL).
  47    Maximum measured error: 2.06 ULPs.
  48    SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
  49                                    want 0x1.fffe76p-6.  */
  50 svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
  51 {
  52   const struct data *d = ptr_barrier (&data);
  53
  54   svfloat32_t r = svabs_f32_x (pg, x);
  55   svbool_t out_of_bounds
  56     = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (r), RangeVal);
  57
  58   /* Load some constants in quad-word chunks to minimise memory access.  */
  59   svfloat32_t negpio2_and_invpio2
  60       = svld1rq_f32 (svptrue_b32 (), &d->neg_pio2_1);
  61
  62   /* n = rint(|x|/(pi/2)).  */
  63   svfloat32_t q
  64       = svmla_lane_f32 (sv_f32 (d->shift), r, negpio2_and_invpio2, 3);
  65   svfloat32_t n = svsub_n_f32_x (pg, q, d->shift);
  66
  67   /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4).  */
  68   r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 0);
  69   r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 1);
  70   r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 2);
  71
  72   /* Final multiplicative factor: 1.0 or x depending on bit #0 of q.  */
  73   svfloat32_t f = svtssel_f32 (r, svreinterpret_u32_f32 (q));
  74
  75   /* cos(r) poly approx.  */
  76   svfloat32_t r2 = svtsmul_f32 (r, svreinterpret_u32_f32 (q));
  77   svfloat32_t y = sv_f32 (0.0f);
  78   y = svtmad_f32 (y, r2, 4);
  79   y = svtmad_f32 (y, r2, 3);
  80   y = svtmad_f32 (y, r2, 2);
  81   y = svtmad_f32 (y, r2, 1);
  82   y = svtmad_f32 (y, r2, 0);
  83
  84   /* Apply factor.  */
  85   y = svmul_f32_x (pg, f, y);
  86
  87   if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
  88     return special_case (x, y, out_of_bounds);
  89   return y;
  90 }