From: James Chesterman <James.Chesterman@arm.com>
Date: Fri, 28 Nov 2025 11:18:51 +0000 (+0000)
Subject: aarch64: Optimise AdvSIMD acoshf
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0e80864c07ffb549557e3c317b6b5326444da95e;p=thirdparty%2Fglibc.git

aarch64: Optimise AdvSIMD acoshf

Optimise AdvSIMD acoshf by vectorising the special case.
For values greater than 0x1p64, scale the input down first.
This is because the output will overflow with inputs greater than
or equal to this value as there is a squaring operation in the
algorithm.
To scale, do:
2acosh(sqrt[(x+1)/2])
Because:
acosh(x) = 1/2acosh(2x^2 - 1) for x>=1.
Apply opposite operations in opposite order for x, and you get:
acosh(x) = 2acosh(sqrt[(x+1)/2]).

R.Throughput difference on V2 with GCC@15:
30-49% improvement in special cases.
2% regression in fast pass.
---

diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 7c3f590723..1d0ff5ad19 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -19,21 +19,66 @@
 
 #include "v_log1pf_inline.h"
 
-#define SquareLim 0x1p64
-
 const static struct data
 {
   struct v_log1pf_data log1pf_consts;
   uint32x4_t one;
-} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+  uint16x4_t special_bound_u16;
+  uint32x4_t special_bound_u32;
+  float32x4_t pinf, nan;
+} data = {
+  .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+  .one = V4 (0x3f800000),
+  .special_bound_u16 = V4 (0x2000),
+  /* asuint(sqrt(FLT_MAX)) - asuint(1).  */
+  .special_bound_u32 = V4 (0x20000000),
+  .pinf = V4 (INFINITY),
+  .nan = V4 (NAN),
+};
 
-#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)).  */
+static inline float32x4_t VPCS_ATTR
+inline_acoshf (float32x4_t x, const struct data *d)
+{
+  /* acosh(x) = ln(x + sqrt[x^2 -1]).
+    So acosh(x) = log1p (x + sqrt[x^2 - 1] - 1).  */
+  float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+  float32x4_t u
+      = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
 
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
-	      const struct v_log1pf_data *d)
+  float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+  return log1pf_inline (y, &d->log1pf_consts);
+}
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, const struct data *d)
 {
-  return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+  uint32x4_t special = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), d->special_bound_u32);
+
+  /* To avoid the overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we
+    reduce the input of acosh to a narrower interval by relying on the identity
+    acosh(t) = 1/2acosh(2t^2 - 1) for t>=1.
+    If we set t=sqrt((x+1)/2), since x>=1 then t>=sqrt(2/2)=1, and therefore
+    acosh(x) = 2acosh(sqrt((x+1)/2)).  */
+  float32x4_t r = vaddq_f32 (x, vreinterpretq_f32_u32 (d->one));
+  r = vmulq_f32 (r, v_f32 (0.5f));
+  r = vbslq_f32 (special, vsqrtq_f32 (r), x);
+
+  float32x4_t y = inline_acoshf (r, d);
+
+  y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y);
+
+  /* Check whether x is less than 1, or x is inf or nan.  */
+  uint32x4_t inf_minus_one
+      = vsubq_u32 (vreinterpretq_u32_f32 (d->pinf), d->one);
+  uint32x4_t is_infnan = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), inf_minus_one);
+
+  y = vbslq_f32 (is_infnan, d->nan, y);
+  uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf);
+  y = vbslq_f32 (ret_pinf, d->pinf, y);
+  return y;
 }
 
 /* Vector approximation for single-precision acosh, based on log1p.
@@ -41,21 +86,19 @@ special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
    _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
 				 want 0x1.fdcdd2p-5.  */
 
-VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
-
-  float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
-  float32x4_t u
-      = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
-
-  float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+  /* Inputs greater than or equal to special_bound will cause the output to
+    overflow. This is because there is a square operation in log1pf_inline.
+    This also captures inf, nan and any input less than or equal to 1.  */
+  uint16x4_t special
+      = vcge_u16 (vsubhn_u32 (ix, d->one), d->special_bound_u16);
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, y, special, &d->log1pf_consts);
-  return log1pf_inline (y, &d->log1pf_consts);
+    return special_case (x, d);
+  return inline_acoshf (x, d);
 }
 libmvec_hidden_def (V_NAME_F1 (acosh))
 HALF_WIDTH_ALIAS_F1 (acosh)