{
float32x4_t c0, c2, c4, c6, c8;
uint32x4_t off, offset_lower_bound;
- uint16x8_t special_bound;
+ uint32x4_t special_bound;
+ uint16x8_t special_bound_u16;
uint32x4_t mantissa_mask;
float c1, c3, c5, c7;
+ float32x4_t pinf, minf, nan;
} data = {
/* Coefficients generated using the Remez algorithm to approximate
log2(1+r)/r for r in [ -1/3, 1/3 ]. */
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the lower bound is 0x00800000 - offset (which wraps around). */
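+ /* Illustrative check of the wraparound: for the smallest subnormal,
+ asuint32 (0x1p-149f) = 0x00000001, so u_off = 0x00000001 - 0x3f2aaaab
+ wraps to 0xc0d55556, and u_off - offset_lower_bound = 0xff800001, which
+ is >= special_bound (0x7f000000), so the lane takes the special case. */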
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
- .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .special_bound = V4 (0x7f000000), /* asuint32(inf) - 0x00800000. */
+ .special_bound_u16 = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.mantissa_mask = V4 (0x007fffff),
+ .pinf = V4 (INFINITY),
+ .minf = V4 (-INFINITY),
+ .nan = V4 (NAN),
};
+static inline float32x4_t VPCS_ATTR
+inline_log2f (uint32x4_t u_off, float32x4_t n, const struct data *d)
+{
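+ /* Reconstruct 1+r in (2/3, 4/3): keep the mantissa bits of u_off and add
+ the offset back, replacing the exponent so the value lands near 1. */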
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log2(1+r) + n. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
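+ /* c1, c3, c5 and c7 are laid out contiguously in the struct, so a single
+ vld1q_f32 gathers all four scalar coefficients. The degree-8 polynomial
+ is then evaluated pairwise: each cNM = cN + cM*r, and the pairs are
+ recombined Horner-style in r2. */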
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
+ return vfmaq_f32 (n, p, r);
+}
+
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
- uint16x4_t cmp, const struct data *d)
+special_case (uint32x4_t u_off, const struct data *d)
{
- /* Fall back to scalar code. */
- return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
- vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+ float32x4_t x = vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off));
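+ /* Recompute the special mask at full width: special lanes (subnormal,
+ zero, negative, inf or nan inputs) take a sqrt first, which normalises
+ subnormals; the remaining specials are fixed up after the core runs. */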
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (u_off, d->offset_lower_bound), d->special_bound);
+ float32x4_t x_sqrt = vbslq_f32 (special, vsqrtq_f32 (x), x);
+
+ u_off = vsubq_u32 (vreinterpretq_u32_f32 (x_sqrt), d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ float32x4_t y = inline_log2f (u_off, n, d);
+
+ /* Compensate for the earlier sqrt by doubling the result,
+ since log2(x) = 2 log2(sqrt(x)). */
+ y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y);
+
+ /* True for +/-inf and +/-nan, as well as -0 and all negative numbers. */
+ uint32x4_t is_infnan
+ = vcgeq_u32 (vreinterpretq_u32_f32 (x), vreinterpretq_u32_f32 (d->pinf));
+ uint32x4_t infnan_or_zero = vorrq_u32 (is_infnan, vceqzq_f32 (x));
+
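+ /* Fix up IEEE special results: log2 (+/-0) = -inf, log2 (+inf) = +inf,
+ and log2 (x) = nan for x < 0 or x = nan. */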
+ y = vbslq_f32 (infnan_or_zero, d->nan, y);
+ uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf);
+ uint32x4_t ret_minf = vceqzq_f32 (x);
+ y = vbslq_f32 (ret_pinf, d->pinf, y);
+ y = vbslq_f32 (ret_minf, d->minf, y);
+ return y;
}
/* Fast implementation for single precision AdvSIMD log2,
relies on the same argument reduction as AdvSIMD logf.
- Maximum error: 2.48 ULPs
+ Maximum error: 1.99 + 0.5 ULP.
_ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
want 0x1.a9be8p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);

/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
- uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
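+ /* Worked example: for x = 8.0f, u_off = 0x41000000 - 0x3f2aaaab
+ = 0x01d55555 and u_off >> 23 = 3 = n; inline_log2f then reconstructs
+ 1+r = 1.0, so r = 0 and the result is exactly 3. */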
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
- vget_low_u16 (d->special_bound));
-
- uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
- float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
-
- /* y = log2(1+r) + n. */
- float32x4_t r2 = vmulq_f32 (r, r);
-
- float32x4_t c1357 = vld1q_f32 (&d->c1);
- float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
- float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
- float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
- float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
- float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
- float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
- float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
- float32x4_t p = vfmaq_f32 (c01, r2, p28);
+ uint16x4_t special_u16 = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound_u16));
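+ /* vsubhn_u32 narrows each 32-bit difference to its top 16 bits, so the
+ range check can use a cheaper 16-bit compare against top16 of the
+ 32-bit special bound. */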
- if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (n, u_off, p, r, special, d);
- return vfmaq_f32 (n, p, r);
+ if (__glibc_unlikely (v_any_u16h (special_u16)))
+ return special_case (u_off, d);
+ return inline_log2f (u_off, n, d);
}
libmvec_hidden_def (V_NAME_F1 (log2))