aarch64: Implement AdvSIMD and SVE rsqrt(f) routines

author James Chesterman <James.Chesterman@arm.com>

Fri, 5 Dec 2025 10:37:45 +0000 (10:37 +0000)

committer Adhemerval Zanella <adhemerval.zanella@linaro.org>

Fri, 5 Dec 2025 18:05:54 +0000 (15:05 -0300)
author James Chesterman <James.Chesterman@arm.com>
Fri, 5 Dec 2025 10:37:45 +0000 (10:37 +0000)
committer Adhemerval Zanella <adhemerval.zanella@linaro.org>
Fri, 5 Dec 2025 18:05:54 +0000 (15:05 -0300)
diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h

index 21e72917f7ebcf60f9994540127b00c9b97c3346..e12936d7f79ef17a42e39038ce71c81efbd7cd0a 100644 (file)
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@@ -242,6 +242,17 @@
  #define __DECL_SIMD_atan2f64x
  #define __DECL_SIMD_atan2f128x
  
+#define __DECL_SIMD_rsqrt
+#define __DECL_SIMD_rsqrtf
+#define __DECL_SIMD_rsqrtl
+#define __DECL_SIMD_rsqrtf16
+#define __DECL_SIMD_rsqrtf32
+#define __DECL_SIMD_rsqrtf64
+#define __DECL_SIMD_rsqrtf128
+#define __DECL_SIMD_rsqrtf32x
+#define __DECL_SIMD_rsqrtf64x
+#define __DECL_SIMD_rsqrtf128x
+
  #define __DECL_SIMD_log10
  #define __DECL_SIMD_log10f
  #define __DECL_SIMD_log10l
diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h

index 37149d836414007a4e05b6c2a91b0f13f77a377c..592a80fcb909eefd2f89ec86c839216458a19585 100644 (file)
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -203,7 +203,7 @@ __MATHCALL (powr,, (_Mdouble_ __x, _Mdouble_ __y));
  __MATHCALL (rootn,, (_Mdouble_ __x, long long int __y));
  
  /* Return the reciprocal of the square root of X.  */
-__MATHCALL (rsqrt,, (_Mdouble_ __x));
+__MATHCALL_VEC (rsqrt,, (_Mdouble_ __x));
  #endif
  
  
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile

index df6cbe6b53159e6f22500e12c7e6a089e64841b9..998fc08d43532facb56cacba041b9ad2a4ab1496 100644 (file)
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -29,6 +29,7 @@ libmvec-supported-funcs = acos \
                            log2 \
                            log2p1 \
                            pow \
+                          rsqrt \
                            sin \
                            sinh \
                            sinpi \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions

index c03de40fe114bc75e2930319b825f87b2f3c11ad..d68510a20ef5722da70be309fb8103a1717cc471 100644 (file)
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -200,5 +200,10 @@ libmvec {
      _ZGVnN4v_log10p1f;
      _ZGVsMxv_log10p1;
      _ZGVsMxv_log10p1f;
+    _ZGVnN2v_rsqrt;
+    _ZGVnN2v_rsqrtf;
+    _ZGVnN4v_rsqrtf;
+    _ZGVsMxv_rsqrt;
+    _ZGVsMxv_rsqrtf;
    }
  }
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h

index 77a902c11a1cfdb4e69ed321e141196b0b17f899..abdb1ff114f97a71b6c8b8f6d48829c4117b9f8c 100644 (file)
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -47,6 +47,7 @@ libmvec_hidden_proto (V_NAME_F1(log2p1));
  libmvec_hidden_proto (V_NAME_F1(logp1));
  libmvec_hidden_proto (V_NAME_F1(log));
  libmvec_hidden_proto (V_NAME_F2(pow));
+libmvec_hidden_proto (V_NAME_F1(rsqrt));
  libmvec_hidden_proto (V_NAME_F1(sin));
  libmvec_hidden_proto (V_NAME_F1(sinh));
  libmvec_hidden_proto (V_NAME_F1(sinpi));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h

index 56b47f58ef097f840674c17f1fa13aae7633ac94..7406552f492027fc4babd26b412626a982565a17 100644 (file)
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -157,6 +157,10 @@
  # define __DECL_SIMD_pow __DECL_SIMD_aarch64
  # undef __DECL_SIMD_powf
  # define __DECL_SIMD_powf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_rsqrt
+# define __DECL_SIMD_rsqrt __DECL_SIMD_aarch64
+# undef __DECL_SIMD_rsqrtf
+# define __DECL_SIMD_rsqrtf __DECL_SIMD_aarch64
  # undef __DECL_SIMD_sin
  # define __DECL_SIMD_sin __DECL_SIMD_aarch64
  # undef __DECL_SIMD_sinf
@@ -239,6 +243,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4v_log2p1f (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_rsqrtf (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
  __vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t);
@@ -278,6 +283,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2v_log2p1 (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_rsqrt (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
  __vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t);
@@ -322,6 +328,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxv_log2p1f (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_rsqrtf (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
  __sv_f32_t _ZGVsMxv_sinpif (__sv_f32_t, __sv_bool_t);
@@ -361,6 +368,7 @@ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxv_log2p1 (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_rsqrt (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
  __sv_f64_t _ZGVsMxv_sinpi (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h

index 0109687b5b7cced3dcd165155a98257b21404084..55e3469a2cf49fa4f052b6f35938bf7948bf2317 100644 (file)
--- a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
@@ -80,6 +80,8 @@
  !GCC$ builtin (logp1f) attributes simd (notinbranch)
  !GCC$ builtin (pow) attributes simd (notinbranch)
  !GCC$ builtin (powf) attributes simd (notinbranch)
+!GCC$ builtin (rsqrt) attributes simd (notinbranch)
+!GCC$ builtin (rsqrtf) attributes simd (notinbranch)
  !GCC$ builtin (sin) attributes simd (notinbranch)
  !GCC$ builtin (sinf) attributes simd (notinbranch)
  !GCC$ builtin (sinh) attributes simd (notinbranch)
diff --git a/sysdeps/aarch64/fpu/rsqrt_advsimd.c b/sysdeps/aarch64/fpu/rsqrt_advsimd.c

new file mode 100644 (file)

index 0000000..f1504c6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/rsqrt_advsimd.c
@@ -0,0 +1,84 @@
+/* Double-precision vector (Advanced SIMD) rsqrt function
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t special_bound;
+  float64x2_t scale_up, scale_down;
+} data = {
+  /* When x < 0x1p-1021, estimate becomes infinity.
+    x is scaled up by 0x1p54, so the estimate does not reach infinity.
+    Then the result is multiplied by 0x1p27.
+    The difference between the lowest power possible (-1074) and the special
+    bound (-1021) is 54, so 2^54 is used as the scaling value.  */
+  .special_bound = V2 (0x1p-1021),
+  .scale_up = V2 (0x1p54),
+  .scale_down = V2 (0x1p27),
+};
+
+static inline float64x2_t VPCS_ATTR
+inline_rsqrt (float64x2_t x)
+{
+  /* Do estimate instruction.  */
+  float64x2_t estimate = vrsqrteq_f64 (x);
+
+  /* Do first step instruction.  */
+  float64x2_t estimate_squared = vmulq_f64 (estimate, estimate);
+  float64x2_t step = vrsqrtsq_f64 (x, estimate_squared);
+  estimate = vmulq_f64 (estimate, step);
+
+  /* Do second step instruction.  */
+  estimate_squared = vmulq_f64 (estimate, estimate);
+  step = vrsqrtsq_f64 (x, estimate_squared);
+  estimate = vmulq_f64 (estimate, step);
+
+  /* Do third step instruction.
+     This is required to achieve < 3.0 ULP.  */
+  estimate_squared = vmulq_f64 (estimate, estimate);
+  step = vrsqrtsq_f64 (x, estimate_squared);
+  estimate = vmulq_f64 (estimate, step);
+  return estimate;
+}
+
+static float64x2_t NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+  x = vbslq_f64 (special, vmulq_f64 (x, d->scale_up), x);
+  float64x2_t estimate = inline_rsqrt (x);
+  return vbslq_f64 (special, vmulq_f64 (estimate, d->scale_down), estimate);
+}
+
+/* Double-precision implementation of vector rsqrt(x).
+  Maximum observed error: 1.45 + 0.5
+  _ZGVnN2v_rsqrt(0x1.d13fb41254643p+1023) got 0x1.0c8dee1b29dfap-512
+                                        want 0x1.0c8dee1b29df8p-512.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (rsqrt) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Special case: x < special_bound.  */
+  uint64x2_t special = vcgtq_f64 (d->special_bound, x);
+  if (__glibc_unlikely (v_any_u64 (special)))
+    {
+      return special_case (x, special, d);
+    }
+  return inline_rsqrt (x);
+}
diff --git a/sysdeps/aarch64/fpu/rsqrt_sve.c b/sysdeps/aarch64/fpu/rsqrt_sve.c

new file mode 100644 (file)

index 0000000..6eda24f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/rsqrt_sve.c
@@ -0,0 +1,83 @@
+/* Double-precision vector (SVE) rsqrt function
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float64_t special_bound;
+  int64_t scale_up, scale_down;
+} data = {
+  /* When x < 0x1p-1021, estimate becomes infinity.
+    x is scaled up by 0x1p54, so the estimate does not reach infinity.
+    Then the result is multiplied by 0x1p27.
+    The difference between the lowest power possible (-1074) and the special
+    bound (-1021) is 54, so 2^54 is used as the scaling value.  */
+  .special_bound = 0x1p-1021,
+  .scale_up = 54,
+  .scale_down = 27,
+};
+
+static inline svfloat64_t
+inline_rsqrt (svfloat64_t x)
+{
+  /* Do estimate instruction.  */
+  svfloat64_t estimate = svrsqrte_f64 (x);
+
+  /* Do first step instruction.  */
+  svfloat64_t estimate_squared = svmul_x (svptrue_b64 (), estimate, estimate);
+  svfloat64_t step = svrsqrts_f64 (x, estimate_squared);
+  estimate = svmul_x (svptrue_b64 (), estimate, step);
+
+  /* Do second step instruction.  */
+  estimate_squared = svmul_x (svptrue_b64 (), estimate, estimate);
+  step = svrsqrts_f64 (x, estimate_squared);
+  estimate = svmul_x (svptrue_b64 (), estimate, step);
+
+  /* Do third step instruction.
+     This is required to achieve < 3.0 ULP.  */
+  estimate_squared = svmul_x (svptrue_b64 (), estimate, estimate);
+  step = svrsqrts_f64 (x, estimate_squared);
+  estimate = svmul_x (svptrue_b64 (), estimate, step);
+  return estimate;
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t special, const struct data *d)
+{
+  x = svscale_f64_m (special, x, sv_s64 (d->scale_up));
+  svfloat64_t estimate = inline_rsqrt (x);
+  return svscale_f64_m (special, estimate, sv_s64 (d->scale_down));
+}
+
+/* Double-precision SVE implementation of rsqrt(x).
+  Maximum observed error: 1.45 + 0.5
+  _ZGVnN2v_rsqrt(0x1.d13fb41254643p+1023) got 0x1.0c8dee1b29dfap-512
+                                        want 0x1.0c8dee1b29df8p-512.  */
+svfloat64_t SV_NAME_D1 (rsqrt) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svbool_t special = svcmplt_n_f64 (pg, x, d->special_bound);
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    {
+      return special_case (x, special, d);
+    }
+  return inline_rsqrt (x);
+}
diff --git a/sysdeps/aarch64/fpu/rsqrtf_advsimd.c b/sysdeps/aarch64/fpu/rsqrtf_advsimd.c

new file mode 100644 (file)

index 0000000..2bf5a0a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/rsqrtf_advsimd.c
@@ -0,0 +1,82 @@
+/* Single-precision vector (Advanced SIMD) rsqrt function
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float32x4_t special_bound;
+  float32x4_t scale_up, scale_down;
+} data = {
+  /* When x < 0x1p-128, estimate becomes infinity.
+    x is scaled up by 0x1p22f, so estimate does not reach infinity.
+    Then the result is multiplied by 0x1p11f.
+    The difference between the lowest power possible (-149) and the special
+    bound (-128) is 21. 22 is used here so that a power of 2 can be used for
+    scaling in both directions.  */
+  .special_bound = V4 (0x1p-128f),
+  .scale_up = V4 (0x1p22f),
+  .scale_down = V4 (0x1p11f),
+};
+
+static inline float32x4_t VPCS_ATTR
+inline_rsqrt (float32x4_t x)
+{
+  /* Do estimate instruction.  */
+  float32x4_t estimate = vrsqrteq_f32 (x);
+
+  /* Do first step instruction.  */
+  float32x4_t estimate_squared = vmulq_f32 (estimate, estimate);
+  float32x4_t step = vrsqrtsq_f32 (x, estimate_squared);
+  estimate = vmulq_f32 (estimate, step);
+
+  /* Do second step instruction.
+    This is required to achieve < 3.0 ULP.  */
+  estimate_squared = vmulq_f32 (estimate, estimate);
+  step = vrsqrtsq_f32 (x, estimate_squared);
+  estimate = vmulq_f32 (estimate, step);
+  return estimate;
+}
+
+static float32x4_t NOINLINE
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
+{
+  x = vbslq_f32 (special, vmulq_f32 (x, d->scale_up), x);
+  float32x4_t estimate = inline_rsqrt (x);
+  return vbslq_f32 (special, vmulq_f32 (estimate, d->scale_down), estimate);
+}
+
+/* Single-precision implementation of vector rqsrtf(x).
+  Maximum observed error: 1.47 + 0.5
+  _ZGVnN4v_rsqrtf (0x1.f610dep+127) got 0x1.02852cp-64
+                                  want 0x1.02853p-64.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (rsqrt) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Special case: x < special_bound.  */
+  uint32x4_t special = vcgtq_f32 (d->special_bound, x);
+  if (__glibc_unlikely (v_any_u32 (special)))
+    {
+      return special_case (x, special, d);
+    }
+  return inline_rsqrt (x);
+}
+libmvec_hidden_def (V_NAME_F1 (rsqrt))
+HALF_WIDTH_ALIAS_F1 (rsqrt)
diff --git a/sysdeps/aarch64/fpu/rsqrtf_sve.c b/sysdeps/aarch64/fpu/rsqrtf_sve.c

new file mode 100644 (file)

index 0000000..b752694
--- /dev/null
+++ b/sysdeps/aarch64/fpu/rsqrtf_sve.c
@@ -0,0 +1,79 @@
+/* Single-precision vector (SVE) rsqrt function
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float32_t special_bound;
+  int32_t scale_up, scale_down;
+} data = {
+  /* When x < 0x1p-128, estimate becomes infinity.
+    x is scaled up by 0x1p22f, so estimate does not reach infinity.
+    Then the result is multiplied by 0x1p11f.
+    The difference between the lowest power possible (-149) and the special
+    bound (-128) is 21. 22 is used here so that a power of 2 can be used for
+    scaling in both directions.  */
+  .special_bound = 0x1p-128f,
+  .scale_up = 22,
+  .scale_down = 11,
+};
+
+static inline svfloat32_t
+inline_rsqrt (svfloat32_t x)
+{
+  /* Do estimate instruction.  */
+  svfloat32_t estimate = svrsqrte_f32 (x);
+
+  /* Do first step instruction.  */
+  svfloat32_t estimate_squared = svmul_x (svptrue_b32 (), estimate, estimate);
+  svfloat32_t step = svrsqrts_f32 (x, estimate_squared);
+  estimate = svmul_x (svptrue_b32 (), estimate, step);
+
+  /* Do second step instruction.
+    This is required to achieve < 3.0 ULP.  */
+  estimate_squared = svmul_x (svptrue_b32 (), estimate, estimate);
+  step = svrsqrts_f32 (x, estimate_squared);
+  estimate = svmul_x (svptrue_b32 (), estimate, step);
+  return estimate;
+}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+  x = svscale_f32_m (special, x, sv_s32 (d->scale_up));
+  svfloat32_t estimate = inline_rsqrt (x);
+  return svscale_f32_m (special, estimate, sv_s32 (d->scale_down));
+}
+
+/* Single-precision SVE implementation of rsqrtf(x).
+  Maximum observed error: 1.47 + 0.5
+  _ZGVsMxv_rsqrtf (0x1.f610dep+127) got 0x1.02852cp-64
+                                  want 0x1.02853p-64.  */
+svfloat32_t SV_NAME_F1 (rsqrt) (svfloat32_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svbool_t special = svcmplt_n_f32 (pg, x, 0x1p-128f);
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    {
+      return special_case (x, special, d);
+    }
+  return inline_rsqrt (x);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c

index 8138f2f2a78a789f9817ee7ee75bcc88e3c1cac7..42d076b9a91b3e07d791854357b96d775e861945 100644 (file)
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -54,6 +54,7 @@ VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
  VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
  VPCS_VECTOR_WRAPPER (log2p1_advsimd, _ZGVnN2v_log2p1)
  VPCS_VECTOR_WRAPPER_ff (pow_advsimd, _ZGVnN2vv_pow)
+VPCS_VECTOR_WRAPPER (rsqrt_advsimd, _ZGVnN2v_rsqrt)
  VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
  VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
  VPCS_VECTOR_WRAPPER (sinpi_advsimd, _ZGVnN2v_sinpi)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c

index 034098c1ec9259c8a384b8153112f147a53a7259..543816558b0afca6801ecc073f7749c9c00d4f15 100644 (file)
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -73,6 +73,7 @@ SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
  SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
  SVE_VECTOR_WRAPPER (log2p1_sve, _ZGVsMxv_log2p1)
  SVE_VECTOR_WRAPPER_ff (pow_sve, _ZGVsMxvv_pow)
+SVE_VECTOR_WRAPPER (rsqrt_sve, _ZGVsMxv_rsqrt)
  SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
  SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
  SVE_VECTOR_WRAPPER (sinpi_sve, _ZGVsMxv_sinpi)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c

index 3879b3c654b1d019860efedc33d88197a0fd3ac7..52177097962e2903771d4b3c80015438630ab49c 100644 (file)
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -54,6 +54,7 @@ VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
  VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
  VPCS_VECTOR_WRAPPER (log2p1f_advsimd, _ZGVnN4v_log2p1f)
  VPCS_VECTOR_WRAPPER_ff (powf_advsimd, _ZGVnN4vv_powf)
+VPCS_VECTOR_WRAPPER (rsqrtf_advsimd, _ZGVnN4v_rsqrtf)
  VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
  VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
  VPCS_VECTOR_WRAPPER (sinpif_advsimd, _ZGVnN4v_sinpif)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c

index 38a6e03c23f11b4ffc646e83766d92fa1320bf6f..a35b2fc7b55aa2f6cd0b8d2bdf58e9f37cd1e49a 100644 (file)
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -73,6 +73,7 @@ SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
  SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
  SVE_VECTOR_WRAPPER (log2p1f_sve, _ZGVsMxv_log2p1f)
  SVE_VECTOR_WRAPPER_ff (powf_sve, _ZGVsMxvv_powf)
+SVE_VECTOR_WRAPPER (rsqrtf_sve, _ZGVsMxv_rsqrtf)
  SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
  SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
  SVE_VECTOR_WRAPPER (sinpif_sve, _ZGVsMxv_sinpif)
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist

index fb2eec79a5a05a2de0b20aa33667384a7e16a2aa..6d13d536130ecaabe168f96b996a7c7a8a298496 100644 (file)
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -176,10 +176,13 @@ GLIBC_2.43 _ZGVnN2v_log10p1 F
  GLIBC_2.43 _ZGVnN2v_log10p1f F
  GLIBC_2.43 _ZGVnN2v_log2p1 F
  GLIBC_2.43 _ZGVnN2v_log2p1f F
+GLIBC_2.43 _ZGVnN2v_rsqrt F
+GLIBC_2.43 _ZGVnN2v_rsqrtf F
  GLIBC_2.43 _ZGVnN4v_exp10m1f F
  GLIBC_2.43 _ZGVnN4v_exp2m1f F
  GLIBC_2.43 _ZGVnN4v_log10p1f F
  GLIBC_2.43 _ZGVnN4v_log2p1f F
+GLIBC_2.43 _ZGVnN4v_rsqrtf F
  GLIBC_2.43 _ZGVsMxv_exp10m1 F
  GLIBC_2.43 _ZGVsMxv_exp10m1f F
  GLIBC_2.43 _ZGVsMxv_exp2m1 F
@@ -188,3 +191,5 @@ GLIBC_2.43 _ZGVsMxv_log10p1 F
  GLIBC_2.43 _ZGVsMxv_log10p1f F
  GLIBC_2.43 _ZGVsMxv_log2p1 F
  GLIBC_2.43 _ZGVsMxv_log2p1f F
+GLIBC_2.43 _ZGVsMxv_rsqrt F
+GLIBC_2.43 _ZGVsMxv_rsqrtf F
author	James Chesterman <James.Chesterman@arm.com>
	Fri, 5 Dec 2025 10:37:45 +0000 (10:37 +0000)
committer	Adhemerval Zanella <adhemerval.zanella@linaro.org>
	Fri, 5 Dec 2025 18:05:54 +0000 (15:05 -0300)
bits/libm-simd-decl-stubs.h		patch \| blob \| blame \| history
math/bits/mathcalls.h		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/Makefile		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/Versions		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/advsimd_f32_protos.h		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/bits/math-vector.h		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/finclude/math-vector-fortran.h		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/rsqrt_advsimd.c	[new file with mode: 0644]	patch \| blob
sysdeps/aarch64/fpu/rsqrt_sve.c	[new file with mode: 0644]	patch \| blob
sysdeps/aarch64/fpu/rsqrtf_advsimd.c	[new file with mode: 0644]	patch \| blob
sysdeps/aarch64/fpu/rsqrtf_sve.c	[new file with mode: 0644]	patch \| blob
sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/test-double-sve-wrappers.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c		patch \| blob \| blame \| history
sysdeps/aarch64/fpu/test-float-sve-wrappers.c		patch \| blob \| blame \| history
sysdeps/unix/sysv/linux/aarch64/libmvec.abilist		patch \| blob \| blame \| history