The SVE algorithm drops the tail in the exp computation at the cost of
slightly lower accuracy, just above 1 ULP.
The SVE algorithm also drops the special treatment of small (< 2^-65) and
- large (> 2^63) finite values of |y|, as they only affect non-round to nearest
- modes.
+ large (> 2^63) finite values of |y|, as they only affect non-round to
+ nearest modes.
Maximum measured error is 1.04 ULPs:
SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
a double. (int32_t)KI is the k used in the argument reduction and exponent
adjustment of scale, positive k here means the result may overflow and
negative k means the result may underflow. */
-static inline double
-specialcase (double tmp, uint64_t sbits, uint64_t ki)
-{
- double scale;
- if ((ki & 0x80000000) == 0)
- {
- /* k > 0, the exponent of scale might have overflowed by <= 460. */
- sbits -= 1009ull << 52;
- scale = asdouble (sbits);
- return 0x1p1009 * (scale + scale * tmp);
- }
- /* k < 0, need special care in the subnormal range. */
- sbits += 1022ull << 52;
- /* Note: sbits is signed scale. */
- scale = asdouble (sbits);
- double y = scale + scale * tmp;
- return 0x1p-1022 * y;
-}
-
-/* Scalar fallback for special cases of SVE pow's exp. */
static inline svfloat64_t
-sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
- svfloat64_t y, svbool_t cmp)
+specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
{
- svbool_t p = svpfirst (cmp, svpfalse ());
- while (svptest_any (cmp, p))
- {
- double sx1 = svclastb (p, 0, x1);
- uint64_t su1 = svclastb (p, 0, u1);
- uint64_t su2 = svclastb (p, 0, u2);
- double elem = specialcase (sx1, su1, su2);
- svfloat64_t y2 = sv_f64 (elem);
- y = svsel (p, y2, y);
- p = svpnext_b64 (cmp, p);
- }
- return y;
+ svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
+
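+ /* k > 0: the exponent of scale might have overflowed by <= 460.
+ k < 0: special care is needed in the subnormal range. */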
+ /* Scale up or down depending on sign of k. */
+ svint64_t offset
+ = svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
+ svfloat64_t factor
+ = svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
+
+ svuint64_t offset_sbits
+ = svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
+ svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
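+ /* scale + scale * tmp, with the exponent offset undone by factor. */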
+ svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
+ return svmul_f64_x (cmp, res, factor);
}
/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
/* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
- that uses array of structures. We also do the lookup earlier in the code to
- make sure it finishes as early as possible. */
+ that uses an array of structures. We also do the lookup earlier in the code
+ to make sure it finishes as early as possible. */
svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
- /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
- or underflow. */
+ /* Handle underflow and overflow in scale.
+ For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
+ overflow or underflow. */
svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ z = svsel (special, specialcase (tmp, sbits, ki, special), z);
- /* Update result with special and large cases. */
- z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
- /* Handle underflow and overflow. */
+ /* Handle underflow and overflow in exp. */
svbool_t x_is_neg = svcmplt (pg, x, 0);
svuint64_t sign_mask
= svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
}
static inline double
-pow_sc (double x, double y)
+pow_specialcase (double x, double y)
{
uint64_t ix = asuint64 (x);
uint64_t iy = asuint64 (y);
return x;
}
+/* Scalar fallback for special case routines with custom signature. */
+static svfloat64_t NOINLINE
+sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
+ svbool_t cmp)
+{
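+ /* sv_call2_f64 loops over the lanes set in CMP, calling pow_specialcase on
+ each and merging the results into Y. */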
+ return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
+}
+
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
/* Cases of zero/inf/nan x or y. */
if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
- vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+ vz = sv_pow_specialcase (x, y, vz, special);
return vz;
}
preamble of scalar powf except that we do not update ix and sign_bias. This
is done in the preamble of the SVE powf. */
static inline float
-powf_specialcase (float x, float y, float z)
+powf_specialcase (float x, float y)
{
uint32_t ix = asuint (x);
uint32_t iy = asuint (y);
- /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */
if (__glibc_unlikely (zeroinfnan (iy)))
{
if (2 * iy == 0)
x2 = -x2;
return iy & 0x80000000 ? 1 / x2 : x2;
}
- /* We need a return here in case x<0 and y is integer, but all other tests
- need to be run. */
- return z;
+ /* Return x for convenience, but make sure the result is never used. */
+ return x;
}
/* Scalar fallback for special case routines with custom signature. */
static svfloat32_t NOINLINE
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
- /* Special cases of x or y: zero, inf and nan. */
- svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
- svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
- svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
-
- svbool_t p = svpfirst (cmp, svpfalse ());
- while (svptest_any (cmp, p))
- {
- float sx1 = svclastb (p, 0, x1);
- float sx2 = svclastb (p, 0, x2);
- float elem = svclastb (p, 0, y);
- elem = powf_specialcase (sx1, sx2, elem);
- svfloat32_t y2 = sv_f32 (elem);
- y = svsel (p, y2, y);
- p = svpnext_b32 (cmp, p);
- }
- return y;
+ return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
}
/* Compute core for half of the lanes in double precision. */
ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
if (__glibc_unlikely (svptest_any (cmp, cmp)))
- return sv_call_powf_sc (x, y, ret);
+ return sv_call_powf_sc (x, y, ret, cmp);
return ret;
}