--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target glibc } */
+/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv8-a+sve" } */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <fenv.h>
+#include <signal.h>
+
+#pragma STDC FENV_ACCESS ON
+
+/* Kernel under test: overwrite each of the N elements of C with
+   sqrtf (c[i] - 2.0f).  It is marked noinline so that the loop is compiled
+   as a separate function instead of being merged into main.  */
+__attribute__((noinline))
+void f(float *__restrict c, int n)
+{
+ for (int i = 0; i < n; i++)
+ c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
+
+static void
+on_fpe (int sig)
+{
+  /* SIGFPE handler installed by main.  FE_INVALID trapping is enabled
+     there, so reaching this handler means an invalid operation was raised
+     although every input was chosen to be valid.  Report it and abort so
+     the test fails.  */
+  static const char msg[]
+    = "SIGFPE: trapped FP exception (unexpected invalid from sqrt)";
+
+  (void) sig;
+  puts (msg);
+  fflush (stdout);
+  __builtin_abort ();
+}
+
+int
+main (void)
+{
+  /* Driver for f.  Exit status: 0 on success, 1 if the allocation fails,
+     2 if FE_INVALID was raised by f, 3 if any result is wrong.  A trapped
+     FE_INVALID aborts from on_fpe instead.  */
+  signal (SIGFPE, on_fpe);
+
+  /* Clear flags and enable trap on invalid operations.  The result of
+     feenableexcept is not checked; if trapping could not be enabled, the
+     fetestexcept check after the call to f still detects FE_INVALID.  */
+  feclearexcept (FE_ALL_EXCEPT);
+  feenableexcept (FE_INVALID);
+
+  /* Choose a length that is NOT a multiple of typical SVE VL (unknown at
+     compile time), so that the final vector iteration has inactive
+     lanes.  */
+  const int n = 37;
+
+  float *c = aligned_alloc (64, (size_t) n * sizeof (float));
+  if (!c)
+    return 1;
+
+  /* Populate c so that (c[i] - 2) is a perfect square; this avoids FE_INVALID
+     while giving deterministic results.  */
+  for (int i = 0; i < n; i++)
+    {
+      int t = i & 3;
+      c[i] = (float) (t * t) + 2.0f;
+    }
+
+  f (c, n);
+
+  /* Only FE_INVALID would indicate a wrong extra-lane computation here.  */
+  if (fetestexcept (FE_INVALID))
+    {
+      puts ("Unexpected FE_INVALID");
+      /* Release the buffer on this exit path as well.  */
+      free (c);
+      return 2;
+    }
+
+  /* sqrtf ((t * t + 2) - 2) must be exactly t for t in 0..3.  */
+  int ok = 1;
+  for (int i = 0; i < n; i++)
+    {
+      float expected = (float) (i & 3);
+      if (!(c[i] == expected))
+	{
+	  printf ("Mismatch at %d: expected %g, got %g\n", i, expected, c[i]);
+	  ok = 0;
+	}
+    }
+
+  puts (ok ? "OK" : "FAIL");
+  free (c);
+  return ok ? 0 : 3;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-math-errno -ftrapping-math -march=armv9-a" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f:
+** ...
+** whilelo p([0-9]+).s, wzr, w[0-9]+
+** ...
+** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\]
+** fadd z[0-9]+.s, p\1/m, z[0-9]+.s, z[0-9]+.s
+** fsqrt z[0-9]+.s, p\1/m, z[0-9]+.s
+** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\]
+** incw x[0-9]+
+** whilelo p\1.s, w[0-9]+, w[0-9]+
+** ...
+*/
+/* Loop whose generated code is matched by the pattern above.  This file is
+   compiled with -ftrapping-math, and the pattern expects both the fadd and
+   the fsqrt to be predicated (/m) on the loop predicate set up by
+   whilelo.  */
+void
+f (float *__restrict c, int n)
+{
+ for (int i = 0; i < n; i++)
+ c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
+
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-math-errno -fno-trapping-math -march=armv9-a" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f:
+** ...
+** whilelo p([0-9]+).s, wzr, w[0-9]+
+** ...
+** ld1w z[0-9]+.s, p\1/z, \[x[0-9]+, x[0-9]+, lsl 2\]
+** fadd z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** fsqrt z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** st1w z[0-9]+.s, p\1, \[x[0-9]+, x[0-9]+, lsl 2\]
+** incw x[0-9]+
+** whilelo p\1.s, w[0-9]+, w[0-9]+
+** ...
+*/
+/* Loop whose generated code is matched by the pattern above.  This file is
+   compiled with -fno-trapping-math, and the pattern expects an unpredicated
+   fadd and an fsqrt governed by any predicate register; only the load and
+   store must use the loop predicate set up by whilelo.  */
+void
+f (float *__restrict c, int n)
+{
+ for (int i = 0; i < n; i++)
+ c[i] = __builtin_sqrtf (c[i] - 2.0f);
+}
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
tree fndecl, new_temp, rhs_type;
- enum vect_def_type dt[4]
+ enum vect_def_type dt[5]
= { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
- vect_unknown_def_type };
+ vect_unknown_def_type, vect_unknown_def_type };
tree vectypes[ARRAY_SIZE (dt)] = {};
slp_tree slp_op[ARRAY_SIZE (dt)] = {};
auto_vec<tree, 8> vargs;
- /* Bail out if the function has more than four arguments, we do not have
+ /* Bail out if the function has more than five arguments, we do not have
interesting builtin functions to vectorize with more than two arguments
- except for fma. No arguments is also not good. */
- if (nargs == 0 || nargs > 4)
+ except for fma (cond_fma has more). No arguments is also not good. */
+ if (nargs == 0 || nargs > 5)
return false;
/* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
ifn = vectorizable_internal_function (cfn, callee, vectype_out,
vectype_in);
+ /* Check if the operation traps. */
+ bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+ if (could_trap && cost_vec && loop_vinfo)
+ {
+ /* If the operation can trap it must be conditional, otherwise partial
+ vectors cannot be used for this loop. */
+ internal_fn cond_fn = get_conditional_internal_fn (ifn);
+ internal_fn cond_len_fn = get_len_internal_fn (ifn);
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ /* We assume that BB SLP fills all lanes, so no inactive lanes can
+ cause issues. */
+ if ((cond_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED))
+ && (cond_len_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because no"
+ " conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ }
+
/* If that fails, try asking for a target-specific built-in function. */
if (ifn == IFN_LAST)
{
else if (reduc_idx >= 0)
gcc_unreachable ();
}
- else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
{
ifn = cond_fn;
vect_nargs += 2;
{
int varg = 0;
/* Add the mask if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_mask_index (ifn) == varg);
unsigned int vec_num = vec_oprnds0.length ();
vargs[varg++] = vec_oprndsk[i];
}
/* Add the else value if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_else_index (ifn) == varg);
- vargs[varg++] = vargs[reduc_idx + 1];
+ if (reduc_idx >= 0)
+ vargs[varg++] = vargs[reduc_idx + 1];
+ else
+ {
+ auto else_value = targetm.preferred_else_value
+ (cond_fn, vectype_out, varg - 1, &vargs[1]);
+ vargs[varg++] = else_value;
+ }
}
if (clz_ctz_arg1)
vargs[varg++] = clz_ctz_arg1;