From: Richard Sandiford <richard.sandiford@arm.com>
Date: Mon, 3 Apr 2023 08:57:09 +0000 (+0100)
Subject: vect: Make partial trapping ops use predication [PR96373]
X-Git-Tag: releases/gcc-12.3.0~132
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e11513c7688f583d1f4d0961d79d8aa775add03d;p=thirdparty%2Fgcc.git

vect: Make partial trapping ops use predication [PR96373]

PR96373 points out that a predicated SVE loop currently converts
trapping unconditional ops into unpredicated vector ops.  Doing
the operation on inactive lanes can then raise an exception.

As discussed in the PR trail, we aren't 100% consistent about
whether we preserve traps or not.  But the direction of travel
is clearly to improve that rather than live with it.  This patch
tries to do that for the SVE case.

Doing this regresses gcc.target/aarch64/sve/fabd_1.c.  I've added
-fno-trapping-math for now and filed PR108571 to track it.
A similar problem applies to fsubr_1.c.

I think this is likely to regress Power 10, since conditional
operations are only available for masked loops.  I think we'll
need to add -fno-trapping-math to any affected testcases,
but I don't have a Power 10 system to test on.

gcc/
	PR tree-optimization/96373
	PR tree-optimization/108979
	* tree-vect-stmts.cc (vectorizable_operation): Predicate trapping
	operations on the loop mask.  Reject partial vectors if this isn't
	possible.  Don't mask operations on invariants.

gcc/testsuite/
	PR tree-optimization/96373
	PR tree-optimization/108571
	PR tree-optimization/108979
	* gcc.target/aarch64/sve/fabd_1.c: Add -fno-trapping-math.
	* gcc.target/aarch64/sve/fsubr_1.c: Likewise.
	* gcc.target/aarch64/sve/fmul_1.c: Expect predicate ops.
	* gcc.target/aarch64/sve/fp_arith_1.c: Likewise.
	* gfortran.dg/vect/pr108979.f90: New test.
---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c
index 13ad83be24ce..30bde6f0df79 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-options "-O3 --save-temps -fno-trapping-math" } */
 
 #define N 16
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c
index 4a3e7c067453..0245a8c14225 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c
@@ -27,20 +27,20 @@ DO_ARITH_OPS (_Float16, *, mul)
 DO_ARITH_OPS (float, *, mul)
 DO_ARITH_OPS (double, *, mul)
 
-/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
 /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 1 } } */
-/* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2.0\n} 1 } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #5} } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */
 
-/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
 /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 1 } } */
-/* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2.0\n} 1 } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #5} } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */
 
-/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
 /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 1 } } */
-/* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2.0\n} 1 } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #5} } } */
 /* { dg-final { scan-assembler-not   {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #-} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c
index 5aed0dcb4908..419d6e1b5ec5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c
@@ -34,37 +34,37 @@ DO_ARITH_OPS (double, -, minus)
 
 /* No specific count because it's valid to use fadd or fsub for the
    out-of-range constants.  */
-/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} } } */
+/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */
 
-/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} } } */
+/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */
 /* { dg-final { scan-assembler-not   {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */
 
-/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} } } */
+/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */
 
-/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} } } */
+/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */
 /* { dg-final { scan-assembler-not   {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */
 
-/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} } } */
+/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */
 /* { dg-final { scan-assembler-not   {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #-} } } */
 
-/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} } } */
+/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1.0\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 2 } } */
 /* { dg-final { scan-assembler-not   {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c
index f47a360dee91..012cf6e9e5d2 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-options "-O3 --save-temps -fno-trapping-math" } */
 
 #define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME)			\
 void vsubrarithimm_##NAME##_##TYPE (TYPE *dst, int count)	\
diff --git a/gcc/testsuite/gfortran.dg/vect/pr108979.f90 b/gcc/testsuite/gfortran.dg/vect/pr108979.f90
new file mode 100644
index 000000000000..623eb67826f7
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/vect/pr108979.f90
@@ -0,0 +1,21 @@
+! { dg-do compile }
+! { dg-additional-options "-fnon-call-exceptions" }
+! { dg-additional-options "-march=armv8.2-a+sve" { target aarch64*-*-* } }
+
+MODULE hfx_contract_block
+  INTEGER, PARAMETER :: dp=8
+CONTAINS
+  SUBROUTINE block_2_1_2_1(kbd,kbc,kad,kac,pbd,pbc,pad,pac,prim,scale)
+    REAL(KIND=dp) :: kbd(1*1), kbc(1*2), kad(2*1), kac(2*2), pbd(1*1), &
+      pbc(1*2), pad(2*1), pac(2*2), prim(2*1*2*1), scale
+      DO md = 1,1
+        DO mc = 1,2
+          DO mb = 1,1
+            DO ma = 1,2
+              kac((mc-1)*2+ma) = kac((mc-1)*2+ma)-tmp*p_bd
+            END DO
+          END DO
+        END DO
+      END DO
+  END SUBROUTINE block_2_1_2_1
+END MODULE hfx_contract_block
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index e6e02fd9d2f5..52685f4c23af 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6128,6 +6128,8 @@ vectorizable_operation (vec_info *vinfo,
                          "use not simple.\n");
       return false;
     }
+  bool is_invariant = (dt[0] == vect_external_def
+		       || dt[0] == vect_constant_def);
   /* If op0 is an external or constant def, infer the vector type
      from the scalar type.  */
   if (!vectype)
@@ -6181,6 +6183,8 @@ vectorizable_operation (vec_info *vinfo,
                              "use not simple.\n");
 	  return false;
 	}
+      is_invariant &= (dt[1] == vect_external_def
+		       || dt[1] == vect_constant_def);
       if (vectype2
 	  && maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype2)))
 	return false;
@@ -6195,6 +6199,8 @@ vectorizable_operation (vec_info *vinfo,
                              "use not simple.\n");
 	  return false;
 	}
+      is_invariant &= (dt[2] == vect_external_def
+		       || dt[2] == vect_constant_def);
       if (vectype3
 	  && maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype3)))
 	return false;
@@ -6292,14 +6298,22 @@ vectorizable_operation (vec_info *vinfo,
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
 
+  /* If operating on inactive elements could generate spurious traps,
+     we need to restrict the operation to active lanes.  Note that this
+     specifically doesn't apply to unhoisted invariants, since they
+     operate on the same value for every lane.
+
+     Similarly, if this operation is part of a reduction, a fully-masked
+     loop should only change the active lanes of the reduction chain,
+     keeping the inactive lanes as-is.  */
+  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
+			    || reduc_idx >= 0);
+
   if (!vec_stmt) /* transformation not required.  */
     {
-      /* If this operation is part of a reduction, a fully-masked loop
-	 should only change the active lanes of the reduction chain,
-	 keeping the inactive lanes as-is.  */
       if (loop_vinfo
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
-	  && reduc_idx >= 0)
+	  && mask_out_inactive)
 	{
 	  if (cond_fn == IFN_LAST
 	      || !direct_internal_fn_supported_p (cond_fn, vectype,
@@ -6442,16 +6456,31 @@ vectorizable_operation (vec_info *vinfo,
       vop1 = ((op_type == binary_op || op_type == ternary_op)
 	      ? vec_oprnds1[i] : NULL_TREE);
       vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE);
-      if (masked_loop_p && reduc_idx >= 0)
+      if (masked_loop_p && mask_out_inactive)
 	{
-	  /* Perform the operation on active elements only and take
-	     inactive elements from the reduction chain input.  */
-	  gcc_assert (!vop2);
-	  vop2 = reduc_idx == 1 ? vop1 : vop0;
 	  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
 					  vectype, i);
-	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
-						    vop0, vop1, vop2);
+	  auto_vec<tree> vops (5);
+	  vops.quick_push (mask);
+	  vops.quick_push (vop0);
+	  if (vop1)
+	    vops.quick_push (vop1);
+	  if (vop2)
+	    vops.quick_push (vop2);
+	  if (reduc_idx >= 0)
+	    {
+	      /* Perform the operation on active elements only and take
+		 inactive elements from the reduction chain input.  */
+	      gcc_assert (!vop2);
+	      vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
+	    }
+	  else
+	    {
+	      auto else_value = targetm.preferred_else_value
+		(cond_fn, vectype, vops.length () - 1, &vops[1]);
+	      vops.quick_push (else_value);
+	    }
+	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
 	  new_temp = make_ssa_name (vec_dest, call);
 	  gimple_call_set_lhs (call, new_temp);
 	  gimple_call_set_nothrow (call, true);