AArch64: Fix costing of emulated gathers/scatters [PR118188]

author Tamar Christina <tamar.christina@arm.com>

Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)

committer Tamar Christina <tamar.christina@arm.com>

Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)
author Tamar Christina <tamar.christina@arm.com>
Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 78d2cc4bbe4933c79153d0741bfd8d7b076952d0..6fe0fa2722bd4cdff5545f1c8bf2eabd4a03c1ff 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17388,6 +17388,47 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
         return;
      }
  
+  /* Detect the case where we are using an emulated gather/scatter.  When a
+     target does not support gathers and scatters directly the vectorizer
+     emulates these by constructing an index vector and then issuing an
+     extraction for every lane in the vector.  If the index vector is loaded
+     from memory, the vector load and extractions are subsequently lowered by
+     veclower into a series of scalar index loads.  After the final loads are
+     done it issues a vec_construct to recreate the vector from the scalars.
+     For costing, when we see a vec_to_scalar on a stmt with
+     VMAT_GATHER_SCATTER we are dealing with an emulated instruction and
+     should adjust the costing accordingly.  */
+  if (kind == vec_to_scalar
+      && (m_vec_flags & VEC_ADVSIMD)
+      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+    {
+      auto dr = STMT_VINFO_DATA_REF (stmt_info);
+      tree dr_ref = DR_REF (dr);
+      while (handled_component_p (dr_ref))
+       {
+         if (TREE_CODE (dr_ref) == ARRAY_REF)
+           {
+             tree offset = TREE_OPERAND (dr_ref, 1);
+             if (SSA_VAR_P (offset))
+               {
+                 if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
+                   {
+                     if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+                       ops->loads += count - 1;
+                     else
+                       /* Stores want to count both the index to array and
+                          data to array using vec_to_scalar.  However we have
+                          index stores in Adv.SIMD and so we only want to
+                          adjust the index loads.  */
+                       ops->loads += count / 2;
+                     return;
+                   }
+                 break;
+               }
+           }
+         dr_ref = TREE_OPERAND (dr_ref, 0);
+       }
+    }
  
    /* Count the basic operation cost associated with KIND.  */
    switch (kind)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c

new file mode 100644 (file)

index 0000000..d550f00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i]];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler {\s+ld1w\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c

new file mode 100644 (file)

index 0000000..24da064
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i] + 1];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler {\s+ld1w\t} { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c

new file mode 100644 (file)

index 0000000..77d06d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i]];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler-not {\s+st1w\t} } } */
author	Tamar Christina <tamar.christina@arm.com>
	Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)
committer	Tamar Christina <tamar.christina@arm.com>
	Thu, 9 Jan 2025 21:31:05 +0000 (21:31 +0000)
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c	[new file with mode: 0644]	patch \| blob