Gathers and scatters are usually not beneficial when the loop iteration
count is small, because on top of their execution cost inside the loop
there is also a start-up cost on entering a loop that uses them.
This patch therefore models that overhead.  For generic tuning we
nevertheless still prefer gathers/scatters when the loop costs work out.
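
As an illustration (a hypothetical example, not part of this patch), a
loop of the following shape can be vectorized with an SVE gather for the
indexed load, and for a small n the gather start-up cost can outweigh
the per-iteration savings:

  void
  f (double *dst, const double *src, const int *idx, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = src[idx[i]];  /* indexed load -> SVE 64-bit gather  */
  }

Compiled for SVE (e.g. -O3 -mcpu=neoverse-v2), the cost model decision
can be inspected with -fdump-tree-vect-details.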
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
gather_load_x32_init_cost and gather_load_x64_init_cost.
* config/aarch64/aarch64.cc (aarch64_vector_costs): Add
m_sve_gather_scatter_init_cost.
(aarch64_vector_costs::add_stmt_cost): Use them.
(aarch64_vector_costs::finish_cost): Likewise.
* config/aarch64/tuning_models/a64fx.h: Update.
* config/aarch64/tuning_models/cortexx925.h: Update.
* config/aarch64/tuning_models/generic.h: Update.
* config/aarch64/tuning_models/generic_armv8_a.h: Update.
* config/aarch64/tuning_models/generic_armv9_a.h: Update.
* config/aarch64/tuning_models/neoverse512tvb.h: Update.
* config/aarch64/tuning_models/neoversen2.h: Update.
* config/aarch64/tuning_models/neoversen3.h: Update.
* config/aarch64/tuning_models/neoversev1.h: Update.
* config/aarch64/tuning_models/neoversev2.h: Update.
* config/aarch64/tuning_models/neoversev3.h: Update.
* config/aarch64/tuning_models/neoversev3ae.h: Update.
unsigned int fadda_f64_cost,
unsigned int gather_load_x32_cost,
unsigned int gather_load_x64_cost,
+ unsigned int gather_load_x32_init_cost,
+ unsigned int gather_load_x64_init_cost,
unsigned int scatter_store_elt_cost)
: simd_vec_cost (base),
clast_cost (clast_cost),
fadda_f64_cost (fadda_f64_cost),
gather_load_x32_cost (gather_load_x32_cost),
gather_load_x64_cost (gather_load_x64_cost),
+ gather_load_x32_init_cost (gather_load_x32_init_cost),
+ gather_load_x64_init_cost (gather_load_x64_init_cost),
scatter_store_elt_cost (scatter_store_elt_cost)
{}
const int gather_load_x32_cost;
const int gather_load_x64_cost;
+  /* Additional loop initialization cost of using a gather load instruction.
+     The x32 value is for loads of 32-bit elements and the x64 value is for
+     loads of 64-bit elements.  */
+  const int gather_load_x32_init_cost;
+  const int gather_load_x64_init_cost;
+
/* The per-element cost of a scatter store. */
const int scatter_store_elt_cost;
};
supported by Advanced SIMD and SVE2. */
bool m_has_avg = false;
+  /* Additional initialization costs of using gather or scatter operations
+     in the current loop.  */
+  unsigned int m_sve_gather_scatter_init_cost = 0;
+
/* True if the vector body contains a store to a decl and if the
function is known to have a vld1 from the same decl.
stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
                                                stmt_info, vectype,
                                                where, stmt_cost);
+
+  /* Check if we've seen an SVE gather/scatter operation and of which size.
+     The vectorizer costs an SVE gather as one scalar_load per element,
+     hence the check for scalar_load.  */
+  if (kind == scalar_load
+      && aarch64_sve_mode_p (TYPE_MODE (vectype))
+      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+    {
+      const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
+      if (sve_costs)
+        {
+          if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+            m_sve_gather_scatter_init_cost
+              += sve_costs->gather_load_x64_init_cost;
+          else
+            m_sve_gather_scatter_init_cost
+              += sve_costs->gather_load_x32_init_cost;
+        }
+    }
}
/* Do any SVE-specific adjustments to the cost. */
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+
+      /* Gathers and scatters incur an additional overhead on the first
+         iteration.  For loops with a low trip count they are therefore
+         not beneficial, so model the overhead as a loop prologue cost.  */
+      m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
}
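
To make the profitability trade-off concrete, here is a minimal sketch
of the break-even arithmetic with hypothetical costs, normalized per
scalar iteration (the numbers are illustrative, not taken from any
tuning model below):

  #include <cstdio>

  int
  main ()
  {
    unsigned scalar_cost = 8;  /* per-iteration cost, scalar loop  */
    unsigned vector_cost = 3;  /* per-iteration cost, gather loop  */
    unsigned init_cost = 24;   /* one-off gather init cost         */

    /* The gather loop wins once n * scalar_cost exceeds
       n * vector_cost + init_cost, i.e. once the trip count
       amortizes the prologue cost.  */
    unsigned n = init_cost / (scalar_cost - vector_cost) + 1;
    printf ("gather loop profitable from %u iterations\n", n);
    return 0;
  }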
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
13, /* fadda_f64_cost */
64, /* gather_load_x32_cost */
32, /* gather_load_x64_cost */
+ 0, /* gather_load_x32_init_cost */
+ 0, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
+ 12, /* gather_load_x32_init_cost */
+ 4, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
+ 12, /* gather_load_x32_init_cost */
+ 4, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
8, /* fadda_f64_cost */
32, /* gather_load_x32_cost */
16, /* gather_load_x64_cost */
+ 96, /* gather_load_x32_init_cost */
+ 32, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};