Gathers and scatters are usually not beneficial when the loop iteration
count is small, because on top of their execution cost inside the loop
there is also a start-up cost on entering a loop that uses them.
This patch therefore models that overhead.  For generic tuning we
nevertheless still prefer gathers/scatters when the loop costs work out.
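
As an illustration (a hypothetical example, not part of this patch), a
loop of the following shape can be vectorized with an SVE gather for the
indexed load, and for a small n the gather start-up cost can outweigh
the per-iteration savings:

  void
  f (double *dst, const double *src, const int *idx, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = src[idx[i]];  /* indexed load -> SVE 64-bit gather  */
  }

Compiled for SVE (e.g. -O3 -mcpu=neoverse-v2), the cost model decision
can be inspected with -fdump-tree-vect-details.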
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
gather_load_x32_init_cost and gather_load_x64_init_cost.
* config/aarch64/aarch64.cc (aarch64_vector_costs): Add
m_sve_gather_scatter_init_cost.
(aarch64_vector_costs::add_stmt_cost): Use them.
(aarch64_vector_costs::finish_cost): Likewise.
* config/aarch64/tuning_models/a64fx.h: Update.
* config/aarch64/tuning_models/cortexx925.h: Update.
* config/aarch64/tuning_models/generic.h: Update.
* config/aarch64/tuning_models/generic_armv8_a.h: Update.
* config/aarch64/tuning_models/generic_armv9_a.h: Update.
* config/aarch64/tuning_models/neoverse512tvb.h: Update.
* config/aarch64/tuning_models/neoversen2.h: Update.
* config/aarch64/tuning_models/neoversen3.h: Update.
* config/aarch64/tuning_models/neoversev1.h: Update.
* config/aarch64/tuning_models/neoversev2.h: Update.
* config/aarch64/tuning_models/neoversev3.h: Update.
* config/aarch64/tuning_models/neoversev3ae.h: Update.
unsigned int fadda_f64_cost,
unsigned int gather_load_x32_cost,
unsigned int gather_load_x64_cost,
+ unsigned int gather_load_x32_init_cost,
+ unsigned int gather_load_x64_init_cost,
unsigned int scatter_store_elt_cost)
: simd_vec_cost (base),
clast_cost (clast_cost),
fadda_f64_cost (fadda_f64_cost),
gather_load_x32_cost (gather_load_x32_cost),
gather_load_x64_cost (gather_load_x64_cost),
+ gather_load_x32_init_cost (gather_load_x32_init_cost),
+ gather_load_x64_init_cost (gather_load_x64_init_cost),
scatter_store_elt_cost (scatter_store_elt_cost)
{}
const int gather_load_x32_cost;
const int gather_load_x64_cost;
+  /* Additional loop initialization cost of using a gather load instruction.
+     The x32 value is for loads of 32-bit elements and the x64 value is for
+     loads of 64-bit elements.  */
+  const int gather_load_x32_init_cost;
+  const int gather_load_x64_init_cost;
+
/* The per-element cost of a scatter store. */
const int scatter_store_elt_cost;
};
supported by Advanced SIMD and SVE2. */
bool m_has_avg = false;
+  /* Additional initialization costs of using gather or scatter operations
+     in the current loop.  */
+  unsigned int m_sve_gather_scatter_init_cost = 0;
+
/* True if the vector body contains a store to a decl and if the
function is known to have a vld1 from the same decl.
stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
                                                stmt_info, vectype,
                                                where, stmt_cost);
+
+  /* Check if we've seen an SVE gather/scatter operation and of which size.
+     The vectorizer costs an SVE gather as one scalar_load per element,
+     hence the check for scalar_load.  */
+  if (kind == scalar_load
+      && aarch64_sve_mode_p (TYPE_MODE (vectype))
+      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+    {
+      const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
+      if (sve_costs)
+        {
+          if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+            m_sve_gather_scatter_init_cost
+              += sve_costs->gather_load_x64_init_cost;
+          else
+            m_sve_gather_scatter_init_cost
+              += sve_costs->gather_load_x32_init_cost;
+        }
+    }
}
/* Do any SVE-specific adjustments to the cost. */
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+
+      /* Gathers and scatters incur an additional overhead on the first
+         iteration.  For loops with a low trip count they are therefore
+         not beneficial, so model the overhead as a loop prologue cost.  */
+      m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
}
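
To make the profitability trade-off concrete, here is a minimal sketch
of the break-even arithmetic with hypothetical costs, normalized per
scalar iteration (the numbers are illustrative, not taken from any
tuning model below):

  #include <cstdio>

  int
  main ()
  {
    unsigned scalar_cost = 8;  /* per-iteration cost, scalar loop  */
    unsigned vector_cost = 3;  /* per-iteration cost, gather loop  */
    unsigned init_cost = 24;   /* one-off gather init cost         */

    /* The gather loop wins once n * scalar_cost exceeds
       n * vector_cost + init_cost, i.e. once the trip count
       amortizes the prologue cost.  */
    unsigned n = init_cost / (scalar_cost - vector_cost) + 1;
    printf ("gather loop profitable from %u iterations\n", n);
    return 0;
  }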
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
13, /* fadda_f64_cost */
64, /* gather_load_x32_cost */
32, /* gather_load_x64_cost */
+ 0, /* gather_load_x32_init_cost */
+ 0, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
+ 12, /* gather_load_x32_init_cost */
+ 4, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
+ 12, /* gather_load_x32_init_cost */
+ 4, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
8, /* fadda_f64_cost */
32, /* gather_load_x32_cost */
16, /* gather_load_x64_cost */
+ 96, /* gather_load_x32_init_cost */
+ 32, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
3 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
1 /* scatter_store_elt_cost */
};