/* Machine description for AArch64 architecture.
- Copyright (C) 2009-2019 Free Software Foundation, Inc.
+ Copyright (C) 2009-2020 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
-#include "params.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
1, /* vec_int_stmt_cost */
1, /* vec_fp_stmt_cost */
2, /* vec_permute_cost */
- 1, /* vec_to_scalar_cost */
+ 2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* vec_align_load_cost */
1, /* vec_unalign_load_cost */
1, /* scalar_store_cost */
5, /* vec_int_stmt_cost */
6, /* vec_fp_stmt_cost */
- 3, /* vec_permute_cost */
+ 10, /* vec_permute_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
8, /* vec_align_load_cost */
SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
- AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
+ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
- AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
+ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
- | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
+ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
4, /* issue_rate. */
- (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
- | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
+ (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
/* The current tuning set. */
struct tune_params aarch64_tune_params = generic_tunings;
+/* Check whether an 'aarch64_vector_pcs' attribute is valid. */
+
+static tree
+handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
+ int, bool *no_add_attrs)
+{
+ /* Since we set fn_type_req to true, the caller should have checked
+ this for us. */
+ gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
+ switch ((arm_pcs) fntype_abi (*node).id ())
+ {
+ case ARM_PCS_AAPCS64:
+ case ARM_PCS_SIMD:
+ return NULL_TREE;
+
+ case ARM_PCS_SVE:
+ error ("the %qE attribute cannot be applied to an SVE function type",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+
+ case ARM_PCS_TLSDESC:
+ case ARM_PCS_UNKNOWN:
+ break;
+ }
+ gcc_unreachable ();
+}
+
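+/* An illustrative sketch of the check above, assuming the ACLE types from
+ <arm_sve.h>: the attribute is accepted on an ordinary function type such as
+ "void ok (float *) __attribute__ ((aarch64_vector_pcs))", which then uses
+ the vector PCS, but is rejected for a declaration such as
+ "svint32_t bad (svint32_t) __attribute__ ((aarch64_vector_pcs))", whose
+ function type already uses the SVE PCS.  */
+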
/* Table of machine attributes. */
static const struct attribute_spec aarch64_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req,
affects_type_identity, handler, exclude } */
- { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
+ { "aarch64_vector_pcs", 0, 0, false, true, true, true,
+ handle_aarch64_vector_pcs_attribute, NULL },
{ NULL, 0, 0, false, false, false, false, NULL, NULL }
};
return simd_abi;
}
+/* Return the descriptor of the SVE PCS. */
+
+static const predefined_function_abi &
+aarch64_sve_abi (void)
+{
+ predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
+ if (!sve_abi.initialized_p ())
+ {
+ HARD_REG_SET full_reg_clobbers
+ = default_function_abi.full_reg_clobbers ();
+ for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
+ CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
+ for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
+ CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
+ sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
+ }
+ return sve_abi;
+}
+
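+/* The effect is that, relative to the base PCS, calls that use the SVE PCS
+ are assumed to preserve Z8-Z23 and P4-P11 in full, so those registers are
+ removed from the set of fully-clobbered registers above.  */
+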
/* Generate code to enable conditional branches in functions over 1 MiB. */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
" vector types", "+nofp");
}
+/* Report when we try to do something that requires SVE when SVE is disabled.
+ This is an error of last resort and isn't very high-quality. It usually
+ involves attempts to measure the vector length in some way. */
+static void
+aarch64_report_sve_required (void)
+{
+ static bool reported_p = false;
+
+ /* Avoid reporting a slew of messages for a single oversight. */
+ if (reported_p)
+ return;
+
+ error ("this operation requires the SVE ISA extension");
+ inform (input_location, "you can enable SVE using the command-line"
+ " option %<-march%>, or by using the %<target%>"
+ " attribute or pragma");
+ reported_p = true;
+}
+
+/* Return true if REGNO is P0-P15 or one of the special FFR-related
+ registers. */
+inline bool
+pr_or_ffr_regnum_p (unsigned int regno)
+{
+ return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
+}
+
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
case E_VNx4HImode:
/* Partial SVE SI vector. */
case E_VNx2SImode:
+ /* Partial SVE HF vectors. */
+ case E_VNx2HFmode:
+ case E_VNx4HFmode:
+ /* Partial SVE SF vector. */
+ case E_VNx2SFmode:
return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
case E_VNx16QImode:
return false;
}
+/* MODE is some form of SVE vector mode. For data modes, return the number
+ of vector register bits that each element of MODE occupies, such as 64
+ for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
+ in a 64-bit container). For predicate modes, return the number of
+ data bits controlled by each significant predicate bit. */
+
+static unsigned int
+aarch64_sve_container_bits (machine_mode mode)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
+ ? BITS_PER_SVE_VECTOR
+ : GET_MODE_BITSIZE (mode));
+ return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
+}
+
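+/* For example, the full mode VNx2DImode and the partial modes VNx2SImode
+ and VNx2HFmode all use 64-bit containers, whereas the full mode VNx4SImode
+ uses containers that match its 32-bit elements.  For the predicate mode
+ VNx16BImode, each significant predicate bit controls one byte of data,
+ so this returns 8.  */
+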
/* Return the SVE predicate mode to use for elements that have
ELEM_NBYTES bytes, if such a mode exists. */
return opt_machine_mode ();
}
+/* Return the SVE predicate mode that should be used to control
+ SVE mode MODE. */
+
+machine_mode
+aarch64_sve_pred_mode (machine_mode mode)
+{
+ unsigned int bits = aarch64_sve_container_bits (mode);
+ return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
+}
+
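+/* For example, the full vector mode VNx4SImode is controlled by VNx4BImode
+ (one predicate bit per 32-bit container), whereas the partial vector mode
+ VNx2SImode is controlled by VNx2BImode, since each of its 32-bit elements
+ occupies a 64-bit container.  */
+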
/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
static opt_machine_mode
-aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
+aarch64_get_mask_mode (machine_mode mode)
{
- if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
- {
- unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
- machine_mode pred_mode;
- if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
- return pred_mode;
- }
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ if (vec_flags & VEC_SVE_DATA)
+ return aarch64_sve_pred_mode (mode);
- return default_get_mask_mode (nunits, nbytes);
+ return default_get_mask_mode (mode);
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
-static opt_machine_mode
+opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
- unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
+ poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+ ? BITS_PER_SVE_VECTOR
+ : GET_MODE_BITSIZE (mode));
+ unsigned int elt_bits = vector_element_size (vector_bits,
GET_MODE_NUNITS (mode));
return int_mode_for_size (elt_bits, 0).require ();
}
+/* Return an integer element mode that contains exactly
+ aarch64_sve_container_bits (MODE) bits. This is wider than
+ aarch64_sve_element_int_mode if MODE is a partial vector,
+ otherwise it's the same. */
+
+static scalar_int_mode
+aarch64_sve_container_int_mode (machine_mode mode)
+{
+ return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
+}
+
/* Return the integer vector mode associated with SVE mode MODE.
- Unlike mode_for_int_vector, this can handle the case in which
+ Unlike related_int_vector_mode, this can handle the case in which
MODE is a predicate (and thus has a different total size). */
-static machine_mode
+machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
+/* Implement TARGET_VECTORIZE_RELATED_MODE. */
+
+static opt_machine_mode
+aarch64_vectorize_related_mode (machine_mode vector_mode,
+ scalar_mode element_mode,
+ poly_uint64 nunits)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
+
+ /* If we're operating on SVE vectors, try to return an SVE mode. */
+ poly_uint64 sve_nunits;
+ if ((vec_flags & VEC_SVE_DATA)
+ && multiple_p (BYTES_PER_SVE_VECTOR,
+ GET_MODE_SIZE (element_mode), &sve_nunits))
+ {
+ machine_mode sve_mode;
+ if (maybe_ne (nunits, 0U))
+ {
+ /* Try to find a full or partial SVE mode with exactly
+ NUNITS units. */
+ if (multiple_p (sve_nunits, nunits)
+ && aarch64_sve_data_mode (element_mode,
+ nunits).exists (&sve_mode))
+ return sve_mode;
+ }
+ else
+ {
+ /* Take the preferred number of units from the number of bytes
+ that fit in VECTOR_MODE. We always start by "autodetecting"
+ a full vector mode with preferred_simd_mode, so vectors
+ chosen here will also be full vector modes. Then
+ autovectorize_vector_modes tries smaller starting modes
+ and thus smaller preferred numbers of units. */
+ sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
+ if (aarch64_sve_data_mode (element_mode,
+ sve_nunits).exists (&sve_mode))
+ return sve_mode;
+ }
+ }
+
+ /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
+ if ((vec_flags & VEC_ADVSIMD)
+ && known_eq (nunits, 0U)
+ && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
+ && maybe_ge (GET_MODE_BITSIZE (element_mode)
+ * GET_MODE_NUNITS (vector_mode), 128U))
+ {
+ machine_mode res = aarch64_simd_container_mode (element_mode, 128);
+ if (VECTOR_MODE_P (res))
+ return res;
+ }
+
+ return default_vectorize_related_mode (vector_mode, element_mode, nunits);
+}
+
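+/* Two informal worked examples of the preferences above, both with NUNITS
+ of zero: asking for an SImode variant of the SVE mode VNx8HImode yields
+ the full vector mode VNx4SImode, while asking for an SImode variant of the
+ 64-bit Advanced SIMD mode V4HImode yields the single 128-bit vector mode
+ V4SImode rather than a pair of 64-bit vectors.  */
+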
/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
prefer to use the first arithmetic operand as the else value if
the else value doesn't matter, since that exactly matches the SVE
case PR_REGS:
case PR_LO_REGS:
case PR_HI_REGS:
+ case FFR_REGS:
+ case PR_AND_FFR_REGS:
return 1;
default:
return CEIL (lowest_size, UNITS_PER_WORD);
return mode == DImode;
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
- /* At the moment, partial vector modes are only useful for memory
- references, but that could change in future. */
- if (vec_flags & VEC_PARTIAL)
- return false;
-
if (vec_flags & VEC_SVE_PRED)
- return PR_REGNUM_P (regno);
+ return pr_or_ffr_regnum_p (regno);
- if (PR_REGNUM_P (regno))
- return 0;
+ if (pr_or_ffr_regnum_p (regno))
+ return false;
if (regno == SP_REGNUM)
/* The purpose of comparing with ptr_mode is to support the
if (GP_REGNUM_P (regno))
{
+ if (vec_flags & VEC_ANY_SVE)
+ return false;
if (known_le (GET_MODE_SIZE (mode), 8))
return true;
- else if (known_le (GET_MODE_SIZE (mode), 16))
+ if (known_le (GET_MODE_SIZE (mode), 16))
return (regno & 1) == 0;
}
else if (FP_REGNUM_P (regno))
return false;
}
+/* Return true if TYPE is a type that should be passed or returned in
+ SVE registers, assuming enough registers are available. When returning
+ true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
+ respectively. */
+
+static bool
+aarch64_sve_argument_p (const_tree type, unsigned int *num_zr,
+ unsigned int *num_pr)
+{
+ if (aarch64_sve::svbool_type_p (type))
+ {
+ *num_pr = 1;
+ *num_zr = 0;
+ return true;
+ }
+
+ if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type))
+ {
+ *num_pr = 0;
+ *num_zr = nvectors;
+ return true;
+ }
+
+ return false;
+}
+
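+/* For example, assuming the ACLE types from <arm_sve.h>: an svbool_t
+ argument sets *NUM_PR to 1 and *NUM_ZR to 0, a single data vector such as
+ svfloat32_t sets *NUM_ZR to 1 and *NUM_PR to 0, and any type that is not
+ an SVE vector or predicate is left to the normal AAPCS64 rules.  */
+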
+/* Return true if a function with type FNTYPE returns its value in
+ SVE vector or predicate registers. */
+
+static bool
+aarch64_returns_value_in_sve_regs_p (const_tree fntype)
+{
+ unsigned int num_zr, num_pr;
+ tree return_type = TREE_TYPE (fntype);
+ return (return_type != error_mark_node
+ && aarch64_sve_argument_p (return_type, &num_zr, &num_pr));
+}
+
+/* Return true if a function with type FNTYPE takes arguments in
+ SVE vector or predicate registers. */
+
+static bool
+aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
+{
+ CUMULATIVE_ARGS args_so_far_v;
+ aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
+ NULL_TREE, 0, true);
+ cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
+
+ for (tree chain = TYPE_ARG_TYPES (fntype);
+ chain && chain != void_list_node;
+ chain = TREE_CHAIN (chain))
+ {
+ tree arg_type = TREE_VALUE (chain);
+ if (arg_type == error_mark_node)
+ return false;
+
+ function_arg_info arg (arg_type, /*named=*/true);
+ apply_pass_by_reference_rules (&args_so_far_v, arg);
+ unsigned int num_zr, num_pr;
+ if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
+ return true;
+
+ targetm.calls.function_arg_advance (args_so_far, arg);
+ }
+ return false;
+}
+
/* Implement TARGET_FNTYPE_ABI. */
static const predefined_function_abi &
{
if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
return aarch64_simd_abi ();
+
+ if (aarch64_returns_value_in_sve_regs_p (fntype)
+ || aarch64_takes_arguments_in_sve_regs_p (fntype))
+ return aarch64_sve_abi ();
+
return default_function_abi;
}
-/* Return true if this is a definition of a vectorized simd function. */
+/* Return true if we should emit CFI for register REGNO. */
static bool
-aarch64_simd_decl_p (tree fndecl)
+aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
- tree fntype;
-
- if (fndecl == NULL)
- return false;
- fntype = TREE_TYPE (fndecl);
- if (fntype == NULL)
- return false;
-
- /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
- if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
- return true;
-
- return false;
+ return (GP_REGNUM_P (regno)
+ || !default_function_abi.clobbers_full_reg_p (regno));
}
-/* Return the mode a register save/restore should use. DImode for integer
- registers, DFmode for FP registers in non-SIMD functions (they only save
- the bottom half of a 128 bit register), or TFmode for FP registers in
- SIMD functions. */
+/* Return the mode we should use to save and restore register REGNO. */
static machine_mode
-aarch64_reg_save_mode (tree fndecl, unsigned regno)
+aarch64_reg_save_mode (unsigned int regno)
{
- return GP_REGNUM_P (regno)
- ? E_DImode
- : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+ if (GP_REGNUM_P (regno))
+ return DImode;
+
+ if (FP_REGNUM_P (regno))
+ switch (crtl->abi->id ())
+ {
+ case ARM_PCS_AAPCS64:
+ /* Only the low 64 bits are saved by the base PCS. */
+ return DFmode;
+
+ case ARM_PCS_SIMD:
+ /* The vector PCS saves the low 128 bits (which is the full
+ register on non-SVE targets). */
+ return TFmode;
+
+ case ARM_PCS_SVE:
+ /* Use vectors of DImode for registers that need frame
+ information, so that the first 64 bytes of the save slot
+ are always the equivalent of what storing D<n> would give. */
+ if (aarch64_emit_cfi_for_reg_p (regno))
+ return VNx2DImode;
+
+ /* Use vectors of bytes otherwise, so that the layout is
+ endian-agnostic, and so that we can use LDR and STR for
+ big-endian targets. */
+ return VNx16QImode;
+
+ case ARM_PCS_TLSDESC:
+ case ARM_PCS_UNKNOWN:
+ break;
+ }
+
+ if (PR_REGNUM_P (regno))
+ /* Save the full predicate register. */
+ return VNx16BImode;
+
+ gcc_unreachable ();
}
/* Implement TARGET_INSN_CALLEE_ABI. */
unsigned int regno,
machine_mode mode)
{
- if (FP_REGNUM_P (regno))
+ if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
{
poly_int64 per_register_size = GET_MODE_SIZE (mode);
unsigned int nregs = hard_regno_nregs (regno, mode);
return mask & -mask;
}
+/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
+ return that predicate mode, otherwise return opt_machine_mode (). */
+
+opt_machine_mode
+aarch64_ptrue_all_mode (rtx x)
+{
+ gcc_assert (GET_MODE (x) == VNx16BImode);
+ if (GET_CODE (x) != CONST_VECTOR
+ || !CONST_VECTOR_DUPLICATE_P (x)
+ || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
+ || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
+ return opt_machine_mode ();
+
+ unsigned int nelts = const_vector_encoded_nelts (x);
+ for (unsigned int i = 1; i < nelts; ++i)
+ if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
+ return opt_machine_mode ();
+
+ return aarch64_sve_pred_mode (nelts);
+}
+
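+/* For example, a VNx16BImode constant that repeats the encoded sequence
+ { 1, 0 } is the canonical PTRUE for 2-byte elements and gives VNx8BImode,
+ while one that repeats { 1, 0, 0, 0 } gives VNx4BImode.  */
+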
/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
that the constant would have with predicate element size ELT_SIZE
(ignoring the upper bits in each element) and return:
the corresponding SVE predicate mode. Use TARGET for the result
if it's nonnull and convenient. */
-static rtx
+rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
machine_mode src_mode = GET_MODE (src);
src, CONST0_RTX (src_mode));
}
+/* Return the assembly token for svprfop value PRFOP. */
+
+static const char *
+svprfop_token (enum aarch64_svprfop prfop)
+{
+ switch (prfop)
+ {
+#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
+ AARCH64_FOR_SVPRFOP (CASE)
+#undef CASE
+ case AARCH64_NUM_SVPRFOPS:
+ break;
+ }
+ gcc_unreachable ();
+}
+
+/* Return the assembly string for an SVE prefetch operation with
+ mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
+ and that SUFFIX is the format for the remaining operands. */
+
+char *
+aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
+ const char *suffix)
+{
+ static char buffer[128];
+ aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
+ unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
+ mnemonic, svprfop_token (prfop), suffix);
+ gcc_assert (written < sizeof (buffer));
+ return buffer;
+}
+
+/* Check whether we can calculate the number of elements in PATTERN
+ at compile time, given that there are NELTS_PER_VQ elements per
+ 128-bit block. Return the value if so, otherwise return -1. */
+
+HOST_WIDE_INT
+aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
+{
+ unsigned int vl, const_vg;
+ if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
+ vl = 1 + (pattern - AARCH64_SV_VL1);
+ else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
+ vl = 16 << (pattern - AARCH64_SV_VL16);
+ else if (aarch64_sve_vg.is_constant (&const_vg))
+ {
+ /* There are two vector granules per quadword. */
+ unsigned int nelts = (const_vg / 2) * nelts_per_vq;
+ switch (pattern)
+ {
+ case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
+ case AARCH64_SV_MUL4: return nelts & -4;
+ case AARCH64_SV_MUL3: return (nelts / 3) * 3;
+ case AARCH64_SV_ALL: return nelts;
+ default: gcc_unreachable ();
+ }
+ }
+ else
+ return -1;
+
+ /* There are two vector granules per quadword. */
+ poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
+ if (known_le (vl, nelts_all))
+ return vl;
+
+ /* Requesting more elements than are available results in a PFALSE. */
+ if (known_gt (vl, nelts_all))
+ return 0;
+
+ return -1;
+}
+
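+/* A worked example of the folding above: with -msve-vector-bits=256
+ (so that aarch64_sve_vg is the constant 4) and NELTS_PER_VQ of 4 (32-bit
+ elements), there are 8 elements in total, so AARCH64_SV_ALL and
+ AARCH64_SV_POW2 fold to 8, AARCH64_SV_MUL3 folds to 6, AARCH64_SV_VL7
+ folds to 7 and AARCH64_SV_VL16 folds to 0 (it asks for more elements than
+ exist).  With a variable vector length, a VL pattern can still be folded
+ if it is known to fit within the minimum vector length.  */
+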
/* Return true if we can move VALUE into a register using a single
CNT[BHWD] instruction. */
value.coeffs[1], 0);
}
+/* Return the asm string for an instruction with a CNT-like vector size
+ operand (a vector pattern followed by a multiplier in the range [1, 16]).
+ PREFIX is the mnemonic without the size suffix and OPERANDS is the
+ first part of the operands template (the part that comes before the
+ vector size itself). CNT_PAT[0..2] are the operands of the
+ UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
+
+char *
+aarch64_output_sve_cnt_pat_immediate (const char *prefix,
+ const char *operands, rtx *cnt_pat)
+{
+ aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
+ unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
+ unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
+ return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
+ factor, nelts_per_vq);
+}
+
/* Return true if we can add X using a single SVE INC or DEC instruction. */
bool
}
machine_mode mode = GET_MODE (dest);
- unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
- machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+ machine_mode pred_mode = aarch64_sve_pred_mode (mode);
rtx ptrue = aarch64_ptrue_reg (pred_mode);
emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
return true;
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
scalar_mode elt_mode = GET_MODE_INNER (mode);
unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
- unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+ unsigned int container_bits = aarch64_sve_container_bits (mode);
+ unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
+
+ if (nelts_per_pattern == 1
+ && encoded_bits <= 128
+ && container_bits != elt_bits)
+ {
+ /* We have a partial vector mode and a constant whose full-vector
+ equivalent would occupy a repeating 128-bit sequence. Build that
+ full-vector equivalent instead, so that we have the option of
+ using LD1RQ and Advanced SIMD operations. */
+ unsigned int repeat = container_bits / elt_bits;
+ machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
+ rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
+ for (unsigned int i = 0; i < npatterns; ++i)
+ for (unsigned int j = 0; j < repeat; ++j)
+ builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
+ target = aarch64_target_reg (target, full_mode);
+ return aarch64_expand_sve_const_vector (target, builder.build ());
+ }
if (nelts_per_pattern == 1 && encoded_bits == 128)
{
{
rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
target = aarch64_target_reg (target, mode);
- emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
+ emit_insn (gen_while (UNSPEC_WHILE_LO, DImode, mode,
+ target, const0_rtx, limit));
return target;
}
folding it into the relocation. */
if (!offset.is_constant (&const_offset))
{
+ if (!TARGET_SVE)
+ {
+ aarch64_report_sve_required ();
+ return;
+ }
if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
emit_insn (gen_rtx_SET (dest, imm));
else
attributes. Unlike gen_lowpart, this doesn't care whether the
mode change is valid. */
-static rtx
+rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
if (GET_MODE (x) == mode)
std::swap (mode_with_wider_elts, mode_with_narrower_elts);
unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
- unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
- machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
+ machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
/* Get the operands in the appropriate modes and emit the instruction. */
ptrue = gen_lowpart (pred_mode, ptrue);
}
static bool
-aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
- tree exp ATTRIBUTE_UNUSED)
+aarch64_function_ok_for_sibcall (tree, tree exp)
{
- if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
+ if (crtl->abi->id () != expr_callee_abi (exp).id ())
return false;
return true;
/* Implement TARGET_PASS_BY_REFERENCE. */
static bool
-aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
+aarch64_pass_by_reference (cumulative_args_t pcum_v,
+ const function_arg_info &arg)
{
+ CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
HOST_WIDE_INT size;
machine_mode dummymode;
int nregs;
+ unsigned int num_zr, num_pr;
+ if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
+ {
+ if (pcum && !pcum->silent_p && !TARGET_SVE)
+ /* We can't gracefully recover at this point, so make this a
+ fatal error. */
+ fatal_error (input_location, "arguments of type %qT require"
+ " the SVE ISA extension", arg.type);
+
+ /* Variadic SVE types are passed by reference. Normal non-variadic
+ arguments are too if we've run out of registers. */
+ return (!arg.named
+ || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
+ || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
+ }
+
/* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
if (arg.mode == BLKmode && arg.type)
size = int_size_in_bytes (arg.type);
return true;
}
-/* Implement TARGET_FUNCTION_VALUE.
- Define how to find the value returned by a function. */
-
+/* Subroutine of aarch64_function_value. MODE is the mode of the argument
+ after promotion, and after partial SVE types have been replaced by
+ their integer equivalents. */
static rtx
-aarch64_function_value (const_tree type, const_tree func,
- bool outgoing ATTRIBUTE_UNUSED)
+aarch64_function_value_1 (const_tree type, machine_mode mode)
{
- machine_mode mode;
- int unsignedp;
- int count;
- machine_mode ag_mode;
+ unsigned int num_zr, num_pr;
+ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
+ {
+ /* Don't raise an error here if we're called when SVE is disabled,
+ since this is really just a query function. Other code must
+ do that where appropriate. */
+ mode = TYPE_MODE_RAW (type);
+ gcc_assert (VECTOR_MODE_P (mode)
+ && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
- mode = TYPE_MODE (type);
- if (INTEGRAL_TYPE_P (type))
- mode = promote_function_mode (type, mode, &unsignedp, func, 1);
+ if (num_zr > 0 && num_pr == 0)
+ return gen_rtx_REG (mode, V0_REGNUM);
+
+ if (num_zr == 0 && num_pr == 1)
+ return gen_rtx_REG (mode, P0_REGNUM);
+
+ gcc_unreachable ();
+ }
+
+ /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
+ returned in memory, not by value. */
+ gcc_assert (!aarch64_sve_mode_p (mode));
if (aarch64_return_in_msb (type))
{
}
}
+ int count;
+ machine_mode ag_mode;
if (aarch64_vfp_is_call_or_return_candidate (mode, type,
&ag_mode, &count, NULL))
{
return gen_rtx_REG (mode, R0_REGNUM);
}
+/* Implement TARGET_FUNCTION_VALUE.
+ Define how to find the value returned by a function. */
+
+static rtx
+aarch64_function_value (const_tree type, const_tree func,
+ bool outgoing ATTRIBUTE_UNUSED)
+{
+ machine_mode mode;
+ int unsignedp;
+
+ mode = TYPE_MODE (type);
+ if (INTEGRAL_TYPE_P (type))
+ mode = promote_function_mode (type, mode, &unsignedp, func, 1);
+
+ /* Vector types can acquire a partial SVE mode using things like
+ __attribute__((vector_size(N))), and this is potentially useful.
+ However, the choice of mode doesn't affect the type's ABI identity,
+ so we should treat the types as though they had the associated
+ integer mode, just like they did before SVE was introduced.
+
+ We know that the vector must be 128 bits or smaller, otherwise we'd
+ have returned it in memory instead. */
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
+ {
+ scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
+ rtx reg = aarch64_function_value_1 (type, int_mode);
+ /* Vector types are never returned in the MSB and are never split. */
+ gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
+ rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
+ return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
+ }
+
+ return aarch64_function_value_1 (type, mode);
+}
+
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
Return true if REGNO is the number of a hard register in which the values
of called function may come back. */
/* Simple scalar types always returned in registers. */
return false;
+ unsigned int num_zr, num_pr;
+ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
+ {
+ /* All SVE types we support fit in registers. For example, it isn't
+ yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
+ predicates. */
+ gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
+ return false;
+ }
+
if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
type,
&ag_mode,
}
/* Layout a function argument according to the AAPCS64 rules. The rule
- numbers refer to the rule numbers in the AAPCS64. */
+ numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
+ mode that was originally given to us by the target hook, whereas the
+ mode in ARG might be the result of replacing partial SVE modes with
+ the equivalent integer mode. */
static void
-aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
- const_tree type,
- bool named ATTRIBUTE_UNUSED)
+aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
+ machine_mode orig_mode)
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
+ tree type = arg.type;
+ machine_mode mode = arg.mode;
int ncrn, nvrn, nregs;
bool allocate_ncrn, allocate_nvrn;
HOST_WIDE_INT size;
if (pcum->aapcs_arg_processed)
return;
+ /* Vector types can acquire a partial SVE mode using things like
+ __attribute__((vector_size(N))), and this is potentially useful.
+ However, the choice of mode doesn't affect the type's ABI identity,
+ so we should treat the types as though they had the associated
+ integer mode, just like they did before SVE was introduced.
+
+ We know that the vector must be 128 bits or smaller, otherwise we'd
+ have passed it by reference instead. */
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
+ {
+ function_arg_info tmp_arg = arg;
+ tmp_arg.mode = int_mode_for_mode (mode).require ();
+ aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
+ if (rtx reg = pcum->aapcs_reg)
+ {
+ gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
+ rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
+ pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
+ }
+ return;
+ }
+
pcum->aapcs_arg_processed = true;
+ unsigned int num_zr, num_pr;
+ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
+ {
+ /* The PCS says that it is invalid to pass an SVE value to an
+ unprototyped function. There is no ABI-defined location we
+ can return in this case, so we have no real choice but to raise
+ an error immediately, even though this is only a query function. */
+ if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
+ {
+ gcc_assert (!pcum->silent_p);
+ error ("SVE type %qT cannot be passed to an unprototyped function",
+ arg.type);
+ /* Avoid repeating the message, and avoid tripping the assert
+ below. */
+ pcum->pcs_variant = ARM_PCS_SVE;
+ }
+
+ /* We would have converted the argument into pass-by-reference
+ form if it didn't fit in registers. */
+ pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
+ pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
+ gcc_assert (arg.named
+ && pcum->pcs_variant == ARM_PCS_SVE
+ && aarch64_sve_mode_p (mode)
+ && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
+ && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
+
+ if (num_zr > 0 && num_pr == 0)
+ pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
+ else if (num_zr == 0 && num_pr == 1)
+ pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
+ else
+ gcc_unreachable ();
+ return;
+ }
+
+ /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
+ passed by reference, not by value. */
+ gcc_assert (!aarch64_sve_mode_p (mode));
+
/* Size in bytes, rounded to the nearest multiple of 8 bytes. */
if (type)
size = int_size_in_bytes (type);
and homogenous short-vector aggregates (HVA). */
if (allocate_nvrn)
{
- if (!TARGET_FLOAT)
+ if (!pcum->silent_p && !TARGET_FLOAT)
aarch64_err_no_fpadvsimd (mode);
if (nvrn + nregs <= NUM_FP_ARG_REGS)
comparison is there because for > 16 * BITS_PER_UNIT
alignment nregs should be > 2 and therefore it should be
passed by reference rather than value. */
- && (aarch64_function_arg_alignment (mode, type, &abi_break)
+ && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
== 16 * BITS_PER_UNIT))
{
if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
on_stack:
pcum->aapcs_stack_words = size / UNITS_PER_WORD;
- if (aarch64_function_arg_alignment (mode, type, &abi_break)
+ if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
== 16 * BITS_PER_UNIT)
{
int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
- || pcum->pcs_variant == ARM_PCS_SIMD);
+ || pcum->pcs_variant == ARM_PCS_SIMD
+ || pcum->pcs_variant == ARM_PCS_SVE);
if (arg.end_marker_p ())
return gen_int_mode (pcum->pcs_variant, DImode);
- aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
+ aarch64_layout_arg (pcum_v, arg, arg.mode);
return pcum->aapcs_reg;
}
const_tree fntype,
rtx libname ATTRIBUTE_UNUSED,
const_tree fndecl ATTRIBUTE_UNUSED,
- unsigned n_named ATTRIBUTE_UNUSED)
+ unsigned n_named ATTRIBUTE_UNUSED,
+ bool silent_p)
{
pcum->aapcs_ncrn = 0;
pcum->aapcs_nvrn = 0;
+ pcum->aapcs_nprn = 0;
pcum->aapcs_nextncrn = 0;
pcum->aapcs_nextnvrn = 0;
+ pcum->aapcs_nextnprn = 0;
if (fntype)
pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
else
pcum->aapcs_arg_processed = false;
pcum->aapcs_stack_words = 0;
pcum->aapcs_stack_size = 0;
+ pcum->silent_p = silent_p;
- if (!TARGET_FLOAT
+ if (!silent_p
+ && !TARGET_FLOAT
&& fndecl && TREE_PUBLIC (fndecl)
&& fntype && fntype != error_mark_node)
{
&mode, &nregs, NULL))
aarch64_err_no_fpadvsimd (TYPE_MODE (type));
}
- return;
+
+ if (!silent_p
+ && !TARGET_SVE
+ && pcum->pcs_variant == ARM_PCS_SVE)
+ {
+ /* We can't gracefully recover at this point, so make this a
+ fatal error. */
+ if (fndecl)
+ fatal_error (input_location, "%qE requires the SVE ISA extension",
+ fndecl);
+ else
+ fatal_error (input_location, "calls to functions of type %qT require"
+ " the SVE ISA extension", fntype);
+ }
}
static void
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
if (pcum->pcs_variant == ARM_PCS_AAPCS64
- || pcum->pcs_variant == ARM_PCS_SIMD)
+ || pcum->pcs_variant == ARM_PCS_SIMD
+ || pcum->pcs_variant == ARM_PCS_SVE)
{
- aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
+ aarch64_layout_arg (pcum_v, arg, arg.mode);
gcc_assert ((pcum->aapcs_reg != NULL_RTX)
!= (pcum->aapcs_stack_words != 0));
pcum->aapcs_arg_processed = false;
pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
+ pcum->aapcs_nprn = pcum->aapcs_nextnprn;
pcum->aapcs_stack_size += pcum->aapcs_stack_words;
pcum->aapcs_stack_words = 0;
pcum->aapcs_reg = NULL_RTX;
ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
HOST_WIDE_INT stack_clash_probe_interval
- = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ = 1 << param_stack_clash_protection_guard_size;
/* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
xops[0] = reg1;
static void
aarch64_layout_frame (void)
{
- HOST_WIDE_INT offset = 0;
+ poly_int64 offset = 0;
int regno, last_fp_reg = INVALID_REGNUM;
- bool simd_function = (crtl->abi->id () == ARM_PCS_SIMD);
+ machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
+ poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
+ bool frame_related_fp_reg_p = false;
aarch64_frame &frame = cfun->machine->frame;
frame.emit_frame_chain = aarch64_needs_frame_chain ();
frame.wb_candidate1 = INVALID_REGNUM;
frame.wb_candidate2 = INVALID_REGNUM;
+ frame.spare_pred_reg = INVALID_REGNUM;
/* First mark all the registers that really need to be saved... */
- for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
- frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
-
- for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
/* ... that includes the eh data registers (if needed)... */
{
frame.reg_offset[regno] = SLOT_REQUIRED;
last_fp_reg = regno;
+ if (aarch64_emit_cfi_for_reg_p (regno))
+ frame_related_fp_reg_p = true;
}
+ /* Big-endian SVE frames need a spare predicate register in order
+ to save Z8-Z15. Decide which register they should use. Prefer
+ an unused argument register if possible, so that we don't force P4
+ to be saved unnecessarily. */
+ if (frame_related_fp_reg_p
+ && crtl->abi->id () == ARM_PCS_SVE
+ && BYTES_BIG_ENDIAN)
+ {
+ bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
+ for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
+ if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
+ break;
+ gcc_assert (regno <= P7_REGNUM);
+ frame.spare_pred_reg = regno;
+ df_set_regs_ever_live (regno, true);
+ }
+
+ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+ if (df_regs_ever_live_p (regno)
+ && !fixed_regs[regno]
+ && !crtl->abi->clobbers_full_reg_p (regno))
+ frame.reg_offset[regno] = SLOT_REQUIRED;
+
+ /* With stack-clash, LR must be saved in non-leaf functions. */
+ gcc_assert (crtl->is_leaf
+ || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
+
+ /* Now assign stack slots for the registers. Start with the predicate
+ registers, since predicate LDR and STR have a relatively small
+ offset range. These saves happen below the hard frame pointer. */
+ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
+ frame.reg_offset[regno] = offset;
+ offset += BYTES_PER_SVE_PRED;
+ }
+
+ /* We save a maximum of 8 predicate registers, and since vector
+ registers are 8 times the size of a predicate register, all the
+ saved predicates fit within a single vector. Doing this also
+ rounds the offset to a 128-bit boundary. */
+ if (maybe_ne (offset, 0))
+ {
+ gcc_assert (known_le (offset, vector_save_size));
+ offset = vector_save_size;
+ }
+
+ /* If we need to save any SVE vector registers, add them next. */
+ if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
+ frame.reg_offset[regno] = offset;
+ offset += vector_save_size;
+ }
+
+ /* OFFSET is now the offset of the hard frame pointer from the bottom
+ of the callee save area. */
+ bool saves_below_hard_fp_p = maybe_ne (offset, 0);
+ frame.below_hard_fp_saved_regs_size = offset;
if (frame.emit_frame_chain)
{
/* FP and LR are placed in the linkage record. */
- frame.reg_offset[R29_REGNUM] = 0;
+ frame.reg_offset[R29_REGNUM] = offset;
frame.wb_candidate1 = R29_REGNUM;
- frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
+ frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
frame.wb_candidate2 = R30_REGNUM;
- offset = 2 * UNITS_PER_WORD;
+ offset += 2 * UNITS_PER_WORD;
}
- /* With stack-clash, LR must be saved in non-leaf functions. */
- gcc_assert (crtl->is_leaf
- || frame.reg_offset[R30_REGNUM] != SLOT_NOT_REQUIRED);
-
- /* Now assign stack slots for them. */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
- if (frame.reg_offset[regno] == SLOT_REQUIRED)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
{
frame.reg_offset[regno] = offset;
if (frame.wb_candidate1 == INVALID_REGNUM)
offset += UNITS_PER_WORD;
}
- HOST_WIDE_INT max_int_offset = offset;
- offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
- bool has_align_gap = offset != max_int_offset;
+ poly_int64 max_int_offset = offset;
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ bool has_align_gap = maybe_ne (offset, max_int_offset);
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
- if (frame.reg_offset[regno] == SLOT_REQUIRED)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
{
/* If there is an alignment gap between integer and fp callee-saves,
allocate the last fp register to it if possible. */
if (regno == last_fp_reg
&& has_align_gap
- && !simd_function
- && (offset & 8) == 0)
+ && known_eq (vector_save_size, 8)
+ && multiple_p (offset, 16))
{
frame.reg_offset[regno] = max_int_offset;
break;
else if (frame.wb_candidate2 == INVALID_REGNUM
&& frame.wb_candidate1 >= V0_REGNUM)
frame.wb_candidate2 = regno;
- offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
+ offset += vector_save_size;
}
- offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
frame.saved_regs_size = offset;
- HOST_WIDE_INT varargs_and_saved_regs_size
- = offset + frame.saved_varargs_size;
+ poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
- frame.hard_fp_offset
+ poly_int64 above_outgoing_args
= aligned_upper_bound (varargs_and_saved_regs_size
+ get_frame_size (),
STACK_BOUNDARY / BITS_PER_UNIT);
+ frame.hard_fp_offset
+ = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
+
/* Both these values are already aligned. */
gcc_assert (multiple_p (crtl->outgoing_args_size,
STACK_BOUNDARY / BITS_PER_UNIT));
- frame.frame_size = frame.hard_fp_offset + crtl->outgoing_args_size;
+ frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
frame.locals_offset = frame.saved_varargs_size;
frame.initial_adjust = 0;
frame.final_adjust = 0;
frame.callee_adjust = 0;
+ frame.sve_callee_adjust = 0;
frame.callee_offset = 0;
HOST_WIDE_INT max_push_offset = 0;
max_push_offset = 256;
HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
+ HOST_WIDE_INT const_saved_regs_size;
if (frame.frame_size.is_constant (&const_size)
&& const_size < max_push_offset
- && known_eq (crtl->outgoing_args_size, 0))
+ && known_eq (frame.hard_fp_offset, const_size))
{
/* Simple, small frame with no outgoing arguments:
+
stp reg1, reg2, [sp, -frame_size]!
stp reg3, reg4, [sp, 16] */
frame.callee_adjust = const_size;
}
else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
- && const_outgoing_args_size + frame.saved_regs_size < 512
+ && frame.saved_regs_size.is_constant (&const_saved_regs_size)
+ && const_outgoing_args_size + const_saved_regs_size < 512
+ /* We could handle this case even with outgoing args, provided
+ that the number of args left us with valid offsets for all
+ predicate and vector save slots. It's such a rare case that
+ it hardly seems worth the effort though. */
+ && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
&& !(cfun->calls_alloca
&& frame.hard_fp_offset.is_constant (&const_fp_offset)
&& const_fp_offset < max_push_offset))
{
/* Frame with small outgoing arguments:
+
sub sp, sp, frame_size
stp reg1, reg2, [sp, outgoing_args_size]
stp reg3, reg4, [sp, outgoing_args_size + 16] */
frame.initial_adjust = frame.frame_size;
frame.callee_offset = const_outgoing_args_size;
}
+ else if (saves_below_hard_fp_p
+ && known_eq (frame.saved_regs_size,
+ frame.below_hard_fp_saved_regs_size))
+ {
+ /* Frame in which all saves are SVE saves:
+
+ sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
+ save SVE registers relative to SP
+ sub sp, sp, outgoing_args_size */
+ frame.initial_adjust = (frame.hard_fp_offset
+ + frame.below_hard_fp_saved_regs_size);
+ frame.final_adjust = crtl->outgoing_args_size;
+ }
else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
&& const_fp_offset < max_push_offset)
{
- /* Frame with large outgoing arguments but a small local area:
+ /* Frame with large outgoing arguments or SVE saves, but with
+ a small local area:
+
stp reg1, reg2, [sp, -hard_fp_offset]!
stp reg3, reg4, [sp, 16]
+ [sub sp, sp, below_hard_fp_saved_regs_size]
+ [save SVE registers relative to SP]
sub sp, sp, outgoing_args_size */
frame.callee_adjust = const_fp_offset;
+ frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
frame.final_adjust = crtl->outgoing_args_size;
}
else
{
- /* Frame with large local area and outgoing arguments using frame pointer:
+ /* Frame with large local area and outgoing arguments or SVE saves,
+ using frame pointer:
+
sub sp, sp, hard_fp_offset
stp x29, x30, [sp, 0]
add x29, sp, 0
stp reg3, reg4, [sp, 16]
+ [sub sp, sp, below_hard_fp_saved_regs_size]
+ [save SVE registers relative to SP]
sub sp, sp, outgoing_args_size */
frame.initial_adjust = frame.hard_fp_offset;
+ frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
frame.final_adjust = crtl->outgoing_args_size;
}
/* Make sure the individual adjustments add up to the full frame size. */
gcc_assert (known_eq (frame.initial_adjust
+ frame.callee_adjust
+ + frame.sve_callee_adjust
+ frame.final_adjust, frame.frame_size));
frame.laid_out = true;
static bool
aarch64_register_saved_on_entry (int regno)
{
- return cfun->machine->frame.reg_offset[regno] >= 0;
+ return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}
/* Return the next register up from REGNO up to LIMIT for the callee
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
rtx_insn *insn;
- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
+ machine_mode mode = aarch64_reg_save_mode (regno1);
if (regno2 == INVALID_REGNUM)
return aarch64_pushwb_single_reg (mode, regno1, adjustment);
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
rtx *cfi_ops)
{
- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
+ machine_mode mode = aarch64_reg_save_mode (regno1);
rtx reg1 = gen_rtx_REG (mode, regno1);
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
if its LR is pushed onto stack. */
return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
|| (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
- && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
+ && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
/* Return TRUE if Branch Target Identification Mechanism is enabled. */
return (aarch64_enable_bti == 1);
}
+/* The caller is going to use ST1D or LD1D to save or restore an SVE
+ register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
+ the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
+
+ (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
+ or LD1D address
+
+ (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
+ if the variable isn't already nonnull
+
+ (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
+ Handle this case using a temporary base register that is suitable for
+ all offsets in that range. Use ANCHOR_REG as this base register if it
+ is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
+
+static inline void
+aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
+ rtx &anchor_reg, poly_int64 &offset,
+ rtx &ptrue)
+{
+ if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
+ {
+ /* This is the maximum valid offset of the anchor from the base.
+ Lower values would be valid too. */
+ poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
+ if (!anchor_reg)
+ {
+ anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
+ emit_insn (gen_add3_insn (anchor_reg, base_rtx,
+ gen_int_mode (anchor_offset, Pmode)));
+ }
+ base_rtx = anchor_reg;
+ offset -= anchor_offset;
+ }
+ if (!ptrue)
+ {
+ int pred_reg = cfun->machine->frame.spare_pred_reg;
+ emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
+ CONSTM1_RTX (VNx16BImode));
+ ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
+ }
+}
+
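+/* A sketch of the effect for a VNx2DImode slot: offsets of [1, 7] times the
+ vector size are already valid scaled immediates for ST1D and LD1D, while
+ offsets of [8, 16] times the vector size become [-8, 0] times the vector
+ size relative to ANCHOR_REG, which again fits the signed immediate range
+ of those instructions.  */
+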
+/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
+ is saved at BASE + OFFSET. */
+
+static void
+aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
+ rtx base, poly_int64 offset)
+{
+ rtx mem = gen_frame_mem (GET_MODE (reg),
+ plus_constant (Pmode, base, offset));
+ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
+}
+
/* Emit code to save the callee-saved registers from register number START
to LIMIT to the stack at the location starting at offset START_OFFSET,
- skipping any write-back candidates if SKIP_WB is true. */
+ skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
+ is true if the hard frame pointer has been set up. */
static void
-aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
- unsigned start, unsigned limit, bool skip_wb)
+aarch64_save_callee_saves (poly_int64 start_offset,
+ unsigned start, unsigned limit, bool skip_wb,
+ bool hard_fp_valid_p)
{
rtx_insn *insn;
unsigned regno;
unsigned regno2;
+ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
for (regno = aarch64_next_callee_save (start, limit);
regno <= limit;
{
rtx reg, mem;
poly_int64 offset;
- int offset_diff;
+ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
if (skip_wb
&& (regno == cfun->machine->frame.wb_candidate1
continue;
if (cfun->machine->reg_is_wrapped_separately[regno])
- continue;
+ continue;
+ machine_mode mode = aarch64_reg_save_mode (regno);
reg = gen_rtx_REG (mode, regno);
offset = start_offset + cfun->machine->frame.reg_offset[regno];
- mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
- offset));
+ rtx base_rtx = stack_pointer_rtx;
+ poly_int64 sp_offset = offset;
- regno2 = aarch64_next_callee_save (regno + 1, limit);
- offset_diff = cfun->machine->frame.reg_offset[regno2]
- - cfun->machine->frame.reg_offset[regno];
+ HOST_WIDE_INT const_offset;
+ if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+ offset, ptrue);
+ else if (GP_REGNUM_P (regno)
+ && (!offset.is_constant (&const_offset) || const_offset >= 512))
+ {
+ gcc_assert (known_eq (start_offset, 0));
+ poly_int64 fp_offset
+ = cfun->machine->frame.below_hard_fp_saved_regs_size;
+ if (hard_fp_valid_p)
+ base_rtx = hard_frame_pointer_rtx;
+ else
+ {
+ if (!anchor_reg)
+ {
+ anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
+ emit_insn (gen_add3_insn (anchor_reg, base_rtx,
+ gen_int_mode (fp_offset, Pmode)));
+ }
+ base_rtx = anchor_reg;
+ }
+ offset -= fp_offset;
+ }
+ mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
+ bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
- if (regno2 <= limit
+ if (!aarch64_sve_mode_p (mode)
+ && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && known_eq (GET_MODE_SIZE (mode), offset_diff))
+ && known_eq (GET_MODE_SIZE (mode),
+ cfun->machine->frame.reg_offset[regno2]
+ - cfun->machine->frame.reg_offset[regno]))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
rtx mem2;
- offset = start_offset + cfun->machine->frame.reg_offset[regno2];
- mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
- offset));
+ offset += GET_MODE_SIZE (mode);
+ mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
reg2));
always assumed to be relevant to the frame
calculations; subsequent parts, are only
frame-related if explicitly marked. */
- RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
+ if (aarch64_emit_cfi_for_reg_p (regno2))
+ {
+ if (need_cfa_note_p)
+ aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
+ sp_offset + GET_MODE_SIZE (mode));
+ else
+ RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
+ }
+
regno = regno2;
}
+ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ {
+ insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
+ need_cfa_note_p = true;
+ }
+ else if (aarch64_sve_mode_p (mode))
+ insn = emit_insn (gen_rtx_SET (mem, reg));
else
insn = emit_move_insn (mem, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
+ if (frame_related_p && need_cfa_note_p)
+ aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
}
}
-/* Emit code to restore the callee registers of mode MODE from register
- number START up to and including LIMIT. Restore from the stack offset
- START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
- Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
+/* Emit code to restore the callee registers from register number START
+ up to and including LIMIT. Restore from the stack offset START_OFFSET,
+ skipping any write-back candidates if SKIP_WB is true. Write the
+ appropriate REG_CFA_RESTORE notes into CFI_OPS. */
static void
-aarch64_restore_callee_saves (machine_mode mode,
- poly_int64 start_offset, unsigned start,
+aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
unsigned limit, bool skip_wb, rtx *cfi_ops)
{
- rtx base_rtx = stack_pointer_rtx;
unsigned regno;
unsigned regno2;
poly_int64 offset;
+ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
for (regno = aarch64_next_callee_save (start, limit);
regno <= limit;
regno = aarch64_next_callee_save (regno + 1, limit))
{
+ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
if (cfun->machine->reg_is_wrapped_separately[regno])
- continue;
+ continue;
rtx reg, mem;
- int offset_diff;
if (skip_wb
&& (regno == cfun->machine->frame.wb_candidate1
|| regno == cfun->machine->frame.wb_candidate2))
continue;
+ machine_mode mode = aarch64_reg_save_mode (regno);
reg = gen_rtx_REG (mode, regno);
offset = start_offset + cfun->machine->frame.reg_offset[regno];
+ rtx base_rtx = stack_pointer_rtx;
+ if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+ offset, ptrue);
mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
- regno2 = aarch64_next_callee_save (regno + 1, limit);
- offset_diff = cfun->machine->frame.reg_offset[regno2]
- - cfun->machine->frame.reg_offset[regno];
-
- if (regno2 <= limit
+ if (!aarch64_sve_mode_p (mode)
+ && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && known_eq (GET_MODE_SIZE (mode), offset_diff))
+ && known_eq (GET_MODE_SIZE (mode),
+ cfun->machine->frame.reg_offset[regno2]
+ - cfun->machine->frame.reg_offset[regno]))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
rtx mem2;
- offset = start_offset + cfun->machine->frame.reg_offset[regno2];
+ offset += GET_MODE_SIZE (mode);
mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
regno = regno2;
}
+ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
+ else if (aarch64_sve_mode_p (mode))
+ emit_insn (gen_rtx_SET (reg, mem));
else
emit_move_insn (reg, mem);
- *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
+ if (frame_related_p)
+ *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
}
}
for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
if (aarch64_register_saved_on_entry (regno))
{
+ /* Punt on saves and restores that use ST1D and LD1D. We could
+ try to be smarter, but it would involve making sure that the
+ spare predicate register itself is safe to use at the save
+ and restore points. Also, when a frame pointer is being used,
+ the slots are often out of reach of ST1D and LD1D anyway. */
+ machine_mode mode = aarch64_reg_save_mode (regno);
+ if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ continue;
+
poly_int64 offset = cfun->machine->frame.reg_offset[regno];
- if (!frame_pointer_needed)
- offset += cfun->machine->frame.frame_size
- - cfun->machine->frame.hard_fp_offset;
+
+ /* If the register is saved in the first SVE save slot, we use
+ it as a stack probe for -fstack-clash-protection. */
+ if (flag_stack_clash_protection
+ && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
+ && known_eq (offset, 0))
+ continue;
+
+ /* Get the offset relative to the register we'll use. */
+ if (frame_pointer_needed)
+ offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
+ else
+ offset += crtl->outgoing_args_size;
+
/* Check that we can access the stack slot of the register with one
direct load with no adjustments needed. */
- if (offset_12bit_unsigned_scaled_p (DImode, offset))
+ if (aarch64_sve_mode_p (mode)
+ ? offset_9bit_signed_scaled_p (mode, offset)
+ : offset_12bit_unsigned_scaled_p (mode, offset))
bitmap_set_bit (components, regno);
}
if (frame_pointer_needed)
bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+ /* If the spare predicate register used by big-endian SVE code
+ is call-preserved, it must be saved in the main prologue
+ before any saves that use it. */
+ if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
+ bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
+
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
/* If registers have been chosen to be stored/restored with
|| bitmap_bit_p (gen, regno)
|| bitmap_bit_p (kill, regno)))
{
- unsigned regno2, offset, offset2;
bitmap_set_bit (components, regno);
/* If there is a callee-save at an adjacent offset, add it too
to increase the use of LDP/STP. */
- offset = cfun->machine->frame.reg_offset[regno];
- regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
+ poly_int64 offset = cfun->machine->frame.reg_offset[regno];
+ unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
if (regno2 <= LAST_SAVED_REGNUM)
{
- offset2 = cfun->machine->frame.reg_offset[regno2];
- if ((offset & ~8) == (offset2 & ~8))
+ poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
+ if (regno < regno2
+ ? known_eq (offset + 8, offset2)
+ : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
bitmap_set_bit (components, regno2);
}
}
while (regno != last_regno)
{
- /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
- so DFmode for the vector registers is enough. For simd functions
- we want to save the low 128 bits. */
- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
+ machine_mode mode = aarch64_reg_save_mode (regno);
rtx reg = gen_rtx_REG (mode, regno);
poly_int64 offset = cfun->machine->frame.reg_offset[regno];
- if (!frame_pointer_needed)
- offset += cfun->machine->frame.frame_size
- - cfun->machine->frame.hard_fp_offset;
+ if (frame_pointer_needed)
+ offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
+ else
+ offset += crtl->outgoing_args_size;
+
rtx addr = plus_constant (Pmode, ptr_reg, offset);
rtx mem = gen_frame_mem (mode, addr);
if (regno2 == last_regno)
{
insn = emit_insn (set);
- RTX_FRAME_RELATED_P (insn) = 1;
- if (prologue_p)
- add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
- else
- add_reg_note (insn, REG_CFA_RESTORE, reg);
+ if (frame_related_p)
+ {
+ RTX_FRAME_RELATED_P (insn) = 1;
+ if (prologue_p)
+ add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
+ else
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
break;
}
poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
/* The next register is not of the same class or its offset is not
mergeable with the current one into a pair. */
- if (!satisfies_constraint_Ump (mem)
+ if (aarch64_sve_mode_p (mode)
+ || !satisfies_constraint_Ump (mem)
|| GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
|| (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
|| maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
GET_MODE_SIZE (mode)))
{
insn = emit_insn (set);
- RTX_FRAME_RELATED_P (insn) = 1;
- if (prologue_p)
- add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
- else
- add_reg_note (insn, REG_CFA_RESTORE, reg);
+ if (frame_related_p)
+ {
+ RTX_FRAME_RELATED_P (insn) = 1;
+ if (prologue_p)
+ add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
+ else
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
regno = regno2;
continue;
}
+ bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
+
/* REGNO2 can be saved/restored in a pair with REGNO. */
rtx reg2 = gen_rtx_REG (mode, regno2);
- if (!frame_pointer_needed)
- offset2 += cfun->machine->frame.frame_size
- - cfun->machine->frame.hard_fp_offset;
+ if (frame_pointer_needed)
+ offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
+ else
+ offset2 += crtl->outgoing_args_size;
rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
rtx mem2 = gen_frame_mem (mode, addr2);
rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
else
insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
- RTX_FRAME_RELATED_P (insn) = 1;
- if (prologue_p)
- {
- add_reg_note (insn, REG_CFA_OFFSET, set);
- add_reg_note (insn, REG_CFA_OFFSET, set2);
- }
- else
+ if (frame_related_p || frame_related2_p)
{
- add_reg_note (insn, REG_CFA_RESTORE, reg);
- add_reg_note (insn, REG_CFA_RESTORE, reg2);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ if (prologue_p)
+ {
+ if (frame_related_p)
+ add_reg_note (insn, REG_CFA_OFFSET, set);
+ if (frame_related2_p)
+ add_reg_note (insn, REG_CFA_OFFSET, set2);
+ }
+ else
+ {
+ if (frame_related_p)
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ if (frame_related2_p)
+ add_reg_note (insn, REG_CFA_RESTORE, reg2);
+ }
}
regno = aarch64_get_next_set_bit (components, regno2 + 1);
bool final_adjustment_p)
{
HOST_WIDE_INT guard_size
- = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ = 1 << param_stack_clash_protection_guard_size;
HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
- /* When doing the final adjustment for the outgoing argument size we can't
- assume that LR was saved at position 0. So subtract it's offset from the
- ABI safe buffer so that we don't accidentally allow an adjustment that
- would result in an allocation larger than the ABI buffer without
- probing. */
HOST_WIDE_INT min_probe_threshold
- = final_adjustment_p
- ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
- : guard_size - guard_used_by_caller;
+ = (final_adjustment_p
+ ? guard_used_by_caller
+ : guard_size - guard_used_by_caller);
+ /* When doing the final adjustment for the outgoing arguments, take into
+ account any unprobed space there is above the current SP. There are
+ two cases:
+
+ - When saving SVE registers below the hard frame pointer, we force
+ the lowest save to take place in the prologue before doing the final
+ adjustment (i.e. we don't allow the save to be shrink-wrapped).
+ This acts as a probe at SP, so there is no unprobed space.
+
+ - When there are no SVE register saves, we use the store of the link
+ register as a probe. We can't assume that LR was saved at position 0
+ though, so treat any space below it as unprobed. */
+ if (final_adjustment_p
+ && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
+ {
+ poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
+ if (known_ge (lr_offset, 0))
+ min_probe_threshold -= lr_offset.to_constant ();
+ else
+ gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
+ }
poly_int64 frame_size = cfun->machine->frame.frame_size;
if (flag_stack_clash_protection && !final_adjustment_p)
{
poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
+ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
poly_int64 final_adjust = cfun->machine->frame.final_adjust;
if (known_eq (frame_size, 0))
{
dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
}
- else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
+ else if (known_lt (initial_adjust + sve_callee_adjust,
+ guard_size - guard_used_by_caller)
&& known_lt (final_adjust, guard_used_by_caller))
{
dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
return 0;
}
-/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
- is saved at BASE + OFFSET. */
-
-static void
-aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
- rtx base, poly_int64 offset)
-{
- rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
- add_reg_note (insn, REG_CFA_EXPRESSION,
- gen_rtx_SET (mem, regno_reg_rtx[reg]));
-}
-
/* AArch64 stack frames generated by this compiler look like:
+-------------------------------+
+-------------------------------+ |
| LR' | |
+-------------------------------+ |
- | FP' | / <- hard_frame_pointer_rtx (aligned)
- +-------------------------------+
+ | FP' | |
+ +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
+ | SVE vector registers | | \
+ +-------------------------------+ | | below_hard_fp_saved_regs_size
+ | SVE predicate registers | / /
+ +-------------------------------+
| dynamic allocation |
+-------------------------------+
| padding |
The following registers are reserved during frame layout and should not be
used for any other purpose:
- - r11: Used by stack clash protection when SVE is enabled.
+ - r11: Used by stack clash protection when SVE is enabled, and also
+ as an anchor register when saving and restoring registers
- r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
- r14 and r15: Used for speculation tracking.
- r16(IP0), r17(IP1): Used by indirect tailcalls.
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
poly_int64 final_adjust = cfun->machine->frame.final_adjust;
poly_int64 callee_offset = cfun->machine->frame.callee_offset;
+ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
+ poly_int64 below_hard_fp_saved_regs_size
+ = cfun->machine->frame.below_hard_fp_saved_regs_size;
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
rtx_insn *insn;
+ if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
+ {
+ /* Fold the SVE allocation into the initial allocation.
+	 We don't do this in aarch64_layout_frame to avoid pessimizing
+ the epilogue code. */
+ initial_adjust += sve_callee_adjust;
+ sve_callee_adjust = 0;
+ }
+
/* Sign return address for functions. */
if (aarch64_return_address_signing_enabled ())
{
if (callee_adjust != 0)
aarch64_push_regs (reg1, reg2, callee_adjust);
+ /* The offset of the frame chain record (if any) from the current SP. */
+ poly_int64 chain_offset = (initial_adjust + callee_adjust
+ - cfun->machine->frame.hard_fp_offset);
+ gcc_assert (known_ge (chain_offset, 0));
+
+ /* The offset of the bottom of the save area from the current SP. */
+ poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
+
if (emit_frame_chain)
{
- poly_int64 reg_offset = callee_adjust;
if (callee_adjust == 0)
{
reg1 = R29_REGNUM;
reg2 = R30_REGNUM;
- reg_offset = callee_offset;
- aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
+ aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
+ false, false);
}
+ else
+ gcc_assert (known_eq (chain_offset, 0));
aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
- stack_pointer_rtx, callee_offset,
+ stack_pointer_rtx, chain_offset,
tmp1_rtx, tmp0_rtx, frame_pointer_needed);
if (frame_pointer_needed && !frame_size.is_constant ())
{
/* Change the save slot expressions for the registers that
we've already saved. */
- reg_offset -= callee_offset;
- aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
- reg_offset + UNITS_PER_WORD);
- aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
- reg_offset);
+ aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
+ hard_frame_pointer_rtx, UNITS_PER_WORD);
+ aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
+ hard_frame_pointer_rtx, 0);
}
emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
}
- aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
- callee_adjust != 0 || emit_frame_chain);
- if (crtl->abi->id () == ARM_PCS_SIMD)
- aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0 || emit_frame_chain);
- else
- aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0 || emit_frame_chain);
+ aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
+ callee_adjust != 0 || emit_frame_chain,
+ emit_frame_chain);
+ if (maybe_ne (sve_callee_adjust, 0))
+ {
+ gcc_assert (!flag_stack_clash_protection
+ || known_eq (initial_adjust, 0));
+ aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
+ sve_callee_adjust,
+ !frame_pointer_needed, false);
+ saved_regs_offset += sve_callee_adjust;
+ }
+ aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
+ false, emit_frame_chain);
+ aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || emit_frame_chain,
+ emit_frame_chain);
/* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
poly_int64 final_adjust = cfun->machine->frame.final_adjust;
poly_int64 callee_offset = cfun->machine->frame.callee_offset;
+ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
+ poly_int64 below_hard_fp_saved_regs_size
+ = cfun->machine->frame.below_hard_fp_saved_regs_size;
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
rtx cfi_ops = NULL;
for each allocation. For stack clash we are in a usable state if
the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
HOST_WIDE_INT guard_size
- = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ = 1 << param_stack_clash_protection_guard_size;
HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
- /* We can re-use the registers when the allocation amount is smaller than
- guard_size - guard_used_by_caller because we won't be doing any probes
- then. In such situations the register should remain live with the correct
+ /* We can re-use the registers when:
+
+ (a) the deallocation amount is the same as the corresponding
+ allocation amount (which is false if we combine the initial
+ and SVE callee save allocations in the prologue); and
+
+ (b) the allocation amount doesn't need a probe (which is false
+ if the amount is guard_size - guard_used_by_caller or greater).
+
+ In such situations the register should remain live with the correct
value. */
bool can_inherit_p = (initial_adjust.is_constant ()
- && final_adjust.is_constant ())
+ && final_adjust.is_constant ()
&& (!flag_stack_clash_protection
- || known_lt (initial_adjust,
- guard_size - guard_used_by_caller));
+ || (known_lt (initial_adjust,
+ guard_size - guard_used_by_caller)
+ && known_eq (sve_callee_adjust, 0))));
  /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
bool need_barrier_p
/* If writeback is used when restoring callee-saves, the CFA
is restored on the instruction doing the writeback. */
aarch64_add_offset (Pmode, stack_pointer_rtx,
- hard_frame_pointer_rtx, -callee_offset,
+ hard_frame_pointer_rtx,
+ -callee_offset - below_hard_fp_saved_regs_size,
tmp1_rtx, tmp0_rtx, callee_adjust == 0);
else
/* The case where we need to re-use the register here is very rare, so
immediate doesn't fit. */
aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
- aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
+ /* Restore the vector registers before the predicate registers,
+ so that we can use P4 as a temporary for big-endian SVE frames. */
+ aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0, &cfi_ops);
+ aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
+ false, &cfi_ops);
+ if (maybe_ne (sve_callee_adjust, 0))
+ aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+ aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
+ R0_REGNUM, R30_REGNUM,
callee_adjust != 0, &cfi_ops);
- if (crtl->abi->id () == ARM_PCS_SIMD)
- aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0, &cfi_ops);
- else
- aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0, &cfi_ops);
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
if (PR_REGNUM_P (regno))
return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
+ if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
+ return FFR_REGS;
+
return NO_REGS;
}
machine_mode mode,
secondary_reload_info *sri)
{
- /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
- directly by the *aarch64_sve_mov<mode>_be move pattern. See the
- comment at the head of aarch64-sve.md for more details about the
- big-endian handling. */
- if (BYTES_BIG_ENDIAN
- && reg_class_subset_p (rclass, FP_REGS)
+ /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
+ LDR and STR. See the comment at the head of aarch64-sve.md for
+ more details about the big-endian handling. */
+ if (reg_class_subset_p (rclass, FP_REGS)
&& !((REG_P (x) && HARD_REGISTER_P (x))
|| aarch64_simd_valid_immediate (x, NULL))
- && aarch64_sve_data_mode_p (mode))
+ && mode != VNx16QImode)
{
- sri->icode = CODE_FOR_aarch64_sve_reload_be;
- return NO_REGS;
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ if ((vec_flags & VEC_SVE_DATA)
+ && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+ {
+ sri->icode = CODE_FOR_aarch64_sve_reload_mem;
+ return NO_REGS;
+ }
}
/* If we have to disable direct literal pool loads and stores because the
case PR_REGS:
case PR_LO_REGS:
case PR_HI_REGS:
+ case FFR_REGS:
+ case PR_AND_FFR_REGS:
return 1;
case NO_REGS:
if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
from = GENERAL_REGS;
+ /* Make RDFFR very expensive. In particular, if we know that the FFR
+ contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
+ as a way of obtaining a PTRUE. */
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+ && hard_reg_set_subset_p (reg_class_contents[from_i],
+ reg_class_contents[FFR_REGS]))
+ return 80;
+
/* Moving between GPR and stack cost is the same as GP2GP. */
if ((from == GENERAL_REGS && to == STACK_REG)
|| (to == GENERAL_REGS && from == STACK_REG))
aarch64_init_builtins ()
{
aarch64_general_init_builtins ();
+ aarch64_sve::init_builtins ();
}
/* Implement TARGET_FOLD_BUILTIN. */
{
case AARCH64_BUILTIN_GENERAL:
return aarch64_general_fold_builtin (subcode, type, nargs, args);
+
+ case AARCH64_BUILTIN_SVE:
+ return NULL_TREE;
}
gcc_unreachable ();
}
case AARCH64_BUILTIN_GENERAL:
new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
break;
+
+ case AARCH64_BUILTIN_SVE:
+ new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
+ break;
}
if (!new_stmt)
/* Implement TARGET_EXPAND_BUILTIN. */
static rtx
-aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
+aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
switch (code & AARCH64_BUILTIN_CLASS)
{
case AARCH64_BUILTIN_GENERAL:
- return aarch64_general_expand_builtin (subcode, exp, target);
+ return aarch64_general_expand_builtin (subcode, exp, target, ignore);
+
+ case AARCH64_BUILTIN_SVE:
+ return aarch64_sve::expand_builtin (subcode, exp, target);
}
gcc_unreachable ();
}
{
case AARCH64_BUILTIN_GENERAL:
return aarch64_general_builtin_decl (subcode, initialize_p);
+
+ case AARCH64_BUILTIN_SVE:
+ return aarch64_sve::builtin_decl (subcode, initialize_p);
}
gcc_unreachable ();
}
{
case AARCH64_BUILTIN_GENERAL:
return aarch64_general_builtin_rsqrt (subcode);
+
+ case AARCH64_BUILTIN_SVE:
+ return NULL_TREE;
}
gcc_unreachable ();
}
/* Caller assumes we cannot fail. */
gcc_assert (use_rsqrt_p (mode));
- machine_mode mmsk = mode_for_int_vector (mode).require ();
+ machine_mode mmsk = (VECTOR_MODE_P (mode)
+ ? related_int_vector_mode (mode).require ()
+ : int_mode_for_mode (mode).require ());
rtx xmsk = gen_reg_rtx (mmsk);
if (!recp)
/* When calculating the approximate square root, compare the
}
}
+/* Return true if STMT_INFO extends the result of a load. */
+static bool
+aarch64_extending_load_p (stmt_vec_info stmt_info)
+{
+ gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
+ if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
+ return false;
+
+ tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+ tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
+ tree rhs_type = TREE_TYPE (rhs);
+ if (!INTEGRAL_TYPE_P (lhs_type)
+ || !INTEGRAL_TYPE_P (rhs_type)
+ || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
+ return false;
+
+ stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
+ return (def_stmt_info
+ && STMT_VINFO_DATA_REF (def_stmt_info)
+ && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
+}
+
+/* Return true if STMT_INFO is an integer truncation. */
+static bool
+aarch64_integer_truncation_p (stmt_vec_info stmt_info)
+{
+ gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
+ if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
+ return false;
+
+ tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
+ tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
+ return (INTEGRAL_TYPE_P (lhs_type)
+ && INTEGRAL_TYPE_P (rhs_type)
+ && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
+}
+
+/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
+ for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
+ for SVE targets. */
+static unsigned int
+aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
+ unsigned int stmt_cost)
+{
+ /* Unlike vec_promote_demote, vector_stmt conversions do not change the
+ vector register size or number of units. Integer promotions of this
+ type therefore map to SXT[BHW] or UXT[BHW].
+
+ Most loads have extending forms that can do the sign or zero extension
+ on the fly. Optimistically assume that a load followed by an extension
+ will fold to this form during combine, and that the extension therefore
+ comes for free. */
+ if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
+ stmt_cost = 0;
+
+ /* For similar reasons, vector_stmt integer truncations are a no-op,
+ because we can just ignore the unused upper bits of the source. */
+ if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
+ stmt_cost = 0;
+
+ return stmt_cost;
+}
+
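+/* As a hedged illustration (user-level code with made-up names): the two
+   helpers above match statements such as the widening load of a[i] and the
+   narrowing store into b[i] below, both of which SVE handles for free via
+   extending loads and by ignoring the unused upper bits of each container.
+
+       #include <stdint.h>
+
+       void
+       f (int32_t *restrict dst, const int8_t *restrict a,
+          int8_t *restrict b, const int16_t *restrict c, int n)
+       {
+         for (int i = 0; i < n; ++i)
+           {
+             dst[i] = a[i];           // extension of a load: LD1SB, cost 0
+             b[i] = (int8_t) c[i];    // integer truncation: cost 0
+           }
+       }  */
+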
/* Implement targetm.vectorize.add_stmt_cost. */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
int stmt_cost =
aarch64_builtin_vectorization_cost (kind, vectype, misalign);
+ if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
+ stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
+
/* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily. The value here is
arbitrary and could potentially be improved with analysis. */
/* We don't mind passing in global_options_set here as we don't use
the *options_set structs anyway. */
- maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
- queue_depth,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_sched_autopref_queue_depth, queue_depth);
  /* Set up parameters to be used in the prefetching algorithm.  Do not
override the defaults unless we are tuning for a core we have
researched values for. */
if (aarch64_tune_params.prefetch->num_slots > 0)
- maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
- aarch64_tune_params.prefetch->num_slots,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_simultaneous_prefetches,
+ aarch64_tune_params.prefetch->num_slots);
if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
- maybe_set_param_value (PARAM_L1_CACHE_SIZE,
- aarch64_tune_params.prefetch->l1_cache_size,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l1_cache_size,
+ aarch64_tune_params.prefetch->l1_cache_size);
if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
- maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
- aarch64_tune_params.prefetch->l1_cache_line_size,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l1_cache_line_size,
+ aarch64_tune_params.prefetch->l1_cache_line_size);
if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
- maybe_set_param_value (PARAM_L2_CACHE_SIZE,
- aarch64_tune_params.prefetch->l2_cache_size,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l2_cache_size,
+ aarch64_tune_params.prefetch->l2_cache_size);
if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
- maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
- 0,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_prefetch_dynamic_strides, 0);
if (aarch64_tune_params.prefetch->minimum_stride >= 0)
- maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
- aarch64_tune_params.prefetch->minimum_stride,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_prefetch_minimum_stride,
+ aarch64_tune_params.prefetch->minimum_stride);
/* Use the alternative scheduling-pressure algorithm by default. */
- maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
- opts->x_param_values,
- global_options_set.x_param_values);
-
- /* If the user hasn't changed it via configure then set the default to 64 KB
- for the backend. */
- maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
- DEFAULT_STK_CLASH_GUARD_SIZE == 0
- ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_sched_pressure_algorithm,
+ SCHED_PRESSURE_MODEL);
/* Validate the guard size. */
- int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ int guard_size = param_stack_clash_protection_guard_size;
+
+ if (guard_size != 12 && guard_size != 16)
+ error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
+ "size. Given value %d (%llu KB) is out of range",
+ guard_size, (1ULL << guard_size) / 1024ULL);
  /* Enforce that the probing interval is the same as the guard size so the
     mid-end does the right thing.  */
- maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
- guard_size,
- opts->x_param_values,
- global_options_set.x_param_values);
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_stack_clash_protection_probe_interval,
+ guard_size);
  /* SET_OPTION_IF_UNSET won't update the value if the user has explicitly
     set one, which means we need to validate that the probing interval and
     guard size are equal.  */
int probe_interval
- = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+ = param_stack_clash_protection_probe_interval;
if (guard_size != probe_interval)
error ("stack clash guard size %<%d%> must be equal to probing interval "
"%<%d%>", guard_size, probe_interval);
static bool
aarch64_handle_attr_branch_protection (const char* str)
{
- char *err_str = (char *) xmalloc (strlen (str));
+ char *err_str = (char *) xmalloc (strlen (str) + 1);
enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
&err_str);
bool success = false;
call_used_regs[i] = 1;
}
+ /* Only allow the FFR and FFRT to be accessed via special patterns. */
+ CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
+ CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
+
/* When tracking speculation, we need a couple of call-clobbered registers
to track the speculation state. It would be nice to just use
IP0 and IP1, but currently there are numerous places that just
machine_mode mode;
HOST_WIDE_INT size;
+ /* SVE types (and types containing SVE types) must be handled
+ before calling this function. */
+ gcc_assert (!aarch64_sve::builtin_type_p (type));
+
switch (TREE_CODE (type))
{
case REAL_TYPE:
{
poly_int64 size = -1;
+ if (type && aarch64_sve::builtin_type_p (type))
+ return false;
+
if (type && TREE_CODE (type) == VECTOR_TYPE)
size = int_size_in_bytes (type);
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
int *count,
bool *is_ha)
{
+ if (is_ha != NULL) *is_ha = false;
+
+ if (type && aarch64_sve::builtin_type_p (type))
+ return false;
+
machine_mode new_mode = VOIDmode;
bool composite_p = aarch64_composite_type_p (type, mode);
- if (is_ha != NULL) *is_ha = false;
-
if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
|| aarch64_short_vector_p (type, mode))
{
aarch64_vector_mode_supported_p (machine_mode mode)
{
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
- return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
+ return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
/* Return the full-width SVE vector mode for element mode MODE, if one
/* Return a list of possible vector sizes for the vectorizer
to iterate over. */
-static void
-aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
+static unsigned int
+aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
{
- if (TARGET_SVE)
- sizes->safe_push (BYTES_PER_SVE_VECTOR);
- sizes->safe_push (16);
- sizes->safe_push (8);
+ static const machine_mode sve_modes[] = {
+ /* Try using full vectors for all element types. */
+ VNx16QImode,
+
+ /* Try using 16-bit containers for 8-bit elements and full vectors
+ for wider elements. */
+ VNx8QImode,
+
+ /* Try using 32-bit containers for 8-bit and 16-bit elements and
+ full vectors for wider elements. */
+ VNx4QImode,
+
+ /* Try using 64-bit containers for all element types. */
+ VNx2QImode
+ };
+
+ static const machine_mode advsimd_modes[] = {
+ /* Try using 128-bit vectors for all element types. */
+ V16QImode,
+
+ /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
+ for wider elements. */
+ V8QImode,
+
+ /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
+ for wider elements.
+
+ TODO: We could support a limited form of V4QImode too, so that
+ we use 32-bit vectors for 8-bit elements. */
+ V4HImode,
+
+ /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
+ for 64-bit elements.
+
+ TODO: We could similarly support limited forms of V2QImode and V2HImode
+ for this case. */
+ V2SImode
+ };
+
+ /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
+ This is because:
+
+ - If we can't use N-byte Advanced SIMD vectors then the placement
+ doesn't matter; we'll just continue as though the Advanced SIMD
+ entry didn't exist.
+
+ - If an SVE main loop with N bytes ends up being cheaper than an
+ Advanced SIMD main loop with N bytes then by default we'll replace
+ the Advanced SIMD version with the SVE one.
+
+ - If an Advanced SIMD main loop with N bytes ends up being cheaper
+ than an SVE main loop with N bytes then by default we'll try to
+ use the SVE loop to vectorize the epilogue instead. */
+ unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
+ unsigned int advsimd_i = 0;
+ while (advsimd_i < ARRAY_SIZE (advsimd_modes))
+ {
+ if (sve_i < ARRAY_SIZE (sve_modes)
+ && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
+ GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
+ modes->safe_push (sve_modes[sve_i++]);
+ else
+ modes->safe_push (advsimd_modes[advsimd_i++]);
+ }
+ while (sve_i < ARRAY_SIZE (sve_modes))
+ modes->safe_push (sve_modes[sve_i++]);
+
+ unsigned int flags = 0;
+ /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
+ can compare SVE against Advanced SIMD and so that we can compare
+ multiple SVE vectorization approaches against each other. There's
+ not really any point doing this for Advanced SIMD only, since the
+ first mode that works should always be the best. */
+ if (TARGET_SVE && aarch64_sve_compare_costs)
+ flags |= VECT_COMPARE_COSTS;
+ return flags;
}
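+/* A hedged user-level sketch of what the partial-mode entries above buy
+   (identifiers are illustrative): with the VNx4QI entry, the 8-bit loads
+   below can use one byte per 32-bit container and therefore share the
+   VNx4SI lane layout of the accumulator, so no unpacking is needed.
+
+       #include <stdint.h>
+
+       void
+       widen_add (int32_t *restrict dst, const uint8_t *restrict src, int n)
+       {
+         for (int i = 0; i < n; ++i)
+           dst[i] += src[i];   // src: LD1B z.s; dst: VNx4SI
+       }  */
+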
/* Implement TARGET_MANGLE_TYPE. */
/* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
builtin types. */
if (TYPE_NAME (type) != NULL)
- return aarch64_general_mangle_builtin_type (type);
+ {
+ const char *res;
+ if ((res = aarch64_general_mangle_builtin_type (type))
+ || (res = aarch64_sve::mangle_builtin_type (type)))
+ return res;
+ }
/* Use the default mangling. */
return NULL;
}
+/* Implement TARGET_VERIFY_TYPE_CONTEXT. */
+
+static bool
+aarch64_verify_type_context (location_t loc, type_context_kind context,
+ const_tree type, bool silent_p)
+{
+ return aarch64_sve::verify_type_context (loc, context, type, silent_p);
+}
+
/* Find the first rtx_insn before insn that will generate an assembly
instruction. */
return IN_RANGE (val, 0, 0xff00);
}
+/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
+ instructions. Negate X first if NEGATE_P is true. */
+
+bool
+aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p)
+{
+ rtx elt;
+
+ if (!const_vec_duplicate_p (x, &elt)
+ || !CONST_INT_P (elt))
+ return false;
+
+ if (!aarch64_sve_arith_immediate_p (x, negate_p))
+ return false;
+
+ /* After the optional negation, the immediate must be nonnegative.
+ E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
+ instead of SQADD Zn.B, Zn.B, #129. */
+ return negate_p == (INTVAL (elt) < 0);
+}
+
/* Return true if X is a valid immediate operand for an SVE logical
instruction such as AND. */
bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
- rtx elt;
-
- return (const_vec_duplicate_p (x, &elt)
- && CONST_INT_P (elt)
+ x = unwrap_const_vec_duplicate (x);
+ return (CONST_INT_P (x)
&& (signed_p
- ? IN_RANGE (INTVAL (elt), -16, 15)
- : IN_RANGE (INTVAL (elt), 0, 127)));
+ ? IN_RANGE (INTVAL (x), -16, 15)
+ : IN_RANGE (INTVAL (x), 0, 127)));
}
/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
return false;
}
+/* Return true if X is an UNSPEC_PTRUE constant of the form:
+
+ (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
+
+ where PATTERN is the svpattern as a CONST_INT and where ZERO
+ is a zero constant of the required PTRUE mode (which can have
+ fewer elements than X's mode, if zero bits are significant).
+
+ If so, and if INFO is nonnull, describe the immediate in INFO. */
+bool
+aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
+{
+ if (GET_CODE (x) != CONST)
+ return false;
+
+ x = XEXP (x, 0);
+ if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
+ return false;
+
+ if (info)
+ {
+ aarch64_svpattern pattern
+ = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
+ machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
+ scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
+ *info = simd_immediate_info (int_mode, pattern);
+ }
+ return true;
+}
+
/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
it to describe valid immediates. */
static bool
aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
{
+ if (aarch64_sve_ptrue_svpattern_p (x, info))
+ return true;
+
if (x == CONST0_RTX (GET_MODE (x)))
{
if (info)
return false;
if (info)
- *info = simd_immediate_info (elt_mode, base, step);
+ {
+	  /* Get the corresponding container mode.  E.g. an INDEX on VNx2SI
+	     should yield two integer values per 128-bit block, meaning
+	     that we need to treat it in the same way as VNx2DI and then
+ ignore the upper 32 bits of each element. */
+ elt_mode = aarch64_sve_container_int_mode (mode);
+ *info = simd_immediate_info (elt_mode, base, step);
+ }
return true;
}
else if (GET_CODE (op) == CONST_VECTOR
}
}
- unsigned int elt_size = GET_MODE_SIZE (elt_mode);
+ /* If all elements in an SVE vector have the same value, we have a free
+ choice between using the element mode and using the container mode.
+ Using the element mode means that unused parts of the vector are
+ duplicates of the used elements, while using the container mode means
+ that the unused parts are an extension of the used elements. Using the
+ element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
+ for its container mode VNx4SI while 0x00000101 isn't.
+
+ If not all elements in an SVE vector have the same value, we need the
+ transition from one element to the next to occur at container boundaries.
+ E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
+ in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
+ scalar_int_mode elt_int_mode;
+ if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
+ elt_int_mode = aarch64_sve_container_int_mode (mode);
+ else
+ elt_int_mode = int_mode_for_mode (elt_mode).require ();
+
+ unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
if (elt_size > 8)
return false;
- scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
-
/* Expand the vector constant out into a byte vector, with the least
significant byte of the register first. */
auto_vec<unsigned char, 16> bytes;
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
+ x = unwrap_const_vec_duplicate (x);
+ if (!CONST_INT_P (x))
+ return false;
int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
if (left)
- return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
+ return IN_RANGE (INTVAL (x), 0, bit_width - 1);
else
- return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
+ return IN_RANGE (INTVAL (x), 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
return true;
- if (aarch64_sve_cnt_immediate_p (x))
+ if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
return true;
return aarch64_classify_symbolic_expression (x)
return false;
}
+/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
+bool
+aarch64_sve_ldff1_operand_p (rtx op)
+{
+ if (!MEM_P (op))
+ return false;
+
+ struct aarch64_address_info addr;
+ if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
+ return false;
+
+ if (addr.type == ADDRESS_REG_IMM)
+ return known_eq (addr.const_offset, 0);
+
+ return addr.type == ADDRESS_REG_REG;
+}
+
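+/* For instance (illustrative operands): [x0] and [x0, x1, lsl #2] are
+   acceptable addresses for an LDFF1W, whereas the LD1W-style immediate
+   form [x0, #1, mul vl] is not, since LDFF1 has no immediate offset.  */
+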
+/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
+bool
+aarch64_sve_ldnf1_operand_p (rtx op)
+{
+ struct aarch64_address_info addr;
+
+ return (MEM_P (op)
+ && aarch64_classify_address (&addr, XEXP (op, 0),
+ GET_MODE (op), false)
+ && addr.type == ADDRESS_REG_IMM);
+}
+
/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
The conditions for STR are the same. */
bool
&& addr.type == ADDRESS_REG_IMM);
}
+/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
+ addressing memory of mode MODE. */
+bool
+aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
+{
+ struct aarch64_address_info addr;
+ if (!aarch64_classify_address (&addr, op, mode, false))
+ return false;
+
+ if (addr.type == ADDRESS_REG_IMM)
+ return known_eq (addr.const_offset, 0);
+
+ return addr.type == ADDRESS_REG_REG;
+}
+
/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
We need to be able to access the individual pieces, so the range
is different from LD[234] and ST[234]. */
direct way we have of identifying real SVE predicate types. */
if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
return 16;
- if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
- return 128;
- return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
+ widest_int min_size
+ = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
+ return wi::umin (min_size, 128).to_uhwi ();
}
/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
- if (aarch64_simd_decl_p (decl))
+ if (TREE_CODE (decl) == FUNCTION_DECL)
{
- fprintf (stream, "\t.variant_pcs\t");
- assemble_name (stream, name);
- fprintf (stream, "\n");
+ arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
+ if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
+ {
+ fprintf (stream, "\t.variant_pcs\t");
+ assemble_name (stream, name);
+ fprintf (stream, "\n");
+ }
}
}
return templ;
}
+/* Return the asm template for a PTRUES. CONST_UNSPEC is the
+ aarch64_sve_ptrue_svpattern_immediate that describes the predicate
+ pattern. */
+
+char *
+aarch64_output_sve_ptrues (rtx const_unspec)
+{
+ static char templ[40];
+
+ struct simd_immediate_info info;
+ bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
+ gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
+
+ char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
+ snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
+ svpattern_token (info.u.pattern));
+ return templ;
+}
+
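+/* For example (assuming a predicate of mode VNx4BI and the VL4 pattern),
+   the template above would produce something like:
+
+       ptrues   p0.s, vl4
+
+   i.e. set the first four 32-bit element predicates and update the
+   condition flags.  */
+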
/* Split operands into moves from op[1] + op[2] into op[0]. */
void
{
poly_uint64 nelt = d->perm.length ();
- if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
+ if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
return false;
if (!d->perm.series_p (0, 1, nelt - 1, -1))
if (d->testing_p)
return true;
- machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
+ machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
if (d->one_vector_p)
emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
if (d->testing_p)
return true;
- machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
+ machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
rtx_vector_builder builder (pred_mode, n_patterns, 2);
for (int i = 0; i < n_patterns * 2; i++)
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
rtx *ops)
{
- machine_mode pred_mode
- = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
- GET_MODE_SIZE (cmp_mode)).require ();
+ machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
rtx pred = gen_reg_rtx (pred_mode);
if (FLOAT_MODE_P (cmp_mode))
{
}
}
+ /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
+ && prev_set && curr_set && any_condjump_p (curr)
+ && GET_CODE (SET_SRC (prev_set)) == COMPARE
+ && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
+ && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
+ return true;
+
+ /* Fuse flag-setting ALU instructions and conditional branch. */
+ if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
&& any_condjump_p (curr))
{
unsigned int condreg1, condreg2;
}
}
+ /* Fuse ALU instructions and CBZ/CBNZ. */
if (prev_set
&& curr_set
- && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+ && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
&& any_condjump_p (curr))
{
/* We're trying to match:
aarch64_can_change_mode_class (machine_mode from,
machine_mode to, reg_class_t)
{
+ unsigned int from_flags = aarch64_classify_vector_mode (from);
+ unsigned int to_flags = aarch64_classify_vector_mode (to);
+
+ bool from_sve_p = (from_flags & VEC_ANY_SVE);
+ bool to_sve_p = (to_flags & VEC_ANY_SVE);
+
+ bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
+ bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
+
+ /* Don't allow changes between partial SVE modes and other modes.
+ The contents of partial SVE modes are distributed evenly across
+ the register, whereas GCC expects them to be clustered together. */
+ if (from_partial_sve_p != to_partial_sve_p)
+ return false;
+
+ /* Similarly reject changes between partial SVE modes that have
+ different patterns of significant and insignificant bits. */
+ if (from_partial_sve_p
+ && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
+ || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
+ return false;
+
if (BYTES_BIG_ENDIAN)
{
- bool from_sve_p = aarch64_sve_data_mode_p (from);
- bool to_sve_p = aarch64_sve_data_mode_p (to);
-
/* Don't allow changes between SVE data modes and non-SVE modes.
See the comment at the head of aarch64-sve.md for details. */
if (from_sve_p != to_sve_p)
#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type
+#undef TARGET_VERIFY_TYPE_CONTEXT
+#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
+
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
aarch64_builtin_vectorized_function
-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
- aarch64_autovectorize_vector_sizes
+#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
+#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
+ aarch64_autovectorize_vector_modes
#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
#define TARGET_VECTORIZE_VEC_PERM_CONST \
aarch64_vectorize_vec_perm_const
+#undef TARGET_VECTORIZE_RELATED_MODE
+#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
+#undef TARGET_STRICT_ARGUMENT_NAMING
+#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
+
+#undef TARGET_MD_ASM_ADJUST
+#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-aarch64.h"