PR translation/90118 Missing space between words

[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index b44ee40115dce526c7cc302b2a47c28ab8b41508..10b0bd2d5321ee380292d13eb3973424fd91ff24 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1,5 +1,5 @@
  /* Machine description for AArch64 architecture.
-   Copyright (C) 2009-2018 Free Software Foundation, Inc.
+   Copyright (C) 2009-2019 Free Software Foundation, Inc.
     Contributed by ARM Ltd.
  
     This file is part of GCC.
@@ -40,6 +40,7 @@
  #include "regs.h"
  #include "emit-rtl.h"
  #include "recog.h"
+#include "cgraph.h"
  #include "diagnostic.h"
  #include "insn-attr.h"
  #include "alias.h"
@@ -71,6 +72,7 @@
  #include "selftest.h"
  #include "selftest-rtl.h"
  #include "rtx-vector-builder.h"
+#include "intl.h"
  
  /* This file should be included last.  */
  #include "target-def.h"
@@ -183,6 +185,12 @@ bool aarch64_pcrelative_literal_loads;
  /* Global flag for whether frame pointer is enabled.  */
  bool aarch64_use_frame_pointer;
  
+#define BRANCH_PROTECT_STR_MAX 255
+char *accepted_branch_protection_string = NULL;
+
+static enum aarch64_parse_opt_result
+aarch64_parse_branch_protection (const char*, char**);
+
  /* Support for command line parsing of boolean flags in the tuning
     structures.  */
  struct aarch64_flag_desc
@@ -254,7 +262,7 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
        1, /* ti  */
      },
    1, /* pre_modify  */
-  0, /* post_modify  */
+  1, /* post_modify  */
    0, /* register_offset  */
    1, /* register_sextend  */
    1, /* register_zextend  */
@@ -662,6 +670,17 @@ static const cpu_prefetch_tune tsv110_prefetch_tune =
    -1                    /* default_opt_level  */
  };
  
+static const cpu_prefetch_tune xgene1_prefetch_tune =
+{
+  8,                   /* num_slots  */
+  32,                  /* l1_cache_size  */
+  64,                  /* l1_cache_line_size  */
+  256,                 /* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1                   /* default_opt_level  */
+};
+
  static const struct tune_params generic_tunings =
  {
    &cortexa57_extra_costs,
@@ -670,6 +689,7 @@ static const struct tune_params generic_tunings =
    &generic_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    2, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
@@ -695,6 +715,7 @@ static const struct tune_params cortexa35_tunings =
    &generic_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    1, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -721,6 +742,7 @@ static const struct tune_params cortexa53_tunings =
    &generic_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    2, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -747,6 +769,7 @@ static const struct tune_params cortexa57_tunings =
    &cortexa57_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    3, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -773,6 +796,7 @@ static const struct tune_params cortexa72_tunings =
    &cortexa57_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    3, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -799,6 +823,7 @@ static const struct tune_params cortexa73_tunings =
    &cortexa57_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost.  */
    2, /* issue_rate.  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -827,6 +852,7 @@ static const struct tune_params exynosm1_tunings =
    &exynosm1_vector_cost,
    &generic_branch_cost,
    &exynosm1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4,   /* memmov_cost  */
    3,   /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
@@ -852,6 +878,7 @@ static const struct tune_params thunderxt88_tunings =
    &thunderx_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    6, /* memmov_cost  */
    2, /* issue_rate  */
    AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
@@ -877,6 +904,7 @@ static const struct tune_params thunderx_tunings =
    &thunderx_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    6, /* memmov_cost  */
    2, /* issue_rate  */
    AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
@@ -903,6 +931,7 @@ static const struct tune_params tsv110_tunings =
    &tsv110_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4,    /* memmov_cost  */
    4,    /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
@@ -929,21 +958,48 @@ static const struct tune_params xgene1_tunings =
    &xgene1_vector_cost,
    &generic_branch_cost,
    &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    6, /* memmov_cost  */
    4, /* issue_rate  */
    AARCH64_FUSE_NOTHING, /* fusible_ops  */
    "16",        /* function_align.  */
-  "8", /* jump_align.  */
+  "16",        /* jump_align.  */
    "16",        /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
-  0,   /* max_case_values.  */
+  17,  /* max_case_values.  */
    tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
    (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),       /* tune_flags.  */
-  &generic_prefetch_tune
+  &xgene1_prefetch_tune
+};
+
+static const struct tune_params emag_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  &generic_branch_cost,
+  &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  6, /* memmov_cost  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_NOTHING, /* fusible_ops  */
+  "16",        /* function_align.  */
+  "16",        /* jump_align.  */
+  "16",        /* loop_align.  */
+  2,   /* int_reassoc_width.  */
+  4,   /* fp_reassoc_width.  */
+  1,   /* vec_reassoc_width.  */
+  2,   /* min_div_recip_mul_sf.  */
+  2,   /* min_div_recip_mul_df.  */
+  17,  /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),       /* tune_flags.  */
+  &xgene1_prefetch_tune
  };
  
  static const struct tune_params qdf24xx_tunings =
@@ -954,6 +1010,7 @@ static const struct tune_params qdf24xx_tunings =
    &qdf24xx_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    4, /* issue_rate  */
    (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -982,6 +1039,7 @@ static const struct tune_params saphira_tunings =
    &generic_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost  */
    4, /* issue_rate  */
    (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -1008,6 +1066,7 @@ static const struct tune_params thunderx2t99_tunings =
    &thunderx2t99_vector_cost,
    &generic_branch_cost,
    &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
    4, /* memmov_cost.  */
    4, /* issue_rate.  */
    (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
@@ -1026,6 +1085,32 @@ static const struct tune_params thunderx2t99_tunings =
    &thunderx2t99_prefetch_tune
  };
  
+static const struct tune_params neoversen1_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  4, /* memmov_cost  */
+  3, /* issue_rate  */
+  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
+  "32:16",     /* function_align.  */
+  "32:16",     /* jump_align.  */
+  "32:16",     /* loop_align.  */
+  2,   /* int_reassoc_width.  */
+  4,   /* fp_reassoc_width.  */
+  2,   /* vec_reassoc_width.  */
+  2,   /* min_div_recip_mul_sf.  */
+  2,   /* min_div_recip_mul_df.  */
+  0,   /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  &generic_prefetch_tune
+};
+
  /* Support for fine-grained override of the tuning structures.  */
  struct aarch64_tuning_override_function
  {
@@ -1035,12 +1120,14 @@ struct aarch64_tuning_override_function
  
  static void aarch64_parse_fuse_string (const char*, struct tune_params*);
  static void aarch64_parse_tune_string (const char*, struct tune_params*);
+static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
  
  static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
  {
    { "fuse", aarch64_parse_fuse_string },
    { "tune", aarch64_parse_tune_string },
+  { "sve_width", aarch64_parse_sve_width_string },
    { NULL, NULL }
  };
  
@@ -1088,6 +1175,15 @@ static const struct processor *selected_tune;
  /* The current tuning set.  */
  struct tune_params aarch64_tune_params = generic_tunings;
  
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
  #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
  
  /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1108,6 +1204,90 @@ aarch64_cc;
  
  #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
  
+struct aarch64_branch_protect_type
+{
+  /* The type's name that the user passes to the branch-protection option
+    string.  */
+  const char* name;
+  /* Function to handle the protection type and set global variables.
+    First argument is the string token corresponding with this type and the
+    second argument is the next token in the option string.
+    Return values:
+    * AARCH64_PARSE_OK: Handling was successful.
+    * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
+      should print an error.
+    * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
+      own error.  */
+  enum aarch64_parse_opt_result (*handler)(char*, char*);
+  /* A list of types that can follow this type in the option string.  */
+  const aarch64_branch_protect_type* subtypes;
+  unsigned int num_subtypes;
+};
+
+static enum aarch64_parse_opt_result
+aarch64_handle_no_branch_protection (char* str, char* rest)
+{
+  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
+  aarch64_enable_bti = 0;
+  if (rest)
+    {
+      error ("unexpected %<%s%> after %<%s%>", rest, str);
+      return AARCH64_PARSE_INVALID_FEATURE;
+    }
+  return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_standard_branch_protection (char* str, char* rest)
+{
+  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+  aarch64_enable_bti = 1;
+  if (rest)
+    {
+      error ("unexpected %<%s%> after %<%s%>", rest, str);
+      return AARCH64_PARSE_INVALID_FEATURE;
+    }
+  return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
+                                   char* rest ATTRIBUTE_UNUSED)
+{
+  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+  return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
+                             char* rest ATTRIBUTE_UNUSED)
+{
+  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
+  return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
+                               char* rest ATTRIBUTE_UNUSED)
+{
+  aarch64_enable_bti = 1;
+  return AARCH64_PARSE_OK;
+}
+
+static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
+  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
+  { NULL, NULL, NULL, 0 }
+};
+
+static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
+  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
+  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
+  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
+    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
+  { "bti", aarch64_handle_bti_protection, NULL, 0 },
+  { NULL, NULL, NULL, 0 }
+};
+
  /* The condition codes of the processor, and the inverse function.  */
  static const char * const aarch64_condition_codes[] =
  {
@@ -1470,14 +1650,102 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
    return false;
  }
  
+/* Return true if this is a definition of a vectorized simd function.  */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+  tree fntype;
+
+  if (fndecl == NULL)
+    return false;
+  fntype = TREE_TYPE (fndecl);
+  if (fntype == NULL)
+    return false;
+
+  /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
+    return true;
+
+  return false;
+}
+
+/* Return the mode a register save/restore should use.  DImode for integer
+   registers, DFmode for FP registers in non-SIMD functions (they only save
+   the bottom half of a 128 bit register), or TFmode for FP registers in
+   SIMD functions.  */
+
+static machine_mode
+aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+  return GP_REGNUM_P (regno)
+          ? E_DImode
+          : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
+/* Return true if the instruction is a call to a SIMD function, false
+   if it is not a SIMD function or if we do not know anything about
+   the function.  */
+
+static bool
+aarch64_simd_call_p (rtx_insn *insn)
+{
+  rtx symbol;
+  rtx call;
+  tree fndecl;
+
+  gcc_assert (CALL_P (insn));
+  call = get_call_rtx_from (insn);
+  symbol = XEXP (XEXP (call, 0), 0);
+  if (GET_CODE (symbol) != SYMBOL_REF)
+    return false;
+  fndecl = SYMBOL_REF_DECL (symbol);
+  if (!fndecl)
+    return false;
+
+  return aarch64_simd_decl_p (fndecl);
+}
+
+/* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS.  If INSN calls
+   a function that uses the SIMD ABI, take advantage of the extra
+   call-preserved registers that the ABI provides.  */
+
+void
+aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
+                                         HARD_REG_SET *return_set)
+{
+  if (aarch64_simd_call_p (insn))
+    {
+      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+       if (FP_SIMD_SAVED_REGNUM_P (regno))
+         CLEAR_HARD_REG_BIT (*return_set, regno);
+    }
+}
+
  /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
     the lower 64 bits of a 128-bit register.  Tell the compiler the callee
     clobbers the top 64 bits when restoring the bottom 64 bits.  */
  
  static bool
-aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
+aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
+                                       machine_mode mode)
+{
+  bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
+  return FP_REGNUM_P (regno)
+        && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
+}
+
+/* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS.  */
+
+rtx_insn *
+aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
  {
-  return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
+  gcc_assert (CALL_P (call_1) && CALL_P (call_2));
+
+  if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
+    return call_1;
+  else
+    return call_2;
  }
  
  /* Implement REGMODE_NATURAL_SIZE.  */
@@ -3146,9 +3414,12 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm,
  void
  aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
  {
-  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
-                                               gen_rtvec (2, pred, src),
-                                               UNSPEC_MERGE_PTRUE)));
+  expand_operand ops[3];
+  machine_mode mode = GET_MODE (dest);
+  create_output_operand (&ops[0], dest, mode);
+  create_input_operand (&ops[1], pred, GET_MODE(pred));
+  create_input_operand (&ops[2], src, mode);
+  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
  }
  
  /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
@@ -3296,7 +3567,9 @@ static bool
  aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                  tree exp ATTRIBUTE_UNUSED)
  {
-  /* Currently, always true.  */
+  if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
+    return false;
+
    return true;
  }
  
@@ -3495,12 +3768,16 @@ aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
  
  /* Given MODE and TYPE of a function argument, return the alignment in
     bits.  The idea is to suppress any stronger alignment requested by
-   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
-   This is a helper function for local use only.  */
+   the user and opt for the natural alignment (specified in AAPCS64 \S
+   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
+   calculated in versions of GCC prior to GCC-9.  This is a helper
+   function for local use only.  */
  
  static unsigned int
-aarch64_function_arg_alignment (machine_mode mode, const_tree type)
+aarch64_function_arg_alignment (machine_mode mode, const_tree type,
+                               bool *abi_break)
  {
+  *abi_break = false;
    if (!type)
      return GET_MODE_ALIGNMENT (mode);
  
@@ -3516,9 +3793,22 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type)
      return TYPE_ALIGN (TREE_TYPE (type));
  
    unsigned int alignment = 0;
+  unsigned int bitfield_alignment = 0;
    for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL)
-      alignment = std::max (alignment, DECL_ALIGN (field));
+      {
+       alignment = std::max (alignment, DECL_ALIGN (field));
+       if (DECL_BIT_FIELD_TYPE (field))
+         bitfield_alignment
+           = std::max (bitfield_alignment,
+                       TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
+      }
+
+  if (bitfield_alignment > alignment)
+    {
+      *abi_break = true;
+      return bitfield_alignment;
+    }
  
    return alignment;
  }
@@ -3535,6 +3825,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
    int ncrn, nvrn, nregs;
    bool allocate_ncrn, allocate_nvrn;
    HOST_WIDE_INT size;
+  bool abi_break;
  
    /* We need to do this once per argument.  */
    if (pcum->aapcs_arg_processed)
@@ -3611,25 +3902,28 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
       entirely general registers.  */
    if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
      {
-
        gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
  
        /* C.8 if the argument has an alignment of 16 then the NGRN is
-         rounded up to the next even number.  */
+        rounded up to the next even number.  */
        if (nregs == 2
           && ncrn % 2
           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
              comparison is there because for > 16 * BITS_PER_UNIT
              alignment nregs should be > 2 and therefore it should be
              passed by reference rather than value.  */
-         && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
+         && (aarch64_function_arg_alignment (mode, type, &abi_break)
+             == 16 * BITS_PER_UNIT))
         {
+         if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
+           inform (input_location, "parameter passing for argument of type "
+                   "%qT changed in GCC 9.1", type);
           ++ncrn;
           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
         }
  
        /* NREGS can be 0 when e.g. an empty structure is to be passed.
-         A reg is still generated for it, but the caller should be smart
+        A reg is still generated for it, but the caller should be smart
          enough not to use it.  */
        if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
@@ -3661,9 +3955,18 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
  on_stack:
    pcum->aapcs_stack_words = size / UNITS_PER_WORD;
  
-  if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
-    pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
-                                      16 / UNITS_PER_WORD);
+  if (aarch64_function_arg_alignment (mode, type, &abi_break)
+      == 16 * BITS_PER_UNIT)
+    {
+      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
+      if (pcum->aapcs_stack_size != new_size)
+       {
+         if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
+           inform (input_location, "parameter passing for argument of type "
+                   "%qT changed in GCC 9.1", type);
+         pcum->aapcs_stack_size = new_size;
+       }
+    }
    return;
  }
  
@@ -3752,7 +4055,13 @@ aarch64_function_arg_regno_p (unsigned regno)
  static unsigned int
  aarch64_function_arg_boundary (machine_mode mode, const_tree type)
  {
-  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
+  bool abi_break;
+  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
+                                                          &abi_break);
+  if (abi_break & warn_psabi)
+    inform (input_location, "parameter passing for argument of type "
+           "%qT changed in GCC 9.1", type);
+
    return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  }
  
@@ -3860,8 +4169,8 @@ aarch64_libgcc_cmp_return_mode (void)
  #endif
  
  /* The pair of scratch registers used for stack probing.  */
-#define PROBE_STACK_FIRST_REG  9
-#define PROBE_STACK_SECOND_REG 10
+#define PROBE_STACK_FIRST_REG  R9_REGNUM
+#define PROBE_STACK_SECOND_REG R10_REGNUM
  
  /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
     inclusive.  These are offsets from the current stack pointer.  */
@@ -4157,6 +4466,7 @@ aarch64_layout_frame (void)
  {
    HOST_WIDE_INT offset = 0;
    int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
  
    cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
  
@@ -4170,6 +4480,17 @@ aarch64_layout_frame (void)
    cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
    cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
  
+  /* If this is a non-leaf simd function with calls we assume that
+     at least one of those calls is to a non-simd function and thus
+     we must save V8 to V23 in the prologue.  */
+
+  if (simd_function && !crtl->is_leaf)
+    {
+      for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+       if (FP_SIMD_SAVED_REGNUM_P (regno))
+         df_set_regs_ever_live (regno, true);
+    }
+
    /* First mark all the registers that really need to be saved...  */
    for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
      cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
@@ -4192,7 +4513,8 @@ aarch64_layout_frame (void)
  
    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
      if (df_regs_ever_live_p (regno)
-       && !call_used_regs[regno])
+       && (!call_used_regs[regno]
+           || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
        {
         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
         last_fp_reg = regno;
@@ -4234,7 +4556,10 @@ aarch64_layout_frame (void)
        {
         /* If there is an alignment gap between integer and fp callee-saves,
            allocate the last fp register to it if possible.  */
-       if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+       if (regno == last_fp_reg
+           && has_align_gap
+           && !simd_function
+           && (offset & 8) == 0)
           {
             cfun->machine->frame.reg_offset[regno] = max_int_offset;
             break;
@@ -4246,7 +4571,7 @@ aarch64_layout_frame (void)
         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
           cfun->machine->frame.wb_candidate2 = regno;
-       offset += UNITS_PER_WORD;
+       offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
        }
  
    offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4389,6 +4714,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
        return gen_storewb_pairdf_di (base, base, reg, reg2,
                                     GEN_INT (-adjustment),
                                     GEN_INT (UNITS_PER_WORD - adjustment));
+    case E_TFmode:
+      return gen_storewb_pairtf_di (base, base, reg, reg2,
+                                   GEN_INT (-adjustment),
+                                   GEN_INT (UNITS_PER_VREG - adjustment));
      default:
        gcc_unreachable ();
      }
@@ -4401,7 +4730,7 @@ static void
  aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
  {
    rtx_insn *insn;
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
  
    if (regno2 == INVALID_REGNUM)
      return aarch64_pushwb_single_reg (mode, regno1, adjustment);
@@ -4431,6 +4760,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
      case E_DFmode:
        return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                    GEN_INT (UNITS_PER_WORD));
+    case E_TFmode:
+      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+                                  GEN_INT (UNITS_PER_VREG));
      default:
        gcc_unreachable ();
      }
@@ -4444,7 +4776,7 @@ static void
  aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
                   rtx *cfi_ops)
  {
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
    rtx reg1 = gen_rtx_REG (mode, regno1);
  
    *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
@@ -4479,6 +4811,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
      case E_DFmode:
        return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
  
+    case E_TFmode:
+      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+
      default:
        gcc_unreachable ();
      }
@@ -4499,6 +4834,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
      case E_DFmode:
        return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
  
+    case E_TFmode:
+      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+
      default:
        gcc_unreachable ();
      }
@@ -4520,6 +4858,13 @@ aarch64_return_address_signing_enabled (void)
               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
  }
  
+/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
+bool
+aarch64_bti_enabled (void)
+{
+  return (aarch64_enable_bti == 1);
+}
+
  /* Emit code to save the callee-saved registers from register number START
     to LIMIT to the stack at the location starting at offset START_OFFSET,
     skipping any write-back candidates if SKIP_WB is true.  */
@@ -4538,6 +4883,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
      {
        rtx reg, mem;
        poly_int64 offset;
+      int offset_diff;
  
        if (skip_wb
           && (regno == cfun->machine->frame.wb_candidate1
@@ -4553,12 +4899,12 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
                                                 offset));
  
        regno2 = aarch64_next_callee_save (regno + 1, limit);
+      offset_diff = cfun->machine->frame.reg_offset[regno2]
+                   - cfun->machine->frame.reg_offset[regno];
  
        if (regno2 <= limit
           && !cfun->machine->reg_is_wrapped_separately[regno2]
-         && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
-             == cfun->machine->frame.reg_offset[regno2]))
-
+         && known_eq (GET_MODE_SIZE (mode), offset_diff))
         {
           rtx reg2 = gen_rtx_REG (mode, regno2);
           rtx mem2;
@@ -4606,6 +4952,7 @@ aarch64_restore_callee_saves (machine_mode mode,
         continue;
  
        rtx reg, mem;
+      int offset_diff;
  
        if (skip_wb
           && (regno == cfun->machine->frame.wb_candidate1
@@ -4617,11 +4964,12 @@ aarch64_restore_callee_saves (machine_mode mode,
        mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
  
        regno2 = aarch64_next_callee_save (regno + 1, limit);
+      offset_diff = cfun->machine->frame.reg_offset[regno2]
+                   - cfun->machine->frame.reg_offset[regno];
  
        if (regno2 <= limit
           && !cfun->machine->reg_is_wrapped_separately[regno2]
-         && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
-             == cfun->machine->frame.reg_offset[regno2]))
+         && known_eq (GET_MODE_SIZE (mode), offset_diff))
         {
           rtx reg2 = gen_rtx_REG (mode, regno2);
           rtx mem2;
@@ -4755,13 +5103,15 @@ aarch64_components_for_bb (basic_block bb)
    bitmap in = DF_LIVE_IN (bb);
    bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
    bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
  
    sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
    bitmap_clear (components);
  
    /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
    for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
-    if ((!call_used_regs[regno])
+    if ((!call_used_regs[regno]
+       || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
         && (bitmap_bit_p (in, regno)
            || bitmap_bit_p (gen, regno)
            || bitmap_bit_p (kill, regno)))
@@ -4832,9 +5182,11 @@ aarch64_process_components (sbitmap components, bool prologue_p)
  
    while (regno != last_regno)
      {
-      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-        so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+      /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
+        so DFmode for the vector registers is enough.  For simd functions
+        we want to save the low 128 bits.  */
+      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+      
        rtx reg = gen_rtx_REG (mode, regno);
        poly_int64 offset = cfun->machine->frame.reg_offset[regno];
        if (!frame_pointer_needed)
@@ -4863,6 +5215,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
          mergeable with the current one into a pair.  */
        if (!satisfies_constraint_Ump (mem)
           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+         || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
                        GET_MODE_SIZE (mode)))
         {
@@ -5031,11 +5384,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
         {
           /* This is done to provide unwinding information for the stack
              adjustments we're about to do, however to prevent the optimizers
-            from removing the R15 move and leaving the CFA note (which would be
+            from removing the R11 move and leaving the CFA note (which would be
              very wrong) we tie the old and new stack pointer together.
              The tie will expand to nothing but the optimizers will not touch
              the instruction.  */
-         rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+         rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
           emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
           emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
  
@@ -5178,6 +5531,28 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
      }
  }
  
+/* Return 1 if REGNO is to be treated as used by the epilogue, else 0.
+   No register is reported until epilogue generation is complete; from
+   then on the return (link) register is reported.  For sibcalls, values
+   "used by the epilogue" are considered live at the start of the callee.
+
+   For SIMD functions also report the FP registers that the function saves
+   and restores even though they are not zero in call_used_regs; if we do
+   not, optimizations may remove the restore of such a register.  */
+
+int
+aarch64_epilogue_uses (int regno)
+{
+  /* Nothing is reported before epilogue generation has finished.  */
+  if (!epilogue_completed)
+    return 0;
+
+  if (regno == LR_REGNUM
+      || (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)))
+    return 1;
+  return 0;
+}
+
  /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
     is saved at BASE + OFFSET.  */
  
@@ -5240,7 +5615,19 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
     to the stack we track as implicit probes are the FP/LR stores.
  
     For outgoing arguments we probe if the size is larger than 1KB, such that
-   the ABI specified buffer is maintained for the next callee.  */
+   the ABI specified buffer is maintained for the next callee.
+
+   The following registers are reserved during frame layout and should not be
+   used for any other purpose:
+
+   - r11: Used by stack clash protection when SVE is enabled.
+   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
+   - r14 and r15: Used for speculation tracking.
+   - r16(IP0), r17(IP1): Used by indirect tailcalls.
+   - r30(LR), r29(FP): Used by standard frame layout.
+
+   These registers must be avoided in frame layout related code unless the
+   explicit intention is to interact with one of the features listed above.  */
  
  /* Generate the prologue instructions for entry into a function.
     Establish the stack frame by decreasing the stack pointer with a
@@ -5286,8 +5673,8 @@ aarch64_expand_prologue (void)
         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
      }
  
-  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
-  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  
    /* In theory we should never have both an initial adjustment
       and a callee save adjustment.  Verify that is the case since the
@@ -5297,7 +5684,7 @@ aarch64_expand_prologue (void)
    /* Will only probe if the initial adjustment is larger than the guard
       less the amount of the guard reserved for use by the caller's
       outgoing args.  */
-  aarch64_allocate_and_probe_stack_space (ip0_rtx, ip1_rtx, initial_adjust,
+  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
                                           true, false);
  
    if (callee_adjust != 0)
@@ -5315,7 +5702,7 @@ aarch64_expand_prologue (void)
         }
        aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                           stack_pointer_rtx, callee_offset,
-                         ip1_rtx, ip0_rtx, frame_pointer_needed);
+                         tmp1_rtx, tmp0_rtx, frame_pointer_needed);
        if (frame_pointer_needed && !frame_size.is_constant ())
         {
           /* Variable-sized frames need to describe the save slot
@@ -5352,12 +5739,16 @@ aarch64_expand_prologue (void)
  
    aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                              callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-                            callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+                              callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+                              callee_adjust != 0 || emit_frame_chain);
  
    /* We may need to probe the final adjustment if it is larger than the guard
       that is assumed by the called.  */
-  aarch64_allocate_and_probe_stack_space (ip1_rtx, ip0_rtx, final_adjust,
+  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
                                           !frame_pointer_needed, true);
  }
  
@@ -5379,6 +5770,19 @@ aarch64_use_return_insn_p (void)
    return known_eq (cfun->machine->frame.frame_size, 0);
  }
  
+/* Return false for non-leaf SIMD functions and true otherwise.  This
+   keeps such functions from being shrink-wrapped, which would lose
+   the necessary save/restore of FP registers.  */
+
+bool
+aarch64_use_simple_return_insn_p (void)
+{
+  bool nonleaf_simd_p
+    = aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf;
+
+  return !nonleaf_simd_p;
+}
+
  /* Generate the epilogue instructions for returning from a function.
     This is almost exactly the reverse of the prolog sequence, except
     that we need to insert barriers to avoid scheduling loads that read
@@ -5395,8 +5799,8 @@ aarch64_expand_epilogue (bool for_sibcall)
    unsigned reg2 = cfun->machine->frame.wb_candidate2;
    rtx cfi_ops = NULL;
    rtx_insn *insn;
-  /* A stack clash protection prologue may not have left IP0_REGNUM or
-     IP1_REGNUM in a usable state.  The same is true for allocations
+  /* A stack clash protection prologue may not have left EP0_REGNUM or
+     EP1_REGNUM in a usable state.  The same is true for allocations
       with an SVE component, since we then need both temporary registers
       for each allocation.  For stack clash we are in a usable state if
       the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
@@ -5411,8 +5815,8 @@ aarch64_expand_epilogue (bool for_sibcall)
    bool can_inherit_p = (initial_adjust.is_constant ()
                         && final_adjust.is_constant ())
                         && (!flag_stack_clash_protection
-                            || known_lt (initial_adjust,
-                                         guard_size - guard_used_by_caller));
+                           || known_lt (initial_adjust,
+                                        guard_size - guard_used_by_caller));
  
    /* We need to add memory barrier to prevent read from deallocated stack.  */
    bool need_barrier_p
@@ -5430,25 +5834,29 @@ aarch64_expand_epilogue (bool for_sibcall)
  
    /* Restore the stack pointer from the frame pointer if it may not
       be the same as the stack pointer.  */
-  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
-  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
    if (frame_pointer_needed
        && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
      /* If writeback is used when restoring callee-saves, the CFA
         is restored on the instruction doing the writeback.  */
      aarch64_add_offset (Pmode, stack_pointer_rtx,
                         hard_frame_pointer_rtx, -callee_offset,
-                       ip1_rtx, ip0_rtx, callee_adjust == 0);
+                       tmp1_rtx, tmp0_rtx, callee_adjust == 0);
    else
       /* The case where we need to re-use the register here is very rare, so
         avoid the complicated condition and just always emit a move if the
         immediate doesn't fit.  */
-     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
  
    aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                                 callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-                               callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+                                 callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+                                 callee_adjust != 0, &cfi_ops);
  
    if (need_barrier_p)
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -5466,8 +5874,11 @@ aarch64_expand_epilogue (bool for_sibcall)
        cfi_ops = NULL;
      }
  
-  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
-                 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
+  /* Liveness of EP0_REGNUM cannot be trusted across function calls either,
+     so restrict the emit_move optimization to leaf functions.  */
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+                 (!can_inherit_p || !crtl->is_leaf
+                  || df_regs_ever_live_p (EP0_REGNUM)));
  
    if (cfi_ops)
      {
@@ -5569,12 +5980,15 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
    rtx this_rtx, temp0, temp1, addr, funexp;
    rtx_insn *insn;
  
+  if (aarch64_bti_enabled ())
+    emit_insn (gen_bti_c());
+
    reload_completed = 1;
    emit_note (NOTE_INSN_PROLOGUE_END);
  
    this_rtx = gen_rtx_REG (Pmode, this_regno);
-  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
+  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
+  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
  
    if (vcall_offset == 0)
      aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
@@ -6757,9 +7171,12 @@ aarch64_emit_call_insn (rtx pat)
  machine_mode
  aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
  {
+  machine_mode mode_x = GET_MODE (x);
+  rtx_code code_x = GET_CODE (x);
+
    /* All floating point compares return CCFP if it is an equality
       comparison, and CCFPE otherwise.  */
-  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+  if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
      {
        switch (code)
         {
@@ -6788,57 +7205,67 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
  
    /* Equality comparisons of short modes against zero can be performed
       using the TST instruction with the appropriate bitmask.  */
-  if (y == const0_rtx && REG_P (x)
+  if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
        && (code == EQ || code == NE)
-      && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
+      && (mode_x == HImode || mode_x == QImode))
      return CC_NZmode;
  
    /* Similarly, comparisons of zero_extends from shorter modes can
       be performed using an ANDS with an immediate mask.  */
-  if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
-      && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+  if (y == const0_rtx && code_x == ZERO_EXTEND
+      && (mode_x == SImode || mode_x == DImode)
        && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
        && (code == EQ || code == NE))
      return CC_NZmode;
  
-  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+  if ((mode_x == SImode || mode_x == DImode)
        && y == const0_rtx
        && (code == EQ || code == NE || code == LT || code == GE)
-      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
-         || GET_CODE (x) == NEG
-         || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
+      && (code_x == PLUS || code_x == MINUS || code_x == AND
+         || code_x == NEG
+         || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
               && CONST_INT_P (XEXP (x, 2)))))
      return CC_NZmode;
  
    /* A compare with a shifted operand.  Because of canonicalization,
       the comparison will have to be swapped when we emit the assembly
       code.  */
-  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+  if ((mode_x == SImode || mode_x == DImode)
        && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
-      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
-         || GET_CODE (x) == LSHIFTRT
-         || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
+      && (code_x == ASHIFT || code_x == ASHIFTRT
+         || code_x == LSHIFTRT
+         || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
      return CC_SWPmode;
  
    /* Similarly for a negated operand, but we can only do this for
       equalities.  */
-  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+  if ((mode_x == SImode || mode_x == DImode)
        && (REG_P (y) || GET_CODE (y) == SUBREG)
        && (code == EQ || code == NE)
-      && GET_CODE (x) == NEG)
+      && code_x == NEG)
      return CC_Zmode;
  
-  /* A test for unsigned overflow.  */
-  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
-      && code == NE
-      && GET_CODE (x) == PLUS
-      && GET_CODE (y) == ZERO_EXTEND)
+  /* A test for unsigned overflow from an addition.  */
+  if ((mode_x == DImode || mode_x == TImode)
+      && (code == LTU || code == GEU)
+      && code_x == PLUS
+      && rtx_equal_p (XEXP (x, 0), y))
      return CC_Cmode;
  
+  /* A test for unsigned overflow from an add with carry.  */
+  if ((mode_x == DImode || mode_x == TImode)
+      && (code == LTU || code == GEU)
+      && code_x == PLUS
+      && CONST_SCALAR_INT_P (y)
+      && (rtx_mode_t (y, mode_x)
+         == (wi::shwi (1, mode_x)
+             << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
+    return CC_ADCmode;
+
    /* A test for signed overflow.  */
-  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+  if ((mode_x == DImode || mode_x == TImode)
        && code == NE
-      && GET_CODE (x) == PLUS
+      && code_x == PLUS
        && GET_CODE (y) == SIGN_EXTEND)
      return CC_Vmode;
  
@@ -6942,8 +7369,17 @@ aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
      case E_CC_Cmode:
        switch (comp_code)
         {
-       case NE: return AARCH64_CS;
-       case EQ: return AARCH64_CC;
+       case LTU: return AARCH64_CS;
+       case GEU: return AARCH64_CC;
+       default: return -1;
+       }
+      break;
+
+    case E_CC_ADCmode:
+      switch (comp_code)
+       {
+       case GEU: return AARCH64_CS;
+       case LTU: return AARCH64_CC;
         default: return -1;
         }
        break;
@@ -7599,8 +8035,13 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
    unsigned int size;
  
    /* Check all addresses are Pmode - including ILP32.  */
-  if (GET_MODE (x) != Pmode)
-    output_operand_lossage ("invalid address mode");
+  if (GET_MODE (x) != Pmode
+      && (!CONST_INT_P (x)
+         || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
+    {
+      output_operand_lossage ("invalid address mode");
+      return false;
+    }
  
    if (aarch64_classify_address (&addr, x, mode, true, type))
      switch (addr.type)
@@ -7968,18 +8409,36 @@ aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
  static void
  aarch64_asm_trampoline_template (FILE *f)
  {
+  int offset1 = 16;
+  int offset2 = 20;
+
+  if (aarch64_bti_enabled ())
+    {
+      asm_fprintf (f, "\thint\t34 // bti c\n");
+      offset1 -= 4;
+      offset2 -= 4;
+    }
+
    if (TARGET_ILP32)
      {
-      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
-      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
+      asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
+      asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
+                  offset1);
      }
    else
      {
-      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
-      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
+      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
+      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
+                  offset2);
      }
    asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
-  assemble_aligned_integer (4, const0_rtx);
+
+  /* The trampoline needs an extra padding instruction.  If BTI is
+     enabled, the padding instruction is replaced by the BTI instruction
+     at the beginning.  */
+  if (!aarch64_bti_enabled ())
+    assemble_aligned_integer (4, const0_rtx);
+
    assemble_aligned_integer (POINTER_BYTES, const0_rtx);
    assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  }
@@ -8877,6 +9336,35 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
              & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
  }
  
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+                                  unsigned HOST_WIDE_INT mask1,
+                                  unsigned HOST_WIDE_INT shft_amnt,
+                                  unsigned HOST_WIDE_INT mask2)
+{
+  unsigned HOST_WIDE_INT t;
+
+  /* Verify that there is no overlap in what bits are set in the two masks.  */
+  if (mask1 != ~mask2)
+    return false;
+
+  /* Verify that mask2 is not all zeros or ones.  */
+  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
+    return false;
+
+  /* The shift amount should always be less than the mode size.  */
+  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
+
+  /* Verify that the mask being shifted is contiguous and would be in the
+     least significant bits after shifting by shft_amnt.  */
+  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
+  return (t == (t & -t));
+}
+
  /* Calculate the cost of calculating X, storing it in *COST.  Result
     is true if the total cost of the operation has now been calculated.  */
  static bool
@@ -10553,19 +11041,16 @@ static enum aarch64_parse_opt_result
  aarch64_parse_arch (const char *to_parse, const struct processor **res,
                     unsigned long *isa_flags, std::string *invalid_extension)
  {
-  char *ext;
+  const char *ext;
    const struct processor *arch;
-  char *str = (char *) alloca (strlen (to_parse) + 1);
    size_t len;
  
-  strcpy (str, to_parse);
-
-  ext = strchr (str, '+');
+  ext = strchr (to_parse, '+');
  
    if (ext != NULL)
-    len = ext - str;
+    len = ext - to_parse;
    else
-    len = strlen (str);
+    len = strlen (to_parse);
  
    if (len == 0)
      return AARCH64_PARSE_MISSING_ARG;
@@ -10574,7 +11059,8 @@ aarch64_parse_arch (const char *to_parse, const struct processor **res,
    /* Loop through the list of supported ARCHes to find a match.  */
    for (arch = all_architectures; arch->name != NULL; arch++)
      {
-      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
+      if (strlen (arch->name) == len
+         && strncmp (arch->name, to_parse, len) == 0)
         {
           unsigned long isa_temp = arch->flags;
  
@@ -10610,19 +11096,16 @@ static enum aarch64_parse_opt_result
  aarch64_parse_cpu (const char *to_parse, const struct processor **res,
                    unsigned long *isa_flags, std::string *invalid_extension)
  {
-  char *ext;
+  const char *ext;
    const struct processor *cpu;
-  char *str = (char *) alloca (strlen (to_parse) + 1);
    size_t len;
  
-  strcpy (str, to_parse);
-
-  ext = strchr (str, '+');
+  ext = strchr (to_parse, '+');
  
    if (ext != NULL)
-    len = ext - str;
+    len = ext - to_parse;
    else
-    len = strlen (str);
+    len = strlen (to_parse);
  
    if (len == 0)
      return AARCH64_PARSE_MISSING_ARG;
@@ -10631,7 +11114,7 @@ aarch64_parse_cpu (const char *to_parse, const struct processor **res,
    /* Loop through the list of supported CPUs to find a match.  */
    for (cpu = all_cores; cpu->name != NULL; cpu++)
      {
-      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
+      if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
         {
           unsigned long isa_temp = cpu->flags;
  
@@ -10665,14 +11148,11 @@ static enum aarch64_parse_opt_result
  aarch64_parse_tune (const char *to_parse, const struct processor **res)
  {
    const struct processor *cpu;
-  char *str = (char *) alloca (strlen (to_parse) + 1);
-
-  strcpy (str, to_parse);
  
    /* Loop through the list of supported CPUs to find a match.  */
    for (cpu = all_cores; cpu->name != NULL; cpu++)
      {
-      if (strcmp (cpu->name, str) == 0)
+      if (strcmp (cpu->name, to_parse) == 0)
         {
           *res = cpu;
           return AARCH64_PARSE_OK;
@@ -10700,7 +11180,7 @@ aarch64_parse_one_option_token (const char *token,
         return flag->flag;
      }
  
-  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
+  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
    return 0;
  }
  
@@ -10786,6 +11266,37 @@ aarch64_parse_tune_string (const char *tune_string,
                                      "tune=");
  }
  
+/* Parse the sve_width tuning moverride string in TUNE_STRING.
+   Accept the valid SVE vector widths allowed by
+   aarch64_sve_vector_bits_enum and use it to override sve_width
+   in TUNE.  */
+
+static void
+aarch64_parse_sve_width_string (const char *tune_string,
+                               struct tune_params *tune)
+{
+  int width = -1;
+
+  int n = sscanf (tune_string, "%d", &width);
+  if (n == EOF)
+    {
+      error ("invalid format for sve_width");
+      return;
+    }
+  switch (width)
+    {
+    case SVE_128:
+    case SVE_256:
+    case SVE_512:
+    case SVE_1024:
+    case SVE_2048:
+      break;
+    default:
+      error ("invalid sve_width value: %d", width);
+    }
+  tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
+}
+
  /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
     we understand.  If it is, extract the option string and handoff to
     the appropriate function.  */
@@ -10894,6 +11405,12 @@ aarch64_parse_override_string (const char* input_string,
  static void
  aarch64_override_options_after_change_1 (struct gcc_options *opts)
  {
+  if (accepted_branch_protection_string)
+    {
+      opts->x_aarch64_branch_protection_string
+       = xstrdup (accepted_branch_protection_string);
+    }
+
    /* PR 70044: We have to be careful about being called multiple times for the
       same function.  This means all changes should be repeatable.  */
  
@@ -10963,6 +11480,41 @@ aarch64_override_options_internal (struct gcc_options *opts)
    if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
      opts->x_flag_strict_volatile_bitfields = 1;
  
+  if (aarch64_stack_protector_guard == SSP_GLOBAL
+      && opts->x_aarch64_stack_protector_guard_offset_str)
+    {
+      error ("incompatible options %<-mstack-protector-guard=global%> and "
+            "%<-mstack-protector-guard-offset=%s%>",
+            aarch64_stack_protector_guard_offset_str);
+    }
+
+  if (aarch64_stack_protector_guard == SSP_SYSREG
+      && !(opts->x_aarch64_stack_protector_guard_offset_str
+          && opts->x_aarch64_stack_protector_guard_reg_str))
+    {
+      error ("both %<-mstack-protector-guard-offset%> and "
+            "%<-mstack-protector-guard-reg%> must be used "
+            "with %<-mstack-protector-guard=sysreg%>");
+    }
+
+  if (opts->x_aarch64_stack_protector_guard_reg_str)
+    {
+      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
+         error ("specify a system register with a small string length.");
+    }
+
+  if (opts->x_aarch64_stack_protector_guard_offset_str)
+    {
+      char *end;
+      const char *str = aarch64_stack_protector_guard_offset_str;
+      errno = 0;
+      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
+      if (!*str || *end || errno)
+       error ("%qs is not a valid offset in %qs", str,
+              "-mstack-protector-guard-offset=");
+      aarch64_stack_protector_guard_offset = offs;
+    }
+
    initialize_aarch64_code_model (opts);
    initialize_aarch64_tls_size (opts);
  
@@ -11052,8 +11604,8 @@ aarch64_override_options_internal (struct gcc_options *opts)
    int probe_interval
      = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
    if (guard_size != probe_interval)
-    error ("stack clash guard size '%d' must be equal to probing interval "
-          "'%d'", guard_size, probe_interval);
+    error ("stack clash guard size %<%d%> must be equal to probing interval "
+          "%<%d%>", guard_size, probe_interval);
  
    /* Enable sw prefetching at specified optimization level for
       CPUS that have prefetch.  Lower optimization level threshold by 1
@@ -11164,7 +11716,7 @@ aarch64_validate_mcpu (const char *str, const struct processor **res,
         error ("missing cpu name in %<-mcpu=%s%>", str);
         break;
        case AARCH64_PARSE_INVALID_ARG:
-       error ("unknown value %qs for -mcpu", str);
+       error ("unknown value %qs for %<-mcpu%>", str);
         aarch64_print_hint_for_core (str);
         break;
        case AARCH64_PARSE_INVALID_FEATURE:
@@ -11179,6 +11731,110 @@ aarch64_validate_mcpu (const char *str, const struct processor **res,
    return false;
  }
  
+/* Parses CONST_STR for branch protection features specified in
+   aarch64_branch_protect_types, and set any global variables required.  Returns
+   the parsing result and assigns LAST_STR to the last processed token from
+   CONST_STR so that it can be used for error reporting.  */
+
+static enum
+aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
+                                                         char** last_str)
+{
+  char *str_root = xstrdup (const_str);
+  char* token_save = NULL;
+  char *str = strtok_r (str_root, "+", &token_save);
+  enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
+  if (!str)
+    res = AARCH64_PARSE_MISSING_ARG;
+  else
+    {
+      char *next_str = strtok_r (NULL, "+", &token_save);
+      /* Reset the branch protection features to their defaults.  */
+      aarch64_handle_no_branch_protection (NULL, NULL);
+
+      while (str && res == AARCH64_PARSE_OK)
+       {
+         const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
+         bool found = false;
+         /* Search for this type.  */
+         while (type && type->name && !found && res == AARCH64_PARSE_OK)
+           {
+             if (strcmp (str, type->name) == 0)
+               {
+                 found = true;
+                 res = type->handler (str, next_str);
+                 str = next_str;
+                 next_str = strtok_r (NULL, "+", &token_save);
+               }
+             else
+               type++;
+           }
+         if (found && res == AARCH64_PARSE_OK)
+           {
+             bool found_subtype = true;
+             /* Loop through each token until we find one that isn't a
+                subtype.  */
+             while (found_subtype)
+               {
+                 found_subtype = false;
+                 const aarch64_branch_protect_type *subtype = type->subtypes;
+                 /* Search for the subtype.  */
+                 while (str && subtype && subtype->name && !found_subtype
+                         && res == AARCH64_PARSE_OK)
+                   {
+                     if (strcmp (str, subtype->name) == 0)
+                       {
+                         found_subtype = true;
+                         res = subtype->handler (str, next_str);
+                         str = next_str;
+                         next_str = strtok_r (NULL, "+", &token_save);
+                       }
+                     else
+                       subtype++;
+                   }
+               }
+           }
+         else if (!found)
+           res = AARCH64_PARSE_INVALID_ARG;
+       }
+    }
+  /* Copy the last processed token into the argument to pass it back.
+    Used by option and attribute validation to print the offending token.  */
+  if (last_str)
+    {
+      if (str) strcpy (*last_str, str);
+      else *last_str = NULL;
+    }
+  if (res == AARCH64_PARSE_OK)
+    {
+      /* If needed, alloc the accepted string then copy in const_str.
+       Used by override_option_after_change_1.  */
+      if (!accepted_branch_protection_string)
+       accepted_branch_protection_string = (char *) xmalloc (
+                                                     BRANCH_PROTECT_STR_MAX
+                                                       + 1);
+      strncpy (accepted_branch_protection_string, const_str,
+               BRANCH_PROTECT_STR_MAX + 1);
+      /* Forcibly null-terminate.  */
+      accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
+    }
+  return res;
+}
+
+static bool
+aarch64_validate_mbranch_protection (const char *const_str)
+{
+  char *str = (char *) xmalloc (strlen (const_str));
+  enum aarch64_parse_opt_result res =
+    aarch64_parse_branch_protection (const_str, &str);
+  if (res == AARCH64_PARSE_INVALID_ARG)
+    error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
+  else if (res == AARCH64_PARSE_MISSING_ARG)
+    error ("missing arg for %<-mbranch-protection=%>");
+  free (str);
+  return res == AARCH64_PARSE_OK;
+}
+
  /* Validate a command-line -march option.  Parse the arch and extensions
     (if any) specified in STR and throw errors if appropriate.  Put the
     results, if they are valid, in RES and ISA_FLAGS.  Return whether the
@@ -11201,7 +11857,7 @@ aarch64_validate_march (const char *str, const struct processor **res,
         error ("missing arch name in %<-march=%s%>", str);
         break;
        case AARCH64_PARSE_INVALID_ARG:
-       error ("unknown value %qs for -march", str);
+       error ("unknown value %qs for %<-march%>", str);
         aarch64_print_hint_for_arch (str);
         break;
        case AARCH64_PARSE_INVALID_FEATURE:
@@ -11236,7 +11892,7 @@ aarch64_validate_mtune (const char *str, const struct processor **res)
         error ("missing cpu name in %<-mtune=%s%>", str);
         break;
        case AARCH64_PARSE_INVALID_ARG:
-       error ("unknown value %qs for -mtune", str);
+       error ("unknown value %qs for %<-mtune%>", str);
         aarch64_print_hint_for_core (str);
         break;
        default:
@@ -11313,6 +11969,9 @@ aarch64_override_options (void)
    selected_arch = NULL;
    selected_tune = NULL;
  
+  if (aarch64_branch_protection_string)
+    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
+
    /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
       If either of -march or -mtune is given, they override their
       respective component of -mcpu.  */
@@ -11327,6 +11986,10 @@ aarch64_override_options (void)
    if (aarch64_tune_string)
      valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
  
+#ifdef SUBTARGET_OVERRIDE_OPTIONS
+  SUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
    /* If the user did not specify a processor, choose the default
       one for them.  This will be the CPU set during configuration using
       --with-cpu, otherwise it is "generic".  */
@@ -11354,7 +12017,7 @@ aarch64_override_options (void)
      {
        if (selected_arch->arch != selected_cpu->arch)
         {
-         warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
+         warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
                        all_architectures[selected_cpu->arch].name,
                        selected_arch->name);
         }
@@ -11385,18 +12048,39 @@ aarch64_override_options (void)
    if (!selected_tune)
      selected_tune = selected_cpu;
  
+  if (aarch64_enable_bti == 2)
+    {
+#ifdef TARGET_ENABLE_BTI
+      aarch64_enable_bti = 1;
+#else
+      aarch64_enable_bti = 0;
+#endif
+    }
+
+  /* Return address signing is currently not supported for ILP32 targets.  For
+     LP64 targets use the configured option in the absence of a command-line
+     option for -mbranch-protection.  */
+  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
+    {
+#ifdef TARGET_ENABLE_PAC_RET
+      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+#else
+      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
+#endif
+    }
+
  #ifndef HAVE_AS_MABI_OPTION
    /* The compiler may have been configured with 2.23.* binutils, which does
       not have support for ILP32.  */
    if (TARGET_ILP32)
-    error ("assembler does not support -mabi=ilp32");
+    error ("assembler does not support %<-mabi=ilp32%>");
  #endif
  
    /* Convert -msve-vector-bits to a VG count.  */
    aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
  
    if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
-    sorry ("return address signing is only supported for -mabi=lp64");
+    sorry ("return address signing is only supported for %<-mabi=lp64%>");
  
    /* Make sure we properly set up the explicit options.  */
    if ((aarch64_cpu_string && valid_cpu)
@@ -11407,6 +12091,12 @@ aarch64_override_options (void)
         || (aarch64_arch_string && valid_arch))
      gcc_assert (explicit_arch != aarch64_no_arch);
  
+  /* The pass to insert speculation tracking runs before
+     shrink-wrapping and the latter does not know how to update the
+     tracking status.  So disable it in this case.  */
+  if (aarch64_track_speculation)
+    flag_shrink_wrap = 0;
+
    aarch64_override_options_internal (&global_options);
  
    /* Save these options as the default ones in case we push and pop them later
@@ -11458,7 +12148,7 @@ initialize_aarch64_code_model (struct gcc_options *opts)
  #endif
            break;
          case AARCH64_CMODEL_LARGE:
-          sorry ("code model %qs with -f%s", "large",
+          sorry ("code model %qs with %<-f%s%>", "large",
                   opts->x_flag_pic > 1 ? "PIC" : "pic");
            break;
          default:
@@ -11475,6 +12165,8 @@ static void
  aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
  {
    ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
+  ptr->x_aarch64_branch_protection_string
+    = opts->x_aarch64_branch_protection_string;
  }
  
  /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
@@ -11488,6 +12180,13 @@ aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
    opts->x_explicit_arch = ptr->x_explicit_arch;
    selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
    opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
+  opts->x_aarch64_branch_protection_string
+    = ptr->x_aarch64_branch_protection_string;
+  if (opts->x_aarch64_branch_protection_string)
+    {
+      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
+                                       NULL);
+    }
  
    aarch64_override_options_internal (opts);
  }
@@ -11682,6 +12381,37 @@ aarch64_handle_attr_cpu (const char *str)
    return false;
  }
  
+/* Handle the argument STR to the branch-protection= attribute.  Report an
+   error if the parser says the argument is missing or names an invalid
+   protection type.  Return true only if STR parsed successfully.  */
+
+static bool
+aarch64_handle_attr_branch_protection (const char *str)
+{
+  /* ERR_STR is passed by address to the parser and printed below as the
+     offending protection type; presumably the parser copies a token of
+     STR into it (the parser body is not visible here).  A token can be
+     as long as all of STR, so leave room for the terminating NUL.  */
+  char *err_str = (char *) xmalloc (strlen (str) + 1);
+  enum aarch64_parse_opt_result res
+    = aarch64_parse_branch_protection (str, &err_str);
+  bool success = false;
+  switch (res)
+    {
+    case AARCH64_PARSE_MISSING_ARG:
+      error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
+             " attribute");
+      break;
+    case AARCH64_PARSE_INVALID_ARG:
+      error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
+             "=\")%> pragma or attribute", err_str);
+      break;
+    case AARCH64_PARSE_OK:
+      success = true;
+      break;
+    case AARCH64_PARSE_INVALID_FEATURE:
+      /* Not a success, but no diagnostic is issued here.  */
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  free (err_str);
+  return success;
+}
+
  /* Handle the argument STR to the tune= target attribute.  */
  
  static bool
@@ -11780,6 +12510,8 @@ static const struct aarch64_attribute_info aarch64_attributes[] =
    { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
    { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
       OPT_mtune_ },
+  { "branch-protection", aarch64_attr_custom, false,
+     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
    { "sign-return-address", aarch64_attr_enum, false, NULL,
       OPT_msign_return_address_ },
    { NULL, aarch64_attr_custom, false, NULL, OPT____ }
@@ -11983,7 +12715,7 @@ aarch64_process_target_attr (tree args)
    unsigned int num_commas = num_occurences_in_str (',', str_to_check);
  
    /* Handle multiple target attributes separated by ','.  */
-  char *token = strtok (str_to_check, ",");
+  char *token = strtok_r (str_to_check, ",", &str_to_check);
  
    unsigned int num_attrs = 0;
    while (token)
@@ -11995,7 +12727,7 @@ aarch64_process_target_attr (tree args)
           return false;
         }
  
-      token = strtok (NULL, ",");
+      token = strtok_r (NULL, ",", &str_to_check);
      }
  
    if (num_attrs != num_commas + 1)
@@ -12658,7 +13390,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
    stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
                   f_stack, NULL_TREE);
    size = int_size_in_bytes (type);
-  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
+
+  bool abi_break;
+  align
+    = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
  
    dw_align = false;
    adjust = 0;
@@ -12705,7 +13440,12 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
        nregs = rsize / UNITS_PER_WORD;
  
        if (align > 8)
-       dw_align = true;
+       {
+         if (abi_break && warn_psabi)
+           inform (input_location, "parameter passing for argument of type "
+                   "%qT changed in GCC 9.1", type);
+         dw_align = true;
+       }
  
        if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
           && size < UNITS_PER_WORD)
@@ -13383,7 +14123,7 @@ static const char *
  aarch64_mangle_type (const_tree type)
  {
    /* The AArch64 ABI documents say that "__va_list" has to be
-     managled as if it is in the "std" namespace.  */
+     mangled as if it is in the "std" namespace.  */
    if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
      return "St9__va_list";
  
@@ -14184,12 +14924,11 @@ aarch64_simd_vector_alignment (const_tree type)
         be set for non-predicate vectors of booleans.  Modes are the most
         direct way we have of identifying real SVE predicate types.  */
      return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
-  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
-  return MIN (align, 128);
+  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
  }
  
  /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
-static HOST_WIDE_INT
+static poly_uint64
  aarch64_vectorize_preferred_vector_alignment (const_tree type)
  {
    if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
@@ -14214,9 +14953,11 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
    /* For fixed-length vectors, check that the vectorizer will aim for
       full-vector alignment.  This isn't true for generic GCC vectors
       that are wider than the ABI maximum of 128 bits.  */
+  poly_uint64 preferred_alignment =
+    aarch64_vectorize_preferred_vector_alignment (type);
    if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
-      && (wi::to_widest (TYPE_SIZE (type))
-         != aarch64_vectorize_preferred_vector_alignment (type)))
+      && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
+                  preferred_alignment))
      return false;
  
    /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
@@ -14268,7 +15009,7 @@ aarch64_simd_dup_constant (rtx vals)
  /* Generate code to load VALS, which is a PARALLEL containing only
     constants (for vec_init) or CONST_VECTOR, efficiently into a
     register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that can not be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
  static rtx
  aarch64_simd_make_constant (rtx vals)
  {
@@ -14306,12 +15047,12 @@ aarch64_simd_make_constant (rtx vals)
      /* Loaded using DUP.  */
      return const_dup;
    else if (const_vec != NULL_RTX)
-    /* Load from constant pool. We can not take advantage of single-cycle
+    /* Load from constant pool. We cannot take advantage of single-cycle
         LD1 because we need a PC-relative addressing mode.  */
      return const_vec;
    else
      /* A PARALLEL containing something not valid inside CONST_VECTOR.
-       We can not construct an initializer.  */
+       We cannot construct an initializer.  */
      return NULL_RTX;
  }
  
@@ -16451,32 +17192,38 @@ aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
     LOW_IN2 represents the low half (DImode) of TImode operand 2
     HIGH_DEST represents the high half (DImode) of TImode operand 0
     HIGH_IN1 represents the high half (DImode) of TImode operand 1
-   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
-
+   HIGH_IN2 represents the high half (DImode) of TImode operand 2
+   UNSIGNED_P is true if the operation is being performed on unsigned
+   values.  */
  void
  aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
                        rtx low_in2, rtx high_dest, rtx high_in1,
-                      rtx high_in2)
+                      rtx high_in2, bool unsigned_p)
  {
    if (low_in2 == const0_rtx)
      {
        low_dest = low_in1;
-      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
-                                     force_reg (DImode, high_in2)));
+      high_in2 = force_reg (DImode, high_in2);
+      if (unsigned_p)
+       emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
+      else
+       emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
      }
    else
      {
        if (CONST_INT_P (low_in2))
         {
-         low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
           high_in2 = force_reg (DImode, high_in2);
-         emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
+         emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
+                                             GEN_INT (-INTVAL (low_in2))));
         }
        else
         emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
-      emit_insn (gen_subdi3_carryinCV (high_dest,
-                                      force_reg (DImode, high_in1),
-                                      high_in2));
+
+      if (unsigned_p)
+       emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
+      else
+       emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
      }
  
    emit_move_insn (gen_lowpart (DImode, op0), low_dest);
@@ -17828,6 +18575,204 @@ aarch64_speculation_safe_value (machine_mode mode,
    return result;
  }
  
+/* Implement TARGET_ESTIMATED_POLY_VALUE.
+   Estimate VAL from the SVE width recorded in the tuning structure:
+   VAL.coeffs[1] is scaled by the number of 128-bit chunks by which that
+   width exceeds the 128 bits of Advanced SIMD.  When the tuning structure
+   only says SVE_SCALABLE, defer to default_estimated_poly_value.  */
+
+static HOST_WIDE_INT
+aarch64_estimated_poly_value (poly_int64 val)
+{
+  enum aarch64_sve_vector_bits_enum tuned_width
+    = aarch64_tune_params.sve_width;
+
+  if (tuned_width != SVE_SCALABLE)
+    {
+      /* Bits by which the tuned width exceeds Advanced SIMD's 128.  */
+      HOST_WIDE_INT extra_bits = tuned_width - 128;
+      return val.coeffs[0] + val.coeffs[1] * extra_bits / 128;
+    }
+
+  /* The tuning structure gives no fixed width; use the default.  */
+  return default_estimated_poly_value (val);
+}
+
+
+/* Return true if T is a scalar floating-point, integral or pointer type
+   whose size is 1, 2, 4 or 8 bytes, i.e. a type that could be supported
+   as a SIMD return or argument type.  */
+
+static bool
+supported_simd_type (tree t)
+{
+  if (!SCALAR_FLOAT_TYPE_P (t) && !INTEGRAL_TYPE_P (t) && !POINTER_TYPE_P (t))
+    return false;
+
+  switch (tree_to_shwi (TYPE_SIZE_UNIT (t)))
+    {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Return true if T is currently supported as a SIMD return or argument
+   type: it must not be a complex floating-point type, its TYPE_SIZE must
+   be the same as that of B, and it must satisfy supported_simd_type.  */
+
+static bool
+currently_supported_simd_type (tree t, tree b)
+{
+  return (!COMPLEX_FLOAT_TYPE_P (t)
+          && TYPE_SIZE (t) == TYPE_SIZE (b)
+          && supported_simd_type (t));
+}
+
+/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.
+   Check the simdlen requested in CLONEI and the return and argument types
+   of NODE against BASE_TYPE, warning and returning 0 when something is
+   not supported.  Otherwise fill in CLONEI's mangling letter, mask mode,
+   vector sizes and, if it was zero, its simdlen.  Return 2 when the
+   simdlen was zero on entry, in which case NUM selects the 64-bit
+   (NUM == 0) or 128-bit vector size, and 1 when a nonzero simdlen was
+   accepted.  */
+
+static int
+aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
+                                       struct cgraph_simd_clone *clonei,
+                                       tree base_type, int num)
+{
+  unsigned int elt_bits, vec_bits, count;
+
+  if (!TARGET_SIMD)
+    return 0;
+
+  /* A nonzero simdlen must be a power of two in the range [2, 1024].  */
+  if (clonei->simdlen != 0)
+    {
+      bool in_range = clonei->simdlen >= 2 && clonei->simdlen <= 1024;
+      bool power_of_two = (clonei->simdlen & (clonei->simdlen - 1)) == 0;
+      if (!in_range || !power_of_two)
+        {
+          warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                      "unsupported simdlen %d", clonei->simdlen);
+          return 0;
+        }
+    }
+
+  /* A non-void return type must be currently supported; pick the most
+     specific warning otherwise.  */
+  tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
+  if (TREE_CODE (ret_type) != VOID_TYPE
+      && !currently_supported_simd_type (ret_type, base_type))
+    {
+      if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
+        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                    "GCC does not currently support mixed size types "
+                    "for %<simd%> functions");
+      else if (supported_simd_type (ret_type))
+        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                    "GCC does not currently support return type %qT "
+                    "for %<simd%> functions", ret_type);
+      else
+        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                    "unsupported return type %qT for %<simd%> functions",
+                    ret_type);
+      return 0;
+    }
+
+  /* Every argument type must be currently supported as well.  */
+  tree parm = DECL_ARGUMENTS (node->decl);
+  while (parm)
+    {
+      tree arg_type = TREE_TYPE (parm);
+
+      if (!currently_supported_simd_type (arg_type, base_type))
+        {
+          if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
+            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                        "GCC does not currently support mixed size types "
+                        "for %<simd%> functions");
+          else
+            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                        "GCC does not currently support argument type %qT "
+                        "for %<simd%> functions", arg_type);
+          return 0;
+        }
+
+      parm = DECL_CHAIN (parm);
+    }
+
+  clonei->vecsize_mangle = 'n';
+  clonei->mask_mode = VOIDmode;
+  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
+  if (clonei->simdlen != 0)
+    {
+      /* A nonzero simdlen must give a 64-bit or 128-bit vector.  */
+      vec_bits = clonei->simdlen * elt_bits;
+      if (vec_bits != 64 && vec_bits != 128)
+        {
+          warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+                      "GCC does not currently support simdlen %d for type %qT",
+                      clonei->simdlen, base_type);
+          return 0;
+        }
+      count = 1;
+    }
+  else
+    {
+      /* With a zero simdlen, NUM picks the 64-bit or 128-bit size and the
+         simdlen is derived from it.  */
+      vec_bits = (num == 0 ? 64 : 128);
+      clonei->simdlen = vec_bits / elt_bits;
+      count = 2;
+    }
+  clonei->vecsize_int = vec_bits;
+  clonei->vecsize_float = vec_bits;
+  return count;
+}
+
+/* Implement TARGET_SIMD_CLONE_ADJUST.  Add an "aarch64_vector_pcs"
+   attribute to the type of NODE's decl so that SIMD clones use the
+   correct ABI.  */
+
+static void
+aarch64_simd_clone_adjust (struct cgraph_node *node)
+{
+  tree fntype = TREE_TYPE (node->decl);
+  tree old_attrs = TYPE_ATTRIBUTES (fntype);
+
+  TYPE_ATTRIBUTES (fntype)
+    = make_attribute ("aarch64_vector_pcs", "default", old_attrs);
+}
+
+/* Implement TARGET_SIMD_CLONE_USABLE.  The only mangling letter expected
+   on NODE's simd clone is 'n'; for it, return 0 when TARGET_SIMD is
+   enabled and -1 otherwise.  */
+
+static int
+aarch64_simd_clone_usable (struct cgraph_node *node)
+{
+  /* Any other mangling letter is not expected to reach this hook.  */
+  if (node->simdclone->vecsize_mangle != 'n')
+    gcc_unreachable ();
+
+  return TARGET_SIMD ? 0 : -1;
+}
+
+/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  Return 0 if looking up
+   "aarch64_vector_pcs" in the attributes of TYPE1 and of TYPE2 gives
+   different results, and 1 otherwise.
+   NOTE(review): this compares the values returned by lookup_attribute
+   themselves, not merely whether the attribute is present -- confirm
+   that is the intent.  */
+
+static int
+aarch64_comp_type_attributes (const_tree type1, const_tree type2)
+{
+  tree pcs1
+    = lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1));
+  tree pcs2
+    = lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2));
+
+  return pcs1 == pcs2 ? 1 : 0;
+}
+
+/* Implement TARGET_GET_MULTILIB_ABI_NAME.  The name is "aarch64" with a
+   "_be" suffix when TARGET_BIG_END holds, followed by an "_ilp32" suffix
+   when TARGET_ILP32 holds.  */
+
+static const char *
+aarch64_get_multilib_abi_name (void)
+{
+  if (!TARGET_BIG_END)
+    return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
+
+  return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
+}
+
+/* Implement TARGET_STACK_PROTECT_GUARD.  Use the default guard when the
+   stack protector guard is a global variable (SSP_GLOBAL); otherwise
+   return a null tree.  */
+static tree
+aarch64_stack_protect_guard (void)
+{
+  return (aarch64_stack_protector_guard == SSP_GLOBAL
+          ? default_stack_protect_guard ()
+          : NULL_TREE);
+}
+
+
  /* Target-specific selftests.  */
  
  #if CHECKING_P
@@ -17874,6 +18819,9 @@ aarch64_run_selftests (void)
  
  #endif /* #if CHECKING_P */
  
+#undef TARGET_STACK_PROTECT_GUARD
+#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
+
  #undef TARGET_ADDRESS_COST
  #define TARGET_ADDRESS_COST aarch64_address_cost
  
@@ -18288,6 +19236,14 @@ aarch64_libgcc_floating_mode_supported_p
  #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
    aarch64_hard_regno_call_part_clobbered
  
+#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
+#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
+  aarch64_remove_extra_call_preserved_regs
+
+#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
+#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
+  aarch64_return_call_with_max_clobbers
+
  #undef TARGET_CONSTANT_ALIGNMENT
  #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
  
@@ -18307,6 +19263,28 @@ aarch64_libgcc_floating_mode_supported_p
  #undef TARGET_SPECULATION_SAFE_VALUE
  #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
  
+#undef TARGET_ESTIMATED_POLY_VALUE
+#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
+
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
+#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
+  aarch64_simd_clone_compute_vecsize_and_simdlen
+
+#undef TARGET_SIMD_CLONE_ADJUST
+#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
+
+#undef TARGET_SIMD_CLONE_USABLE
+#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
+
+#undef TARGET_COMP_TYPE_ATTRIBUTES
+#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
+
+#undef TARGET_GET_MULTILIB_ABI_NAME
+#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
+
  #if CHECKING_P
  #undef TARGET_RUN_TARGET_SELFTESTS
  #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests