/* Machine description for AArch64 architecture.
- Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Copyright (C) 2009-2019 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
+#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
+#include "intl.h"
/* This file should be included last. */
#include "target-def.h"
/* Defined for convenience. */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
-/* Classifies an address.
-
- ADDRESS_REG_IMM
- A simple base register plus immediate offset.
-
- ADDRESS_REG_WB
- A base register indexed by immediate offset with writeback.
-
- ADDRESS_REG_REG
- A base register indexed by (optionally scaled) register.
-
- ADDRESS_REG_UXTW
- A base register indexed by (optionally scaled) zero-extended register.
-
- ADDRESS_REG_SXTW
- A base register indexed by (optionally scaled) sign-extended register.
-
- ADDRESS_LO_SUM
- A LO_SUM rtx with a base register and "LO12" symbol relocation.
-
- ADDRESS_SYMBOLIC:
- A constant symbolic address, in pc-relative literal pool. */
-
-enum aarch64_address_type {
- ADDRESS_REG_IMM,
- ADDRESS_REG_WB,
- ADDRESS_REG_REG,
- ADDRESS_REG_UXTW,
- ADDRESS_REG_SXTW,
- ADDRESS_LO_SUM,
- ADDRESS_SYMBOLIC
-};
-
-struct aarch64_address_info {
- enum aarch64_address_type type;
- rtx base;
- rtx offset;
- poly_int64 const_offset;
- int shift;
- enum aarch64_symbol_type symbol_type;
-};
-
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
int misalignment,
bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
-static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
+static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
+ aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
/* Global flag for whether frame pointer is enabled. */
bool aarch64_use_frame_pointer;
+#define BRANCH_PROTECT_STR_MAX 255
+char *accepted_branch_protection_string = NULL;
+
+static enum aarch64_parse_opt_result
+aarch64_parse_branch_protection (const char*, char**);
+
/* Support for command line parsing of boolean flags in the tuning
structures. */
struct aarch64_flag_desc
1, /* ti */
},
1, /* pre_modify */
- 0, /* post_modify */
+ 1, /* post_modify */
0, /* register_offset */
1, /* register_sextend */
1, /* register_zextend */
0, /* imm_offset */
};
+static const struct cpu_addrcost_table tsv110_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* register_offset */
+ 1, /* register_sextend */
+ 1, /* register_zextend */
+ 0, /* imm_offset */
+};
+
static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
{
1, /* pre_modify */
1, /* post_modify */
3, /* register_offset */
- 4, /* register_sextend */
+ 3, /* register_sextend */
3, /* register_zextend */
2, /* imm_offset */
};
4 /* FP2FP */
};
+static const struct cpu_regmove_cost tsv110_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 2, /* GP2FP */
+ 3, /* FP2GP */
+ 2 /* FP2FP */
+};
+
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost generic_vector_cost =
{
1 /* cond_not_taken_branch_cost */
};
+/* QDF24XX costs for vector insn classes. */
+static const struct cpu_vector_cost qdf24xx_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 1, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* vec_int_stmt_cost */
+ 3, /* vec_fp_stmt_cost */
+ 2, /* vec_permute_cost */
+ 1, /* vec_to_scalar_cost */
+ 1, /* scalar_to_vec_cost */
+ 1, /* vec_align_load_cost */
+ 1, /* vec_unalign_load_cost */
+ 1, /* vec_unalign_store_cost */
+ 1, /* vec_store_cost */
+ 3, /* cond_taken_branch_cost */
+ 1 /* cond_not_taken_branch_cost */
+};
+
/* ThunderX costs for vector insn classes. */
static const struct cpu_vector_cost thunderx_vector_cost =
{
3 /* cond_not_taken_branch_cost */
};
+static const struct cpu_vector_cost tsv110_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 5, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 2, /* vec_int_stmt_cost */
+ 2, /* vec_fp_stmt_cost */
+ 2, /* vec_permute_cost */
+ 3, /* vec_to_scalar_cost */
+ 2, /* scalar_to_vec_cost */
+ 5, /* vec_align_load_cost */
+ 5, /* vec_unalign_load_cost */
+ 1, /* vec_unalign_store_cost */
+ 1, /* vec_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1 /* cond_not_taken_branch_cost */
+};
+
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
-1 /* default_opt_level */
};
+static const cpu_prefetch_tune tsv110_prefetch_tune =
+{
+ 0, /* num_slots */
+ 64, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 512, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune xgene1_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 256, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
static const struct tune_params generic_tunings =
{
&cortexa57_extra_costs,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
1, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
2, /* issue_rate. */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&exynosm1_vector_cost,
&generic_branch_cost,
&exynosm1_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
&thunderx_prefetch_tune
};
+static const struct tune_params tsv110_tunings =
+{
+ &tsv110_extra_costs,
+ &tsv110_addrcost_table,
+ &tsv110_regmove_cost,
+ &tsv110_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ 4, /* memmov_cost */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
+ | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &tsv110_prefetch_tune
+};
+
static const struct tune_params xgene1_tunings =
{
&xgene1_extra_costs,
&xgene1_vector_cost,
&generic_branch_cost,
&xgene1_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
"16", /* function_align. */
- "8", /* jump_align. */
+ "16", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
+ 17, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
- &generic_prefetch_tune
+ &xgene1_prefetch_tune
+};
+
+static const struct tune_params emag_tunings =
+{
+ &xgene1_extra_costs,
+ &xgene1_addrcost_table,
+ &xgene1_regmove_cost,
+ &xgene1_vector_cost,
+ &generic_branch_cost,
+ &xgene1_approx_modes,
+ SVE_NOT_IMPLEMENTED,
+ 6, /* memmov_cost */
+ 4, /* issue_rate */
+ AARCH64_FUSE_NOTHING, /* fusible_ops */
+ "16", /* function_align. */
+ "16", /* jump_align. */
+ "16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 17, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
+ &xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
&qdf24xx_extra_costs,
&qdf24xx_addrcost_table,
&qdf24xx_regmove_cost,
- &generic_vector_cost,
+ &qdf24xx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
&qdf24xx_prefetch_tune
};
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
&thunderx2t99_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
4, /* issue_rate. */
(AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
&thunderx2t99_prefetch_tune
};
+static const struct tune_params neoversen1_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &cortexa57_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ 4, /* memmov_cost */
+ 3, /* issue_rate */
+ AARCH64_FUSE_AES_AESMC, /* fusible_ops */
+ "32:16", /* function_align. */
+ "32:16", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
+};
+
/* Support for fine-grained override of the tuning structures. */
struct aarch64_tuning_override_function
{
static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
+static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
{ "fuse", aarch64_parse_fuse_string },
{ "tune", aarch64_parse_tune_string },
+ { "sve_width", aarch64_parse_sve_width_string },
{ NULL, NULL }
};
/* The current tuning set. */
struct tune_params aarch64_tune_params = generic_tunings;
+/* Table of machine attributes. */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+ /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+ affects_type_identity, handler, exclude } */
+ { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
+ { NULL, 0, 0, false, false, false, false, NULL, NULL }
+};
+
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space. */
#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
+struct aarch64_branch_protect_type
+{
+ /* The type's name that the user passes to the branch-protection option
+ string. */
+ const char* name;
+ /* Function to handle the protection type and set global variables.
+ First argument is the string token corresponding with this type and the
+ second argument is the next token in the option string.
+ Return values:
+ * AARCH64_PARSE_OK: Handling was successful.
+ * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
+ should print an error.
+ * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
+ own error. */
+ enum aarch64_parse_opt_result (*handler)(char*, char*);
+ /* A list of types that can follow this type in the option string. */
+ const aarch64_branch_protect_type* subtypes;
+ unsigned int num_subtypes;
+};
+
+static enum aarch64_parse_opt_result
+aarch64_handle_no_branch_protection (char* str, char* rest)
+{
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
+ aarch64_enable_bti = 0;
+ if (rest)
+ {
+ error ("unexpected %<%s%> after %<%s%>", rest, str);
+ return AARCH64_PARSE_INVALID_FEATURE;
+ }
+ return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_standard_branch_protection (char* str, char* rest)
+{
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+ aarch64_enable_bti = 1;
+ if (rest)
+ {
+ error ("unexpected %<%s%> after %<%s%>", rest, str);
+ return AARCH64_PARSE_INVALID_FEATURE;
+ }
+ return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
+ char* rest ATTRIBUTE_UNUSED)
+{
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+ return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
+ char* rest ATTRIBUTE_UNUSED)
+{
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
+ return AARCH64_PARSE_OK;
+}
+
+static enum aarch64_parse_opt_result
+aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
+ char* rest ATTRIBUTE_UNUSED)
+{
+ aarch64_enable_bti = 1;
+ return AARCH64_PARSE_OK;
+}
+
+static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
+ { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
+ { NULL, NULL, NULL, 0 }
+};
+
+static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
+ { "none", aarch64_handle_no_branch_protection, NULL, 0 },
+ { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
+ { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
+ ARRAY_SIZE (aarch64_pac_ret_subtypes) },
+ { "bti", aarch64_handle_bti_protection, NULL, 0 },
+ { NULL, NULL, NULL, 0 }
+};
+
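+/* For example, parsing "-mbranch-protection=pac-ret+leaf" walks these
+   tables: the "pac-ret" handler above sets aarch64_ra_sign_scope to
+   AARCH64_FUNCTION_NON_LEAF and the "leaf" subtype handler then widens
+   it to AARCH64_FUNCTION_ALL. */
+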
/* The condition codes of the processor, and the inverse function. */
static const char * const aarch64_condition_codes[] =
{
if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
return mode == Pmode;
- if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
- return true;
-
- if (FP_REGNUM_P (regno))
+ if (GP_REGNUM_P (regno))
+ {
+ if (known_le (GET_MODE_SIZE (mode), 8))
+ return true;
+ else if (known_le (GET_MODE_SIZE (mode), 16))
+ return (regno & 1) == 0;
+ }
+ else if (FP_REGNUM_P (regno))
{
if (vec_flags & VEC_STRUCT)
return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
return false;
}
+/* Return true if this is a definition of a vectorized simd function. */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+ tree fntype;
+
+ if (fndecl == NULL)
+ return false;
+ fntype = TREE_TYPE (fndecl);
+ if (fntype == NULL)
+ return false;
+
+ /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
+ if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
+ return true;
+
+ return false;
+}
+
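+/* For example, a declaration such as
+
+     void f (float *) __attribute__ ((aarch64_vector_pcs));
+
+   carries the attribute on the function type, so the predicate above
+   returns true for it. */
+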
+/* Return the mode a register save/restore should use. DImode for integer
+ registers, DFmode for FP registers in non-SIMD functions (they only save
+ the bottom half of a 128 bit register), or TFmode for FP registers in
+ SIMD functions. */
+
+static machine_mode
+aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+ return GP_REGNUM_P (regno)
+ ? E_DImode
+ : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
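+/* For instance, in a function with the aarch64_vector_pcs attribute a
+   save of V8 uses TFmode and covers the full 128 bits, whereas in an
+   ordinary function only the low 64 bits are callee-saved and DFmode
+   suffices. */
+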
+/* Return true if the instruction is a call to a SIMD function, false
+ if it is not a SIMD function or if we do not know anything about
+ the function. */
+
+static bool
+aarch64_simd_call_p (rtx_insn *insn)
+{
+ rtx symbol;
+ rtx call;
+ tree fndecl;
+
+ gcc_assert (CALL_P (insn));
+ call = get_call_rtx_from (insn);
+ symbol = XEXP (XEXP (call, 0), 0);
+ if (GET_CODE (symbol) != SYMBOL_REF)
+ return false;
+ fndecl = SYMBOL_REF_DECL (symbol);
+ if (!fndecl)
+ return false;
+
+ return aarch64_simd_decl_p (fndecl);
+}
+
+/* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
+ a function that uses the SIMD ABI, take advantage of the extra
+ call-preserved registers that the ABI provides. */
+
+void
+aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
+ HARD_REG_SET *return_set)
+{
+ if (aarch64_simd_call_p (insn))
+ {
+ for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (FP_SIMD_SAVED_REGNUM_P (regno))
+ CLEAR_HARD_REG_BIT (*return_set, regno);
+ }
+}
+
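+/* For example, at a call to a function carrying the aarch64_vector_pcs
+   attribute, V8-V23 (FP_SIMD_SAVED_REGNUM_P) are preserved in full by
+   the callee, so they are removed here from the set of registers the
+   caller must otherwise assume clobbered. */
+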
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
the lower 64 bits of a 128-bit register. Tell the compiler the callee
clobbers the top 64 bits when restoring the bottom 64 bits. */
static bool
-aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
+aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
+ machine_mode mode)
+{
+ bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
+ return FP_REGNUM_P (regno)
+ && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
+}
+
+/* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
+
+rtx_insn *
+aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
{
- return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
+ gcc_assert (CALL_P (call_1) && CALL_P (call_2));
+
+ if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
+ return call_1;
+ else
+ return call_2;
}
/* Implement REGMODE_NATURAL_SIZE. */
return SImode;
}
+/* Return true if I's bits are consecutive ones from the MSB. */
+bool
+aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
+{
+ return exact_log2 (-i) != HOST_WIDE_INT_M1;
+}
+
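+/* For example, 0xffffffffffff0000 satisfies this test, since its
+   negation 0x10000 is an exact power of two, whereas 0xffff00ff does
+   not. */
+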
/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
that strcpy from constants will be faster. */
return cc_reg;
}
+/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
+
+static rtx
+aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
+ machine_mode y_mode)
+{
+ if (y_mode == E_QImode || y_mode == E_HImode)
+ {
+ if (CONST_INT_P (y))
+ y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+ else
+ {
+ rtx t, cc_reg;
+ machine_mode cc_mode;
+
+ t = gen_rtx_ZERO_EXTEND (SImode, y);
+ t = gen_rtx_COMPARE (CC_SWPmode, t, x);
+ cc_mode = CC_SWPmode;
+ cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ emit_set_insn (cc_reg, t);
+ return cc_reg;
+ }
+ }
+
+ return aarch64_gen_compare_reg (code, x, y);
+}
+
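+/* Note that the non-constant case above emits the comparison with the
+   operands swapped (the zero-extension of Y against X) and uses
+   CC_SWPmode so that later code knows to interpret the resulting flags
+   with the operands reversed. */
+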
/* Build the SYMBOL_REF for __tls_get_addr. */
static GTY(()) rtx tls_get_addr_libfunc;
src_lo = gen_lowpart (word_mode, src);
src_hi = gen_highpart (word_mode, src);
- if (mode == TImode)
- {
- emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
- emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
- }
- else
- {
- emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
- emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
- }
+ emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
+ emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
return;
}
else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
dst_lo = gen_lowpart (word_mode, dst);
dst_hi = gen_highpart (word_mode, dst);
- if (mode == TImode)
- {
- emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
- emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
- }
- else
- {
- emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
- emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
- }
+ emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
+ emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
return;
}
}
&& register_operand (src1, src_mode)
&& register_operand (src2, src_mode));
- rtx (*gen) (rtx, rtx, rtx);
-
- switch (src_mode)
- {
- case E_V8QImode:
- gen = gen_aarch64_simd_combinev8qi;
- break;
- case E_V4HImode:
- gen = gen_aarch64_simd_combinev4hi;
- break;
- case E_V2SImode:
- gen = gen_aarch64_simd_combinev2si;
- break;
- case E_V4HFmode:
- gen = gen_aarch64_simd_combinev4hf;
- break;
- case E_V2SFmode:
- gen = gen_aarch64_simd_combinev2sf;
- break;
- case E_DImode:
- gen = gen_aarch64_simd_combinedi;
- break;
- case E_DFmode:
- gen = gen_aarch64_simd_combinedf;
- break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (dst, src1, src2));
+ emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
return;
}
if (REG_P (dst) && REG_P (src))
{
- rtx (*gen) (rtx, rtx);
-
gcc_assert (VECTOR_MODE_P (src_mode));
-
- switch (src_mode)
- {
- case E_V16QImode:
- gen = gen_aarch64_split_simd_movv16qi;
- break;
- case E_V8HImode:
- gen = gen_aarch64_split_simd_movv8hi;
- break;
- case E_V4SImode:
- gen = gen_aarch64_split_simd_movv4si;
- break;
- case E_V2DImode:
- gen = gen_aarch64_split_simd_movv2di;
- break;
- case E_V8HFmode:
- gen = gen_aarch64_split_simd_movv8hf;
- break;
- case E_V4SFmode:
- gen = gen_aarch64_split_simd_movv4sf;
- break;
- case E_V2DFmode:
- gen = gen_aarch64_split_simd_movv2df;
- break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (dst, src));
- return;
+ emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
}
}
if nonnull. */
static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
+ bool emit_move_imm = true)
{
aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
- temp1, temp2, frame_related_p);
+ temp1, temp2, frame_related_p, emit_move_imm);
}
/* Set DEST to (vec_series BASE STEP). */
void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
- emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
- gen_rtvec (2, pred, src),
- UNSPEC_MERGE_PTRUE)));
+ expand_operand ops[3];
+ machine_mode mode = GET_MODE (dest);
+ create_output_operand (&ops[0], dest, mode);
+ create_input_operand (&ops[1], pred, GET_MODE (pred));
+ create_input_operand (&ops[2], src, mode);
+ expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
tree exp ATTRIBUTE_UNUSED)
{
- /* Currently, always true. */
+ if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
+ return false;
+
return true;
}
/* Given MODE and TYPE of a function argument, return the alignment in
bits. The idea is to suppress any stronger alignment requested by
- the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
- This is a helper function for local use only. */
+ the user and opt for the natural alignment (specified in AAPCS64 \S
+ 4.1). ABI_BREAK is set to true if the alignment was incorrectly
+ calculated in versions of GCC prior to GCC 9.1. This is a helper
+ function for local use only. */
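+
+/* As an illustration (an assumed example, not wording from the ABI):
+   for a packed structure such as
+
+     struct S { long long x : 8; } __attribute__ ((packed));
+
+   each FIELD_DECL is byte-aligned while the declared bit-field type is
+   8-byte aligned, so the function below returns the larger bit-field
+   alignment and sets *ABI_BREAK; releases before GCC 9.1 returned the
+   smaller value. */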
static unsigned int
-aarch64_function_arg_alignment (machine_mode mode, const_tree type)
+aarch64_function_arg_alignment (machine_mode mode, const_tree type,
+ bool *abi_break)
{
+ *abi_break = false;
if (!type)
return GET_MODE_ALIGNMENT (mode);
return TYPE_ALIGN (TREE_TYPE (type));
unsigned int alignment = 0;
+ unsigned int bitfield_alignment = 0;
for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
if (TREE_CODE (field) == FIELD_DECL)
- alignment = std::max (alignment, DECL_ALIGN (field));
+ {
+ alignment = std::max (alignment, DECL_ALIGN (field));
+ if (DECL_BIT_FIELD_TYPE (field))
+ bitfield_alignment
+ = std::max (bitfield_alignment,
+ TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
+ }
+
+ if (bitfield_alignment > alignment)
+ {
+ *abi_break = true;
+ return bitfield_alignment;
+ }
return alignment;
}
int ncrn, nvrn, nregs;
bool allocate_ncrn, allocate_nvrn;
HOST_WIDE_INT size;
+ bool abi_break;
/* We need to do this once per argument. */
if (pcum->aapcs_arg_processed)
entirely general registers. */
if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
{
-
gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
/* C.8 if the argument has an alignment of 16 then the NGRN is
- rounded up to the next even number. */
+ rounded up to the next even number. */
if (nregs == 2
&& ncrn % 2
/* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
comparison is there because for > 16 * BITS_PER_UNIT
alignment nregs should be > 2 and therefore it should be
passed by reference rather than value. */
- && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
+ && (aarch64_function_arg_alignment (mode, type, &abi_break)
+ == 16 * BITS_PER_UNIT))
{
+ if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
+ inform (input_location, "parameter passing for argument of type "
+ "%qT changed in GCC 9.1", type);
++ncrn;
gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
}
/* NREGS can be 0 when e.g. an empty structure is to be passed.
- A reg is still generated for it, but the caller should be smart
+ A reg is still generated for it, but the caller should be smart
enough not to use it. */
if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
on_stack:
pcum->aapcs_stack_words = size / UNITS_PER_WORD;
- if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
- pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
- 16 / UNITS_PER_WORD);
+ if (aarch64_function_arg_alignment (mode, type, &abi_break)
+ == 16 * BITS_PER_UNIT)
+ {
+ int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
+ if (pcum->aapcs_stack_size != new_size)
+ {
+ if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
+ inform (input_location, "parameter passing for argument of type "
+ "%qT changed in GCC 9.1", type);
+ pcum->aapcs_stack_size = new_size;
+ }
+ }
return;
}
static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
- unsigned int alignment = aarch64_function_arg_alignment (mode, type);
+ bool abi_break;
+ unsigned int alignment = aarch64_function_arg_alignment (mode, type,
+ &abi_break);
+ if (abi_break && warn_psabi)
+ inform (input_location, "parameter passing for argument of type "
+ "%qT changed in GCC 9.1", type);
+
return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
#endif
/* The pair of scratch registers used for stack probing. */
-#define PROBE_STACK_FIRST_REG 9
-#define PROBE_STACK_SECOND_REG 10
+#define PROBE_STACK_FIRST_REG R9_REGNUM
+#define PROBE_STACK_SECOND_REG R10_REGNUM
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
inclusive. These are offsets from the current stack pointer. */
/* Loop. */
ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+ HOST_WIDE_INT stack_clash_probe_interval
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+
/* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
xops[0] = reg1;
- xops[1] = GEN_INT (PROBE_INTERVAL);
+ HOST_WIDE_INT interval;
+ if (flag_stack_clash_protection)
+ interval = stack_clash_probe_interval;
+ else
+ interval = PROBE_INTERVAL;
+
+ gcc_assert (aarch64_uimm12_shift (interval));
+ xops[1] = GEN_INT (interval);
+
output_asm_insn ("sub\t%0, %0, %1", xops);
- /* Probe at TEST_ADDR. */
- output_asm_insn ("str\txzr, [%0]", xops);
+ /* If doing stack clash protection then we probe up by the ABI specified
+ amount. We do this because we're dropping full pages at a time in the
+ loop. But if we're doing non-stack clash probing, probe at SP 0. */
+ if (flag_stack_clash_protection)
+ xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
+ else
+ xops[1] = CONST0_RTX (GET_MODE (xops[1]));
+
+ /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
+ by this amount for each iteration. */
+ output_asm_insn ("str\txzr, [%0, %1]", xops);
/* Test if TEST_ADDR == LAST_ADDR. */
xops[1] = reg2;
return "";
}
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+ SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+ of GUARD_SIZE. When a probe is emitted it is done at most
+ MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+ at most MIN_PROBE_THRESHOLD. By the end of this function
+ BASE = BASE - ADJUSTMENT. */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+ rtx min_probe_threshold, rtx guard_size)
+{
+ /* This function is not allowed to use any instruction generation function
+ like gen_ and friends. If you do, you'll likely ICE during CFG validation,
+ so instead emit the code you want using output_asm_insn. */
+ gcc_assert (flag_stack_clash_protection);
+ gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+ gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+ /* The minimum required allocation before the residual requires probing. */
+ HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+ /* Clamp the value down to the nearest value that can be used with a cmp. */
+ residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+ rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+ gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+ gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+ static int labelno = 0;
+ char loop_start_lab[32];
+ char loop_end_lab[32];
+ rtx xops[2];
+
+ ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+ ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+ /* Emit loop start label. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+ /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("cmp\t%0, %1", xops);
+
+ /* Branch to end if not enough adjustment to probe. */
+ fputs ("\tb.lt\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_end_lab);
+ fputc ('\n', asm_out_file);
+
+ /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
+ xops[0] = base;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Probe at BASE. */
+ xops[1] = const0_rtx;
+ output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+ /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Branch to start if still more bytes to allocate. */
+ fputs ("\tb\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_start_lab);
+ fputc ('\n', asm_out_file);
+
+ /* No probe needed: exit the loop. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+ /* BASE = BASE - ADJUSTMENT. */
+ xops[0] = base;
+ xops[1] = adjustment;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+ return "";
+}
+
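+/* The sequence emitted by the function above is therefore roughly:
+
+	.SVLPSPL0:
+	cmp	ADJUSTMENT, RESIDUAL_PROBE_GUARD
+	b.lt	.SVLPEND0
+	sub	BASE, BASE, RESIDUAL_PROBE_GUARD
+	str	xzr, [BASE, 0]
+	sub	ADJUSTMENT, ADJUSTMENT, RESIDUAL_PROBE_GUARD
+	b	.SVLPSPL0
+	.SVLPEND0:
+	sub	BASE, BASE, ADJUSTMENT */
+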
/* Determine whether a frame chain needs to be generated. */
static bool
aarch64_needs_frame_chain (void)
{
HOST_WIDE_INT offset = 0;
int regno, last_fp_reg = INVALID_REGNUM;
-
- if (reload_completed && cfun->machine->frame.laid_out)
- return;
+ bool simd_function = aarch64_simd_decl_p (cfun->decl);
cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
+ /* Adjust the outgoing arguments size if required. Keep it in sync with what
+ the mid-end is doing. */
+ crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
+
#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED (-1)
cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
+ /* If this is a non-leaf simd function with calls we assume that
+ at least one of those calls is to a non-simd function and thus
+ we must save V8 to V23 in the prologue. */
+
+ if (simd_function && !crtl->is_leaf)
+ {
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (FP_SIMD_SAVED_REGNUM_P (regno))
+ df_set_regs_ever_live (regno, true);
+ }
+
/* First mark all the registers that really need to be saved... */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
if (df_regs_ever_live_p (regno)
- && !call_used_regs[regno])
+ && (!call_used_regs[regno]
+ || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
{
cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
last_fp_reg = regno;
offset = 2 * UNITS_PER_WORD;
}
+ /* With stack-clash, LR must be saved in non-leaf functions. */
+ gcc_assert (crtl->is_leaf
+ || (cfun->machine->frame.reg_offset[R30_REGNUM]
+ != SLOT_NOT_REQUIRED));
+
/* Now assign stack slots for them. */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
/* If there is an alignment gap between integer and fp callee-saves,
allocate the last fp register to it if possible. */
- if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+ if (regno == last_fp_reg
+ && has_align_gap
+ && !simd_function
+ && (offset & 8) == 0)
{
cfun->machine->frame.reg_offset[regno] = max_int_offset;
break;
else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
&& cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
cfun->machine->frame.wb_candidate2 = regno;
- offset += UNITS_PER_WORD;
+ offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
}
offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
return gen_storewb_pairdf_di (base, base, reg, reg2,
GEN_INT (-adjustment),
GEN_INT (UNITS_PER_WORD - adjustment));
+ case E_TFmode:
+ return gen_storewb_pairtf_di (base, base, reg, reg2,
+ GEN_INT (-adjustment),
+ GEN_INT (UNITS_PER_VREG - adjustment));
default:
gcc_unreachable ();
}
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
rtx_insn *insn;
- machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
if (regno2 == INVALID_REGNUM)
return aarch64_pushwb_single_reg (mode, regno1, adjustment);
case E_DFmode:
return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
GEN_INT (UNITS_PER_WORD));
+ case E_TFmode:
+ return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+ GEN_INT (UNITS_PER_VREG));
default:
gcc_unreachable ();
}
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
rtx *cfi_ops)
{
- machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
rtx reg1 = gen_rtx_REG (mode, regno1);
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
case E_DFmode:
return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
+ case E_TFmode:
+ return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
case E_DFmode:
return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
+ case E_TFmode:
+ return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+
default:
gcc_unreachable ();
}
&& cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
+/* Return TRUE if Branch Target Identification Mechanism is enabled. */
+bool
+aarch64_bti_enabled (void)
+{
+ return (aarch64_enable_bti == 1);
+}
+
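+/* For example, both -mbranch-protection=bti and
+   -mbranch-protection=standard cause the handlers above to set
+   aarch64_enable_bti to 1, making this predicate return true. */
+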
/* Emit code to save the callee-saved registers from register number START
to LIMIT to the stack at the location starting at offset START_OFFSET,
skipping any write-back candidates if SKIP_WB is true. */
{
rtx reg, mem;
poly_int64 offset;
+ int offset_diff;
if (skip_wb
&& (regno == cfun->machine->frame.wb_candidate1
offset));
regno2 = aarch64_next_callee_save (regno + 1, limit);
+ offset_diff = cfun->machine->frame.reg_offset[regno2]
+ - cfun->machine->frame.reg_offset[regno];
if (regno2 <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
- == cfun->machine->frame.reg_offset[regno2]))
-
+ && known_eq (GET_MODE_SIZE (mode), offset_diff))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
rtx mem2;
continue;
rtx reg, mem;
+ int offset_diff;
if (skip_wb
&& (regno == cfun->machine->frame.wb_candidate1
mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
regno2 = aarch64_next_callee_save (regno + 1, limit);
+ offset_diff = cfun->machine->frame.reg_offset[regno2]
+ - cfun->machine->frame.reg_offset[regno];
if (regno2 <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
- == cfun->machine->frame.reg_offset[regno2]))
+ && known_eq (GET_MODE_SIZE (mode), offset_diff))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
rtx mem2;
/* Return true if OFFSET is a signed 9-bit value. */
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
- poly_int64 offset)
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+ poly_int64 offset)
{
HOST_WIDE_INT const_offset;
return (offset.is_constant (&const_offset)
static sbitmap
aarch64_get_separate_components (void)
{
- aarch64_layout_frame ();
-
sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
bitmap_clear (components);
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
- /* If aarch64_layout_frame has chosen registers to store/restore with
+ /* If registers have been chosen to be stored/restored with
writeback don't interfere with them to avoid having to output explicit
stack adjustment instructions. */
if (reg2 != INVALID_REGNUM)
bitmap in = DF_LIVE_IN (bb);
bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+ bool simd_function = aarch64_simd_decl_p (cfun->decl);
sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
bitmap_clear (components);
/* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
- if ((!call_used_regs[regno])
+ if ((!call_used_regs[regno]
+ || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
&& (bitmap_bit_p (in, regno)
|| bitmap_bit_p (gen, regno)
|| bitmap_bit_p (kill, regno)))
while (regno != last_regno)
{
- /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
- so DFmode for the vector registers is enough. */
- machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+ /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
+ so DFmode for the vector registers is enough. For simd functions
+ we want to save the low 128 bits. */
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+
rtx reg = gen_rtx_REG (mode, regno);
poly_int64 offset = cfun->machine->frame.reg_offset[regno];
if (!frame_pointer_needed)
mergeable with the current one into a pair. */
if (!satisfies_constraint_Ump (mem)
|| GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+ || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
|| maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
GET_MODE_SIZE (mode)))
{
}
else
{
- add_reg_note (insn, REG_CFA_RESTORE, reg);
- add_reg_note (insn, REG_CFA_RESTORE, reg2);
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ add_reg_note (insn, REG_CFA_RESTORE, reg2);
+ }
+
+ regno = aarch64_get_next_set_bit (components, regno2 + 1);
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+
+static void
+aarch64_emit_prologue_components (sbitmap components)
+{
+ aarch64_process_components (components, true);
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+
+static void
+aarch64_emit_epilogue_components (sbitmap components)
+{
+ aarch64_process_components (components, false);
+}
+
+/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
+
+static void
+aarch64_set_handled_components (sbitmap components)
+{
+ for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
+ if (bitmap_bit_p (components, regno))
+ cfun->machine->reg_is_wrapped_separately[regno] = true;
+}
+
+/* On AArch64 we have an ABI defined safe buffer. This constant is used to
+ determine the probe offset for alloca. */
+
+static HOST_WIDE_INT
+aarch64_stack_clash_protection_alloca_probe_range (void)
+{
+ return STACK_CLASH_CALLER_GUARD;
+}
+
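+/* Note: the value returned above is STACK_CLASH_CALLER_GUARD, the 1KB
+   ABI defined buffer also described in the frame layout comment below;
+   the middle end uses it as the probe offset when probing alloca
+   allocations. */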
+
+/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
+ registers. If POLY_SIZE is not large enough to require a probe this function
+ will only adjust the stack. When allocating the stack space
+ FRAME_RELATED_P is then used to indicate if the allocation is frame related.
+ FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
+ arguments. If we are, then we ensure that any allocation larger than the ABI
+ defined buffer needs a probe so that the invariant of having a 1KB buffer is
+ maintained.
+
+ We emit barriers after each stack adjustment to prevent optimizations from
+ breaking the invariant that we never drop the stack more than a page. This
+ invariant is needed to make it easier to correctly handle asynchronous
+ events, e.g. if we were to allow the stack to be dropped by more than a page
+ and then have multiple probes up and we take a signal somewhere in between
+ then the signal handler doesn't know the state of the stack and can make no
+ assumptions about which pages have been probed. */
+
+static void
+aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ poly_int64 poly_size,
+ bool frame_related_p,
+ bool final_adjustment_p)
+{
+ HOST_WIDE_INT guard_size
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+ /* When doing the final adjustment for the outgoing argument size we can't
+ assume that LR was saved at position 0. So subtract its offset from the
+ ABI safe buffer so that we don't accidentally allow an adjustment that
+ would result in an allocation larger than the ABI buffer without
+ probing. */
+ HOST_WIDE_INT min_probe_threshold
+ = final_adjustment_p
+ ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
+ : guard_size - guard_used_by_caller;
+
+ poly_int64 frame_size = cfun->machine->frame.frame_size;
+
+ /* We should always have a positive probe threshold. */
+ gcc_assert (min_probe_threshold > 0);
+
+ if (flag_stack_clash_protection && !final_adjustment_p)
+ {
+ poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
+ poly_int64 final_adjust = cfun->machine->frame.final_adjust;
+
+ if (known_eq (frame_size, 0))
+ {
+ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+ }
+ else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
+ && known_lt (final_adjust, guard_used_by_caller))
+ {
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+ }
+ }
+
+ /* If SIZE is not large enough to require probing, just adjust the stack and
+ exit. */
+ if (known_lt (poly_size, min_probe_threshold)
+ || !flag_stack_clash_protection)
+ {
+ aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+ return;
+ }
+
+ HOST_WIDE_INT size;
+ /* Handle the SVE non-constant case first. */
+ if (!poly_size.is_constant (&size))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Stack clash SVE prologue: ");
+ print_dec (poly_size, dump_file);
+ fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+ }
+
+ /* First calculate the amount of bytes we're actually spilling. */
+ aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+ poly_size, temp1, temp2, false, true);
+
+ rtx_insn *insn = get_last_insn ();
+
+ if (frame_related_p)
+ {
+ /* This is done to provide unwinding information for the stack
+ adjustments we're about to do; however, to prevent the optimizers
+ from removing the R11 move and leaving the CFA note (which would be
+ very wrong) we tie the old and new stack pointer together.
+ The tie will expand to nothing but the optimizers will not touch
+ the instruction. */
+ rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
+ emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+ emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+ /* We want the CFA independent of the stack pointer for the
+ duration of the loop. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+ rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+ insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+ stack_pointer_rtx, temp1,
+ probe_const, guard_const));
+
+ /* Now reset the CFA register if needed. */
+ if (frame_related_p)
+ {
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ gen_int_mode (poly_size, Pmode)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ return;
+ }
+
+ if (dump_file)
+ fprintf (dump_file,
+ "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+ " bytes, probing will be required.\n", size);
+
+ /* Round size to the nearest multiple of guard_size, and calculate the
+ residual as the difference between the original size and the rounded
+ size. */
+ HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
+ HOST_WIDE_INT residual = size - rounded_size;
+
+ /* We can handle a small number of allocations/probes inline. Otherwise
+ punt to a loop. */
+ if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
+ {
+ for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
+ {
+ aarch64_sub_sp (NULL, temp2, guard_size, true);
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ guard_used_by_caller));
+ emit_insn (gen_blockage ());
+ }
+ dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+ }
+ else
+ {
+ /* Compute the ending address. */
+ aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
+ temp1, NULL, false, true);
+ rtx_insn *insn = get_last_insn ();
+
+ /* For the initial allocation, we don't have a frame pointer
+ set up, so we always need CFI notes. If we're doing the
+ final allocation, then we may have a frame pointer, in which
+ case it is the CFA, otherwise we need CFI notes.
+
+ We can determine which allocation we are doing by looking at
+ the value of FRAME_RELATED_P since the final allocations are not
+ frame related. */
+ if (frame_related_p)
+ {
+ /* We want the CFA independent of the stack pointer for the
+ duration of the loop. */
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, temp1, rounded_size));
+ RTX_FRAME_RELATED_P (insn) = 1;
}
- regno = aarch64_get_next_set_bit (components, regno2 + 1);
- }
-}
+ /* This allocates and probes the stack. Note that this re-uses some of
+ the existing Ada stack protection code. However, we are guaranteed not
+ to enter the non-loop or residual branches of that code.
-/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+ The non-loop part won't be entered because if our allocation amount
+ doesn't require a loop, the case above would handle it.
-static void
-aarch64_emit_prologue_components (sbitmap components)
-{
- aarch64_process_components (components, true);
-}
+ The residual amount won't be entered because TEMP1 is a multiple of
+ the allocation size. The residual will always be 0. As such, the only
+ part we are actually using from that code is the loop setup. The
+ actual probing is done in aarch64_output_probe_stack_range. */
+ insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
+ stack_pointer_rtx, temp1));
-/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+ /* Now reset the CFA register if needed. */
+ if (frame_related_p)
+ {
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx, rounded_size));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
-static void
-aarch64_emit_epilogue_components (sbitmap components)
-{
- aarch64_process_components (components, false);
+ emit_insn (gen_blockage ());
+ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+ }
+
+ /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
+ be probed. This maintains the requirement that each page is probed at
+ least once. For initial probing we probe only if the allocation is
+ more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
+ if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
+ GUARD_SIZE. This works because for any allocation that is large enough to
+ trigger a probe here, we'll have at least one, and if it is not large
+ enough for this code to emit anything, the page would have been
+ probed by the saving of FP/LR either by this function or any callees. If
+ we don't have any callees then we won't have more stack adjustments and so
+ are still safe. */
+ if (residual)
+ {
+ HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
+ /* If we're doing final adjustments, and we've done any full page
+ allocations then any residual needs to be probed. */
+ if (final_adjustment_p && rounded_size != 0)
+ min_probe_threshold = 0;
+ /* If doing a small final adjustment, we always probe at offset 0.
+ This is done to avoid issues when LR is not at position 0 or when
+ the final adjustment is smaller than the probing offset. */
+ else if (final_adjustment_p && rounded_size == 0)
+ residual_probe_offset = 0;
+
+ aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+ if (residual >= min_probe_threshold)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "Stack clash AArch64 prologue residuals: "
+ HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
+ "\n", residual);
+
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ residual_probe_offset));
+ emit_insn (gen_blockage ());
+ }
+ }
}
-/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
+/* Return 1 if the register is used by the epilogue. We need to say the
+ return register is used, but only after epilogue generation is complete.
+ Note that in the case of sibcalls, the values "used by the epilogue" are
+ considered live at the start of the called function.
-static void
-aarch64_set_handled_components (sbitmap components)
+ For SIMD functions we need to return 1 for FP registers that are saved and
+ restored by a function but are not zero in call_used_regs. If we do not do
+ this, optimizations may remove the restore of the register. */
+
+int
+aarch64_epilogue_uses (int regno)
{
- for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
- if (bitmap_bit_p (components, regno))
- cfun->machine->reg_is_wrapped_separately[regno] = true;
+ if (epilogue_completed)
+ {
+ if (regno == LR_REGNUM)
+ return 1;
+ if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
+ return 1;
+ }
+ return 0;
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
| local variables | <-- frame_pointer_rtx
| |
+-------------------------------+
- | padding0 | \
+ | padding | \
+-------------------------------+ |
| callee-saved registers | | frame.saved_regs_size
+-------------------------------+ |
Dynamic stack allocations via alloca() decrease stack_pointer_rtx
but leave frame_pointer_rtx and hard_frame_pointer_rtx
- unchanged. */
+ unchanged.
+
+ By default for stack-clash we assume the guard is at least 64KB, but this
+ value is configurable to either 4KB or 64KB. We also force the guard size to
+ be the same as the probing interval and both values are kept in sync.
+
+ With those assumptions the callee can allocate up to 63KB (or 3KB depending
+ on the guard size) of stack space without probing.
+
+ When probing is needed, we emit a probe at the start of the prologue
+ and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
+
+ We have to track how much space has been allocated and the only stores
+ to the stack we track as implicit probes are the FP/LR stores.
+
+ For outgoing arguments we probe if the size is larger than 1KB, such that
+ the ABI specified buffer is maintained for the next callee.
+
+ The following registers are reserved during frame layout and should not be
+ used for any other purpose:
+
+ - r11: Used by stack clash protection when SVE is enabled.
+ - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
+ - r14 and r15: Used for speculation tracking.
+ - r16(IP0), r17(IP1): Used by indirect tailcalls.
+ - r30(LR), r29(FP): Used by standard frame layout.
+
+ These registers must be avoided in frame layout related code unless the
+ explicit intention is to interact with one of the features listed above. */
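+
+/* For example, with the default 64KB guard and the 1KB caller-reserved
+   buffer, an initial adjustment of at most 63KB needs no probe at all,
+   while a 130KB adjustment is emitted as two probed 64KB pages followed
+   by a 2KB residual that is covered by the subsequent FP/LR saves. */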
/* Generate the prologue instructions for entry into a function.
Establish the stack frame by decreasing the stack pointer with a
void
aarch64_expand_prologue (void)
{
- aarch64_layout_frame ();
-
poly_int64 frame_size = cfun->machine->frame.frame_size;
poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
}
- rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
- rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+ rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+ rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
+
+ /* In theory we should never have both an initial adjustment
+ and a callee save adjustment. Verify that is the case since the
+ code below does not handle it for -fstack-clash-protection. */
+ gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
- aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
+ /* Will only probe if the initial adjustment is larger than the guard
+ less the amount of the guard reserved for use by the caller's
+ outgoing args. */
+ aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
+ true, false);
if (callee_adjust != 0)
aarch64_push_regs (reg1, reg2, callee_adjust);
}
aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
stack_pointer_rtx, callee_offset,
- ip1_rtx, ip0_rtx, frame_pointer_needed);
+ tmp1_rtx, tmp0_rtx, frame_pointer_needed);
if (frame_pointer_needed && !frame_size.is_constant ())
{
/* Variable-sized frames need to describe the save slot
aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0 || emit_frame_chain);
- aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0 || emit_frame_chain);
- aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
+ if (aarch64_simd_decl_p (cfun->decl))
+ aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || emit_frame_chain);
+ else
+ aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || emit_frame_chain);
+
+ /* We may need to probe the final adjustment if it is larger than the guard
+ that is assumed by the callee. */
+ aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+ !frame_pointer_needed, true);
}
/* Return TRUE if we can use a simple_return insn.
if (crtl->profile)
return false;
- aarch64_layout_frame ();
-
return known_eq (cfun->machine->frame.frame_size, 0);
}
+/* Return false for non-leaf SIMD functions in order to avoid
+ shrink-wrapping them. Shrink-wrapping would lose the necessary
+ save/restore of FP registers. */
+
+bool
+aarch64_use_simple_return_insn_p (void)
+{
+ if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
+ return false;
+
+ return true;
+}
+
/* Generate the epilogue instructions for returning from a function.
This is almost exactly the reverse of the prolog sequence, except
that we need to insert barriers to avoid scheduling loads that read
void
aarch64_expand_epilogue (bool for_sibcall)
{
- aarch64_layout_frame ();
-
poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
poly_int64 final_adjust = cfun->machine->frame.final_adjust;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
rtx cfi_ops = NULL;
rtx_insn *insn;
- /* A stack clash protection prologue may not have left IP0_REGNUM or
- IP1_REGNUM in a usable state. The same is true for allocations
+ /* A stack clash protection prologue may not have left EP0_REGNUM or
+ EP1_REGNUM in a usable state. The same is true for allocations
with an SVE component, since we then need both temporary registers
- for each allocation. */
+ for each allocation. For stack clash we are in a usable state if
+ the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
+ HOST_WIDE_INT guard_size
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+
+ /* We can re-use the registers when the allocation amount is smaller than
+ guard_size - guard_used_by_caller because we won't be doing any probes
+ then. In such situations the register should remain live with the correct
+ value. */
bool can_inherit_p = (initial_adjust.is_constant ()
- && final_adjust.is_constant ()
- && !flag_stack_clash_protection);
+ && final_adjust.is_constant ())
+ && (!flag_stack_clash_protection
+ || known_lt (initial_adjust,
+ guard_size - guard_used_by_caller));
/* We need to add memory barrier to prevent read from deallocated stack. */
bool need_barrier_p
/* Restore the stack pointer from the frame pointer if it may not
be the same as the stack pointer. */
- rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
- rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+ rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+ rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
if (frame_pointer_needed
&& (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
/* If writeback is used when restoring callee-saves, the CFA
is restored on the instruction doing the writeback. */
aarch64_add_offset (Pmode, stack_pointer_rtx,
hard_frame_pointer_rtx, -callee_offset,
- ip1_rtx, ip0_rtx, callee_adjust == 0);
+ tmp1_rtx, tmp0_rtx, callee_adjust == 0);
else
- aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
- !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
+ /* The case where we need to re-use the register here is very rare, so
+ avoid the complicated condition and just always emit a move if the
+ immediate doesn't fit. */
+ aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0, &cfi_ops);
- aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0, &cfi_ops);
+ if (aarch64_simd_decl_p (cfun->decl))
+ aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0, &cfi_ops);
+ else
+ aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0, &cfi_ops);
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
cfi_ops = NULL;
}
- aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
- !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
+ /* The liveness of EP0_REGNUM cannot be trusted across function calls, so
+ restrict the emit_move optimization to leaf functions. */
+ aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+ (!can_inherit_p || !crtl->is_leaf
+ || df_regs_ever_live_p (EP0_REGNUM)));
if (cfi_ops)
{
rtx this_rtx, temp0, temp1, addr, funexp;
rtx_insn *insn;
+ if (aarch64_bti_enabled ())
+ emit_insn (gen_bti_c());
+
reload_completed = 1;
emit_note (NOTE_INSN_PROLOGUE_END);
this_rtx = gen_rtx_REG (Pmode, this_regno);
- temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
- temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
+ temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
+ temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
if (vcall_offset == 0)
aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
);
}
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+ that can be created with a left shift of 0 or 12. */
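+/* For example, 0xabc already fits and is returned unchanged, while
+ 0x123456 is clamped to 0x123000, i.e. 0x123 << 12. */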
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+ /* Check to see if the value fits in 24 bits, as that is the maximum we can
+ handle correctly. */
+ gcc_assert ((val & 0xffffff) == val);
+
+ if (((val & 0xfff) << 0) == val)
+ return val;
+
+ return val & (0xfff << 12);
+}
/* Return true if val is an immediate that can be loaded into a
register by a MOVZ instruction. */
If it is, fill in INFO appropriately. STRICT_P is true if
REG_OK_STRICT is in effect. */
-static bool
+bool
aarch64_classify_address (struct aarch64_address_info *info,
rtx x, machine_mode mode, bool strict_p,
- aarch64_addr_query_type type = ADDR_QUERY_M)
+ aarch64_addr_query_type type)
{
enum rtx_code code = GET_CODE (x);
rtx op0, op1;
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
+ || type == ADDR_QUERY_LDP_STP_N
|| mode == TImode
|| mode == TFmode
|| (BYTES_BIG_ENDIAN && advsimd_struct_p));
+ /* For ADDR_QUERY_LDP_STP_N the incoming mode corresponds to the full size
+ of the memory being loaded/stored, while the addressing mode itself uses
+ a mode covering half of that size. */
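+ /* E.g. a 16-byte access printed with the %y operand modifier is
+ validated here as a DFmode LDP/STP of two 8-byte halves. */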
+ if (type == ADDR_QUERY_LDP_STP_N
+ && known_eq (GET_MODE_SIZE (mode), 16))
+ mode = DFmode;
+
bool allow_reg_index_p = (!load_store_pair_p
&& (known_lt (GET_MODE_SIZE (mode), 16)
|| vec_flags == VEC_ADVSIMD
instruction memory accesses. */
if (mode == TImode || mode == TFmode)
return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
- && (offset_9bit_signed_unscaled_p (mode, offset)
+ && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
|| offset_12bit_unsigned_scaled_p (mode, offset)));
/* A 7bit offset check because OImode will emit a ldp/stp
ldr/str instructions (only big endian will get here). */
if (mode == CImode)
return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
- && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+ && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+ offset + 32)
|| offset_12bit_unsigned_scaled_p (V16QImode,
offset + 32)));
|| known_eq (GET_MODE_SIZE (mode), 16))
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
else
- return (offset_9bit_signed_unscaled_p (mode, offset)
+ return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
|| offset_12bit_unsigned_scaled_p (mode, offset));
}
*/
if (mode == TImode || mode == TFmode)
return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
- && offset_9bit_signed_unscaled_p (mode, offset));
+ && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
if (load_store_pair_p)
return ((known_eq (GET_MODE_SIZE (mode), 4)
|| known_eq (GET_MODE_SIZE (mode), 16))
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
else
- return offset_9bit_signed_unscaled_p (mode, offset);
+ return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
}
return false;
machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
+ machine_mode mode_x = GET_MODE (x);
+ rtx_code code_x = GET_CODE (x);
+
/* All floating point compares return CCFP if it is an equality
comparison, and CCFPE otherwise. */
- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+ if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
{
switch (code)
{
/* Equality comparisons of short modes against zero can be performed
using the TST instruction with the appropriate bitmask. */
- if (y == const0_rtx && REG_P (x)
+ if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
&& (code == EQ || code == NE)
- && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
+ && (mode_x == HImode || mode_x == QImode))
return CC_NZmode;
/* Similarly, comparisons of zero_extends from shorter modes can
be performed using an ANDS with an immediate mask. */
- if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
- && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+ if (y == const0_rtx && code_x == ZERO_EXTEND
+ && (mode_x == SImode || mode_x == DImode)
&& (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
&& (code == EQ || code == NE))
return CC_NZmode;
- if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+ if ((mode_x == SImode || mode_x == DImode)
&& y == const0_rtx
&& (code == EQ || code == NE || code == LT || code == GE)
- && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
- || GET_CODE (x) == NEG
- || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
+ && (code_x == PLUS || code_x == MINUS || code_x == AND
+ || code_x == NEG
+ || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
&& CONST_INT_P (XEXP (x, 2)))))
return CC_NZmode;
/* A compare with a shifted operand. Because of canonicalization,
the comparison will have to be swapped when we emit the assembly
code. */
- if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+ if ((mode_x == SImode || mode_x == DImode)
&& (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
- && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
- || GET_CODE (x) == LSHIFTRT
- || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
+ && (code_x == ASHIFT || code_x == ASHIFTRT
+ || code_x == LSHIFTRT
+ || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
return CC_SWPmode;
/* Similarly for a negated operand, but we can only do this for
equalities. */
- if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
+ if ((mode_x == SImode || mode_x == DImode)
&& (REG_P (y) || GET_CODE (y) == SUBREG)
&& (code == EQ || code == NE)
- && GET_CODE (x) == NEG)
+ && code_x == NEG)
return CC_Zmode;
- /* A test for unsigned overflow. */
- if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
- && code == NE
- && GET_CODE (x) == PLUS
- && GET_CODE (y) == ZERO_EXTEND)
+ /* A test for unsigned overflow from an addition. */
+ if ((mode_x == DImode || mode_x == TImode)
+ && (code == LTU || code == GEU)
+ && code_x == PLUS
+ && rtx_equal_p (XEXP (x, 0), y))
return CC_Cmode;
+ /* A test for unsigned overflow from an add with carry. */
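+ /* For TImode, for instance, the constant tested below is 1 << 64, the
+ value that a carry out of the low half contributes. */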
+ if ((mode_x == DImode || mode_x == TImode)
+ && (code == LTU || code == GEU)
+ && code_x == PLUS
+ && CONST_SCALAR_INT_P (y)
+ && (rtx_mode_t (y, mode_x)
+ == (wi::shwi (1, mode_x)
+ << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
+ return CC_ADCmode;
+
+ /* A test for signed overflow. */
+ if ((mode_x == DImode || mode_x == TImode)
+ && code == NE
+ && code_x == PLUS
+ && GET_CODE (y) == SIGN_EXTEND)
+ return CC_Vmode;
+
/* For everything else, return CCmode. */
return CCmode;
}
case E_CC_Cmode:
switch (comp_code)
{
- case NE: return AARCH64_CS;
- case EQ: return AARCH64_CC;
+ case LTU: return AARCH64_CS;
+ case GEU: return AARCH64_CC;
+ default: return -1;
+ }
+ break;
+
+ case E_CC_ADCmode:
+ switch (comp_code)
+ {
+ case GEU: return AARCH64_CS;
+ case LTU: return AARCH64_CC;
+ default: return -1;
+ }
+ break;
+
+ case E_CC_Vmode:
+ switch (comp_code)
+ {
+ case NE: return AARCH64_VS;
+ case EQ: return AARCH64_VC;
default: return -1;
}
break;
break;
case 'H':
+ if (x == const0_rtx)
+ {
+ asm_fprintf (f, "xzr");
+ break;
+ }
+
if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
{
output_operand_lossage ("invalid operand for '%%%c'", code);
return;
}
- if (code == 'y')
- /* LDP/STP which uses a single double-width memory operand.
- Adjust the mode to appear like a typical LDP/STP.
- Currently this is supported for 16-byte accesses only. */
- mode = DFmode;
-
- if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
+ if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
+ code == 'y'
+ ? ADDR_QUERY_LDP_STP_N
+ : ADDR_QUERY_LDP_STP))
output_operand_lossage ("invalid operand prefix '%%%c'", code);
}
break;
unsigned int size;
/* Check all addresses are Pmode - including ILP32. */
- if (GET_MODE (x) != Pmode)
- output_operand_lossage ("invalid address mode");
+ if (GET_MODE (x) != Pmode
+ && (!CONST_INT_P (x)
+ || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
+ {
+ output_operand_lossage ("invalid address mode");
+ return false;
+ }
if (aarch64_classify_address (&addr, x, mode, true, type))
switch (addr.type)
return false;
}
-/* Print address 'x' of a LDP/STP with mode 'mode'. */
-static bool
-aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
-{
- return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
-}
-
/* Print address 'x' of a memory access with mode 'mode'. */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
return x;
}
-/* Return the reload icode required for a constant pool in mode. */
-static enum insn_code
-aarch64_constant_pool_reload_icode (machine_mode mode)
-{
- switch (mode)
- {
- case E_SFmode:
- return CODE_FOR_aarch64_reload_movcpsfdi;
-
- case E_DFmode:
- return CODE_FOR_aarch64_reload_movcpdfdi;
-
- case E_TFmode:
- return CODE_FOR_aarch64_reload_movcptfdi;
-
- case E_V8QImode:
- return CODE_FOR_aarch64_reload_movcpv8qidi;
-
- case E_V16QImode:
- return CODE_FOR_aarch64_reload_movcpv16qidi;
-
- case E_V4HImode:
- return CODE_FOR_aarch64_reload_movcpv4hidi;
-
- case E_V8HImode:
- return CODE_FOR_aarch64_reload_movcpv8hidi;
-
- case E_V2SImode:
- return CODE_FOR_aarch64_reload_movcpv2sidi;
-
- case E_V4SImode:
- return CODE_FOR_aarch64_reload_movcpv4sidi;
-
- case E_V2DImode:
- return CODE_FOR_aarch64_reload_movcpv2didi;
-
- case E_V2DFmode:
- return CODE_FOR_aarch64_reload_movcpv2dfdi;
-
- default:
- gcc_unreachable ();
- }
-
- gcc_unreachable ();
-}
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
reg_class_t rclass,
|| targetm.vector_mode_supported_p (GET_MODE (x)))
&& !aarch64_pcrelative_literal_loads)
{
- sri->icode = aarch64_constant_pool_reload_icode (mode);
+ sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
return NO_REGS;
}
&& FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
&& reg_class_subset_p (rclass, FP_REGS))
{
- if (mode == TFmode)
- sri->icode = CODE_FOR_aarch64_reload_movtf;
- else if (mode == TImode)
- sri->icode = CODE_FOR_aarch64_reload_movti;
+ sri->icode = code_for_aarch64_reload_mov (mode);
return NO_REGS;
}
poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
- aarch64_layout_frame ();
-
if (to == HARD_FRAME_POINTER_REGNUM)
{
if (from == ARG_POINTER_REGNUM)
static void
aarch64_asm_trampoline_template (FILE *f)
{
+ int offset1 = 16;
+ int offset2 = 20;
+
+ if (aarch64_bti_enabled ())
+ {
+ asm_fprintf (f, "\thint\t34 // bti c\n");
+ offset1 -= 4;
+ offset2 -= 4;
+ }
+
if (TARGET_ILP32)
{
- asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
- asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
+ asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
+ asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
+ offset1);
}
else
{
- asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
- asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
+ asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
+ asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
+ offset2);
}
asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
- assemble_aligned_integer (4, const0_rtx);
+
+ /* The trampoline needs an extra padding instruction. If BTI is enabled,
+ the padding instruction is replaced by the BTI instruction at the
+ beginning. */
+ if (!aarch64_bti_enabled ())
+ assemble_aligned_integer (4, const0_rtx);
+
assemble_aligned_integer (POINTER_BYTES, const0_rtx);
assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
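+/* With BTI enabled the LP64 template is thus laid out as follows (assuming
+ the usual register assignments, with x17 as IP1 and x18 as the static
+ chain register):
+	hint	34	// bti c
+	ldr	x17, .+12	// function address, loaded from below
+	ldr	x18, .+16	// static chain, loaded from below
+	br	x17
+	<function address>
+	<static chain value>
+ with the padding word subsumed by the BTI instruction. */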
return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
&& INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
&& exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
- && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
+ && (INTVAL (mask)
+ & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
+}
+
+/* Return true if the masks and a shift amount from an RTX of the form
+ ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+ a BFI instruction of mode MODE. See *aarch64_bfi patterns. */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+ unsigned HOST_WIDE_INT mask1,
+ unsigned HOST_WIDE_INT shft_amnt,
+ unsigned HOST_WIDE_INT mask2)
+{
+ unsigned HOST_WIDE_INT t;
+
+ /* Verify that the two masks are complementary: every bit must be set in
+ exactly one of them. */
+ if (mask1 != ~mask2)
+ return false;
+
+ /* Verify that mask2 is not all zeros or ones. */
+ if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
+ return false;
+
+ /* The shift amount should always be less than the mode size. */
+ gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
+
+ /* Verify that the mask being shifted is contiguous and would be in the
+ least significant bits after shifting by shft_amnt. */
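+ /* E.g. mask2 == 0xff00 with shft_amnt == 8 gives t == 0x10000, a power
+ of two, and is accepted; mask2 == 0xf0f0 with shft_amnt == 4 gives
+ t == 0xf100, which is rejected. */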
+ t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
+ return (t == (t & -t));
}
/* Calculate the cost of calculating X, storing it in *COST. Result
return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
-typedef rtx (*rsqrte_type) (rtx, rtx);
-
-/* Select reciprocal square root initial estimate insn depending on machine
- mode. */
-
-static rsqrte_type
-get_rsqrte_type (machine_mode mode)
-{
- switch (mode)
- {
- case E_DFmode: return gen_aarch64_rsqrtedf;
- case E_SFmode: return gen_aarch64_rsqrtesf;
- case E_V2DFmode: return gen_aarch64_rsqrtev2df;
- case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
- case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
- default: gcc_unreachable ();
- }
-}
-
-typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
-
-/* Select reciprocal square root series step insn depending on machine mode. */
-
-static rsqrts_type
-get_rsqrts_type (machine_mode mode)
-{
- switch (mode)
- {
- case E_DFmode: return gen_aarch64_rsqrtsdf;
- case E_SFmode: return gen_aarch64_rsqrtssf;
- case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
- case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
- case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
- default: gcc_unreachable ();
- }
-}
-
/* Emit instruction sequence to compute either the approximate square root
or its approximate reciprocal, depending on the flag RECP, and return
whether the sequence was emitted or not. */
/* Estimate the approximate reciprocal square root. */
rtx xdst = gen_reg_rtx (mode);
- emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
+ emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
/* Iterate over the series twice for SF and thrice for DF. */
int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
rtx x2 = gen_reg_rtx (mode);
emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
- emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
+ emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
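+ /* FRSQRTS computes (3 - a*b) / 2, so together with the multiply below
+ this forms the Newton-Raphson step x' = x * (3 - src * x^2) / 2. */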
if (iterations > 0)
emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
return true;
}
-typedef rtx (*recpe_type) (rtx, rtx);
-
-/* Select reciprocal initial estimate insn depending on machine mode. */
-
-static recpe_type
-get_recpe_type (machine_mode mode)
-{
- switch (mode)
- {
- case E_SFmode: return (gen_aarch64_frecpesf);
- case E_V2SFmode: return (gen_aarch64_frecpev2sf);
- case E_V4SFmode: return (gen_aarch64_frecpev4sf);
- case E_DFmode: return (gen_aarch64_frecpedf);
- case E_V2DFmode: return (gen_aarch64_frecpev2df);
- default: gcc_unreachable ();
- }
-}
-
-typedef rtx (*recps_type) (rtx, rtx, rtx);
-
-/* Select reciprocal series step insn depending on machine mode. */
-
-static recps_type
-get_recps_type (machine_mode mode)
-{
- switch (mode)
- {
- case E_SFmode: return (gen_aarch64_frecpssf);
- case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
- case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
- case E_DFmode: return (gen_aarch64_frecpsdf);
- case E_V2DFmode: return (gen_aarch64_frecpsv2df);
- default: gcc_unreachable ();
- }
-}
-
/* Emit the instruction sequence to compute the approximation for the division
of NUM by DEN in QUO and return whether the sequence was emitted or not. */
/* Estimate the approximate reciprocal. */
rtx xrcp = gen_reg_rtx (mode);
- emit_insn ((*get_recpe_type (mode)) (xrcp, den));
+ emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
/* Iterate over the series twice for SF and thrice for DF. */
int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
rtx xtmp = gen_reg_rtx (mode);
while (iterations--)
{
- emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
+ emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
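+ /* FRECPS computes 2 - a*b, so together with the multiply below this
+ forms the Newton-Raphson step x' = x * (2 - den * x). */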
if (iterations > 0)
emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
/* Parse the TO_PARSE string and put the architecture struct that it
selects into RES and the architectural features into ISA_FLAGS.
Return an aarch64_parse_opt_result describing the parse result.
- If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
+ If there is an error parsing, RES and ISA_FLAGS are left unchanged.
+ When the TO_PARSE string contains an invalid extension,
+ a copy of the offending extension is stored in INVALID_EXTENSION. */
static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
- unsigned long *isa_flags)
+ unsigned long *isa_flags, std::string *invalid_extension)
{
- char *ext;
+ const char *ext;
const struct processor *arch;
- char *str = (char *) alloca (strlen (to_parse) + 1);
size_t len;
- strcpy (str, to_parse);
-
- ext = strchr (str, '+');
+ ext = strchr (to_parse, '+');
if (ext != NULL)
- len = ext - str;
+ len = ext - to_parse;
else
- len = strlen (str);
+ len = strlen (to_parse);
if (len == 0)
return AARCH64_PARSE_MISSING_ARG;
/* Loop through the list of supported ARCHes to find a match. */
for (arch = all_architectures; arch->name != NULL; arch++)
{
- if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
+ if (strlen (arch->name) == len
+ && strncmp (arch->name, to_parse, len) == 0)
{
unsigned long isa_temp = arch->flags;
{
/* TO_PARSE string contains at least one extension. */
enum aarch64_parse_opt_result ext_res
- = aarch64_parse_extension (ext, &isa_temp);
+ = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
if (ext_res != AARCH64_PARSE_OK)
return ext_res;
/* Parse the TO_PARSE string and put the result tuning in RES and the
architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
describing the parse result. If there is an error parsing, RES and
- ISA_FLAGS are left unchanged. */
+ ISA_FLAGS are left unchanged.
+ When the TO_PARSE string contains an invalid extension,
+ a copy of the offending extension is stored in INVALID_EXTENSION. */
static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
- unsigned long *isa_flags)
+ unsigned long *isa_flags, std::string *invalid_extension)
{
- char *ext;
+ const char *ext;
const struct processor *cpu;
- char *str = (char *) alloca (strlen (to_parse) + 1);
size_t len;
- strcpy (str, to_parse);
-
- ext = strchr (str, '+');
+ ext = strchr (to_parse, '+');
if (ext != NULL)
- len = ext - str;
+ len = ext - to_parse;
else
- len = strlen (str);
+ len = strlen (to_parse);
if (len == 0)
return AARCH64_PARSE_MISSING_ARG;
/* Loop through the list of supported CPUs to find a match. */
for (cpu = all_cores; cpu->name != NULL; cpu++)
{
- if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
+ if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
{
unsigned long isa_temp = cpu->flags;
{
/* TO_PARSE string contains at least one extension. */
enum aarch64_parse_opt_result ext_res
- = aarch64_parse_extension (ext, &isa_temp);
+ = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
if (ext_res != AARCH64_PARSE_OK)
return ext_res;
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
const struct processor *cpu;
- char *str = (char *) alloca (strlen (to_parse) + 1);
-
- strcpy (str, to_parse);
/* Loop through the list of supported CPUs to find a match. */
for (cpu = all_cores; cpu->name != NULL; cpu++)
{
- if (strcmp (cpu->name, str) == 0)
+ if (strcmp (cpu->name, to_parse) == 0)
{
*res = cpu;
return AARCH64_PARSE_OK;
return flag->flag;
}
- error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
+ error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
return 0;
}
"tune=");
}
+/* Parse the sve_width -moverride tuning string in TUNE_STRING.
+ Accept the valid SVE vector widths allowed by
+ aarch64_sve_vector_bits_enum and use it to override sve_width
+ in TUNE. */
+
+static void
+aarch64_parse_sve_width_string (const char *tune_string,
+ struct tune_params *tune)
+{
+ int width = -1;
+
+ int n = sscanf (tune_string, "%d", &width);
+ if (n != 1)
+ {
+ error ("invalid format for sve_width");
+ return;
+ }
+ switch (width)
+ {
+ case SVE_128:
+ case SVE_256:
+ case SVE_512:
+ case SVE_1024:
+ case SVE_2048:
+ break;
+ default:
+ error ("invalid sve_width value: %d", width);
+ }
+ tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
+}
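+/* For example, -moverride=sve_width=512 reaches the function above with
+ TUNE_STRING == "512" and sets TUNE->sve_width to SVE_512. */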
+
/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
we understand. If it is, extract the option string and handoff to
the appropriate function. */
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
+ if (accepted_branch_protection_string)
+ {
+ opts->x_aarch64_branch_protection_string
+ = xstrdup (accepted_branch_protection_string);
+ }
+
/* PR 70044: We have to be careful about being called multiple times for the
same function. This means all changes should be repeatable. */
if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
opts->x_flag_strict_volatile_bitfields = 1;
+ if (aarch64_stack_protector_guard == SSP_GLOBAL
+ && opts->x_aarch64_stack_protector_guard_offset_str)
+ {
+ error ("incompatible options %<-mstack-protector-guard=global%> and "
+ "%<-mstack-protector-guard-offset=%s%>",
+ aarch64_stack_protector_guard_offset_str);
+ }
+
+ if (aarch64_stack_protector_guard == SSP_SYSREG
+ && !(opts->x_aarch64_stack_protector_guard_offset_str
+ && opts->x_aarch64_stack_protector_guard_reg_str))
+ {
+ error ("both %<-mstack-protector-guard-offset%> and "
+ "%<-mstack-protector-guard-reg%> must be used "
+ "with %<-mstack-protector-guard=sysreg%>");
+ }
+
+ if (opts->x_aarch64_stack_protector_guard_reg_str)
+ {
+ if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
+ error ("specify a system register with a small string length.");
+ }
+
+ if (opts->x_aarch64_stack_protector_guard_offset_str)
+ {
+ char *end;
+ const char *str = aarch64_stack_protector_guard_offset_str;
+ errno = 0;
+ long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
+ if (!*str || *end || errno)
+ error ("%qs is not a valid offset in %qs", str,
+ "-mstack-protector-guard-offset=");
+ aarch64_stack_protector_guard_offset = offs;
+ }
+
initialize_aarch64_code_model (opts);
initialize_aarch64_tls_size (opts);
opts->x_param_values,
global_options_set.x_param_values);
+ /* If the user hasn't changed it via configure then set the default to 64 KB
+ for the backend. */
+ maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
+ DEFAULT_STK_CLASH_GUARD_SIZE == 0
+ ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
+ opts->x_param_values,
+ global_options_set.x_param_values);
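+ /* The parameter value is the log2 of the guard size, so the default of
+ 16 selects a 2^16-byte (64 KiB) guard. */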
+
+ /* Validate the guard size. */
+ int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+
+ /* Enforce that the probing interval is the same as the guard size so the
+ mid-end does the right thing. */
+ maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
+ guard_size,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+ /* The maybe_set calls won't update the value if the user has explicitly
+ set one, so we need to validate that the probing interval and guard
+ size are equal. */
+ int probe_interval
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+ if (guard_size != probe_interval)
+ error ("stack clash guard size %<%d%> must be equal to probing interval "
+ "%<%d%>", guard_size, probe_interval);
+
/* Enable sw prefetching at specified optimization level for
CPUS that have prefetch. Lower optimization level threshold by 1
when profiling is enabled. */
&& opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
opts->x_flag_prefetch_loop_arrays = 1;
+ if (opts->x_aarch64_arch_string == NULL)
+ opts->x_aarch64_arch_string = selected_arch->name;
+ if (opts->x_aarch64_cpu_string == NULL)
+ opts->x_aarch64_cpu_string = selected_cpu->name;
+ if (opts->x_aarch64_tune_string == NULL)
+ opts->x_aarch64_tune_string = selected_tune->name;
+
aarch64_override_options_after_change_1 (opts);
}
aarch64_print_hint_for_core_or_arch (str, true);
}
+
+/* Print a hint with a suggestion for an extension name
+ that most closely resembles what the user passed in STR. */
+
+void
+aarch64_print_hint_for_extensions (const std::string &str)
+{
+ auto_vec<const char *> candidates;
+ aarch64_get_all_extension_candidates (&candidates);
+ char *s;
+ const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
+ if (hint)
+ inform (input_location, "valid arguments are: %s;"
+ " did you mean %qs?", s, hint);
+ else
+ inform (input_location, "valid arguments are: %s", s);
+
+ XDELETEVEC (s);
+}
+
/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
specified in STR and throw errors if appropriate. Put the results if
they are valid in RES and ISA_FLAGS. Return whether the option is
aarch64_validate_mcpu (const char *str, const struct processor **res,
unsigned long *isa_flags)
{
+ std::string invalid_extension;
enum aarch64_parse_opt_result parse_res
- = aarch64_parse_cpu (str, res, isa_flags);
+ = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
if (parse_res == AARCH64_PARSE_OK)
return true;
error ("missing cpu name in %<-mcpu=%s%>", str);
break;
case AARCH64_PARSE_INVALID_ARG:
- error ("unknown value %qs for -mcpu", str);
+ error ("unknown value %qs for %<-mcpu%>", str);
aarch64_print_hint_for_core (str);
break;
case AARCH64_PARSE_INVALID_FEATURE:
- error ("invalid feature modifier in %<-mcpu=%s%>", str);
+ error ("invalid feature modifier %qs in %<-mcpu=%s%>",
+ invalid_extension.c_str (), str);
+ aarch64_print_hint_for_extensions (invalid_extension);
break;
default:
gcc_unreachable ();
}
- return false;
+ return false;
+}
+
+/* Parse CONST_STR for branch protection features specified in
+ aarch64_branch_protect_types, and set any global variables required.
+ Return the parsing result and copy the last processed token into
+ LAST_STR so that it can be used for error reporting. */
+
+static enum aarch64_parse_opt_result
+aarch64_parse_branch_protection (const char *const_str, char **last_str)
+{
+ char *str_root = xstrdup (const_str);
+ char* token_save = NULL;
+ char *str = strtok_r (str_root, "+", &token_save);
+ enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
+ if (!str)
+ res = AARCH64_PARSE_MISSING_ARG;
+ else
+ {
+ char *next_str = strtok_r (NULL, "+", &token_save);
+ /* Reset the branch protection features to their defaults. */
+ aarch64_handle_no_branch_protection (NULL, NULL);
+
+ while (str && res == AARCH64_PARSE_OK)
+ {
+ const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
+ bool found = false;
+ /* Search for this type. */
+ while (type && type->name && !found && res == AARCH64_PARSE_OK)
+ {
+ if (strcmp (str, type->name) == 0)
+ {
+ found = true;
+ res = type->handler (str, next_str);
+ str = next_str;
+ next_str = strtok_r (NULL, "+", &token_save);
+ }
+ else
+ type++;
+ }
+ if (found && res == AARCH64_PARSE_OK)
+ {
+ bool found_subtype = true;
+ /* Loop through each token until we find one that isn't a
+ subtype. */
+ while (found_subtype)
+ {
+ found_subtype = false;
+ const aarch64_branch_protect_type *subtype = type->subtypes;
+ /* Search for the subtype. */
+ while (str && subtype && subtype->name && !found_subtype
+ && res == AARCH64_PARSE_OK)
+ {
+ if (strcmp (str, subtype->name) == 0)
+ {
+ found_subtype = true;
+ res = subtype->handler (str, next_str);
+ str = next_str;
+ next_str = strtok_r (NULL, "+", &token_save);
+ }
+ else
+ subtype++;
+ }
+ }
+ }
+ else if (!found)
+ res = AARCH64_PARSE_INVALID_ARG;
+ }
+ }
+ /* Copy the last processed token into the argument to pass it back.
+ Used by option and attribute validation to print the offending token. */
+ if (last_str)
+ {
+ if (str)
+ strcpy (*last_str, str);
+ else
+ *last_str = NULL;
+ }
+ if (res == AARCH64_PARSE_OK)
+ {
+ /* If needed, allocate the accepted string then copy in const_str.
+ Used by aarch64_override_options_after_change_1. */
+ if (!accepted_branch_protection_string)
+ accepted_branch_protection_string
+ = (char *) xmalloc (BRANCH_PROTECT_STR_MAX + 1);
+ strncpy (accepted_branch_protection_string, const_str,
+ BRANCH_PROTECT_STR_MAX + 1);
+ /* Forcibly null-terminate. */
+ accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
+ }
+ return res;
+}
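+/* For example, parsing "pac-ret+leaf+bti" above matches the "pac-ret"
+ type, consumes its "leaf" subtype and then matches "bti" as another
+ top-level type, while an unknown token such as "pac-ret+foo" yields
+ AARCH64_PARSE_INVALID_ARG with the offending token copied into
+ *LAST_STR. The type names assumed here are those documented for
+ -mbranch-protection. */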
+
+static bool
+aarch64_validate_mbranch_protection (const char *const_str)
+{
+ char *str = (char *) xmalloc (strlen (const_str) + 1);
+ enum aarch64_parse_opt_result res =
+ aarch64_parse_branch_protection (const_str, &str);
+ if (res == AARCH64_PARSE_INVALID_ARG)
+ error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
+ else if (res == AARCH64_PARSE_MISSING_ARG)
+ error ("missing arg for %<-mbranch-protection=%>");
+ free (str);
+ return res == AARCH64_PARSE_OK;
}
/* Validate a command-line -march option. Parse the arch and extensions
aarch64_validate_march (const char *str, const struct processor **res,
unsigned long *isa_flags)
{
+ std::string invalid_extension;
enum aarch64_parse_opt_result parse_res
- = aarch64_parse_arch (str, res, isa_flags);
+ = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
if (parse_res == AARCH64_PARSE_OK)
return true;
error ("missing arch name in %<-march=%s%>", str);
break;
case AARCH64_PARSE_INVALID_ARG:
- error ("unknown value %qs for -march", str);
+ error ("unknown value %qs for %<-march%>", str);
aarch64_print_hint_for_arch (str);
break;
case AARCH64_PARSE_INVALID_FEATURE:
- error ("invalid feature modifier in %<-march=%s%>", str);
+ error ("invalid feature modifier %qs in %<-march=%s%>",
+ invalid_extension.c_str (), str);
+ aarch64_print_hint_for_extensions (invalid_extension);
break;
default:
gcc_unreachable ();
error ("missing cpu name in %<-mtune=%s%>", str);
break;
case AARCH64_PARSE_INVALID_ARG:
- error ("unknown value %qs for -mtune", str);
+ error ("unknown value %qs for %<-mtune%>", str);
aarch64_print_hint_for_core (str);
break;
default:
selected_arch = NULL;
selected_tune = NULL;
+ if (aarch64_branch_protection_string)
+ aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
+
/* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
If either of -march or -mtune is given, they override their
respective component of -mcpu. */
if (aarch64_tune_string)
valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
+#ifdef SUBTARGET_OVERRIDE_OPTIONS
+ SUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
/* If the user did not specify a processor, choose the default
one for them. This will be the CPU set during configuration using
--with-cpu, otherwise it is "generic". */
{
if (selected_arch->arch != selected_cpu->arch)
{
- warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
+ warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
all_architectures[selected_cpu->arch].name,
selected_arch->name);
}
if (!selected_tune)
selected_tune = selected_cpu;
+ if (aarch64_enable_bti == 2)
+ {
+#ifdef TARGET_ENABLE_BTI
+ aarch64_enable_bti = 1;
+#else
+ aarch64_enable_bti = 0;
+#endif
+ }
+
+ /* Return address signing is currently not supported for ILP32 targets. For
+ LP64 targets use the configured option in the absence of a command-line
+ option for -mbranch-protection. */
+ if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
+ {
+#ifdef TARGET_ENABLE_PAC_RET
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
+#else
+ aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
+#endif
+ }
+
#ifndef HAVE_AS_MABI_OPTION
/* The compiler may have been configured with 2.23.* binutils, which does
not have support for ILP32. */
if (TARGET_ILP32)
- error ("assembler does not support -mabi=ilp32");
+ error ("assembler does not support %<-mabi=ilp32%>");
#endif
/* Convert -msve-vector-bits to a VG count. */
aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
- sorry ("return address signing is only supported for -mabi=lp64");
+ sorry ("return address signing is only supported for %<-mabi=lp64%>");
/* Make sure we properly set up the explicit options. */
if ((aarch64_cpu_string && valid_cpu)
|| (aarch64_arch_string && valid_arch))
gcc_assert (explicit_arch != aarch64_no_arch);
+ /* The pass to insert speculation tracking runs before
+ shrink-wrapping and the latter does not know how to update the
+ tracking status. So disable it in this case. */
+ if (aarch64_track_speculation)
+ flag_shrink_wrap = 0;
+
aarch64_override_options_internal (&global_options);
/* Save these options as the default ones in case we push and pop them later
#endif
break;
case AARCH64_CMODEL_LARGE:
- sorry ("code model %qs with -f%s", "large",
+ sorry ("code model %qs with %<-f%s%>", "large",
opts->x_flag_pic > 1 ? "PIC" : "pic");
break;
default:
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
+ ptr->x_aarch64_branch_protection_string
+ = opts->x_aarch64_branch_protection_string;
}
/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
opts->x_explicit_arch = ptr->x_explicit_arch;
selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
+ opts->x_aarch64_branch_protection_string
+ = ptr->x_aarch64_branch_protection_string;
+ if (opts->x_aarch64_branch_protection_string)
+ {
+ aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
+ NULL);
+ }
aarch64_override_options_internal (opts);
}
aarch64_handle_attr_arch (const char *str)
{
const struct processor *tmp_arch = NULL;
+ std::string invalid_extension;
enum aarch64_parse_opt_result parse_res
- = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
+ = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
if (parse_res == AARCH64_PARSE_OK)
{
aarch64_print_hint_for_arch (str);
break;
case AARCH64_PARSE_INVALID_FEATURE:
- error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
+ error ("invalid feature modifier %s of value (\"%s\") in "
+ "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
+ aarch64_print_hint_for_extensions (invalid_extension);
break;
default:
gcc_unreachable ();
aarch64_handle_attr_cpu (const char *str)
{
const struct processor *tmp_cpu = NULL;
+ std::string invalid_extension;
enum aarch64_parse_opt_result parse_res
- = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
+ = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
if (parse_res == AARCH64_PARSE_OK)
{
aarch64_print_hint_for_core (str);
break;
case AARCH64_PARSE_INVALID_FEATURE:
- error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
+ error ("invalid feature modifier %s of value (\"%s\") in "
+ "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
+ aarch64_print_hint_for_extensions (invalid_extension);
break;
default:
gcc_unreachable ();
return false;
}
+/* Handle the argument STR to the branch-protection= attribute. */
+
+static bool
+aarch64_handle_attr_branch_protection (const char *str)
+{
+ char *err_str = (char *) xmalloc (strlen (str) + 1);
+ enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
+ &err_str);
+ bool success = false;
+ switch (res)
+ {
+ case AARCH64_PARSE_MISSING_ARG:
+ error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
+ " attribute");
+ break;
+ case AARCH64_PARSE_INVALID_ARG:
+ error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
+ "=\")%> pragma or attribute", err_str);
+ break;
+ case AARCH64_PARSE_OK:
+ success = true;
+ /* Fall through. */
+ case AARCH64_PARSE_INVALID_FEATURE:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ free (err_str);
+ return success;
+}
+
/* Handle the argument STR to the tune= target attribute. */
static bool
str += 8;
}
- parse_res = aarch64_parse_extension (str, &isa_flags);
+ std::string invalid_extension;
+ parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
if (parse_res == AARCH64_PARSE_OK)
{
break;
case AARCH64_PARSE_INVALID_FEATURE:
- error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
+ error ("invalid feature modifier %s of value (\"%s\") in "
+ "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
break;
default:
{ "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
{ "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
OPT_mtune_ },
+ { "branch-protection", aarch64_attr_custom, false,
+ aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
{ "sign-return-address", aarch64_attr_enum, false, NULL,
OPT_msign_return_address_ },
{ NULL, aarch64_attr_custom, false, NULL, OPT____ }
unsigned int num_commas = num_occurences_in_str (',', str_to_check);
/* Handle multiple target attributes separated by ','. */
- char *token = strtok (str_to_check, ",");
+ char *token = strtok_r (str_to_check, ",", &str_to_check);
unsigned int num_attrs = 0;
while (token)
return false;
}
- token = strtok (NULL, ",");
+ token = strtok_r (NULL, ",", &str_to_check);
}
if (num_attrs != num_commas + 1)
stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
f_stack, NULL_TREE);
size = int_size_in_bytes (type);
- align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
+
+ bool abi_break;
+ align
+ = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
dw_align = false;
adjust = 0;
nregs = rsize / UNITS_PER_WORD;
if (align > 8)
- dw_align = true;
+ {
+ if (abi_break && warn_psabi)
+ inform (input_location, "parameter passing for argument of type "
+ "%qT changed in GCC 9.1", type);
+ dw_align = true;
+ }
if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
&& size < UNITS_PER_WORD)
fixed_regs[i] = 1;
call_used_regs[i] = 1;
}
+
+ /* When tracking speculation, we need a couple of call-clobbered registers
+ to track the speculation state. It would be nice to just use
+ IP0 and IP1, but currently there are numerous places that just
+ assume these registers are free for other uses (e.g. pointer
+ authentication). */
+ if (aarch64_track_speculation)
+ {
+ fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
+ call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
+ fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
+ call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
+ }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
aarch64_mangle_type (const_tree type)
{
/* The AArch64 ABI documents say that "__va_list" has to be
- managled as if it is in the "std" namespace. */
+ mangled as if it is in the "std" namespace. */
if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
return "St9__va_list";
be set for non-predicate vectors of booleans. Modes are the most
direct way we have of identifying real SVE predicate types. */
return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
- HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
- return MIN (align, 128);
+ return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
}
/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
-static HOST_WIDE_INT
+static poly_uint64
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
/* For fixed-length vectors, check that the vectorizer will aim for
full-vector alignment. This isn't true for generic GCC vectors
that are wider than the ABI maximum of 128 bits. */
+ poly_uint64 preferred_alignment =
+ aarch64_vectorize_preferred_vector_alignment (type);
if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
- && (wi::to_widest (TYPE_SIZE (type))
- != aarch64_vectorize_preferred_vector_alignment (type)))
+ && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
+ preferred_alignment))
return false;
/* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
/* Generate code to load VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, efficiently into a
register. Returns an RTX to copy into the register, or NULL_RTX
- for a PARALLEL that can not be converted into a CONST_VECTOR. */
+ for a PARALLEL that cannot be converted into a CONST_VECTOR. */
static rtx
aarch64_simd_make_constant (rtx vals)
{
/* Loaded using DUP. */
return const_dup;
else if (const_vec != NULL_RTX)
- /* Load from constant pool. We can not take advantage of single-cycle
+ /* Load from constant pool. We cannot take advantage of single-cycle
LD1 because we need a PC-relative addressing mode. */
return const_vec;
else
/* A PARALLEL containing something not valid inside CONST_VECTOR.
- We can not construct an initializer. */
+ We cannot construct an initializer. */
return NULL_RTX;
}
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
rtx mem, rtx model_rtx)
{
- rtx (*gen) (rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
- case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
- case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
- case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (rval, mem, model_rtx));
+ emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}
/* Emit store exclusive. */
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
rtx rval, rtx mem, rtx model_rtx)
{
- rtx (*gen) (rtx, rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
- case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
- case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
- case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (bval, rval, mem, model_rtx));
+ emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
}
/* Mark the previous jump instruction as unlikely. */
void
aarch64_expand_compare_and_swap (rtx operands[])
{
- rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
- machine_mode mode, cmp_mode;
- typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
- int idx;
- gen_cas_fn gen;
- const gen_cas_fn split_cas[] =
- {
- gen_aarch64_compare_and_swapqi,
- gen_aarch64_compare_and_swaphi,
- gen_aarch64_compare_and_swapsi,
- gen_aarch64_compare_and_swapdi
- };
- const gen_cas_fn atomic_cas[] =
- {
- gen_aarch64_compare_and_swapqi_lse,
- gen_aarch64_compare_and_swaphi_lse,
- gen_aarch64_compare_and_swapsi_lse,
- gen_aarch64_compare_and_swapdi_lse
- };
+ rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
+ machine_mode mode, r_mode;
bval = operands[0];
rval = operands[1];
mod_s = operands[6];
mod_f = operands[7];
mode = GET_MODE (mem);
- cmp_mode = mode;
/* Normally the succ memory model must be stronger than fail, but in the
unlikely event of fail being ACQUIRE and succ being RELEASE we need to
promote succ to ACQ_REL so that we don't lose the acquire semantics. */
-
if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
&& is_mm_release (memmodel_from_int (INTVAL (mod_s))))
mod_s = GEN_INT (MEMMODEL_ACQ_REL);
- switch (mode)
+ r_mode = mode;
+ if (mode == QImode || mode == HImode)
{
- case E_QImode:
- case E_HImode:
- /* For short modes, we're going to perform the comparison in SImode,
- so do the zero-extension now. */
- cmp_mode = SImode;
- rval = gen_reg_rtx (SImode);
- oldval = convert_modes (SImode, mode, oldval, true);
- /* Fall through. */
-
- case E_SImode:
- case E_DImode:
- /* Force the value into a register if needed. */
- if (!aarch64_plus_operand (oldval, mode))
- oldval = force_reg (cmp_mode, oldval);
- break;
-
- default:
- gcc_unreachable ();
+ r_mode = SImode;
+ rval = gen_reg_rtx (r_mode);
}
- switch (mode)
+ if (TARGET_LSE)
{
- case E_QImode: idx = 0; break;
- case E_HImode: idx = 1; break;
- case E_SImode: idx = 2; break;
- case E_DImode: idx = 3; break;
- default:
- gcc_unreachable ();
+ /* The CAS insn requires oldval and rval overlap, but we need to
+ have a copy of oldval saved across the operation to tell if
+ the operation is successful. */
+ if (reg_overlap_mentioned_p (rval, oldval))
+ rval = copy_to_mode_reg (r_mode, oldval);
+ else
+ emit_move_insn (rval, gen_lowpart (r_mode, oldval));
+
+ emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
+ newval, mod_s));
+ cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
}
- if (TARGET_LSE)
- gen = atomic_cas[idx];
else
- gen = split_cas[idx];
+ {
+ /* The oldval predicate varies by mode. Test it and force to reg. */
+ insn_code code = code_for_aarch64_compare_and_swap (mode);
+ if (!insn_data[code].operand[2].predicate (oldval, mode))
+ oldval = force_reg (mode, oldval);
- emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+ emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
+ is_weak, mod_s, mod_f));
+ cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+ }
- if (mode == QImode || mode == HImode)
- emit_move_insn (operands[1], gen_lowpart (mode, rval));
+ if (r_mode != mode)
+ rval = gen_lowpart (mode, rval);
+ emit_move_insn (operands[1], rval);
- x = gen_rtx_REG (CCmode, CC_REGNUM);
- x = gen_rtx_EQ (SImode, x, const0_rtx);
+ x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
emit_insn (gen_rtx_SET (bval, x));
}
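+/* As a rough sketch (the exact register choices here are an assumption),
+ a 32-bit strong compare-and-swap expands with TARGET_LSE to:
+	mov	w0, w_oldval
+	cas	w0, w_newval, [x_mem]
+	cmp	w0, w_oldval
+	cset	w_bval, eq
+ while without LSE the aarch64_compare_and_swap pattern is emitted and
+ later split into an ldxr/stxr retry loop. */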
-/* Test whether the target supports using a atomic load-operate instruction.
- CODE is the operation and AFTER is TRUE if the data in memory after the
- operation should be returned and FALSE if the data before the operation
- should be returned. Returns FALSE if the operation isn't supported by the
- architecture. */
-
-bool
-aarch64_atomic_ldop_supported_p (enum rtx_code code)
-{
- if (!TARGET_LSE)
- return false;
-
- switch (code)
- {
- case SET:
- case AND:
- case IOR:
- case XOR:
- case MINUS:
- case PLUS:
- return true;
- default:
- return false;
- }
-}
-
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
sequence implementing an atomic operation. */
}
}
-/* Emit an atomic compare-and-swap operation. RVAL is the destination register
- for the data in memory. EXPECTED is the value expected to be in memory.
- DESIRED is the value to store to memory. MEM is the memory location. MODEL
- is the memory ordering to use. */
-
-void
-aarch64_gen_atomic_cas (rtx rval, rtx mem,
- rtx expected, rtx desired,
- rtx model)
-{
- rtx (*gen) (rtx, rtx, rtx, rtx);
- machine_mode mode;
-
- mode = GET_MODE (mem);
-
- switch (mode)
- {
- case E_QImode: gen = gen_aarch64_atomic_casqi; break;
- case E_HImode: gen = gen_aarch64_atomic_cashi; break;
- case E_SImode: gen = gen_aarch64_atomic_cassi; break;
- case E_DImode: gen = gen_aarch64_atomic_casdi; break;
- default:
- gcc_unreachable ();
- }
-
- /* Move the expected value into the CAS destination register. */
- emit_insn (gen_rtx_SET (rval, expected));
-
- /* Emit the CAS. */
- emit_insn (gen (rval, mem, desired, model));
-
- /* Compare the expected value with the value loaded by the CAS, to establish
- whether the swap was made. */
- aarch64_gen_compare_reg (EQ, rval, expected);
-}
-
/* Split a compare and swap pattern. */
void
if (strong_zero_p)
{
- x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
+ if (aarch64_track_speculation)
+ {
+ /* Emit an explicit compare instruction, so that we can correctly
+ track the condition codes. */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+ }
+ else
+ x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
+
x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
else
{
- cond = aarch64_gen_compare_reg (NE, rval, oldval);
+ cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
if (!is_weak)
{
- x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
- gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
- aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
- }
- else
- {
- cond = gen_rtx_REG (CCmode, CC_REGNUM);
- x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
- emit_insn (gen_rtx_SET (cond, x));
- }
-
- emit_label (label2);
- /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
- to set the condition flags. If this is not used it will be removed by
- later passes. */
- if (strong_zero_p)
- {
- cond = gen_rtx_REG (CCmode, CC_REGNUM);
- x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
- emit_insn (gen_rtx_SET (cond, x));
- }
- /* Emit any final barrier needed for a __sync operation. */
- if (is_mm_sync (model))
- aarch64_emit_post_barrier (model);
-}
-
-/* Emit a BIC instruction. */
-
-static void
-aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
-{
- rtx shift_rtx = GEN_INT (shift);
- rtx (*gen) (rtx, rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
- case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (dst, s2, shift_rtx, s1));
-}
-
-/* Emit an atomic swap. */
-
-static void
-aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
- rtx mem, rtx model)
-{
- rtx (*gen) (rtx, rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
- case E_HImode: gen = gen_aarch64_atomic_swphi; break;
- case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
- case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (dst, mem, value, model));
-}
-
-/* Operations supported by aarch64_emit_atomic_load_op. */
-
-enum aarch64_atomic_load_op_code
-{
- AARCH64_LDOP_PLUS, /* A + B */
- AARCH64_LDOP_XOR, /* A ^ B */
- AARCH64_LDOP_OR, /* A | B */
- AARCH64_LDOP_BIC /* A & ~B */
-};
-
-/* Emit an atomic load-operate. */
-
-static void
-aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
- machine_mode mode, rtx dst, rtx src,
- rtx mem, rtx model)
-{
- typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
- const aarch64_atomic_load_op_fn plus[] =
- {
- gen_aarch64_atomic_loadaddqi,
- gen_aarch64_atomic_loadaddhi,
- gen_aarch64_atomic_loadaddsi,
- gen_aarch64_atomic_loadadddi
- };
- const aarch64_atomic_load_op_fn eor[] =
- {
- gen_aarch64_atomic_loadeorqi,
- gen_aarch64_atomic_loadeorhi,
- gen_aarch64_atomic_loadeorsi,
- gen_aarch64_atomic_loadeordi
- };
- const aarch64_atomic_load_op_fn ior[] =
- {
- gen_aarch64_atomic_loadsetqi,
- gen_aarch64_atomic_loadsethi,
- gen_aarch64_atomic_loadsetsi,
- gen_aarch64_atomic_loadsetdi
- };
- const aarch64_atomic_load_op_fn bic[] =
- {
- gen_aarch64_atomic_loadclrqi,
- gen_aarch64_atomic_loadclrhi,
- gen_aarch64_atomic_loadclrsi,
- gen_aarch64_atomic_loadclrdi
- };
- aarch64_atomic_load_op_fn gen;
- int idx = 0;
-
- switch (mode)
- {
- case E_QImode: idx = 0; break;
- case E_HImode: idx = 1; break;
- case E_SImode: idx = 2; break;
- case E_DImode: idx = 3; break;
- default:
- gcc_unreachable ();
- }
-
- switch (code)
- {
- case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
- case AARCH64_LDOP_XOR: gen = eor[idx]; break;
- case AARCH64_LDOP_OR: gen = ior[idx]; break;
- case AARCH64_LDOP_BIC: gen = bic[idx]; break;
- default:
- gcc_unreachable ();
- }
-
- emit_insn (gen (dst, mem, src, model));
-}
-
-/* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
- location to store the data read from memory. OUT_RESULT is the location to
- store the result of the operation. MEM is the memory location to read and
- modify. MODEL_RTX is the memory ordering to use. VALUE is the second
- operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
- be NULL. */
-
-void
-aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
- rtx mem, rtx value, rtx model_rtx)
-{
- machine_mode mode = GET_MODE (mem);
- machine_mode wmode = (mode == DImode ? DImode : SImode);
- const bool short_mode = (mode < SImode);
- aarch64_atomic_load_op_code ldop_code;
- rtx src;
- rtx x;
-
- if (out_data)
- out_data = gen_lowpart (mode, out_data);
-
- if (out_result)
- out_result = gen_lowpart (mode, out_result);
-
- /* Make sure the value is in a register, putting it into a destination
- register if it needs to be manipulated. */
- if (!register_operand (value, mode)
- || code == AND || code == MINUS)
- {
- src = out_result ? out_result : out_data;
- emit_move_insn (src, gen_lowpart (mode, value));
- }
- else
- src = value;
- gcc_assert (register_operand (src, mode));
-
- /* Preprocess the data for the operation as necessary. If the operation is
- a SET then emit a swap instruction and finish. */
- switch (code)
- {
- case SET:
- aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
- return;
-
- case MINUS:
- /* Negate the value and treat it as a PLUS. */
- {
- rtx neg_src;
-
- /* Resize the value if necessary. */
- if (short_mode)
- src = gen_lowpart (wmode, src);
-
- neg_src = gen_rtx_NEG (wmode, src);
- emit_insn (gen_rtx_SET (src, neg_src));
-
- if (short_mode)
- src = gen_lowpart (mode, src);
- }
- /* Fall-through. */
- case PLUS:
- ldop_code = AARCH64_LDOP_PLUS;
- break;
-
- case IOR:
- ldop_code = AARCH64_LDOP_OR;
- break;
-
- case XOR:
- ldop_code = AARCH64_LDOP_XOR;
- break;
-
- case AND:
- {
- rtx not_src;
-
- /* Resize the value if necessary. */
- if (short_mode)
- src = gen_lowpart (wmode, src);
-
- not_src = gen_rtx_NOT (wmode, src);
- emit_insn (gen_rtx_SET (src, not_src));
-
- if (short_mode)
- src = gen_lowpart (mode, src);
- }
- ldop_code = AARCH64_LDOP_BIC;
- break;
-
- default:
- /* The operation can't be done with atomic instructions. */
- gcc_unreachable ();
- }
-
- aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
-
- /* If necessary, calculate the data in memory after the update by redoing the
- operation from values in registers. */
- if (!out_result)
- return;
-
- if (short_mode)
- {
- src = gen_lowpart (wmode, src);
- out_data = gen_lowpart (wmode, out_data);
- out_result = gen_lowpart (wmode, out_result);
- }
-
- x = NULL_RTX;
-
- switch (code)
- {
- case MINUS:
- case PLUS:
- x = gen_rtx_PLUS (wmode, out_data, src);
- break;
- case IOR:
- x = gen_rtx_IOR (wmode, out_data, src);
- break;
- case XOR:
- x = gen_rtx_XOR (wmode, out_data, src);
- break;
- case AND:
- aarch64_emit_bic (wmode, out_result, out_data, src, 0);
- return;
- default:
- gcc_unreachable ();
- }
-
- emit_set_insn (out_result, x);
-
- return;
-}
+ if (aarch64_track_speculation)
+ {
+ /* Emit an explicit compare instruction, so that we can correctly
+ track the condition codes. */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+ }
+ else
+ x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+ }
+ else
+ {
+ cond = gen_rtx_REG (CCmode, CC_REGNUM);
+ x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
+ emit_insn (gen_rtx_SET (cond, x));
+ }
+ emit_label (label2);
+ /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
+ to set the condition flags. If this is not used it will be removed by
+ later passes. */
+ if (strong_zero_p)
+ {
+ cond = gen_rtx_REG (CCmode, CC_REGNUM);
+ x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
+ emit_insn (gen_rtx_SET (cond, x));
+ }
+ /* Emit any final barrier needed for a __sync operation. */
+ if (is_mm_sync (model))
+ aarch64_emit_post_barrier (model);
}
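
/* An illustrative sketch, not part of the patch: the kind of user-level
   atomics the (removed) load-operate expanders above map onto LSE
   instructions.  AND has no direct load-operate encoding, so the value
   is complemented and LDCLR (the BIC form) is used, just as MINUS is
   negated and treated as a PLUS.  */
#include <stdatomic.h>

unsigned long
lse_fetch_ops (_Atomic unsigned long *p, unsigned long m)
{
  unsigned long a = atomic_fetch_add (p, m);  /* LDADD        */
  unsigned long o = atomic_fetch_or (p, m);   /* LDSET        */
  unsigned long x = atomic_fetch_xor (p, m);  /* LDEOR        */
  unsigned long n = atomic_fetch_and (p, m);  /* MVN + LDCLR  */
  unsigned long s = atomic_fetch_sub (p, m);  /* NEG + LDADD  */
  return a + o + x + n + s;
}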
/* Split an atomic operation. */
aarch64_emit_store_exclusive (mode, cond, mem,
gen_lowpart (mode, new_out), model_rtx);
- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ if (aarch64_track_speculation)
+ {
+ /* Emit an explicit compare instruction, so that we can correctly
+ track the condition codes. */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+ }
+ else
+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+
x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
if (!CONST_DOUBLE_P (x))
return false;
- /* We don't support HFmode constants yet. */
- if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
+ if (GET_MODE (x) == VOIDmode
+ || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
return false;
r = *CONST_DOUBLE_REAL_VALUE (x);
machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
- aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
+ if (d->one_vector_p)
+ emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+ else
+ aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
return true;
}
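
/* An illustrative sketch, not part of the patch: a single-input permute
   of the kind the one_vector_p path above handles with a single TBL,
   written here with GCC's generic vector extensions (the Advanced SIMD
   analogue of the SVE case).  */
typedef unsigned char v16qi __attribute__ ((vector_size (16)));

v16qi
byte_reverse (v16qi x)
{
  const v16qi sel = { 15, 14, 13, 12, 11, 10, 9, 8,
                      7, 6, 5, 4, 3, 2, 1, 0 };
  return __builtin_shuffle (x, sel);  /* tbl v0.16b, {v0.16b}, v1.16b */
}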
return true;
if (d->vec_flags == VEC_SVE_DATA)
return aarch64_evpc_sve_tbl (d);
- else if (d->vec_flags == VEC_SVE_DATA)
+ else if (d->vec_flags == VEC_ADVSIMD)
return aarch64_evpc_tbl (d);
}
return false;
struct expand_vec_perm_d d;
/* Check whether the mask can be applied to a single vector. */
- if (op0 && rtx_equal_p (op0, op1))
+ if (sel.ninputs () == 1
+ || (op0 && rtx_equal_p (op0, op1)))
d.one_vector_p = true;
else if (sel.all_from_input_p (0))
{
/* Convert n to bits to make the rest of the code simpler. */
n = n * BITS_PER_UNIT;
+ /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
+ larger than TImode, but we should not use them for loads/stores here. */
+ const int copy_limit = GET_MODE_BITSIZE (TImode);
+
while (n > 0)
{
/* Find the largest mode in which to do the copy without over reading
or writing. */
opt_scalar_int_mode mode_iter;
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
- if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
+ if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
cur_mode = mode_iter.require ();
gcc_assert (cur_mode != BLKmode);
cheaper, i.e. fewer instructions. For instance, for a 15
byte copy it's more efficient to do two overlapping 8 byte copies than
separate copies of 8 + 4 + 2 + 1 bytes. */
- next_mode = smallest_mode_for_size (n, MODE_INT);
- int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
- if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
+ if (n > 0 && n <= 8 * BITS_PER_UNIT)
{
+ next_mode = smallest_mode_for_size (n, MODE_INT);
+ int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
n = n_bits;
return true;
}
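
/* An illustrative sketch, not part of the patch: the overlapping-copy
   strategy described above, spelled out at the source level.  A 15-byte
   copy becomes two 8-byte accesses sharing one overlapping byte instead
   of a chain of progressively smaller copies.  */
#include <string.h>

static void
copy15 (char *dst, const char *src)
{
  memcpy (dst, src, 8);          /* bytes 0..7              */
  memcpy (dst + 7, src + 7, 8);  /* bytes 7..14, one overlaps */
}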
+/* Generate RTL for a conditional branch with rtx comparison CODE in
+ mode CC_MODE. The destination of the unlikely conditional branch
+ is LABEL_REF. */
+
+void
+aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
+ rtx label_ref)
+{
+ rtx x;
+ x = gen_rtx_fmt_ee (code, VOIDmode,
+ gen_rtx_REG (cc_mode, CC_REGNUM),
+ const0_rtx);
+
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (VOIDmode, label_ref),
+ pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
+/* Generate DImode scratch registers for 128-bit (TImode) addition.
+
+ OP1 represents the TImode input operand 1
+ OP2 represents the TImode input operand 2
+ LOW_DEST represents the low half (DImode) of TImode operand 0
+ LOW_IN1 represents the low half (DImode) of TImode operand 1
+ LOW_IN2 represents the low half (DImode) of TImode operand 2
+ HIGH_DEST represents the high half (DImode) of TImode operand 0
+ HIGH_IN1 represents the high half (DImode) of TImode operand 1
+ HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
+
+void
+aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+ rtx *low_in1, rtx *low_in2,
+ rtx *high_dest, rtx *high_in1,
+ rtx *high_in2)
+{
+ *low_dest = gen_reg_rtx (DImode);
+ *low_in1 = gen_lowpart (DImode, op1);
+ *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+ subreg_lowpart_offset (DImode, TImode));
+ *high_dest = gen_reg_rtx (DImode);
+ *high_in1 = gen_highpart (DImode, op1);
+ *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+ subreg_highpart_offset (DImode, TImode));
+}
+
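/* An illustrative sketch, not part of the patch: the source-level
   operation these scratch registers are split out for.  The TImode
   addition becomes an ADDS on the low halves followed by an ADC on
   the high halves.  */
unsigned __int128
add128 (unsigned __int128 a, unsigned __int128 b)
{
  return a + b;  /* adds x0, x0, x2 ; adc x1, x1, x3 */
}
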
+/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
+
+ This function differs from 'aarch64_addti_scratch_regs' in that
+ OP1 can be an immediate constant (zero). We must call
+ subreg_highpart_offset with DImode and TImode arguments, otherwise
+ VOIDmode will be used for the const_int, which triggers an internal
+ error in subreg_size_highpart_offset, as it does not expect a size
+ of zero.
+
+ OP1 represents the TImode input operand 1
+ OP2 represents the TImode input operand 2
+ LOW_DEST represents the low half (DImode) of TImode operand 0
+ LOW_IN1 represents the low half (DImode) of TImode operand 1
+ LOW_IN2 represents the low half (DImode) of TImode operand 2
+ HIGH_DEST represents the high half (DImode) of TImode operand 0
+ HIGH_IN1 represents the high half (DImode) of TImode operand 1
+ HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
+
+void
+aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+ rtx *low_in1, rtx *low_in2,
+ rtx *high_dest, rtx *high_in1,
+ rtx *high_in2)
+{
+ *low_dest = gen_reg_rtx (DImode);
+ *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
+ subreg_lowpart_offset (DImode, TImode));
+
+ *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+ subreg_lowpart_offset (DImode, TImode));
+ *high_dest = gen_reg_rtx (DImode);
+
+ *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
+ subreg_highpart_offset (DImode, TImode));
+ *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+ subreg_highpart_offset (DImode, TImode));
+}
+
+/* Generate RTL for 128-bit (TImode) subtraction with overflow.
+
+ OP0 represents the TImode destination operand 0
+ LOW_DEST represents the low half (DImode) of TImode operand 0
+ LOW_IN1 represents the low half (DImode) of TImode operand 1
+ LOW_IN2 represents the low half (DImode) of TImode operand 2
+ HIGH_DEST represents the high half (DImode) of TImode operand 0
+ HIGH_IN1 represents the high half (DImode) of TImode operand 1
+ HIGH_IN2 represents the high half (DImode) of TImode operand 2
+ UNSIGNED_P is true if the operation is being performed on unsigned
+ values. */
+void
+aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+ rtx low_in2, rtx high_dest, rtx high_in1,
+ rtx high_in2, bool unsigned_p)
+{
+ if (low_in2 == const0_rtx)
+ {
+ low_dest = low_in1;
+ high_in2 = force_reg (DImode, high_in2);
+ if (unsigned_p)
+ emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
+ else
+ emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
+ }
+ else
+ {
+ if (CONST_INT_P (low_in2))
+ {
+ high_in2 = force_reg (DImode, high_in2);
+ emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
+ GEN_INT (-INTVAL (low_in2))));
+ }
+ else
+ emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
+
+ if (unsigned_p)
+ emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
+ else
+ emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
+ }
+
+ emit_move_insn (gen_lowpart (DImode, op0), low_dest);
+ emit_move_insn (gen_highpart (DImode, op0), high_dest);
+}
+
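/* An illustrative sketch, not part of the patch: a checked 128-bit
   subtraction of the kind that reaches aarch64_expand_subvti.  SUBS and
   SBCS compute the double-word difference while the flags carry the
   borrow, and the overflow test falls out of the final flags.  */
_Bool
sub128_ovf (unsigned __int128 a, unsigned __int128 b,
            unsigned __int128 *res)
{
  return __builtin_sub_overflow (a, b, res);
}
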
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
static unsigned HOST_WIDE_INT
if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
&& any_condjump_p (curr))
{
- enum attr_type prev_type = get_attr_type (prev);
-
unsigned int condreg1, condreg2;
rtx cc_reg_1;
aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
&& prev
&& modified_in_p (cc_reg_1, prev))
{
+ enum attr_type prev_type = get_attr_type (prev);
+
/* FIXME: this misses some instructions which ThunderX considers simple
arithmetic. Simple shifts are missed here. */
if (prev_type == TYPE_ALUS_SREG
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
scalar_mode mode)
{
- enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
- HOST_WIDE_INT offvals[4], msize;
- rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
- rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
+ const int num_insns = 4;
+ enum reg_class rclass;
+ HOST_WIDE_INT offvals[num_insns], msize;
+ rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
if (load)
{
- reg_1 = operands[0];
- mem_1 = operands[1];
- reg_2 = operands[2];
- mem_2 = operands[3];
- reg_3 = operands[4];
- mem_3 = operands[5];
- reg_4 = operands[6];
- mem_4 = operands[7];
- gcc_assert (REG_P (reg_1) && REG_P (reg_2)
- && REG_P (reg_3) && REG_P (reg_4));
+ for (int i = 0; i < num_insns; i++)
+ {
+ reg[i] = operands[2 * i];
+ mem[i] = operands[2 * i + 1];
+
+ gcc_assert (REG_P (reg[i]));
+ }
/* Do not attempt to merge the loads if the loads clobber each other. */
for (int i = 0; i < 8; i += 2)
return false;
}
else
+ for (int i = 0; i < num_insns; i++)
+ {
+ mem[i] = operands[2 * i];
+ reg[i] = operands[2 * i + 1];
+ }
+
+ /* Skip if memory operand is by itself valid for ldp/stp. */
+ if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
+ return false;
+
+ for (int i = 0; i < num_insns; i++)
{
- mem_1 = operands[0];
- reg_1 = operands[1];
- mem_2 = operands[2];
- reg_2 = operands[3];
- mem_3 = operands[4];
- reg_3 = operands[5];
- mem_4 = operands[6];
- reg_4 = operands[7];
+ /* The mems cannot be volatile. */
+ if (MEM_VOLATILE_P (mem[i]))
+ return false;
+
+ /* Check if the addresses are in the form of [base+offset]. */
+ extract_base_offset_in_addr (mem[i], base + i, offset + i);
+ if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
+ return false;
}
- /* Skip if memory operand is by itslef valid for ldp/stp. */
- if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
- return false;
- /* The mems cannot be volatile. */
- if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
- || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
- return false;
+ /* Check if the registers are of the same class. */
+ rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
+ ? FP_REGS : GENERAL_REGS;
- /* Check if the addresses are in the form of [base+offset]. */
- extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
- if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
- return false;
- extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
- if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
- return false;
- extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
- if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
- return false;
- extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
- if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
- return false;
+ for (int i = 1; i < num_insns; i++)
+ if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
+ {
+ if (rclass != FP_REGS)
+ return false;
+ }
+ else
+ {
+ if (rclass != GENERAL_REGS)
+ return false;
+ }
+
+ /* Only the last register in the order in which they occur
+ may be clobbered by the load. */
+ if (rclass == GENERAL_REGS && load)
+ for (int i = 0; i < num_insns - 1; i++)
+ if (reg_mentioned_p (reg[i], mem[i]))
+ return false;
/* Check if the bases are the same. */
- if (!rtx_equal_p (base_1, base_2)
- || !rtx_equal_p (base_2, base_3)
- || !rtx_equal_p (base_3, base_4))
- return false;
+ for (int i = 0; i < num_insns - 1; i++)
+ if (!rtx_equal_p (base[i], base[i + 1]))
+ return false;
+
+ for (int i = 0; i < num_insns; i++)
+ offvals[i] = INTVAL (offset[i]);
- offvals[0] = INTVAL (offset_1);
- offvals[1] = INTVAL (offset_2);
- offvals[2] = INTVAL (offset_3);
- offvals[3] = INTVAL (offset_4);
msize = GET_MODE_SIZE (mode);
/* Check if the offsets can be put in the right order to do a ldp/stp. */
- qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
+ qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
+ aarch64_host_wide_int_compare);
if (!(offvals[1] == offvals[0] + msize
&& offvals[3] == offvals[2] + msize))
if (offvals[0] % msize != offvals[2] % msize)
return false;
- /* Check if the addresses are clobbered by load. */
- if (load && (reg_mentioned_p (reg_1, mem_1)
- || reg_mentioned_p (reg_2, mem_2)
- || reg_mentioned_p (reg_3, mem_3)
- || reg_mentioned_p (reg_4, mem_4)))
- return false;
-
/* If we have SImode and slow unaligned ldp,
check that the alignment is at least 8 bytes. */
if (mode == SImode
&& (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
+ & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
&& !optimize_size
- && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
- return false;
-
- if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
- rclass_1 = FP_REGS;
- else
- rclass_1 = GENERAL_REGS;
-
- if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
- rclass_2 = FP_REGS;
- else
- rclass_2 = GENERAL_REGS;
-
- if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
- rclass_3 = FP_REGS;
- else
- rclass_3 = GENERAL_REGS;
-
- if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
- rclass_4 = FP_REGS;
- else
- rclass_4 = GENERAL_REGS;
-
- /* Check if the registers are of same class. */
- if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
+ && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
return false;
return true;
}
}
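
/* An illustrative sketch, not part of the patch: four same-base,
   same-class loads at consecutive offsets, the shape the check above
   accepts; the ldp/stp peephole can then rewrite them as two LDPs.  */
long
sum4 (const long *p)
{
  return p[0] + p[1] + p[2] + p[3];  /* ldp x1, x2, [x0]
                                        ldp x3, x4, [x0, 16]  */
}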
+/* Implement TARGET_SPECULATION_SAFE_VALUE. */
+static rtx
+aarch64_speculation_safe_value (machine_mode mode,
+ rtx result, rtx val, rtx failval)
+{
+ /* Maybe we should warn if falling back to hard barriers. They are
+ likely to be noticeably more expensive than the alternative below. */
+ if (!aarch64_track_speculation)
+ return default_speculation_safe_value (mode, result, val, failval);
+
+ if (!REG_P (val))
+ val = copy_to_mode_reg (mode, val);
+
+ if (!aarch64_reg_or_zero (failval, mode))
+ failval = copy_to_mode_reg (mode, failval);
+
+ emit_insn (gen_despeculate_copy (mode, result, val, failval));
+ return result;
+}
+
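/* An illustrative sketch, not part of the patch: the builtin that this
   hook expands.  With -mtrack-speculation the bounds-checked index is
   selected against the failure value (zero by default) using the
   speculation-tracking state, instead of a hard barrier.  */
int
array_read (const int *array, unsigned int idx, unsigned int bound)
{
  if (idx < bound)
    return array[__builtin_speculation_safe_value (idx)];
  return 0;
}
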
+/* Implement TARGET_ESTIMATED_POLY_VALUE.
+ Look into the tuning structure for an estimate.
+ VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
+ Advanced SIMD 128 bits. */
+
+static HOST_WIDE_INT
+aarch64_estimated_poly_value (poly_int64 val)
+{
+ enum aarch64_sve_vector_bits_enum width_source
+ = aarch64_tune_params.sve_width;
+
+ /* If we still don't have an estimate, use the default. */
+ if (width_source == SVE_SCALABLE)
+ return default_estimated_poly_value (val);
+
+ HOST_WIDE_INT over_128 = width_source - 128;
+ return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
+}
+
+
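/* A worked example, under the assumption that the tuning structure
   gives sve_width == 256: a poly_int of 2 + 2x is estimated as
   2 + 2 * (256 - 128) / 128 = 4, while sve_width == SVE_SCALABLE
   falls back to the target-independent default.  */
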
+/* Return true for types that could be supported as SIMD return or
+ argument types. */
+
+static bool
+supported_simd_type (tree t)
+{
+ if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
+ {
+ HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
+ return s == 1 || s == 2 || s == 4 || s == 8;
+ }
+ return false;
+}
+
+/* Return true for types that currently are supported as SIMD return
+ or argument types. */
+
+static bool
+currently_supported_simd_type (tree t, tree b)
+{
+ if (COMPLEX_FLOAT_TYPE_P (t))
+ return false;
+
+ if (TYPE_SIZE (t) != TYPE_SIZE (b))
+ return false;
+
+ return supported_simd_type (t);
+}
+
+/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
+
+static int
+aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
+ struct cgraph_simd_clone *clonei,
+ tree base_type, int num)
+{
+ tree t, ret_type, arg_type;
+ unsigned int elt_bits, vec_bits, count;
+
+ if (!TARGET_SIMD)
+ return 0;
+
+ if (clonei->simdlen
+ && (clonei->simdlen < 2
+ || clonei->simdlen > 1024
+ || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
+ {
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "unsupported simdlen %d", clonei->simdlen);
+ return 0;
+ }
+
+ ret_type = TREE_TYPE (TREE_TYPE (node->decl));
+ if (TREE_CODE (ret_type) != VOID_TYPE
+ && !currently_supported_simd_type (ret_type, base_type))
+ {
+ if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "GCC does not currently support mixed size types "
+ "for %<simd%> functions");
+ else if (supported_simd_type (ret_type))
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "GCC does not currently support return type %qT "
+ "for %<simd%> functions", ret_type);
+ else
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "unsupported return type %qT for %<simd%> functions",
+ ret_type);
+ return 0;
+ }
+
+ for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
+ {
+ arg_type = TREE_TYPE (t);
+
+ if (!currently_supported_simd_type (arg_type, base_type))
+ {
+ if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "GCC does not currently support mixed size types "
+ "for %<simd%> functions");
+ else
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "GCC does not currently support argument type %qT "
+ "for %<simd%> functions", arg_type);
+ return 0;
+ }
+ }
+
+ clonei->vecsize_mangle = 'n';
+ clonei->mask_mode = VOIDmode;
+ elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
+ if (clonei->simdlen == 0)
+ {
+ count = 2;
+ vec_bits = (num == 0 ? 64 : 128);
+ clonei->simdlen = vec_bits / elt_bits;
+ }
+ else
+ {
+ count = 1;
+ vec_bits = clonei->simdlen * elt_bits;
+ if (vec_bits != 64 && vec_bits != 128)
+ {
+ warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+ "GCC does not currently support simdlen %d for type %qT",
+ clonei->simdlen, base_type);
+ return 0;
+ }
+ }
+ clonei->vecsize_int = vec_bits;
+ clonei->vecsize_float = vec_bits;
+ return count;
+}
+
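/* An illustrative sketch, not part of the patch: a function the
   vecsize/simdlen computation above applies to when compiled with
   -fopenmp-simd.  With no explicit simdlen clause, two clones are
   created for float: a 64-bit variant (simdlen 2) and a 128-bit
   variant (simdlen 4).  */
#pragma omp declare simd
float
scale (float x, float y)
{
  return x * y;
}
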
+/* Implement TARGET_SIMD_CLONE_ADJUST. */
+
+static void
+aarch64_simd_clone_adjust (struct cgraph_node *node)
+{
+ /* Add aarch64_vector_pcs target attribute to SIMD clones so they
+ use the correct ABI. */
+
+ tree t = TREE_TYPE (node->decl);
+ TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
+ TYPE_ATTRIBUTES (t));
+}
+
+/* Implement TARGET_SIMD_CLONE_USABLE. */
+
+static int
+aarch64_simd_clone_usable (struct cgraph_node *node)
+{
+ switch (node->simdclone->vecsize_mangle)
+ {
+ case 'n':
+ if (!TARGET_SIMD)
+ return -1;
+ return 0;
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
+
+static int
+aarch64_comp_type_attributes (const_tree type1, const_tree type2)
+{
+ if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
+ != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
+ return 0;
+ return 1;
+}
+
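/* An illustrative sketch, not part of the patch: the attribute
   comparison above makes these two pointer types incompatible, since
   only one of them uses the aarch64_vector_pcs calling convention.  */
typedef float (*plain_fn) (float);
typedef float (*vpcs_fn) (float) __attribute__ ((aarch64_vector_pcs));
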
+/* Implement TARGET_GET_MULTILIB_ABI_NAME. */
+
+static const char *
+aarch64_get_multilib_abi_name (void)
+{
+ if (TARGET_BIG_END)
+ return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
+ return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
+}
+
+/* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
+ global variable based guard, use the default; otherwise
+ return a null tree. */
+static tree
+aarch64_stack_protect_guard (void)
+{
+ if (aarch64_stack_protector_guard == SSP_GLOBAL)
+ return default_stack_protect_guard ();
+
+ return NULL_TREE;
+}
+
+
/* Target-specific selftests. */
#if CHECKING_P
#endif /* #if CHECKING_P */
+#undef TARGET_STACK_PROTECT_GUARD
+#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
aarch64_hard_regno_call_part_clobbered
+#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
+#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
+ aarch64_remove_extra_call_preserved_regs
+
+#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
+#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
+ aarch64_return_call_with_max_clobbers
+
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
+#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
+#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
+ aarch64_stack_clash_protection_alloca_probe_range
+
#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
+#undef TARGET_SPECULATION_SAFE_VALUE
+#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
+
+#undef TARGET_ESTIMATED_POLY_VALUE
+#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
+
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
+#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
+ aarch64_simd_clone_compute_vecsize_and_simdlen
+
+#undef TARGET_SIMD_CLONE_ADJUST
+#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
+
+#undef TARGET_SIMD_CLONE_USABLE
+#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
+
+#undef TARGET_COMP_TYPE_ATTRIBUTES
+#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
+
+#undef TARGET_GET_MULTILIB_ABI_NAME
+#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
+
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests