gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
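/* For example, under the default LP64 ABI POINTER_SIZE is 64 bits, giving
   POINTER_BYTES == 8; under -mabi=ilp32 it is 32 bits, giving 4.  */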
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
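/* As an illustrative example, a constant with each 32-bit element equal
   to ~0xff00 (materializable as "mvni vD.4s, #0xff, lsl #8") could be
   described by:

     simd_immediate_info info (SImode, 0xff, simd_immediate_info::MVN,
			       simd_immediate_info::LSL, 8);

   i.e. elt_mode == SImode, insn == MVN, u.mov.value == 0xff,
   u.mov.modifier == LSL and u.mov.shift == 8.  */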
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC-relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
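/* For instance, assuming the usual entry
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in aarch64-fusion-pairs.def,
   the table above gains the element { "mov+movk", AARCH64_FUSE_MOV_MOVK },
   which lets the user-visible name be mapped back to its internal flag,
   e.g. by the fuse= override parsing in aarch64_parse_fuse_string below.  */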
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED, /* sve_width */
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1215
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1219 {
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1224 {
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1228
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1234
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1238 }
1239 gcc_unreachable ();
1240 }
1241
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1244 {
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1250 };
1251
1252 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1253
1254 /* An ISA extension in the co-processor and main instruction set space. */
1255 struct aarch64_option_extension
1256 {
1257 const char *const name;
1258 const unsigned long flags_on;
1259 const unsigned long flags_off;
1260 };
1261
1262 typedef enum aarch64_cond_code
1263 {
1264 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1265 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1266 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1267 }
1268 aarch64_cc;
1269
1270 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1271
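/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: the
   enumeration above is laid out so that each condition and its inverse
   differ only in the least significant bit.  */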
1272 struct aarch64_branch_protect_type
1273 {
1274 /* The type's name that the user passes to the branch-protection option
1275 string. */
1276 const char* name;
1277 /* Function to handle the protection type and set global variables.
1278 First argument is the string token corresponding to this type and the
1279 second argument is the next token in the option string.
1280 Return values:
1281 * AARCH64_PARSE_OK: Handling was successful.
1282 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1283 should print an error.
1284 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1285 own error. */
1286 enum aarch64_parse_opt_result (*handler)(char*, char*);
1287 /* A list of types that can follow this type in the option string. */
1288 const aarch64_branch_protect_type* subtypes;
1289 unsigned int num_subtypes;
1290 };
1291
1292 static enum aarch64_parse_opt_result
1293 aarch64_handle_no_branch_protection (char* str, char* rest)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1296 aarch64_enable_bti = 0;
1297 if (rest)
1298 {
1299 error ("unexpected %<%s%> after %<%s%>", rest, str);
1300 return AARCH64_PARSE_INVALID_FEATURE;
1301 }
1302 return AARCH64_PARSE_OK;
1303 }
1304
1305 static enum aarch64_parse_opt_result
1306 aarch64_handle_standard_branch_protection (char* str, char* rest)
1307 {
1308 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1309 aarch64_ra_sign_key = AARCH64_KEY_A;
1310 aarch64_enable_bti = 1;
1311 if (rest)
1312 {
1313 error ("unexpected %<%s%> after %<%s%>", rest, str);
1314 return AARCH64_PARSE_INVALID_FEATURE;
1315 }
1316 return AARCH64_PARSE_OK;
1317 }
1318
1319 static enum aarch64_parse_opt_result
1320 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1321 char* rest ATTRIBUTE_UNUSED)
1322 {
1323 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1324 aarch64_ra_sign_key = AARCH64_KEY_A;
1325 return AARCH64_PARSE_OK;
1326 }
1327
1328 static enum aarch64_parse_opt_result
1329 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1330 char* rest ATTRIBUTE_UNUSED)
1331 {
1332 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1333 return AARCH64_PARSE_OK;
1334 }
1335
1336 static enum aarch64_parse_opt_result
1337 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1338 char* rest ATTRIBUTE_UNUSED)
1339 {
1340 aarch64_ra_sign_key = AARCH64_KEY_B;
1341 return AARCH64_PARSE_OK;
1342 }
1343
1344 static enum aarch64_parse_opt_result
1345 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1346 char* rest ATTRIBUTE_UNUSED)
1347 {
1348 aarch64_enable_bti = 1;
1349 return AARCH64_PARSE_OK;
1350 }
1351
1352 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1353 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1354 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1355 { NULL, NULL, NULL, 0 }
1356 };
1357
1358 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1359 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1360 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1361 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1362 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1363 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1364 { NULL, NULL, NULL, 0 }
1365 };
1366
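/* As an illustration, parsing "pac-ret+leaf+b-key+bti" against these
   tables first invokes aarch64_handle_pac_ret_protection (non-leaf
   scope, A key), then the "leaf" and "b-key" subtype handlers (which
   widen the scope to all functions and switch to the B key), and
   finally aarch64_handle_bti_protection to enable BTI.  */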
1367 /* The condition codes of the processor, and the inverse function. */
1368 static const char * const aarch64_condition_codes[] =
1369 {
1370 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1371 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1372 };
1373
1374 /* The preferred condition codes for SVE conditions. */
1375 static const char *const aarch64_sve_condition_codes[] =
1376 {
1377 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1378 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1379 };
1380
1381 /* Return the assembly token for svpattern value VALUE. */
1382
1383 static const char *
1384 svpattern_token (enum aarch64_svpattern pattern)
1385 {
1386 switch (pattern)
1387 {
1388 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1389 AARCH64_FOR_SVPATTERN (CASE)
1390 #undef CASE
1391 case AARCH64_NUM_SVPATTERNS:
1392 break;
1393 }
1394 gcc_unreachable ();
1395 }
1396
1397 /* Return the descriptor of the SIMD ABI. */
1398
1399 static const predefined_function_abi &
1400 aarch64_simd_abi (void)
1401 {
1402 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1403 if (!simd_abi.initialized_p ())
1404 {
1405 HARD_REG_SET full_reg_clobbers
1406 = default_function_abi.full_reg_clobbers ();
1407 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1408 if (FP_SIMD_SAVED_REGNUM_P (regno))
1409 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1410 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1411 }
1412 return simd_abi;
1413 }
1414
1415 /* Return the descriptor of the SVE PCS. */
1416
1417 static const predefined_function_abi &
1418 aarch64_sve_abi (void)
1419 {
1420 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1421 if (!sve_abi.initialized_p ())
1422 {
1423 HARD_REG_SET full_reg_clobbers
1424 = default_function_abi.full_reg_clobbers ();
1425 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1426 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1427 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1428 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1429 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1430 }
1431 return sve_abi;
1432 }
1433
1434 /* Generate code to enable conditional branches in functions over 1 MiB. */
1435 const char *
1436 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1437 const char * branch_format)
1438 {
1439 rtx_code_label * tmp_label = gen_label_rtx ();
1440 char label_buf[256];
1441 char buffer[128];
1442 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1443 CODE_LABEL_NUMBER (tmp_label));
1444 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1445 rtx dest_label = operands[pos_label];
1446 operands[pos_label] = tmp_label;
1447
1448 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1449 output_asm_insn (buffer, operands);
1450
1451 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1452 operands[pos_label] = dest_label;
1453 output_asm_insn (buffer, operands);
1454 return "";
1455 }
1456
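/* For example, when a conditional branch is too far from its target,
   the caller passes the inverted condition in BRANCH_FORMAT, so the
   emitted sequence is roughly:

	<inverted conditional branch>	.L<DEST><N>
	b	<original target>
   .L<DEST><N>:

   i.e. the limited-range conditional branch only needs to skip the
   unconditional "b", which can reach the real target.  */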
1457 void
1458 aarch64_err_no_fpadvsimd (machine_mode mode)
1459 {
1460 if (TARGET_GENERAL_REGS_ONLY)
1461 if (FLOAT_MODE_P (mode))
1462 error ("%qs is incompatible with the use of floating-point types",
1463 "-mgeneral-regs-only");
1464 else
1465 error ("%qs is incompatible with the use of vector types",
1466 "-mgeneral-regs-only");
1467 else
1468 if (FLOAT_MODE_P (mode))
1469 error ("%qs feature modifier is incompatible with the use of"
1470 " floating-point types", "+nofp");
1471 else
1472 error ("%qs feature modifier is incompatible with the use of"
1473 " vector types", "+nofp");
1474 }
1475
1476 /* Report when we try to do something that requires SVE when SVE is disabled.
1477 This is an error of last resort and isn't very high-quality. It usually
1478 involves attempts to measure the vector length in some way. */
1479 static void
1480 aarch64_report_sve_required (void)
1481 {
1482 static bool reported_p = false;
1483
1484 /* Avoid reporting a slew of messages for a single oversight. */
1485 if (reported_p)
1486 return;
1487
1488 error ("this operation requires the SVE ISA extension");
1489 inform (input_location, "you can enable SVE using the command-line"
1490 " option %<-march%>, or by using the %<target%>"
1491 " attribute or pragma");
1492 reported_p = true;
1493 }
1494
1495 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1496 registers. */
1497 inline bool
1498 pr_or_ffr_regnum_p (unsigned int regno)
1499 {
1500 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1501 }
1502
1503 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1504 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1505 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1506 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1507 and GENERAL_REGS is lower than the memory cost (in this case the best class
1508 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1509 cost results in bad allocations with many redundant int<->FP moves which
1510 are expensive on various cores.
1511 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1512 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1513 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1514 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1515 The result of this is that it is no longer inefficient to have a higher
1516 memory move cost than the register move cost.
1517 */
1518
1519 static reg_class_t
1520 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1521 reg_class_t best_class)
1522 {
1523 machine_mode mode;
1524
1525 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1526 || !reg_class_subset_p (FP_REGS, allocno_class))
1527 return allocno_class;
1528
1529 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1530 || !reg_class_subset_p (FP_REGS, best_class))
1531 return best_class;
1532
1533 mode = PSEUDO_REGNO_MODE (regno);
1534 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1535 }
1536
1537 static unsigned int
1538 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1539 {
1540 if (GET_MODE_UNIT_SIZE (mode) == 4)
1541 return aarch64_tune_params.min_div_recip_mul_sf;
1542 return aarch64_tune_params.min_div_recip_mul_df;
1543 }
1544
1545 /* Return the reassociation width of treeop OPC with mode MODE. */
1546 static int
1547 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1548 {
1549 if (VECTOR_MODE_P (mode))
1550 return aarch64_tune_params.vec_reassoc_width;
1551 if (INTEGRAL_MODE_P (mode))
1552 return aarch64_tune_params.int_reassoc_width;
1553 /* Avoid reassociating floating point addition so we emit more FMAs. */
1554 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1555 return aarch64_tune_params.fp_reassoc_width;
1556 return 1;
1557 }
1558
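/* With generic_tunings, for example, this gives a reassociation width of
   2 for integer modes, 4 for floating-point modes other than additions
   (which return 1 so that FMA formation is not hindered) and 1 for
   vector modes.  */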
1559 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1560 unsigned
1561 aarch64_dbx_register_number (unsigned regno)
1562 {
1563 if (GP_REGNUM_P (regno))
1564 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1565 else if (regno == SP_REGNUM)
1566 return AARCH64_DWARF_SP;
1567 else if (FP_REGNUM_P (regno))
1568 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1569 else if (PR_REGNUM_P (regno))
1570 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1571 else if (regno == VG_REGNUM)
1572 return AARCH64_DWARF_VG;
1573
1574 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1575 equivalent DWARF register. */
1576 return DWARF_FRAME_REGISTERS;
1577 }
1578
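/* Under this mapping x0-x30 become DWARF registers 0-30, sp becomes 31,
   v0-v31 become 64-95, p0-p15 become 48-63 and the SVE vector granule
   register VG becomes 46, following the AArch64 DWARF register
   numbering.  */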
1579 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1580 integer, otherwise return X unmodified. */
1581 static rtx
1582 aarch64_bit_representation (rtx x)
1583 {
1584 if (CONST_DOUBLE_P (x))
1585 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1586 return x;
1587 }
1588
1589 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1590 static bool
1591 aarch64_advsimd_struct_mode_p (machine_mode mode)
1592 {
1593 return (TARGET_SIMD
1594 && (mode == OImode || mode == CImode || mode == XImode));
1595 }
1596
1597 /* Return true if MODE is an SVE predicate mode. */
1598 static bool
1599 aarch64_sve_pred_mode_p (machine_mode mode)
1600 {
1601 return (TARGET_SVE
1602 && (mode == VNx16BImode
1603 || mode == VNx8BImode
1604 || mode == VNx4BImode
1605 || mode == VNx2BImode));
1606 }
1607
1608 /* Three mutually-exclusive flags describing a vector or predicate type. */
1609 const unsigned int VEC_ADVSIMD = 1;
1610 const unsigned int VEC_SVE_DATA = 2;
1611 const unsigned int VEC_SVE_PRED = 4;
1612 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1613 a structure of 2, 3 or 4 vectors. */
1614 const unsigned int VEC_STRUCT = 8;
1615 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1616 vector has fewer significant bytes than a full SVE vector. */
1617 const unsigned int VEC_PARTIAL = 16;
1618 /* Useful combinations of the above. */
1619 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1620 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1621
1622 /* Return a set of flags describing the vector properties of mode MODE.
1623 Ignore modes that are not supported by the current target. */
1624 static unsigned int
1625 aarch64_classify_vector_mode (machine_mode mode)
1626 {
1627 if (aarch64_advsimd_struct_mode_p (mode))
1628 return VEC_ADVSIMD | VEC_STRUCT;
1629
1630 if (aarch64_sve_pred_mode_p (mode))
1631 return VEC_SVE_PRED;
1632
1633 /* Make the decision based on the mode's enum value rather than its
1634 properties, so that we keep the correct classification regardless
1635 of -msve-vector-bits. */
1636 switch (mode)
1637 {
1638 /* Partial SVE QI vectors. */
1639 case E_VNx2QImode:
1640 case E_VNx4QImode:
1641 case E_VNx8QImode:
1642 /* Partial SVE HI vectors. */
1643 case E_VNx2HImode:
1644 case E_VNx4HImode:
1645 /* Partial SVE SI vector. */
1646 case E_VNx2SImode:
1647 /* Partial SVE HF vectors. */
1648 case E_VNx2HFmode:
1649 case E_VNx4HFmode:
1650 /* Partial SVE SF vector. */
1651 case E_VNx2SFmode:
1652 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1653
1654 case E_VNx16QImode:
1655 case E_VNx8HImode:
1656 case E_VNx4SImode:
1657 case E_VNx2DImode:
1658 case E_VNx8HFmode:
1659 case E_VNx4SFmode:
1660 case E_VNx2DFmode:
1661 return TARGET_SVE ? VEC_SVE_DATA : 0;
1662
1663 /* x2 SVE vectors. */
1664 case E_VNx32QImode:
1665 case E_VNx16HImode:
1666 case E_VNx8SImode:
1667 case E_VNx4DImode:
1668 case E_VNx16HFmode:
1669 case E_VNx8SFmode:
1670 case E_VNx4DFmode:
1671 /* x3 SVE vectors. */
1672 case E_VNx48QImode:
1673 case E_VNx24HImode:
1674 case E_VNx12SImode:
1675 case E_VNx6DImode:
1676 case E_VNx24HFmode:
1677 case E_VNx12SFmode:
1678 case E_VNx6DFmode:
1679 /* x4 SVE vectors. */
1680 case E_VNx64QImode:
1681 case E_VNx32HImode:
1682 case E_VNx16SImode:
1683 case E_VNx8DImode:
1684 case E_VNx32HFmode:
1685 case E_VNx16SFmode:
1686 case E_VNx8DFmode:
1687 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1688
1689 /* 64-bit Advanced SIMD vectors. */
1690 case E_V8QImode:
1691 case E_V4HImode:
1692 case E_V2SImode:
1693 /* ...E_V1DImode doesn't exist. */
1694 case E_V4HFmode:
1695 case E_V2SFmode:
1696 case E_V1DFmode:
1697 /* 128-bit Advanced SIMD vectors. */
1698 case E_V16QImode:
1699 case E_V8HImode:
1700 case E_V4SImode:
1701 case E_V2DImode:
1702 case E_V8HFmode:
1703 case E_V4SFmode:
1704 case E_V2DFmode:
1705 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1706
1707 default:
1708 return 0;
1709 }
1710 }
1711
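/* For example, with SVE enabled the function above classifies VNx4SImode
   as VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL (32-bit
   elements in 64-bit containers) and VNx32QImode as
   VEC_SVE_DATA | VEC_STRUCT (a tuple of two vectors), while V4SImode is
   VEC_ADVSIMD when Advanced SIMD is enabled.  */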
1712 /* Return true if MODE is any of the data vector modes, including
1713 structure modes. */
1714 static bool
1715 aarch64_vector_data_mode_p (machine_mode mode)
1716 {
1717 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1718 }
1719
1720 /* Return true if MODE is any form of SVE mode, including predicates,
1721 vectors and structures. */
1722 bool
1723 aarch64_sve_mode_p (machine_mode mode)
1724 {
1725 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1726 }
1727
1728 /* Return true if MODE is an SVE data vector mode; either a single vector
1729 or a structure of vectors. */
1730 static bool
1731 aarch64_sve_data_mode_p (machine_mode mode)
1732 {
1733 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1734 }
1735
1736 /* Return the number of defined bytes in one constituent vector of
1737 SVE mode MODE, which has vector flags VEC_FLAGS. */
1738 static poly_int64
1739 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1740 {
1741 if (vec_flags & VEC_PARTIAL)
1742 /* A single partial vector. */
1743 return GET_MODE_SIZE (mode);
1744
1745 if (vec_flags & VEC_SVE_DATA)
1746 /* A single vector or a tuple. */
1747 return BYTES_PER_SVE_VECTOR;
1748
1749 /* A single predicate. */
1750 gcc_assert (vec_flags & VEC_SVE_PRED);
1751 return BYTES_PER_SVE_PRED;
1752 }
1753
1754 /* Implement target hook TARGET_ARRAY_MODE. */
1755 static opt_machine_mode
1756 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1757 {
1758 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1759 && IN_RANGE (nelems, 2, 4))
1760 return mode_for_vector (GET_MODE_INNER (mode),
1761 GET_MODE_NUNITS (mode) * nelems);
1762
1763 return opt_machine_mode ();
1764 }
1765
1766 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1767 static bool
1768 aarch64_array_mode_supported_p (machine_mode mode,
1769 unsigned HOST_WIDE_INT nelems)
1770 {
1771 if (TARGET_SIMD
1772 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1773 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1774 && (nelems >= 2 && nelems <= 4))
1775 return true;
1776
1777 return false;
1778 }
1779
1780 /* MODE is some form of SVE vector mode. For data modes, return the number
1781 of vector register bits that each element of MODE occupies, such as 64
1782 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1783 in a 64-bit container). For predicate modes, return the number of
1784 data bits controlled by each significant predicate bit. */
1785
1786 static unsigned int
1787 aarch64_sve_container_bits (machine_mode mode)
1788 {
1789 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1790 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1791 ? BITS_PER_SVE_VECTOR
1792 : GET_MODE_BITSIZE (mode));
1793 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1794 }
1795
1796 /* Return the SVE predicate mode to use for elements that have
1797 ELEM_NBYTES bytes, if such a mode exists. */
1798
1799 opt_machine_mode
1800 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1801 {
1802 if (TARGET_SVE)
1803 {
1804 if (elem_nbytes == 1)
1805 return VNx16BImode;
1806 if (elem_nbytes == 2)
1807 return VNx8BImode;
1808 if (elem_nbytes == 4)
1809 return VNx4BImode;
1810 if (elem_nbytes == 8)
1811 return VNx2BImode;
1812 }
1813 return opt_machine_mode ();
1814 }
1815
1816 /* Return the SVE predicate mode that should be used to control
1817 SVE mode MODE. */
1818
1819 machine_mode
1820 aarch64_sve_pred_mode (machine_mode mode)
1821 {
1822 unsigned int bits = aarch64_sve_container_bits (mode);
1823 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1824 }
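
/* As an illustration: full SVE vectors such as VNx16QImode, VNx8HImode,
VNx4SImode and VNx2DImode map to VNx16BImode, VNx8BImode, VNx4BImode
and VNx2BImode respectively, while a partial vector such as VNx2SImode
(32-bit values stored in 64-bit containers) also maps to VNx2BImode,
because the predicate mode is chosen per container rather than per
element. */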
1825
1826 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1827
1828 static opt_machine_mode
1829 aarch64_get_mask_mode (machine_mode mode)
1830 {
1831 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1832 if (vec_flags & VEC_SVE_DATA)
1833 return aarch64_sve_pred_mode (mode);
1834
1835 return default_get_mask_mode (mode);
1836 }
1837
1838 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1839
1840 opt_machine_mode
1841 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1842 {
1843 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1844 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1845 machine_mode mode;
1846 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1847 if (inner_mode == GET_MODE_INNER (mode)
1848 && known_eq (nunits, GET_MODE_NUNITS (mode))
1849 && aarch64_sve_data_mode_p (mode))
1850 return mode;
1851 return opt_machine_mode ();
1852 }
1853
1854 /* Return the integer element mode associated with SVE mode MODE. */
1855
1856 static scalar_int_mode
1857 aarch64_sve_element_int_mode (machine_mode mode)
1858 {
1859 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1860 ? BITS_PER_SVE_VECTOR
1861 : GET_MODE_BITSIZE (mode));
1862 unsigned int elt_bits = vector_element_size (vector_bits,
1863 GET_MODE_NUNITS (mode));
1864 return int_mode_for_size (elt_bits, 0).require ();
1865 }
1866
1867 /* Return an integer element mode that contains exactly
1868 aarch64_sve_container_bits (MODE) bits. This is wider than
1869 aarch64_sve_element_int_mode if MODE is a partial vector,
1870 otherwise it's the same. */
1871
1872 static scalar_int_mode
1873 aarch64_sve_container_int_mode (machine_mode mode)
1874 {
1875 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1876 }
1877
1878 /* Return the integer vector mode associated with SVE mode MODE.
1879 Unlike related_int_vector_mode, this can handle the case in which
1880 MODE is a predicate (and thus has a different total size). */
1881
1882 machine_mode
1883 aarch64_sve_int_mode (machine_mode mode)
1884 {
1885 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1886 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1887 }
1888
1889 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1890
1891 static opt_machine_mode
1892 aarch64_vectorize_related_mode (machine_mode vector_mode,
1893 scalar_mode element_mode,
1894 poly_uint64 nunits)
1895 {
1896 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1897
1898 /* If we're operating on SVE vectors, try to return an SVE mode. */
1899 poly_uint64 sve_nunits;
1900 if ((vec_flags & VEC_SVE_DATA)
1901 && multiple_p (BYTES_PER_SVE_VECTOR,
1902 GET_MODE_SIZE (element_mode), &sve_nunits))
1903 {
1904 machine_mode sve_mode;
1905 if (maybe_ne (nunits, 0U))
1906 {
1907 /* Try to find a full or partial SVE mode with exactly
1908 NUNITS units. */
1909 if (multiple_p (sve_nunits, nunits)
1910 && aarch64_sve_data_mode (element_mode,
1911 nunits).exists (&sve_mode))
1912 return sve_mode;
1913 }
1914 else
1915 {
1916 /* Take the preferred number of units from the number of bytes
1917 that fit in VECTOR_MODE. We always start by "autodetecting"
1918 a full vector mode with preferred_simd_mode, so vectors
1919 chosen here will also be full vector modes. Then
1920 autovectorize_vector_modes tries smaller starting modes
1921 and thus smaller preferred numbers of units. */
1922 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1923 if (aarch64_sve_data_mode (element_mode,
1924 sve_nunits).exists (&sve_mode))
1925 return sve_mode;
1926 }
1927 }
1928
1929 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1930 if ((vec_flags & VEC_ADVSIMD)
1931 && known_eq (nunits, 0U)
1932 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1933 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1934 * GET_MODE_NUNITS (vector_mode), 128U))
1935 {
1936 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1937 if (VECTOR_MODE_P (res))
1938 return res;
1939 }
1940
1941 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1942 }
1943
1944 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1945 prefer to use the first arithmetic operand as the else value if
1946 the else value doesn't matter, since that exactly matches the SVE
1947 destructive merging form. For ternary operations we could either
1948 pick the first operand and use FMAD-like instructions or the last
1949 operand and use FMLA-like instructions; the latter seems more
1950 natural. */
1951
1952 static tree
1953 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1954 {
1955 return nops == 3 ? ops[2] : ops[0];
1956 }
1957
1958 /* Implement TARGET_HARD_REGNO_NREGS. */
1959
1960 static unsigned int
1961 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1962 {
1963 /* ??? Logically we should only need to provide a value when
1964 HARD_REGNO_MODE_OK says that the combination is valid,
1965 but at the moment we need to handle all modes. Just ignore
1966 any runtime parts for registers that can't store them. */
1967 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1968 switch (aarch64_regno_regclass (regno))
1969 {
1970 case FP_REGS:
1971 case FP_LO_REGS:
1972 case FP_LO8_REGS:
1973 {
1974 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1975 if (vec_flags & VEC_SVE_DATA)
1976 return exact_div (GET_MODE_SIZE (mode),
1977 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1978 return CEIL (lowest_size, UNITS_PER_VREG);
1979 }
1980 case PR_REGS:
1981 case PR_LO_REGS:
1982 case PR_HI_REGS:
1983 case FFR_REGS:
1984 case PR_AND_FFR_REGS:
1985 return 1;
1986 default:
1987 return CEIL (lowest_size, UNITS_PER_WORD);
1988 }
1989 gcc_unreachable ();
1990 }
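
/* Rough examples of the above: TImode in the general registers takes
CEIL (16, UNITS_PER_WORD) == 2 registers; TFmode in a vector register
takes CEIL (16, UNITS_PER_VREG) == 1; an SVE data mode made up of N
constituent vectors takes exactly N Z registers, regardless of the
runtime vector length; and any predicate mode takes a single P
register. */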
1991
1992 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1993
1994 static bool
1995 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1996 {
1997 if (GET_MODE_CLASS (mode) == MODE_CC)
1998 return regno == CC_REGNUM;
1999
2000 if (regno == VG_REGNUM)
2001 /* This must have the same size as _Unwind_Word. */
2002 return mode == DImode;
2003
2004 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2005 if (vec_flags & VEC_SVE_PRED)
2006 return pr_or_ffr_regnum_p (regno);
2007
2008 if (pr_or_ffr_regnum_p (regno))
2009 return false;
2010
2011 if (regno == SP_REGNUM)
2012 /* The purpose of comparing with ptr_mode is to support the
2013 global register variable associated with the stack pointer
2014 register via the syntax of asm ("wsp") in ILP32. */
2015 return mode == Pmode || mode == ptr_mode;
2016
2017 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2018 return mode == Pmode;
2019
2020 if (GP_REGNUM_P (regno))
2021 {
2022 if (vec_flags & VEC_ANY_SVE)
2023 return false;
2024 if (known_le (GET_MODE_SIZE (mode), 8))
2025 return true;
2026 if (known_le (GET_MODE_SIZE (mode), 16))
2027 return (regno & 1) == 0;
2028 }
2029 else if (FP_REGNUM_P (regno))
2030 {
2031 if (vec_flags & VEC_STRUCT)
2032 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2033 else
2034 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2035 }
2036
2037 return false;
2038 }
2039
2040 /* Return true if TYPE is a type that should be passed or returned in
2041 SVE registers, assuming enough registers are available. When returning
2042 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
2043 respectively. */
2044
2045 static bool
2046 aarch64_sve_argument_p (const_tree type, unsigned int *num_zr,
2047 unsigned int *num_pr)
2048 {
2049 if (aarch64_sve::svbool_type_p (type))
2050 {
2051 *num_pr = 1;
2052 *num_zr = 0;
2053 return true;
2054 }
2055
2056 if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type))
2057 {
2058 *num_pr = 0;
2059 *num_zr = nvectors;
2060 return true;
2061 }
2062
2063 return false;
2064 }
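
/* For instance, an svbool_t argument should give *NUM_PR == 1 and
*NUM_ZR == 0, an svint32_t argument should give *NUM_ZR == 1, and a
tuple type such as svfloat64x4_t should give *NUM_ZR == 4 (assuming
the usual ACLE mapping of tuple types to multiple Z registers). */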
2065
2066 /* Return true if a function with type FNTYPE returns its value in
2067 SVE vector or predicate registers. */
2068
2069 static bool
2070 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2071 {
2072 unsigned int num_zr, num_pr;
2073 tree return_type = TREE_TYPE (fntype);
2074 return (return_type != error_mark_node
2075 && aarch64_sve_argument_p (return_type, &num_zr, &num_pr));
2076 }
2077
2078 /* Return true if a function with type FNTYPE takes arguments in
2079 SVE vector or predicate registers. */
2080
2081 static bool
2082 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2083 {
2084 CUMULATIVE_ARGS args_so_far_v;
2085 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2086 NULL_TREE, 0, true);
2087 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2088
2089 for (tree chain = TYPE_ARG_TYPES (fntype);
2090 chain && chain != void_list_node;
2091 chain = TREE_CHAIN (chain))
2092 {
2093 tree arg_type = TREE_VALUE (chain);
2094 if (arg_type == error_mark_node)
2095 return false;
2096
2097 function_arg_info arg (arg_type, /*named=*/true);
2098 apply_pass_by_reference_rules (&args_so_far_v, arg);
2099 unsigned int num_zr, num_pr;
2100 if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
2101 return true;
2102
2103 targetm.calls.function_arg_advance (args_so_far, arg);
2104 }
2105 return false;
2106 }
2107
2108 /* Implement TARGET_FNTYPE_ABI. */
2109
2110 static const predefined_function_abi &
2111 aarch64_fntype_abi (const_tree fntype)
2112 {
2113 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2114 return aarch64_simd_abi ();
2115
2116 if (aarch64_returns_value_in_sve_regs_p (fntype)
2117 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2118 return aarch64_sve_abi ();
2119
2120 return default_function_abi;
2121 }
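
/* A sketch of how this is expected to play out, using ACLE types purely
for illustration:

void f (float32x4_t) __attribute__ ((aarch64_vector_pcs));
-> aarch64_simd_abi ()
svint32_t g (svint32_t, svbool_t); -> aarch64_sve_abi ()
int h (int); -> default_function_abi */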
2122
2123 /* Return true if we should emit CFI for register REGNO. */
2124
2125 static bool
2126 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2127 {
2128 return (GP_REGNUM_P (regno)
2129 || !default_function_abi.clobbers_full_reg_p (regno));
2130 }
2131
2132 /* Return the mode we should use to save and restore register REGNO. */
2133
2134 static machine_mode
2135 aarch64_reg_save_mode (unsigned int regno)
2136 {
2137 if (GP_REGNUM_P (regno))
2138 return DImode;
2139
2140 if (FP_REGNUM_P (regno))
2141 switch (crtl->abi->id ())
2142 {
2143 case ARM_PCS_AAPCS64:
2144 /* Only the low 64 bits are saved by the base PCS. */
2145 return DFmode;
2146
2147 case ARM_PCS_SIMD:
2148 /* The vector PCS saves the low 128 bits (which is the full
2149 register on non-SVE targets). */
2150 return TFmode;
2151
2152 case ARM_PCS_SVE:
2153 /* Use vectors of DImode for registers that need frame
2154 information, so that the first 64 bits of the save slot
2155 are always the equivalent of what storing D<n> would give. */
2156 if (aarch64_emit_cfi_for_reg_p (regno))
2157 return VNx2DImode;
2158
2159 /* Use vectors of bytes otherwise, so that the layout is
2160 endian-agnostic, and so that we can use LDR and STR for
2161 big-endian targets. */
2162 return VNx16QImode;
2163
2164 case ARM_PCS_TLSDESC:
2165 case ARM_PCS_UNKNOWN:
2166 break;
2167 }
2168
2169 if (PR_REGNUM_P (regno))
2170 /* Save the full predicate register. */
2171 return VNx16BImode;
2172
2173 gcc_unreachable ();
2174 }
2175
2176 /* Implement TARGET_INSN_CALLEE_ABI. */
2177
2178 const predefined_function_abi &
2179 aarch64_insn_callee_abi (const rtx_insn *insn)
2180 {
2181 rtx pat = PATTERN (insn);
2182 gcc_assert (GET_CODE (pat) == PARALLEL);
2183 rtx unspec = XVECEXP (pat, 0, 1);
2184 gcc_assert (GET_CODE (unspec) == UNSPEC
2185 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2186 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2187 }
2188
2189 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2190 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2191 clobbers the top 64 bits when restoring the bottom 64 bits. */
2192
2193 static bool
2194 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2195 unsigned int regno,
2196 machine_mode mode)
2197 {
2198 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2199 {
2200 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2201 unsigned int nregs = hard_regno_nregs (regno, mode);
2202 if (nregs > 1)
2203 per_register_size = exact_div (per_register_size, nregs);
2204 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2205 return maybe_gt (per_register_size, 16);
2206 return maybe_gt (per_register_size, 8);
2207 }
2208 return false;
2209 }
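
/* For example, a TFmode value (16 bytes) held in V8 is part-clobbered
across a call under the base ARM_PCS_AAPCS64 ABI, because only the low
8 bytes of the register are preserved, but not under ARM_PCS_SIMD,
which preserves the low 16 bytes. SVE modes, whose per-register size
may exceed 16 bytes, are treated as part-clobbered under both of those
ABIs and are only fully preserved under ARM_PCS_SVE. */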
2210
2211 /* Implement REGMODE_NATURAL_SIZE. */
2212 poly_uint64
2213 aarch64_regmode_natural_size (machine_mode mode)
2214 {
2215 /* The natural size for SVE data modes is one SVE data vector,
2216 and similarly for predicates. We can't independently modify
2217 anything smaller than that. */
2218 /* ??? For now, only do this for variable-width SVE registers.
2219 Doing it for constant-sized registers breaks lower-subreg.c. */
2220 /* ??? And once that's fixed, we should probably have similar
2221 code for Advanced SIMD. */
2222 if (!aarch64_sve_vg.is_constant ())
2223 {
2224 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2225 if (vec_flags & VEC_SVE_PRED)
2226 return BYTES_PER_SVE_PRED;
2227 if (vec_flags & VEC_SVE_DATA)
2228 return BYTES_PER_SVE_VECTOR;
2229 }
2230 return UNITS_PER_WORD;
2231 }
2232
2233 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2234 machine_mode
2235 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2236 machine_mode mode)
2237 {
2238 /* The predicate mode determines which bits are significant and
2239 which are "don't care". Decreasing the number of lanes would
2240 lose data while increasing the number of lanes would make bits
2241 unnecessarily significant. */
2242 if (PR_REGNUM_P (regno))
2243 return mode;
2244 if (known_ge (GET_MODE_SIZE (mode), 4))
2245 return mode;
2246 else
2247 return SImode;
2248 }
2249
2250 /* Return true if I's bits are consecutive ones from the MSB. */
2251 bool
2252 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2253 {
2254 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2255 }
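
/* Worked examples: for I == 0xfffffffffffff000, -I == 0x1000 and
exact_log2 returns 12, so the function returns true (the top 52 bits
are consecutive ones). For I == 0xffffffff00000001, -I ==
0x00000000ffffffff, which is not a power of two, so exact_log2 returns
-1 and the function returns false. */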
2256
2257 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2258 that strcpy from constants will be faster. */
2259
2260 static HOST_WIDE_INT
2261 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2262 {
2263 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2264 return MAX (align, BITS_PER_WORD);
2265 return align;
2266 }
2267
2268 /* Return true if calls to DECL should be treated as
2269 long-calls (i.e. called via a register). */
2270 static bool
2271 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2272 {
2273 return false;
2274 }
2275
2276 /* Return true if calls to symbol-ref SYM should be treated as
2277 long-calls (i.e. called via a register). */
2278 bool
2279 aarch64_is_long_call_p (rtx sym)
2280 {
2281 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2282 }
2283
2284 /* Return true if calls to symbol-ref SYM should not go through
2285 plt stubs. */
2286
2287 bool
2288 aarch64_is_noplt_call_p (rtx sym)
2289 {
2290 const_tree decl = SYMBOL_REF_DECL (sym);
2291
2292 if (flag_pic
2293 && decl
2294 && (!flag_plt
2295 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2296 && !targetm.binds_local_p (decl))
2297 return true;
2298
2299 return false;
2300 }
2301
2302 /* Return true if the offsets to a zero/sign-extract operation
2303 represent an expression that matches an extend operation. The
2304 operands represent the parameters from
2305
2306 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2307 bool
2308 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2309 rtx extract_imm)
2310 {
2311 HOST_WIDE_INT mult_val, extract_val;
2312
2313 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2314 return false;
2315
2316 mult_val = INTVAL (mult_imm);
2317 extract_val = INTVAL (extract_imm);
2318
2319 if (extract_val > 8
2320 && extract_val < GET_MODE_BITSIZE (mode)
2321 && exact_log2 (extract_val & ~7) > 0
2322 && (extract_val & 7) <= 4
2323 && mult_val == (1 << (extract_val & 7)))
2324 return true;
2325
2326 return false;
2327 }
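
/* As a rough worked example: with MODE == DImode, MULT_IMM == 4 and
EXTRACT_IMM == 34 we have 34 & ~7 == 32 and 34 & 7 == 2, and
4 == 1 << 2, so the function returns true. That combination appears to
describe a 32-bit value scaled by 4, i.e. the kind of extend-and-shift
operand used by extended-register address and add forms. */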
2328
2329 /* Emit an insn that's a simple single-set. Both the operands must be
2330 known to be valid. */
2331 inline static rtx_insn *
2332 emit_set_insn (rtx x, rtx y)
2333 {
2334 return emit_insn (gen_rtx_SET (x, y));
2335 }
2336
2337 /* X and Y are two things to compare using CODE. Emit the compare insn and
2338 return the rtx for register 0 in the proper mode. */
2339 rtx
2340 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2341 {
2342 machine_mode cmp_mode = GET_MODE (x);
2343 machine_mode cc_mode;
2344 rtx cc_reg;
2345
2346 if (cmp_mode == TImode)
2347 {
2348 gcc_assert (code == NE);
2349
2350 cc_mode = CCmode;
2351 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2352
2353 rtx x_lo = operand_subword (x, 0, 0, TImode);
2354 rtx y_lo = operand_subword (y, 0, 0, TImode);
2355 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2356
2357 rtx x_hi = operand_subword (x, 1, 0, TImode);
2358 rtx y_hi = operand_subword (y, 1, 0, TImode);
2359 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2360 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2361 GEN_INT (AARCH64_EQ)));
2362 }
2363 else
2364 {
2365 cc_mode = SELECT_CC_MODE (code, x, y);
2366 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2367 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2368 }
2369 return cc_reg;
2370 }
2371
2372 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2373
2374 static rtx
2375 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2376 machine_mode y_mode)
2377 {
2378 if (y_mode == E_QImode || y_mode == E_HImode)
2379 {
2380 if (CONST_INT_P (y))
2381 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2382 else
2383 {
2384 rtx t, cc_reg;
2385 machine_mode cc_mode;
2386
2387 t = gen_rtx_ZERO_EXTEND (SImode, y);
2388 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2389 cc_mode = CC_SWPmode;
2390 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2391 emit_set_insn (cc_reg, t);
2392 return cc_reg;
2393 }
2394 }
2395
2396 if (!aarch64_plus_operand (y, y_mode))
2397 y = force_reg (y_mode, y);
2398
2399 return aarch64_gen_compare_reg (code, x, y);
2400 }
2401
2402 /* Build the SYMBOL_REF for __tls_get_addr. */
2403
2404 static GTY(()) rtx tls_get_addr_libfunc;
2405
2406 rtx
2407 aarch64_tls_get_addr (void)
2408 {
2409 if (!tls_get_addr_libfunc)
2410 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2411 return tls_get_addr_libfunc;
2412 }
2413
2414 /* Return the TLS model to use for ADDR. */
2415
2416 static enum tls_model
2417 tls_symbolic_operand_type (rtx addr)
2418 {
2419 enum tls_model tls_kind = TLS_MODEL_NONE;
2420 if (GET_CODE (addr) == CONST)
2421 {
2422 poly_int64 addend;
2423 rtx sym = strip_offset (addr, &addend);
2424 if (GET_CODE (sym) == SYMBOL_REF)
2425 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2426 }
2427 else if (GET_CODE (addr) == SYMBOL_REF)
2428 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2429
2430 return tls_kind;
2431 }
2432
2433 /* We allow LO_SUMs in addresses among our legitimate addresses,
2434 so that combine can take care of combining addresses where
2435 necessary, but for generation purposes we generate the
2436 address as:
2437 RTL Absolute
2438 tmp = hi (symbol_ref); adrp x1, foo
2439 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2440 nop
2441
2442 PIC TLS
2443 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2444 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2445 bl __tls_get_addr
2446 nop
2447
2448 Load TLS symbol, depending on TLS mechanism and TLS access model.
2449
2450 Global Dynamic - Traditional TLS:
2451 adrp tmp, :tlsgd:imm
2452 add dest, tmp, #:tlsgd_lo12:imm
2453 bl __tls_get_addr
2454
2455 Global Dynamic - TLS Descriptors:
2456 adrp dest, :tlsdesc:imm
2457 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2458 add dest, dest, #:tlsdesc_lo12:imm
2459 blr tmp
2460 mrs tp, tpidr_el0
2461 add dest, dest, tp
2462
2463 Initial Exec:
2464 mrs tp, tpidr_el0
2465 adrp tmp, :gottprel:imm
2466 ldr dest, [tmp, #:gottprel_lo12:imm]
2467 add dest, dest, tp
2468
2469 Local Exec:
2470 mrs tp, tpidr_el0
2471 add t0, tp, #:tprel_hi12:imm, lsl #12
2472 add t0, t0, #:tprel_lo12_nc:imm
2473 */
2474
2475 static void
2476 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2477 enum aarch64_symbol_type type)
2478 {
2479 switch (type)
2480 {
2481 case SYMBOL_SMALL_ABSOLUTE:
2482 {
2483 /* In ILP32, the mode of dest can be either SImode or DImode. */
2484 rtx tmp_reg = dest;
2485 machine_mode mode = GET_MODE (dest);
2486
2487 gcc_assert (mode == Pmode || mode == ptr_mode);
2488
2489 if (can_create_pseudo_p ())
2490 tmp_reg = gen_reg_rtx (mode);
2491
2492 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2493 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2494 return;
2495 }
2496
2497 case SYMBOL_TINY_ABSOLUTE:
2498 emit_insn (gen_rtx_SET (dest, imm));
2499 return;
2500
2501 case SYMBOL_SMALL_GOT_28K:
2502 {
2503 machine_mode mode = GET_MODE (dest);
2504 rtx gp_rtx = pic_offset_table_rtx;
2505 rtx insn;
2506 rtx mem;
2507
2508 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2509 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2510 estimate rtx costs, in which case pic_offset_table_rtx is not
2511 initialized. In that case there is no need to generate the first
2512 adrp instruction, since the final cost of a global variable access
2513 is one instruction. */
2514 if (gp_rtx != NULL)
2515 {
2516 /* With -fpic and -mcmodel=small the GOT can be up to 32K in size
2517 (but since we use the page base as the GOT base, the first page
2518 may be wasted; in the worst case only 28K is left for the GOT).
2519
2520 The instruction sequence generated for accessing a global
2521 variable is:
2522
2523 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2524
2525 Only one instruction is needed, but pic_offset_table_rtx must be
2526 initialized properly. We generate an initialization insn for
2527 every global access and rely on CSE to remove the redundant ones.
2528
2529 The final instruction sequence for multiple global variable
2530 accesses will look like:
2531
2532 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2533
2534 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2535 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2536 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2537 ... */
2538
2539 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2540 crtl->uses_pic_offset_table = 1;
2541 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2542
2543 if (mode != GET_MODE (gp_rtx))
2544 gp_rtx = gen_lowpart (mode, gp_rtx);
2545
2546 }
2547
2548 if (mode == ptr_mode)
2549 {
2550 if (mode == DImode)
2551 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2552 else
2553 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2554
2555 mem = XVECEXP (SET_SRC (insn), 0, 0);
2556 }
2557 else
2558 {
2559 gcc_assert (mode == Pmode);
2560
2561 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2562 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2563 }
2564
2565 /* The operand is expected to be a MEM. Whenever the related insn
2566 pattern changes, the code above that computes MEM should be
2567 updated. */
2568 gcc_assert (GET_CODE (mem) == MEM);
2569 MEM_READONLY_P (mem) = 1;
2570 MEM_NOTRAP_P (mem) = 1;
2571 emit_insn (insn);
2572 return;
2573 }
2574
2575 case SYMBOL_SMALL_GOT_4G:
2576 {
2577 /* In ILP32, the mode of dest can be either SImode or DImode,
2578 while the GOT entry is always of SImode size. The mode of
2579 dest depends on how dest is used: if dest is assigned to a
2580 pointer (e.g. stored in memory), it has SImode; it may have
2581 DImode if dest is dereferenced to access memory.
2582 This is why we have to handle three different ldr_got_small
2583 patterns here (two patterns for ILP32). */
2584
2585 rtx insn;
2586 rtx mem;
2587 rtx tmp_reg = dest;
2588 machine_mode mode = GET_MODE (dest);
2589
2590 if (can_create_pseudo_p ())
2591 tmp_reg = gen_reg_rtx (mode);
2592
2593 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2594 if (mode == ptr_mode)
2595 {
2596 if (mode == DImode)
2597 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2598 else
2599 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2600
2601 mem = XVECEXP (SET_SRC (insn), 0, 0);
2602 }
2603 else
2604 {
2605 gcc_assert (mode == Pmode);
2606
2607 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2608 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2609 }
2610
2611 gcc_assert (GET_CODE (mem) == MEM);
2612 MEM_READONLY_P (mem) = 1;
2613 MEM_NOTRAP_P (mem) = 1;
2614 emit_insn (insn);
2615 return;
2616 }
2617
2618 case SYMBOL_SMALL_TLSGD:
2619 {
2620 rtx_insn *insns;
2621 machine_mode mode = GET_MODE (dest);
2622 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2623
2624 start_sequence ();
2625 if (TARGET_ILP32)
2626 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2627 else
2628 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2629 insns = get_insns ();
2630 end_sequence ();
2631
2632 RTL_CONST_CALL_P (insns) = 1;
2633 emit_libcall_block (insns, dest, result, imm);
2634 return;
2635 }
2636
2637 case SYMBOL_SMALL_TLSDESC:
2638 {
2639 machine_mode mode = GET_MODE (dest);
2640 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2641 rtx tp;
2642
2643 gcc_assert (mode == Pmode || mode == ptr_mode);
2644
2645 /* In ILP32, the GOT entry is always of SImode size. Unlike
2646 small GOT, the dest is fixed at reg 0. */
2647 if (TARGET_ILP32)
2648 emit_insn (gen_tlsdesc_small_si (imm));
2649 else
2650 emit_insn (gen_tlsdesc_small_di (imm));
2651 tp = aarch64_load_tp (NULL);
2652
2653 if (mode != Pmode)
2654 tp = gen_lowpart (mode, tp);
2655
2656 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2657 if (REG_P (dest))
2658 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2659 return;
2660 }
2661
2662 case SYMBOL_SMALL_TLSIE:
2663 {
2664 /* In ILP32, the mode of dest can be either SImode or DImode,
2665 while the GOT entry is always of SImode size. The mode of
2666 dest depends on how dest is used: if dest is assigned to a
2667 pointer (e.g. stored in memory), it has SImode; it may have
2668 DImode if dest is dereferenced to access memory.
2669 This is why we have to handle three different tlsie_small
2670 patterns here (two patterns for ILP32). */
2671 machine_mode mode = GET_MODE (dest);
2672 rtx tmp_reg = gen_reg_rtx (mode);
2673 rtx tp = aarch64_load_tp (NULL);
2674
2675 if (mode == ptr_mode)
2676 {
2677 if (mode == DImode)
2678 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2679 else
2680 {
2681 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2682 tp = gen_lowpart (mode, tp);
2683 }
2684 }
2685 else
2686 {
2687 gcc_assert (mode == Pmode);
2688 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2689 }
2690
2691 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2692 if (REG_P (dest))
2693 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2694 return;
2695 }
2696
2697 case SYMBOL_TLSLE12:
2698 case SYMBOL_TLSLE24:
2699 case SYMBOL_TLSLE32:
2700 case SYMBOL_TLSLE48:
2701 {
2702 machine_mode mode = GET_MODE (dest);
2703 rtx tp = aarch64_load_tp (NULL);
2704
2705 if (mode != Pmode)
2706 tp = gen_lowpart (mode, tp);
2707
2708 switch (type)
2709 {
2710 case SYMBOL_TLSLE12:
2711 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2712 (dest, tp, imm));
2713 break;
2714 case SYMBOL_TLSLE24:
2715 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2716 (dest, tp, imm));
2717 break;
2718 case SYMBOL_TLSLE32:
2719 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2720 (dest, imm));
2721 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2722 (dest, dest, tp));
2723 break;
2724 case SYMBOL_TLSLE48:
2725 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2726 (dest, imm));
2727 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2728 (dest, dest, tp));
2729 break;
2730 default:
2731 gcc_unreachable ();
2732 }
2733
2734 if (REG_P (dest))
2735 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2736 return;
2737 }
2738
2739 case SYMBOL_TINY_GOT:
2740 emit_insn (gen_ldr_got_tiny (dest, imm));
2741 return;
2742
2743 case SYMBOL_TINY_TLSIE:
2744 {
2745 machine_mode mode = GET_MODE (dest);
2746 rtx tp = aarch64_load_tp (NULL);
2747
2748 if (mode == ptr_mode)
2749 {
2750 if (mode == DImode)
2751 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2752 else
2753 {
2754 tp = gen_lowpart (mode, tp);
2755 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2756 }
2757 }
2758 else
2759 {
2760 gcc_assert (mode == Pmode);
2761 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2762 }
2763
2764 if (REG_P (dest))
2765 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2766 return;
2767 }
2768
2769 default:
2770 gcc_unreachable ();
2771 }
2772 }
2773
2774 /* Emit a move from SRC to DEST. Assume that the move expanders can
2775 handle all moves if !can_create_pseudo_p (). The distinction is
2776 important because, unlike emit_move_insn, the move expanders know
2777 how to force Pmode objects into the constant pool even when the
2778 constant pool address is not itself legitimate. */
2779 static rtx
2780 aarch64_emit_move (rtx dest, rtx src)
2781 {
2782 return (can_create_pseudo_p ()
2783 ? emit_move_insn (dest, src)
2784 : emit_move_insn_1 (dest, src));
2785 }
2786
2787 /* Apply UNOPTAB to OP and store the result in DEST. */
2788
2789 static void
2790 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2791 {
2792 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2793 if (dest != tmp)
2794 emit_move_insn (dest, tmp);
2795 }
2796
2797 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2798
2799 static void
2800 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2801 {
2802 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2803 OPTAB_DIRECT);
2804 if (dest != tmp)
2805 emit_move_insn (dest, tmp);
2806 }
2807
2808 /* Split a 128-bit move operation into two 64-bit move operations,
2809 taking care to handle partial overlap of register to register
2810 copies. Special cases are needed when moving between GP regs and
2811 FP regs. SRC can be a register, constant or memory; DST a register
2812 or memory. If either operand is memory it must not have any side
2813 effects. */
2814 void
2815 aarch64_split_128bit_move (rtx dst, rtx src)
2816 {
2817 rtx dst_lo, dst_hi;
2818 rtx src_lo, src_hi;
2819
2820 machine_mode mode = GET_MODE (dst);
2821
2822 gcc_assert (mode == TImode || mode == TFmode);
2823 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2824 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2825
2826 if (REG_P (dst) && REG_P (src))
2827 {
2828 int src_regno = REGNO (src);
2829 int dst_regno = REGNO (dst);
2830
2831 /* Handle FP <-> GP regs. */
2832 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2833 {
2834 src_lo = gen_lowpart (word_mode, src);
2835 src_hi = gen_highpart (word_mode, src);
2836
2837 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2838 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2839 return;
2840 }
2841 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2842 {
2843 dst_lo = gen_lowpart (word_mode, dst);
2844 dst_hi = gen_highpart (word_mode, dst);
2845
2846 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2847 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2848 return;
2849 }
2850 }
2851
2852 dst_lo = gen_lowpart (word_mode, dst);
2853 dst_hi = gen_highpart (word_mode, dst);
2854 src_lo = gen_lowpart (word_mode, src);
2855 src_hi = gen_highpart_mode (word_mode, mode, src);
2856
2857 /* At most one pairing may overlap. */
2858 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2859 {
2860 aarch64_emit_move (dst_hi, src_hi);
2861 aarch64_emit_move (dst_lo, src_lo);
2862 }
2863 else
2864 {
2865 aarch64_emit_move (dst_lo, src_lo);
2866 aarch64_emit_move (dst_hi, src_hi);
2867 }
2868 }
2869
2870 bool
2871 aarch64_split_128bit_move_p (rtx dst, rtx src)
2872 {
2873 return (! REG_P (src)
2874 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2875 }
2876
2877 /* Split a complex SIMD combine. */
2878
2879 void
2880 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2881 {
2882 machine_mode src_mode = GET_MODE (src1);
2883 machine_mode dst_mode = GET_MODE (dst);
2884
2885 gcc_assert (VECTOR_MODE_P (dst_mode));
2886 gcc_assert (register_operand (dst, dst_mode)
2887 && register_operand (src1, src_mode)
2888 && register_operand (src2, src_mode));
2889
2890 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2891 return;
2892 }
2893
2894 /* Split a complex SIMD move. */
2895
2896 void
2897 aarch64_split_simd_move (rtx dst, rtx src)
2898 {
2899 machine_mode src_mode = GET_MODE (src);
2900 machine_mode dst_mode = GET_MODE (dst);
2901
2902 gcc_assert (VECTOR_MODE_P (dst_mode));
2903
2904 if (REG_P (dst) && REG_P (src))
2905 {
2906 gcc_assert (VECTOR_MODE_P (src_mode));
2907 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2908 }
2909 }
2910
2911 bool
2912 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2913 machine_mode ymode, rtx y)
2914 {
2915 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2916 gcc_assert (r != NULL);
2917 return rtx_equal_p (x, r);
2918 }
2919
2920 /* Return TARGET if it is nonnull and a register of mode MODE.
2921 Otherwise, return a fresh register of mode MODE if we can,
2922 or TARGET reinterpreted as MODE if we can't. */
2923
2924 static rtx
2925 aarch64_target_reg (rtx target, machine_mode mode)
2926 {
2927 if (target && REG_P (target) && GET_MODE (target) == mode)
2928 return target;
2929 if (!can_create_pseudo_p ())
2930 {
2931 gcc_assert (target);
2932 return gen_lowpart (mode, target);
2933 }
2934 return gen_reg_rtx (mode);
2935 }
2936
2937 /* Return a register that contains the constant in BUILDER, given that
2938 the constant is a legitimate move operand. Use TARGET as the register
2939 if it is nonnull and convenient. */
2940
2941 static rtx
2942 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2943 {
2944 rtx src = builder.build ();
2945 target = aarch64_target_reg (target, GET_MODE (src));
2946 emit_insn (gen_rtx_SET (target, src));
2947 return target;
2948 }
2949
2950 static rtx
2951 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2952 {
2953 if (can_create_pseudo_p ())
2954 return force_reg (mode, value);
2955 else
2956 {
2957 gcc_assert (x);
2958 aarch64_emit_move (x, value);
2959 return x;
2960 }
2961 }
2962
2963 /* Return true if predicate value X is a constant in which every element
2964 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2965 value, i.e. as a predicate in which all bits are significant. */
2966
2967 static bool
2968 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2969 {
2970 if (GET_CODE (x) != CONST_VECTOR)
2971 return false;
2972
2973 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2974 GET_MODE_NUNITS (GET_MODE (x)));
2975 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2976 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2977 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2978
2979 unsigned int nelts = const_vector_encoded_nelts (x);
2980 for (unsigned int i = 0; i < nelts; ++i)
2981 {
2982 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2983 if (!CONST_INT_P (elt))
2984 return false;
2985
2986 builder.quick_push (elt);
2987 for (unsigned int j = 1; j < factor; ++j)
2988 builder.quick_push (const0_rtx);
2989 }
2990 builder.finalize ();
2991 return true;
2992 }
2993
2994 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2995 widest predicate element size it can have (that is, the largest size
2996 for which each element would still be 0 or 1). */
2997
2998 unsigned int
2999 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3000 {
3001 /* Start with the most optimistic assumption: that we only need
3002 one bit per pattern. This is what we will use if only the first
3003 bit in each pattern is ever set. */
3004 unsigned int mask = GET_MODE_SIZE (DImode);
3005 mask |= builder.npatterns ();
3006
3007 /* Look for set bits. */
3008 unsigned int nelts = builder.encoded_nelts ();
3009 for (unsigned int i = 1; i < nelts; ++i)
3010 if (INTVAL (builder.elt (i)) != 0)
3011 {
3012 if (i & 1)
3013 return 1;
3014 mask |= i;
3015 }
3016 return mask & -mask;
3017 }
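
/* Worked example: MASK starts as 8 (the widest possible element size
in bytes). If BUILDER has 4 patterns and the only set bits appear at
indices that are multiples of 4, then MASK becomes 8 | 4 | (multiples
of 4) and MASK & -MASK is 4, so the predicate can be treated as having
4-byte (.s) elements. A set bit at any odd index forces the answer
down to 1 immediately. */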
3018
3019 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3020 return that predicate mode, otherwise return opt_machine_mode (). */
3021
3022 opt_machine_mode
3023 aarch64_ptrue_all_mode (rtx x)
3024 {
3025 gcc_assert (GET_MODE (x) == VNx16BImode);
3026 if (GET_CODE (x) != CONST_VECTOR
3027 || !CONST_VECTOR_DUPLICATE_P (x)
3028 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3029 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3030 return opt_machine_mode ();
3031
3032 unsigned int nelts = const_vector_encoded_nelts (x);
3033 for (unsigned int i = 1; i < nelts; ++i)
3034 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3035 return opt_machine_mode ();
3036
3037 return aarch64_sve_pred_mode (nelts);
3038 }
3039
3040 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3041 that the constant would have with predicate element size ELT_SIZE
3042 (ignoring the upper bits in each element) and return:
3043
3044 * -1 if all bits are set
3045 * N if the predicate has N leading set bits followed by all clear bits
3046 * 0 if the predicate does not have any of these forms. */
3047
3048 int
3049 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3050 unsigned int elt_size)
3051 {
3052 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3053 followed by set bits. */
3054 if (builder.nelts_per_pattern () == 3)
3055 return 0;
3056
3057 /* Skip over leading set bits. */
3058 unsigned int nelts = builder.encoded_nelts ();
3059 unsigned int i = 0;
3060 for (; i < nelts; i += elt_size)
3061 if (INTVAL (builder.elt (i)) == 0)
3062 break;
3063 unsigned int vl = i / elt_size;
3064
3065 /* Check for the all-true case. */
3066 if (i == nelts)
3067 return -1;
3068
3069 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3070 repeating pattern of set bits followed by clear bits. */
3071 if (builder.nelts_per_pattern () != 2)
3072 return 0;
3073
3074 /* We have a "foreground" value and a duplicated "background" value.
3075 If the background might repeat and the last set bit belongs to it,
3076 we might have set bits followed by clear bits followed by set bits. */
3077 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3078 return 0;
3079
3080 /* Make sure that the rest are all clear. */
3081 for (; i < nelts; i += elt_size)
3082 if (INTVAL (builder.elt (i)) != 0)
3083 return 0;
3084
3085 return vl;
3086 }
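
/* For example, with ELT_SIZE == 1, a constant whose first 6 bits are
set and whose remaining bits are clear gives 6, an all-ones constant
gives -1, and a constant in which a set bit reappears after the first
clear bit gives 0. With ELT_SIZE == 2, only every other bit is
inspected, so the result counts 16-bit predicate elements instead. */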
3087
3088 /* See if there is an svpattern that encodes an SVE predicate of mode
3089 PRED_MODE in which the first VL bits are set and the rest are clear.
3090 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3091 A VL of -1 indicates an all-true vector. */
3092
3093 aarch64_svpattern
3094 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3095 {
3096 if (vl < 0)
3097 return AARCH64_SV_ALL;
3098
3099 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3100 return AARCH64_NUM_SVPATTERNS;
3101
3102 if (vl >= 1 && vl <= 8)
3103 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3104
3105 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3106 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3107
3108 int max_vl;
3109 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3110 {
3111 if (vl == (max_vl / 3) * 3)
3112 return AARCH64_SV_MUL3;
3113 /* These would only trigger for non-power-of-2 lengths. */
3114 if (vl == (max_vl & -4))
3115 return AARCH64_SV_MUL4;
3116 if (vl == (1 << floor_log2 (max_vl)))
3117 return AARCH64_SV_POW2;
3118 if (vl == max_vl)
3119 return AARCH64_SV_ALL;
3120 }
3121 return AARCH64_NUM_SVPATTERNS;
3122 }
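
/* Some sample mappings: VL == -1 gives AARCH64_SV_ALL, VL == 3 gives
AARCH64_SV_VL3 and VL == 64 gives AARCH64_SV_VL64. A length such as
VL == 11, which matches no pattern (unless one of the constant-length
checks above happens to apply), gives AARCH64_NUM_SVPATTERNS. */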
3123
3124 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3125 bits has the lowest bit set and the upper bits clear. This is the
3126 VNx16BImode equivalent of a PTRUE for controlling elements of
3127 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3128 all bits are significant, even the upper zeros. */
3129
3130 rtx
3131 aarch64_ptrue_all (unsigned int elt_size)
3132 {
3133 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3134 builder.quick_push (const1_rtx);
3135 for (unsigned int i = 1; i < elt_size; ++i)
3136 builder.quick_push (const0_rtx);
3137 return builder.build ();
3138 }
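
/* For instance, aarch64_ptrue_all (4) should build the repeating
VNx16BImode pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }, which is the
value that a PTRUE with .s (4-byte) elements leaves in a predicate
register, with the upper bits of each element explicitly zero. */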
3139
3140 /* Return an all-true predicate register of mode MODE. */
3141
3142 rtx
3143 aarch64_ptrue_reg (machine_mode mode)
3144 {
3145 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3146 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3147 return gen_lowpart (mode, reg);
3148 }
3149
3150 /* Return an all-false predicate register of mode MODE. */
3151
3152 rtx
3153 aarch64_pfalse_reg (machine_mode mode)
3154 {
3155 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3156 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3157 return gen_lowpart (mode, reg);
3158 }
3159
3160 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3161 true, or alternatively if we know that the operation predicated by
3162 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
3163 aarch64_sve_gp_strictness operand that describes the operation
3164 predicated by PRED1[0]. */
3165
3166 bool
3167 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3168 {
3169 machine_mode mode = GET_MODE (pred2);
3170 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3171 && mode == GET_MODE (pred1[0])
3172 && aarch64_sve_gp_strictness (pred1[1], SImode));
3173 return (pred1[0] == CONSTM1_RTX (mode)
3174 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3175 || rtx_equal_p (pred1[0], pred2));
3176 }
3177
3178 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3179 for it. PRED2[0] is the predicate for the instruction whose result
3180 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3181 for it. Return true if we can prove that the two predicates are
3182 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3183 with PRED1[0] without changing behavior. */
3184
3185 bool
3186 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3187 {
3188 machine_mode mode = GET_MODE (pred1[0]);
3189 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3190 && mode == GET_MODE (pred2[0])
3191 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3192 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3193
3194 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3195 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3196 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3197 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3198 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3199 }
3200
3201 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3202 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3203 Use TARGET as the target register if nonnull and convenient. */
3204
3205 static rtx
3206 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3207 machine_mode data_mode, rtx op1, rtx op2)
3208 {
3209 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3210 expand_operand ops[5];
3211 create_output_operand (&ops[0], target, pred_mode);
3212 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3213 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3214 create_input_operand (&ops[3], op1, data_mode);
3215 create_input_operand (&ops[4], op2, data_mode);
3216 expand_insn (icode, 5, ops);
3217 return ops[0].value;
3218 }
3219
3220 /* Use a comparison to convert integer vector SRC into MODE, which is
3221 the corresponding SVE predicate mode. Use TARGET for the result
3222 if it's nonnull and convenient. */
3223
3224 rtx
3225 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3226 {
3227 machine_mode src_mode = GET_MODE (src);
3228 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3229 src, CONST0_RTX (src_mode));
3230 }
3231
3232 /* Return the assembly token for svprfop value PRFOP. */
3233
3234 static const char *
3235 svprfop_token (enum aarch64_svprfop prfop)
3236 {
3237 switch (prfop)
3238 {
3239 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3240 AARCH64_FOR_SVPRFOP (CASE)
3241 #undef CASE
3242 case AARCH64_NUM_SVPRFOPS:
3243 break;
3244 }
3245 gcc_unreachable ();
3246 }
3247
3248 /* Return the assembly string for an SVE prefetch operation with
3249 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3250 and that SUFFIX is the format for the remaining operands. */
3251
3252 char *
3253 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3254 const char *suffix)
3255 {
3256 static char buffer[128];
3257 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3258 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3259 mnemonic, svprfop_token (prfop), suffix);
3260 gcc_assert (written < sizeof (buffer));
3261 return buffer;
3262 }
3263
3264 /* Check whether we can calculate the number of elements in PATTERN
3265 at compile time, given that there are NELTS_PER_VQ elements per
3266 128-bit block. Return the value if so, otherwise return -1. */
3267
3268 HOST_WIDE_INT
3269 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3270 {
3271 unsigned int vl, const_vg;
3272 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3273 vl = 1 + (pattern - AARCH64_SV_VL1);
3274 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3275 vl = 16 << (pattern - AARCH64_SV_VL16);
3276 else if (aarch64_sve_vg.is_constant (&const_vg))
3277 {
3278 /* There are two vector granules per quadword. */
3279 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3280 switch (pattern)
3281 {
3282 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3283 case AARCH64_SV_MUL4: return nelts & -4;
3284 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3285 case AARCH64_SV_ALL: return nelts;
3286 default: gcc_unreachable ();
3287 }
3288 }
3289 else
3290 return -1;
3291
3292 /* There are two vector granules per quadword. */
3293 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3294 if (known_le (vl, nelts_all))
3295 return vl;
3296
3297 /* Requesting more elements than are available results in a PFALSE. */
3298 if (known_gt (vl, nelts_all))
3299 return 0;
3300
3301 return -1;
3302 }
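
/* Worked example, assuming 256-bit vectors (so aarch64_sve_vg is the
constant 4) and NELTS_PER_VQ == 4 (.s elements): there are 8 elements
per vector, so AARCH64_SV_ALL folds to 8, AARCH64_SV_MUL3 to 6,
AARCH64_SV_POW2 to 8 and AARCH64_SV_VL5 to 5, while AARCH64_SV_VL16
asks for more elements than exist and therefore folds to 0. */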
3303
3304 /* Return true if we can move VALUE into a register using a single
3305 CNT[BHWD] instruction. */
3306
3307 static bool
3308 aarch64_sve_cnt_immediate_p (poly_int64 value)
3309 {
3310 HOST_WIDE_INT factor = value.coeffs[0];
3311 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3312 return (value.coeffs[1] == factor
3313 && IN_RANGE (factor, 2, 16 * 16)
3314 && (factor & 1) == 0
3315 && factor <= 16 * (factor & -factor));
3316 }
3317
3318 /* Likewise for rtx X. */
3319
3320 bool
3321 aarch64_sve_cnt_immediate_p (rtx x)
3322 {
3323 poly_int64 value;
3324 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3325 }
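
/* Examples of values that satisfy this predicate: BYTES_PER_SVE_VECTOR
itself (the poly_int 16 + 16x, loadable with a plain CNTB), 2 + 2x
(the number of doublewords, i.e. CNTD) and 12 + 12x (CNTW with
"mul #3"). A value such as 3 + 3x fails because the coefficient is
odd, and 16 + 8x fails because it is not a whole multiple of the
vector length. */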
3326
3327 /* Return the asm string for an instruction with a CNT-like vector size
3328 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3329 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3330 first part of the operands template (the part that comes before the
3331 vector size itself). PATTERN is the pattern to use. FACTOR is the
3332 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3333 in each quadword. If it is zero, we can use any element size. */
3334
3335 static char *
3336 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3337 aarch64_svpattern pattern,
3338 unsigned int factor,
3339 unsigned int nelts_per_vq)
3340 {
3341 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3342
3343 if (nelts_per_vq == 0)
3344 /* There is some overlap in the ranges of the four CNT instructions.
3345 Here we always use the smallest possible element size, so that the
3346 multiplier is 1 wherever possible. */
3347 nelts_per_vq = factor & -factor;
3348 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3349 gcc_assert (IN_RANGE (shift, 1, 4));
3350 char suffix = "dwhb"[shift - 1];
3351
3352 factor >>= shift;
3353 unsigned int written;
3354 if (pattern == AARCH64_SV_ALL && factor == 1)
3355 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3356 prefix, suffix, operands);
3357 else if (factor == 1)
3358 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3359 prefix, suffix, operands, svpattern_token (pattern));
3360 else
3361 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3362 prefix, suffix, operands, svpattern_token (pattern),
3363 factor);
3364 gcc_assert (written < sizeof (buffer));
3365 return buffer;
3366 }
3367
3368 /* Return the asm string for an instruction with a CNT-like vector size
3369 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3370 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3371 first part of the operands template (the part that comes before the
3372 vector size itself). X is the value of the vector size operand,
3373 as a polynomial integer rtx; we need to convert this into an "all"
3374 pattern with a multiplier. */
3375
3376 char *
3377 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3378 rtx x)
3379 {
3380 poly_int64 value = rtx_to_poly_int64 (x);
3381 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3382 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3383 value.coeffs[1], 0);
3384 }
3385
3386 /* Return the asm string for an instruction with a CNT-like vector size
3387 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3388 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3389 first part of the operands template (the part that comes before the
3390 vector size itself). CNT_PAT[0..2] are the operands of the
3391 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3392
3393 char *
3394 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3395 const char *operands, rtx *cnt_pat)
3396 {
3397 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3398 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3399 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3400 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3401 factor, nelts_per_vq);
3402 }
3403
3404 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3405
3406 bool
3407 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3408 {
3409 poly_int64 value;
3410 return (poly_int_rtx_p (x, &value)
3411 && (aarch64_sve_cnt_immediate_p (value)
3412 || aarch64_sve_cnt_immediate_p (-value)));
3413 }
3414
3415 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3416 operand 0. */
3417
3418 char *
3419 aarch64_output_sve_scalar_inc_dec (rtx offset)
3420 {
3421 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3422 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3423 if (offset_value.coeffs[1] > 0)
3424 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3425 offset_value.coeffs[1], 0);
3426 else
3427 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3428 -offset_value.coeffs[1], 0);
3429 }
3430
3431 /* Return true if we can add VALUE to a register using a single ADDVL
3432 or ADDPL instruction. */
3433
3434 static bool
3435 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3436 {
3437 HOST_WIDE_INT factor = value.coeffs[0];
3438 if (factor == 0 || value.coeffs[1] != factor)
3439 return false;
3440 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3441 and a value of 16 is one vector width. */
3442 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3443 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3444 }
3445
3446 /* Likewise for rtx X. */
3447
3448 bool
3449 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3450 {
3451 poly_int64 value;
3452 return (poly_int_rtx_p (x, &value)
3453 && aarch64_sve_addvl_addpl_immediate_p (value));
3454 }
3455
3456 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3457 to operand 1 and storing the result in operand 0. */
3458
3459 char *
3460 aarch64_output_sve_addvl_addpl (rtx offset)
3461 {
3462 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3463 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3464 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3465
3466 int factor = offset_value.coeffs[1];
3467 if ((factor & 15) == 0)
3468 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3469 else
3470 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3471 return buffer;
3472 }
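
/* Illustrative outputs: an OFFSET of 16 + 16x (one vector length in
bytes) produces "addvl\t%x0, %x1, #1", an OFFSET of 6 + 6x produces
"addpl\t%x0, %x1, #3", and an OFFSET of -2 - 2x (minus one predicate
length) produces "addpl\t%x0, %x1, #-1". */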
3473
3474 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3475 instruction. If it is, store the number of elements in each vector
3476 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3477 factor in *FACTOR_OUT (if nonnull). */
3478
3479 bool
3480 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3481 unsigned int *nelts_per_vq_out)
3482 {
3483 rtx elt;
3484 poly_int64 value;
3485
3486 if (!const_vec_duplicate_p (x, &elt)
3487 || !poly_int_rtx_p (elt, &value))
3488 return false;
3489
3490 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3491 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3492 /* There's no vector INCB. */
3493 return false;
3494
3495 HOST_WIDE_INT factor = value.coeffs[0];
3496 if (value.coeffs[1] != factor)
3497 return false;
3498
3499 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3500 if ((factor % nelts_per_vq) != 0
3501 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3502 return false;
3503
3504 if (factor_out)
3505 *factor_out = factor;
3506 if (nelts_per_vq_out)
3507 *nelts_per_vq_out = nelts_per_vq;
3508 return true;
3509 }
3510
3511 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3512 instruction. */
3513
3514 bool
3515 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3516 {
3517 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3518 }
3519
3520 /* Return the asm template for an SVE vector INC or DEC instruction.
3521 OPERANDS gives the operands before the vector count and X is the
3522 value of the vector count operand itself. */
3523
3524 char *
3525 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3526 {
3527 int factor;
3528 unsigned int nelts_per_vq;
3529 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3530 gcc_unreachable ();
3531 if (factor < 0)
3532 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3533 -factor, nelts_per_vq);
3534 else
3535 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3536 factor, nelts_per_vq);
3537 }
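
/* For example, if X is a VNx4SImode vector in which every element is
the poly_int 8 + 8x, then NELTS_PER_VQ is 4 and FACTOR is 8, giving
"incw\t<operands>, all, mul #2"; the corresponding negative constant
-4 - 4x gives a plain "decw\t<operands>". */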
3538
3539 static int
3540 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3541 scalar_int_mode mode)
3542 {
3543 int i;
3544 unsigned HOST_WIDE_INT val, val2, mask;
3545 int one_match, zero_match;
3546 int num_insns;
3547
3548 val = INTVAL (imm);
3549
3550 if (aarch64_move_imm (val, mode))
3551 {
3552 if (generate)
3553 emit_insn (gen_rtx_SET (dest, imm));
3554 return 1;
3555 }
3556
3557 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3558 (with XXXX non-zero). In that case check to see if the move can be done in
3559 a smaller mode. */
3560 val2 = val & 0xffffffff;
3561 if (mode == DImode
3562 && aarch64_move_imm (val2, SImode)
3563 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3564 {
3565 if (generate)
3566 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3567
3568 /* Check if we have to emit a second instruction by checking to see
3569 if any of the upper 32 bits of the original DI mode value is set. */
3570 if (val == val2)
3571 return 1;
3572
3573 i = (val >> 48) ? 48 : 32;
3574
3575 if (generate)
3576 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3577 GEN_INT ((val >> i) & 0xffff)));
3578
3579 return 2;
3580 }
3581
3582 if ((val >> 32) == 0 || mode == SImode)
3583 {
3584 if (generate)
3585 {
3586 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3587 if (mode == SImode)
3588 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3589 GEN_INT ((val >> 16) & 0xffff)));
3590 else
3591 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3592 GEN_INT ((val >> 16) & 0xffff)));
3593 }
3594 return 2;
3595 }
3596
3597 /* Remaining cases are all for DImode. */
3598
3599 mask = 0xffff;
3600 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3601 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3602 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3603 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3604
3605 if (zero_match != 2 && one_match != 2)
3606 {
3607 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3608 For a 64-bit bitmask try whether changing 16 bits to all ones or
3609 zeroes creates a valid bitmask. To check any repeated bitmask,
3610 try using 16 bits from the other 32-bit half of val. */
3611
3612 for (i = 0; i < 64; i += 16, mask <<= 16)
3613 {
3614 val2 = val & ~mask;
3615 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3616 break;
3617 val2 = val | mask;
3618 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3619 break;
3620 val2 = val2 & ~mask;
3621 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3622 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3623 break;
3624 }
3625 if (i != 64)
3626 {
3627 if (generate)
3628 {
3629 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3630 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3631 GEN_INT ((val >> i) & 0xffff)));
3632 }
3633 return 2;
3634 }
3635 }
3636
3637 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3638 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3639 otherwise skip zero bits. */
3640
3641 num_insns = 1;
3642 mask = 0xffff;
3643 val2 = one_match > zero_match ? ~val : val;
3644 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3645
3646 if (generate)
3647 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3648 ? (val | ~(mask << i))
3649 : (val & (mask << i)))));
3650 for (i += 16; i < 64; i += 16)
3651 {
3652 if ((val2 & (mask << i)) == 0)
3653 continue;
3654 if (generate)
3655 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3656 GEN_INT ((val >> i) & 0xffff)));
3657 num_insns++;
3658 }
3659
3660 return num_insns;
3661 }
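
/* Worked example (constant invented for this comment): the DImode value
   0x0000cafe00001234 is not a single MOV immediate, but its low 32 bits
   are, and bits [63:48] are zero, so the function returns 2 and, with
   GENERATE set, emits a MOV of 0x1234 followed by a MOVK inserting
   0xcafe into bits [47:32].  */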
3662
3663 /* Return whether imm is a 128-bit immediate which is simple enough to
3664 expand inline. */
3665 bool
3666 aarch64_mov128_immediate (rtx imm)
3667 {
3668 if (GET_CODE (imm) == CONST_INT)
3669 return true;
3670
3671 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3672
3673 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3674 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3675
3676 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3677 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3678 }
3679
3680
3681 /* Return the number of temporary registers that aarch64_add_offset_1
3682 would need to add OFFSET to a register. */
3683
3684 static unsigned int
3685 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3686 {
3687 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3688 }
3689
3690 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3691 a non-polynomial OFFSET. MODE is the mode of the addition.
3692 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3693 be set and CFA adjustments added to the generated instructions.
3694
3695 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3696 temporary if register allocation is already complete. This temporary
3697 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3698 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3699 the immediate again.
3700
3701 Since this function may be used to adjust the stack pointer, we must
3702 ensure that it cannot cause transient stack deallocation (for example
3703 by first incrementing SP and then decrementing when adjusting by a
3704 large immediate). */
3705
3706 static void
3707 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3708 rtx src, HOST_WIDE_INT offset, rtx temp1,
3709 bool frame_related_p, bool emit_move_imm)
3710 {
3711 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3712 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3713
3714 HOST_WIDE_INT moffset = abs_hwi (offset);
3715 rtx_insn *insn;
3716
3717 if (!moffset)
3718 {
3719 if (!rtx_equal_p (dest, src))
3720 {
3721 insn = emit_insn (gen_rtx_SET (dest, src));
3722 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3723 }
3724 return;
3725 }
3726
3727 /* Single instruction adjustment. */
3728 if (aarch64_uimm12_shift (moffset))
3729 {
3730 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3731 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3732 return;
3733 }
3734
3735 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3736 and either:
3737
3738 a) the offset cannot be loaded by a 16-bit move or
3739 b) there is no spare register into which we can move it. */
3740 if (moffset < 0x1000000
3741 && ((!temp1 && !can_create_pseudo_p ())
3742 || !aarch64_move_imm (moffset, mode)))
3743 {
3744 HOST_WIDE_INT low_off = moffset & 0xfff;
3745
3746 low_off = offset < 0 ? -low_off : low_off;
3747 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3748 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3749 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3750 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3751 return;
3752 }
3753
3754 /* Emit a move immediate if required and an addition/subtraction. */
3755 if (emit_move_imm)
3756 {
3757 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3758 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3759 }
3760 insn = emit_insn (offset < 0
3761 ? gen_sub3_insn (dest, src, temp1)
3762 : gen_add3_insn (dest, src, temp1));
3763 if (frame_related_p)
3764 {
3765 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3766 rtx adj = plus_constant (mode, src, offset);
3767 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3768 }
3769 }
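
/* Sketch of the two-addition path above (offset chosen for illustration):
   adding 0x123456 with no spare temporary splits into an ADD of #0x456
   followed by an ADD of #0x123000, both of which fit the (optionally
   shifted) 12-bit immediate form, and neither step moves the register
   past its final value.  */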
3770
3771 /* Return the number of temporary registers that aarch64_add_offset
3772 would need to move OFFSET into a register or add OFFSET to a register;
3773 ADD_P is true if we want the latter rather than the former. */
3774
3775 static unsigned int
3776 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3777 {
3778 /* This follows the same structure as aarch64_add_offset. */
3779 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3780 return 0;
3781
3782 unsigned int count = 0;
3783 HOST_WIDE_INT factor = offset.coeffs[1];
3784 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3785 poly_int64 poly_offset (factor, factor);
3786 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3787 /* Need one register for the ADDVL/ADDPL result. */
3788 count += 1;
3789 else if (factor != 0)
3790 {
3791 factor = abs (factor);
3792 if (factor > 16 * (factor & -factor))
3793 /* Need one register for the CNT result and one for the multiplication
3794 factor. If necessary, the second temporary can be reused for the
3795 constant part of the offset. */
3796 return 2;
3797 /* Need one register for the CNT result (which might then
3798 be shifted). */
3799 count += 1;
3800 }
3801 return count + aarch64_add_offset_1_temporaries (constant);
3802 }
3803
3804 /* If X can be represented as a poly_int64, return the number
3805 of temporaries that are required to add it to a register.
3806 Return -1 otherwise. */
3807
3808 int
3809 aarch64_add_offset_temporaries (rtx x)
3810 {
3811 poly_int64 offset;
3812 if (!poly_int_rtx_p (x, &offset))
3813 return -1;
3814 return aarch64_offset_temporaries (true, offset);
3815 }
3816
3817 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3818 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3819 be set and CFA adjustments added to the generated instructions.
3820
3821 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3822 temporary if register allocation is already complete. This temporary
3823 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3824 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3825 false to avoid emitting the immediate again.
3826
3827 TEMP2, if nonnull, is a second temporary register that doesn't
3828 overlap either DEST or SRC.
3829
3830 Since this function may be used to adjust the stack pointer, we must
3831 ensure that it cannot cause transient stack deallocation (for example
3832 by first incrementing SP and then decrementing when adjusting by a
3833 large immediate). */
3834
3835 static void
3836 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3837 poly_int64 offset, rtx temp1, rtx temp2,
3838 bool frame_related_p, bool emit_move_imm = true)
3839 {
3840 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3841 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3842 gcc_assert (temp1 == NULL_RTX
3843 || !frame_related_p
3844 || !reg_overlap_mentioned_p (temp1, dest));
3845 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3846
3847 /* Try using ADDVL or ADDPL to add the whole value. */
3848 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3849 {
3850 rtx offset_rtx = gen_int_mode (offset, mode);
3851 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3853 return;
3854 }
3855
3856 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3857 SVE vector register, over and above the minimum size of 128 bits.
3858 This is equivalent to half the value returned by CNTD with a
3859 vector shape of ALL. */
3860 HOST_WIDE_INT factor = offset.coeffs[1];
3861 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3862
3863 /* Try using ADDVL or ADDPL to add the VG-based part. */
3864 poly_int64 poly_offset (factor, factor);
3865 if (src != const0_rtx
3866 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3867 {
3868 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3869 if (frame_related_p)
3870 {
3871 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3872 RTX_FRAME_RELATED_P (insn) = true;
3873 src = dest;
3874 }
3875 else
3876 {
3877 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3878 src = aarch64_force_temporary (mode, temp1, addr);
3879 temp1 = temp2;
3880 temp2 = NULL_RTX;
3881 }
3882 }
3883 /* Otherwise use a CNT-based sequence. */
3884 else if (factor != 0)
3885 {
3886 /* Use a subtraction if we have a negative factor. */
3887 rtx_code code = PLUS;
3888 if (factor < 0)
3889 {
3890 factor = -factor;
3891 code = MINUS;
3892 }
3893
3894 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3895 into the multiplication. */
3896 rtx val;
3897 int shift = 0;
3898 if (factor & 1)
3899 /* Use a right shift by 1. */
3900 shift = -1;
3901 else
3902 factor /= 2;
3903 HOST_WIDE_INT low_bit = factor & -factor;
3904 if (factor <= 16 * low_bit)
3905 {
3906 if (factor > 16 * 8)
3907 {
3908 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3909 the value with the minimum multiplier and shift it into
3910 position. */
3911 int extra_shift = exact_log2 (low_bit);
3912 shift += extra_shift;
3913 factor >>= extra_shift;
3914 }
3915 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3916 }
3917 else
3918 {
3919 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3920 directly, since that should increase the chances of being
3921 able to use a shift and add sequence. If LOW_BIT itself
3922 is out of range, just use CNTD. */
3923 if (low_bit <= 16 * 8)
3924 factor /= low_bit;
3925 else
3926 low_bit = 1;
3927
3928 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3929 val = aarch64_force_temporary (mode, temp1, val);
3930
3931 if (can_create_pseudo_p ())
3932 {
3933 rtx coeff1 = gen_int_mode (factor, mode);
3934 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3935 }
3936 else
3937 {
3938 /* Go back to using a negative multiplication factor if we have
3939 no register from which to subtract. */
3940 if (code == MINUS && src == const0_rtx)
3941 {
3942 factor = -factor;
3943 code = PLUS;
3944 }
3945 rtx coeff1 = gen_int_mode (factor, mode);
3946 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3947 val = gen_rtx_MULT (mode, val, coeff1);
3948 }
3949 }
3950
3951 if (shift > 0)
3952 {
3953 /* Multiply by 1 << SHIFT. */
3954 val = aarch64_force_temporary (mode, temp1, val);
3955 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3956 }
3957 else if (shift == -1)
3958 {
3959 /* Divide by 2. */
3960 val = aarch64_force_temporary (mode, temp1, val);
3961 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3962 }
3963
3964 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3965 if (src != const0_rtx)
3966 {
3967 val = aarch64_force_temporary (mode, temp1, val);
3968 val = gen_rtx_fmt_ee (code, mode, src, val);
3969 }
3970 else if (code == MINUS)
3971 {
3972 val = aarch64_force_temporary (mode, temp1, val);
3973 val = gen_rtx_NEG (mode, val);
3974 }
3975
3976 if (constant == 0 || frame_related_p)
3977 {
3978 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3979 if (frame_related_p)
3980 {
3981 RTX_FRAME_RELATED_P (insn) = true;
3982 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3983 gen_rtx_SET (dest, plus_constant (Pmode, src,
3984 poly_offset)));
3985 }
3986 src = dest;
3987 if (constant == 0)
3988 return;
3989 }
3990 else
3991 {
3992 src = aarch64_force_temporary (mode, temp1, val);
3993 temp1 = temp2;
3994 temp2 = NULL_RTX;
3995 }
3996
3997 emit_move_imm = true;
3998 }
3999
4000 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4001 frame_related_p, emit_move_imm);
4002 }
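
/* Worked example (offset chosen for illustration, and assuming SRC is a
   register rather than const0_rtx): adding one full SVE vector, i.e.
   poly_int64 (16, 16), has FACTOR == 16 and CONSTANT == 0, so the whole
   adjustment is caught by the first ADDVL/ADDPL test and becomes a
   single ADDVL with immediate #1.  */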
4003
4004 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4005 than a poly_int64. */
4006
4007 void
4008 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4009 rtx offset_rtx, rtx temp1, rtx temp2)
4010 {
4011 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4012 temp1, temp2, false);
4013 }
4014
4015 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4016 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4017 if TEMP1 already contains abs (DELTA). */
4018
4019 static inline void
4020 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4021 {
4022 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4023 temp1, temp2, true, emit_move_imm);
4024 }
4025
4026 /* Subtract DELTA from the stack pointer, marking the instructions
4027 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4028 if nonnull. */
4029
4030 static inline void
4031 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4032 bool emit_move_imm = true)
4033 {
4034 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4035 temp1, temp2, frame_related_p, emit_move_imm);
4036 }
4037
4038 /* Set DEST to (vec_series BASE STEP). */
4039
4040 static void
4041 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4042 {
4043 machine_mode mode = GET_MODE (dest);
4044 scalar_mode inner = GET_MODE_INNER (mode);
4045
4046 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4047 if (!aarch64_sve_index_immediate_p (base))
4048 base = force_reg (inner, base);
4049 if (!aarch64_sve_index_immediate_p (step))
4050 step = force_reg (inner, step);
4051
4052 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4053 }
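
/* As an illustration (VNx4SI destination and register number assumed for
   this comment): a base of 0 and a step of 1 both pass the immediate
   check, so the VEC_SERIES is emitted directly and should match an
   "index z<n>.s, #0, #1" pattern; out-of-range values are first forced
   into scalar registers.  */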
4054
4055 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4056 register of mode MODE. Use TARGET for the result if it's nonnull
4057 and convenient.
4058
4059 The two vector modes must have the same element mode. The behavior
4060 is to duplicate architectural lane N of SRC into architectural lanes
4061 N + I * STEP of the result. On big-endian targets, architectural
4062 lane 0 of an Advanced SIMD vector is the last element of the vector
4063 in memory layout, so for big-endian targets this operation has the
4064 effect of reversing SRC before duplicating it. Callers need to
4065 account for this. */
4066
4067 rtx
4068 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4069 {
4070 machine_mode src_mode = GET_MODE (src);
4071 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4072 insn_code icode = (BYTES_BIG_ENDIAN
4073 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4074 : code_for_aarch64_vec_duplicate_vq_le (mode));
4075
4076 unsigned int i = 0;
4077 expand_operand ops[3];
4078 create_output_operand (&ops[i++], target, mode);
4079 create_output_operand (&ops[i++], src, src_mode);
4080 if (BYTES_BIG_ENDIAN)
4081 {
4082 /* Create a PARALLEL describing the reversal of SRC. */
4083 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4084 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4085 nelts_per_vq - 1, -1);
4086 create_fixed_operand (&ops[i++], sel);
4087 }
4088 expand_insn (icode, i, ops);
4089 return ops[0].value;
4090 }
4091
4092 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4093 the memory image into DEST. Return true on success. */
4094
4095 static bool
4096 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4097 {
4098 src = force_const_mem (GET_MODE (src), src);
4099 if (!src)
4100 return false;
4101
4102 /* Make sure that the address is legitimate. */
4103 if (!aarch64_sve_ld1rq_operand_p (src))
4104 {
4105 rtx addr = force_reg (Pmode, XEXP (src, 0));
4106 src = replace_equiv_address (src, addr);
4107 }
4108
4109 machine_mode mode = GET_MODE (dest);
4110 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4111 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4112 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4113 return true;
4114 }
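
/* A minimal sketch of the expected output (register numbers and exact
   assembly spelling are illustrative): for a VNx4SI destination this
   emits something like "ld1rqw z<n>.s, p<m>/z, [x<k>]", broadcasting the
   128-bit memory image into every quadword of the SVE register.  */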
4115
4116 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4117 SVE data mode and isn't a legitimate constant. Use TARGET for the
4118 result if convenient.
4119
4120 The returned register can have whatever mode seems most natural
4121 given the contents of SRC. */
4122
4123 static rtx
4124 aarch64_expand_sve_const_vector (rtx target, rtx src)
4125 {
4126 machine_mode mode = GET_MODE (src);
4127 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4128 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4129 scalar_mode elt_mode = GET_MODE_INNER (mode);
4130 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4131 unsigned int container_bits = aarch64_sve_container_bits (mode);
4132 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4133
4134 if (nelts_per_pattern == 1
4135 && encoded_bits <= 128
4136 && container_bits != elt_bits)
4137 {
4138 /* We have a partial vector mode and a constant whose full-vector
4139 equivalent would occupy a repeating 128-bit sequence. Build that
4140 full-vector equivalent instead, so that we have the option of
4141 using LD1RQ and Advanced SIMD operations. */
4142 unsigned int repeat = container_bits / elt_bits;
4143 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4144 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4145 for (unsigned int i = 0; i < npatterns; ++i)
4146 for (unsigned int j = 0; j < repeat; ++j)
4147 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4148 target = aarch64_target_reg (target, full_mode);
4149 return aarch64_expand_sve_const_vector (target, builder.build ());
4150 }
4151
4152 if (nelts_per_pattern == 1 && encoded_bits == 128)
4153 {
4154 /* The constant is a duplicated quadword but can't be narrowed
4155 beyond a quadword. Get the memory image of the first quadword
4156 as a 128-bit vector and try using LD1RQ to load it from memory.
4157
4158 The effect for both endiannesses is to load memory lane N into
4159 architectural lanes N + I * STEP of the result. On big-endian
4160 targets, the layout of the 128-bit vector in an Advanced SIMD
4161 register would be different from its layout in an SVE register,
4162 but this 128-bit vector is a memory value only. */
4163 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4164 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4165 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4166 return target;
4167 }
4168
4169 if (nelts_per_pattern == 1 && encoded_bits < 128)
4170 {
4171 /* The vector is a repeating sequence of 64 bits or fewer.
4172 See if we can load them using an Advanced SIMD move and then
4173 duplicate it to fill a vector. This is better than using a GPR
4174 move because it keeps everything in the same register file. */
4175 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4176 rtx_vector_builder builder (vq_mode, npatterns, 1);
4177 for (unsigned int i = 0; i < npatterns; ++i)
4178 {
4179 /* We want memory lane N to go into architectural lane N,
4180 so reverse for big-endian targets. The DUP .Q pattern
4181 has a compensating reverse built-in. */
4182 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4183 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4184 }
4185 rtx vq_src = builder.build ();
4186 if (aarch64_simd_valid_immediate (vq_src, NULL))
4187 {
4188 vq_src = force_reg (vq_mode, vq_src);
4189 return aarch64_expand_sve_dupq (target, mode, vq_src);
4190 }
4191
4192 /* Get an integer representation of the repeating part of Advanced
4193 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4194 which for big-endian targets is lane-swapped wrt a normal
4195 Advanced SIMD vector. This means that for both endiannesses,
4196 memory lane N of SVE vector SRC corresponds to architectural
4197 lane N of a register holding VQ_SRC. This in turn means that
4198 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4199 as a single 128-bit value) and thus that memory lane 0 of SRC is
4200 in the lsb of the integer. Duplicating the integer therefore
4201 ensures that memory lane N of SRC goes into architectural lane
4202 N + I * INDEX of the SVE register. */
4203 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4204 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4205 if (elt_value)
4206 {
4207 /* Pretend that we had a vector of INT_MODE to start with. */
4208 elt_mode = int_mode;
4209 mode = aarch64_full_sve_mode (int_mode).require ();
4210
4211 /* If the integer can be moved into a general register by a
4212 single instruction, do that and duplicate the result. */
4213 if (CONST_INT_P (elt_value)
4214 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4215 {
4216 elt_value = force_reg (elt_mode, elt_value);
4217 return expand_vector_broadcast (mode, elt_value);
4218 }
4219 }
4220 else if (npatterns == 1)
4221 /* We're duplicating a single value, but can't do better than
4222 force it to memory and load from there. This handles things
4223 like symbolic constants. */
4224 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4225
4226 if (elt_value)
4227 {
4228 /* Load the element from memory if we can, otherwise move it into
4229 a register and use a DUP. */
4230 rtx op = force_const_mem (elt_mode, elt_value);
4231 if (!op)
4232 op = force_reg (elt_mode, elt_value);
4233 return expand_vector_broadcast (mode, op);
4234 }
4235 }
4236
4237 /* Try using INDEX. */
4238 rtx base, step;
4239 if (const_vec_series_p (src, &base, &step))
4240 {
4241 aarch64_expand_vec_series (target, base, step);
4242 return target;
4243 }
4244
4245 /* From here on, it's better to force the whole constant to memory
4246 if we can. */
4247 if (GET_MODE_NUNITS (mode).is_constant ())
4248 return NULL_RTX;
4249
4250 /* Expand each pattern individually. */
4251 gcc_assert (npatterns > 1);
4252 rtx_vector_builder builder;
4253 auto_vec<rtx, 16> vectors (npatterns);
4254 for (unsigned int i = 0; i < npatterns; ++i)
4255 {
4256 builder.new_vector (mode, 1, nelts_per_pattern);
4257 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4258 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4259 vectors.quick_push (force_reg (mode, builder.build ()));
4260 }
4261
4262 /* Use permutes to interleave the separate vectors. */
4263 while (npatterns > 1)
4264 {
4265 npatterns /= 2;
4266 for (unsigned int i = 0; i < npatterns; ++i)
4267 {
4268 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4269 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4270 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4271 vectors[i] = tmp;
4272 }
4273 }
4274 gcc_assert (vectors[0] == target);
4275 return target;
4276 }
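
/* Example of the final interleaving step (element names invented for this
   comment): with NPATTERNS == 2, the per-pattern vectors { a0, a1, ... }
   and { b0, b1, ... } are loaded separately and a single ZIP1 produces
   { a0, b0, a1, b1, ... }, which is the element order of the original
   constant.  */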
4277
4278 /* Use WHILE to set a predicate register of mode MODE in which the first
4279 VL bits are set and the rest are clear. Use TARGET for the register
4280 if it's nonnull and convenient. */
4281
4282 static rtx
4283 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4284 unsigned int vl)
4285 {
4286 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4287 target = aarch64_target_reg (target, mode);
4288 emit_insn (gen_while (UNSPEC_WHILE_LO, DImode, mode,
4289 target, const0_rtx, limit));
4290 return target;
4291 }
4292
4293 static rtx
4294 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4295
4296 /* BUILDER is a constant predicate in which the index of every set bit
4297 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4298 by inverting every element at a multiple of ELT_SIZE and EORing the
4299 result with an ELT_SIZE PTRUE.
4300
4301 Return a register that contains the constant on success, otherwise
4302 return null. Use TARGET as the register if it is nonnull and
4303 convenient. */
4304
4305 static rtx
4306 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4307 unsigned int elt_size)
4308 {
4309 /* Invert every element at a multiple of ELT_SIZE, keeping the
4310 other bits zero. */
4311 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4312 builder.nelts_per_pattern ());
4313 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4314 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4315 inv_builder.quick_push (const1_rtx);
4316 else
4317 inv_builder.quick_push (const0_rtx);
4318 inv_builder.finalize ();
4319
4320 /* See if we can load the constant cheaply. */
4321 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4322 if (!inv)
4323 return NULL_RTX;
4324
4325 /* EOR the result with an ELT_SIZE PTRUE. */
4326 rtx mask = aarch64_ptrue_all (elt_size);
4327 mask = force_reg (VNx16BImode, mask);
4328 target = aarch64_target_reg (target, VNx16BImode);
4329 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4330 return target;
4331 }
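
/* Concrete illustration (predicate contents invented for this comment):
   the .B predicate { 0, 1, 1, 1, ... } matches no PTRUE pattern, but its
   inverse { 1, 0, 0, 0, ... } is a simple PTRUE VL1, so the constant can
   be built as that PTRUE EORed with a PTRUE .B ALL mask.  */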
4332
4333 /* BUILDER is a constant predicate in which the index of every set bit
4334 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4335 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4336 register on success, otherwise return null. Use TARGET as the register
4337 if nonnull and convenient. */
4338
4339 static rtx
4340 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4341 unsigned int elt_size,
4342 unsigned int permute_size)
4343 {
4344 /* We're going to split the constant into two new constants A and B,
4345 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4346 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4347
4348 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4349 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4350
4351 where _ indicates elements that will be discarded by the permute.
4352
4353 First calculate the ELT_SIZEs for A and B. */
4354 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4355 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4356 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4357 if (INTVAL (builder.elt (i)) != 0)
4358 {
4359 if (i & permute_size)
4360 b_elt_size |= i - permute_size;
4361 else
4362 a_elt_size |= i;
4363 }
4364 a_elt_size &= -a_elt_size;
4365 b_elt_size &= -b_elt_size;
4366
4367 /* Now construct the vectors themselves. */
4368 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4369 builder.nelts_per_pattern ());
4370 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4371 builder.nelts_per_pattern ());
4372 unsigned int nelts = builder.encoded_nelts ();
4373 for (unsigned int i = 0; i < nelts; ++i)
4374 if (i & (elt_size - 1))
4375 {
4376 a_builder.quick_push (const0_rtx);
4377 b_builder.quick_push (const0_rtx);
4378 }
4379 else if ((i & permute_size) == 0)
4380 {
4381 /* The A and B elements are significant. */
4382 a_builder.quick_push (builder.elt (i));
4383 b_builder.quick_push (builder.elt (i + permute_size));
4384 }
4385 else
4386 {
4387 /* The A and B elements are going to be discarded, so pick whatever
4388 is likely to give a nice constant. We are targeting element
4389 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4390 with the aim of each being a sequence of ones followed by
4391 a sequence of zeros. So:
4392
4393 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4394 duplicate the last X_ELT_SIZE element, to extend the
4395 current sequence of ones or zeros.
4396
4397 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4398 zero, so that the constant really does have X_ELT_SIZE and
4399 not a smaller size. */
4400 if (a_elt_size > permute_size)
4401 a_builder.quick_push (const0_rtx);
4402 else
4403 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4404 if (b_elt_size > permute_size)
4405 b_builder.quick_push (const0_rtx);
4406 else
4407 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4408 }
4409 a_builder.finalize ();
4410 b_builder.finalize ();
4411
4412 /* Try loading A into a register. */
4413 rtx_insn *last = get_last_insn ();
4414 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4415 if (!a)
4416 return NULL_RTX;
4417
4418 /* Try loading B into a register. */
4419 rtx b = a;
4420 if (a_builder != b_builder)
4421 {
4422 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4423 if (!b)
4424 {
4425 delete_insns_since (last);
4426 return NULL_RTX;
4427 }
4428 }
4429
4430 /* Emit the TRN1 itself. */
4431 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4432 target = aarch64_target_reg (target, mode);
4433 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4434 gen_lowpart (mode, a),
4435 gen_lowpart (mode, b)));
4436 return target;
4437 }
4438
4439 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4440 constant in BUILDER into an SVE predicate register. Return the register
4441 on success, otherwise return null. Use TARGET for the register if
4442 nonnull and convenient.
4443
4444 ALLOW_RECURSE_P is true if we can use methods that would call this
4445 function recursively. */
4446
4447 static rtx
4448 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4449 bool allow_recurse_p)
4450 {
4451 if (builder.encoded_nelts () == 1)
4452 /* A PFALSE or a PTRUE .B ALL. */
4453 return aarch64_emit_set_immediate (target, builder);
4454
4455 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4456 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4457 {
4458 /* If we can load the constant using PTRUE, use it as-is. */
4459 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4460 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4461 return aarch64_emit_set_immediate (target, builder);
4462
4463 /* Otherwise use WHILE to set the first VL bits. */
4464 return aarch64_sve_move_pred_via_while (target, mode, vl);
4465 }
4466
4467 if (!allow_recurse_p)
4468 return NULL_RTX;
4469
4470 /* Try inverting the vector in element size ELT_SIZE and then EORing
4471 the result with an ELT_SIZE PTRUE. */
4472 if (INTVAL (builder.elt (0)) == 0)
4473 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4474 elt_size))
4475 return res;
4476
4477 /* Try using TRN1 to permute two simpler constants. */
4478 for (unsigned int i = elt_size; i <= 8; i *= 2)
4479 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4480 elt_size, i))
4481 return res;
4482
4483 return NULL_RTX;
4484 }
4485
4486 /* Return an SVE predicate register that contains the VNx16BImode
4487 constant in BUILDER, without going through the move expanders.
4488
4489 The returned register can have whatever mode seems most natural
4490 given the contents of BUILDER. Use TARGET for the result if
4491 convenient. */
4492
4493 static rtx
4494 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4495 {
4496 /* Try loading the constant using pure predicate operations. */
4497 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4498 return res;
4499
4500 /* Try forcing the constant to memory. */
4501 if (builder.full_nelts ().is_constant ())
4502 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4503 {
4504 target = aarch64_target_reg (target, VNx16BImode);
4505 emit_move_insn (target, mem);
4506 return target;
4507 }
4508
4509 /* The last resort is to load the constant as an integer and then
4510 compare it against zero. Use -1 for set bits in order to increase
4511 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4512 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4513 builder.nelts_per_pattern ());
4514 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4515 int_builder.quick_push (INTVAL (builder.elt (i))
4516 ? constm1_rtx : const0_rtx);
4517 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4518 int_builder.build ());
4519 }
4520
4521 /* Set DEST to immediate IMM. */
4522
4523 void
4524 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4525 {
4526 machine_mode mode = GET_MODE (dest);
4527
4528 /* Check on what type of symbol it is. */
4529 scalar_int_mode int_mode;
4530 if ((GET_CODE (imm) == SYMBOL_REF
4531 || GET_CODE (imm) == LABEL_REF
4532 || GET_CODE (imm) == CONST
4533 || GET_CODE (imm) == CONST_POLY_INT)
4534 && is_a <scalar_int_mode> (mode, &int_mode))
4535 {
4536 rtx mem;
4537 poly_int64 offset;
4538 HOST_WIDE_INT const_offset;
4539 enum aarch64_symbol_type sty;
4540
4541 /* If we have (const (plus symbol offset)), separate out the offset
4542 before we start classifying the symbol. */
4543 rtx base = strip_offset (imm, &offset);
4544
4545 /* We must always add an offset involving VL separately, rather than
4546 folding it into the relocation. */
4547 if (!offset.is_constant (&const_offset))
4548 {
4549 if (!TARGET_SVE)
4550 {
4551 aarch64_report_sve_required ();
4552 return;
4553 }
4554 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4555 emit_insn (gen_rtx_SET (dest, imm));
4556 else
4557 {
4558 /* Do arithmetic on 32-bit values if the result is smaller
4559 than that. */
4560 if (partial_subreg_p (int_mode, SImode))
4561 {
4562 /* It is invalid to do symbol calculations in modes
4563 narrower than SImode. */
4564 gcc_assert (base == const0_rtx);
4565 dest = gen_lowpart (SImode, dest);
4566 int_mode = SImode;
4567 }
4568 if (base != const0_rtx)
4569 {
4570 base = aarch64_force_temporary (int_mode, dest, base);
4571 aarch64_add_offset (int_mode, dest, base, offset,
4572 NULL_RTX, NULL_RTX, false);
4573 }
4574 else
4575 aarch64_add_offset (int_mode, dest, base, offset,
4576 dest, NULL_RTX, false);
4577 }
4578 return;
4579 }
4580
4581 sty = aarch64_classify_symbol (base, const_offset);
4582 switch (sty)
4583 {
4584 case SYMBOL_FORCE_TO_MEM:
4585 if (const_offset != 0
4586 && targetm.cannot_force_const_mem (int_mode, imm))
4587 {
4588 gcc_assert (can_create_pseudo_p ());
4589 base = aarch64_force_temporary (int_mode, dest, base);
4590 aarch64_add_offset (int_mode, dest, base, const_offset,
4591 NULL_RTX, NULL_RTX, false);
4592 return;
4593 }
4594
4595 mem = force_const_mem (ptr_mode, imm);
4596 gcc_assert (mem);
4597
4598 /* If we aren't generating PC relative literals, then
4599 we need to expand the literal pool access carefully.
4600 This is something that needs to be done in a number
4601 of places, so could well live as a separate function. */
4602 if (!aarch64_pcrelative_literal_loads)
4603 {
4604 gcc_assert (can_create_pseudo_p ());
4605 base = gen_reg_rtx (ptr_mode);
4606 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4607 if (ptr_mode != Pmode)
4608 base = convert_memory_address (Pmode, base);
4609 mem = gen_rtx_MEM (ptr_mode, base);
4610 }
4611
4612 if (int_mode != ptr_mode)
4613 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4614
4615 emit_insn (gen_rtx_SET (dest, mem));
4616
4617 return;
4618
4619 case SYMBOL_SMALL_TLSGD:
4620 case SYMBOL_SMALL_TLSDESC:
4621 case SYMBOL_SMALL_TLSIE:
4622 case SYMBOL_SMALL_GOT_28K:
4623 case SYMBOL_SMALL_GOT_4G:
4624 case SYMBOL_TINY_GOT:
4625 case SYMBOL_TINY_TLSIE:
4626 if (const_offset != 0)
4627 {
4628 gcc_assert (can_create_pseudo_p ());
4629 base = aarch64_force_temporary (int_mode, dest, base);
4630 aarch64_add_offset (int_mode, dest, base, const_offset,
4631 NULL_RTX, NULL_RTX, false);
4632 return;
4633 }
4634 /* FALLTHRU */
4635
4636 case SYMBOL_SMALL_ABSOLUTE:
4637 case SYMBOL_TINY_ABSOLUTE:
4638 case SYMBOL_TLSLE12:
4639 case SYMBOL_TLSLE24:
4640 case SYMBOL_TLSLE32:
4641 case SYMBOL_TLSLE48:
4642 aarch64_load_symref_appropriately (dest, imm, sty);
4643 return;
4644
4645 default:
4646 gcc_unreachable ();
4647 }
4648 }
4649
4650 if (!CONST_INT_P (imm))
4651 {
4652 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4653 {
4654 /* Only the low bit of each .H, .S and .D element is defined,
4655 so we can set the upper bits to whatever we like. If the
4656 predicate is all-true in MODE, prefer to set all the undefined
4657 bits as well, so that we can share a single .B predicate for
4658 all modes. */
4659 if (imm == CONSTM1_RTX (mode))
4660 imm = CONSTM1_RTX (VNx16BImode);
4661
4662 /* All methods for constructing predicate modes wider than VNx16BI
4663 will set the upper bits of each element to zero. Expose this
4664 by moving such constants as a VNx16BI, so that all bits are
4665 significant and so that constants for different modes can be
4666 shared. The wider constant will still be available as a
4667 REG_EQUAL note. */
4668 rtx_vector_builder builder;
4669 if (aarch64_get_sve_pred_bits (builder, imm))
4670 {
4671 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4672 if (dest != res)
4673 emit_move_insn (dest, gen_lowpart (mode, res));
4674 return;
4675 }
4676 }
4677
4678 if (GET_CODE (imm) == HIGH
4679 || aarch64_simd_valid_immediate (imm, NULL))
4680 {
4681 emit_insn (gen_rtx_SET (dest, imm));
4682 return;
4683 }
4684
4685 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4686 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4687 {
4688 if (dest != res)
4689 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4690 return;
4691 }
4692
4693 rtx mem = force_const_mem (mode, imm);
4694 gcc_assert (mem);
4695 emit_move_insn (dest, mem);
4696 return;
4697 }
4698
4699 aarch64_internal_mov_immediate (dest, imm, true,
4700 as_a <scalar_int_mode> (mode));
4701 }
4702
4703 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4704 that is known to contain PTRUE. */
4705
4706 void
4707 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4708 {
4709 expand_operand ops[3];
4710 machine_mode mode = GET_MODE (dest);
4711 create_output_operand (&ops[0], dest, mode);
4712 create_input_operand (&ops[1], pred, GET_MODE (pred));
4713 create_input_operand (&ops[2], src, mode);
4714 temporary_volatile_ok v (true);
4715 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4716 }
4717
4718 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4719 operand is in memory. In this case we need to use the predicated LD1
4720 and ST1 instead of LDR and STR, both for correctness on big-endian
4721 targets and because LD1 and ST1 support a wider range of addressing modes.
4722 PRED_MODE is the mode of the predicate.
4723
4724 See the comment at the head of aarch64-sve.md for details about the
4725 big-endian handling. */
4726
4727 void
4728 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4729 {
4730 machine_mode mode = GET_MODE (dest);
4731 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4732 if (!register_operand (src, mode)
4733 && !register_operand (dest, mode))
4734 {
4735 rtx tmp = gen_reg_rtx (mode);
4736 if (MEM_P (src))
4737 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4738 else
4739 emit_move_insn (tmp, src);
4740 src = tmp;
4741 }
4742 aarch64_emit_sve_pred_move (dest, ptrue, src);
4743 }
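
/* Hedged example of the memory-to-memory case (registers and addresses
   are illustrative): copying one VNx2DI object to another becomes a
   predicated load such as "ld1d z<n>.d, p<m>/z, [...]" into a fresh
   temporary, followed by a matching "st1d z<n>.d, p<m>, [...]".  */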
4744
4745 /* Called only on big-endian targets. See whether an SVE vector move
4746 from SRC to DEST is effectively a REV[BHW] instruction, because at
4747 least one operand is a subreg of an SVE vector that has wider or
4748 narrower elements. Return true and emit the instruction if so.
4749
4750 For example:
4751
4752 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4753
4754 represents a VIEW_CONVERT between the following vectors, viewed
4755 in memory order:
4756
4757 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4758 R1: { [0], [1], [2], [3], ... }
4759
4760 The high part of lane X in R2 should therefore correspond to lane X*2
4761 of R1, but the register representations are:
4762
4763 msb lsb
4764 R2: ...... [1].high [1].low [0].high [0].low
4765 R1: ...... [3] [2] [1] [0]
4766
4767 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4768 We therefore need a reverse operation to swap the high and low values
4769 around.
4770
4771 This is purely an optimization. Without it we would spill the
4772 subreg operand to the stack in one mode and reload it in the
4773 other mode, which has the same effect as the REV. */
4774
4775 bool
4776 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4777 {
4778 gcc_assert (BYTES_BIG_ENDIAN);
4779 if (GET_CODE (dest) == SUBREG)
4780 dest = SUBREG_REG (dest);
4781 if (GET_CODE (src) == SUBREG)
4782 src = SUBREG_REG (src);
4783
4784 /* The optimization handles two single SVE REGs with different element
4785 sizes. */
4786 if (!REG_P (dest)
4787 || !REG_P (src)
4788 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4789 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4790 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4791 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4792 return false;
4793
4794 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4795 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4796 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4797 UNSPEC_REV_SUBREG);
4798 emit_insn (gen_rtx_SET (dest, unspec));
4799 return true;
4800 }
4801
4802 /* Return a copy of X with mode MODE, without changing its other
4803 attributes. Unlike gen_lowpart, this doesn't care whether the
4804 mode change is valid. */
4805
4806 rtx
4807 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4808 {
4809 if (GET_MODE (x) == mode)
4810 return x;
4811
4812 x = shallow_copy_rtx (x);
4813 set_mode_and_regno (x, mode, REGNO (x));
4814 return x;
4815 }
4816
4817 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4818 stored in wider integer containers. */
4819
4820 static unsigned int
4821 aarch64_sve_rev_unspec (machine_mode mode)
4822 {
4823 switch (GET_MODE_UNIT_SIZE (mode))
4824 {
4825 case 1: return UNSPEC_REVB;
4826 case 2: return UNSPEC_REVH;
4827 case 4: return UNSPEC_REVW;
4828 }
4829 gcc_unreachable ();
4830 }
4831
4832 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4833 operands. */
4834
4835 void
4836 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4837 {
4838 /* Decide which REV operation we need. The mode with wider elements
4839 determines the mode of the operands and the mode with the narrower
4840 elements determines the reverse width. */
4841 machine_mode mode_with_wider_elts = GET_MODE (dest);
4842 machine_mode mode_with_narrower_elts = GET_MODE (src);
4843 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4844 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4845 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4846
4847 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4848 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4849
4850 /* Get the operands in the appropriate modes and emit the instruction. */
4851 ptrue = gen_lowpart (pred_mode, ptrue);
4852 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4853 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4854 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4855 dest, ptrue, src));
4856 }
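
/* For the example in the comment before aarch64_maybe_expand_sve_subreg_move
   (a VNx8HI view of a VNx16QI register), the narrower elements are bytes,
   so the split form is a predicated byte reversal within halfword
   containers, e.g. "revb z<d>.h, p<g>/m, z<s>.h" (register numbers
   illustrative).  */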
4857
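/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */
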
4858 static bool
4859 aarch64_function_ok_for_sibcall (tree, tree exp)
4860 {
4861 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4862 return false;
4863
4864 return true;
4865 }
4866
4867 /* Implement TARGET_PASS_BY_REFERENCE. */
4868
4869 static bool
4870 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4871 const function_arg_info &arg)
4872 {
4873 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4874 HOST_WIDE_INT size;
4875 machine_mode dummymode;
4876 int nregs;
4877
4878 unsigned int num_zr, num_pr;
4879 if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
4880 {
4881 if (pcum && !pcum->silent_p && !TARGET_SVE)
4882 /* We can't gracefully recover at this point, so make this a
4883 fatal error. */
4884 fatal_error (input_location, "arguments of type %qT require"
4885 " the SVE ISA extension", arg.type);
4886
4887 /* Variadic SVE types are passed by reference. Normal non-variadic
4888 arguments are too if we've run out of registers. */
4889 return (!arg.named
4890 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4891 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4892 }
4893
4894 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4895 if (arg.mode == BLKmode && arg.type)
4896 size = int_size_in_bytes (arg.type);
4897 else
4898 /* No frontends can create types with variable-sized modes, so we
4899 shouldn't be asked to pass or return them. */
4900 size = GET_MODE_SIZE (arg.mode).to_constant ();
4901
4902 /* Aggregates are passed by reference based on their size. */
4903 if (arg.aggregate_type_p ())
4904 size = int_size_in_bytes (arg.type);
4905
4906 /* Variable-sized arguments are always passed by reference. */
4907 if (size < 0)
4908 return true;
4909
4910 /* Can this be a candidate to be passed in fp/simd register(s)? */
4911 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4912 &dummymode, &nregs,
4913 NULL))
4914 return false;
4915
4916 /* Arguments that are variable-sized or larger than 2 registers are
4917 passed by reference unless they are a homogeneous floating-point
4918 aggregate. */
4919 return size > 2 * UNITS_PER_WORD;
4920 }
4921
4922 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4923 static bool
4924 aarch64_return_in_msb (const_tree valtype)
4925 {
4926 machine_mode dummy_mode;
4927 int dummy_int;
4928
4929 /* Never happens in little-endian mode. */
4930 if (!BYTES_BIG_ENDIAN)
4931 return false;
4932
4933 /* Only composite types smaller than or equal to 16 bytes can
4934 be potentially returned in registers. */
4935 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4936 || int_size_in_bytes (valtype) <= 0
4937 || int_size_in_bytes (valtype) > 16)
4938 return false;
4939
4940 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4941 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4942 is always passed/returned in the least significant bits of fp/simd
4943 register(s). */
4944 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4945 &dummy_mode, &dummy_int, NULL))
4946 return false;
4947
4948 return true;
4949 }
4950
4951 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4952 after promotion, and after partial SVE types have been replaced by
4953 their integer equivalents. */
4954 static rtx
4955 aarch64_function_value_1 (const_tree type, machine_mode mode)
4956 {
4957 unsigned int num_zr, num_pr;
4958 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4959 {
4960 /* Don't raise an error here if we're called when SVE is disabled,
4961 since this is really just a query function. Other code must
4962 do that where appropriate. */
4963 mode = TYPE_MODE_RAW (type);
4964 gcc_assert (VECTOR_MODE_P (mode)
4965 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4966
4967 if (num_zr > 0 && num_pr == 0)
4968 return gen_rtx_REG (mode, V0_REGNUM);
4969
4970 if (num_zr == 0 && num_pr == 1)
4971 return gen_rtx_REG (mode, P0_REGNUM);
4972
4973 gcc_unreachable ();
4974 }
4975
4976 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4977 returned in memory, not by value. */
4978 gcc_assert (!aarch64_sve_mode_p (mode));
4979
4980 if (aarch64_return_in_msb (type))
4981 {
4982 HOST_WIDE_INT size = int_size_in_bytes (type);
4983
4984 if (size % UNITS_PER_WORD != 0)
4985 {
4986 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4987 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4988 }
4989 }
4990
4991 int count;
4992 machine_mode ag_mode;
4993 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4994 &ag_mode, &count, NULL))
4995 {
4996 if (!aarch64_composite_type_p (type, mode))
4997 {
4998 gcc_assert (count == 1 && mode == ag_mode);
4999 return gen_rtx_REG (mode, V0_REGNUM);
5000 }
5001 else
5002 {
5003 int i;
5004 rtx par;
5005
5006 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5007 for (i = 0; i < count; i++)
5008 {
5009 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5010 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5011 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5012 XVECEXP (par, 0, i) = tmp;
5013 }
5014 return par;
5015 }
5016 }
5017 else
5018 return gen_rtx_REG (mode, R0_REGNUM);
5019 }
5020
5021 /* Implement TARGET_FUNCTION_VALUE.
5022 Define how to find the value returned by a function. */
5023
5024 static rtx
5025 aarch64_function_value (const_tree type, const_tree func,
5026 bool outgoing ATTRIBUTE_UNUSED)
5027 {
5028 machine_mode mode;
5029 int unsignedp;
5030
5031 mode = TYPE_MODE (type);
5032 if (INTEGRAL_TYPE_P (type))
5033 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5034
5035 /* Vector types can acquire a partial SVE mode using things like
5036 __attribute__((vector_size(N))), and this is potentially useful.
5037 However, the choice of mode doesn't affect the type's ABI identity,
5038 so we should treat the types as though they had the associated
5039 integer mode, just like they did before SVE was introduced.
5040
5041 We know that the vector must be 128 bits or smaller, otherwise we'd
5042 have returned it in memory instead. */
5043 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5044 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5045 {
5046 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5047 rtx reg = aarch64_function_value_1 (type, int_mode);
5048 /* Vector types are never returned in the MSB and are never split. */
5049 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5050 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5051 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5052 }
5053
5054 return aarch64_function_value_1 (type, mode);
5055 }
5056
5057 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5058 Return true if REGNO is the number of a hard register in which the values
5059 of called function may come back. */
5060
5061 static bool
5062 aarch64_function_value_regno_p (const unsigned int regno)
5063 {
5064 /* Maximum of 16 bytes can be returned in the general registers. Examples
5065 of 16-byte return values are: 128-bit integers and 16-byte small
5066 structures (excluding homogeneous floating-point aggregates). */
5067 if (regno == R0_REGNUM || regno == R1_REGNUM)
5068 return true;
5069
5070 /* Up to four fp/simd registers can return a function value, e.g. a
5071 homogeneous floating-point aggregate having four members. */
5072 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5073 return TARGET_FLOAT;
5074
5075 return false;
5076 }
5077
5078 /* Implement TARGET_RETURN_IN_MEMORY.
5079
5080 If the type T of the result of a function is such that
5081 void func (T arg)
5082 would require that arg be passed as a value in a register (or set of
5083 registers) according to the parameter passing rules, then the result
5084 is returned in the same registers as would be used for such an
5085 argument. */
5086
5087 static bool
5088 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5089 {
5090 HOST_WIDE_INT size;
5091 machine_mode ag_mode;
5092 int count;
5093
5094 if (!AGGREGATE_TYPE_P (type)
5095 && TREE_CODE (type) != COMPLEX_TYPE
5096 && TREE_CODE (type) != VECTOR_TYPE)
5097 /* Simple scalar types are always returned in registers. */
5098 return false;
5099
5100 unsigned int num_zr, num_pr;
5101 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5102 {
5103 /* All SVE types we support fit in registers. For example, it isn't
5104 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5105 predicates. */
5106 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5107 return false;
5108 }
5109
5110 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5111 type,
5112 &ag_mode,
5113 &count,
5114 NULL))
5115 return false;
5116
5117 /* Types larger than 2 registers are returned in memory. */
5118 size = int_size_in_bytes (type);
5119 return (size < 0 || size > 2 * UNITS_PER_WORD);
5120 }
5121
5122 static bool
5123 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5124 const_tree type, int *nregs)
5125 {
5126 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5127 return aarch64_vfp_is_call_or_return_candidate (mode,
5128 type,
5129 &pcum->aapcs_vfp_rmode,
5130 nregs,
5131 NULL);
5132 }
5133
5134 /* Given MODE and TYPE of a function argument, return the alignment in
5135 bits. The idea is to suppress any stronger alignment requested by
5136 the user and opt for the natural alignment (specified in AAPCS64 \S
5137 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5138 calculated in versions of GCC prior to GCC-9. This is a helper
5139 function for local use only. */
5140
5141 static unsigned int
5142 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5143 bool *abi_break)
5144 {
5145 *abi_break = false;
5146 if (!type)
5147 return GET_MODE_ALIGNMENT (mode);
5148
5149 if (integer_zerop (TYPE_SIZE (type)))
5150 return 0;
5151
5152 gcc_assert (TYPE_MODE (type) == mode);
5153
5154 if (!AGGREGATE_TYPE_P (type))
5155 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5156
5157 if (TREE_CODE (type) == ARRAY_TYPE)
5158 return TYPE_ALIGN (TREE_TYPE (type));
5159
5160 unsigned int alignment = 0;
5161 unsigned int bitfield_alignment = 0;
5162 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5163 if (TREE_CODE (field) == FIELD_DECL)
5164 {
5165 alignment = std::max (alignment, DECL_ALIGN (field));
5166 if (DECL_BIT_FIELD_TYPE (field))
5167 bitfield_alignment
5168 = std::max (bitfield_alignment,
5169 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5170 }
5171
5172 if (bitfield_alignment > alignment)
5173 {
5174 *abi_break = true;
5175 return bitfield_alignment;
5176 }
5177
5178 return alignment;
5179 }
5180
5181 /* Layout a function argument according to the AAPCS64 rules. The rule
5182 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5183 mode that was originally given to us by the target hook, whereas the
5184 mode in ARG might be the result of replacing partial SVE modes with
5185 the equivalent integer mode. */
5186
5187 static void
5188 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5189 machine_mode orig_mode)
5190 {
5191 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5192 tree type = arg.type;
5193 machine_mode mode = arg.mode;
5194 int ncrn, nvrn, nregs;
5195 bool allocate_ncrn, allocate_nvrn;
5196 HOST_WIDE_INT size;
5197 bool abi_break;
5198
5199 /* We need to do this once per argument. */
5200 if (pcum->aapcs_arg_processed)
5201 return;
5202
5203 /* Vector types can acquire a partial SVE mode using things like
5204 __attribute__((vector_size(N))), and this is potentially useful.
5205 However, the choice of mode doesn't affect the type's ABI identity,
5206 so we should treat the types as though they had the associated
5207 integer mode, just like they did before SVE was introduced.
5208
5209 We know that the vector must be 128 bits or smaller, otherwise we'd
5210 have passed it by reference instead. */
5211 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5212 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5213 {
5214 function_arg_info tmp_arg = arg;
5215 tmp_arg.mode = int_mode_for_mode (mode).require ();
5216 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5217 if (rtx reg = pcum->aapcs_reg)
5218 {
5219 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5220 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5221 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5222 }
5223 return;
5224 }
5225
5226 pcum->aapcs_arg_processed = true;
5227
5228 unsigned int num_zr, num_pr;
5229 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5230 {
5231 /* The PCS says that it is invalid to pass an SVE value to an
5232 unprototyped function. There is no ABI-defined location we
5233 can return in this case, so we have no real choice but to raise
5234 an error immediately, even though this is only a query function. */
5235 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5236 {
5237 gcc_assert (!pcum->silent_p);
5238 error ("SVE type %qT cannot be passed to an unprototyped function",
5239 arg.type);
5240 /* Avoid repeating the message, and avoid tripping the assert
5241 below. */
5242 pcum->pcs_variant = ARM_PCS_SVE;
5243 }
5244
5245 /* We would have converted the argument into pass-by-reference
5246 form if it didn't fit in registers. */
5247 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5248 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5249 gcc_assert (arg.named
5250 && pcum->pcs_variant == ARM_PCS_SVE
5251 && aarch64_sve_mode_p (mode)
5252 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5253 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5254
5255 if (num_zr > 0 && num_pr == 0)
5256 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5257 else if (num_zr == 0 && num_pr == 1)
5258 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5259 else
5260 gcc_unreachable ();
5261 return;
5262 }
5263
5264 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5265 passed by reference, not by value. */
5266 gcc_assert (!aarch64_sve_mode_p (mode));
5267
5268 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
5269 if (type)
5270 size = int_size_in_bytes (type);
5271 else
5272 /* No frontends can create types with variable-sized modes, so we
5273 shouldn't be asked to pass or return them. */
5274 size = GET_MODE_SIZE (mode).to_constant ();
5275 size = ROUND_UP (size, UNITS_PER_WORD);
5276
5277 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5278 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5279 mode,
5280 type,
5281 &nregs);
5282
5283 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
5284 The following code thus handles passing by SIMD/FP registers first. */
5285
5286 nvrn = pcum->aapcs_nvrn;
5287
5288 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5289 and homogeneous short-vector aggregates (HVA). */
5290 if (allocate_nvrn)
5291 {
5292 if (!pcum->silent_p && !TARGET_FLOAT)
5293 aarch64_err_no_fpadvsimd (mode);
5294
5295 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5296 {
5297 pcum->aapcs_nextnvrn = nvrn + nregs;
5298 if (!aarch64_composite_type_p (type, mode))
5299 {
5300 gcc_assert (nregs == 1);
5301 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5302 }
5303 else
5304 {
5305 rtx par;
5306 int i;
5307 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5308 for (i = 0; i < nregs; i++)
5309 {
5310 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5311 V0_REGNUM + nvrn + i);
5312 rtx offset = gen_int_mode
5313 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5314 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5315 XVECEXP (par, 0, i) = tmp;
5316 }
5317 pcum->aapcs_reg = par;
5318 }
5319 return;
5320 }
5321 else
5322 {
5323 /* C.3 NSRN is set to 8. */
5324 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5325 goto on_stack;
5326 }
5327 }
5328
5329 ncrn = pcum->aapcs_ncrn;
5330 nregs = size / UNITS_PER_WORD;
5331
5332 /* C6 - C9, though the sign and zero extension semantics are
5333 handled elsewhere. This is the case where the argument fits
5334 entirely in general registers. */
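      /* Worked example (an illustration added here, not part of the
	 original comment): under rule C.8 below, a 16-byte struct whose
	 natural alignment is 16 bytes and which arrives when NGRN is odd
	 (say 1) is not split across x1/x2; NGRN is first rounded up to 2
	 and the struct is then passed in x2/x3.  */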
5335 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5336 {
5337 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5338
5339 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
5340 rounded up to the next even number. */
5341 if (nregs == 2
5342 && ncrn % 2
5343 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5344 comparison is there because for > 16 * BITS_PER_UNIT
5345 alignment nregs should be > 2 and therefore it should be
5346 passed by reference rather than value. */
5347 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5348 == 16 * BITS_PER_UNIT))
5349 {
5350 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5351 inform (input_location, "parameter passing for argument of type "
5352 "%qT changed in GCC 9.1", type);
5353 ++ncrn;
5354 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5355 }
5356
5357 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5358 A reg is still generated for it, but the caller should be smart
5359 enough not to use it. */
5360 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5361 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5362 else
5363 {
5364 rtx par;
5365 int i;
5366
5367 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5368 for (i = 0; i < nregs; i++)
5369 {
5370 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5371 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5372 GEN_INT (i * UNITS_PER_WORD));
5373 XVECEXP (par, 0, i) = tmp;
5374 }
5375 pcum->aapcs_reg = par;
5376 }
5377
5378 pcum->aapcs_nextncrn = ncrn + nregs;
5379 return;
5380 }
5381
5382 /* C.11 */
5383 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5384
5385 /* The argument is passed on the stack; record the needed number of words for
5386 this argument and align the total size if necessary. */
5387 on_stack:
5388 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5389
5390 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5391 == 16 * BITS_PER_UNIT)
5392 {
5393 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5394 if (pcum->aapcs_stack_size != new_size)
5395 {
5396 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5397 inform (input_location, "parameter passing for argument of type "
5398 "%qT changed in GCC 9.1", type);
5399 pcum->aapcs_stack_size = new_size;
5400 }
5401 }
5402 return;
5403 }
5404
5405 /* Implement TARGET_FUNCTION_ARG. */
5406
5407 static rtx
5408 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5409 {
5410 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5411 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5412 || pcum->pcs_variant == ARM_PCS_SIMD
5413 || pcum->pcs_variant == ARM_PCS_SVE);
5414
5415 if (arg.end_marker_p ())
5416 return gen_int_mode (pcum->pcs_variant, DImode);
5417
5418 aarch64_layout_arg (pcum_v, arg, arg.mode);
5419 return pcum->aapcs_reg;
5420 }
5421
5422 void
5423 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5424 const_tree fntype,
5425 rtx libname ATTRIBUTE_UNUSED,
5426 const_tree fndecl ATTRIBUTE_UNUSED,
5427 unsigned n_named ATTRIBUTE_UNUSED,
5428 bool silent_p)
5429 {
5430 pcum->aapcs_ncrn = 0;
5431 pcum->aapcs_nvrn = 0;
5432 pcum->aapcs_nprn = 0;
5433 pcum->aapcs_nextncrn = 0;
5434 pcum->aapcs_nextnvrn = 0;
5435 pcum->aapcs_nextnprn = 0;
5436 if (fntype)
5437 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5438 else
5439 pcum->pcs_variant = ARM_PCS_AAPCS64;
5440 pcum->aapcs_reg = NULL_RTX;
5441 pcum->aapcs_arg_processed = false;
5442 pcum->aapcs_stack_words = 0;
5443 pcum->aapcs_stack_size = 0;
5444 pcum->silent_p = silent_p;
5445
5446 if (!silent_p
5447 && !TARGET_FLOAT
5448 && fndecl && TREE_PUBLIC (fndecl)
5449 && fntype && fntype != error_mark_node)
5450 {
5451 const_tree type = TREE_TYPE (fntype);
5452 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5453 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5454 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5455 &mode, &nregs, NULL))
5456 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5457 }
5458
5459 if (!silent_p
5460 && !TARGET_SVE
5461 && pcum->pcs_variant == ARM_PCS_SVE)
5462 {
5463 /* We can't gracefully recover at this point, so make this a
5464 fatal error. */
5465 if (fndecl)
5466 fatal_error (input_location, "%qE requires the SVE ISA extension",
5467 fndecl);
5468 else
5469 fatal_error (input_location, "calls to functions of type %qT require"
5470 " the SVE ISA extension", fntype);
5471 }
5472 }
5473
5474 static void
5475 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5476 const function_arg_info &arg)
5477 {
5478 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5479 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5480 || pcum->pcs_variant == ARM_PCS_SIMD
5481 || pcum->pcs_variant == ARM_PCS_SVE)
5482 {
5483 aarch64_layout_arg (pcum_v, arg, arg.mode);
5484 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5485 != (pcum->aapcs_stack_words != 0));
5486 pcum->aapcs_arg_processed = false;
5487 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5488 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5489 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5490 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5491 pcum->aapcs_stack_words = 0;
5492 pcum->aapcs_reg = NULL_RTX;
5493 }
5494 }
5495
5496 bool
5497 aarch64_function_arg_regno_p (unsigned regno)
5498 {
5499 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5500 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5501 }
5502
5503 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5504 PARM_BOUNDARY bits of alignment, but will be given anything up
5505 to STACK_BOUNDARY bits if the type requires it. This makes sure
5506 that both before and after the layout of each argument, the Next
5507 Stacked Argument Address (NSAA) will have a minimum alignment of
5508 8 bytes. */
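/* For instance (illustrative figures, assuming the usual AArch64 values
   of PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): a plain 'int'
   argument still gets a 64-bit slot, a 16-byte-aligned aggregate gets
   128 bits, and no argument is ever given more than 128 bits.  */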
5509
5510 static unsigned int
5511 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5512 {
5513 bool abi_break;
5514 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5515 &abi_break);
5516 if (abi_break && warn_psabi)
5517 inform (input_location, "parameter passing for argument of type "
5518 "%qT changed in GCC 9.1", type);
5519
5520 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5521 }
5522
5523 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5524
5525 static fixed_size_mode
5526 aarch64_get_reg_raw_mode (int regno)
5527 {
5528 if (TARGET_SVE && FP_REGNUM_P (regno))
5529 /* Don't use the SVE part of the register for __builtin_apply and
5530 __builtin_return. The SVE registers aren't used by the normal PCS,
5531 so using them there would be a waste of time. The PCS extensions
5532 for SVE types are fundamentally incompatible with the
5533 __builtin_return/__builtin_apply interface. */
5534 return as_a <fixed_size_mode> (V16QImode);
5535 return default_get_reg_raw_mode (regno);
5536 }
5537
5538 /* Implement TARGET_FUNCTION_ARG_PADDING.
5539
5540 Small aggregate types are placed in the lowest memory address.
5541
5542 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
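/* For example (illustrative): on a big-endian target a 'char' argument
   passed in memory ends up in the highest-addressed byte of its stack
   slot (PAD_DOWNWARD), whereas a 3-byte struct keeps its data at the
   lowest addresses of the slot and is padded above it (PAD_UPWARD).  */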
5543
5544 static pad_direction
5545 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5546 {
5547 /* On little-endian targets, the least significant byte of every stack
5548 argument is passed at the lowest byte address of the stack slot. */
5549 if (!BYTES_BIG_ENDIAN)
5550 return PAD_UPWARD;
5551
5552 /* Otherwise, integral, floating-point and pointer types are padded downward:
5553 the least significant byte of a stack argument is passed at the highest
5554 byte address of the stack slot. */
5555 if (type
5556 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5557 || POINTER_TYPE_P (type))
5558 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5559 return PAD_DOWNWARD;
5560
5561 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5562 return PAD_UPWARD;
5563 }
5564
5565 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5566
5567 It specifies padding for the last (may also be the only)
5568 element of a block move between registers and memory.  Assuming
5569 the block is in memory, padding upward means that the last element
5570 is padded after its most significant byte, while with downward
5571 padding the last element is padded on its least significant byte
5572 side.
5573
5574 Small aggregates and small complex types are always padded
5575 upwards.
5576
5577 We don't need to worry about homogeneous floating-point or
5578 short-vector aggregates; their move is not affected by the
5579 padding direction determined here. Regardless of endianness,
5580 each element of such an aggregate is put in the least
5581 significant bits of a fp/simd register.
5582
5583 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5584 register has useful data, and return the opposite if the most
5585 significant byte does. */
5586
5587 bool
5588 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5589 bool first ATTRIBUTE_UNUSED)
5590 {
5591
5592 /* Small composite types are always padded upward. */
5593 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5594 {
5595 HOST_WIDE_INT size;
5596 if (type)
5597 size = int_size_in_bytes (type);
5598 else
5599 /* No frontends can create types with variable-sized modes, so we
5600 shouldn't be asked to pass or return them. */
5601 size = GET_MODE_SIZE (mode).to_constant ();
5602 if (size < 2 * UNITS_PER_WORD)
5603 return true;
5604 }
5605
5606 /* Otherwise, use the default padding. */
5607 return !BYTES_BIG_ENDIAN;
5608 }
5609
5610 static scalar_int_mode
5611 aarch64_libgcc_cmp_return_mode (void)
5612 {
5613 return SImode;
5614 }
5615
5616 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5617
5618 /* We use the 12-bit shifted immediate arithmetic instructions so values
5619 must be a multiple of (1 << 12), i.e. 4096. */
5620 #define ARITH_FACTOR 4096
5621
5622 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5623 #error Cannot use simple address calculation for stack probing
5624 #endif
5625
5626 /* The pair of scratch registers used for stack probing. */
5627 #define PROBE_STACK_FIRST_REG R9_REGNUM
5628 #define PROBE_STACK_SECOND_REG R10_REGNUM
5629
5630 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5631 inclusive. These are offsets from the current stack pointer. */
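/* A rough sketch of the simplest case below (illustrative only): for a
   constant SIZE <= PROBE_INTERVAL, with BASE = ROUND_UP (SIZE,
   ARITH_FACTOR), the emitted code is approximately

	sub	x9, sp, #(FIRST + BASE)
	str	xzr, [x9, #(BASE - SIZE)]

   i.e. a single probe at the lowest address of the range.  */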
5632
5633 static void
5634 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5635 {
5636 HOST_WIDE_INT size;
5637 if (!poly_size.is_constant (&size))
5638 {
5639 sorry ("stack probes for SVE frames");
5640 return;
5641 }
5642
5643 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5644
5645 /* See the same assertion on PROBE_INTERVAL above. */
5646 gcc_assert ((first % ARITH_FACTOR) == 0);
5647
5648 /* See if we have a constant small number of probes to generate. If so,
5649 that's the easy case. */
5650 if (size <= PROBE_INTERVAL)
5651 {
5652 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5653
5654 emit_set_insn (reg1,
5655 plus_constant (Pmode,
5656 stack_pointer_rtx, -(first + base)));
5657 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5658 }
5659
5660 /* The run-time loop is made up of 8 insns in the generic case while the
5661 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5662 else if (size <= 4 * PROBE_INTERVAL)
5663 {
5664 HOST_WIDE_INT i, rem;
5665
5666 emit_set_insn (reg1,
5667 plus_constant (Pmode,
5668 stack_pointer_rtx,
5669 -(first + PROBE_INTERVAL)));
5670 emit_stack_probe (reg1);
5671
5672 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5673 it exceeds SIZE. If only two probes are needed, this will not
5674 generate any code. Then probe at FIRST + SIZE. */
5675 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5676 {
5677 emit_set_insn (reg1,
5678 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5679 emit_stack_probe (reg1);
5680 }
5681
5682 rem = size - (i - PROBE_INTERVAL);
5683 if (rem > 256)
5684 {
5685 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5686
5687 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5688 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5689 }
5690 else
5691 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5692 }
5693
5694 /* Otherwise, do the same as above, but in a loop. Note that we must be
5695 extra careful with variables wrapping around because we might be at
5696 the very top (or the very bottom) of the address space and we have
5697 to be able to handle this case properly; in particular, we use an
5698 equality test for the loop condition. */
5699 else
5700 {
5701 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5702
5703 /* Step 1: round SIZE to the previous multiple of the interval. */
5704
5705 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5706
5707
5708 /* Step 2: compute initial and final value of the loop counter. */
5709
5710 /* TEST_ADDR = SP + FIRST. */
5711 emit_set_insn (reg1,
5712 plus_constant (Pmode, stack_pointer_rtx, -first));
5713
5714 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5715 HOST_WIDE_INT adjustment = - (first + rounded_size);
5716 if (! aarch64_uimm12_shift (adjustment))
5717 {
5718 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5719 true, Pmode);
5720 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5721 }
5722 else
5723 emit_set_insn (reg2,
5724 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5725
5726 /* Step 3: the loop
5727
5728 do
5729 {
5730 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5731 probe at TEST_ADDR
5732 }
5733 while (TEST_ADDR != LAST_ADDR)
5734
5735 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5736 until it is equal to ROUNDED_SIZE. */
5737
5738 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5739
5740
5741 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5742 that SIZE is equal to ROUNDED_SIZE. */
5743
5744 if (size != rounded_size)
5745 {
5746 HOST_WIDE_INT rem = size - rounded_size;
5747
5748 if (rem > 256)
5749 {
5750 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5751
5752 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5753 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5754 }
5755 else
5756 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5757 }
5758 }
5759
5760 /* Make sure nothing is scheduled before we are done. */
5761 emit_insn (gen_blockage ());
5762 }
5763
5764 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5765 absolute addresses. */
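/* A rough sketch of the emitted loop (illustrative only; x9 and x10
   stand for REG1 and REG2, the interval is PROBE_INTERVAL or the guard
   size under -fstack-clash-protection, and the probe offset is 0 or
   STACK_CLASH_CALLER_GUARD respectively):

	.LPSRL0:
		sub	x9, x9, <interval>
		str	xzr, [x9, <offset>]
		cmp	x9, x10
		b.ne	.LPSRL0  */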
5766
5767 const char *
5768 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5769 {
5770 static int labelno = 0;
5771 char loop_lab[32];
5772 rtx xops[2];
5773
5774 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5775
5776 /* Loop. */
5777 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5778
5779 HOST_WIDE_INT stack_clash_probe_interval
5780 = 1 << param_stack_clash_protection_guard_size;
5781
5782 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5783 xops[0] = reg1;
5784 HOST_WIDE_INT interval;
5785 if (flag_stack_clash_protection)
5786 interval = stack_clash_probe_interval;
5787 else
5788 interval = PROBE_INTERVAL;
5789
5790 gcc_assert (aarch64_uimm12_shift (interval));
5791 xops[1] = GEN_INT (interval);
5792
5793 output_asm_insn ("sub\t%0, %0, %1", xops);
5794
5795 /* If doing stack clash protection then we probe up by the ABI specified
5796 amount. We do this because we're dropping full pages at a time in the
5797 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5798 if (flag_stack_clash_protection)
5799 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5800 else
5801 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5802
5803 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5804 by this amount for each iteration. */
5805 output_asm_insn ("str\txzr, [%0, %1]", xops);
5806
5807 /* Test if TEST_ADDR == LAST_ADDR. */
5808 xops[1] = reg2;
5809 output_asm_insn ("cmp\t%0, %1", xops);
5810
5811 /* Branch. */
5812 fputs ("\tb.ne\t", asm_out_file);
5813 assemble_name_raw (asm_out_file, loop_lab);
5814 fputc ('\n', asm_out_file);
5815
5816 return "";
5817 }
5818
5819 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5820 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5821 of GUARD_SIZE.  Each probe is emitted at most MIN_PROBE_THRESHOLD
5822 bytes from the current BASE, and probes are at intervals of at most
5823 MIN_PROBE_THRESHOLD bytes.  By the end of this function
5824 BASE = BASE - ADJUSTMENT. */
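/* A rough sketch of the output (illustrative only; BASE, ADJUSTMENT and
   RESIDUAL_PROBE_GUARD stand for the actual operands):

	.SVLPSPL0:
		cmp	ADJUSTMENT, RESIDUAL_PROBE_GUARD
		b.lt	.SVLPEND0
		sub	BASE, BASE, RESIDUAL_PROBE_GUARD
		str	xzr, [BASE, 0]
		sub	ADJUSTMENT, ADJUSTMENT, RESIDUAL_PROBE_GUARD
		b	.SVLPSPL0
	.SVLPEND0:
		sub	BASE, BASE, ADJUSTMENT  */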
5825
5826 const char *
5827 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5828 rtx min_probe_threshold, rtx guard_size)
5829 {
5830 /* This function is not allowed to use any instruction generation function
5831 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5832 so instead emit the code you want using output_asm_insn. */
5833 gcc_assert (flag_stack_clash_protection);
5834 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5835 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5836
5837 /* The minimum required allocation before the residual requires probing. */
5838 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5839
5840 /* Clamp the value down to the nearest value that can be used with a cmp. */
5841 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5842 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5843
5844 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5845 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5846
5847 static int labelno = 0;
5848 char loop_start_lab[32];
5849 char loop_end_lab[32];
5850 rtx xops[2];
5851
5852 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5853 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5854
5855 /* Emit loop start label. */
5856 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5857
5858 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5859 xops[0] = adjustment;
5860 xops[1] = probe_offset_value_rtx;
5861 output_asm_insn ("cmp\t%0, %1", xops);
5862
5863 /* Branch to end if not enough adjustment to probe. */
5864 fputs ("\tb.lt\t", asm_out_file);
5865 assemble_name_raw (asm_out_file, loop_end_lab);
5866 fputc ('\n', asm_out_file);
5867
5868 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5869 xops[0] = base;
5870 xops[1] = probe_offset_value_rtx;
5871 output_asm_insn ("sub\t%0, %0, %1", xops);
5872
5873 /* Probe at BASE. */
5874 xops[1] = const0_rtx;
5875 output_asm_insn ("str\txzr, [%0, %1]", xops);
5876
5877 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5878 xops[0] = adjustment;
5879 xops[1] = probe_offset_value_rtx;
5880 output_asm_insn ("sub\t%0, %0, %1", xops);
5881
5882 /* Branch to start if still more bytes to allocate. */
5883 fputs ("\tb\t", asm_out_file);
5884 assemble_name_raw (asm_out_file, loop_start_lab);
5885 fputc ('\n', asm_out_file);
5886
5887 /* Nothing left to probe; emit the loop exit label. */
5888 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5889
5890 /* BASE = BASE - ADJUSTMENT. */
5891 xops[0] = base;
5892 xops[1] = adjustment;
5893 output_asm_insn ("sub\t%0, %0, %1", xops);
5894 return "";
5895 }
5896
5897 /* Determine whether a frame chain needs to be generated. */
5898 static bool
5899 aarch64_needs_frame_chain (void)
5900 {
5901 /* Force a frame chain for EH returns so the return address is at FP+8. */
5902 if (frame_pointer_needed || crtl->calls_eh_return)
5903 return true;
5904
5905 /* A leaf function cannot have calls or write LR. */
5906 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5907
5908 /* Don't use a frame chain in leaf functions if leaf frame pointers
5909 are disabled. */
5910 if (flag_omit_leaf_frame_pointer && is_leaf)
5911 return false;
5912
5913 return aarch64_use_frame_pointer;
5914 }
5915
5916 /* Mark the registers that need to be saved by the callee and calculate
5917 the size of the callee-saved registers area and frame record (both FP
5918 and LR may be omitted). */
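/* In rough outline, the slots are assigned bottom-up in this order (a
   summary of the code below rather than an ABI statement): predicate
   saves, then any SVE vector saves (together the "below hard frame
   pointer" area), then the FP/LR frame record when a frame chain is
   needed, then the remaining general registers, and finally the
   FP/SIMD saves, with alignment padding inserted between groups as
   required.  */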
5919 static void
5920 aarch64_layout_frame (void)
5921 {
5922 poly_int64 offset = 0;
5923 int regno, last_fp_reg = INVALID_REGNUM;
5924 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5925 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5926 bool frame_related_fp_reg_p = false;
5927 aarch64_frame &frame = cfun->machine->frame;
5928
5929 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5930
5931 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5932 the mid-end is doing. */
5933 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5934
5935 #define SLOT_NOT_REQUIRED (-2)
5936 #define SLOT_REQUIRED (-1)
5937
5938 frame.wb_candidate1 = INVALID_REGNUM;
5939 frame.wb_candidate2 = INVALID_REGNUM;
5940 frame.spare_pred_reg = INVALID_REGNUM;
5941
5942 /* First mark all the registers that really need to be saved... */
5943 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5944 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5945
5946 /* ... that includes the eh data registers (if needed)... */
5947 if (crtl->calls_eh_return)
5948 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5949 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5950
5951 /* ... and any callee saved register that dataflow says is live. */
5952 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5953 if (df_regs_ever_live_p (regno)
5954 && !fixed_regs[regno]
5955 && (regno == R30_REGNUM
5956 || !crtl->abi->clobbers_full_reg_p (regno)))
5957 frame.reg_offset[regno] = SLOT_REQUIRED;
5958
5959 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5960 if (df_regs_ever_live_p (regno)
5961 && !fixed_regs[regno]
5962 && !crtl->abi->clobbers_full_reg_p (regno))
5963 {
5964 frame.reg_offset[regno] = SLOT_REQUIRED;
5965 last_fp_reg = regno;
5966 if (aarch64_emit_cfi_for_reg_p (regno))
5967 frame_related_fp_reg_p = true;
5968 }
5969
5970 /* Big-endian SVE frames need a spare predicate register in order
5971 to save Z8-Z15. Decide which register they should use. Prefer
5972 an unused argument register if possible, so that we don't force P4
5973 to be saved unnecessarily. */
5974 if (frame_related_fp_reg_p
5975 && crtl->abi->id () == ARM_PCS_SVE
5976 && BYTES_BIG_ENDIAN)
5977 {
5978 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5979 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5980 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5981 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5982 break;
5983 gcc_assert (regno <= P7_REGNUM);
5984 frame.spare_pred_reg = regno;
5985 df_set_regs_ever_live (regno, true);
5986 }
5987
5988 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5989 if (df_regs_ever_live_p (regno)
5990 && !fixed_regs[regno]
5991 && !crtl->abi->clobbers_full_reg_p (regno))
5992 frame.reg_offset[regno] = SLOT_REQUIRED;
5993
5994 /* With stack-clash, LR must be saved in non-leaf functions. */
5995 gcc_assert (crtl->is_leaf
5996 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5997
5998 /* Now assign stack slots for the registers. Start with the predicate
5999 registers, since predicate LDR and STR have a relatively small
6000 offset range. These saves happen below the hard frame pointer. */
6001 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6002 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6003 {
6004 frame.reg_offset[regno] = offset;
6005 offset += BYTES_PER_SVE_PRED;
6006 }
6007
6008 /* We save a maximum of 8 predicate registers, and since vector
6009 registers are 8 times the size of a predicate register, all the
6010 saved predicates fit within a single vector. Doing this also
6011 rounds the offset to a 128-bit boundary. */
6012 if (maybe_ne (offset, 0))
6013 {
6014 gcc_assert (known_le (offset, vector_save_size));
6015 offset = vector_save_size;
6016 }
6017
6018 /* If we need to save any SVE vector registers, add them next. */
6019 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6020 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6021 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6022 {
6023 frame.reg_offset[regno] = offset;
6024 offset += vector_save_size;
6025 }
6026
6027 /* OFFSET is now the offset of the hard frame pointer from the bottom
6028 of the callee save area. */
6029 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6030 frame.below_hard_fp_saved_regs_size = offset;
6031 if (frame.emit_frame_chain)
6032 {
6033 /* FP and LR are placed in the linkage record. */
6034 frame.reg_offset[R29_REGNUM] = offset;
6035 frame.wb_candidate1 = R29_REGNUM;
6036 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6037 frame.wb_candidate2 = R30_REGNUM;
6038 offset += 2 * UNITS_PER_WORD;
6039 }
6040
6041 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6042 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6043 {
6044 frame.reg_offset[regno] = offset;
6045 if (frame.wb_candidate1 == INVALID_REGNUM)
6046 frame.wb_candidate1 = regno;
6047 else if (frame.wb_candidate2 == INVALID_REGNUM)
6048 frame.wb_candidate2 = regno;
6049 offset += UNITS_PER_WORD;
6050 }
6051
6052 poly_int64 max_int_offset = offset;
6053 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6054 bool has_align_gap = maybe_ne (offset, max_int_offset);
6055
6056 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6057 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6058 {
6059 /* If there is an alignment gap between integer and fp callee-saves,
6060 allocate the last fp register to it if possible. */
6061 if (regno == last_fp_reg
6062 && has_align_gap
6063 && known_eq (vector_save_size, 8)
6064 && multiple_p (offset, 16))
6065 {
6066 frame.reg_offset[regno] = max_int_offset;
6067 break;
6068 }
6069
6070 frame.reg_offset[regno] = offset;
6071 if (frame.wb_candidate1 == INVALID_REGNUM)
6072 frame.wb_candidate1 = regno;
6073 else if (frame.wb_candidate2 == INVALID_REGNUM
6074 && frame.wb_candidate1 >= V0_REGNUM)
6075 frame.wb_candidate2 = regno;
6076 offset += vector_save_size;
6077 }
6078
6079 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6080
6081 frame.saved_regs_size = offset;
6082
6083 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6084
6085 poly_int64 above_outgoing_args
6086 = aligned_upper_bound (varargs_and_saved_regs_size
6087 + get_frame_size (),
6088 STACK_BOUNDARY / BITS_PER_UNIT);
6089
6090 frame.hard_fp_offset
6091 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6092
6093 /* Both these values are already aligned. */
6094 gcc_assert (multiple_p (crtl->outgoing_args_size,
6095 STACK_BOUNDARY / BITS_PER_UNIT));
6096 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6097
6098 frame.locals_offset = frame.saved_varargs_size;
6099
6100 frame.initial_adjust = 0;
6101 frame.final_adjust = 0;
6102 frame.callee_adjust = 0;
6103 frame.sve_callee_adjust = 0;
6104 frame.callee_offset = 0;
6105
6106 HOST_WIDE_INT max_push_offset = 0;
6107 if (frame.wb_candidate2 != INVALID_REGNUM)
6108 max_push_offset = 512;
6109 else if (frame.wb_candidate1 != INVALID_REGNUM)
6110 max_push_offset = 256;
6111
6112 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6113 HOST_WIDE_INT const_saved_regs_size;
6114 if (frame.frame_size.is_constant (&const_size)
6115 && const_size < max_push_offset
6116 && known_eq (frame.hard_fp_offset, const_size))
6117 {
6118 /* Simple, small frame with no outgoing arguments:
6119
6120 stp reg1, reg2, [sp, -frame_size]!
6121 stp reg3, reg4, [sp, 16] */
6122 frame.callee_adjust = const_size;
6123 }
6124 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6125 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6126 && const_outgoing_args_size + const_saved_regs_size < 512
6127 /* We could handle this case even with outgoing args, provided
6128 that the number of args left us with valid offsets for all
6129 predicate and vector save slots. It's such a rare case that
6130 it hardly seems worth the effort though. */
6131 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6132 && !(cfun->calls_alloca
6133 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6134 && const_fp_offset < max_push_offset))
6135 {
6136 /* Frame with small outgoing arguments:
6137
6138 sub sp, sp, frame_size
6139 stp reg1, reg2, [sp, outgoing_args_size]
6140 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6141 frame.initial_adjust = frame.frame_size;
6142 frame.callee_offset = const_outgoing_args_size;
6143 }
6144 else if (saves_below_hard_fp_p
6145 && known_eq (frame.saved_regs_size,
6146 frame.below_hard_fp_saved_regs_size))
6147 {
6148 /* Frame in which all saves are SVE saves:
6149
6150 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6151 save SVE registers relative to SP
6152 sub sp, sp, outgoing_args_size */
6153 frame.initial_adjust = (frame.hard_fp_offset
6154 + frame.below_hard_fp_saved_regs_size);
6155 frame.final_adjust = crtl->outgoing_args_size;
6156 }
6157 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6158 && const_fp_offset < max_push_offset)
6159 {
6160 /* Frame with large outgoing arguments or SVE saves, but with
6161 a small local area:
6162
6163 stp reg1, reg2, [sp, -hard_fp_offset]!
6164 stp reg3, reg4, [sp, 16]
6165 [sub sp, sp, below_hard_fp_saved_regs_size]
6166 [save SVE registers relative to SP]
6167 sub sp, sp, outgoing_args_size */
6168 frame.callee_adjust = const_fp_offset;
6169 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6170 frame.final_adjust = crtl->outgoing_args_size;
6171 }
6172 else
6173 {
6174 /* Frame with large local area and outgoing arguments or SVE saves,
6175 using frame pointer:
6176
6177 sub sp, sp, hard_fp_offset
6178 stp x29, x30, [sp, 0]
6179 add x29, sp, 0
6180 stp reg3, reg4, [sp, 16]
6181 [sub sp, sp, below_hard_fp_saved_regs_size]
6182 [save SVE registers relative to SP]
6183 sub sp, sp, outgoing_args_size */
6184 frame.initial_adjust = frame.hard_fp_offset;
6185 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6186 frame.final_adjust = crtl->outgoing_args_size;
6187 }
6188
6189 /* Make sure the individual adjustments add up to the full frame size. */
6190 gcc_assert (known_eq (frame.initial_adjust
6191 + frame.callee_adjust
6192 + frame.sve_callee_adjust
6193 + frame.final_adjust, frame.frame_size));
6194
6195 frame.laid_out = true;
6196 }
6197
6198 /* Return true if the register REGNO is saved on entry to
6199 the current function. */
6200
6201 static bool
6202 aarch64_register_saved_on_entry (int regno)
6203 {
6204 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6205 }
6206
6207 /* Return the next register, from REGNO up to LIMIT, that the callee
6208 needs to save. */
6209
6210 static unsigned
6211 aarch64_next_callee_save (unsigned regno, unsigned limit)
6212 {
6213 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6214 regno++;
6215 return regno;
6216 }
6217
6218 /* Push the register number REGNO of mode MODE to the stack with write-back
6219 adjusting the stack by ADJUSTMENT. */
6220
6221 static void
6222 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6223 HOST_WIDE_INT adjustment)
6224 {
6225 rtx base_rtx = stack_pointer_rtx;
6226 rtx insn, reg, mem;
6227
6228 reg = gen_rtx_REG (mode, regno);
6229 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6230 plus_constant (Pmode, base_rtx, -adjustment));
6231 mem = gen_frame_mem (mode, mem);
6232
6233 insn = emit_move_insn (mem, reg);
6234 RTX_FRAME_RELATED_P (insn) = 1;
6235 }
6236
6237 /* Generate and return an instruction to store the pair of registers
6238 REG and REG2 of mode MODE to location BASE with write-back adjusting
6239 the stack location BASE by ADJUSTMENT. */
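/* For the DImode case this ultimately becomes a pre-indexed store pair,
   e.g. (illustrative) "stp x29, x30, [sp, #-ADJUSTMENT]!" when BASE is
   the stack pointer.  */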
6240
6241 static rtx
6242 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6243 HOST_WIDE_INT adjustment)
6244 {
6245 switch (mode)
6246 {
6247 case E_DImode:
6248 return gen_storewb_pairdi_di (base, base, reg, reg2,
6249 GEN_INT (-adjustment),
6250 GEN_INT (UNITS_PER_WORD - adjustment));
6251 case E_DFmode:
6252 return gen_storewb_pairdf_di (base, base, reg, reg2,
6253 GEN_INT (-adjustment),
6254 GEN_INT (UNITS_PER_WORD - adjustment));
6255 case E_TFmode:
6256 return gen_storewb_pairtf_di (base, base, reg, reg2,
6257 GEN_INT (-adjustment),
6258 GEN_INT (UNITS_PER_VREG - adjustment));
6259 default:
6260 gcc_unreachable ();
6261 }
6262 }
6263
6264 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6265 stack pointer by ADJUSTMENT. */
6266
6267 static void
6268 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6269 {
6270 rtx_insn *insn;
6271 machine_mode mode = aarch64_reg_save_mode (regno1);
6272
6273 if (regno2 == INVALID_REGNUM)
6274 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6275
6276 rtx reg1 = gen_rtx_REG (mode, regno1);
6277 rtx reg2 = gen_rtx_REG (mode, regno2);
6278
6279 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6280 reg2, adjustment));
6281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6283 RTX_FRAME_RELATED_P (insn) = 1;
6284 }
6285
6286 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6287 adjusting it by ADJUSTMENT afterwards. */
6288
6289 static rtx
6290 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6291 HOST_WIDE_INT adjustment)
6292 {
6293 switch (mode)
6294 {
6295 case E_DImode:
6296 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6297 GEN_INT (UNITS_PER_WORD));
6298 case E_DFmode:
6299 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6300 GEN_INT (UNITS_PER_WORD));
6301 case E_TFmode:
6302 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6303 GEN_INT (UNITS_PER_VREG));
6304 default:
6305 gcc_unreachable ();
6306 }
6307 }
6308
6309 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6310 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6311 into CFI_OPS. */
6312
6313 static void
6314 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6315 rtx *cfi_ops)
6316 {
6317 machine_mode mode = aarch64_reg_save_mode (regno1);
6318 rtx reg1 = gen_rtx_REG (mode, regno1);
6319
6320 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6321
6322 if (regno2 == INVALID_REGNUM)
6323 {
6324 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6325 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6326 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6327 }
6328 else
6329 {
6330 rtx reg2 = gen_rtx_REG (mode, regno2);
6331 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6332 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6333 reg2, adjustment));
6334 }
6335 }
6336
6337 /* Generate and return a store pair instruction of mode MODE to store
6338 register REG1 to MEM1 and register REG2 to MEM2. */
6339
6340 static rtx
6341 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6342 rtx reg2)
6343 {
6344 switch (mode)
6345 {
6346 case E_DImode:
6347 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6348
6349 case E_DFmode:
6350 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6351
6352 case E_TFmode:
6353 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6354
6355 default:
6356 gcc_unreachable ();
6357 }
6358 }
6359
6360 /* Generate and return a load pair instruction of mode MODE to load register
6361 REG1 from MEM1 and register REG2 from MEM2. */
6362
6363 static rtx
6364 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6365 rtx mem2)
6366 {
6367 switch (mode)
6368 {
6369 case E_DImode:
6370 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6371
6372 case E_DFmode:
6373 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6374
6375 case E_TFmode:
6376 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6377
6378 default:
6379 gcc_unreachable ();
6380 }
6381 }
6382
6383 /* Return TRUE if return address signing should be enabled for the current
6384 function, otherwise return FALSE. */
6385
6386 bool
6387 aarch64_return_address_signing_enabled (void)
6388 {
6389 /* This function should only be called after the frame is laid out. */
6390 gcc_assert (cfun->machine->frame.laid_out);
6391
6392 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6393 if its LR is pushed onto the stack. */
6394 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6395 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6396 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6397 }
6398
6399 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6400 bool
6401 aarch64_bti_enabled (void)
6402 {
6403 return (aarch64_enable_bti == 1);
6404 }
6405
6406 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6407 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6408 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6409
6410 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6411 or LD1D address
6412
6413 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6414 if the variable isn't already nonnull
6415
6416 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6417 Handle this case using a temporary base register that is suitable for
6418 all offsets in that range. Use ANCHOR_REG as this base register if it
6419 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
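/* For example (illustrative): ST1D/LD1D accept a signed immediate of
   -8..7 vector-length multiples, so an offset of 12 * GET_MODE_SIZE (MODE)
   is handled by pointing ANCHOR_REG at BASE + 16 * GET_MODE_SIZE (MODE)
   and addressing -4 vectors from there.  */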
6420
6421 static inline void
6422 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6423 rtx &anchor_reg, poly_int64 &offset,
6424 rtx &ptrue)
6425 {
6426 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6427 {
6428 /* This is the maximum valid offset of the anchor from the base.
6429 Lower values would be valid too. */
6430 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6431 if (!anchor_reg)
6432 {
6433 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6434 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6435 gen_int_mode (anchor_offset, Pmode)));
6436 }
6437 base_rtx = anchor_reg;
6438 offset -= anchor_offset;
6439 }
6440 if (!ptrue)
6441 {
6442 int pred_reg = cfun->machine->frame.spare_pred_reg;
6443 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6444 CONSTM1_RTX (VNx16BImode));
6445 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6446 }
6447 }
6448
6449 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6450 is saved at BASE + OFFSET. */
6451
6452 static void
6453 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6454 rtx base, poly_int64 offset)
6455 {
6456 rtx mem = gen_frame_mem (GET_MODE (reg),
6457 plus_constant (Pmode, base, offset));
6458 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6459 }
6460
6461 /* Emit code to save the callee-saved registers from register number START
6462 to LIMIT to the stack at the location starting at offset START_OFFSET,
6463 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6464 is true if the hard frame pointer has been set up. */
6465
6466 static void
6467 aarch64_save_callee_saves (poly_int64 start_offset,
6468 unsigned start, unsigned limit, bool skip_wb,
6469 bool hard_fp_valid_p)
6470 {
6471 rtx_insn *insn;
6472 unsigned regno;
6473 unsigned regno2;
6474 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6475
6476 for (regno = aarch64_next_callee_save (start, limit);
6477 regno <= limit;
6478 regno = aarch64_next_callee_save (regno + 1, limit))
6479 {
6480 rtx reg, mem;
6481 poly_int64 offset;
6482 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6483
6484 if (skip_wb
6485 && (regno == cfun->machine->frame.wb_candidate1
6486 || regno == cfun->machine->frame.wb_candidate2))
6487 continue;
6488
6489 if (cfun->machine->reg_is_wrapped_separately[regno])
6490 continue;
6491
6492 machine_mode mode = aarch64_reg_save_mode (regno);
6493 reg = gen_rtx_REG (mode, regno);
6494 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6495 rtx base_rtx = stack_pointer_rtx;
6496 poly_int64 sp_offset = offset;
6497
6498 HOST_WIDE_INT const_offset;
6499 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6500 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6501 offset, ptrue);
6502 else if (GP_REGNUM_P (regno)
6503 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6504 {
6505 gcc_assert (known_eq (start_offset, 0));
6506 poly_int64 fp_offset
6507 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6508 if (hard_fp_valid_p)
6509 base_rtx = hard_frame_pointer_rtx;
6510 else
6511 {
6512 if (!anchor_reg)
6513 {
6514 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6515 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6516 gen_int_mode (fp_offset, Pmode)));
6517 }
6518 base_rtx = anchor_reg;
6519 }
6520 offset -= fp_offset;
6521 }
6522 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6523 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6524
6525 if (!aarch64_sve_mode_p (mode)
6526 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6527 && !cfun->machine->reg_is_wrapped_separately[regno2]
6528 && known_eq (GET_MODE_SIZE (mode),
6529 cfun->machine->frame.reg_offset[regno2]
6530 - cfun->machine->frame.reg_offset[regno]))
6531 {
6532 rtx reg2 = gen_rtx_REG (mode, regno2);
6533 rtx mem2;
6534
6535 offset += GET_MODE_SIZE (mode);
6536 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6537 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6538 reg2));
6539
6540 /* The first part of a frame-related parallel insn is
6541 always assumed to be relevant to the frame
6542 calculations; subsequent parts are only
6543 frame-related if explicitly marked. */
6544 if (aarch64_emit_cfi_for_reg_p (regno2))
6545 {
6546 if (need_cfa_note_p)
6547 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6548 sp_offset + GET_MODE_SIZE (mode));
6549 else
6550 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6551 }
6552
6553 regno = regno2;
6554 }
6555 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6556 {
6557 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6558 need_cfa_note_p = true;
6559 }
6560 else if (aarch64_sve_mode_p (mode))
6561 insn = emit_insn (gen_rtx_SET (mem, reg));
6562 else
6563 insn = emit_move_insn (mem, reg);
6564
6565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6566 if (frame_related_p && need_cfa_note_p)
6567 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6568 }
6569 }
6570
6571 /* Emit code to restore the callee registers from register number START
6572 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6573 skipping any write-back candidates if SKIP_WB is true. Write the
6574 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6575
6576 static void
6577 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6578 unsigned limit, bool skip_wb, rtx *cfi_ops)
6579 {
6580 unsigned regno;
6581 unsigned regno2;
6582 poly_int64 offset;
6583 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6584
6585 for (regno = aarch64_next_callee_save (start, limit);
6586 regno <= limit;
6587 regno = aarch64_next_callee_save (regno + 1, limit))
6588 {
6589 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6590 if (cfun->machine->reg_is_wrapped_separately[regno])
6591 continue;
6592
6593 rtx reg, mem;
6594
6595 if (skip_wb
6596 && (regno == cfun->machine->frame.wb_candidate1
6597 || regno == cfun->machine->frame.wb_candidate2))
6598 continue;
6599
6600 machine_mode mode = aarch64_reg_save_mode (regno);
6601 reg = gen_rtx_REG (mode, regno);
6602 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6603 rtx base_rtx = stack_pointer_rtx;
6604 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6605 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6606 offset, ptrue);
6607 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6608
6609 if (!aarch64_sve_mode_p (mode)
6610 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6611 && !cfun->machine->reg_is_wrapped_separately[regno2]
6612 && known_eq (GET_MODE_SIZE (mode),
6613 cfun->machine->frame.reg_offset[regno2]
6614 - cfun->machine->frame.reg_offset[regno]))
6615 {
6616 rtx reg2 = gen_rtx_REG (mode, regno2);
6617 rtx mem2;
6618
6619 offset += GET_MODE_SIZE (mode);
6620 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6621 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6622
6623 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6624 regno = regno2;
6625 }
6626 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6627 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6628 else if (aarch64_sve_mode_p (mode))
6629 emit_insn (gen_rtx_SET (reg, mem));
6630 else
6631 emit_move_insn (reg, mem);
6632 if (frame_related_p)
6633 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6634 }
6635 }
6636
6637 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6638 of MODE. */
6639
6640 static inline bool
6641 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6642 {
6643 HOST_WIDE_INT multiple;
6644 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6645 && IN_RANGE (multiple, -8, 7));
6646 }
6647
6648 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6649 of MODE. */
6650
6651 static inline bool
6652 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6653 {
6654 HOST_WIDE_INT multiple;
6655 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6656 && IN_RANGE (multiple, 0, 63));
6657 }
6658
6659 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6660 of MODE. */
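/* For instance, with DImode this accepts multiples of 8 in the range
   [-512, 504], which is the immediate range of an LDP/STP of X
   registers.  */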
6661
6662 bool
6663 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6664 {
6665 HOST_WIDE_INT multiple;
6666 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6667 && IN_RANGE (multiple, -64, 63));
6668 }
6669
6670 /* Return true if OFFSET is a signed 9-bit value. */
6671
6672 bool
6673 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6674 poly_int64 offset)
6675 {
6676 HOST_WIDE_INT const_offset;
6677 return (offset.is_constant (&const_offset)
6678 && IN_RANGE (const_offset, -256, 255));
6679 }
6680
6681 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6682 of MODE. */
6683
6684 static inline bool
6685 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6686 {
6687 HOST_WIDE_INT multiple;
6688 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6689 && IN_RANGE (multiple, -256, 255));
6690 }
6691
6692 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6693 of MODE. */
6694
6695 static inline bool
6696 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6697 {
6698 HOST_WIDE_INT multiple;
6699 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6700 && IN_RANGE (multiple, 0, 4095));
6701 }
6702
6703 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6704
6705 static sbitmap
6706 aarch64_get_separate_components (void)
6707 {
6708 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6709 bitmap_clear (components);
6710
6711 /* The registers we need saved to the frame. */
6712 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6713 if (aarch64_register_saved_on_entry (regno))
6714 {
6715 /* Punt on saves and restores that use ST1D and LD1D. We could
6716 try to be smarter, but it would involve making sure that the
6717 spare predicate register itself is safe to use at the save
6718 and restore points. Also, when a frame pointer is being used,
6719 the slots are often out of reach of ST1D and LD1D anyway. */
6720 machine_mode mode = aarch64_reg_save_mode (regno);
6721 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6722 continue;
6723
6724 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6725
6726 /* If the register is saved in the first SVE save slot, we use
6727 it as a stack probe for -fstack-clash-protection. */
6728 if (flag_stack_clash_protection
6729 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6730 && known_eq (offset, 0))
6731 continue;
6732
6733 /* Get the offset relative to the register we'll use. */
6734 if (frame_pointer_needed)
6735 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6736 else
6737 offset += crtl->outgoing_args_size;
6738
6739 /* Check that we can access the stack slot of the register with one
6740 direct load with no adjustments needed. */
6741 if (aarch64_sve_mode_p (mode)
6742 ? offset_9bit_signed_scaled_p (mode, offset)
6743 : offset_12bit_unsigned_scaled_p (mode, offset))
6744 bitmap_set_bit (components, regno);
6745 }
6746
6747 /* Don't mess with the hard frame pointer. */
6748 if (frame_pointer_needed)
6749 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6750
6751 /* If the spare predicate register used by big-endian SVE code
6752 is call-preserved, it must be saved in the main prologue
6753 before any saves that use it. */
6754 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6755 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6756
6757 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6758 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6759 /* If registers have been chosen to be stored/restored with
6760 writeback don't interfere with them to avoid having to output explicit
6761 stack adjustment instructions. */
6762 if (reg2 != INVALID_REGNUM)
6763 bitmap_clear_bit (components, reg2);
6764 if (reg1 != INVALID_REGNUM)
6765 bitmap_clear_bit (components, reg1);
6766
6767 bitmap_clear_bit (components, LR_REGNUM);
6768 bitmap_clear_bit (components, SP_REGNUM);
6769
6770 return components;
6771 }
6772
6773 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6774
6775 static sbitmap
6776 aarch64_components_for_bb (basic_block bb)
6777 {
6778 bitmap in = DF_LIVE_IN (bb);
6779 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6780 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6781
6782 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6783 bitmap_clear (components);
6784
6785 /* Clobbered registers don't generate values in any meaningful sense,
6786 since nothing after the clobber can rely on their value. And we can't
6787 say that partially-clobbered registers are unconditionally killed,
6788 because whether they're killed or not depends on the mode of the
6789 value they're holding. Thus partially call-clobbered registers
6790 appear in neither the kill set nor the gen set.
6791
6792 Check manually for any calls that clobber more of a register than the
6793 current function can. */
6794 function_abi_aggregator callee_abis;
6795 rtx_insn *insn;
6796 FOR_BB_INSNS (bb, insn)
6797 if (CALL_P (insn))
6798 callee_abis.note_callee_abi (insn_callee_abi (insn));
6799 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6800
6801 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6802 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6803 if (!fixed_regs[regno]
6804 && !crtl->abi->clobbers_full_reg_p (regno)
6805 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6806 || bitmap_bit_p (in, regno)
6807 || bitmap_bit_p (gen, regno)
6808 || bitmap_bit_p (kill, regno)))
6809 {
6810 bitmap_set_bit (components, regno);
6811
6812 /* If there is a callee-save at an adjacent offset, add it too
6813 to increase the use of LDP/STP. */
6814 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6815 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6816
6817 if (regno2 <= LAST_SAVED_REGNUM)
6818 {
6819 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6820 if (regno < regno2
6821 ? known_eq (offset + 8, offset2)
6822 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6823 bitmap_set_bit (components, regno2);
6824 }
6825 }
6826
6827 return components;
6828 }
6829
6830 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6831 Nothing to do for aarch64. */
6832
6833 static void
6834 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6835 {
6836 }
6837
6838 /* Return the next set bit in BMP from START onwards. Return the total number
6839 of bits in BMP if no set bit is found at or after START. */
6840
6841 static unsigned int
6842 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6843 {
6844 unsigned int nbits = SBITMAP_SIZE (bmp);
6845 if (start == nbits)
6846 return start;
6847
6848 gcc_assert (start < nbits);
6849 for (unsigned int i = start; i < nbits; i++)
6850 if (bitmap_bit_p (bmp, i))
6851 return i;
6852
6853 return nbits;
6854 }
6855
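/* As a small illustration of the helper above: if bits 2 and 5 are the
only set bits in BMP, then aarch64_get_next_set_bit (bmp, 3) returns 5,
while aarch64_get_next_set_bit (bmp, 6) returns SBITMAP_SIZE (bmp). */
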
6856 /* Do the work for aarch64_emit_prologue_components and
6857 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6858 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
6859 for these components or the epilogue sequence. That is, it determines
6860 whether we should emit stores or loads and what kind of CFA notes to attach
6861 to the insns. Otherwise the logic for the two sequences is very
6862 similar. */
6863
6864 static void
6865 aarch64_process_components (sbitmap components, bool prologue_p)
6866 {
6867 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6868 ? HARD_FRAME_POINTER_REGNUM
6869 : STACK_POINTER_REGNUM);
6870
6871 unsigned last_regno = SBITMAP_SIZE (components);
6872 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6873 rtx_insn *insn = NULL;
6874
6875 while (regno != last_regno)
6876 {
6877 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6878 machine_mode mode = aarch64_reg_save_mode (regno);
6879
6880 rtx reg = gen_rtx_REG (mode, regno);
6881 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6882 if (frame_pointer_needed)
6883 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6884 else
6885 offset += crtl->outgoing_args_size;
6886
6887 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6888 rtx mem = gen_frame_mem (mode, addr);
6889
6890 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6891 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6892 /* No more registers to handle after REGNO.
6893 Emit a single save/restore and exit. */
6894 if (regno2 == last_regno)
6895 {
6896 insn = emit_insn (set);
6897 if (frame_related_p)
6898 {
6899 RTX_FRAME_RELATED_P (insn) = 1;
6900 if (prologue_p)
6901 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6902 else
6903 add_reg_note (insn, REG_CFA_RESTORE, reg);
6904 }
6905 break;
6906 }
6907
6908 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6909 /* The next register is not of the same class or its offset is not
6910 mergeable with the current one into a pair. */
6911 if (aarch64_sve_mode_p (mode)
6912 || !satisfies_constraint_Ump (mem)
6913 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6914 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6915 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6916 GET_MODE_SIZE (mode)))
6917 {
6918 insn = emit_insn (set);
6919 if (frame_related_p)
6920 {
6921 RTX_FRAME_RELATED_P (insn) = 1;
6922 if (prologue_p)
6923 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6924 else
6925 add_reg_note (insn, REG_CFA_RESTORE, reg);
6926 }
6927
6928 regno = regno2;
6929 continue;
6930 }
6931
6932 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6933
6934 /* REGNO2 can be saved/restored in a pair with REGNO. */
6935 rtx reg2 = gen_rtx_REG (mode, regno2);
6936 if (frame_pointer_needed)
6937 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6938 else
6939 offset2 += crtl->outgoing_args_size;
6940 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6941 rtx mem2 = gen_frame_mem (mode, addr2);
6942 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6943 : gen_rtx_SET (reg2, mem2);
6944
6945 if (prologue_p)
6946 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6947 else
6948 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6949
6950 if (frame_related_p || frame_related2_p)
6951 {
6952 RTX_FRAME_RELATED_P (insn) = 1;
6953 if (prologue_p)
6954 {
6955 if (frame_related_p)
6956 add_reg_note (insn, REG_CFA_OFFSET, set);
6957 if (frame_related2_p)
6958 add_reg_note (insn, REG_CFA_OFFSET, set2);
6959 }
6960 else
6961 {
6962 if (frame_related_p)
6963 add_reg_note (insn, REG_CFA_RESTORE, reg);
6964 if (frame_related2_p)
6965 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6966 }
6967 }
6968
6969 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6970 }
6971 }
6972
6973 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6974
6975 static void
6976 aarch64_emit_prologue_components (sbitmap components)
6977 {
6978 aarch64_process_components (components, true);
6979 }
6980
6981 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6982
6983 static void
6984 aarch64_emit_epilogue_components (sbitmap components)
6985 {
6986 aarch64_process_components (components, false);
6987 }
6988
6989 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6990
6991 static void
6992 aarch64_set_handled_components (sbitmap components)
6993 {
6994 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6995 if (bitmap_bit_p (components, regno))
6996 cfun->machine->reg_is_wrapped_separately[regno] = true;
6997 }
6998
6999 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
7000 determine the probe offset for alloca. */
7001
7002 static HOST_WIDE_INT
7003 aarch64_stack_clash_protection_alloca_probe_range (void)
7004 {
7005 return STACK_CLASH_CALLER_GUARD;
7006 }
7007
7008
7009 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7010 registers. If POLY_SIZE is not large enough to require a probe this function
7011 will only adjust the stack. When allocating the stack space
7012 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7013 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7014 arguments. If we are, we ensure that any allocation larger than the ABI
7015 defined buffer needs a probe, so that the invariant of having a 1KB buffer is
7016 maintained.
7017
7018 We emit barriers after each stack adjustment to prevent optimizations from
7019 breaking the invariant that we never drop the stack more than a page. This
7020 invariant is needed to make it easier to correctly handle asynchronous
7021 events: if we were to allow the stack to be dropped by more than a page
7022 and then emit several probes, and a signal were taken somewhere in between,
7023 the signal handler would not know the state of the stack and could make no
7024 assumptions about which pages have been probed. */
7025
7026 static void
7027 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7028 poly_int64 poly_size,
7029 bool frame_related_p,
7030 bool final_adjustment_p)
7031 {
7032 HOST_WIDE_INT guard_size
7033 = 1 << param_stack_clash_protection_guard_size;
7034 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7035 HOST_WIDE_INT min_probe_threshold
7036 = (final_adjustment_p
7037 ? guard_used_by_caller
7038 : guard_size - guard_used_by_caller);
7039 /* When doing the final adjustment for the outgoing arguments, take into
7040 account any unprobed space there is above the current SP. There are
7041 two cases:
7042
7043 - When saving SVE registers below the hard frame pointer, we force
7044 the lowest save to take place in the prologue before doing the final
7045 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7046 This acts as a probe at SP, so there is no unprobed space.
7047
7048 - When there are no SVE register saves, we use the store of the link
7049 register as a probe. We can't assume that LR was saved at position 0
7050 though, so treat any space below it as unprobed. */
7051 if (final_adjustment_p
7052 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7053 {
7054 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7055 if (known_ge (lr_offset, 0))
7056 min_probe_threshold -= lr_offset.to_constant ();
7057 else
7058 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7059 }
7060
7061 poly_int64 frame_size = cfun->machine->frame.frame_size;
7062
7063 /* We should always have a positive probe threshold. */
7064 gcc_assert (min_probe_threshold > 0);
7065
7066 if (flag_stack_clash_protection && !final_adjustment_p)
7067 {
7068 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7069 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7070 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7071
7072 if (known_eq (frame_size, 0))
7073 {
7074 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7075 }
7076 else if (known_lt (initial_adjust + sve_callee_adjust,
7077 guard_size - guard_used_by_caller)
7078 && known_lt (final_adjust, guard_used_by_caller))
7079 {
7080 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7081 }
7082 }
7083
7084 /* If SIZE is not large enough to require probing, just adjust the stack and
7085 exit. */
7086 if (known_lt (poly_size, min_probe_threshold)
7087 || !flag_stack_clash_protection)
7088 {
7089 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7090 return;
7091 }
7092
7093 HOST_WIDE_INT size;
7094 /* Handle the SVE non-constant case first. */
7095 if (!poly_size.is_constant (&size))
7096 {
7097 if (dump_file)
7098 {
7099 fprintf (dump_file, "Stack clash SVE prologue: ");
7100 print_dec (poly_size, dump_file);
7101 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7102 }
7103
7104 /* First calculate the amount of bytes we're actually spilling. */
7105 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7106 poly_size, temp1, temp2, false, true);
7107
7108 rtx_insn *insn = get_last_insn ();
7109
7110 if (frame_related_p)
7111 {
7112 /* This is done to provide unwinding information for the stack
7113 adjustments we're about to do; however, to prevent the optimizers
7114 from removing the R11 move and leaving the CFA note (which would be
7115 very wrong), we tie the old and new stack pointers together.
7116 The tie will expand to nothing but the optimizers will not touch
7117 the instruction. */
7118 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7119 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7120 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7121
7122 /* We want the CFA independent of the stack pointer for the
7123 duration of the loop. */
7124 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7125 RTX_FRAME_RELATED_P (insn) = 1;
7126 }
7127
7128 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7129 rtx guard_const = gen_int_mode (guard_size, Pmode);
7130
7131 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7132 stack_pointer_rtx, temp1,
7133 probe_const, guard_const));
7134
7135 /* Now reset the CFA register if needed. */
7136 if (frame_related_p)
7137 {
7138 add_reg_note (insn, REG_CFA_DEF_CFA,
7139 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7140 gen_int_mode (poly_size, Pmode)));
7141 RTX_FRAME_RELATED_P (insn) = 1;
7142 }
7143
7144 return;
7145 }
7146
7147 if (dump_file)
7148 fprintf (dump_file,
7149 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7150 " bytes, probing will be required.\n", size);
7151
7152 /* Round size down to a multiple of guard_size, and calculate the
7153 residual as the difference between the original size and the rounded
7154 size. */
7155 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7156 HOST_WIDE_INT residual = size - rounded_size;
7157
7158 /* We can handle a small number of allocations/probes inline. Otherwise
7159 punt to a loop. */
7160 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7161 {
7162 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7163 {
7164 aarch64_sub_sp (NULL, temp2, guard_size, true);
7165 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7166 guard_used_by_caller));
7167 emit_insn (gen_blockage ());
7168 }
7169 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7170 }
7171 else
7172 {
7173 /* Compute the ending address. */
7174 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7175 temp1, NULL, false, true);
7176 rtx_insn *insn = get_last_insn ();
7177
7178 /* For the initial allocation, we don't have a frame pointer
7179 set up, so we always need CFI notes. If we're doing the
7180 final allocation, then we may have a frame pointer, in which
7181 case it is the CFA, otherwise we need CFI notes.
7182
7183 We can determine which allocation we are doing by looking at
7184 the value of FRAME_RELATED_P since the final allocations are not
7185 frame related. */
7186 if (frame_related_p)
7187 {
7188 /* We want the CFA independent of the stack pointer for the
7189 duration of the loop. */
7190 add_reg_note (insn, REG_CFA_DEF_CFA,
7191 plus_constant (Pmode, temp1, rounded_size));
7192 RTX_FRAME_RELATED_P (insn) = 1;
7193 }
7194
7195 /* This allocates and probes the stack. Note that this re-uses some of
7196 the existing Ada stack protection code. However, we are guaranteed not
7197 to enter the non-loop or residual branches of that code.
7198
7199 The non-loop part won't be entered because if our allocation amount
7200 doesn't require a loop, the case above would handle it.
7201
7202 The residual branch won't be entered because TEMP1 is a multiple of
7203 the allocation size. The residual will always be 0. As such, the only
7204 part we are actually using from that code is the loop setup. The
7205 actual probing is done in aarch64_output_probe_stack_range. */
7206 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7207 stack_pointer_rtx, temp1));
7208
7209 /* Now reset the CFA register if needed. */
7210 if (frame_related_p)
7211 {
7212 add_reg_note (insn, REG_CFA_DEF_CFA,
7213 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7214 RTX_FRAME_RELATED_P (insn) = 1;
7215 }
7216
7217 emit_insn (gen_blockage ());
7218 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7219 }
7220
7221 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7222 be probed. This maintains the requirement that each page is probed at
7223 least once. For initial probing we probe only if the allocation is
7224 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7225 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7226 GUARD_SIZE. This ensures that for any allocation that is large enough to
7227 trigger a probe here, we'll have at least one, and if an allocation is not
7228 large enough for this code to emit anything for it, the page would have been
7229 probed by the saving of FP/LR, either by this function or any callees. If
7230 we don't have any callees then we won't have more stack adjustments and so
7231 are still safe. */
7232 if (residual)
7233 {
7234 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7235 /* If we're doing final adjustments, and we've done any full page
7236 allocations then any residual needs to be probed. */
7237 if (final_adjustment_p && rounded_size != 0)
7238 min_probe_threshold = 0;
7239 /* If doing a small final adjustment, we always probe at offset 0.
7240 This is done to avoid issues when LR is not at position 0 or when
7241 the final adjustment is smaller than the probing offset. */
7242 else if (final_adjustment_p && rounded_size == 0)
7243 residual_probe_offset = 0;
7244
7245 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7246 if (residual >= min_probe_threshold)
7247 {
7248 if (dump_file)
7249 fprintf (dump_file,
7250 "Stack clash AArch64 prologue residuals: "
7251 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7252 "\n", residual);
7253
7254 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7255 residual_probe_offset));
7256 emit_insn (gen_blockage ());
7257 }
7258 }
7259 }
7260
7261 /* Return 1 if the register is used by the epilogue. We need to say the
7262 return register is used, but only after epilogue generation is complete.
7263 Note that in the case of sibcalls, the values "used by the epilogue" are
7264 considered live at the start of the called function.
7265
7266 For SIMD functions we need to return 1 for FP registers that are saved and
7267 restored by a function but are not zero in call_used_regs. If we do not do
7268 this, optimizations may remove the restore of the register. */
7269
7270 int
7271 aarch64_epilogue_uses (int regno)
7272 {
7273 if (epilogue_completed)
7274 {
7275 if (regno == LR_REGNUM)
7276 return 1;
7277 }
7278 return 0;
7279 }
7280
7281 /* AArch64 stack frames generated by this compiler look like:
7282
7283 +-------------------------------+
7284 | |
7285 | incoming stack arguments |
7286 | |
7287 +-------------------------------+
7288 | | <-- incoming stack pointer (aligned)
7289 | callee-allocated save area |
7290 | for register varargs |
7291 | |
7292 +-------------------------------+
7293 | local variables | <-- frame_pointer_rtx
7294 | |
7295 +-------------------------------+
7296 | padding | \
7297 +-------------------------------+ |
7298 | callee-saved registers | | frame.saved_regs_size
7299 +-------------------------------+ |
7300 | LR' | |
7301 +-------------------------------+ |
7302 | FP' | |
7303 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7304 | SVE vector registers | | \
7305 +-------------------------------+ | | below_hard_fp_saved_regs_size
7306 | SVE predicate registers | / /
7307 +-------------------------------+
7308 | dynamic allocation |
7309 +-------------------------------+
7310 | padding |
7311 +-------------------------------+
7312 | outgoing stack arguments | <-- arg_pointer
7313 | |
7314 +-------------------------------+
7315 | | <-- stack_pointer_rtx (aligned)
7316
7317 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7318 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7319 unchanged.
7320
7321 By default for stack-clash we assume the guard is at least 64KB, but this
7322 value is configurable to either 4KB or 64KB. We also force the guard size to
7323 be the same as the probing interval and both values are kept in sync.
7324
7325 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7326 on the guard size) of stack space without probing.
7327
7328 When probing is needed, we emit a probe at the start of the prologue
7329 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7330
7331 We have to track how much space has been allocated and the only stores
7332 to the stack we track as implicit probes are the FP/LR stores.
7333
7334 For outgoing arguments we probe if the size is larger than 1KB, such that
7335 the ABI specified buffer is maintained for the next callee.
7336
7337 The following registers are reserved during frame layout and should not be
7338 used for any other purpose:
7339
7340 - r11: Used by stack clash protection when SVE is enabled, and also
7341 as an anchor register when saving and restoring registers
7342 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7343 - r14 and r15: Used for speculation tracking.
7344 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7345 - r30(LR), r29(FP): Used by standard frame layout.
7346
7347 These registers must be avoided in frame layout related code unless the
7348 explicit intention is to interact with one of the features listed above. */
7349
7350 /* Generate the prologue instructions for entry into a function.
7351 Establish the stack frame by decreasing the stack pointer with a
7352 properly calculated size and, if necessary, create a frame record
7353 filled with the values of LR and previous frame pointer. The
7354 current FP is also set up if it is in use. */
7355
7356 void
7357 aarch64_expand_prologue (void)
7358 {
7359 poly_int64 frame_size = cfun->machine->frame.frame_size;
7360 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7361 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7362 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7363 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7364 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7365 poly_int64 below_hard_fp_saved_regs_size
7366 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7367 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7368 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7369 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7370 rtx_insn *insn;
7371
7372 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7373 {
7374 /* Fold the SVE allocation into the initial allocation.
7375 We don't do this in aarch64_layout_arg to avoid pessimizing
7376 the epilogue code. */
7377 initial_adjust += sve_callee_adjust;
7378 sve_callee_adjust = 0;
7379 }
7380
7381 /* Sign return address for functions. */
7382 if (aarch64_return_address_signing_enabled ())
7383 {
7384 switch (aarch64_ra_sign_key)
7385 {
7386 case AARCH64_KEY_A:
7387 insn = emit_insn (gen_paciasp ());
7388 break;
7389 case AARCH64_KEY_B:
7390 insn = emit_insn (gen_pacibsp ());
7391 break;
7392 default:
7393 gcc_unreachable ();
7394 }
7395 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7396 RTX_FRAME_RELATED_P (insn) = 1;
7397 }
7398
7399 if (flag_stack_usage_info)
7400 current_function_static_stack_size = constant_lower_bound (frame_size);
7401
7402 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7403 {
7404 if (crtl->is_leaf && !cfun->calls_alloca)
7405 {
7406 if (maybe_gt (frame_size, PROBE_INTERVAL)
7407 && maybe_gt (frame_size, get_stack_check_protect ()))
7408 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7409 (frame_size
7410 - get_stack_check_protect ()));
7411 }
7412 else if (maybe_gt (frame_size, 0))
7413 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7414 }
7415
7416 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7417 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7418
7419 /* In theory we should never have both an initial adjustment
7420 and a callee save adjustment. Verify that is the case since the
7421 code below does not handle it for -fstack-clash-protection. */
7422 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7423
7424 /* Will only probe if the initial adjustment is larger than the guard
7425 less the amount of the guard reserved for use by the caller's
7426 outgoing args. */
7427 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7428 true, false);
7429
7430 if (callee_adjust != 0)
7431 aarch64_push_regs (reg1, reg2, callee_adjust);
7432
7433 /* The offset of the frame chain record (if any) from the current SP. */
7434 poly_int64 chain_offset = (initial_adjust + callee_adjust
7435 - cfun->machine->frame.hard_fp_offset);
7436 gcc_assert (known_ge (chain_offset, 0));
7437
7438 /* The offset of the bottom of the save area from the current SP. */
7439 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7440
7441 if (emit_frame_chain)
7442 {
7443 if (callee_adjust == 0)
7444 {
7445 reg1 = R29_REGNUM;
7446 reg2 = R30_REGNUM;
7447 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7448 false, false);
7449 }
7450 else
7451 gcc_assert (known_eq (chain_offset, 0));
7452 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7453 stack_pointer_rtx, chain_offset,
7454 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7455 if (frame_pointer_needed && !frame_size.is_constant ())
7456 {
7457 /* Variable-sized frames need to describe the save slot
7458 address using DW_CFA_expression rather than DW_CFA_offset.
7459 This means that, without taking further action, the
7460 locations of the registers that we've already saved would
7461 remain based on the stack pointer even after we redefine
7462 the CFA based on the frame pointer. We therefore need new
7463 DW_CFA_expressions to re-express the save slots with addresses
7464 based on the frame pointer. */
7465 rtx_insn *insn = get_last_insn ();
7466 gcc_assert (RTX_FRAME_RELATED_P (insn));
7467
7468 /* Add an explicit CFA definition if this was previously
7469 implicit. */
7470 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7471 {
7472 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7473 callee_offset);
7474 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7475 gen_rtx_SET (hard_frame_pointer_rtx, src));
7476 }
7477
7478 /* Change the save slot expressions for the registers that
7479 we've already saved. */
7480 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7481 hard_frame_pointer_rtx, UNITS_PER_WORD);
7482 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7483 hard_frame_pointer_rtx, 0);
7484 }
7485 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7486 }
7487
7488 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7489 callee_adjust != 0 || emit_frame_chain,
7490 emit_frame_chain);
7491 if (maybe_ne (sve_callee_adjust, 0))
7492 {
7493 gcc_assert (!flag_stack_clash_protection
7494 || known_eq (initial_adjust, 0));
7495 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7496 sve_callee_adjust,
7497 !frame_pointer_needed, false);
7498 saved_regs_offset += sve_callee_adjust;
7499 }
7500 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7501 false, emit_frame_chain);
7502 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7503 callee_adjust != 0 || emit_frame_chain,
7504 emit_frame_chain);
7505
7506 /* We may need to probe the final adjustment if it is larger than the guard
7507 that is assumed by the callee. */
7508 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7509 !frame_pointer_needed, true);
7510 }
7511
7512 /* Return TRUE if we can use a simple_return insn.
7513
7514 This function checks whether the callee-saved stack is empty, which
7515 means no restore actions are needed. The pro_and_epilogue pass will use
7516 this to check whether the shrink-wrapping optimization is feasible. */
7517
7518 bool
7519 aarch64_use_return_insn_p (void)
7520 {
7521 if (!reload_completed)
7522 return false;
7523
7524 if (crtl->profile)
7525 return false;
7526
7527 return known_eq (cfun->machine->frame.frame_size, 0);
7528 }
7529
7530 /* Generate the epilogue instructions for returning from a function.
7531 This is almost exactly the reverse of the prolog sequence, except
7532 that we need to insert barriers to avoid scheduling loads that read
7533 from a deallocated stack, and we optimize the unwind records by
7534 emitting them all together if possible. */
7535 void
7536 aarch64_expand_epilogue (bool for_sibcall)
7537 {
7538 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7539 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7540 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7541 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7542 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7543 poly_int64 below_hard_fp_saved_regs_size
7544 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7545 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7546 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7547 rtx cfi_ops = NULL;
7548 rtx_insn *insn;
7549 /* A stack clash protection prologue may not have left EP0_REGNUM or
7550 EP1_REGNUM in a usable state. The same is true for allocations
7551 with an SVE component, since we then need both temporary registers
7552 for each allocation. For stack clash we are in a usable state if
7553 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7554 HOST_WIDE_INT guard_size
7555 = 1 << param_stack_clash_protection_guard_size;
7556 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7557
7558 /* We can re-use the registers when:
7559
7560 (a) the deallocation amount is the same as the corresponding
7561 allocation amount (which is false if we combine the initial
7562 and SVE callee save allocations in the prologue); and
7563
7564 (b) the allocation amount doesn't need a probe (which is false
7565 if the amount is guard_size - guard_used_by_caller or greater).
7566
7567 In such situations the register should remain live with the correct
7568 value. */
7569 bool can_inherit_p = (initial_adjust.is_constant ()
7570 && final_adjust.is_constant ()
7571 && (!flag_stack_clash_protection
7572 || (known_lt (initial_adjust,
7573 guard_size - guard_used_by_caller)
7574 && known_eq (sve_callee_adjust, 0))));
7575
7576 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
7577 bool need_barrier_p
7578 = maybe_ne (get_frame_size ()
7579 + cfun->machine->frame.saved_varargs_size, 0);
7580
7581 /* Emit a barrier to prevent loads from a deallocated stack. */
7582 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7583 || cfun->calls_alloca
7584 || crtl->calls_eh_return)
7585 {
7586 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7587 need_barrier_p = false;
7588 }
7589
7590 /* Restore the stack pointer from the frame pointer if it may not
7591 be the same as the stack pointer. */
7592 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7593 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7594 if (frame_pointer_needed
7595 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7596 /* If writeback is used when restoring callee-saves, the CFA
7597 is restored on the instruction doing the writeback. */
7598 aarch64_add_offset (Pmode, stack_pointer_rtx,
7599 hard_frame_pointer_rtx,
7600 -callee_offset - below_hard_fp_saved_regs_size,
7601 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7602 else
7603 /* The case where we need to re-use the register here is very rare, so
7604 avoid the complicated condition and just always emit a move if the
7605 immediate doesn't fit. */
7606 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7607
7608 /* Restore the vector registers before the predicate registers,
7609 so that we can use P4 as a temporary for big-endian SVE frames. */
7610 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7611 callee_adjust != 0, &cfi_ops);
7612 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7613 false, &cfi_ops);
7614 if (maybe_ne (sve_callee_adjust, 0))
7615 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7616 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7617 R0_REGNUM, R30_REGNUM,
7618 callee_adjust != 0, &cfi_ops);
7619
7620 if (need_barrier_p)
7621 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7622
7623 if (callee_adjust != 0)
7624 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7625
7626 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7627 {
7628 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7629 insn = get_last_insn ();
7630 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7631 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7632 RTX_FRAME_RELATED_P (insn) = 1;
7633 cfi_ops = NULL;
7634 }
7635
7636 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7637 restrict the emit_move optimization to leaf functions. */
7638 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7639 (!can_inherit_p || !crtl->is_leaf
7640 || df_regs_ever_live_p (EP0_REGNUM)));
7641
7642 if (cfi_ops)
7643 {
7644 /* Emit delayed restores and reset the CFA to be SP. */
7645 insn = get_last_insn ();
7646 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7647 REG_NOTES (insn) = cfi_ops;
7648 RTX_FRAME_RELATED_P (insn) = 1;
7649 }
7650
7651 /* We prefer to emit the combined return/authenticate instruction RETAA;
7652 however, there are three cases in which we must instead emit an explicit
7653 authentication instruction.
7654
7655 1) Sibcalls don't return in a normal way, so if we're about to call one
7656 we must authenticate.
7657
7658 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7659 generating code for !TARGET_ARMV8_3 we can't use it and must
7660 explicitly authenticate.
7661
7662 3) On an eh_return path we make extra stack adjustments to update the
7663 canonical frame address to be the exception handler's CFA. We want
7664 to authenticate using the CFA of the function which calls eh_return.
7665 */
7666 if (aarch64_return_address_signing_enabled ()
7667 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7668 {
7669 switch (aarch64_ra_sign_key)
7670 {
7671 case AARCH64_KEY_A:
7672 insn = emit_insn (gen_autiasp ());
7673 break;
7674 case AARCH64_KEY_B:
7675 insn = emit_insn (gen_autibsp ());
7676 break;
7677 default:
7678 gcc_unreachable ();
7679 }
7680 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7681 RTX_FRAME_RELATED_P (insn) = 1;
7682 }
7683
7684 /* Stack adjustment for exception handler. */
7685 if (crtl->calls_eh_return && !for_sibcall)
7686 {
7687 /* We need to unwind the stack by the offset computed by
7688 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7689 to be SP; letting the CFA move during this adjustment
7690 is just as correct as retaining the CFA from the body
7691 of the function. Therefore, do nothing special. */
7692 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7693 }
7694
7695 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7696 if (!for_sibcall)
7697 emit_jump_insn (ret_rtx);
7698 }
7699
7700 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7701 normally or return to a previous frame after unwinding.
7702
7703 An EH return uses a single shared return sequence. The epilogue is
7704 exactly like a normal epilogue except that it has an extra input
7705 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7706 that must be applied after the frame has been destroyed. An extra label
7707 is inserted before the epilogue which initializes this register to zero,
7708 and this is the entry point for a normal return.
7709
7710 An actual EH return updates the return address, initializes the stack
7711 adjustment and jumps directly into the epilogue (bypassing the zeroing
7712 of the adjustment). Since the return address is typically saved on the
7713 stack when a function makes a call, the saved LR must be updated outside
7714 the epilogue.
7715
7716 This poses problems as the store is generated well before the epilogue,
7717 so the offset of LR is not known yet. Also optimizations will remove the
7718 store as it appears dead, even after the epilogue is generated (as the
7719 base or offset for loading LR is different in many cases).
7720
7721 To avoid these problems this implementation forces the frame pointer
7722 in eh_return functions so that the location of LR is fixed and known early.
7723 It also marks the store volatile, so no optimization is permitted to
7724 remove the store. */
7725 rtx
7726 aarch64_eh_return_handler_rtx (void)
7727 {
7728 rtx tmp = gen_frame_mem (Pmode,
7729 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7730
7731 /* Mark the store volatile, so no optimization is permitted to remove it. */
7732 MEM_VOLATILE_P (tmp) = true;
7733 return tmp;
7734 }
7735
7736 /* Output code to add DELTA to the first argument, and then jump
7737 to FUNCTION. Used for C++ multiple inheritance. */
7738 static void
7739 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7740 HOST_WIDE_INT delta,
7741 HOST_WIDE_INT vcall_offset,
7742 tree function)
7743 {
7744 /* The this pointer is always in x0. Note that this differs from
7745 Arm where the this pointer may be bumped to r1 if r0 is required
7746 to return a pointer to an aggregate. On AArch64 a result value
7747 pointer will be in x8. */
7748 int this_regno = R0_REGNUM;
7749 rtx this_rtx, temp0, temp1, addr, funexp;
7750 rtx_insn *insn;
7751 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7752
7753 if (aarch64_bti_enabled ())
7754 emit_insn (gen_bti_c());
7755
7756 reload_completed = 1;
7757 emit_note (NOTE_INSN_PROLOGUE_END);
7758
7759 this_rtx = gen_rtx_REG (Pmode, this_regno);
7760 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7761 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7762
7763 if (vcall_offset == 0)
7764 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7765 else
7766 {
7767 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7768
7769 addr = this_rtx;
7770 if (delta != 0)
7771 {
7772 if (delta >= -256 && delta < 256)
7773 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7774 plus_constant (Pmode, this_rtx, delta));
7775 else
7776 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7777 temp1, temp0, false);
7778 }
7779
7780 if (Pmode == ptr_mode)
7781 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7782 else
7783 aarch64_emit_move (temp0,
7784 gen_rtx_ZERO_EXTEND (Pmode,
7785 gen_rtx_MEM (ptr_mode, addr)));
7786
7787 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7788 addr = plus_constant (Pmode, temp0, vcall_offset);
7789 else
7790 {
7791 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7792 Pmode);
7793 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7794 }
7795
7796 if (Pmode == ptr_mode)
7797 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
7798 else
7799 aarch64_emit_move (temp1,
7800 gen_rtx_SIGN_EXTEND (Pmode,
7801 gen_rtx_MEM (ptr_mode, addr)));
7802
7803 emit_insn (gen_add2_insn (this_rtx, temp1));
7804 }
7805
7806 /* Generate a tail call to the target function. */
7807 if (!TREE_USED (function))
7808 {
7809 assemble_external (function);
7810 TREE_USED (function) = 1;
7811 }
7812 funexp = XEXP (DECL_RTL (function), 0);
7813 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7814 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7815 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7816 SIBLING_CALL_P (insn) = 1;
7817
7818 insn = get_insns ();
7819 shorten_branches (insn);
7820
7821 assemble_start_function (thunk, fnname);
7822 final_start_function (insn, file, 1);
7823 final (insn, file, 1);
7824 final_end_function ();
7825 assemble_end_function (thunk, fnname);
7826
7827 /* Stop pretending to be a post-reload pass. */
7828 reload_completed = 0;
7829 }
7830
7831 static bool
7832 aarch64_tls_referenced_p (rtx x)
7833 {
7834 if (!TARGET_HAVE_TLS)
7835 return false;
7836 subrtx_iterator::array_type array;
7837 FOR_EACH_SUBRTX (iter, array, x, ALL)
7838 {
7839 const_rtx x = *iter;
7840 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7841 return true;
7842 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7843 TLS offsets, not real symbol references. */
7844 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7845 iter.skip_subrtxes ();
7846 }
7847 return false;
7848 }
7849
7850
7851 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7852 a left shift of 0 or 12 bits. */
7853 bool
7854 aarch64_uimm12_shift (HOST_WIDE_INT val)
7855 {
7856 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7857 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7858 );
7859 }
7860
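/* For instance, 0xabc (a 12-bit value, shift 0) and 0xabc000 (0xabc << 12)
both satisfy aarch64_uimm12_shift, whereas 0xabc001 does not, since its
set bits do not fit entirely within either 12-bit field. */
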
7861 /* Return the closest value to VAL that does not exceed it and can be used
7862 as a 12-bit unsigned immediate with a left shift of 0 or 12. */
7863 static HOST_WIDE_INT
7864 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7865 {
7866 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7867 handle correctly. */
7868 gcc_assert ((val & 0xffffff) == val);
7869
7870 if (((val & 0xfff) << 0) == val)
7871 return val;
7872
7873 return val & (0xfff << 12);
7874 }
7875
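/* For example, aarch64_clamp_to_uimm12_shift (0x123456) yields 0x123000:
the low 12 bits are dropped so that the result can be encoded as
0x123 << 12, and the result never exceeds the original value. */
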
7876 /* Return true if val is an immediate that can be loaded into a
7877 register by a MOVZ instruction. */
7878 static bool
7879 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7880 {
7881 if (GET_MODE_SIZE (mode) > 4)
7882 {
7883 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7884 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7885 return 1;
7886 }
7887 else
7888 {
7889 /* Ignore sign extension. */
7890 val &= (HOST_WIDE_INT) 0xffffffff;
7891 }
7892 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7893 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7894 }
7895
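/* For example, in DImode 0xbeef0000 (0xbeef << 16) and 0xdead00000000
(0xdead << 32) can each be materialized by a single MOVZ, so
aarch64_movw_imm returns true for them, while 0xdeadbeef needs more than
one 16-bit chunk and so returns false. */
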
7896 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7897 64-bit (DImode) integer. */
7898
7899 static unsigned HOST_WIDE_INT
7900 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7901 {
7902 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7903 while (size < 64)
7904 {
7905 val &= (HOST_WIDE_INT_1U << size) - 1;
7906 val |= val << size;
7907 size *= 2;
7908 }
7909 return val;
7910 }
7911
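/* For instance, replicating the SImode value 0x0000ff00 gives
0x0000ff000000ff00, and replicating the HImode value 0x00f0 gives
0x00f000f000f000f0; the result always fills all 64 bits. */
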
7912 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7913
7914 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7915 {
7916 0x0000000100000001ull,
7917 0x0001000100010001ull,
7918 0x0101010101010101ull,
7919 0x1111111111111111ull,
7920 0x5555555555555555ull,
7921 };
7922
7923
7924 /* Return true if val is a valid bitmask immediate. */
7925
7926 bool
7927 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7928 {
7929 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7930 int bits;
7931
7932 /* Check for a single sequence of one bits and return quickly if so.
7933 The special cases of all ones and all zeroes return false. */
7934 val = aarch64_replicate_bitmask_imm (val_in, mode);
7935 tmp = val + (val & -val);
7936
7937 if (tmp == (tmp & -tmp))
7938 return (val + 1) > 1;
7939
7940 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7941 if (mode == SImode)
7942 val = (val << 32) | (val & 0xffffffff);
7943
7944 /* Invert if the immediate doesn't start with a zero bit - this means we
7945 only need to search for sequences of one bits. */
7946 if (val & 1)
7947 val = ~val;
7948
7949 /* Find the first set bit and set tmp to val with the first sequence of one
7950 bits removed. Return success if there is a single sequence of ones. */
7951 first_one = val & -val;
7952 tmp = val & (val + first_one);
7953
7954 if (tmp == 0)
7955 return true;
7956
7957 /* Find the next set bit and compute the difference in bit position. */
7958 next_one = tmp & -tmp;
7959 bits = clz_hwi (first_one) - clz_hwi (next_one);
7960 mask = val ^ tmp;
7961
7962 /* Check the bit position difference is a power of 2, and that the first
7963 sequence of one bits fits within 'bits' bits. */
7964 if ((mask >> bits) != 0 || bits != (bits & -bits))
7965 return false;
7966
7967 /* Check the sequence of one bits is repeated 64/bits times. */
7968 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7969 }
7970
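/* For example, 0x0000ffff0000ffff (the 32-bit element 0x0000ffff repeated)
and 0x00000000003ffff0 (one contiguous run of ones) are valid bitmask
immediates, whereas 0, ~0 and a value such as 0x0000000000ff00f1 (whose
set bits form neither a single, possibly rotated, run of ones nor a
repeating power-of-two-sized element) are rejected. */
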
7971 /* Create a mask of ones covering the range from the lowest to the highest
7972 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7973
7974 unsigned HOST_WIDE_INT
7975 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7976 {
7977 int lowest_bit_set = ctz_hwi (val_in);
7978 int highest_bit_set = floor_log2 (val_in);
7979 gcc_assert (val_in != 0);
7980
7981 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7982 (HOST_WIDE_INT_1U << lowest_bit_set));
7983 }
7984
7985 /* Create a constant in which all bits outside the range from the lowest to
7986 the highest bit set in VAL_IN are set to 1. */
7987
7988 unsigned HOST_WIDE_INT
7989 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7990 {
7991 return val_in | ~aarch64_and_split_imm1 (val_in);
7992 }
7993
7994 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7995
7996 bool
7997 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7998 {
7999 scalar_int_mode int_mode;
8000 if (!is_a <scalar_int_mode> (mode, &int_mode))
8001 return false;
8002
8003 if (aarch64_bitmask_imm (val_in, int_mode))
8004 return false;
8005
8006 if (aarch64_move_imm (val_in, int_mode))
8007 return false;
8008
8009 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8010
8011 return aarch64_bitmask_imm (imm2, int_mode);
8012 }
8013
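/* A worked example of the split implemented by the helpers above, in DImode:
for the AND mask 0x0000000000fff00f, aarch64_and_split_imm1 gives
0x0000000000ffffff (ones from the lowest to the highest set bit) and
aarch64_and_split_imm2 gives 0xfffffffffffff00f (the original bits plus
everything outside that range). Both halves are encodable as single AND
immediates and their intersection is the original mask, so
aarch64_and_bitmask_imm accepts 0x0000000000fff00f even though it is
neither a bitmask immediate nor a MOV immediate itself. */
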
8014 /* Return true if val is an immediate that can be loaded into a
8015 register in a single instruction. */
8016 bool
8017 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8018 {
8019 scalar_int_mode int_mode;
8020 if (!is_a <scalar_int_mode> (mode, &int_mode))
8021 return false;
8022
8023 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8024 return 1;
8025 return aarch64_bitmask_imm (val, int_mode);
8026 }
8027
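/* For instance, in DImode 0x0000000000005000 (a single MOVZ),
0xffffffffffff1234 (MOVN of 0xedcb) and 0x0000ffff0000ffff (a bitmask
immediate, typically materialized with ORR from XZR) can each be loaded
in one instruction, so aarch64_move_imm returns true for them. */
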
8028 static bool
8029 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8030 {
8031 rtx base, offset;
8032
8033 if (GET_CODE (x) == HIGH)
8034 return true;
8035
8036 /* There's no way to calculate VL-based values using relocations. */
8037 subrtx_iterator::array_type array;
8038 FOR_EACH_SUBRTX (iter, array, x, ALL)
8039 if (GET_CODE (*iter) == CONST_POLY_INT)
8040 return true;
8041
8042 split_const (x, &base, &offset);
8043 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8044 {
8045 if (aarch64_classify_symbol (base, INTVAL (offset))
8046 != SYMBOL_FORCE_TO_MEM)
8047 return true;
8048 else
8049 /* Avoid generating a 64-bit relocation in ILP32; leave it
8050 to aarch64_expand_mov_immediate to handle properly. */
8051 return mode != ptr_mode;
8052 }
8053
8054 return aarch64_tls_referenced_p (x);
8055 }
8056
8057 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8058 The expansion for a table switch is quite expensive due to the number
8059 of instructions, the table lookup and the hard-to-predict indirect jump.
8060 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8061 set, otherwise use tables for > 16 cases as a tradeoff between size and
8062 performance. When optimizing for size, use the default setting. */
8063
8064 static unsigned int
8065 aarch64_case_values_threshold (void)
8066 {
8067 /* Use the specified limit for the number of cases before using jump
8068 tables at higher optimization levels. */
8069 if (optimize > 2
8070 && selected_cpu->tune->max_case_values != 0)
8071 return selected_cpu->tune->max_case_values;
8072 else
8073 return optimize_size ? default_case_values_threshold () : 17;
8074 }
8075
8076 /* Return true if register REGNO is a valid index register.
8077 STRICT_P is true if REG_OK_STRICT is in effect. */
8078
8079 bool
8080 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8081 {
8082 if (!HARD_REGISTER_NUM_P (regno))
8083 {
8084 if (!strict_p)
8085 return true;
8086
8087 if (!reg_renumber)
8088 return false;
8089
8090 regno = reg_renumber[regno];
8091 }
8092 return GP_REGNUM_P (regno);
8093 }
8094
8095 /* Return true if register REGNO is a valid base register.
8096 STRICT_P is true if REG_OK_STRICT is in effect. */
8097
8098 bool
8099 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8100 {
8101 if (!HARD_REGISTER_NUM_P (regno))
8102 {
8103 if (!strict_p)
8104 return true;
8105
8106 if (!reg_renumber)
8107 return false;
8108
8109 regno = reg_renumber[regno];
8110 }
8111
8112 /* The fake registers will be eliminated to either the stack or
8113 hard frame pointer, both of which are usually valid base registers.
8114 Reload deals with the cases where the eliminated form isn't valid. */
8115 return (GP_REGNUM_P (regno)
8116 || regno == SP_REGNUM
8117 || regno == FRAME_POINTER_REGNUM
8118 || regno == ARG_POINTER_REGNUM);
8119 }
8120
8121 /* Return true if X is a valid base register.
8122 STRICT_P is true if REG_OK_STRICT is in effect. */
8123
8124 static bool
8125 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8126 {
8127 if (!strict_p
8128 && GET_CODE (x) == SUBREG
8129 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8130 x = SUBREG_REG (x);
8131
8132 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8133 }
8134
8135 /* Return true if address offset is a valid index. If it is, fill in INFO
8136 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8137
8138 static bool
8139 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8140 machine_mode mode, bool strict_p)
8141 {
8142 enum aarch64_address_type type;
8143 rtx index;
8144 int shift;
8145
8146 /* (reg:P) */
8147 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8148 && GET_MODE (x) == Pmode)
8149 {
8150 type = ADDRESS_REG_REG;
8151 index = x;
8152 shift = 0;
8153 }
8154 /* (sign_extend:DI (reg:SI)) */
8155 else if ((GET_CODE (x) == SIGN_EXTEND
8156 || GET_CODE (x) == ZERO_EXTEND)
8157 && GET_MODE (x) == DImode
8158 && GET_MODE (XEXP (x, 0)) == SImode)
8159 {
8160 type = (GET_CODE (x) == SIGN_EXTEND)
8161 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8162 index = XEXP (x, 0);
8163 shift = 0;
8164 }
8165 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8166 else if (GET_CODE (x) == MULT
8167 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8168 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8169 && GET_MODE (XEXP (x, 0)) == DImode
8170 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8171 && CONST_INT_P (XEXP (x, 1)))
8172 {
8173 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8174 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8175 index = XEXP (XEXP (x, 0), 0);
8176 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8177 }
8178 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8179 else if (GET_CODE (x) == ASHIFT
8180 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8181 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8182 && GET_MODE (XEXP (x, 0)) == DImode
8183 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8184 && CONST_INT_P (XEXP (x, 1)))
8185 {
8186 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8187 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8188 index = XEXP (XEXP (x, 0), 0);
8189 shift = INTVAL (XEXP (x, 1));
8190 }
8191 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8192 else if ((GET_CODE (x) == SIGN_EXTRACT
8193 || GET_CODE (x) == ZERO_EXTRACT)
8194 && GET_MODE (x) == DImode
8195 && GET_CODE (XEXP (x, 0)) == MULT
8196 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8197 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8198 {
8199 type = (GET_CODE (x) == SIGN_EXTRACT)
8200 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8201 index = XEXP (XEXP (x, 0), 0);
8202 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8203 if (INTVAL (XEXP (x, 1)) != 32 + shift
8204 || INTVAL (XEXP (x, 2)) != 0)
8205 shift = -1;
8206 }
8207 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8208 (const_int 0xffffffff<<shift)) */
8209 else if (GET_CODE (x) == AND
8210 && GET_MODE (x) == DImode
8211 && GET_CODE (XEXP (x, 0)) == MULT
8212 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8213 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8214 && CONST_INT_P (XEXP (x, 1)))
8215 {
8216 type = ADDRESS_REG_UXTW;
8217 index = XEXP (XEXP (x, 0), 0);
8218 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8219 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8220 shift = -1;
8221 }
8222 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8223 else if ((GET_CODE (x) == SIGN_EXTRACT
8224 || GET_CODE (x) == ZERO_EXTRACT)
8225 && GET_MODE (x) == DImode
8226 && GET_CODE (XEXP (x, 0)) == ASHIFT
8227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8228 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8229 {
8230 type = (GET_CODE (x) == SIGN_EXTRACT)
8231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8232 index = XEXP (XEXP (x, 0), 0);
8233 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8234 if (INTVAL (XEXP (x, 1)) != 32 + shift
8235 || INTVAL (XEXP (x, 2)) != 0)
8236 shift = -1;
8237 }
8238 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8239 (const_int 0xffffffff<<shift)) */
8240 else if (GET_CODE (x) == AND
8241 && GET_MODE (x) == DImode
8242 && GET_CODE (XEXP (x, 0)) == ASHIFT
8243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8244 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8245 && CONST_INT_P (XEXP (x, 1)))
8246 {
8247 type = ADDRESS_REG_UXTW;
8248 index = XEXP (XEXP (x, 0), 0);
8249 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8250 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8251 shift = -1;
8252 }
8253 /* (mult:P (reg:P) (const_int scale)) */
8254 else if (GET_CODE (x) == MULT
8255 && GET_MODE (x) == Pmode
8256 && GET_MODE (XEXP (x, 0)) == Pmode
8257 && CONST_INT_P (XEXP (x, 1)))
8258 {
8259 type = ADDRESS_REG_REG;
8260 index = XEXP (x, 0);
8261 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8262 }
8263 /* (ashift:P (reg:P) (const_int shift)) */
8264 else if (GET_CODE (x) == ASHIFT
8265 && GET_MODE (x) == Pmode
8266 && GET_MODE (XEXP (x, 0)) == Pmode
8267 && CONST_INT_P (XEXP (x, 1)))
8268 {
8269 type = ADDRESS_REG_REG;
8270 index = XEXP (x, 0);
8271 shift = INTVAL (XEXP (x, 1));
8272 }
8273 else
8274 return false;
8275
8276 if (!strict_p
8277 && GET_CODE (index) == SUBREG
8278 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8279 index = SUBREG_REG (index);
8280
8281 if (aarch64_sve_data_mode_p (mode))
8282 {
8283 if (type != ADDRESS_REG_REG
8284 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8285 return false;
8286 }
8287 else
8288 {
8289 if (shift != 0
8290 && !(IN_RANGE (shift, 1, 3)
8291 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8292 return false;
8293 }
8294
8295 if (REG_P (index)
8296 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8297 {
8298 info->type = type;
8299 info->offset = index;
8300 info->shift = shift;
8301 return true;
8302 }
8303
8304 return false;
8305 }
8306
8307 /* Return true if MODE is one of the modes for which we
8308 support LDP/STP operations. */
8309
8310 static bool
8311 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8312 {
8313 return mode == SImode || mode == DImode
8314 || mode == SFmode || mode == DFmode
8315 || (aarch64_vector_mode_supported_p (mode)
8316 && (known_eq (GET_MODE_SIZE (mode), 8)
8317 || (known_eq (GET_MODE_SIZE (mode), 16)
8318 && (aarch64_tune_params.extra_tuning_flags
8319 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8320 }
8321
8322 /* Return true if REGNO is a virtual pointer register, or an eliminable
8323 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8324 include stack_pointer or hard_frame_pointer. */
8325 static bool
8326 virt_or_elim_regno_p (unsigned regno)
8327 {
8328 return ((regno >= FIRST_VIRTUAL_REGISTER
8329 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8330 || regno == FRAME_POINTER_REGNUM
8331 || regno == ARG_POINTER_REGNUM);
8332 }
8333
8334 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8335 If it is, fill in INFO appropriately. STRICT_P is true if
8336 REG_OK_STRICT is in effect. */
8337
8338 bool
8339 aarch64_classify_address (struct aarch64_address_info *info,
8340 rtx x, machine_mode mode, bool strict_p,
8341 aarch64_addr_query_type type)
8342 {
8343 enum rtx_code code = GET_CODE (x);
8344 rtx op0, op1;
8345 poly_int64 offset;
8346
8347 HOST_WIDE_INT const_size;
8348
8349 /* Whether a vector mode is partial doesn't affect address legitimacy.
8350 Partial vectors like VNx8QImode allow the same indexed addressing
8351 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8352 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8353 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8354 vec_flags &= ~VEC_PARTIAL;
8355
8356 /* On BE, we use load/store pair for all large int mode load/stores.
8357 TI/TFmode may also use a load/store pair. */
8358 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8359 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8360 || type == ADDR_QUERY_LDP_STP_N
8361 || mode == TImode
8362 || mode == TFmode
8363 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8364
8365 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8366 corresponds to the actual size of the memory being loaded/stored and the
8367 mode used for the addressing calculation is half of that. */
8368 if (type == ADDR_QUERY_LDP_STP_N
8369 && known_eq (GET_MODE_SIZE (mode), 16))
8370 mode = DFmode;
8371
8372 bool allow_reg_index_p = (!load_store_pair_p
8373 && (known_lt (GET_MODE_SIZE (mode), 16)
8374 || vec_flags == VEC_ADVSIMD
8375 || vec_flags & VEC_SVE_DATA));
8376
8377 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8378 [Rn, #offset, MUL VL]. */
8379 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8380 && (code != REG && code != PLUS))
8381 return false;
8382
8383 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8384 REG addressing. */
8385 if (advsimd_struct_p
8386 && !BYTES_BIG_ENDIAN
8387 && (code != POST_INC && code != REG))
8388 return false;
8389
8390 gcc_checking_assert (GET_MODE (x) == VOIDmode
8391 || SCALAR_INT_MODE_P (GET_MODE (x)));
8392
8393 switch (code)
8394 {
8395 case REG:
8396 case SUBREG:
8397 info->type = ADDRESS_REG_IMM;
8398 info->base = x;
8399 info->offset = const0_rtx;
8400 info->const_offset = 0;
8401 return aarch64_base_register_rtx_p (x, strict_p);
8402
8403 case PLUS:
8404 op0 = XEXP (x, 0);
8405 op1 = XEXP (x, 1);
8406
8407 if (! strict_p
8408 && REG_P (op0)
8409 && virt_or_elim_regno_p (REGNO (op0))
8410 && poly_int_rtx_p (op1, &offset))
8411 {
8412 info->type = ADDRESS_REG_IMM;
8413 info->base = op0;
8414 info->offset = op1;
8415 info->const_offset = offset;
8416
8417 return true;
8418 }
8419
8420 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8421 && aarch64_base_register_rtx_p (op0, strict_p)
8422 && poly_int_rtx_p (op1, &offset))
8423 {
8424 info->type = ADDRESS_REG_IMM;
8425 info->base = op0;
8426 info->offset = op1;
8427 info->const_offset = offset;
8428
8429 /* TImode and TFmode values are allowed in both pairs of X
8430 registers and individual Q registers. The available
8431 address modes are:
8432 X,X: 7-bit signed scaled offset
8433 Q: 9-bit signed offset
8434 We conservatively require an offset representable in both forms.
8435 When performing the check for pairs of X registers i.e. LDP/STP
8436 pass down DImode since that is the natural size of the LDP/STP
8437 instruction memory accesses. */
8438 if (mode == TImode || mode == TFmode)
8439 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8440 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8441 || offset_12bit_unsigned_scaled_p (mode, offset)));
8442
8443 /* A 7-bit offset check because OImode will emit an ldp/stp
8444 instruction (only big endian will get here).
8445 For ldp/stp instructions, the offset is scaled for the size of a
8446 single element of the pair. */
8447 if (mode == OImode)
8448 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8449
8450 /* Three 9/12-bit offset checks because CImode will emit three
8451 ldr/str instructions (only big endian will get here). */
8452 if (mode == CImode)
8453 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8454 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8455 offset + 32)
8456 || offset_12bit_unsigned_scaled_p (V16QImode,
8457 offset + 32)));
8458
8459 /* Two 7-bit offset checks because XImode will emit two ldp/stp
8460 instructions (only big endian will get here). */
8461 if (mode == XImode)
8462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8463 && aarch64_offset_7bit_signed_scaled_p (TImode,
8464 offset + 32));
8465
8466 /* Make "m" use the LD1 offset range for SVE data modes, so
8467 that pre-RTL optimizers like ivopts optimize for that range
8468 instead of the wider LDR/STR range. */
8469 if (vec_flags == VEC_SVE_DATA)
8470 return (type == ADDR_QUERY_M
8471 ? offset_4bit_signed_scaled_p (mode, offset)
8472 : offset_9bit_signed_scaled_p (mode, offset));
8473
8474 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8475 {
8476 poly_int64 end_offset = (offset
8477 + GET_MODE_SIZE (mode)
8478 - BYTES_PER_SVE_VECTOR);
8479 return (type == ADDR_QUERY_M
8480 ? offset_4bit_signed_scaled_p (mode, offset)
8481 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8482 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8483 end_offset)));
8484 }
8485
8486 if (vec_flags == VEC_SVE_PRED)
8487 return offset_9bit_signed_scaled_p (mode, offset);
8488
8489 if (load_store_pair_p)
8490 return ((known_eq (GET_MODE_SIZE (mode), 4)
8491 || known_eq (GET_MODE_SIZE (mode), 8)
8492 || known_eq (GET_MODE_SIZE (mode), 16))
8493 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8494 else
8495 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8496 || offset_12bit_unsigned_scaled_p (mode, offset));
8497 }
8498
8499 if (allow_reg_index_p)
8500 {
8501 /* Look for base + (scaled/extended) index register. */
8502 if (aarch64_base_register_rtx_p (op0, strict_p)
8503 && aarch64_classify_index (info, op1, mode, strict_p))
8504 {
8505 info->base = op0;
8506 return true;
8507 }
8508 if (aarch64_base_register_rtx_p (op1, strict_p)
8509 && aarch64_classify_index (info, op0, mode, strict_p))
8510 {
8511 info->base = op1;
8512 return true;
8513 }
8514 }
8515
8516 return false;
8517
8518 case POST_INC:
8519 case POST_DEC:
8520 case PRE_INC:
8521 case PRE_DEC:
8522 info->type = ADDRESS_REG_WB;
8523 info->base = XEXP (x, 0);
8524 info->offset = NULL_RTX;
8525 return aarch64_base_register_rtx_p (info->base, strict_p);
8526
8527 case POST_MODIFY:
8528 case PRE_MODIFY:
8529 info->type = ADDRESS_REG_WB;
8530 info->base = XEXP (x, 0);
8531 if (GET_CODE (XEXP (x, 1)) == PLUS
8532 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8533 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8534 && aarch64_base_register_rtx_p (info->base, strict_p))
8535 {
8536 info->offset = XEXP (XEXP (x, 1), 1);
8537 info->const_offset = offset;
8538
8539 /* TImode and TFmode values are allowed in both pairs of X
8540 registers and individual Q registers. The available
8541 address modes are:
8542 X,X: 7-bit signed scaled offset
8543 Q: 9-bit signed offset
8544 We conservatively require an offset representable in both forms.
8545 */
8546 if (mode == TImode || mode == TFmode)
8547 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8548 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8549
8550 if (load_store_pair_p)
8551 return ((known_eq (GET_MODE_SIZE (mode), 4)
8552 || known_eq (GET_MODE_SIZE (mode), 8)
8553 || known_eq (GET_MODE_SIZE (mode), 16))
8554 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8555 else
8556 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8557 }
8558 return false;
8559
8560 case CONST:
8561 case SYMBOL_REF:
8562 case LABEL_REF:
8563 /* Load literal: PC-relative constant pool entry. Only supported
8564 for SI mode or larger. */
8565 info->type = ADDRESS_SYMBOLIC;
8566
8567 if (!load_store_pair_p
8568 && GET_MODE_SIZE (mode).is_constant (&const_size)
8569 && const_size >= 4)
8570 {
8571 rtx sym, addend;
8572
8573 split_const (x, &sym, &addend);
8574 return ((GET_CODE (sym) == LABEL_REF
8575 || (GET_CODE (sym) == SYMBOL_REF
8576 && CONSTANT_POOL_ADDRESS_P (sym)
8577 && aarch64_pcrelative_literal_loads)));
8578 }
8579 return false;
8580
8581 case LO_SUM:
8582 info->type = ADDRESS_LO_SUM;
8583 info->base = XEXP (x, 0);
8584 info->offset = XEXP (x, 1);
8585 if (allow_reg_index_p
8586 && aarch64_base_register_rtx_p (info->base, strict_p))
8587 {
8588 rtx sym, offs;
8589 split_const (info->offset, &sym, &offs);
8590 if (GET_CODE (sym) == SYMBOL_REF
8591 && (aarch64_classify_symbol (sym, INTVAL (offs))
8592 == SYMBOL_SMALL_ABSOLUTE))
8593 {
8594 /* The symbol and offset must be aligned to the access size. */
8595 unsigned int align;
8596
8597 if (CONSTANT_POOL_ADDRESS_P (sym))
8598 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8599 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8600 {
8601 tree exp = SYMBOL_REF_DECL (sym);
8602 align = TYPE_ALIGN (TREE_TYPE (exp));
8603 align = aarch64_constant_alignment (exp, align);
8604 }
8605 else if (SYMBOL_REF_DECL (sym))
8606 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8607 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8608 && SYMBOL_REF_BLOCK (sym) != NULL)
8609 align = SYMBOL_REF_BLOCK (sym)->alignment;
8610 else
8611 align = BITS_PER_UNIT;
8612
8613 poly_int64 ref_size = GET_MODE_SIZE (mode);
8614 if (known_eq (ref_size, 0))
8615 ref_size = GET_MODE_SIZE (DImode);
8616
8617 return (multiple_p (INTVAL (offs), ref_size)
8618 && multiple_p (align / BITS_PER_UNIT, ref_size));
8619 }
8620 }
8621 return false;
8622
8623 default:
8624 return false;
8625 }
8626 }
8627
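/* Editorial illustration (not part of the original sources): for a DImode
   access the classifier above accepts, for example,
     (reg:DI x0)                                     -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0) (const_int 256))           -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0)
              (mult:DI (reg:DI x1) (const_int 8)))   -> ADDRESS_REG_REG, shift 3
     (post_inc:DI (reg:DI x0))                       -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "sym"))      -> ADDRESS_LO_SUM
   (the LO_SUM case assuming a suitably aligned small-model symbol).
   Register-index forms are rejected for LDP/STP queries because
   allow_reg_index_p is false there.  */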
8628 /* Return true if the address X is valid for a PRFM instruction.
8629 STRICT_P is true if we should do strict checking with
8630 aarch64_classify_address. */
8631
8632 bool
8633 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8634 {
8635 struct aarch64_address_info addr;
8636
8637 /* PRFM accepts the same addresses as DImode... */
8638 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8639 if (!res)
8640 return false;
8641
8642 /* ... except writeback forms. */
8643 return addr.type != ADDRESS_REG_WB;
8644 }
8645
8646 bool
8647 aarch64_symbolic_address_p (rtx x)
8648 {
8649 rtx offset;
8650
8651 split_const (x, &x, &offset);
8652 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8653 }
8654
8655 /* Classify the base of symbolic expression X. */
8656
8657 enum aarch64_symbol_type
8658 aarch64_classify_symbolic_expression (rtx x)
8659 {
8660 rtx offset;
8661
8662 split_const (x, &x, &offset);
8663 return aarch64_classify_symbol (x, INTVAL (offset));
8664 }
8665
8666
8667 /* Return TRUE if X is a legitimate address for accessing memory in
8668 mode MODE. */
8669 static bool
8670 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8671 {
8672 struct aarch64_address_info addr;
8673
8674 return aarch64_classify_address (&addr, x, mode, strict_p);
8675 }
8676
8677 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8678 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8679 bool
8680 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8681 aarch64_addr_query_type type)
8682 {
8683 struct aarch64_address_info addr;
8684
8685 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8686 }
8687
8688 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8689
8690 static bool
8691 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8692 poly_int64 orig_offset,
8693 machine_mode mode)
8694 {
8695 HOST_WIDE_INT size;
8696 if (GET_MODE_SIZE (mode).is_constant (&size))
8697 {
8698 HOST_WIDE_INT const_offset, second_offset;
8699
8700 /* A general SVE offset is A * VQ + B. Remove the A component from
8701 coefficient 0 in order to get the constant B. */
8702 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8703
8704 /* Split an out-of-range address displacement into a base and
8705 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8706 range otherwise, to increase opportunities for sharing the base
8707 address between accesses of different sizes. Unaligned accesses
8708 use the signed 9-bit range; TImode/TFmode use the intersection of
8709 the signed scaled 7-bit and signed 9-bit offset ranges. */
8710 if (mode == TImode || mode == TFmode)
8711 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8712 else if ((const_offset & (size - 1)) != 0)
8713 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8714 else
8715 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8716
8717 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8718 return false;
8719
8720 /* Split the offset into second_offset and the rest. */
8721 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8722 *offset2 = gen_int_mode (second_offset, Pmode);
8723 return true;
8724 }
8725 else
8726 {
8727 /* Get the mode we should use as the basis of the range. For structure
8728 modes this is the mode of one vector. */
8729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8730 machine_mode step_mode
8731 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8732
8733 /* Get the "mul vl" multiplier we'd like to use. */
8734 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8735 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8736 if (vec_flags & VEC_SVE_DATA)
8737 /* LDR supports a 9-bit range, but the move patterns for
8738 structure modes require all vectors to be in range of the
8739 same base. The simplest way of accommodating that while still
8740 promoting reuse of anchor points between different modes is
8741 to use an 8-bit range unconditionally. */
8742 vnum = ((vnum + 128) & 255) - 128;
8743 else
8744 /* Predicates are only handled singly, so we might as well use
8745 the full range. */
8746 vnum = ((vnum + 256) & 511) - 256;
8747 if (vnum == 0)
8748 return false;
8749
8750 /* Convert the "mul vl" multiplier into a byte offset. */
8751 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8752 if (known_eq (second_offset, orig_offset))
8753 return false;
8754
8755 /* Split the offset into second_offset and the rest. */
8756 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8757 *offset2 = gen_int_mode (second_offset, Pmode);
8758 return true;
8759 }
8760 }
8761
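/* Editorial worked example for the constant-size path above: for a DImode
   access with orig_offset 0x12340, the offset is a multiple of the access
   size and the access is at least 4 bytes wide, so
     second_offset = 0x12340 & 0x3ffc = 0x2340
     *offset1      = 0x10000   (folded into the base, shareable via CSE)
     *offset2      = 0x2340    (fits the unsigned scaled 12-bit LDR/STR range)
   assuming the usual LP64 Pmode.  */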
8762 /* Return the binary representation of floating-point constant VALUE in
8763 INTVAL. If the value cannot be converted, return false without setting
8764 INTVAL. The conversion is done in the mode of VALUE. */
8765 bool
8766 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8767 {
8768
8769 /* We make a general exception for 0. */
8770 if (aarch64_float_const_zero_rtx_p (value))
8771 {
8772 *intval = 0;
8773 return true;
8774 }
8775
8776 scalar_float_mode mode;
8777 if (GET_CODE (value) != CONST_DOUBLE
8778 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8779 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8780 /* Only support up to DF mode. */
8781 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8782 return false;
8783
8784 unsigned HOST_WIDE_INT ival = 0;
8785
8786 long res[2];
8787 real_to_target (res,
8788 CONST_DOUBLE_REAL_VALUE (value),
8789 REAL_MODE_FORMAT (mode));
8790
8791 if (mode == DFmode)
8792 {
8793 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8794 ival = zext_hwi (res[order], 32);
8795 ival |= (zext_hwi (res[1 - order], 32) << 32);
8796 }
8797 else
8798 ival = zext_hwi (res[0], 32);
8799
8800 *intval = ival;
8801 return true;
8802 }
8803
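/* Editorial example: for the DFmode constant 1.0 the routine above returns
   the IEEE 754 bit pattern 0x3ff0000000000000, and for the SFmode constant
   1.0 it returns 0x3f800000.  The special case at the top means +0.0 always
   yields 0 without going through real_to_target.  */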
8804 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8805 single MOV(+MOVK) followed by an FMOV. */
8806 bool
8807 aarch64_float_const_rtx_p (rtx x)
8808 {
8809 machine_mode mode = GET_MODE (x);
8810 if (mode == VOIDmode)
8811 return false;
8812
8813 /* Determine whether it's cheaper to write float constants as
8814 mov/movk pairs rather than ldr/adrp pairs. */
8815 unsigned HOST_WIDE_INT ival;
8816
8817 if (GET_CODE (x) == CONST_DOUBLE
8818 && SCALAR_FLOAT_MODE_P (mode)
8819 && aarch64_reinterpret_float_as_int (x, &ival))
8820 {
8821 scalar_int_mode imode = (mode == HFmode
8822 ? SImode
8823 : int_mode_for_mode (mode).require ());
8824 int num_instr = aarch64_internal_mov_immediate
8825 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8826 return num_instr < 3;
8827 }
8828
8829 return false;
8830 }
8831
8832 /* Return TRUE if rtx X is the immediate constant 0.0. */
8833 bool
8834 aarch64_float_const_zero_rtx_p (rtx x)
8835 {
8836 if (GET_MODE (x) == VOIDmode)
8837 return false;
8838
8839 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8840 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8841 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8842 }
8843
8844 /* Return TRUE if rtx X is an immediate constant that fits in a single
8845 MOVI immediate operation. */
8846 bool
8847 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8848 {
8849 if (!TARGET_SIMD)
8850 return false;
8851
8852 machine_mode vmode;
8853 scalar_int_mode imode;
8854 unsigned HOST_WIDE_INT ival;
8855
8856 if (GET_CODE (x) == CONST_DOUBLE
8857 && SCALAR_FLOAT_MODE_P (mode))
8858 {
8859 if (!aarch64_reinterpret_float_as_int (x, &ival))
8860 return false;
8861
8862 /* We make a general exception for 0. */
8863 if (aarch64_float_const_zero_rtx_p (x))
8864 return true;
8865
8866 imode = int_mode_for_mode (mode).require ();
8867 }
8868 else if (GET_CODE (x) == CONST_INT
8869 && is_a <scalar_int_mode> (mode, &imode))
8870 ival = INTVAL (x);
8871 else
8872 return false;
8873
8874 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
8875 a 128-bit vector mode. */
8876 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8877
8878 vmode = aarch64_simd_container_mode (imode, width);
8879 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8880
8881 return aarch64_simd_valid_immediate (v_op, NULL);
8882 }
8883
8884
8885 /* Return the fixed registers used for condition codes. */
8886
8887 static bool
8888 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8889 {
8890 *p1 = CC_REGNUM;
8891 *p2 = INVALID_REGNUM;
8892 return true;
8893 }
8894
8895 /* This function is used by the call expanders of the machine description.
8896 RESULT is the register in which the result is returned. It's NULL for
8897 "call" and "sibcall".
8898 MEM is the location of the function call.
8899 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8900 SIBCALL indicates whether this is a normal call or a sibling call;
8901 a different pattern is generated accordingly. */
8902
8903 void
8904 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8905 {
8906 rtx call, callee, tmp;
8907 rtvec vec;
8908 machine_mode mode;
8909
8910 gcc_assert (MEM_P (mem));
8911 callee = XEXP (mem, 0);
8912 mode = GET_MODE (callee);
8913 gcc_assert (mode == Pmode);
8914
8915 /* Decide if we should generate indirect calls by loading the
8916 address of the callee into a register before performing
8917 the branch-and-link. */
8918 if (SYMBOL_REF_P (callee)
8919 ? (aarch64_is_long_call_p (callee)
8920 || aarch64_is_noplt_call_p (callee))
8921 : !REG_P (callee))
8922 XEXP (mem, 0) = force_reg (mode, callee);
8923
8924 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8925
8926 if (result != NULL_RTX)
8927 call = gen_rtx_SET (result, call);
8928
8929 if (sibcall)
8930 tmp = ret_rtx;
8931 else
8932 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8933
8934 gcc_assert (CONST_INT_P (callee_abi));
8935 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8936 UNSPEC_CALLEE_ABI);
8937
8938 vec = gen_rtvec (3, call, callee_abi, tmp);
8939 call = gen_rtx_PARALLEL (VOIDmode, vec);
8940
8941 aarch64_emit_call_insn (call);
8942 }
8943
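/* Editorial sketch (assuming LP64) of the RTL built above for a normal call
   that returns a value in x0, with the default PCS shown as (const_int 0):
     (parallel
       [(set (reg:DI x0)
             (call (mem (symbol_ref "callee")) (const_int 0)))
        (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
        (clobber (reg:DI x30))])
   A sibcall uses (return) in place of the LR clobber.  */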
8944 /* Emit call insn with PAT and do aarch64-specific handling. */
8945
8946 void
8947 aarch64_emit_call_insn (rtx pat)
8948 {
8949 rtx insn = emit_call_insn (pat);
8950
8951 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8952 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8953 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8954 }
8955
8956 machine_mode
8957 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8958 {
8959 machine_mode mode_x = GET_MODE (x);
8960 rtx_code code_x = GET_CODE (x);
8961
8962 /* All floating point compares return CCFP if it is an equality
8963 comparison, and CCFPE otherwise. */
8964 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8965 {
8966 switch (code)
8967 {
8968 case EQ:
8969 case NE:
8970 case UNORDERED:
8971 case ORDERED:
8972 case UNLT:
8973 case UNLE:
8974 case UNGT:
8975 case UNGE:
8976 case UNEQ:
8977 return CCFPmode;
8978
8979 case LT:
8980 case LE:
8981 case GT:
8982 case GE:
8983 case LTGT:
8984 return CCFPEmode;
8985
8986 default:
8987 gcc_unreachable ();
8988 }
8989 }
8990
8991 /* Equality comparisons of short modes against zero can be performed
8992 using the TST instruction with the appropriate bitmask. */
8993 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8994 && (code == EQ || code == NE)
8995 && (mode_x == HImode || mode_x == QImode))
8996 return CC_NZmode;
8997
8998 /* Similarly, comparisons of zero_extends from shorter modes can
8999 be performed using an ANDS with an immediate mask. */
9000 if (y == const0_rtx && code_x == ZERO_EXTEND
9001 && (mode_x == SImode || mode_x == DImode)
9002 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9003 && (code == EQ || code == NE))
9004 return CC_NZmode;
9005
9006 if ((mode_x == SImode || mode_x == DImode)
9007 && y == const0_rtx
9008 && (code == EQ || code == NE || code == LT || code == GE)
9009 && (code_x == PLUS || code_x == MINUS || code_x == AND
9010 || code_x == NEG
9011 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9012 && CONST_INT_P (XEXP (x, 2)))))
9013 return CC_NZmode;
9014
9015 /* A compare with a shifted operand. Because of canonicalization,
9016 the comparison will have to be swapped when we emit the assembly
9017 code. */
9018 if ((mode_x == SImode || mode_x == DImode)
9019 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9020 && (code_x == ASHIFT || code_x == ASHIFTRT
9021 || code_x == LSHIFTRT
9022 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9023 return CC_SWPmode;
9024
9025 /* Similarly for a negated operand, but we can only do this for
9026 equalities. */
9027 if ((mode_x == SImode || mode_x == DImode)
9028 && (REG_P (y) || GET_CODE (y) == SUBREG)
9029 && (code == EQ || code == NE)
9030 && code_x == NEG)
9031 return CC_Zmode;
9032
9033 /* A test for unsigned overflow from an addition. */
9034 if ((mode_x == DImode || mode_x == TImode)
9035 && (code == LTU || code == GEU)
9036 && code_x == PLUS
9037 && rtx_equal_p (XEXP (x, 0), y))
9038 return CC_Cmode;
9039
9040 /* A test for unsigned overflow from an add with carry. */
9041 if ((mode_x == DImode || mode_x == TImode)
9042 && (code == LTU || code == GEU)
9043 && code_x == PLUS
9044 && CONST_SCALAR_INT_P (y)
9045 && (rtx_mode_t (y, mode_x)
9046 == (wi::shwi (1, mode_x)
9047 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9048 return CC_ADCmode;
9049
9050 /* A test for signed overflow. */
9051 if ((mode_x == DImode || mode_x == TImode)
9052 && code == NE
9053 && code_x == PLUS
9054 && GET_CODE (y) == SIGN_EXTEND)
9055 return CC_Vmode;
9056
9057 /* For everything else, return CCmode. */
9058 return CCmode;
9059 }
9060
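/* Editorial example: comparing (plus:SI x y) against zero for EQ/NE selects
   CC_NZmode above, which lets the comparison be folded into the arithmetic
   instruction itself, e.g.
     adds    w0, w1, w2
     b.eq    .Ltarget
   rather than a separate add followed by cmp.  */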
9061 static int
9062 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9063
9064 int
9065 aarch64_get_condition_code (rtx x)
9066 {
9067 machine_mode mode = GET_MODE (XEXP (x, 0));
9068 enum rtx_code comp_code = GET_CODE (x);
9069
9070 if (GET_MODE_CLASS (mode) != MODE_CC)
9071 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9072 return aarch64_get_condition_code_1 (mode, comp_code);
9073 }
9074
9075 static int
9076 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9077 {
9078 switch (mode)
9079 {
9080 case E_CCFPmode:
9081 case E_CCFPEmode:
9082 switch (comp_code)
9083 {
9084 case GE: return AARCH64_GE;
9085 case GT: return AARCH64_GT;
9086 case LE: return AARCH64_LS;
9087 case LT: return AARCH64_MI;
9088 case NE: return AARCH64_NE;
9089 case EQ: return AARCH64_EQ;
9090 case ORDERED: return AARCH64_VC;
9091 case UNORDERED: return AARCH64_VS;
9092 case UNLT: return AARCH64_LT;
9093 case UNLE: return AARCH64_LE;
9094 case UNGT: return AARCH64_HI;
9095 case UNGE: return AARCH64_PL;
9096 default: return -1;
9097 }
9098 break;
9099
9100 case E_CCmode:
9101 switch (comp_code)
9102 {
9103 case NE: return AARCH64_NE;
9104 case EQ: return AARCH64_EQ;
9105 case GE: return AARCH64_GE;
9106 case GT: return AARCH64_GT;
9107 case LE: return AARCH64_LE;
9108 case LT: return AARCH64_LT;
9109 case GEU: return AARCH64_CS;
9110 case GTU: return AARCH64_HI;
9111 case LEU: return AARCH64_LS;
9112 case LTU: return AARCH64_CC;
9113 default: return -1;
9114 }
9115 break;
9116
9117 case E_CC_SWPmode:
9118 switch (comp_code)
9119 {
9120 case NE: return AARCH64_NE;
9121 case EQ: return AARCH64_EQ;
9122 case GE: return AARCH64_LE;
9123 case GT: return AARCH64_LT;
9124 case LE: return AARCH64_GE;
9125 case LT: return AARCH64_GT;
9126 case GEU: return AARCH64_LS;
9127 case GTU: return AARCH64_CC;
9128 case LEU: return AARCH64_CS;
9129 case LTU: return AARCH64_HI;
9130 default: return -1;
9131 }
9132 break;
9133
9134 case E_CC_NZCmode:
9135 switch (comp_code)
9136 {
9137 case NE: return AARCH64_NE; /* = any */
9138 case EQ: return AARCH64_EQ; /* = none */
9139 case GE: return AARCH64_PL; /* = nfrst */
9140 case LT: return AARCH64_MI; /* = first */
9141 case GEU: return AARCH64_CS; /* = nlast */
9142 case GTU: return AARCH64_HI; /* = pmore */
9143 case LEU: return AARCH64_LS; /* = plast */
9144 case LTU: return AARCH64_CC; /* = last */
9145 default: return -1;
9146 }
9147 break;
9148
9149 case E_CC_NZmode:
9150 switch (comp_code)
9151 {
9152 case NE: return AARCH64_NE;
9153 case EQ: return AARCH64_EQ;
9154 case GE: return AARCH64_PL;
9155 case LT: return AARCH64_MI;
9156 default: return -1;
9157 }
9158 break;
9159
9160 case E_CC_Zmode:
9161 switch (comp_code)
9162 {
9163 case NE: return AARCH64_NE;
9164 case EQ: return AARCH64_EQ;
9165 default: return -1;
9166 }
9167 break;
9168
9169 case E_CC_Cmode:
9170 switch (comp_code)
9171 {
9172 case LTU: return AARCH64_CS;
9173 case GEU: return AARCH64_CC;
9174 default: return -1;
9175 }
9176 break;
9177
9178 case E_CC_ADCmode:
9179 switch (comp_code)
9180 {
9181 case GEU: return AARCH64_CS;
9182 case LTU: return AARCH64_CC;
9183 default: return -1;
9184 }
9185 break;
9186
9187 case E_CC_Vmode:
9188 switch (comp_code)
9189 {
9190 case NE: return AARCH64_VS;
9191 case EQ: return AARCH64_VC;
9192 default: return -1;
9193 }
9194 break;
9195
9196 default:
9197 return -1;
9198 }
9199
9200 return -1;
9201 }
9202
9203 bool
9204 aarch64_const_vec_all_same_in_range_p (rtx x,
9205 HOST_WIDE_INT minval,
9206 HOST_WIDE_INT maxval)
9207 {
9208 rtx elt;
9209 return (const_vec_duplicate_p (x, &elt)
9210 && CONST_INT_P (elt)
9211 && IN_RANGE (INTVAL (elt), minval, maxval));
9212 }
9213
9214 bool
9215 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9216 {
9217 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9218 }
9219
9220 /* Return true if VEC is a constant in which every element is in the range
9221 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9222
9223 static bool
9224 aarch64_const_vec_all_in_range_p (rtx vec,
9225 HOST_WIDE_INT minval,
9226 HOST_WIDE_INT maxval)
9227 {
9228 if (GET_CODE (vec) != CONST_VECTOR
9229 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9230 return false;
9231
9232 int nunits;
9233 if (!CONST_VECTOR_STEPPED_P (vec))
9234 nunits = const_vector_encoded_nelts (vec);
9235 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9236 return false;
9237
9238 for (int i = 0; i < nunits; i++)
9239 {
9240 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9241 if (!CONST_INT_P (vec_elem)
9242 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9243 return false;
9244 }
9245 return true;
9246 }
9247
9248 /* N Z C V. */
9249 #define AARCH64_CC_V 1
9250 #define AARCH64_CC_C (1 << 1)
9251 #define AARCH64_CC_Z (1 << 2)
9252 #define AARCH64_CC_N (1 << 3)
9253
9254 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9255 static const int aarch64_nzcv_codes[] =
9256 {
9257 0, /* EQ, Z == 1. */
9258 AARCH64_CC_Z, /* NE, Z == 0. */
9259 0, /* CS, C == 1. */
9260 AARCH64_CC_C, /* CC, C == 0. */
9261 0, /* MI, N == 1. */
9262 AARCH64_CC_N, /* PL, N == 0. */
9263 0, /* VS, V == 1. */
9264 AARCH64_CC_V, /* VC, V == 0. */
9265 0, /* HI, C == 1 && Z == 0. */
9266 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9267 AARCH64_CC_V, /* GE, N == V. */
9268 0, /* LT, N != V. */
9269 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9270 0, /* LE, !(Z == 0 && N == V). */
9271 0, /* AL, Any. */
9272 0 /* NV, Any. */
9273 };
9274
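/* Editorial note: each entry is the NZCV immediate of a conditional compare,
   i.e. the flag state to substitute when the CCMP condition is false.  For
   example, when the final test is NE the table yields AARCH64_CC_Z (4), so a
   combined test such as (w0 == w1 && w2 != w3) can be emitted as
     cmp     w0, w1
     ccmp    w2, w3, #4, eq
     b.ne    .Lboth
   where a failing first comparison forces Z to 1 and so makes the final NE
   test false.  */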
9275 /* Print floating-point vector immediate operand X to F, negating it
9276 first if NEGATE is true. Return true on success, false if it isn't
9277 a constant we can handle. */
9278
9279 static bool
9280 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9281 {
9282 rtx elt;
9283
9284 if (!const_vec_duplicate_p (x, &elt))
9285 return false;
9286
9287 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9288 if (negate)
9289 r = real_value_negate (&r);
9290
9291 /* Handle the SVE single-bit immediates specially, since they have a
9292 fixed form in the assembly syntax. */
9293 if (real_equal (&r, &dconst0))
9294 asm_fprintf (f, "0.0");
9295 else if (real_equal (&r, &dconst2))
9296 asm_fprintf (f, "2.0");
9297 else if (real_equal (&r, &dconst1))
9298 asm_fprintf (f, "1.0");
9299 else if (real_equal (&r, &dconsthalf))
9300 asm_fprintf (f, "0.5");
9301 else
9302 {
9303 const int buf_size = 20;
9304 char float_buf[buf_size] = {'\0'};
9305 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9306 1, GET_MODE (elt));
9307 asm_fprintf (f, "%s", float_buf);
9308 }
9309
9310 return true;
9311 }
9312
9313 /* Return the assembly suffix letter (b/h/s/d) for an element of SIZE bits. */
9314 static char
9315 sizetochar (int size)
9316 {
9317 switch (size)
9318 {
9319 case 64: return 'd';
9320 case 32: return 's';
9321 case 16: return 'h';
9322 case 8 : return 'b';
9323 default: gcc_unreachable ();
9324 }
9325 }
9326
9327 /* Print operand X to file F in a target specific manner according to CODE.
9328 The acceptable formatting commands given by CODE are:
9329 'c': An integer or symbol address without a preceding #
9330 sign.
9331 'C': Take the duplicated element in a vector constant
9332 and print it in hex.
9333 'D': Take the duplicated element in a vector constant
9334 and print it as an unsigned integer, in decimal.
9335 'e': Print the sign/zero-extend size as a character 8->b,
9336 16->h, 32->w. Can also be used for masks:
9337 0xff->b, 0xffff->h, 0xffffffff->w.
9338 'I': If the operand is a duplicated vector constant,
9339 replace it with the duplicated scalar. If the
9340 operand is then a floating-point constant, replace
9341 it with the integer bit representation. Print the
9342 transformed constant as a signed decimal number.
9343 'p': Prints N such that 2^N == X (X must be power of 2 and
9344 const int).
9345 'P': Print the number of non-zero bits in X (a const_int).
9346 'H': Print the higher numbered register of a pair (TImode)
9347 of regs.
9348 'm': Print a condition (eq, ne, etc).
9349 'M': Same as 'm', but invert condition.
9350 'N': Take the duplicated element in a vector constant
9351 and print the negative of it in decimal.
9352 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9353 'S/T/U/V': Print a FP/SIMD register name for a register list.
9354 The register printed is the FP/SIMD register name
9355 of X + 0/1/2/3 for S/T/U/V.
9356 'R': Print a scalar Integer/FP/SIMD register name + 1.
9357 'X': Print bottom 16 bits of integer constant in hex.
9358 'w/x': Print a general register name or the zero register
9359 (32-bit or 64-bit).
9360 '0': Print a normal operand, if it's a general register,
9361 then we assume DImode.
9362 'k': Print NZCV for conditional compare instructions.
9363 'A': Output address constant representing the first
9364 argument of X, specifying a relocation offset
9365 if appropriate.
9366 'L': Output constant address specified by X
9367 with a relocation offset if appropriate.
9368 'G': Prints address of X, specifying a PC relative
9369 relocation mode if appropriate.
9370 'y': Output address of LDP or STP - this is used for
9371 some LDP/STPs which don't use a PARALLEL in their
9372 pattern (so the mode needs to be adjusted).
9373 'z': Output address of a typical LDP or STP. */
9374
9375 static void
9376 aarch64_print_operand (FILE *f, rtx x, int code)
9377 {
9378 rtx elt;
9379 switch (code)
9380 {
9381 case 'c':
9382 switch (GET_CODE (x))
9383 {
9384 case CONST_INT:
9385 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9386 break;
9387
9388 case SYMBOL_REF:
9389 output_addr_const (f, x);
9390 break;
9391
9392 case CONST:
9393 if (GET_CODE (XEXP (x, 0)) == PLUS
9394 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9395 {
9396 output_addr_const (f, x);
9397 break;
9398 }
9399 /* Fall through. */
9400
9401 default:
9402 output_operand_lossage ("unsupported operand for code '%c'", code);
9403 }
9404 break;
9405
9406 case 'e':
9407 {
9408 x = unwrap_const_vec_duplicate (x);
9409 if (!CONST_INT_P (x))
9410 {
9411 output_operand_lossage ("invalid operand for '%%%c'", code);
9412 return;
9413 }
9414
9415 HOST_WIDE_INT val = INTVAL (x);
9416 if ((val & ~7) == 8 || val == 0xff)
9417 fputc ('b', f);
9418 else if ((val & ~7) == 16 || val == 0xffff)
9419 fputc ('h', f);
9420 else if ((val & ~7) == 32 || val == 0xffffffff)
9421 fputc ('w', f);
9422 else
9423 {
9424 output_operand_lossage ("invalid operand for '%%%c'", code);
9425 return;
9426 }
9427 }
9428 break;
9429
9430 case 'p':
9431 {
9432 int n;
9433
9434 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9435 {
9436 output_operand_lossage ("invalid operand for '%%%c'", code);
9437 return;
9438 }
9439
9440 asm_fprintf (f, "%d", n);
9441 }
9442 break;
9443
9444 case 'P':
9445 if (!CONST_INT_P (x))
9446 {
9447 output_operand_lossage ("invalid operand for '%%%c'", code);
9448 return;
9449 }
9450
9451 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9452 break;
9453
9454 case 'H':
9455 if (x == const0_rtx)
9456 {
9457 asm_fprintf (f, "xzr");
9458 break;
9459 }
9460
9461 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9462 {
9463 output_operand_lossage ("invalid operand for '%%%c'", code);
9464 return;
9465 }
9466
9467 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9468 break;
9469
9470 case 'I':
9471 {
9472 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9473 if (CONST_INT_P (x))
9474 asm_fprintf (f, "%wd", INTVAL (x));
9475 else
9476 {
9477 output_operand_lossage ("invalid operand for '%%%c'", code);
9478 return;
9479 }
9480 break;
9481 }
9482
9483 case 'M':
9484 case 'm':
9485 {
9486 int cond_code;
9487 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9488 if (x == const_true_rtx)
9489 {
9490 if (code == 'M')
9491 fputs ("nv", f);
9492 return;
9493 }
9494
9495 if (!COMPARISON_P (x))
9496 {
9497 output_operand_lossage ("invalid operand for '%%%c'", code);
9498 return;
9499 }
9500
9501 cond_code = aarch64_get_condition_code (x);
9502 gcc_assert (cond_code >= 0);
9503 if (code == 'M')
9504 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9505 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9506 fputs (aarch64_sve_condition_codes[cond_code], f);
9507 else
9508 fputs (aarch64_condition_codes[cond_code], f);
9509 }
9510 break;
9511
9512 case 'N':
9513 if (!const_vec_duplicate_p (x, &elt))
9514 {
9515 output_operand_lossage ("invalid vector constant");
9516 return;
9517 }
9518
9519 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9520 asm_fprintf (f, "%wd", -INTVAL (elt));
9521 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9522 && aarch64_print_vector_float_operand (f, x, true))
9523 ;
9524 else
9525 {
9526 output_operand_lossage ("invalid vector constant");
9527 return;
9528 }
9529 break;
9530
9531 case 'b':
9532 case 'h':
9533 case 's':
9534 case 'd':
9535 case 'q':
9536 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9537 {
9538 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9539 return;
9540 }
9541 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9542 break;
9543
9544 case 'S':
9545 case 'T':
9546 case 'U':
9547 case 'V':
9548 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9549 {
9550 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9551 return;
9552 }
9553 asm_fprintf (f, "%c%d",
9554 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9555 REGNO (x) - V0_REGNUM + (code - 'S'));
9556 break;
9557
9558 case 'R':
9559 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9560 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9561 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9562 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9563 else
9564 output_operand_lossage ("incompatible register operand for '%%%c'",
9565 code);
9566 break;
9567
9568 case 'X':
9569 if (!CONST_INT_P (x))
9570 {
9571 output_operand_lossage ("invalid operand for '%%%c'", code);
9572 return;
9573 }
9574 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9575 break;
9576
9577 case 'C':
9578 {
9579 /* Print a replicated constant in hex. */
9580 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9581 {
9582 output_operand_lossage ("invalid operand for '%%%c'", code);
9583 return;
9584 }
9585 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9586 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9587 }
9588 break;
9589
9590 case 'D':
9591 {
9592 /* Print a replicated constant in decimal, treating it as
9593 unsigned. */
9594 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9595 {
9596 output_operand_lossage ("invalid operand for '%%%c'", code);
9597 return;
9598 }
9599 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9600 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9601 }
9602 break;
9603
9604 case 'w':
9605 case 'x':
9606 if (x == const0_rtx
9607 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9608 {
9609 asm_fprintf (f, "%czr", code);
9610 break;
9611 }
9612
9613 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9614 {
9615 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9616 break;
9617 }
9618
9619 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9620 {
9621 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9622 break;
9623 }
9624
9625 /* Fall through */
9626
9627 case 0:
9628 if (x == NULL)
9629 {
9630 output_operand_lossage ("missing operand");
9631 return;
9632 }
9633
9634 switch (GET_CODE (x))
9635 {
9636 case REG:
9637 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9638 {
9639 if (REG_NREGS (x) == 1)
9640 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9641 else
9642 {
9643 char suffix
9644 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9645 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9646 REGNO (x) - V0_REGNUM, suffix,
9647 END_REGNO (x) - V0_REGNUM - 1, suffix);
9648 }
9649 }
9650 else
9651 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9652 break;
9653
9654 case MEM:
9655 output_address (GET_MODE (x), XEXP (x, 0));
9656 break;
9657
9658 case LABEL_REF:
9659 case SYMBOL_REF:
9660 output_addr_const (asm_out_file, x);
9661 break;
9662
9663 case CONST_INT:
9664 asm_fprintf (f, "%wd", INTVAL (x));
9665 break;
9666
9667 case CONST:
9668 if (!VECTOR_MODE_P (GET_MODE (x)))
9669 {
9670 output_addr_const (asm_out_file, x);
9671 break;
9672 }
9673 /* fall through */
9674
9675 case CONST_VECTOR:
9676 if (!const_vec_duplicate_p (x, &elt))
9677 {
9678 output_operand_lossage ("invalid vector constant");
9679 return;
9680 }
9681
9682 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9683 asm_fprintf (f, "%wd", INTVAL (elt));
9684 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9685 && aarch64_print_vector_float_operand (f, x, false))
9686 ;
9687 else
9688 {
9689 output_operand_lossage ("invalid vector constant");
9690 return;
9691 }
9692 break;
9693
9694 case CONST_DOUBLE:
9695 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9696 be getting CONST_DOUBLEs holding integers. */
9697 gcc_assert (GET_MODE (x) != VOIDmode);
9698 if (aarch64_float_const_zero_rtx_p (x))
9699 {
9700 fputc ('0', f);
9701 break;
9702 }
9703 else if (aarch64_float_const_representable_p (x))
9704 {
9705 #define buf_size 20
9706 char float_buf[buf_size] = {'\0'};
9707 real_to_decimal_for_mode (float_buf,
9708 CONST_DOUBLE_REAL_VALUE (x),
9709 buf_size, buf_size,
9710 1, GET_MODE (x));
9711 asm_fprintf (asm_out_file, "%s", float_buf);
9712 break;
9713 #undef buf_size
9714 }
9715 output_operand_lossage ("invalid constant");
9716 return;
9717 default:
9718 output_operand_lossage ("invalid operand");
9719 return;
9720 }
9721 break;
9722
9723 case 'A':
9724 if (GET_CODE (x) == HIGH)
9725 x = XEXP (x, 0);
9726
9727 switch (aarch64_classify_symbolic_expression (x))
9728 {
9729 case SYMBOL_SMALL_GOT_4G:
9730 asm_fprintf (asm_out_file, ":got:");
9731 break;
9732
9733 case SYMBOL_SMALL_TLSGD:
9734 asm_fprintf (asm_out_file, ":tlsgd:");
9735 break;
9736
9737 case SYMBOL_SMALL_TLSDESC:
9738 asm_fprintf (asm_out_file, ":tlsdesc:");
9739 break;
9740
9741 case SYMBOL_SMALL_TLSIE:
9742 asm_fprintf (asm_out_file, ":gottprel:");
9743 break;
9744
9745 case SYMBOL_TLSLE24:
9746 asm_fprintf (asm_out_file, ":tprel:");
9747 break;
9748
9749 case SYMBOL_TINY_GOT:
9750 gcc_unreachable ();
9751 break;
9752
9753 default:
9754 break;
9755 }
9756 output_addr_const (asm_out_file, x);
9757 break;
9758
9759 case 'L':
9760 switch (aarch64_classify_symbolic_expression (x))
9761 {
9762 case SYMBOL_SMALL_GOT_4G:
9763 asm_fprintf (asm_out_file, ":lo12:");
9764 break;
9765
9766 case SYMBOL_SMALL_TLSGD:
9767 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9768 break;
9769
9770 case SYMBOL_SMALL_TLSDESC:
9771 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9772 break;
9773
9774 case SYMBOL_SMALL_TLSIE:
9775 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9776 break;
9777
9778 case SYMBOL_TLSLE12:
9779 asm_fprintf (asm_out_file, ":tprel_lo12:");
9780 break;
9781
9782 case SYMBOL_TLSLE24:
9783 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9784 break;
9785
9786 case SYMBOL_TINY_GOT:
9787 asm_fprintf (asm_out_file, ":got:");
9788 break;
9789
9790 case SYMBOL_TINY_TLSIE:
9791 asm_fprintf (asm_out_file, ":gottprel:");
9792 break;
9793
9794 default:
9795 break;
9796 }
9797 output_addr_const (asm_out_file, x);
9798 break;
9799
9800 case 'G':
9801 switch (aarch64_classify_symbolic_expression (x))
9802 {
9803 case SYMBOL_TLSLE24:
9804 asm_fprintf (asm_out_file, ":tprel_hi12:");
9805 break;
9806 default:
9807 break;
9808 }
9809 output_addr_const (asm_out_file, x);
9810 break;
9811
9812 case 'k':
9813 {
9814 HOST_WIDE_INT cond_code;
9815
9816 if (!CONST_INT_P (x))
9817 {
9818 output_operand_lossage ("invalid operand for '%%%c'", code);
9819 return;
9820 }
9821
9822 cond_code = INTVAL (x);
9823 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9824 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9825 }
9826 break;
9827
9828 case 'y':
9829 case 'z':
9830 {
9831 machine_mode mode = GET_MODE (x);
9832
9833 if (GET_CODE (x) != MEM
9834 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9835 {
9836 output_operand_lossage ("invalid operand for '%%%c'", code);
9837 return;
9838 }
9839
9840 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9841 code == 'y'
9842 ? ADDR_QUERY_LDP_STP_N
9843 : ADDR_QUERY_LDP_STP))
9844 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9845 }
9846 break;
9847
9848 default:
9849 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9850 return;
9851 }
9852 }
9853
9854 /* Print address 'x' of a memory access with mode 'mode'.
9855 'type' is the context required by aarch64_classify_address: ADDR_QUERY_ANY
9856 for a normal memory access, or one of the ADDR_QUERY_LDP_STP types for LDP/STP. */
9857 static bool
9858 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9859 aarch64_addr_query_type type)
9860 {
9861 struct aarch64_address_info addr;
9862 unsigned int size, vec_flags;
9863
9864 /* Check all addresses are Pmode - including ILP32. */
9865 if (GET_MODE (x) != Pmode
9866 && (!CONST_INT_P (x)
9867 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9868 {
9869 output_operand_lossage ("invalid address mode");
9870 return false;
9871 }
9872
9873 if (aarch64_classify_address (&addr, x, mode, true, type))
9874 switch (addr.type)
9875 {
9876 case ADDRESS_REG_IMM:
9877 if (known_eq (addr.const_offset, 0))
9878 {
9879 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9880 return true;
9881 }
9882
9883 vec_flags = aarch64_classify_vector_mode (mode);
9884 if (vec_flags & VEC_ANY_SVE)
9885 {
9886 HOST_WIDE_INT vnum
9887 = exact_div (addr.const_offset,
9888 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9889 asm_fprintf (f, "[%s, #%wd, mul vl]",
9890 reg_names[REGNO (addr.base)], vnum);
9891 return true;
9892 }
9893
9894 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9895 INTVAL (addr.offset));
9896 return true;
9897
9898 case ADDRESS_REG_REG:
9899 if (addr.shift == 0)
9900 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9901 reg_names [REGNO (addr.offset)]);
9902 else
9903 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9904 reg_names [REGNO (addr.offset)], addr.shift);
9905 return true;
9906
9907 case ADDRESS_REG_UXTW:
9908 if (addr.shift == 0)
9909 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9910 REGNO (addr.offset) - R0_REGNUM);
9911 else
9912 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9913 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9914 return true;
9915
9916 case ADDRESS_REG_SXTW:
9917 if (addr.shift == 0)
9918 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9919 REGNO (addr.offset) - R0_REGNUM);
9920 else
9921 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9922 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9923 return true;
9924
9925 case ADDRESS_REG_WB:
9926 /* Writeback is only supported for fixed-width modes. */
9927 size = GET_MODE_SIZE (mode).to_constant ();
9928 switch (GET_CODE (x))
9929 {
9930 case PRE_INC:
9931 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9932 return true;
9933 case POST_INC:
9934 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9935 return true;
9936 case PRE_DEC:
9937 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9938 return true;
9939 case POST_DEC:
9940 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9941 return true;
9942 case PRE_MODIFY:
9943 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9944 INTVAL (addr.offset));
9945 return true;
9946 case POST_MODIFY:
9947 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9948 INTVAL (addr.offset));
9949 return true;
9950 default:
9951 break;
9952 }
9953 break;
9954
9955 case ADDRESS_LO_SUM:
9956 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9957 output_addr_const (f, addr.offset);
9958 asm_fprintf (f, "]");
9959 return true;
9960
9961 case ADDRESS_SYMBOLIC:
9962 output_addr_const (f, x);
9963 return true;
9964 }
9965
9966 return false;
9967 }
9968
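/* Editorial examples of the address syntax emitted above, assuming a DImode
   access with base register x0 and index register x1:
     ADDRESS_REG_IMM            [x0]  or  [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_SXTW           [x0, w1, sxtw 3]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!
     ADDRESS_REG_WB (POST_INC)  [x0], 8
     ADDRESS_LO_SUM             [x0, #:lo12:sym]  */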
9969 /* Print address 'x' of a memory access with mode 'mode'. */
9970 static void
9971 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9972 {
9973 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9974 output_addr_const (f, x);
9975 }
9976
9977 bool
9978 aarch64_label_mentioned_p (rtx x)
9979 {
9980 const char *fmt;
9981 int i;
9982
9983 if (GET_CODE (x) == LABEL_REF)
9984 return true;
9985
9986 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9987 referencing instruction, but they are constant offsets, not
9988 symbols. */
9989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9990 return false;
9991
9992 fmt = GET_RTX_FORMAT (GET_CODE (x));
9993 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9994 {
9995 if (fmt[i] == 'E')
9996 {
9997 int j;
9998
9999 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10000 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10001 return 1;
10002 }
10003 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10004 return 1;
10005 }
10006
10007 return 0;
10008 }
10009
10010 /* Implement REGNO_REG_CLASS. */
10011
10012 enum reg_class
10013 aarch64_regno_regclass (unsigned regno)
10014 {
10015 if (GP_REGNUM_P (regno))
10016 return GENERAL_REGS;
10017
10018 if (regno == SP_REGNUM)
10019 return STACK_REG;
10020
10021 if (regno == FRAME_POINTER_REGNUM
10022 || regno == ARG_POINTER_REGNUM)
10023 return POINTER_REGS;
10024
10025 if (FP_REGNUM_P (regno))
10026 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10027 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10028
10029 if (PR_REGNUM_P (regno))
10030 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10031
10032 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10033 return FFR_REGS;
10034
10035 return NO_REGS;
10036 }
10037
10038 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10039 If OFFSET is out of range, return an offset of an anchor point
10040 that is in range. Return 0 otherwise. */
10041
10042 static HOST_WIDE_INT
10043 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10044 machine_mode mode)
10045 {
10046 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10047 if (size > 16)
10048 return (offset + 0x400) & ~0x7f0;
10049
10050 /* For offsets that aren't a multiple of the access size, the limit is
10051 -256...255. */
10052 if (offset & (size - 1))
10053 {
10054 /* BLKmode typically uses LDP of X-registers. */
10055 if (mode == BLKmode)
10056 return (offset + 512) & ~0x3ff;
10057 return (offset + 0x100) & ~0x1ff;
10058 }
10059
10060 /* Small negative offsets are supported. */
10061 if (IN_RANGE (offset, -256, 0))
10062 return 0;
10063
10064 if (mode == TImode || mode == TFmode)
10065 return (offset + 0x100) & ~0x1ff;
10066
10067 /* Use the unsigned 12-bit offset scaled by the access size. */
10068 return offset & (~0xfff * size);
10069 }
10070
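/* Editorial worked example: for a DImode access (size 8) at offset 0x12340,
   the offset is a multiple of the access size but outside the scaled 12-bit
   range, so the function returns
     0x12340 & (~0xfff * 8) = 0x10000
   as the anchor; the residual offset 0x2340 (= 1128 * 8) then fits a plain
   "ldr xN, [base, 0x2340]".  */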
10071 static rtx
10072 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10073 {
10074 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10075 where mask is selected by alignment and size of the offset.
10076 We try to pick as large a range for the offset as possible to
10077 maximize the chance of a CSE. However, for aligned addresses
10078 we limit the range to 4k so that structures with different sized
10079 elements are likely to use the same base. We need to be careful
10080 not to split a CONST for some forms of address expression, otherwise
10081 it will generate sub-optimal code. */
10082
10083 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10084 {
10085 rtx base = XEXP (x, 0);
10086 rtx offset_rtx = XEXP (x, 1);
10087 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10088
10089 if (GET_CODE (base) == PLUS)
10090 {
10091 rtx op0 = XEXP (base, 0);
10092 rtx op1 = XEXP (base, 1);
10093
10094 /* Force any scaling into a temp for CSE. */
10095 op0 = force_reg (Pmode, op0);
10096 op1 = force_reg (Pmode, op1);
10097
10098 /* Let the pointer register be in op0. */
10099 if (REG_POINTER (op1))
10100 std::swap (op0, op1);
10101
10102 /* If the pointer is virtual or frame related, then we know that
10103 virtual register instantiation or register elimination is going
10104 to apply a second constant. We want the two constants folded
10105 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10106 if (virt_or_elim_regno_p (REGNO (op0)))
10107 {
10108 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10109 NULL_RTX, true, OPTAB_DIRECT);
10110 return gen_rtx_PLUS (Pmode, base, op1);
10111 }
10112
10113 /* Otherwise, in order to encourage CSE (and thence loop strength
10114 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10115 base = expand_binop (Pmode, add_optab, op0, op1,
10116 NULL_RTX, true, OPTAB_DIRECT);
10117 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10118 }
10119
10120 HOST_WIDE_INT size;
10121 if (GET_MODE_SIZE (mode).is_constant (&size))
10122 {
10123 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10124 mode);
10125 if (base_offset != 0)
10126 {
10127 base = plus_constant (Pmode, base, base_offset);
10128 base = force_operand (base, NULL_RTX);
10129 return plus_constant (Pmode, base, offset - base_offset);
10130 }
10131 }
10132 }
10133
10134 return x;
10135 }
10136
10137 static reg_class_t
10138 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10139 reg_class_t rclass,
10140 machine_mode mode,
10141 secondary_reload_info *sri)
10142 {
10143 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10144 LDR and STR. See the comment at the head of aarch64-sve.md for
10145 more details about the big-endian handling. */
10146 if (reg_class_subset_p (rclass, FP_REGS)
10147 && !((REG_P (x) && HARD_REGISTER_P (x))
10148 || aarch64_simd_valid_immediate (x, NULL))
10149 && mode != VNx16QImode)
10150 {
10151 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10152 if ((vec_flags & VEC_SVE_DATA)
10153 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10154 {
10155 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10156 return NO_REGS;
10157 }
10158 }
10159
10160 /* If we have to disable direct literal pool loads and stores because the
10161 function is too big, then we need a scratch register. */
10162 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10163 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10164 || targetm.vector_mode_supported_p (GET_MODE (x)))
10165 && !aarch64_pcrelative_literal_loads)
10166 {
10167 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10168 return NO_REGS;
10169 }
10170
10171 /* Without the TARGET_SIMD instructions we cannot move a Q register
10172 to a Q register directly. We need a scratch. */
10173 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10174 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10175 && reg_class_subset_p (rclass, FP_REGS))
10176 {
10177 sri->icode = code_for_aarch64_reload_mov (mode);
10178 return NO_REGS;
10179 }
10180
10181 /* A TFmode or TImode memory access should be handled via FP_REGS,
10182 because AArch64 has richer addressing modes for LDR/STR instructions
10183 than for LDP/STP instructions. */
10184 if (TARGET_FLOAT && rclass == GENERAL_REGS
10185 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10186 return FP_REGS;
10187
10188 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10189 return GENERAL_REGS;
10190
10191 return NO_REGS;
10192 }
10193
10194 static bool
10195 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10196 {
10197 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10198
10199 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10200 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10201 if (frame_pointer_needed)
10202 return to == HARD_FRAME_POINTER_REGNUM;
10203 return true;
10204 }
10205
10206 poly_int64
10207 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10208 {
10209 if (to == HARD_FRAME_POINTER_REGNUM)
10210 {
10211 if (from == ARG_POINTER_REGNUM)
10212 return cfun->machine->frame.hard_fp_offset;
10213
10214 if (from == FRAME_POINTER_REGNUM)
10215 return cfun->machine->frame.hard_fp_offset
10216 - cfun->machine->frame.locals_offset;
10217 }
10218
10219 if (to == STACK_POINTER_REGNUM)
10220 {
10221 if (from == FRAME_POINTER_REGNUM)
10222 return cfun->machine->frame.frame_size
10223 - cfun->machine->frame.locals_offset;
10224 }
10225
10226 return cfun->machine->frame.frame_size;
10227 }
10228
10229 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10230 previous frame. */
10231
10232 rtx
10233 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10234 {
10235 if (count != 0)
10236 return const0_rtx;
10237 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10238 }
10239
10240
10241 static void
10242 aarch64_asm_trampoline_template (FILE *f)
10243 {
10244 int offset1 = 16;
10245 int offset2 = 20;
10246
10247 if (aarch64_bti_enabled ())
10248 {
10249 asm_fprintf (f, "\thint\t34 // bti c\n");
10250 offset1 -= 4;
10251 offset2 -= 4;
10252 }
10253
10254 if (TARGET_ILP32)
10255 {
10256 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10257 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10258 offset1);
10259 }
10260 else
10261 {
10262 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10263 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10264 offset2);
10265 }
10266 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10267
10268 /* The trampoline needs an extra padding instruction. If BTI is enabled,
10269 the padding instruction is replaced by the BTI instruction at
10270 the beginning. */
10271 if (!aarch64_bti_enabled ())
10272 assemble_aligned_integer (4, const0_rtx);
10273
10274 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10275 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10276 }
10277
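/* Editorial sketch of the non-ILP32, non-BTI trampoline emitted above:
   16 bytes of code followed by two 8-byte data slots that
   aarch64_trampoline_init fills in:
      0:  ldr   x17, .+16     // load target address from offset 16
      4:  ldr   x18, .+20     // load static chain from offset 24
      8:  br    x17
     12:  .word  0            // padding
     16:  .xword 0            // replaced by the function address
     24:  .xword 0            // replaced by the static chain value
   With BTI enabled, "hint 34" (bti c) replaces the padding at the start
   and both literal offsets shrink by 4.  */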
10278 static void
10279 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10280 {
10281 rtx fnaddr, mem, a_tramp;
10282 const int tramp_code_sz = 16;
10283
10284 /* Don't need to copy the trailing D-words, we fill those in below. */
10285 emit_block_move (m_tramp, assemble_trampoline_template (),
10286 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10287 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10288 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10289 if (GET_MODE (fnaddr) != ptr_mode)
10290 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10291 emit_move_insn (mem, fnaddr);
10292
10293 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10294 emit_move_insn (mem, chain_value);
10295
10296 /* XXX We should really define a "clear_cache" pattern and use
10297 gen_clear_cache(). */
10298 a_tramp = XEXP (m_tramp, 0);
10299 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10300 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10301 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10302 ptr_mode);
10303 }
10304
10305 static unsigned char
10306 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10307 {
10308 /* ??? Logically we should only need to provide a value when
10309 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10310 can hold MODE, but at the moment we need to handle all modes.
10311 Just ignore any runtime parts for registers that can't store them. */
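  /* For example, a 16-byte Advanced SIMD mode such as V4SImode needs a
     single FP/SIMD register (CEIL (16, UNITS_PER_VREG) == 1), whereas
     TImode held in GENERAL_REGS needs two X registers
     (CEIL (16, UNITS_PER_WORD) == 2).  */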
10312 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10313 unsigned int nregs, vec_flags;
10314 switch (regclass)
10315 {
10316 case TAILCALL_ADDR_REGS:
10317 case POINTER_REGS:
10318 case GENERAL_REGS:
10319 case ALL_REGS:
10320 case POINTER_AND_FP_REGS:
10321 case FP_REGS:
10322 case FP_LO_REGS:
10323 case FP_LO8_REGS:
10324 vec_flags = aarch64_classify_vector_mode (mode);
10325 if ((vec_flags & VEC_SVE_DATA)
10326 && constant_multiple_p (GET_MODE_SIZE (mode),
10327 aarch64_vl_bytes (mode, vec_flags), &nregs))
10328 return nregs;
10329 return (vec_flags & VEC_ADVSIMD
10330 ? CEIL (lowest_size, UNITS_PER_VREG)
10331 : CEIL (lowest_size, UNITS_PER_WORD));
10332 case STACK_REG:
10333 case PR_REGS:
10334 case PR_LO_REGS:
10335 case PR_HI_REGS:
10336 case FFR_REGS:
10337 case PR_AND_FFR_REGS:
10338 return 1;
10339
10340 case NO_REGS:
10341 return 0;
10342
10343 default:
10344 break;
10345 }
10346 gcc_unreachable ();
10347 }
10348
10349 static reg_class_t
10350 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10351 {
10352 if (regclass == POINTER_REGS)
10353 return GENERAL_REGS;
10354
10355 if (regclass == STACK_REG)
10356 {
10357 if (REG_P(x)
10358 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10359 return regclass;
10360
10361 return NO_REGS;
10362 }
10363
10364   /* Register elimination can result in a request for
10365      SP+constant->FP_REGS.  We cannot support such operations, which
10366      use SP as the source and an FP_REG as the destination, so reject
10367      them outright.  */
10368 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10369 {
10370 rtx lhs = XEXP (x, 0);
10371
10372 /* Look through a possible SUBREG introduced by ILP32. */
10373 if (GET_CODE (lhs) == SUBREG)
10374 lhs = SUBREG_REG (lhs);
10375
10376 gcc_assert (REG_P (lhs));
10377 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10378 POINTER_REGS));
10379 return NO_REGS;
10380 }
10381
10382 return regclass;
10383 }
10384
10385 void
10386 aarch64_asm_output_labelref (FILE* f, const char *name)
10387 {
10388 asm_fprintf (f, "%U%s", name);
10389 }
10390
10391 static void
10392 aarch64_elf_asm_constructor (rtx symbol, int priority)
10393 {
10394 if (priority == DEFAULT_INIT_PRIORITY)
10395 default_ctor_section_asm_out_constructor (symbol, priority);
10396 else
10397 {
10398 section *s;
10399       /* Although priority is known to be in the range [0, 65535], and so
10400          18 bytes would be enough, the compiler might not know that.  To
10401          avoid a -Wformat-truncation false positive, use a larger size.  */
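      /* E.g. a priority of 512 gives the section name ".init_array.00512".  */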
10402 char buf[23];
10403 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10404 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10405 switch_to_section (s);
10406 assemble_align (POINTER_SIZE);
10407 assemble_aligned_integer (POINTER_BYTES, symbol);
10408 }
10409 }
10410
10411 static void
10412 aarch64_elf_asm_destructor (rtx symbol, int priority)
10413 {
10414 if (priority == DEFAULT_INIT_PRIORITY)
10415 default_dtor_section_asm_out_destructor (symbol, priority);
10416 else
10417 {
10418 section *s;
10419       /* Although priority is known to be in the range [0, 65535], and so
10420          18 bytes would be enough, the compiler might not know that.  To
10421          avoid a -Wformat-truncation false positive, use a larger size.  */
10422 char buf[23];
10423 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10424 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10425 switch_to_section (s);
10426 assemble_align (POINTER_SIZE);
10427 assemble_aligned_integer (POINTER_BYTES, symbol);
10428 }
10429 }
10430
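/* Output the assembly for a compact switch-case dispatch.  Roughly:
   operand 0 (indexed by operand 1) addresses the jump table whose code
   label is operand 2, and operands 3 and 4 are scratch registers.  We
   load the (byte, halfword or word) table entry, form the target as the
   table address plus the sign-extended entry scaled by four, and branch
   to it.  */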
10431 const char*
10432 aarch64_output_casesi (rtx *operands)
10433 {
10434 char buf[100];
10435 char label[100];
10436 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10437 int index;
10438 static const char *const patterns[4][2] =
10439 {
10440 {
10441 "ldrb\t%w3, [%0,%w1,uxtw]",
10442 "add\t%3, %4, %w3, sxtb #2"
10443 },
10444 {
10445 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10446 "add\t%3, %4, %w3, sxth #2"
10447 },
10448 {
10449 "ldr\t%w3, [%0,%w1,uxtw #2]",
10450 "add\t%3, %4, %w3, sxtw #2"
10451 },
10452 /* We assume that DImode is only generated when not optimizing and
10453 that we don't really need 64-bit address offsets. That would
10454 imply an object file with 8GB of code in a single function! */
10455 {
10456 "ldr\t%w3, [%0,%w1,uxtw #2]",
10457 "add\t%3, %4, %w3, sxtw #2"
10458 }
10459 };
10460
10461 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10462
10463 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10464 index = exact_log2 (GET_MODE_SIZE (mode));
10465
10466 gcc_assert (index >= 0 && index <= 3);
10467
10468   /* Need to implement table size reduction, by changing the code below.  */
10469 output_asm_insn (patterns[index][0], operands);
10470 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10471 snprintf (buf, sizeof (buf),
10472 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10473 output_asm_insn (buf, operands);
10474 output_asm_insn (patterns[index][1], operands);
10475 output_asm_insn ("br\t%3", operands);
10476 assemble_label (asm_out_file, label);
10477 return "";
10478 }
10479
10480
10481 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10482 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10483 operator. */
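/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since 0x1fe is
   0xff shifted left by one (a UXTB-style operand); a mask/shift pair
   matching no 8-, 16- or 32-bit pattern returns 0.  */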
10484
10485 int
10486 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10487 {
10488 if (shift >= 0 && shift <= 3)
10489 {
10490 int size;
10491 for (size = 8; size <= 32; size *= 2)
10492 {
10493 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10494 if (mask == bits << shift)
10495 return size;
10496 }
10497 }
10498 return 0;
10499 }
10500
10501 /* Constant pools are per-function only when PC-relative literal
10502    loads are enabled or we are using the large memory
10503    model.  */
10504
10505 static inline bool
10506 aarch64_can_use_per_function_literal_pools_p (void)
10507 {
10508 return (aarch64_pcrelative_literal_loads
10509 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10510 }
10511
10512 static bool
10513 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10514 {
10515 /* We can't use blocks for constants when we're using a per-function
10516 constant pool. */
10517 return !aarch64_can_use_per_function_literal_pools_p ();
10518 }
10519
10520 /* Select appropriate section for constants depending
10521 on where we place literal pools. */
10522
10523 static section *
10524 aarch64_select_rtx_section (machine_mode mode,
10525 rtx x,
10526 unsigned HOST_WIDE_INT align)
10527 {
10528 if (aarch64_can_use_per_function_literal_pools_p ())
10529 return function_section (current_function_decl);
10530
10531 return default_elf_select_rtx_section (mode, x, align);
10532 }
10533
10534 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10535 void
10536 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10537 HOST_WIDE_INT offset)
10538 {
10539   /* When using per-function literal pools, we must ensure that any code
10540      section is aligned to the minimal instruction length, lest we get
10541      errors from the assembler about "unaligned instructions".  */
10542 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10543 ASM_OUTPUT_ALIGN (f, 2);
10544 }
10545
10546 /* Costs. */
10547
10548 /* Helper function for rtx cost calculation. Strip a shift expression
10549 from X. Returns the inner operand if successful, or the original
10550 expression on failure. */
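/* For example, both (ashift:DI (reg:DI x) (const_int 3)) and
   (mult:DI (reg:DI x) (const_int 8)) strip down to (reg:DI x), whereas a
   shift by a register amount is left unchanged.  */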
10551 static rtx
10552 aarch64_strip_shift (rtx x)
10553 {
10554 rtx op = x;
10555
10556 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10557 we can convert both to ROR during final output. */
10558 if ((GET_CODE (op) == ASHIFT
10559 || GET_CODE (op) == ASHIFTRT
10560 || GET_CODE (op) == LSHIFTRT
10561 || GET_CODE (op) == ROTATERT
10562 || GET_CODE (op) == ROTATE)
10563 && CONST_INT_P (XEXP (op, 1)))
10564 return XEXP (op, 0);
10565
10566 if (GET_CODE (op) == MULT
10567 && CONST_INT_P (XEXP (op, 1))
10568 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10569 return XEXP (op, 0);
10570
10571 return x;
10572 }
10573
10574 /* Helper function for rtx cost calculation. Strip an extend
10575 expression from X. Returns the inner operand if successful, or the
10576 original expression on failure. We deal with a number of possible
10577 canonicalization variations here. If STRIP_SHIFT is true, then
10578 we can strip off a shift also. */
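/* For example, (ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2))
   strips down to (reg:SI x) when STRIP_SHIFT is true, since the shift
   and the extension can both be folded into an extended-register
   operand.  */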
10579 static rtx
10580 aarch64_strip_extend (rtx x, bool strip_shift)
10581 {
10582 scalar_int_mode mode;
10583 rtx op = x;
10584
10585 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10586 return op;
10587
10588 /* Zero and sign extraction of a widened value. */
10589 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10590 && XEXP (op, 2) == const0_rtx
10591 && GET_CODE (XEXP (op, 0)) == MULT
10592 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10593 XEXP (op, 1)))
10594 return XEXP (XEXP (op, 0), 0);
10595
10596 /* It can also be represented (for zero-extend) as an AND with an
10597 immediate. */
10598 if (GET_CODE (op) == AND
10599 && GET_CODE (XEXP (op, 0)) == MULT
10600 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10601 && CONST_INT_P (XEXP (op, 1))
10602 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10603 INTVAL (XEXP (op, 1))) != 0)
10604 return XEXP (XEXP (op, 0), 0);
10605
10606 /* Now handle extended register, as this may also have an optional
10607 left shift by 1..4. */
10608 if (strip_shift
10609 && GET_CODE (op) == ASHIFT
10610 && CONST_INT_P (XEXP (op, 1))
10611 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10612 op = XEXP (op, 0);
10613
10614 if (GET_CODE (op) == ZERO_EXTEND
10615 || GET_CODE (op) == SIGN_EXTEND)
10616 op = XEXP (op, 0);
10617
10618 if (op != x)
10619 return op;
10620
10621 return x;
10622 }
10623
10624 /* Return true iff CODE is a shift supported in combination
10625 with arithmetic instructions. */
10626
10627 static bool
10628 aarch64_shift_p (enum rtx_code code)
10629 {
10630 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10631 }
10632
10633
10634 /* Return true iff X is a cheap shift without a sign extend. */
10635
10636 static bool
10637 aarch64_cheap_mult_shift_p (rtx x)
10638 {
10639 rtx op0, op1;
10640
10641 op0 = XEXP (x, 0);
10642 op1 = XEXP (x, 1);
10643
10644 if (!(aarch64_tune_params.extra_tuning_flags
10645 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10646 return false;
10647
10648 if (GET_CODE (op0) == SIGN_EXTEND)
10649 return false;
10650
10651 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10652 && UINTVAL (op1) <= 4)
10653 return true;
10654
10655 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10656 return false;
10657
10658 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10659
10660 if (l2 > 0 && l2 <= 4)
10661 return true;
10662
10663 return false;
10664 }
10665
10666 /* Helper function for rtx cost calculation. Calculate the cost of
10667 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10668    Return the calculated cost of the expression, recursing manually into
10669 operands where needed. */
10670
10671 static int
10672 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10673 {
10674 rtx op0, op1;
10675 const struct cpu_cost_table *extra_cost
10676 = aarch64_tune_params.insn_extra_cost;
10677 int cost = 0;
10678 bool compound_p = (outer == PLUS || outer == MINUS);
10679 machine_mode mode = GET_MODE (x);
10680
10681 gcc_checking_assert (code == MULT);
10682
10683 op0 = XEXP (x, 0);
10684 op1 = XEXP (x, 1);
10685
10686 if (VECTOR_MODE_P (mode))
10687 mode = GET_MODE_INNER (mode);
10688
10689 /* Integer multiply/fma. */
10690 if (GET_MODE_CLASS (mode) == MODE_INT)
10691 {
10692       /* The multiply will be canonicalized as a shift; cost it as such.  */
10693 if (aarch64_shift_p (GET_CODE (x))
10694 || (CONST_INT_P (op1)
10695 && exact_log2 (INTVAL (op1)) > 0))
10696 {
10697 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10698 || GET_CODE (op0) == SIGN_EXTEND;
10699 if (speed)
10700 {
10701 if (compound_p)
10702 {
10703 /* If the shift is considered cheap,
10704 then don't add any cost. */
10705 if (aarch64_cheap_mult_shift_p (x))
10706 ;
10707 else if (REG_P (op1))
10708 /* ARITH + shift-by-register. */
10709 cost += extra_cost->alu.arith_shift_reg;
10710 else if (is_extend)
10711 /* ARITH + extended register. We don't have a cost field
10712 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10713 cost += extra_cost->alu.extend_arith;
10714 else
10715 /* ARITH + shift-by-immediate. */
10716 cost += extra_cost->alu.arith_shift;
10717 }
10718 else
10719 /* LSL (immediate). */
10720 cost += extra_cost->alu.shift;
10721
10722 }
10723 /* Strip extends as we will have costed them in the case above. */
10724 if (is_extend)
10725 op0 = aarch64_strip_extend (op0, true);
10726
10727 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10728
10729 return cost;
10730 }
10731
10732         /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's
10733            a compound operation, then let the cases below handle it.  After
10734            all, MNEG is a special-case alias of MSUB.  */
10735 if (GET_CODE (op0) == NEG)
10736 {
10737 op0 = XEXP (op0, 0);
10738 compound_p = true;
10739 }
10740
10741 /* Integer multiplies or FMAs have zero/sign extending variants. */
10742 if ((GET_CODE (op0) == ZERO_EXTEND
10743 && GET_CODE (op1) == ZERO_EXTEND)
10744 || (GET_CODE (op0) == SIGN_EXTEND
10745 && GET_CODE (op1) == SIGN_EXTEND))
10746 {
10747 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10748 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10749
10750 if (speed)
10751 {
10752 if (compound_p)
10753 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10754 cost += extra_cost->mult[0].extend_add;
10755 else
10756 /* MUL/SMULL/UMULL. */
10757 cost += extra_cost->mult[0].extend;
10758 }
10759
10760 return cost;
10761 }
10762
10763 /* This is either an integer multiply or a MADD. In both cases
10764 we want to recurse and cost the operands. */
10765 cost += rtx_cost (op0, mode, MULT, 0, speed);
10766 cost += rtx_cost (op1, mode, MULT, 1, speed);
10767
10768 if (speed)
10769 {
10770 if (compound_p)
10771 /* MADD/MSUB. */
10772 cost += extra_cost->mult[mode == DImode].add;
10773 else
10774 /* MUL. */
10775 cost += extra_cost->mult[mode == DImode].simple;
10776 }
10777
10778 return cost;
10779 }
10780 else
10781 {
10782 if (speed)
10783 {
10784 /* Floating-point FMA/FMUL can also support negations of the
10785 operands, unless the rounding mode is upward or downward in
10786 which case FNMUL is different than FMUL with operand negation. */
10787 bool neg0 = GET_CODE (op0) == NEG;
10788 bool neg1 = GET_CODE (op1) == NEG;
10789 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10790 {
10791 if (neg0)
10792 op0 = XEXP (op0, 0);
10793 if (neg1)
10794 op1 = XEXP (op1, 0);
10795 }
10796
10797 if (compound_p)
10798 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10799 cost += extra_cost->fp[mode == DFmode].fma;
10800 else
10801 /* FMUL/FNMUL. */
10802 cost += extra_cost->fp[mode == DFmode].mult;
10803 }
10804
10805 cost += rtx_cost (op0, mode, MULT, 0, speed);
10806 cost += rtx_cost (op1, mode, MULT, 1, speed);
10807 return cost;
10808 }
10809 }
10810
10811 static int
10812 aarch64_address_cost (rtx x,
10813 machine_mode mode,
10814 addr_space_t as ATTRIBUTE_UNUSED,
10815 bool speed)
10816 {
10817 enum rtx_code c = GET_CODE (x);
10818 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10819 struct aarch64_address_info info;
10820 int cost = 0;
10821 info.shift = 0;
10822
10823 if (!aarch64_classify_address (&info, x, mode, false))
10824 {
10825 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10826 {
10827 /* This is a CONST or SYMBOL ref which will be split
10828 in a different way depending on the code model in use.
10829 Cost it through the generic infrastructure. */
10830 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10831 /* Divide through by the cost of one instruction to
10832 bring it to the same units as the address costs. */
10833 cost_symbol_ref /= COSTS_N_INSNS (1);
10834 /* The cost is then the cost of preparing the address,
10835 followed by an immediate (possibly 0) offset. */
10836 return cost_symbol_ref + addr_cost->imm_offset;
10837 }
10838 else
10839 {
10840 /* This is most likely a jump table from a case
10841 statement. */
10842 return addr_cost->register_offset;
10843 }
10844 }
10845
10846 switch (info.type)
10847 {
10848 case ADDRESS_LO_SUM:
10849 case ADDRESS_SYMBOLIC:
10850 case ADDRESS_REG_IMM:
10851 cost += addr_cost->imm_offset;
10852 break;
10853
10854 case ADDRESS_REG_WB:
10855 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10856 cost += addr_cost->pre_modify;
10857 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10858 cost += addr_cost->post_modify;
10859 else
10860 gcc_unreachable ();
10861
10862 break;
10863
10864 case ADDRESS_REG_REG:
10865 cost += addr_cost->register_offset;
10866 break;
10867
10868 case ADDRESS_REG_SXTW:
10869 cost += addr_cost->register_sextend;
10870 break;
10871
10872 case ADDRESS_REG_UXTW:
10873 cost += addr_cost->register_zextend;
10874 break;
10875
10876 default:
10877 gcc_unreachable ();
10878 }
10879
10880
10881 if (info.shift > 0)
10882 {
10883 /* For the sake of calculating the cost of the shifted register
10884 component, we can treat same sized modes in the same way. */
10885 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10886 cost += addr_cost->addr_scale_costs.hi;
10887 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10888 cost += addr_cost->addr_scale_costs.si;
10889 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10890 cost += addr_cost->addr_scale_costs.di;
10891 else
10892 /* We can't tell, or this is a 128-bit vector. */
10893 cost += addr_cost->addr_scale_costs.ti;
10894 }
10895
10896 return cost;
10897 }
10898
10899 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10900 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10901 to be taken. */
10902
10903 int
10904 aarch64_branch_cost (bool speed_p, bool predictable_p)
10905 {
10906 /* When optimizing for speed, use the cost of unpredictable branches. */
10907 const struct cpu_branch_cost *branch_costs =
10908 aarch64_tune_params.branch_costs;
10909
10910 if (!speed_p || predictable_p)
10911 return branch_costs->predictable;
10912 else
10913 return branch_costs->unpredictable;
10914 }
10915
10916 /* Return true if the RTX X in mode MODE is a zero or sign extract
10917 usable in an ADD or SUB (extended register) instruction. */
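/* For example, (sign_extend:DI (reg:SI w1)) matches the simple case below
   and corresponds to the "sxtw" operand of an instruction such as
   "add x0, x2, w1, sxtw".  */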
10918 static bool
10919 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10920 {
10921 /* Catch add with a sign extract.
10922 This is add_<optab><mode>_multp2. */
10923 if (GET_CODE (x) == SIGN_EXTRACT
10924 || GET_CODE (x) == ZERO_EXTRACT)
10925 {
10926 rtx op0 = XEXP (x, 0);
10927 rtx op1 = XEXP (x, 1);
10928 rtx op2 = XEXP (x, 2);
10929
10930 if (GET_CODE (op0) == MULT
10931 && CONST_INT_P (op1)
10932 && op2 == const0_rtx
10933 && CONST_INT_P (XEXP (op0, 1))
10934 && aarch64_is_extend_from_extract (mode,
10935 XEXP (op0, 1),
10936 op1))
10937 {
10938 return true;
10939 }
10940 }
10941 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10942 No shift. */
10943 else if (GET_CODE (x) == SIGN_EXTEND
10944 || GET_CODE (x) == ZERO_EXTEND)
10945 return REG_P (XEXP (x, 0));
10946
10947 return false;
10948 }
10949
10950 static bool
10951 aarch64_frint_unspec_p (unsigned int u)
10952 {
10953 switch (u)
10954 {
10955 case UNSPEC_FRINTZ:
10956 case UNSPEC_FRINTP:
10957 case UNSPEC_FRINTM:
10958 case UNSPEC_FRINTA:
10959 case UNSPEC_FRINTN:
10960 case UNSPEC_FRINTX:
10961 case UNSPEC_FRINTI:
10962 return true;
10963
10964 default:
10965 return false;
10966 }
10967 }
10968
10969 /* Return true iff X is an rtx that will match an extr instruction
10970 i.e. as described in the *extr<mode>5_insn family of patterns.
10971 OP0 and OP1 will be set to the operands of the shifts involved
10972 on success and will be NULL_RTX otherwise. */
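/* For example, in DImode
     (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16)))
   matches, because the two shift amounts sum to 64; it corresponds
   roughly to an EXTR with a shift of #16.  */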
10973
10974 static bool
10975 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10976 {
10977 rtx op0, op1;
10978 scalar_int_mode mode;
10979 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10980 return false;
10981
10982 *res_op0 = NULL_RTX;
10983 *res_op1 = NULL_RTX;
10984
10985 if (GET_CODE (x) != IOR)
10986 return false;
10987
10988 op0 = XEXP (x, 0);
10989 op1 = XEXP (x, 1);
10990
10991 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10992 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10993 {
10994 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10995 if (GET_CODE (op1) == ASHIFT)
10996 std::swap (op0, op1);
10997
10998 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10999 return false;
11000
11001 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11002 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11003
11004 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11005 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11006 {
11007 *res_op0 = XEXP (op0, 0);
11008 *res_op1 = XEXP (op1, 0);
11009 return true;
11010 }
11011 }
11012
11013 return false;
11014 }
11015
11016 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11017 storing it in *COST. Result is true if the total cost of the operation
11018 has now been calculated. */
11019 static bool
11020 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11021 {
11022 rtx inner;
11023 rtx comparator;
11024 enum rtx_code cmpcode;
11025
11026 if (COMPARISON_P (op0))
11027 {
11028 inner = XEXP (op0, 0);
11029 comparator = XEXP (op0, 1);
11030 cmpcode = GET_CODE (op0);
11031 }
11032 else
11033 {
11034 inner = op0;
11035 comparator = const0_rtx;
11036 cmpcode = NE;
11037 }
11038
11039 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11040 {
11041 /* Conditional branch. */
11042 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11043 return true;
11044 else
11045 {
11046 if (cmpcode == NE || cmpcode == EQ)
11047 {
11048 if (comparator == const0_rtx)
11049 {
11050 /* TBZ/TBNZ/CBZ/CBNZ. */
11051 if (GET_CODE (inner) == ZERO_EXTRACT)
11052 /* TBZ/TBNZ. */
11053 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11054 ZERO_EXTRACT, 0, speed);
11055 else
11056 /* CBZ/CBNZ. */
11057 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11058
11059 return true;
11060 }
11061 }
11062 else if (cmpcode == LT || cmpcode == GE)
11063 {
11064 /* TBZ/TBNZ. */
11065 if (comparator == const0_rtx)
11066 return true;
11067 }
11068 }
11069 }
11070 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11071 {
11072 /* CCMP. */
11073 if (GET_CODE (op1) == COMPARE)
11074 {
11075 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11076 if (XEXP (op1, 1) == const0_rtx)
11077 *cost += 1;
11078 if (speed)
11079 {
11080 machine_mode mode = GET_MODE (XEXP (op1, 0));
11081 const struct cpu_cost_table *extra_cost
11082 = aarch64_tune_params.insn_extra_cost;
11083
11084 if (GET_MODE_CLASS (mode) == MODE_INT)
11085 *cost += extra_cost->alu.arith;
11086 else
11087 *cost += extra_cost->fp[mode == DFmode].compare;
11088 }
11089 return true;
11090 }
11091
11092 /* It's a conditional operation based on the status flags,
11093 so it must be some flavor of CSEL. */
11094
11095 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11096 if (GET_CODE (op1) == NEG
11097 || GET_CODE (op1) == NOT
11098 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11099 op1 = XEXP (op1, 0);
11100 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11101 {
11102 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11103 op1 = XEXP (op1, 0);
11104 op2 = XEXP (op2, 0);
11105 }
11106
11107 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11108 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11109 return true;
11110 }
11111
11112   /* We don't know what this is; cost all operands.  */
11113 return false;
11114 }
11115
11116 /* Check whether X is a bitfield operation of the form shift + extend that
11117 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11118 operand to which the bitfield operation is applied. Otherwise return
11119 NULL_RTX. */
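/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   can be done with a single UBFX, so the inner (reg:HI x) is returned;
   likewise a sign_extend of an ashiftrt maps to SBFX, and an extend of
   an ashift to UBFIZ/SBFIZ.  */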
11120
11121 static rtx
11122 aarch64_extend_bitfield_pattern_p (rtx x)
11123 {
11124 rtx_code outer_code = GET_CODE (x);
11125 machine_mode outer_mode = GET_MODE (x);
11126
11127 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11128 && outer_mode != SImode && outer_mode != DImode)
11129 return NULL_RTX;
11130
11131 rtx inner = XEXP (x, 0);
11132 rtx_code inner_code = GET_CODE (inner);
11133 machine_mode inner_mode = GET_MODE (inner);
11134 rtx op = NULL_RTX;
11135
11136 switch (inner_code)
11137 {
11138 case ASHIFT:
11139 if (CONST_INT_P (XEXP (inner, 1))
11140 && (inner_mode == QImode || inner_mode == HImode))
11141 op = XEXP (inner, 0);
11142 break;
11143 case LSHIFTRT:
11144 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11145 && (inner_mode == QImode || inner_mode == HImode))
11146 op = XEXP (inner, 0);
11147 break;
11148 case ASHIFTRT:
11149 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11150 && (inner_mode == QImode || inner_mode == HImode))
11151 op = XEXP (inner, 0);
11152 break;
11153 default:
11154 break;
11155 }
11156
11157 return op;
11158 }
11159
11160 /* Return true if the mask and a shift amount from an RTX of the form
11161 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11162 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
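/* For example, with MODE == SImode, MASK == 0xf0 and SHFT_AMNT == 4 the
   check succeeds: (x << 4) & 0xf0 inserts the low four bits of x at bit
   position 4, which UBFIZ can do directly.  */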
11163
11164 bool
11165 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11166 rtx shft_amnt)
11167 {
11168 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11169 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11170 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11171 && (INTVAL (mask)
11172 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11173 }
11174
11175 /* Return true if the masks and a shift amount from an RTX of the form
11176 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11177    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
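/* For example, with MODE == DImode, SHFT_AMNT == 4 and MASK2 == 0xff0
   (so MASK1 == ~0xff0), the combination is accepted; it corresponds to
   inserting an 8-bit field of Y at bit position 4 of X with BFI.  */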
11178
11179 bool
11180 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11181 unsigned HOST_WIDE_INT mask1,
11182 unsigned HOST_WIDE_INT shft_amnt,
11183 unsigned HOST_WIDE_INT mask2)
11184 {
11185 unsigned HOST_WIDE_INT t;
11186
11187   /* Verify that the two masks are exact complements of each other.  */
11188 if (mask1 != ~mask2)
11189 return false;
11190
11191 /* Verify that mask2 is not all zeros or ones. */
11192 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11193 return false;
11194
11195 /* The shift amount should always be less than the mode size. */
11196 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11197
11198 /* Verify that the mask being shifted is contiguous and would be in the
11199 least significant bits after shifting by shft_amnt. */
11200 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11201 return (t == (t & -t));
11202 }
11203
11204 /* Calculate the cost of calculating X, storing it in *COST. Result
11205 is true if the total cost of the operation has now been calculated. */
11206 static bool
11207 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11208 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11209 {
11210 rtx op0, op1, op2;
11211 const struct cpu_cost_table *extra_cost
11212 = aarch64_tune_params.insn_extra_cost;
11213 int code = GET_CODE (x);
11214 scalar_int_mode int_mode;
11215
11216 /* By default, assume that everything has equivalent cost to the
11217 cheapest instruction. Any additional costs are applied as a delta
11218 above this default. */
11219 *cost = COSTS_N_INSNS (1);
11220
11221 switch (code)
11222 {
11223 case SET:
11224 /* The cost depends entirely on the operands to SET. */
11225 *cost = 0;
11226 op0 = SET_DEST (x);
11227 op1 = SET_SRC (x);
11228
11229 switch (GET_CODE (op0))
11230 {
11231 case MEM:
11232 if (speed)
11233 {
11234 rtx address = XEXP (op0, 0);
11235 if (VECTOR_MODE_P (mode))
11236 *cost += extra_cost->ldst.storev;
11237 else if (GET_MODE_CLASS (mode) == MODE_INT)
11238 *cost += extra_cost->ldst.store;
11239 else if (mode == SFmode)
11240 *cost += extra_cost->ldst.storef;
11241 else if (mode == DFmode)
11242 *cost += extra_cost->ldst.stored;
11243
11244 *cost +=
11245 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11246 0, speed));
11247 }
11248
11249 *cost += rtx_cost (op1, mode, SET, 1, speed);
11250 return true;
11251
11252 case SUBREG:
11253 if (! REG_P (SUBREG_REG (op0)))
11254 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11255
11256 /* Fall through. */
11257 case REG:
11258 /* The cost is one per vector-register copied. */
11259 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11260 {
11261 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11262 *cost = COSTS_N_INSNS (nregs);
11263 }
11264 /* const0_rtx is in general free, but we will use an
11265 instruction to set a register to 0. */
11266 else if (REG_P (op1) || op1 == const0_rtx)
11267 {
11268 /* The cost is 1 per register copied. */
11269 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11270 *cost = COSTS_N_INSNS (nregs);
11271 }
11272 else
11273 /* Cost is just the cost of the RHS of the set. */
11274 *cost += rtx_cost (op1, mode, SET, 1, speed);
11275 return true;
11276
11277 case ZERO_EXTRACT:
11278 case SIGN_EXTRACT:
11279 /* Bit-field insertion. Strip any redundant widening of
11280 the RHS to meet the width of the target. */
11281 if (GET_CODE (op1) == SUBREG)
11282 op1 = SUBREG_REG (op1);
11283 if ((GET_CODE (op1) == ZERO_EXTEND
11284 || GET_CODE (op1) == SIGN_EXTEND)
11285 && CONST_INT_P (XEXP (op0, 1))
11286 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11287 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11288 op1 = XEXP (op1, 0);
11289
11290 if (CONST_INT_P (op1))
11291 {
11292 /* MOV immediate is assumed to always be cheap. */
11293 *cost = COSTS_N_INSNS (1);
11294 }
11295 else
11296 {
11297 /* BFM. */
11298 if (speed)
11299 *cost += extra_cost->alu.bfi;
11300 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11301 }
11302
11303 return true;
11304
11305 default:
11306 /* We can't make sense of this, assume default cost. */
11307 *cost = COSTS_N_INSNS (1);
11308 return false;
11309 }
11310 return false;
11311
11312 case CONST_INT:
11313 /* If an instruction can incorporate a constant within the
11314 instruction, the instruction's expression avoids calling
11315 rtx_cost() on the constant. If rtx_cost() is called on a
11316 constant, then it is usually because the constant must be
11317 moved into a register by one or more instructions.
11318
11319 The exception is constant 0, which can be expressed
11320 as XZR/WZR and is therefore free. The exception to this is
11321 if we have (set (reg) (const0_rtx)) in which case we must cost
11322 the move. However, we can catch that when we cost the SET, so
11323 we don't need to consider that here. */
11324 if (x == const0_rtx)
11325 *cost = 0;
11326 else
11327 {
11328             /* To an approximation, the cost of building any other constant
11329                is proportional to the number of instructions required to
11330                build that constant.  This is true whether we are compiling
11331                for SPEED or otherwise.  */
11332 if (!is_a <scalar_int_mode> (mode, &int_mode))
11333 int_mode = word_mode;
11334 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11335 (NULL_RTX, x, false, int_mode));
11336 }
11337 return true;
11338
11339 case CONST_DOUBLE:
11340
11341       /* First determine the number of instructions needed to do the move
11342          as an integer constant.  */
11343 if (!aarch64_float_const_representable_p (x)
11344 && !aarch64_can_const_movi_rtx_p (x, mode)
11345 && aarch64_float_const_rtx_p (x))
11346 {
11347 unsigned HOST_WIDE_INT ival;
11348 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11349 gcc_assert (succeed);
11350
11351 scalar_int_mode imode = (mode == HFmode
11352 ? SImode
11353 : int_mode_for_mode (mode).require ());
11354 int ncost = aarch64_internal_mov_immediate
11355 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11356 *cost += COSTS_N_INSNS (ncost);
11357 return true;
11358 }
11359
11360 if (speed)
11361 {
11362 /* mov[df,sf]_aarch64. */
11363 if (aarch64_float_const_representable_p (x))
11364 /* FMOV (scalar immediate). */
11365 *cost += extra_cost->fp[mode == DFmode].fpconst;
11366 else if (!aarch64_float_const_zero_rtx_p (x))
11367 {
11368 /* This will be a load from memory. */
11369 if (mode == DFmode)
11370 *cost += extra_cost->ldst.loadd;
11371 else
11372 *cost += extra_cost->ldst.loadf;
11373 }
11374 else
11375             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
11376                or MOV v0.s[0], wzr - neither of which is modeled by the
11377                cost tables.  Just use the default cost.  */
11378 {
11379 }
11380 }
11381
11382 return true;
11383
11384 case MEM:
11385 if (speed)
11386 {
11387 /* For loads we want the base cost of a load, plus an
11388 approximation for the additional cost of the addressing
11389 mode. */
11390 rtx address = XEXP (x, 0);
11391 if (VECTOR_MODE_P (mode))
11392 *cost += extra_cost->ldst.loadv;
11393 else if (GET_MODE_CLASS (mode) == MODE_INT)
11394 *cost += extra_cost->ldst.load;
11395 else if (mode == SFmode)
11396 *cost += extra_cost->ldst.loadf;
11397 else if (mode == DFmode)
11398 *cost += extra_cost->ldst.loadd;
11399
11400 *cost +=
11401 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11402 0, speed));
11403 }
11404
11405 return true;
11406
11407 case NEG:
11408 op0 = XEXP (x, 0);
11409
11410 if (VECTOR_MODE_P (mode))
11411 {
11412 if (speed)
11413 {
11414 /* FNEG. */
11415 *cost += extra_cost->vect.alu;
11416 }
11417 return false;
11418 }
11419
11420 if (GET_MODE_CLASS (mode) == MODE_INT)
11421 {
11422 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11423 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11424 {
11425 /* CSETM. */
11426 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11427 return true;
11428 }
11429
11430 /* Cost this as SUB wzr, X. */
11431 op0 = CONST0_RTX (mode);
11432 op1 = XEXP (x, 0);
11433 goto cost_minus;
11434 }
11435
11436 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11437 {
11438 /* Support (neg(fma...)) as a single instruction only if
11439 sign of zeros is unimportant. This matches the decision
11440 making in aarch64.md. */
11441 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11442 {
11443 /* FNMADD. */
11444 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11445 return true;
11446 }
11447 if (GET_CODE (op0) == MULT)
11448 {
11449 /* FNMUL. */
11450 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11451 return true;
11452 }
11453 if (speed)
11454 /* FNEG. */
11455 *cost += extra_cost->fp[mode == DFmode].neg;
11456 return false;
11457 }
11458
11459 return false;
11460
11461 case CLRSB:
11462 case CLZ:
11463 if (speed)
11464 {
11465 if (VECTOR_MODE_P (mode))
11466 *cost += extra_cost->vect.alu;
11467 else
11468 *cost += extra_cost->alu.clz;
11469 }
11470
11471 return false;
11472
11473 case COMPARE:
11474 op0 = XEXP (x, 0);
11475 op1 = XEXP (x, 1);
11476
11477 if (op1 == const0_rtx
11478 && GET_CODE (op0) == AND)
11479 {
11480 x = op0;
11481 mode = GET_MODE (op0);
11482 goto cost_logic;
11483 }
11484
11485 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11486 {
11487 /* TODO: A write to the CC flags possibly costs extra, this
11488 needs encoding in the cost tables. */
11489
11490 mode = GET_MODE (op0);
11491 /* ANDS. */
11492 if (GET_CODE (op0) == AND)
11493 {
11494 x = op0;
11495 goto cost_logic;
11496 }
11497
11498 if (GET_CODE (op0) == PLUS)
11499 {
11500 /* ADDS (and CMN alias). */
11501 x = op0;
11502 goto cost_plus;
11503 }
11504
11505 if (GET_CODE (op0) == MINUS)
11506 {
11507 /* SUBS. */
11508 x = op0;
11509 goto cost_minus;
11510 }
11511
11512 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11513 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11514 && CONST_INT_P (XEXP (op0, 2)))
11515 {
11516 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11517 Handle it here directly rather than going to cost_logic
11518 since we know the immediate generated for the TST is valid
11519 so we can avoid creating an intermediate rtx for it only
11520 for costing purposes. */
11521 if (speed)
11522 *cost += extra_cost->alu.logical;
11523
11524 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11525 ZERO_EXTRACT, 0, speed);
11526 return true;
11527 }
11528
11529 if (GET_CODE (op1) == NEG)
11530 {
11531 /* CMN. */
11532 if (speed)
11533 *cost += extra_cost->alu.arith;
11534
11535 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11536 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11537 return true;
11538 }
11539
11540 /* CMP.
11541
11542 Compare can freely swap the order of operands, and
11543 canonicalization puts the more complex operation first.
11544 But the integer MINUS logic expects the shift/extend
11545 operation in op1. */
11546 if (! (REG_P (op0)
11547 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11548 {
11549 op0 = XEXP (x, 1);
11550 op1 = XEXP (x, 0);
11551 }
11552 goto cost_minus;
11553 }
11554
11555 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11556 {
11557 /* FCMP. */
11558 if (speed)
11559 *cost += extra_cost->fp[mode == DFmode].compare;
11560
11561 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11562 {
11563 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11564 /* FCMP supports constant 0.0 for no extra cost. */
11565 return true;
11566 }
11567 return false;
11568 }
11569
11570 if (VECTOR_MODE_P (mode))
11571 {
11572 /* Vector compare. */
11573 if (speed)
11574 *cost += extra_cost->vect.alu;
11575
11576 if (aarch64_float_const_zero_rtx_p (op1))
11577 {
11578 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11579 cost. */
11580 return true;
11581 }
11582 return false;
11583 }
11584 return false;
11585
11586 case MINUS:
11587 {
11588 op0 = XEXP (x, 0);
11589 op1 = XEXP (x, 1);
11590
11591 cost_minus:
11592 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11593
11594 /* Detect valid immediates. */
11595 if ((GET_MODE_CLASS (mode) == MODE_INT
11596 || (GET_MODE_CLASS (mode) == MODE_CC
11597 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11598 && CONST_INT_P (op1)
11599 && aarch64_uimm12_shift (INTVAL (op1)))
11600 {
11601 if (speed)
11602 /* SUB(S) (immediate). */
11603 *cost += extra_cost->alu.arith;
11604 return true;
11605 }
11606
11607 /* Look for SUB (extended register). */
11608 if (is_a <scalar_int_mode> (mode, &int_mode)
11609 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11610 {
11611 if (speed)
11612 *cost += extra_cost->alu.extend_arith;
11613
11614 op1 = aarch64_strip_extend (op1, true);
11615 *cost += rtx_cost (op1, VOIDmode,
11616 (enum rtx_code) GET_CODE (op1), 0, speed);
11617 return true;
11618 }
11619
11620 rtx new_op1 = aarch64_strip_extend (op1, false);
11621
11622 /* Cost this as an FMA-alike operation. */
11623 if ((GET_CODE (new_op1) == MULT
11624 || aarch64_shift_p (GET_CODE (new_op1)))
11625 && code != COMPARE)
11626 {
11627 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11628 (enum rtx_code) code,
11629 speed);
11630 return true;
11631 }
11632
11633 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11634
11635 if (speed)
11636 {
11637 if (VECTOR_MODE_P (mode))
11638 {
11639 /* Vector SUB. */
11640 *cost += extra_cost->vect.alu;
11641 }
11642 else if (GET_MODE_CLASS (mode) == MODE_INT)
11643 {
11644 /* SUB(S). */
11645 *cost += extra_cost->alu.arith;
11646 }
11647 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11648 {
11649 /* FSUB. */
11650 *cost += extra_cost->fp[mode == DFmode].addsub;
11651 }
11652 }
11653 return true;
11654 }
11655
11656 case PLUS:
11657 {
11658 rtx new_op0;
11659
11660 op0 = XEXP (x, 0);
11661 op1 = XEXP (x, 1);
11662
11663 cost_plus:
11664 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11665 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11666 {
11667 /* CSINC. */
11668 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11669 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11670 return true;
11671 }
11672
11673 if (GET_MODE_CLASS (mode) == MODE_INT
11674 && (aarch64_plus_immediate (op1, mode)
11675 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11676 {
11677 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11678
11679 if (speed)
11680 /* ADD (immediate). */
11681 *cost += extra_cost->alu.arith;
11682 return true;
11683 }
11684
11685 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11686
11687 /* Look for ADD (extended register). */
11688 if (is_a <scalar_int_mode> (mode, &int_mode)
11689 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11690 {
11691 if (speed)
11692 *cost += extra_cost->alu.extend_arith;
11693
11694 op0 = aarch64_strip_extend (op0, true);
11695 *cost += rtx_cost (op0, VOIDmode,
11696 (enum rtx_code) GET_CODE (op0), 0, speed);
11697 return true;
11698 }
11699
11700 /* Strip any extend, leave shifts behind as we will
11701 cost them through mult_cost. */
11702 new_op0 = aarch64_strip_extend (op0, false);
11703
11704 if (GET_CODE (new_op0) == MULT
11705 || aarch64_shift_p (GET_CODE (new_op0)))
11706 {
11707 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11708 speed);
11709 return true;
11710 }
11711
11712 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11713
11714 if (speed)
11715 {
11716 if (VECTOR_MODE_P (mode))
11717 {
11718 /* Vector ADD. */
11719 *cost += extra_cost->vect.alu;
11720 }
11721 else if (GET_MODE_CLASS (mode) == MODE_INT)
11722 {
11723 /* ADD. */
11724 *cost += extra_cost->alu.arith;
11725 }
11726 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11727 {
11728 /* FADD. */
11729 *cost += extra_cost->fp[mode == DFmode].addsub;
11730 }
11731 }
11732 return true;
11733 }
11734
11735 case BSWAP:
11736 *cost = COSTS_N_INSNS (1);
11737
11738 if (speed)
11739 {
11740 if (VECTOR_MODE_P (mode))
11741 *cost += extra_cost->vect.alu;
11742 else
11743 *cost += extra_cost->alu.rev;
11744 }
11745 return false;
11746
11747 case IOR:
11748 if (aarch_rev16_p (x))
11749 {
11750 *cost = COSTS_N_INSNS (1);
11751
11752 if (speed)
11753 {
11754 if (VECTOR_MODE_P (mode))
11755 *cost += extra_cost->vect.alu;
11756 else
11757 *cost += extra_cost->alu.rev;
11758 }
11759 return true;
11760 }
11761
11762 if (aarch64_extr_rtx_p (x, &op0, &op1))
11763 {
11764 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11765 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11766 if (speed)
11767 *cost += extra_cost->alu.shift;
11768
11769 return true;
11770 }
11771 /* Fall through. */
11772 case XOR:
11773 case AND:
11774 cost_logic:
11775 op0 = XEXP (x, 0);
11776 op1 = XEXP (x, 1);
11777
11778 if (VECTOR_MODE_P (mode))
11779 {
11780 if (speed)
11781 *cost += extra_cost->vect.alu;
11782 return true;
11783 }
11784
11785 if (code == AND
11786 && GET_CODE (op0) == MULT
11787 && CONST_INT_P (XEXP (op0, 1))
11788 && CONST_INT_P (op1)
11789 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11790 INTVAL (op1)) != 0)
11791 {
11792 /* This is a UBFM/SBFM. */
11793 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11794 if (speed)
11795 *cost += extra_cost->alu.bfx;
11796 return true;
11797 }
11798
11799 if (is_int_mode (mode, &int_mode))
11800 {
11801 if (CONST_INT_P (op1))
11802 {
11803 /* We have a mask + shift version of a UBFIZ
11804 i.e. the *andim_ashift<mode>_bfiz pattern. */
11805 if (GET_CODE (op0) == ASHIFT
11806 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11807 XEXP (op0, 1)))
11808 {
11809 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11810 (enum rtx_code) code, 0, speed);
11811 if (speed)
11812 *cost += extra_cost->alu.bfx;
11813
11814 return true;
11815 }
11816 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11817 {
11818               /* We may get the immediate for free; this is not
11819                  modelled.  */
11820 *cost += rtx_cost (op0, int_mode,
11821 (enum rtx_code) code, 0, speed);
11822 if (speed)
11823 *cost += extra_cost->alu.logical;
11824
11825 return true;
11826 }
11827 }
11828 else
11829 {
11830 rtx new_op0 = op0;
11831
11832 /* Handle ORN, EON, or BIC. */
11833 if (GET_CODE (op0) == NOT)
11834 op0 = XEXP (op0, 0);
11835
11836 new_op0 = aarch64_strip_shift (op0);
11837
11838 /* If we had a shift on op0 then this is a logical-shift-
11839 by-register/immediate operation. Otherwise, this is just
11840 a logical operation. */
11841 if (speed)
11842 {
11843 if (new_op0 != op0)
11844 {
11845 /* Shift by immediate. */
11846 if (CONST_INT_P (XEXP (op0, 1)))
11847 *cost += extra_cost->alu.log_shift;
11848 else
11849 *cost += extra_cost->alu.log_shift_reg;
11850 }
11851 else
11852 *cost += extra_cost->alu.logical;
11853 }
11854
11855 /* In both cases we want to cost both operands. */
11856 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11857 0, speed);
11858 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11859 1, speed);
11860
11861 return true;
11862 }
11863 }
11864 return false;
11865
11866 case NOT:
11867 x = XEXP (x, 0);
11868 op0 = aarch64_strip_shift (x);
11869
11870 if (VECTOR_MODE_P (mode))
11871 {
11872 /* Vector NOT. */
11873 *cost += extra_cost->vect.alu;
11874 return false;
11875 }
11876
11877 /* MVN-shifted-reg. */
11878 if (op0 != x)
11879 {
11880 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11881
11882 if (speed)
11883 *cost += extra_cost->alu.log_shift;
11884
11885 return true;
11886 }
11887 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11888 Handle the second form here taking care that 'a' in the above can
11889 be a shift. */
11890 else if (GET_CODE (op0) == XOR)
11891 {
11892 rtx newop0 = XEXP (op0, 0);
11893 rtx newop1 = XEXP (op0, 1);
11894 rtx op0_stripped = aarch64_strip_shift (newop0);
11895
11896 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11897 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11898
11899 if (speed)
11900 {
11901 if (op0_stripped != newop0)
11902 *cost += extra_cost->alu.log_shift;
11903 else
11904 *cost += extra_cost->alu.logical;
11905 }
11906
11907 return true;
11908 }
11909 /* MVN. */
11910 if (speed)
11911 *cost += extra_cost->alu.logical;
11912
11913 return false;
11914
11915 case ZERO_EXTEND:
11916
11917 op0 = XEXP (x, 0);
11918 /* If a value is written in SI mode, then zero extended to DI
11919 mode, the operation will in general be free as a write to
11920 a 'w' register implicitly zeroes the upper bits of an 'x'
11921 register. However, if this is
11922
11923 (set (reg) (zero_extend (reg)))
11924
11925 we must cost the explicit register move. */
11926 if (mode == DImode
11927 && GET_MODE (op0) == SImode
11928 && outer == SET)
11929 {
11930 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11931
11932 /* If OP_COST is non-zero, then the cost of the zero extend
11933 is effectively the cost of the inner operation. Otherwise
11934 we have a MOV instruction and we take the cost from the MOV
11935 itself. This is true independently of whether we are
11936 optimizing for space or time. */
11937 if (op_cost)
11938 *cost = op_cost;
11939
11940 return true;
11941 }
11942 else if (MEM_P (op0))
11943 {
11944 /* All loads can zero extend to any size for free. */
11945 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11946 return true;
11947 }
11948
11949 op0 = aarch64_extend_bitfield_pattern_p (x);
11950 if (op0)
11951 {
11952 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11953 if (speed)
11954 *cost += extra_cost->alu.bfx;
11955 return true;
11956 }
11957
11958 if (speed)
11959 {
11960 if (VECTOR_MODE_P (mode))
11961 {
11962 /* UMOV. */
11963 *cost += extra_cost->vect.alu;
11964 }
11965 else
11966 {
11967 /* We generate an AND instead of UXTB/UXTH. */
11968 *cost += extra_cost->alu.logical;
11969 }
11970 }
11971 return false;
11972
11973 case SIGN_EXTEND:
11974 if (MEM_P (XEXP (x, 0)))
11975 {
11976 /* LDRSH. */
11977 if (speed)
11978 {
11979 rtx address = XEXP (XEXP (x, 0), 0);
11980 *cost += extra_cost->ldst.load_sign_extend;
11981
11982 *cost +=
11983 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11984 0, speed));
11985 }
11986 return true;
11987 }
11988
11989 op0 = aarch64_extend_bitfield_pattern_p (x);
11990 if (op0)
11991 {
11992 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11993 if (speed)
11994 *cost += extra_cost->alu.bfx;
11995 return true;
11996 }
11997
11998 if (speed)
11999 {
12000 if (VECTOR_MODE_P (mode))
12001 *cost += extra_cost->vect.alu;
12002 else
12003 *cost += extra_cost->alu.extend;
12004 }
12005 return false;
12006
12007 case ASHIFT:
12008 op0 = XEXP (x, 0);
12009 op1 = XEXP (x, 1);
12010
12011 if (CONST_INT_P (op1))
12012 {
12013 if (speed)
12014 {
12015 if (VECTOR_MODE_P (mode))
12016 {
12017 /* Vector shift (immediate). */
12018 *cost += extra_cost->vect.alu;
12019 }
12020 else
12021 {
12022                 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
12023                    aliases.  */
12024 *cost += extra_cost->alu.shift;
12025 }
12026 }
12027
12028 /* We can incorporate zero/sign extend for free. */
12029 if (GET_CODE (op0) == ZERO_EXTEND
12030 || GET_CODE (op0) == SIGN_EXTEND)
12031 op0 = XEXP (op0, 0);
12032
12033 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12034 return true;
12035 }
12036 else
12037 {
12038 if (VECTOR_MODE_P (mode))
12039 {
12040 if (speed)
12041 /* Vector shift (register). */
12042 *cost += extra_cost->vect.alu;
12043 }
12044 else
12045 {
12046 if (speed)
12047 /* LSLV. */
12048 *cost += extra_cost->alu.shift_reg;
12049
12050 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12051 && CONST_INT_P (XEXP (op1, 1))
12052 && known_eq (INTVAL (XEXP (op1, 1)),
12053 GET_MODE_BITSIZE (mode) - 1))
12054 {
12055 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12056 /* We already demanded XEXP (op1, 0) to be REG_P, so
12057 don't recurse into it. */
12058 return true;
12059 }
12060 }
12061 return false; /* All arguments need to be in registers. */
12062 }
12063
12064 case ROTATE:
12065 case ROTATERT:
12066 case LSHIFTRT:
12067 case ASHIFTRT:
12068 op0 = XEXP (x, 0);
12069 op1 = XEXP (x, 1);
12070
12071 if (CONST_INT_P (op1))
12072 {
12073 /* ASR (immediate) and friends. */
12074 if (speed)
12075 {
12076 if (VECTOR_MODE_P (mode))
12077 *cost += extra_cost->vect.alu;
12078 else
12079 *cost += extra_cost->alu.shift;
12080 }
12081
12082 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12083 return true;
12084 }
12085 else
12086 {
12087 if (VECTOR_MODE_P (mode))
12088 {
12089 if (speed)
12090 /* Vector shift (register). */
12091 *cost += extra_cost->vect.alu;
12092 }
12093 else
12094 {
12095 if (speed)
12096 /* ASR (register) and friends. */
12097 *cost += extra_cost->alu.shift_reg;
12098
12099 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12100 && CONST_INT_P (XEXP (op1, 1))
12101 && known_eq (INTVAL (XEXP (op1, 1)),
12102 GET_MODE_BITSIZE (mode) - 1))
12103 {
12104 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12105 /* We already demanded XEXP (op1, 0) to be REG_P, so
12106 don't recurse into it. */
12107 return true;
12108 }
12109 }
12110 return false; /* All arguments need to be in registers. */
12111 }
12112
12113 case SYMBOL_REF:
12114
12115 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12116 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12117 {
12118 /* LDR. */
12119 if (speed)
12120 *cost += extra_cost->ldst.load;
12121 }
12122 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12123 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12124 {
12125 /* ADRP, followed by ADD. */
12126 *cost += COSTS_N_INSNS (1);
12127 if (speed)
12128 *cost += 2 * extra_cost->alu.arith;
12129 }
12130 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12131 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12132 {
12133 /* ADR. */
12134 if (speed)
12135 *cost += extra_cost->alu.arith;
12136 }
12137
12138 if (flag_pic)
12139 {
12140 /* One extra load instruction, after accessing the GOT. */
12141 *cost += COSTS_N_INSNS (1);
12142 if (speed)
12143 *cost += extra_cost->ldst.load;
12144 }
12145 return true;
12146
12147 case HIGH:
12148 case LO_SUM:
12149 /* ADRP/ADD (immediate). */
12150 if (speed)
12151 *cost += extra_cost->alu.arith;
12152 return true;
12153
12154 case ZERO_EXTRACT:
12155 case SIGN_EXTRACT:
12156 /* UBFX/SBFX. */
12157 if (speed)
12158 {
12159 if (VECTOR_MODE_P (mode))
12160 *cost += extra_cost->vect.alu;
12161 else
12162 *cost += extra_cost->alu.bfx;
12163 }
12164
12165 /* We can trust that the immediates used will be correct (there
12166 are no by-register forms), so we need only cost op0. */
12167 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12168 return true;
12169
12170 case MULT:
12171 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12172 /* aarch64_rtx_mult_cost always handles recursion to its
12173 operands. */
12174 return true;
12175
12176 case MOD:
12177       /* We can expand signed mod by a power of 2 using a NEGS, two parallel
12178          ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
12179          unconditional negate.  This case should only ever be reached through
12180          the set_smod_pow2_cheap check in expmed.c.  */
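      /* A sketch of the expansion for x % 4 in SImode (register
	 allocation will of course vary):
	     negs  w1, w0
	     and   w0, w0, 3
	     and   w1, w1, 3
	     csneg w0, w0, w1, mi  */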
12181 if (CONST_INT_P (XEXP (x, 1))
12182 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12183 && (mode == SImode || mode == DImode))
12184 {
12185 /* We expand to 4 instructions. Reset the baseline. */
12186 *cost = COSTS_N_INSNS (4);
12187
12188 if (speed)
12189 *cost += 2 * extra_cost->alu.logical
12190 + 2 * extra_cost->alu.arith;
12191
12192 return true;
12193 }
12194
12195 /* Fall-through. */
12196 case UMOD:
12197 if (speed)
12198 {
12199           /* Slightly prefer UMOD over SMOD.  */
12200 if (VECTOR_MODE_P (mode))
12201 *cost += extra_cost->vect.alu;
12202 else if (GET_MODE_CLASS (mode) == MODE_INT)
12203 *cost += (extra_cost->mult[mode == DImode].add
12204 + extra_cost->mult[mode == DImode].idiv
12205 + (code == MOD ? 1 : 0));
12206 }
12207 return false; /* All arguments need to be in registers. */
12208
12209 case DIV:
12210 case UDIV:
12211 case SQRT:
12212 if (speed)
12213 {
12214 if (VECTOR_MODE_P (mode))
12215 *cost += extra_cost->vect.alu;
12216 else if (GET_MODE_CLASS (mode) == MODE_INT)
12217 /* There is no integer SQRT, so only DIV and UDIV can get
12218 here. */
12219 *cost += (extra_cost->mult[mode == DImode].idiv
12220                     /* Slightly prefer UDIV over SDIV.  */
12221 + (code == DIV ? 1 : 0));
12222 else
12223 *cost += extra_cost->fp[mode == DFmode].div;
12224 }
12225 return false; /* All arguments need to be in registers. */
12226
12227 case IF_THEN_ELSE:
12228 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12229 XEXP (x, 2), cost, speed);
12230
12231 case EQ:
12232 case NE:
12233 case GT:
12234 case GTU:
12235 case LT:
12236 case LTU:
12237 case GE:
12238 case GEU:
12239 case LE:
12240 case LEU:
12241
12242 return false; /* All arguments must be in registers. */
12243
12244 case FMA:
12245 op0 = XEXP (x, 0);
12246 op1 = XEXP (x, 1);
12247 op2 = XEXP (x, 2);
12248
12249 if (speed)
12250 {
12251 if (VECTOR_MODE_P (mode))
12252 *cost += extra_cost->vect.alu;
12253 else
12254 *cost += extra_cost->fp[mode == DFmode].fma;
12255 }
12256
12257 /* FMSUB, FNMADD, and FNMSUB are free. */
12258 if (GET_CODE (op0) == NEG)
12259 op0 = XEXP (op0, 0);
12260
12261 if (GET_CODE (op2) == NEG)
12262 op2 = XEXP (op2, 0);
12263
12264 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12265 and the by-element operand as operand 0. */
12266 if (GET_CODE (op1) == NEG)
12267 op1 = XEXP (op1, 0);
12268
12269 /* Catch vector-by-element operations. The by-element operand can
12270 either be (vec_duplicate (vec_select (x))) or just
12271 (vec_select (x)), depending on whether we are multiplying by
12272 a vector or a scalar.
12273
12274          Canonicalization is not very good in these cases: FMA4 will put the
12275          by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
12276 if (GET_CODE (op0) == VEC_DUPLICATE)
12277 op0 = XEXP (op0, 0);
12278 else if (GET_CODE (op1) == VEC_DUPLICATE)
12279 op1 = XEXP (op1, 0);
12280
12281 if (GET_CODE (op0) == VEC_SELECT)
12282 op0 = XEXP (op0, 0);
12283 else if (GET_CODE (op1) == VEC_SELECT)
12284 op1 = XEXP (op1, 0);
12285
12286 /* If the remaining parameters are not registers,
12287 get the cost to put them into registers. */
12288 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12289 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12290 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12291 return true;
12292
12293 case FLOAT:
12294 case UNSIGNED_FLOAT:
12295 if (speed)
12296 *cost += extra_cost->fp[mode == DFmode].fromint;
12297 return false;
12298
12299 case FLOAT_EXTEND:
12300 if (speed)
12301 {
12302 if (VECTOR_MODE_P (mode))
12303 {
12304 /* Vector widening conversion. */
12305 *cost += extra_cost->vect.alu;
12306 }
12307 else
12308 *cost += extra_cost->fp[mode == DFmode].widen;
12309 }
12310 return false;
12311
12312 case FLOAT_TRUNCATE:
12313 if (speed)
12314 {
12315 if (VECTOR_MODE_P (mode))
12316 {
12317 /* Vector narrowing conversion. */
12318 *cost += extra_cost->vect.alu;
12319 }
12320 else
12321 *cost += extra_cost->fp[mode == DFmode].narrow;
12322 }
12323 return false;
12324
12325 case FIX:
12326 case UNSIGNED_FIX:
12327 x = XEXP (x, 0);
12328 /* Strip the rounding part. All of these rounding variants will be
12329 implemented by the fcvt* family of instructions anyway. */
12330 if (GET_CODE (x) == UNSPEC)
12331 {
12332 unsigned int uns_code = XINT (x, 1);
12333
12334 if (uns_code == UNSPEC_FRINTA
12335 || uns_code == UNSPEC_FRINTM
12336 || uns_code == UNSPEC_FRINTN
12337 || uns_code == UNSPEC_FRINTP
12338 || uns_code == UNSPEC_FRINTZ)
12339 x = XVECEXP (x, 0, 0);
12340 }
12341
12342 if (speed)
12343 {
12344 if (VECTOR_MODE_P (mode))
12345 *cost += extra_cost->vect.alu;
12346 else
12347 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12348 }
12349
12350 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12351 fixed-point fcvt. */
12352 if (GET_CODE (x) == MULT
12353 && ((VECTOR_MODE_P (mode)
12354 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12355 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12356 {
12357 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12358 0, speed);
12359 return true;
12360 }
12361
12362 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12363 return true;
12364
12365 case ABS:
12366 if (VECTOR_MODE_P (mode))
12367 {
12368 /* ABS (vector). */
12369 if (speed)
12370 *cost += extra_cost->vect.alu;
12371 }
12372 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12373 {
12374 op0 = XEXP (x, 0);
12375
12376 /* FABD, which is analogous to FADD. */
12377 if (GET_CODE (op0) == MINUS)
12378 {
12379 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12380 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12381 if (speed)
12382 *cost += extra_cost->fp[mode == DFmode].addsub;
12383
12384 return true;
12385 }
12386 /* Simple FABS is analogous to FNEG. */
12387 if (speed)
12388 *cost += extra_cost->fp[mode == DFmode].neg;
12389 }
12390 else
12391 {
12392 /* Integer ABS will either be split into
12393 two arithmetic instructions, or will be an ABS
12394 (scalar), which we don't model. */
12395 *cost = COSTS_N_INSNS (2);
12396 if (speed)
12397 *cost += 2 * extra_cost->alu.arith;
12398 }
12399 return false;
12400
12401 case SMAX:
12402 case SMIN:
12403 if (speed)
12404 {
12405 if (VECTOR_MODE_P (mode))
12406 *cost += extra_cost->vect.alu;
12407 else
12408 {
12409 /* FMAXNM/FMINNM/FMAX/FMIN.
12410 TODO: This may not be accurate for all implementations, but
12411 we do not model this in the cost tables. */
12412 *cost += extra_cost->fp[mode == DFmode].addsub;
12413 }
12414 }
12415 return false;
12416
12417 case UNSPEC:
12418 /* The floating point round to integer frint* instructions. */
12419 if (aarch64_frint_unspec_p (XINT (x, 1)))
12420 {
12421 if (speed)
12422 *cost += extra_cost->fp[mode == DFmode].roundint;
12423
12424 return false;
12425 }
12426
12427 if (XINT (x, 1) == UNSPEC_RBIT)
12428 {
12429 if (speed)
12430 *cost += extra_cost->alu.rev;
12431
12432 return false;
12433 }
12434 break;
12435
12436 case TRUNCATE:
12437
12438 /* Decompose <su>muldi3_highpart. */
12439 if (/* (truncate:DI */
12440 mode == DImode
12441 /* (lshiftrt:TI */
12442 && GET_MODE (XEXP (x, 0)) == TImode
12443 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12444 /* (mult:TI */
12445 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12446 /* (ANY_EXTEND:TI (reg:DI))
12447 (ANY_EXTEND:TI (reg:DI))) */
12448 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12449 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12450 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12451 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12452 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12453 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12454 /* (const_int 64) */
12455 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12456 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12457 {
12458 /* UMULH/SMULH. */
12459 if (speed)
12460 *cost += extra_cost->mult[mode == DImode].extend;
12461 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12462 mode, MULT, 0, speed);
12463 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12464 mode, MULT, 1, speed);
12465 return true;
12466 }
12467
12468 /* Fall through. */
12469 default:
12470 break;
12471 }
12472
12473 if (dump_file
12474 && flag_aarch64_verbose_cost)
12475 fprintf (dump_file,
12476 "\nFailed to cost RTX. Assuming default cost.\n");
12477
12478 return true;
12479 }
12480
12481 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
12482 calculated for X. This cost is stored in *COST. Returns true
12483 if the total cost of X was calculated. */
12484 static bool
12485 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12486 int param, int *cost, bool speed)
12487 {
12488 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12489
12490 if (dump_file
12491 && flag_aarch64_verbose_cost)
12492 {
12493 print_rtl_single (dump_file, x);
12494 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12495 speed ? "Hot" : "Cold",
12496 *cost, result ? "final" : "partial");
12497 }
12498
12499 return result;
12500 }
12501
12502 static int
12503 aarch64_register_move_cost (machine_mode mode,
12504 reg_class_t from_i, reg_class_t to_i)
12505 {
12506 enum reg_class from = (enum reg_class) from_i;
12507 enum reg_class to = (enum reg_class) to_i;
12508 const struct cpu_regmove_cost *regmove_cost
12509 = aarch64_tune_params.regmove_cost;
12510
12511 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12512 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12513 to = GENERAL_REGS;
12514
12515 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12516 from = GENERAL_REGS;
12517
12518 /* Make RDFFR very expensive. In particular, if we know that the FFR
12519 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12520 as a way of obtaining a PTRUE. */
12521 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12522 && hard_reg_set_subset_p (reg_class_contents[from_i],
12523 reg_class_contents[FFR_REGS]))
12524 return 80;
12525
12526 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
12527 if ((from == GENERAL_REGS && to == STACK_REG)
12528 || (to == GENERAL_REGS && from == STACK_REG))
12529 return regmove_cost->GP2GP;
12530
12531 /* To/From the stack register, we move via the gprs. */
12532 if (to == STACK_REG || from == STACK_REG)
12533 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12534 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12535
12536 if (known_eq (GET_MODE_SIZE (mode), 16))
12537 {
12538 /* 128-bit operations on general registers require 2 instructions. */
12539 if (from == GENERAL_REGS && to == GENERAL_REGS)
12540 return regmove_cost->GP2GP * 2;
12541 else if (from == GENERAL_REGS)
12542 return regmove_cost->GP2FP * 2;
12543 else if (to == GENERAL_REGS)
12544 return regmove_cost->FP2GP * 2;
12545
12546 /* When AdvSIMD instructions are disabled it is not possible to move
12547 a 128-bit value directly between Q registers. This is handled in
12548 secondary reload. A general register is used as a scratch to move
12549 the upper DI value and the lower DI value is moved directly,
12550 hence the cost is the sum of three moves. */
12551 if (! TARGET_SIMD)
12552 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12553
12554 return regmove_cost->FP2FP;
12555 }
12556
12557 if (from == GENERAL_REGS && to == GENERAL_REGS)
12558 return regmove_cost->GP2GP;
12559 else if (from == GENERAL_REGS)
12560 return regmove_cost->GP2FP;
12561 else if (to == GENERAL_REGS)
12562 return regmove_cost->FP2GP;
12563
12564 return regmove_cost->FP2FP;
12565 }
12566
12567 static int
12568 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12569 reg_class_t rclass ATTRIBUTE_UNUSED,
12570 bool in ATTRIBUTE_UNUSED)
12571 {
12572 return aarch64_tune_params.memmov_cost;
12573 }
12574
12575 /* Implement TARGET_INIT_BUILTINS. */
12576 static void
12577 aarch64_init_builtins ()
12578 {
12579 aarch64_general_init_builtins ();
12580 aarch64_sve::init_builtins ();
12581 }
12582
12583 /* Implement TARGET_FOLD_BUILTIN. */
12584 static tree
12585 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12586 {
12587 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12588 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12589 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12590 switch (code & AARCH64_BUILTIN_CLASS)
12591 {
12592 case AARCH64_BUILTIN_GENERAL:
12593 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12594
12595 case AARCH64_BUILTIN_SVE:
12596 return NULL_TREE;
12597 }
12598 gcc_unreachable ();
12599 }
12600
12601 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12602 static bool
12603 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12604 {
12605 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12606 tree fndecl = gimple_call_fndecl (stmt);
12607 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12608 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12609 gimple *new_stmt = NULL;
12610 switch (code & AARCH64_BUILTIN_CLASS)
12611 {
12612 case AARCH64_BUILTIN_GENERAL:
12613 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12614 break;
12615
12616 case AARCH64_BUILTIN_SVE:
12617 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12618 break;
12619 }
12620
12621 if (!new_stmt)
12622 return false;
12623
12624 gsi_replace (gsi, new_stmt, true);
12625 return true;
12626 }
12627
12628 /* Implement TARGET_EXPAND_BUILTIN. */
12629 static rtx
12630 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12631 {
12632 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12633 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12634 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12635 switch (code & AARCH64_BUILTIN_CLASS)
12636 {
12637 case AARCH64_BUILTIN_GENERAL:
12638 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12639
12640 case AARCH64_BUILTIN_SVE:
12641 return aarch64_sve::expand_builtin (subcode, exp, target);
12642 }
12643 gcc_unreachable ();
12644 }
12645
12646 /* Implement TARGET_BUILTIN_DECL. */
12647 static tree
12648 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12649 {
12650 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12651 switch (code & AARCH64_BUILTIN_CLASS)
12652 {
12653 case AARCH64_BUILTIN_GENERAL:
12654 return aarch64_general_builtin_decl (subcode, initialize_p);
12655
12656 case AARCH64_BUILTIN_SVE:
12657 return aarch64_sve::builtin_decl (subcode, initialize_p);
12658 }
12659 gcc_unreachable ();
12660 }
12661
12662 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12663 to optimize 1.0/sqrt. */
12664
12665 static bool
12666 use_rsqrt_p (machine_mode mode)
12667 {
12668 return (!flag_trapping_math
12669 && flag_unsafe_math_optimizations
12670 && ((aarch64_tune_params.approx_modes->recip_sqrt
12671 & AARCH64_APPROX_MODE (mode))
12672 || flag_mrecip_low_precision_sqrt));
12673 }
12674
12675 /* Function to decide when to use the approximate reciprocal square root
12676 builtin. */
12677
12678 static tree
12679 aarch64_builtin_reciprocal (tree fndecl)
12680 {
12681 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12682
12683 if (!use_rsqrt_p (mode))
12684 return NULL_TREE;
12685 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12686 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12687 switch (code & AARCH64_BUILTIN_CLASS)
12688 {
12689 case AARCH64_BUILTIN_GENERAL:
12690 return aarch64_general_builtin_rsqrt (subcode);
12691
12692 case AARCH64_BUILTIN_SVE:
12693 return NULL_TREE;
12694 }
12695 gcc_unreachable ();
12696 }
12697
12698 /* Emit instruction sequence to compute either the approximate square root
12699 or its approximate reciprocal, depending on the flag RECP, and return
12700 whether the sequence was emitted or not. */
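/* Roughly, the emitted sequence is Newton-Raphson refinement of the
   FRSQRTE estimate x0 ~= 1/sqrt(a):

     x_{n+1} = x_n * (3 - a * x_n^2) / 2

   where FRSQRTS computes the (3 - a * x_n^2) / 2 factor; when !RECP the
   result is multiplied by SRC at the end to turn 1/sqrt(a) into sqrt(a).  */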
12701
12702 bool
12703 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12704 {
12705 machine_mode mode = GET_MODE (dst);
12706
12707 if (GET_MODE_INNER (mode) == HFmode)
12708 {
12709 gcc_assert (!recp);
12710 return false;
12711 }
12712
12713 if (!recp)
12714 {
12715 if (!(flag_mlow_precision_sqrt
12716 || (aarch64_tune_params.approx_modes->sqrt
12717 & AARCH64_APPROX_MODE (mode))))
12718 return false;
12719
12720 if (flag_finite_math_only
12721 || flag_trapping_math
12722 || !flag_unsafe_math_optimizations
12723 || optimize_function_for_size_p (cfun))
12724 return false;
12725 }
12726 else
12727 /* Caller assumes we cannot fail. */
12728 gcc_assert (use_rsqrt_p (mode));
12729
12730 machine_mode mmsk = (VECTOR_MODE_P (mode)
12731 ? related_int_vector_mode (mode).require ()
12732 : int_mode_for_mode (mode).require ());
12733 rtx xmsk = gen_reg_rtx (mmsk);
12734 if (!recp)
12735 /* When calculating the approximate square root, compare the
12736 argument with 0.0 and create a mask. */
12737 emit_insn (gen_rtx_SET (xmsk,
12738 gen_rtx_NEG (mmsk,
12739 gen_rtx_EQ (mmsk, src,
12740 CONST0_RTX (mode)))));
12741
12742 /* Estimate the approximate reciprocal square root. */
12743 rtx xdst = gen_reg_rtx (mode);
12744 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12745
12746 /* Iterate over the series twice for SF and thrice for DF. */
12747 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12748
12749 /* Optionally run one fewer iteration for faster performance at the
12750 cost of some accuracy. */
12751 if ((recp && flag_mrecip_low_precision_sqrt)
12752 || (!recp && flag_mlow_precision_sqrt))
12753 iterations--;
12754
12755 /* Iterate over the series to calculate the approximate reciprocal square
12756 root. */
12757 rtx x1 = gen_reg_rtx (mode);
12758 while (iterations--)
12759 {
12760 rtx x2 = gen_reg_rtx (mode);
12761 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12762
12763 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12764
12765 if (iterations > 0)
12766 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12767 }
12768
12769 if (!recp)
12770 {
12771 /* Qualify the approximate reciprocal square root when the argument is
12772 0.0 by squashing the intermediate result to 0.0. */
12773 rtx xtmp = gen_reg_rtx (mmsk);
12774 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12775 gen_rtx_SUBREG (mmsk, xdst, 0)));
12776 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12777
12778 /* Calculate the approximate square root. */
12779 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12780 }
12781
12782 /* Finalize the approximation. */
12783 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12784
12785 return true;
12786 }
12787
12788 /* Emit the instruction sequence to compute the approximation for the division
12789 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
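/* Roughly, the sequence refines the FRECPE estimate x0 ~= 1/DEN with
   Newton-Raphson steps:

     x_{n+1} = x_n * (2 - DEN * x_n)

   where FRECPS computes the (2 - DEN * x_n) factor; a final multiply by
   NUM (when NUM is not 1.0) turns the reciprocal into the quotient.  */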
12790
12791 bool
12792 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12793 {
12794 machine_mode mode = GET_MODE (quo);
12795
12796 if (GET_MODE_INNER (mode) == HFmode)
12797 return false;
12798
12799 bool use_approx_division_p = (flag_mlow_precision_div
12800 || (aarch64_tune_params.approx_modes->division
12801 & AARCH64_APPROX_MODE (mode)));
12802
12803 if (!flag_finite_math_only
12804 || flag_trapping_math
12805 || !flag_unsafe_math_optimizations
12806 || optimize_function_for_size_p (cfun)
12807 || !use_approx_division_p)
12808 return false;
12809
12810 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12811 return false;
12812
12813 /* Estimate the approximate reciprocal. */
12814 rtx xrcp = gen_reg_rtx (mode);
12815 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12816
12817 /* Iterate over the series twice for SF and thrice for DF. */
12818 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12819
12820 /* Optionally run one fewer iteration for faster performance, at the
12821 cost of some accuracy. */
12822 if (flag_mlow_precision_div)
12823 iterations--;
12824
12825 /* Iterate over the series to calculate the approximate reciprocal. */
12826 rtx xtmp = gen_reg_rtx (mode);
12827 while (iterations--)
12828 {
12829 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12830
12831 if (iterations > 0)
12832 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12833 }
12834
12835 if (num != CONST1_RTX (mode))
12836 {
12837 /* As the approximate reciprocal of DEN is already calculated, only
12838 calculate the approximate division when NUM is not 1.0. */
12839 rtx xnum = force_reg (mode, num);
12840 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12841 }
12842
12843 /* Finalize the approximation. */
12844 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12845 return true;
12846 }
12847
12848 /* Return the number of instructions that can be issued per cycle. */
12849 static int
12850 aarch64_sched_issue_rate (void)
12851 {
12852 return aarch64_tune_params.issue_rate;
12853 }
12854
12855 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12856 static int
12857 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12858 {
12859 if (DEBUG_INSN_P (insn))
12860 return more;
12861
12862 rtx_code code = GET_CODE (PATTERN (insn));
12863 if (code == USE || code == CLOBBER)
12864 return more;
12865
12866 if (get_attr_type (insn) == TYPE_NO_INSN)
12867 return more;
12868
12869 return more - 1;
12870 }
12871
12872 static int
12873 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12874 {
12875 int issue_rate = aarch64_sched_issue_rate ();
12876
12877 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12878 }
12879
12880
12881 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12882 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12883 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12884
12885 static int
12886 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12887 int ready_index)
12888 {
12889 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12890 }
12891
12892
12893 /* Vectorizer cost model target hooks. */
12894
12895 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12896 static int
12897 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12898 tree vectype,
12899 int misalign ATTRIBUTE_UNUSED)
12900 {
12901 unsigned elements;
12902 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12903 bool fp = false;
12904
12905 if (vectype != NULL)
12906 fp = FLOAT_TYPE_P (vectype);
12907
12908 switch (type_of_cost)
12909 {
12910 case scalar_stmt:
12911 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12912
12913 case scalar_load:
12914 return costs->scalar_load_cost;
12915
12916 case scalar_store:
12917 return costs->scalar_store_cost;
12918
12919 case vector_stmt:
12920 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12921
12922 case vector_load:
12923 return costs->vec_align_load_cost;
12924
12925 case vector_store:
12926 return costs->vec_store_cost;
12927
12928 case vec_to_scalar:
12929 return costs->vec_to_scalar_cost;
12930
12931 case scalar_to_vec:
12932 return costs->scalar_to_vec_cost;
12933
12934 case unaligned_load:
12935 case vector_gather_load:
12936 return costs->vec_unalign_load_cost;
12937
12938 case unaligned_store:
12939 case vector_scatter_store:
12940 return costs->vec_unalign_store_cost;
12941
12942 case cond_branch_taken:
12943 return costs->cond_taken_branch_cost;
12944
12945 case cond_branch_not_taken:
12946 return costs->cond_not_taken_branch_cost;
12947
12948 case vec_perm:
12949 return costs->vec_permute_cost;
12950
12951 case vec_promote_demote:
12952 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12953
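/* A vec_construct of N elements is costed below at roughly one operation
   per pair of elements plus one, e.g. 3 for a 4-element vector.  */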
12954 case vec_construct:
12955 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12956 return elements / 2 + 1;
12957
12958 default:
12959 gcc_unreachable ();
12960 }
12961 }
12962
12963 /* Return true if STMT_INFO extends the result of a load. */
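/* For example, the widening conversion in "int64_t y = *p" with
   "int32_t *p", where the converted value comes straight from a load,
   is such a statement.  */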
12964 static bool
12965 aarch64_extending_load_p (stmt_vec_info stmt_info)
12966 {
12967 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12968 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12969 return false;
12970
12971 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
12972 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12973 tree rhs_type = TREE_TYPE (rhs);
12974 if (!INTEGRAL_TYPE_P (lhs_type)
12975 || !INTEGRAL_TYPE_P (rhs_type)
12976 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
12977 return false;
12978
12979 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
12980 return (def_stmt_info
12981 && STMT_VINFO_DATA_REF (def_stmt_info)
12982 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
12983 }
12984
12985 /* Return true if STMT_INFO is an integer truncation. */
12986 static bool
12987 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
12988 {
12989 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12990 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12991 return false;
12992
12993 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12994 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
12995 return (INTEGRAL_TYPE_P (lhs_type)
12996 && INTEGRAL_TYPE_P (rhs_type)
12997 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
12998 }
12999
13000 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13001 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13002 for SVE targets. */
13003 static unsigned int
13004 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13005 unsigned int stmt_cost)
13006 {
13007 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13008 vector register size or number of units. Integer promotions of this
13009 type therefore map to SXT[BHW] or UXT[BHW].
13010
13011 Most loads have extending forms that can do the sign or zero extension
13012 on the fly. Optimistically assume that a load followed by an extension
13013 will fold to this form during combine, and that the extension therefore
13014 comes for free. */
13015 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13016 stmt_cost = 0;
13017
13018 /* For similar reasons, vector_stmt integer truncations are a no-op,
13019 because we can just ignore the unused upper bits of the source. */
13020 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13021 stmt_cost = 0;
13022
13023 return stmt_cost;
13024 }
13025
13026 /* Implement targetm.vectorize.add_stmt_cost. */
13027 static unsigned
13028 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13029 struct _stmt_vec_info *stmt_info, int misalign,
13030 enum vect_cost_model_location where)
13031 {
13032 unsigned *cost = (unsigned *) data;
13033 unsigned retval = 0;
13034
13035 if (flag_vect_cost_model)
13036 {
13037 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13038 int stmt_cost =
13039 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13040
13041 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13042 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13043
13044 /* Statements in an inner loop relative to the loop being
13045 vectorized are weighted more heavily. The value here is
13046 arbitrary and could potentially be improved with analysis. */
13047 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13048 count *= 50; /* FIXME */
13049
13050 retval = (unsigned) (count * stmt_cost);
13051 cost[where] += retval;
13052 }
13053
13054 return retval;
13055 }
13056
13057 static void initialize_aarch64_code_model (struct gcc_options *);
13058
13059 /* Parse the TO_PARSE string and put the architecture struct that it
13060 selects into RES and the architectural features into ISA_FLAGS.
13061 Return an aarch64_parse_opt_result describing the parse result.
13062 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13063 When the TO_PARSE string contains an invalid extension,
13064 a copy of the string is created and stored in INVALID_EXTENSION. */
13065
13066 static enum aarch64_parse_opt_result
13067 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13068 uint64_t *isa_flags, std::string *invalid_extension)
13069 {
13070 const char *ext;
13071 const struct processor *arch;
13072 size_t len;
13073
13074 ext = strchr (to_parse, '+');
13075
13076 if (ext != NULL)
13077 len = ext - to_parse;
13078 else
13079 len = strlen (to_parse);
13080
13081 if (len == 0)
13082 return AARCH64_PARSE_MISSING_ARG;
13083
13084
13085 /* Loop through the list of supported ARCHes to find a match. */
13086 for (arch = all_architectures; arch->name != NULL; arch++)
13087 {
13088 if (strlen (arch->name) == len
13089 && strncmp (arch->name, to_parse, len) == 0)
13090 {
13091 uint64_t isa_temp = arch->flags;
13092
13093 if (ext != NULL)
13094 {
13095 /* TO_PARSE string contains at least one extension. */
13096 enum aarch64_parse_opt_result ext_res
13097 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13098
13099 if (ext_res != AARCH64_PARSE_OK)
13100 return ext_res;
13101 }
13102 /* Extension parsing was successful. Record the resulting
13103 arch and ISA flags. */
13104 *res = arch;
13105 *isa_flags = isa_temp;
13106 return AARCH64_PARSE_OK;
13107 }
13108 }
13109
13110 /* ARCH name not found in list. */
13111 return AARCH64_PARSE_INVALID_ARG;
13112 }
13113
13114 /* Parse the TO_PARSE string and put the result tuning in RES and the
13115 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13116 describing the parse result. If there is an error parsing, RES and
13117 ISA_FLAGS are left unchanged.
13118 When the TO_PARSE string contains an invalid extension,
13119 a copy of the string is created and stored in INVALID_EXTENSION. */
13120
13121 static enum aarch64_parse_opt_result
13122 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13123 uint64_t *isa_flags, std::string *invalid_extension)
13124 {
13125 const char *ext;
13126 const struct processor *cpu;
13127 size_t len;
13128
13129 ext = strchr (to_parse, '+');
13130
13131 if (ext != NULL)
13132 len = ext - to_parse;
13133 else
13134 len = strlen (to_parse);
13135
13136 if (len == 0)
13137 return AARCH64_PARSE_MISSING_ARG;
13138
13139
13140 /* Loop through the list of supported CPUs to find a match. */
13141 for (cpu = all_cores; cpu->name != NULL; cpu++)
13142 {
13143 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13144 {
13145 uint64_t isa_temp = cpu->flags;
13146
13147
13148 if (ext != NULL)
13149 {
13150 /* TO_PARSE string contains at least one extension. */
13151 enum aarch64_parse_opt_result ext_res
13152 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13153
13154 if (ext_res != AARCH64_PARSE_OK)
13155 return ext_res;
13156 }
13157 /* Extension parsing was successful. Record the resulting
13158 cpu and ISA flags. */
13159 *res = cpu;
13160 *isa_flags = isa_temp;
13161 return AARCH64_PARSE_OK;
13162 }
13163 }
13164
13165 /* CPU name not found in list. */
13166 return AARCH64_PARSE_INVALID_ARG;
13167 }
13168
13169 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13170 Return an aarch64_parse_opt_result describing the parse result.
13171 If the parsing fails, RES is left unchanged. */
13172
13173 static enum aarch64_parse_opt_result
13174 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13175 {
13176 const struct processor *cpu;
13177
13178 /* Loop through the list of supported CPUs to find a match. */
13179 for (cpu = all_cores; cpu->name != NULL; cpu++)
13180 {
13181 if (strcmp (cpu->name, to_parse) == 0)
13182 {
13183 *res = cpu;
13184 return AARCH64_PARSE_OK;
13185 }
13186 }
13187
13188 /* CPU name not found in list. */
13189 return AARCH64_PARSE_INVALID_ARG;
13190 }
13191
13192 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13193 described in FLAG. If it is, return the index bit for that fusion type.
13194 If not, report an error (printing OPTION_NAME) and return zero.
13195
13196 static unsigned int
13197 aarch64_parse_one_option_token (const char *token,
13198 size_t length,
13199 const struct aarch64_flag_desc *flag,
13200 const char *option_name)
13201 {
13202 for (; flag->name != NULL; flag++)
13203 {
13204 if (length == strlen (flag->name)
13205 && !strncmp (flag->name, token, length))
13206 return flag->flag;
13207 }
13208
13209 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13210 return 0;
13211 }
13212
13213 /* Parse OPTION which is a comma-separated list of flags to enable.
13214 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13215 default state we inherit from the CPU tuning structures. OPTION_NAME
13216 gives the top-level option we are parsing in the -moverride string,
13217 for use in error messages. */
13218
13219 static unsigned int
13220 aarch64_parse_boolean_options (const char *option,
13221 const struct aarch64_flag_desc *flags,
13222 unsigned int initial_state,
13223 const char *option_name)
13224 {
13225 const char separator = '.';
13226 const char* specs = option;
13227 const char* ntoken = option;
13228 unsigned int found_flags = initial_state;
13229
13230 while ((ntoken = strchr (specs, separator)))
13231 {
13232 size_t token_length = ntoken - specs;
13233 unsigned token_ops = aarch64_parse_one_option_token (specs,
13234 token_length,
13235 flags,
13236 option_name);
13237 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13238 in the token stream, reset the supported operations. So:
13239
13240 adrp+add.cmp+branch.none.adrp+add
13241
13242 would have the result of turning on only adrp+add fusion. */
13243 if (!token_ops)
13244 found_flags = 0;
13245
13246 found_flags |= token_ops;
13247 specs = ++ntoken;
13248 }
13249
13250 /* The string ended with a separator and no final token; report an error. */
13251 if (!(*specs))
13252 {
13253 error ("%s string ill-formed\n", option_name);
13254 return 0;
13255 }
13256
13257 /* We still have one more token to parse. */
13258 size_t token_length = strlen (specs);
13259 unsigned token_ops = aarch64_parse_one_option_token (specs,
13260 token_length,
13261 flags,
13262 option_name);
13263 if (!token_ops)
13264 found_flags = 0;
13265
13266 found_flags |= token_ops;
13267 return found_flags;
13268 }
13269
13270 /* Support for overriding instruction fusion. */
13271
13272 static void
13273 aarch64_parse_fuse_string (const char *fuse_string,
13274 struct tune_params *tune)
13275 {
13276 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13277 aarch64_fusible_pairs,
13278 tune->fusible_ops,
13279 "fuse=");
13280 }
13281
13282 /* Support for overriding other tuning flags. */
13283
13284 static void
13285 aarch64_parse_tune_string (const char *tune_string,
13286 struct tune_params *tune)
13287 {
13288 tune->extra_tuning_flags
13289 = aarch64_parse_boolean_options (tune_string,
13290 aarch64_tuning_flags,
13291 tune->extra_tuning_flags,
13292 "tune=");
13293 }
13294
13295 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
13296 Accept the valid SVE vector widths allowed by
13297 aarch64_sve_vector_bits_enum and use it to override sve_width
13298 in TUNE. */
13299
13300 static void
13301 aarch64_parse_sve_width_string (const char *tune_string,
13302 struct tune_params *tune)
13303 {
13304 int width = -1;
13305
13306 int n = sscanf (tune_string, "%d", &width);
13307 if (n == EOF)
13308 {
13309 error ("invalid format for sve_width");
13310 return;
13311 }
13312 switch (width)
13313 {
13314 case SVE_128:
13315 case SVE_256:
13316 case SVE_512:
13317 case SVE_1024:
13318 case SVE_2048:
13319 break;
13320 default:
13321 error ("invalid sve_width value: %d", width);
13322 }
13323 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13324 }
13325
13326 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13327 we understand. If it is, extract the option string and hand it off to
13328 the appropriate function. */
13329
13330 void
13331 aarch64_parse_one_override_token (const char* token,
13332 size_t length,
13333 struct tune_params *tune)
13334 {
13335 const struct aarch64_tuning_override_function *fn
13336 = aarch64_tuning_override_functions;
13337
13338 const char *option_part = strchr (token, '=');
13339 if (!option_part)
13340 {
13341 error ("tuning string missing in option (%s)", token);
13342 return;
13343 }
13344
13345 /* Get the length of the option name. */
13346 length = option_part - token;
13347 /* Skip the '=' to get to the option string. */
13348 option_part++;
13349
13350 for (; fn->name != NULL; fn++)
13351 {
13352 if (!strncmp (fn->name, token, length))
13353 {
13354 fn->parse_override (option_part, tune);
13355 return;
13356 }
13357 }
13358
13359 error ("unknown tuning option (%s)",token);
13360 return;
13361 }
13362
13363 /* Validate and clamp the TLS size according to the code model. */
13364
13365 static void
13366 initialize_aarch64_tls_size (struct gcc_options *opts)
13367 {
13368 if (aarch64_tls_size == 0)
13369 aarch64_tls_size = 24;
13370
13371 switch (opts->x_aarch64_cmodel_var)
13372 {
13373 case AARCH64_CMODEL_TINY:
13374 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13375 needs two instructions to address, so we clamp the size to 24. */
13376 if (aarch64_tls_size > 24)
13377 aarch64_tls_size = 24;
13378 break;
13379 case AARCH64_CMODEL_SMALL:
13380 /* The maximum TLS size allowed under small is 4G. */
13381 if (aarch64_tls_size > 32)
13382 aarch64_tls_size = 32;
13383 break;
13384 case AARCH64_CMODEL_LARGE:
13385 /* The maximum TLS size allowed under large is 16E.
13386 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13387 if (aarch64_tls_size > 48)
13388 aarch64_tls_size = 48;
13389 break;
13390 default:
13391 gcc_unreachable ();
13392 }
13393
13394 return;
13395 }
13396
13397 /* Parse STRING looking for options in the format:
13398 string :: option:string
13399 option :: name=substring
13400 name :: {a-z}
13401 substring :: defined by option. */
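/* As an illustration, an override string such as
     "fuse=adrp+add.cmp+branch:sve_width=256"
   is split at ':' into two tokens: "fuse=..." is handed to
   aarch64_parse_fuse_string and "sve_width=256" to
   aarch64_parse_sve_width_string.  */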
13402
13403 static void
13404 aarch64_parse_override_string (const char* input_string,
13405 struct tune_params* tune)
13406 {
13407 const char separator = ':';
13408 size_t string_length = strlen (input_string) + 1;
13409 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13410 char *string = string_root;
13411 strncpy (string, input_string, string_length);
13412 string[string_length - 1] = '\0';
13413
13414 char* ntoken = string;
13415
13416 while ((ntoken = strchr (string, separator)))
13417 {
13418 size_t token_length = ntoken - string;
13419 /* NUL-terminate this substring so it can be handled as a string. */
13420 *ntoken = '\0';
13421 aarch64_parse_one_override_token (string, token_length, tune);
13422 string = ++ntoken;
13423 }
13424
13425 /* One last option to parse. */
13426 aarch64_parse_one_override_token (string, strlen (string), tune);
13427 free (string_root);
13428 }
13429
13430
13431 static void
13432 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13433 {
13434 if (accepted_branch_protection_string)
13435 {
13436 opts->x_aarch64_branch_protection_string
13437 = xstrdup (accepted_branch_protection_string);
13438 }
13439
13440 /* PR 70044: We have to be careful about being called multiple times for the
13441 same function. This means all changes should be repeatable. */
13442
13443 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13444 Disable the frame pointer flag so the mid-end will not use a frame
13445 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13446 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13447 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13448 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13449 if (opts->x_flag_omit_frame_pointer == 0)
13450 opts->x_flag_omit_frame_pointer = 2;
13451
13452 /* If not optimizing for size, set the default
13453 alignment to what the target wants. */
13454 if (!opts->x_optimize_size)
13455 {
13456 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13457 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13458 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13459 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13460 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13461 opts->x_str_align_functions = aarch64_tune_params.function_align;
13462 }
13463
13464 /* We default to no pc-relative literal loads. */
13465
13466 aarch64_pcrelative_literal_loads = false;
13467
13468 /* If -mpc-relative-literal-loads is set on the command line, this
13469 implies that the user asked for PC relative literal loads. */
13470 if (opts->x_pcrelative_literal_loads == 1)
13471 aarch64_pcrelative_literal_loads = true;
13472
13473 /* In the tiny memory model it makes no sense to disallow PC relative
13474 literal pool loads. */
13475 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13476 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13477 aarch64_pcrelative_literal_loads = true;
13478
13479 /* When enabling the lower precision Newton series for the square root, also
13480 enable it for the reciprocal square root, since the latter is an
13481 intermediary step for the former. */
13482 if (flag_mlow_precision_sqrt)
13483 flag_mrecip_low_precision_sqrt = true;
13484 }
13485
13486 /* 'Unpack' the internal tuning structs and update the options
13487 in OPTS. The caller must have set up selected_tune and selected_arch
13488 as all the other target-specific codegen decisions are
13489 derived from them. */
13490
13491 void
13492 aarch64_override_options_internal (struct gcc_options *opts)
13493 {
13494 aarch64_tune_flags = selected_tune->flags;
13495 aarch64_tune = selected_tune->sched_core;
13496 /* Make a copy of the tuning parameters attached to the core, which
13497 we may later overwrite. */
13498 aarch64_tune_params = *(selected_tune->tune);
13499 aarch64_architecture_version = selected_arch->architecture_version;
13500
13501 if (opts->x_aarch64_override_tune_string)
13502 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13503 &aarch64_tune_params);
13504
13505 /* This target defaults to strict volatile bitfields. */
13506 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13507 opts->x_flag_strict_volatile_bitfields = 1;
13508
13509 if (aarch64_stack_protector_guard == SSP_GLOBAL
13510 && opts->x_aarch64_stack_protector_guard_offset_str)
13511 {
13512 error ("incompatible options %<-mstack-protector-guard=global%> and "
13513 "%<-mstack-protector-guard-offset=%s%>",
13514 aarch64_stack_protector_guard_offset_str);
13515 }
13516
13517 if (aarch64_stack_protector_guard == SSP_SYSREG
13518 && !(opts->x_aarch64_stack_protector_guard_offset_str
13519 && opts->x_aarch64_stack_protector_guard_reg_str))
13520 {
13521 error ("both %<-mstack-protector-guard-offset%> and "
13522 "%<-mstack-protector-guard-reg%> must be used "
13523 "with %<-mstack-protector-guard=sysreg%>");
13524 }
13525
13526 if (opts->x_aarch64_stack_protector_guard_reg_str)
13527 {
13528 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13529 error ("specify a system register with a small string length.");
13530 }
13531
13532 if (opts->x_aarch64_stack_protector_guard_offset_str)
13533 {
13534 char *end;
13535 const char *str = aarch64_stack_protector_guard_offset_str;
13536 errno = 0;
13537 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13538 if (!*str || *end || errno)
13539 error ("%qs is not a valid offset in %qs", str,
13540 "-mstack-protector-guard-offset=");
13541 aarch64_stack_protector_guard_offset = offs;
13542 }
13543
13544 initialize_aarch64_code_model (opts);
13545 initialize_aarch64_tls_size (opts);
13546
13547 int queue_depth = 0;
13548 switch (aarch64_tune_params.autoprefetcher_model)
13549 {
13550 case tune_params::AUTOPREFETCHER_OFF:
13551 queue_depth = -1;
13552 break;
13553 case tune_params::AUTOPREFETCHER_WEAK:
13554 queue_depth = 0;
13555 break;
13556 case tune_params::AUTOPREFETCHER_STRONG:
13557 queue_depth = max_insn_queue_index + 1;
13558 break;
13559 default:
13560 gcc_unreachable ();
13561 }
13562
13563 /* We don't mind passing in global_options_set here as we don't use
13564 the *options_set structs anyway. */
13565 SET_OPTION_IF_UNSET (opts, &global_options_set,
13566 param_sched_autopref_queue_depth, queue_depth);
13567
13568 /* Set up parameters to be used in prefetching algorithm. Do not
13569 override the defaults unless we are tuning for a core we have
13570 researched values for. */
13571 if (aarch64_tune_params.prefetch->num_slots > 0)
13572 SET_OPTION_IF_UNSET (opts, &global_options_set,
13573 param_simultaneous_prefetches,
13574 aarch64_tune_params.prefetch->num_slots);
13575 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13576 SET_OPTION_IF_UNSET (opts, &global_options_set,
13577 param_l1_cache_size,
13578 aarch64_tune_params.prefetch->l1_cache_size);
13579 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13580 SET_OPTION_IF_UNSET (opts, &global_options_set,
13581 param_l1_cache_line_size,
13582 aarch64_tune_params.prefetch->l1_cache_line_size);
13583 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13584 SET_OPTION_IF_UNSET (opts, &global_options_set,
13585 param_l2_cache_size,
13586 aarch64_tune_params.prefetch->l2_cache_size);
13587 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13588 SET_OPTION_IF_UNSET (opts, &global_options_set,
13589 param_prefetch_dynamic_strides, 0);
13590 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13591 SET_OPTION_IF_UNSET (opts, &global_options_set,
13592 param_prefetch_minimum_stride,
13593 aarch64_tune_params.prefetch->minimum_stride);
13594
13595 /* Use the alternative scheduling-pressure algorithm by default. */
13596 SET_OPTION_IF_UNSET (opts, &global_options_set,
13597 param_sched_pressure_algorithm,
13598 SCHED_PRESSURE_MODEL);
13599
13600 /* Validate the guard size. */
13601 int guard_size = param_stack_clash_protection_guard_size;
13602
13603 if (guard_size != 12 && guard_size != 16)
13604 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13605 "size. Given value %d (%llu KB) is out of range",
13606 guard_size, (1ULL << guard_size) / 1024ULL);
13607
13608 /* Enforce that the probing interval is the same as the guard size so the
13609 mid-end does the right thing. */
13610 SET_OPTION_IF_UNSET (opts, &global_options_set,
13611 param_stack_clash_protection_probe_interval,
13612 guard_size);
13613
13614 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
13615 explicitly set one, which means we need to validate that the probing
13616 interval and the guard size are equal. */
13617 int probe_interval
13618 = param_stack_clash_protection_probe_interval;
13619 if (guard_size != probe_interval)
13620 error ("stack clash guard size %<%d%> must be equal to probing interval "
13621 "%<%d%>", guard_size, probe_interval);
13622
13623 /* Enable software prefetching at the specified optimization level for
13624 CPUs that have prefetch tuning data. Lower the optimization level
13625 threshold by 1 when profiling is enabled. */
13626 if (opts->x_flag_prefetch_loop_arrays < 0
13627 && !opts->x_optimize_size
13628 && aarch64_tune_params.prefetch->default_opt_level >= 0
13629 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13630 opts->x_flag_prefetch_loop_arrays = 1;
13631
13632 if (opts->x_aarch64_arch_string == NULL)
13633 opts->x_aarch64_arch_string = selected_arch->name;
13634 if (opts->x_aarch64_cpu_string == NULL)
13635 opts->x_aarch64_cpu_string = selected_cpu->name;
13636 if (opts->x_aarch64_tune_string == NULL)
13637 opts->x_aarch64_tune_string = selected_tune->name;
13638
13639 aarch64_override_options_after_change_1 (opts);
13640 }
13641
13642 /* Print a hint with a suggestion for a core or architecture name that
13643 most closely resembles what the user passed in STR. ARCH is true if
13644 the user is asking for an architecture name. ARCH is false if the user
13645 is asking for a core name. */
13646
13647 static void
13648 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13649 {
13650 auto_vec<const char *> candidates;
13651 const struct processor *entry = arch ? all_architectures : all_cores;
13652 for (; entry->name != NULL; entry++)
13653 candidates.safe_push (entry->name);
13654
13655 #ifdef HAVE_LOCAL_CPU_DETECT
13656 /* Add also "native" as possible value. */
13657 if (arch)
13658 candidates.safe_push ("native");
13659 #endif
13660
13661 char *s;
13662 const char *hint = candidates_list_and_hint (str, s, candidates);
13663 if (hint)
13664 inform (input_location, "valid arguments are: %s;"
13665 " did you mean %qs?", s, hint);
13666 else
13667 inform (input_location, "valid arguments are: %s", s);
13668
13669 XDELETEVEC (s);
13670 }
13671
13672 /* Print a hint with a suggestion for a core name that most closely resembles
13673 what the user passed in STR. */
13674
13675 inline static void
13676 aarch64_print_hint_for_core (const char *str)
13677 {
13678 aarch64_print_hint_for_core_or_arch (str, false);
13679 }
13680
13681 /* Print a hint with a suggestion for an architecture name that most closely
13682 resembles what the user passed in STR. */
13683
13684 inline static void
13685 aarch64_print_hint_for_arch (const char *str)
13686 {
13687 aarch64_print_hint_for_core_or_arch (str, true);
13688 }
13689
13690
13691 /* Print a hint with a suggestion for an extension name
13692 that most closely resembles what the user passed in STR. */
13693
13694 void
13695 aarch64_print_hint_for_extensions (const std::string &str)
13696 {
13697 auto_vec<const char *> candidates;
13698 aarch64_get_all_extension_candidates (&candidates);
13699 char *s;
13700 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13701 if (hint)
13702 inform (input_location, "valid arguments are: %s;"
13703 " did you mean %qs?", s, hint);
13704 else
13705 inform (input_location, "valid arguments are: %s;", s);
13706
13707 XDELETEVEC (s);
13708 }
13709
13710 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13711 specified in STR and throw errors if appropriate. Put the results, if
13712 they are valid, in RES and ISA_FLAGS. Return whether the option is
13713 valid. */
13714
13715 static bool
13716 aarch64_validate_mcpu (const char *str, const struct processor **res,
13717 uint64_t *isa_flags)
13718 {
13719 std::string invalid_extension;
13720 enum aarch64_parse_opt_result parse_res
13721 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13722
13723 if (parse_res == AARCH64_PARSE_OK)
13724 return true;
13725
13726 switch (parse_res)
13727 {
13728 case AARCH64_PARSE_MISSING_ARG:
13729 error ("missing cpu name in %<-mcpu=%s%>", str);
13730 break;
13731 case AARCH64_PARSE_INVALID_ARG:
13732 error ("unknown value %qs for %<-mcpu%>", str);
13733 aarch64_print_hint_for_core (str);
13734 break;
13735 case AARCH64_PARSE_INVALID_FEATURE:
13736 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13737 invalid_extension.c_str (), str);
13738 aarch64_print_hint_for_extensions (invalid_extension);
13739 break;
13740 default:
13741 gcc_unreachable ();
13742 }
13743
13744 return false;
13745 }
13746
13747 /* Parses CONST_STR for branch protection features specified in
13748 aarch64_branch_protect_types, and sets any global variables required. Returns
13749 the parsing result and assigns LAST_STR to the last processed token from
13750 CONST_STR so that it can be used for error reporting. */
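/* For example, a string such as "pac-ret+leaf+bti" is tokenized at '+':
   "pac-ret" matches a top-level type, "leaf" one of its subtypes, and
   "bti" a further top-level type.  */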
13751
13752 static enum
13753 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13754 char** last_str)
13755 {
13756 char *str_root = xstrdup (const_str);
13757 char* token_save = NULL;
13758 char *str = strtok_r (str_root, "+", &token_save);
13759 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13760 if (!str)
13761 res = AARCH64_PARSE_MISSING_ARG;
13762 else
13763 {
13764 char *next_str = strtok_r (NULL, "+", &token_save);
13765 /* Reset the branch protection features to their defaults. */
13766 aarch64_handle_no_branch_protection (NULL, NULL);
13767
13768 while (str && res == AARCH64_PARSE_OK)
13769 {
13770 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13771 bool found = false;
13772 /* Search for this type. */
13773 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13774 {
13775 if (strcmp (str, type->name) == 0)
13776 {
13777 found = true;
13778 res = type->handler (str, next_str);
13779 str = next_str;
13780 next_str = strtok_r (NULL, "+", &token_save);
13781 }
13782 else
13783 type++;
13784 }
13785 if (found && res == AARCH64_PARSE_OK)
13786 {
13787 bool found_subtype = true;
13788 /* Loop through each token until we find one that isn't a
13789 subtype. */
13790 while (found_subtype)
13791 {
13792 found_subtype = false;
13793 const aarch64_branch_protect_type *subtype = type->subtypes;
13794 /* Search for the subtype. */
13795 while (str && subtype && subtype->name && !found_subtype
13796 && res == AARCH64_PARSE_OK)
13797 {
13798 if (strcmp (str, subtype->name) == 0)
13799 {
13800 found_subtype = true;
13801 res = subtype->handler (str, next_str);
13802 str = next_str;
13803 next_str = strtok_r (NULL, "+", &token_save);
13804 }
13805 else
13806 subtype++;
13807 }
13808 }
13809 }
13810 else if (!found)
13811 res = AARCH64_PARSE_INVALID_ARG;
13812 }
13813 }
13814 /* Copy the last processed token into the argument to pass it back.
13815 Used by option and attribute validation to print the offending token. */
13816 if (last_str)
13817 {
13818 if (str) strcpy (*last_str, str);
13819 else *last_str = NULL;
13820 }
13821 if (res == AARCH64_PARSE_OK)
13822 {
13823 /* If needed, allocate the accepted string, then copy in const_str.
13824 Used by aarch64_override_options_after_change_1. */
13825 if (!accepted_branch_protection_string)
13826 accepted_branch_protection_string = (char *) xmalloc (
13827 BRANCH_PROTECT_STR_MAX
13828 + 1);
13829 strncpy (accepted_branch_protection_string, const_str,
13830 BRANCH_PROTECT_STR_MAX + 1);
13831 /* Forcibly null-terminate. */
13832 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13833 }
13834 return res;
13835 }
13836
13837 static bool
13838 aarch64_validate_mbranch_protection (const char *const_str)
13839 {
13840 /* Allow for the terminating NUL of the longest possible token. */
13840 char *str = (char *) xmalloc (strlen (const_str) + 1);
13841 enum aarch64_parse_opt_result res =
13842 aarch64_parse_branch_protection (const_str, &str);
13843 if (res == AARCH64_PARSE_INVALID_ARG)
13844 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13845 else if (res == AARCH64_PARSE_MISSING_ARG)
13846 error ("missing argument for %<-mbranch-protection=%>");
13847 free (str);
13848 return res == AARCH64_PARSE_OK;
13849 }
13850
13851 /* Validate a command-line -march option. Parse the arch and extensions
13852 (if any) specified in STR and throw errors if appropriate. Put the
13853 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13854 option is valid. */
13855
13856 static bool
13857 aarch64_validate_march (const char *str, const struct processor **res,
13858 uint64_t *isa_flags)
13859 {
13860 std::string invalid_extension;
13861 enum aarch64_parse_opt_result parse_res
13862 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13863
13864 if (parse_res == AARCH64_PARSE_OK)
13865 return true;
13866
13867 switch (parse_res)
13868 {
13869 case AARCH64_PARSE_MISSING_ARG:
13870 error ("missing arch name in %<-march=%s%>", str);
13871 break;
13872 case AARCH64_PARSE_INVALID_ARG:
13873 error ("unknown value %qs for %<-march%>", str);
13874 aarch64_print_hint_for_arch (str);
13875 break;
13876 case AARCH64_PARSE_INVALID_FEATURE:
13877 error ("invalid feature modifier %qs in %<-march=%s%>",
13878 invalid_extension.c_str (), str);
13879 aarch64_print_hint_for_extensions (invalid_extension);
13880 break;
13881 default:
13882 gcc_unreachable ();
13883 }
13884
13885 return false;
13886 }
13887
13888 /* Validate a command-line -mtune option. Parse the cpu
13889 specified in STR and throw errors if appropriate. Put the
13890 result, if it is valid, in RES. Return whether the option is
13891 valid. */
13892
13893 static bool
13894 aarch64_validate_mtune (const char *str, const struct processor **res)
13895 {
13896 enum aarch64_parse_opt_result parse_res
13897 = aarch64_parse_tune (str, res);
13898
13899 if (parse_res == AARCH64_PARSE_OK)
13900 return true;
13901
13902 switch (parse_res)
13903 {
13904 case AARCH64_PARSE_MISSING_ARG:
13905 error ("missing cpu name in %<-mtune=%s%>", str);
13906 break;
13907 case AARCH64_PARSE_INVALID_ARG:
13908 error ("unknown value %qs for %<-mtune%>", str);
13909 aarch64_print_hint_for_core (str);
13910 break;
13911 default:
13912 gcc_unreachable ();
13913 }
13914 return false;
13915 }
13916
13917 /* Return the CPU corresponding to the enum CPU.
13918 If it doesn't specify a cpu, return the default. */
13919
13920 static const struct processor *
13921 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13922 {
13923 if (cpu != aarch64_none)
13924 return &all_cores[cpu];
13925
13926 /* The & 0x3f is to extract the bottom 6 bits that encode the
13927 default cpu as selected by the --with-cpu GCC configure option
13928 in config.gcc.
13929 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13930 flags mechanism should be reworked to make it more sane. */
13931 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13932 }
13933
13934 /* Return the architecture corresponding to the enum ARCH.
13935 If it doesn't specify a valid architecture, return the default. */
13936
13937 static const struct processor *
13938 aarch64_get_arch (enum aarch64_arch arch)
13939 {
13940 if (arch != aarch64_no_arch)
13941 return &all_architectures[arch];
13942
13943 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13944
13945 return &all_architectures[cpu->arch];
13946 }
13947
13948 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
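/* For example, -msve-vector-bits=512 corresponds to VG == 512 / 64 == 8,
   while SVE_SCALABLE and SVE_128 yield the (2, 2) poly_uint16 used for
   vector-length agnostic code.  */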
13949
13950 static poly_uint16
13951 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13952 {
13953 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
13954 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
13955 deciding which .md file patterns to use and when deciding whether
13956 something is a legitimate address or constant. */
13957 if (value == SVE_SCALABLE || value == SVE_128)
13958 return poly_uint16 (2, 2);
13959 else
13960 return (int) value / 64;
13961 }
13962
13963 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13964 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13965 tuning structs. In particular it must set selected_tune and
13966 aarch64_isa_flags that define the available ISA features and tuning
13967 decisions. It must also set selected_arch as this will be used to
13968 output the .arch asm tags for each function. */
13969
13970 static void
13971 aarch64_override_options (void)
13972 {
13973 uint64_t cpu_isa = 0;
13974 uint64_t arch_isa = 0;
13975 aarch64_isa_flags = 0;
13976
13977 bool valid_cpu = true;
13978 bool valid_tune = true;
13979 bool valid_arch = true;
13980
13981 selected_cpu = NULL;
13982 selected_arch = NULL;
13983 selected_tune = NULL;
13984
13985 if (aarch64_branch_protection_string)
13986 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13987
13988 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13989 If either of -march or -mtune is given, they override their
13990 respective component of -mcpu. */
13991 if (aarch64_cpu_string)
13992 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13993 &cpu_isa);
13994
13995 if (aarch64_arch_string)
13996 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13997 &arch_isa);
13998
13999 if (aarch64_tune_string)
14000 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14001
14002 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14003 SUBTARGET_OVERRIDE_OPTIONS;
14004 #endif
14005
14006 /* If the user did not specify a processor, choose the default
14007 one for them. This will be the CPU set during configuration using
14008 --with-cpu, otherwise it is "generic". */
14009 if (!selected_cpu)
14010 {
14011 if (selected_arch)
14012 {
14013 selected_cpu = &all_cores[selected_arch->ident];
14014 aarch64_isa_flags = arch_isa;
14015 explicit_arch = selected_arch->arch;
14016 }
14017 else
14018 {
14019 /* Get default configure-time CPU. */
14020 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14021 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14022 }
14023
14024 if (selected_tune)
14025 explicit_tune_core = selected_tune->ident;
14026 }
14027 /* If both -mcpu and -march are specified, check that they are architecturally
14028 compatible; warn if they're not, and prefer the -march ISA flags. */
14029 else if (selected_arch)
14030 {
14031 if (selected_arch->arch != selected_cpu->arch)
14032 {
14033 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14034 all_architectures[selected_cpu->arch].name,
14035 selected_arch->name);
14036 }
14037 aarch64_isa_flags = arch_isa;
14038 explicit_arch = selected_arch->arch;
14039 explicit_tune_core = selected_tune ? selected_tune->ident
14040 : selected_cpu->ident;
14041 }
14042 else
14043 {
14044 /* -mcpu but no -march. */
14045 aarch64_isa_flags = cpu_isa;
14046 explicit_tune_core = selected_tune ? selected_tune->ident
14047 : selected_cpu->ident;
14048 gcc_assert (selected_cpu);
14049 selected_arch = &all_architectures[selected_cpu->arch];
14050 explicit_arch = selected_arch->arch;
14051 }
14052
14053 /* Set the arch as well, as we will need it when outputting
14054 the .arch directive in assembly. */
14055 if (!selected_arch)
14056 {
14057 gcc_assert (selected_cpu);
14058 selected_arch = &all_architectures[selected_cpu->arch];
14059 }
14060
14061 if (!selected_tune)
14062 selected_tune = selected_cpu;
14063
14064 if (aarch64_enable_bti == 2)
14065 {
14066 #ifdef TARGET_ENABLE_BTI
14067 aarch64_enable_bti = 1;
14068 #else
14069 aarch64_enable_bti = 0;
14070 #endif
14071 }
14072
14073 /* Return address signing is currently not supported for ILP32 targets. For
14074 LP64 targets use the configured option in the absence of a command-line
14075 option for -mbranch-protection. */
14076 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14077 {
14078 #ifdef TARGET_ENABLE_PAC_RET
14079 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14080 #else
14081 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14082 #endif
14083 }
14084
14085 #ifndef HAVE_AS_MABI_OPTION
14086 /* The compiler may have been configured with 2.23.* binutils, which does
14087 not have support for ILP32. */
14088 if (TARGET_ILP32)
14089 error ("assembler does not support %<-mabi=ilp32%>");
14090 #endif
14091
14092 /* Convert -msve-vector-bits to a VG count. */
14093 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14094
14095 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14096 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14097
14098 /* Make sure we properly set up the explicit options. */
14099 if ((aarch64_cpu_string && valid_cpu)
14100 || (aarch64_tune_string && valid_tune))
14101 gcc_assert (explicit_tune_core != aarch64_none);
14102
14103 if ((aarch64_cpu_string && valid_cpu)
14104 || (aarch64_arch_string && valid_arch))
14105 gcc_assert (explicit_arch != aarch64_no_arch);
14106
14107 /* The pass to insert speculation tracking runs before
14108 shrink-wrapping and the latter does not know how to update the
14109 tracking status, so disable shrink-wrapping in this case. */
14110 if (aarch64_track_speculation)
14111 flag_shrink_wrap = 0;
14112
14113 aarch64_override_options_internal (&global_options);
14114
14115 /* Save these options as the default ones in case we push and pop them later
14116 while processing functions with potential target attributes. */
14117 target_option_default_node = target_option_current_node
14118 = build_target_option_node (&global_options);
14119 }
14120
14121 /* Implement targetm.override_options_after_change. */
14122
14123 static void
14124 aarch64_override_options_after_change (void)
14125 {
14126 aarch64_override_options_after_change_1 (&global_options);
14127 }
14128
14129 static struct machine_function *
14130 aarch64_init_machine_status (void)
14131 {
14132 struct machine_function *machine;
14133 machine = ggc_cleared_alloc<machine_function> ();
14134 return machine;
14135 }
14136
14137 void
14138 aarch64_init_expanders (void)
14139 {
14140 init_machine_status = aarch64_init_machine_status;
14141 }
14142
14143 /* Choose the code model to use, taking the PIC options into account. */
14144 static void
14145 initialize_aarch64_code_model (struct gcc_options *opts)
14146 {
14147 if (opts->x_flag_pic)
14148 {
14149 switch (opts->x_aarch64_cmodel_var)
14150 {
14151 case AARCH64_CMODEL_TINY:
14152 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14153 break;
14154 case AARCH64_CMODEL_SMALL:
14155 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14156 aarch64_cmodel = (flag_pic == 2
14157 ? AARCH64_CMODEL_SMALL_PIC
14158 : AARCH64_CMODEL_SMALL_SPIC);
14159 #else
14160 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14161 #endif
14162 break;
14163 case AARCH64_CMODEL_LARGE:
14164 sorry ("code model %qs with %<-f%s%>", "large",
14165 opts->x_flag_pic > 1 ? "PIC" : "pic");
14166 break;
14167 default:
14168 gcc_unreachable ();
14169 }
14170 }
14171 else
14172 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14173 }
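/* To illustrate the mapping above (a summary of the switch, not new
   behaviour): with -mcmodel=small, -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC whereas -fpic selects AARCH64_CMODEL_SMALL_SPIC
   when the assembler supports the small PIC relocations; without
   -fpic/-fPIC the user-requested model is used unchanged.  */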
14174
14175 /* Implement TARGET_OPTION_SAVE. */
14176
14177 static void
14178 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14179 {
14180 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14181 ptr->x_aarch64_branch_protection_string
14182 = opts->x_aarch64_branch_protection_string;
14183 }
14184
14185 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14186 using the information saved in PTR. */
14187
14188 static void
14189 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14190 {
14191 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14192 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14193 opts->x_explicit_arch = ptr->x_explicit_arch;
14194 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14195 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14196 opts->x_aarch64_branch_protection_string
14197 = ptr->x_aarch64_branch_protection_string;
14198 if (opts->x_aarch64_branch_protection_string)
14199 {
14200 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14201 NULL);
14202 }
14203
14204 aarch64_override_options_internal (opts);
14205 }
14206
14207 /* Implement TARGET_OPTION_PRINT. */
14208
14209 static void
14210 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14211 {
14212 const struct processor *cpu
14213 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14214 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14215 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14216 std::string extension
14217 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14218
14219 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14220 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14221 arch->name, extension.c_str ());
14222 }
14223
14224 static GTY(()) tree aarch64_previous_fndecl;
14225
14226 void
14227 aarch64_reset_previous_fndecl (void)
14228 {
14229 aarch64_previous_fndecl = NULL;
14230 }
14231
14232 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14233 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14234 make sure optab availability predicates are recomputed when necessary. */
14235
14236 void
14237 aarch64_save_restore_target_globals (tree new_tree)
14238 {
14239 if (TREE_TARGET_GLOBALS (new_tree))
14240 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14241 else if (new_tree == target_option_default_node)
14242 restore_target_globals (&default_target_globals);
14243 else
14244 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14245 }
14246
14247 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14248 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14249 of the function, if such exists. This function may be called multiple
14250 times on a single function so use aarch64_previous_fndecl to avoid
14251 setting up identical state. */
14252
14253 static void
14254 aarch64_set_current_function (tree fndecl)
14255 {
14256 if (!fndecl || fndecl == aarch64_previous_fndecl)
14257 return;
14258
14259 tree old_tree = (aarch64_previous_fndecl
14260 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14261 : NULL_TREE);
14262
14263 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14264
14265 /* If current function has no attributes but the previous one did,
14266 use the default node. */
14267 if (!new_tree && old_tree)
14268 new_tree = target_option_default_node;
14269
14270 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14271 the default have been handled by aarch64_save_restore_target_globals from
14272 aarch64_pragma_target_parse. */
14273 if (old_tree == new_tree)
14274 return;
14275
14276 aarch64_previous_fndecl = fndecl;
14277
14278 /* First set the target options. */
14279 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14280
14281 aarch64_save_restore_target_globals (new_tree);
14282 }
14283
14284 /* Enum describing the various ways we can handle attributes.
14285 In many cases we can reuse the generic option handling machinery. */
14286
14287 enum aarch64_attr_opt_type
14288 {
14289 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14290 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14291 aarch64_attr_enum, /* Attribute sets an enum variable. */
14292 aarch64_attr_custom /* Attribute requires a custom handling function. */
14293 };
14294
14295 /* All the information needed to handle a target attribute.
14296 NAME is the name of the attribute.
14297 ATTR_TYPE specifies the type of behavior of the attribute as described
14298 in the definition of enum aarch64_attr_opt_type.
14299 ALLOW_NEG is true if the attribute supports a "no-" form.
14300 HANDLER is the function that takes the attribute string as an argument.
14301 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14302 OPT_NUM is the enum specifying the option that the attribute modifies.
14303 This is needed for attributes that mirror the behavior of a command-line
14304 option; that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14305 aarch64_attr_enum. */
14306
14307 struct aarch64_attribute_info
14308 {
14309 const char *name;
14310 enum aarch64_attr_opt_type attr_type;
14311 bool allow_neg;
14312 bool (*handler) (const char *);
14313 enum opt_code opt_num;
14314 };
14315
14316 /* Handle the ARCH_STR argument to the arch= target attribute. */
14317
14318 static bool
14319 aarch64_handle_attr_arch (const char *str)
14320 {
14321 const struct processor *tmp_arch = NULL;
14322 std::string invalid_extension;
14323 enum aarch64_parse_opt_result parse_res
14324 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14325
14326 if (parse_res == AARCH64_PARSE_OK)
14327 {
14328 gcc_assert (tmp_arch);
14329 selected_arch = tmp_arch;
14330 explicit_arch = selected_arch->arch;
14331 return true;
14332 }
14333
14334 switch (parse_res)
14335 {
14336 case AARCH64_PARSE_MISSING_ARG:
14337 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14338 break;
14339 case AARCH64_PARSE_INVALID_ARG:
14340 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14341 aarch64_print_hint_for_arch (str);
14342 break;
14343 case AARCH64_PARSE_INVALID_FEATURE:
14344 error ("invalid feature modifier %s of value (\"%s\") in "
14345 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14346 aarch64_print_hint_for_extensions (invalid_extension);
14347 break;
14348 default:
14349 gcc_unreachable ();
14350 }
14351
14352 return false;
14353 }
14354
14355 /* Handle the argument CPU_STR to the cpu= target attribute. */
14356
14357 static bool
14358 aarch64_handle_attr_cpu (const char *str)
14359 {
14360 const struct processor *tmp_cpu = NULL;
14361 std::string invalid_extension;
14362 enum aarch64_parse_opt_result parse_res
14363 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14364
14365 if (parse_res == AARCH64_PARSE_OK)
14366 {
14367 gcc_assert (tmp_cpu);
14368 selected_tune = tmp_cpu;
14369 explicit_tune_core = selected_tune->ident;
14370
14371 selected_arch = &all_architectures[tmp_cpu->arch];
14372 explicit_arch = selected_arch->arch;
14373 return true;
14374 }
14375
14376 switch (parse_res)
14377 {
14378 case AARCH64_PARSE_MISSING_ARG:
14379 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14380 break;
14381 case AARCH64_PARSE_INVALID_ARG:
14382 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14383 aarch64_print_hint_for_core (str);
14384 break;
14385 case AARCH64_PARSE_INVALID_FEATURE:
14386 error ("invalid feature modifier %s of value (\"%s\") in "
14387 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14388 aarch64_print_hint_for_extensions (invalid_extension);
14389 break;
14390 default:
14391 gcc_unreachable ();
14392 }
14393
14394 return false;
14395 }
14396
14397 /* Handle the argument STR to the branch-protection= attribute. */
14398
14399 static bool
14400 aarch64_handle_attr_branch_protection (const char* str)
14401 {
14402 char *err_str = (char *) xmalloc (strlen (str) + 1);
14403 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14404 &err_str);
14405 bool success = false;
14406 switch (res)
14407 {
14408 case AARCH64_PARSE_MISSING_ARG:
14409 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14410 " attribute");
14411 break;
14412 case AARCH64_PARSE_INVALID_ARG:
14413 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14414 "=\")%> pragma or attribute", err_str);
14415 break;
14416 case AARCH64_PARSE_OK:
14417 success = true;
14418 /* Fall through. */
14419 case AARCH64_PARSE_INVALID_FEATURE:
14420 break;
14421 default:
14422 gcc_unreachable ();
14423 }
14424 free (err_str);
14425 return success;
14426 }
14427
14428 /* Handle the argument STR to the tune= target attribute. */
14429
14430 static bool
14431 aarch64_handle_attr_tune (const char *str)
14432 {
14433 const struct processor *tmp_tune = NULL;
14434 enum aarch64_parse_opt_result parse_res
14435 = aarch64_parse_tune (str, &tmp_tune);
14436
14437 if (parse_res == AARCH64_PARSE_OK)
14438 {
14439 gcc_assert (tmp_tune);
14440 selected_tune = tmp_tune;
14441 explicit_tune_core = selected_tune->ident;
14442 return true;
14443 }
14444
14445 switch (parse_res)
14446 {
14447 case AARCH64_PARSE_INVALID_ARG:
14448 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14449 aarch64_print_hint_for_core (str);
14450 break;
14451 default:
14452 gcc_unreachable ();
14453 }
14454
14455 return false;
14456 }
14457
14458 /* Parse an architecture extension target attribute string specified in STR.
14459 For example "+fp+nosimd". Report any errors if needed. Return TRUE
14460 if successful. Update aarch64_isa_flags to reflect the ISA features
14461 modified. */
14462
14463 static bool
14464 aarch64_handle_attr_isa_flags (char *str)
14465 {
14466 enum aarch64_parse_opt_result parse_res;
14467 uint64_t isa_flags = aarch64_isa_flags;
14468
14469 /* We allow "+nothing" in the beginning to clear out all architectural
14470 features if the user wants to handpick specific features. */
14471 if (strncmp ("+nothing", str, 8) == 0)
14472 {
14473 isa_flags = 0;
14474 str += 8;
14475 }
14476
14477 std::string invalid_extension;
14478 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14479
14480 if (parse_res == AARCH64_PARSE_OK)
14481 {
14482 aarch64_isa_flags = isa_flags;
14483 return true;
14484 }
14485
14486 switch (parse_res)
14487 {
14488 case AARCH64_PARSE_MISSING_ARG:
14489 error ("missing value in %<target()%> pragma or attribute");
14490 break;
14491
14492 case AARCH64_PARSE_INVALID_FEATURE:
14493 error ("invalid feature modifier %s of value (\"%s\") in "
14494 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14495 break;
14496
14497 default:
14498 gcc_unreachable ();
14499 }
14500
14501 return false;
14502 }
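/* Example of the "+nothing" handling above (illustrative):
   __attribute__ ((target ("+nothing+fp"))) first clears every architectural
   feature bit and then re-enables just the FP extension (plus anything
   aarch64_parse_extension enables alongside it), whereas a plain "+nosimd"
   starts from the current aarch64_isa_flags and only removes the SIMD
   bit.  */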
14503
14504 /* The target attributes that we support. On top of these we also support just
14505 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14506 handled explicitly in aarch64_process_one_target_attr. */
14507
14508 static const struct aarch64_attribute_info aarch64_attributes[] =
14509 {
14510 { "general-regs-only", aarch64_attr_mask, false, NULL,
14511 OPT_mgeneral_regs_only },
14512 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14513 OPT_mfix_cortex_a53_835769 },
14514 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14515 OPT_mfix_cortex_a53_843419 },
14516 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14517 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14518 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14519 OPT_momit_leaf_frame_pointer },
14520 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14521 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14522 OPT_march_ },
14523 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14524 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14525 OPT_mtune_ },
14526 { "branch-protection", aarch64_attr_custom, false,
14527 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14528 { "sign-return-address", aarch64_attr_enum, false, NULL,
14529 OPT_msign_return_address_ },
14530 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14531 };
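/* A hedged usage sketch for the table above (user-level code, not part of
   the compiler): the attribute names map directly onto the corresponding
   command-line options, e.g.

     __attribute__ ((target ("arch=armv8.2-a+crc")))
     void with_crc (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer,strict-align")))
     void careful (void);

   Bare extension strings such as "+crc" (with no "name=") are also accepted
   and are routed to aarch64_handle_attr_isa_flags by
   aarch64_process_one_target_attr below.  */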
14532
14533 /* Parse ARG_STR which contains the definition of one target attribute.
14534 Show appropriate errors if any or return true if the attribute is valid. */
14535
14536 static bool
14537 aarch64_process_one_target_attr (char *arg_str)
14538 {
14539 bool invert = false;
14540
14541 size_t len = strlen (arg_str);
14542
14543 if (len == 0)
14544 {
14545 error ("malformed %<target()%> pragma or attribute");
14546 return false;
14547 }
14548
14549 char *str_to_check = (char *) alloca (len + 1);
14550 strcpy (str_to_check, arg_str);
14551
14552 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14553 It is easier to detect and handle it explicitly here rather than going
14554 through the machinery for the rest of the target attributes in this
14555 function. */
14556 if (*str_to_check == '+')
14557 return aarch64_handle_attr_isa_flags (str_to_check);
14558
14559 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14560 {
14561 invert = true;
14562 str_to_check += 3;
14563 }
14564 char *arg = strchr (str_to_check, '=');
14565
14566 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14567 and point ARG to "foo". */
14568 if (arg)
14569 {
14570 *arg = '\0';
14571 arg++;
14572 }
14573 const struct aarch64_attribute_info *p_attr;
14574 bool found = false;
14575 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14576 {
14577 /* If the names don't match up, or the user has given an argument
14578 to an attribute that doesn't accept one, or didn't give an argument
14579 to an attribute that expects one, fail to match. */
14580 if (strcmp (str_to_check, p_attr->name) != 0)
14581 continue;
14582
14583 found = true;
14584 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14585 || p_attr->attr_type == aarch64_attr_enum;
14586
14587 if (attr_need_arg_p ^ (arg != NULL))
14588 {
14589 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14590 return false;
14591 }
14592
14593 /* If the name matches but the attribute does not allow "no-" versions
14594 then we can't match. */
14595 if (invert && !p_attr->allow_neg)
14596 {
14597 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14598 return false;
14599 }
14600
14601 switch (p_attr->attr_type)
14602 {
14603 /* Has a custom handler registered.
14604 For example, cpu=, arch=, tune=. */
14605 case aarch64_attr_custom:
14606 gcc_assert (p_attr->handler);
14607 if (!p_attr->handler (arg))
14608 return false;
14609 break;
14610
14611 /* Either set or unset a boolean option. */
14612 case aarch64_attr_bool:
14613 {
14614 struct cl_decoded_option decoded;
14615
14616 generate_option (p_attr->opt_num, NULL, !invert,
14617 CL_TARGET, &decoded);
14618 aarch64_handle_option (&global_options, &global_options_set,
14619 &decoded, input_location);
14620 break;
14621 }
14622 /* Set or unset a bit in the target_flags. aarch64_handle_option
14623 should know what mask to apply given the option number. */
14624 case aarch64_attr_mask:
14625 {
14626 struct cl_decoded_option decoded;
14627 /* We only need to specify the option number.
14628 aarch64_handle_option will know which mask to apply. */
14629 decoded.opt_index = p_attr->opt_num;
14630 decoded.value = !invert;
14631 aarch64_handle_option (&global_options, &global_options_set,
14632 &decoded, input_location);
14633 break;
14634 }
14635 /* Use the option setting machinery to set an option to an enum. */
14636 case aarch64_attr_enum:
14637 {
14638 gcc_assert (arg);
14639 bool valid;
14640 int value;
14641 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14642 &value, CL_TARGET);
14643 if (valid)
14644 {
14645 set_option (&global_options, NULL, p_attr->opt_num, value,
14646 NULL, DK_UNSPECIFIED, input_location,
14647 global_dc);
14648 }
14649 else
14650 {
14651 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14652 }
14653 break;
14654 }
14655 default:
14656 gcc_unreachable ();
14657 }
14658 }
14659
14660 /* If we reached here we have either found an attribute and validated
14661 it or didn't match any. If we matched an attribute but its arguments
14662 were malformed we will have returned false already. */
14663 return found;
14664 }
14665
14666 /* Count how many times the character C appears in
14667 NULL-terminated string STR. */
14668
14669 static unsigned int
14670 num_occurences_in_str (char c, char *str)
14671 {
14672 unsigned int res = 0;
14673 while (*str != '\0')
14674 {
14675 if (*str == c)
14676 res++;
14677
14678 str++;
14679 }
14680
14681 return res;
14682 }
14683
14684 /* Parse the tree in ARGS that contains the target attribute information
14685 and update the global target options space. */
14686
14687 bool
14688 aarch64_process_target_attr (tree args)
14689 {
14690 if (TREE_CODE (args) == TREE_LIST)
14691 {
14692 do
14693 {
14694 tree head = TREE_VALUE (args);
14695 if (head)
14696 {
14697 if (!aarch64_process_target_attr (head))
14698 return false;
14699 }
14700 args = TREE_CHAIN (args);
14701 } while (args);
14702
14703 return true;
14704 }
14705
14706 if (TREE_CODE (args) != STRING_CST)
14707 {
14708 error ("attribute %<target%> argument not a string");
14709 return false;
14710 }
14711
14712 size_t len = strlen (TREE_STRING_POINTER (args));
14713 char *str_to_check = (char *) alloca (len + 1);
14714 strcpy (str_to_check, TREE_STRING_POINTER (args));
14715
14716 if (len == 0)
14717 {
14718 error ("malformed %<target()%> pragma or attribute");
14719 return false;
14720 }
14721
14722 /* Used to catch empty strings between commas, i.e.
14723 attribute ((target ("attr1,,attr2"))). */
14724 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14725
14726 /* Handle multiple target attributes separated by ','. */
14727 char *token = strtok_r (str_to_check, ",", &str_to_check);
14728
14729 unsigned int num_attrs = 0;
14730 while (token)
14731 {
14732 num_attrs++;
14733 if (!aarch64_process_one_target_attr (token))
14734 {
14735 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14736 return false;
14737 }
14738
14739 token = strtok_r (NULL, ",", &str_to_check);
14740 }
14741
14742 if (num_attrs != num_commas + 1)
14743 {
14744 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14745 return false;
14746 }
14747
14748 return true;
14749 }
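/* For example (illustrative of the comma handling above):
   target ("tune=cortex-a53,no-strict-align") is split into two tokens, each
   processed by aarch64_process_one_target_attr, while
   target ("tune=cortex-a53,,arch=armv8-a") is rejected because the empty
   string between the commas makes num_attrs != num_commas + 1.  */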
14750
14751 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14752 process attribute ((target ("..."))). */
14753
14754 static bool
14755 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14756 {
14757 struct cl_target_option cur_target;
14758 bool ret;
14759 tree old_optimize;
14760 tree new_target, new_optimize;
14761 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14762
14763 /* If what we're processing is the current pragma string then the
14764 target option node is already stored in target_option_current_node
14765 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14766 having to re-parse the string. This is especially useful to keep
14767 arm_neon.h compile times down since that header contains a lot
14768 of intrinsics enclosed in pragmas. */
14769 if (!existing_target && args == current_target_pragma)
14770 {
14771 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14772 return true;
14773 }
14774 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14775
14776 old_optimize = build_optimization_node (&global_options);
14777 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14778
14779 /* If the function changed the optimization levels as well as setting
14780 target options, start with the optimizations specified. */
14781 if (func_optimize && func_optimize != old_optimize)
14782 cl_optimization_restore (&global_options,
14783 TREE_OPTIMIZATION (func_optimize));
14784
14785 /* Save the current target options to restore at the end. */
14786 cl_target_option_save (&cur_target, &global_options);
14787
14788 /* If fndecl already has some target attributes applied to it, unpack
14789 them so that we add this attribute on top of them, rather than
14790 overwriting them. */
14791 if (existing_target)
14792 {
14793 struct cl_target_option *existing_options
14794 = TREE_TARGET_OPTION (existing_target);
14795
14796 if (existing_options)
14797 cl_target_option_restore (&global_options, existing_options);
14798 }
14799 else
14800 cl_target_option_restore (&global_options,
14801 TREE_TARGET_OPTION (target_option_current_node));
14802
14803 ret = aarch64_process_target_attr (args);
14804
14805 /* Set up any additional state. */
14806 if (ret)
14807 {
14808 aarch64_override_options_internal (&global_options);
14809 /* Initialize SIMD builtins if we haven't already.
14810 Set current_target_pragma to NULL for the duration so that
14811 the builtin initialization code doesn't try to tag the functions
14812 being built with the attributes specified by any current pragma, thus
14813 going into an infinite recursion. */
14814 if (TARGET_SIMD)
14815 {
14816 tree saved_current_target_pragma = current_target_pragma;
14817 current_target_pragma = NULL;
14818 aarch64_init_simd_builtins ();
14819 current_target_pragma = saved_current_target_pragma;
14820 }
14821 new_target = build_target_option_node (&global_options);
14822 }
14823 else
14824 new_target = NULL;
14825
14826 new_optimize = build_optimization_node (&global_options);
14827
14828 if (fndecl && ret)
14829 {
14830 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14831
14832 if (old_optimize != new_optimize)
14833 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14834 }
14835
14836 cl_target_option_restore (&global_options, &cur_target);
14837
14838 if (old_optimize != new_optimize)
14839 cl_optimization_restore (&global_options,
14840 TREE_OPTIMIZATION (old_optimize));
14841 return ret;
14842 }
14843
14844 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14845 tri-bool options (yes, no, don't care) and the default value is
14846 DEF, determine whether to reject inlining. */
14847
14848 static bool
14849 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14850 int dont_care, int def)
14851 {
14852 /* If the callee doesn't care, always allow inlining. */
14853 if (callee == dont_care)
14854 return true;
14855
14856 /* If the caller doesn't care, always allow inlining. */
14857 if (caller == dont_care)
14858 return true;
14859
14860 /* Otherwise, allow inlining if either the callee and caller values
14861 agree, or if the callee is using the default value. */
14862 return (callee == caller || callee == def);
14863 }
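/* Worked example of the tri-bool rule above (illustrative): with
   "don't care" encoded as 2, a caller built with the A53 835769 fix
   enabled (1) may inline a callee that left the option at 2, but not a
   callee that explicitly disabled it (0) unless 0 also happens to be the
   configured default DEF.  */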
14864
14865 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14866 to inline CALLEE into CALLER based on target-specific info.
14867 Make sure that the caller and callee have compatible architectural
14868 features. Then go through the other possible target attributes
14869 and see if they can block inlining. Try not to reject always_inline
14870 callees unless they are incompatible architecturally. */
14871
14872 static bool
14873 aarch64_can_inline_p (tree caller, tree callee)
14874 {
14875 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14876 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14877
14878 struct cl_target_option *caller_opts
14879 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14880 : target_option_default_node);
14881
14882 struct cl_target_option *callee_opts
14883 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14884 : target_option_default_node);
14885
14886 /* Callee's ISA flags should be a subset of the caller's. */
14887 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14888 != callee_opts->x_aarch64_isa_flags)
14889 return false;
14890
14891 /* Allow non-strict-aligned functions to be inlined into strict-aligned
14892 ones. */
14893 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14894 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14895 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14896 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14897 return false;
14898
14899 bool always_inline = lookup_attribute ("always_inline",
14900 DECL_ATTRIBUTES (callee));
14901
14902 /* If the architectural features match up and the callee is always_inline
14903 then the other attributes don't matter. */
14904 if (always_inline)
14905 return true;
14906
14907 if (caller_opts->x_aarch64_cmodel_var
14908 != callee_opts->x_aarch64_cmodel_var)
14909 return false;
14910
14911 if (caller_opts->x_aarch64_tls_dialect
14912 != callee_opts->x_aarch64_tls_dialect)
14913 return false;
14914
14915 /* Honour explicit requests to work around errata. */
14916 if (!aarch64_tribools_ok_for_inlining_p (
14917 caller_opts->x_aarch64_fix_a53_err835769,
14918 callee_opts->x_aarch64_fix_a53_err835769,
14919 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14920 return false;
14921
14922 if (!aarch64_tribools_ok_for_inlining_p (
14923 caller_opts->x_aarch64_fix_a53_err843419,
14924 callee_opts->x_aarch64_fix_a53_err843419,
14925 2, TARGET_FIX_ERR_A53_843419))
14926 return false;
14927
14928 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14929 caller and callee and they don't match up, reject inlining. */
14930 if (!aarch64_tribools_ok_for_inlining_p (
14931 caller_opts->x_flag_omit_leaf_frame_pointer,
14932 callee_opts->x_flag_omit_leaf_frame_pointer,
14933 2, 1))
14934 return false;
14935
14936 /* If the callee has specific tuning overrides, respect them. */
14937 if (callee_opts->x_aarch64_override_tune_string != NULL
14938 && caller_opts->x_aarch64_override_tune_string == NULL)
14939 return false;
14940
14941 /* If the user specified tuning override strings for the
14942 caller and callee and they don't match up, reject inlining.
14943 We just do a string compare here, we don't analyze the meaning
14944 of the string, as it would be too costly for little gain. */
14945 if (callee_opts->x_aarch64_override_tune_string
14946 && caller_opts->x_aarch64_override_tune_string
14947 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14948 caller_opts->x_aarch64_override_tune_string) != 0))
14949 return false;
14950
14951 return true;
14952 }
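/* Example of the ISA-subset check above (illustrative): a callee declared
   with __attribute__ ((target ("+sve"))) has the SVE feature bit set in its
   ISA flags, so it is not inlined into a caller compiled without SVE; the
   reverse direction (SVE caller, non-SVE callee) is fine as far as this
   check is concerned.  */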
14953
14954 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
14955 been already. */
14956
14957 unsigned int
14958 aarch64_tlsdesc_abi_id ()
14959 {
14960 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14961 if (!tlsdesc_abi.initialized_p ())
14962 {
14963 HARD_REG_SET full_reg_clobbers;
14964 CLEAR_HARD_REG_SET (full_reg_clobbers);
14965 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14966 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14967 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14968 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14969 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14970 }
14971 return tlsdesc_abi.id ();
14972 }
14973
14974 /* Return true if SYMBOL_REF X binds locally. */
14975
14976 static bool
14977 aarch64_symbol_binds_local_p (const_rtx x)
14978 {
14979 return (SYMBOL_REF_DECL (x)
14980 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14981 : SYMBOL_REF_LOCAL_P (x));
14982 }
14983
14984 /* Return true if SYMBOL_REF X is thread-local. */
14985 static bool
14986 aarch64_tls_symbol_p (rtx x)
14987 {
14988 if (! TARGET_HAVE_TLS)
14989 return false;
14990
14991 if (GET_CODE (x) != SYMBOL_REF)
14992 return false;
14993
14994 return SYMBOL_REF_TLS_MODEL (x) != 0;
14995 }
14996
14997 /* Classify a TLS symbol into one of the TLS kinds. */
14998 enum aarch64_symbol_type
14999 aarch64_classify_tls_symbol (rtx x)
15000 {
15001 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15002
15003 switch (tls_kind)
15004 {
15005 case TLS_MODEL_GLOBAL_DYNAMIC:
15006 case TLS_MODEL_LOCAL_DYNAMIC:
15007 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15008
15009 case TLS_MODEL_INITIAL_EXEC:
15010 switch (aarch64_cmodel)
15011 {
15012 case AARCH64_CMODEL_TINY:
15013 case AARCH64_CMODEL_TINY_PIC:
15014 return SYMBOL_TINY_TLSIE;
15015 default:
15016 return SYMBOL_SMALL_TLSIE;
15017 }
15018
15019 case TLS_MODEL_LOCAL_EXEC:
15020 if (aarch64_tls_size == 12)
15021 return SYMBOL_TLSLE12;
15022 else if (aarch64_tls_size == 24)
15023 return SYMBOL_TLSLE24;
15024 else if (aarch64_tls_size == 32)
15025 return SYMBOL_TLSLE32;
15026 else if (aarch64_tls_size == 48)
15027 return SYMBOL_TLSLE48;
15028 else
15029 gcc_unreachable ();
15030
15031 case TLS_MODEL_EMULATED:
15032 case TLS_MODEL_NONE:
15033 return SYMBOL_FORCE_TO_MEM;
15034
15035 default:
15036 gcc_unreachable ();
15037 }
15038 }
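/* For instance (a restatement of the switch above): a local-exec TLS symbol
   compiled with -mtls-size=24 is classified as SYMBOL_TLSLE24, while an
   initial-exec symbol under the tiny code model becomes
   SYMBOL_TINY_TLSIE.  */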
15039
15040 /* Return the correct method for accessing X + OFFSET, where X is either
15041 a SYMBOL_REF or LABEL_REF. */
15042
15043 enum aarch64_symbol_type
15044 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15045 {
15046 if (GET_CODE (x) == LABEL_REF)
15047 {
15048 switch (aarch64_cmodel)
15049 {
15050 case AARCH64_CMODEL_LARGE:
15051 return SYMBOL_FORCE_TO_MEM;
15052
15053 case AARCH64_CMODEL_TINY_PIC:
15054 case AARCH64_CMODEL_TINY:
15055 return SYMBOL_TINY_ABSOLUTE;
15056
15057 case AARCH64_CMODEL_SMALL_SPIC:
15058 case AARCH64_CMODEL_SMALL_PIC:
15059 case AARCH64_CMODEL_SMALL:
15060 return SYMBOL_SMALL_ABSOLUTE;
15061
15062 default:
15063 gcc_unreachable ();
15064 }
15065 }
15066
15067 if (GET_CODE (x) == SYMBOL_REF)
15068 {
15069 if (aarch64_tls_symbol_p (x))
15070 return aarch64_classify_tls_symbol (x);
15071
15072 switch (aarch64_cmodel)
15073 {
15074 case AARCH64_CMODEL_TINY:
15075 /* When we retrieve symbol + offset address, we have to make sure
15076 the offset does not cause overflow of the final address. But
15077 we have no way of knowing the address of symbol at compile time
15078 so we can't accurately say if the distance between the PC and
15079 symbol + offset is outside the addressable range of +/-1MB in the
15080 TINY code model. So we limit the maximum offset to +/-64KB and
15081 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15082 If offset_within_block_p is true we allow larger offsets.
15083 Furthermore force to memory if the symbol is a weak reference to
15084 something that doesn't resolve to a symbol in this module. */
15085
15086 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15087 return SYMBOL_FORCE_TO_MEM;
15088 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15089 || offset_within_block_p (x, offset)))
15090 return SYMBOL_FORCE_TO_MEM;
15091
15092 return SYMBOL_TINY_ABSOLUTE;
15093
15094 case AARCH64_CMODEL_SMALL:
15095 /* Same reasoning as the tiny code model, but the offset cap here is
15096 1MB, allowing +/-3.9GB for the offset to the symbol. */
15097
15098 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15099 return SYMBOL_FORCE_TO_MEM;
15100 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15101 || offset_within_block_p (x, offset)))
15102 return SYMBOL_FORCE_TO_MEM;
15103
15104 return SYMBOL_SMALL_ABSOLUTE;
15105
15106 case AARCH64_CMODEL_TINY_PIC:
15107 if (!aarch64_symbol_binds_local_p (x))
15108 return SYMBOL_TINY_GOT;
15109 return SYMBOL_TINY_ABSOLUTE;
15110
15111 case AARCH64_CMODEL_SMALL_SPIC:
15112 case AARCH64_CMODEL_SMALL_PIC:
15113 if (!aarch64_symbol_binds_local_p (x))
15114 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15115 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15116 return SYMBOL_SMALL_ABSOLUTE;
15117
15118 case AARCH64_CMODEL_LARGE:
15119 /* This is alright even in PIC code as the constant
15120 pool reference is always PC relative and within
15121 the same translation unit. */
15122 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15123 return SYMBOL_SMALL_ABSOLUTE;
15124 else
15125 return SYMBOL_FORCE_TO_MEM;
15126
15127 default:
15128 gcc_unreachable ();
15129 }
15130 }
15131
15132 /* By default push everything into the constant pool. */
15133 return SYMBOL_FORCE_TO_MEM;
15134 }
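/* A worked example of the tiny-model offset cap above (illustrative):
   accessing sym + 0x8000 stays within the +/-64KB window and is classified
   as SYMBOL_TINY_ABSOLUTE, whereas sym + 0x20000 falls outside it and,
   unless offset_within_block_p says the offset stays inside sym's own
   block, is forced to memory via SYMBOL_FORCE_TO_MEM.  */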
15135
15136 bool
15137 aarch64_constant_address_p (rtx x)
15138 {
15139 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15140 }
15141
15142 bool
15143 aarch64_legitimate_pic_operand_p (rtx x)
15144 {
15145 if (GET_CODE (x) == SYMBOL_REF
15146 || (GET_CODE (x) == CONST
15147 && GET_CODE (XEXP (x, 0)) == PLUS
15148 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15149 return false;
15150
15151 return true;
15152 }
15153
15154 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15155 that should be rematerialized rather than spilled. */
15156
15157 static bool
15158 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15159 {
15160 /* Support CSE and rematerialization of common constants. */
15161 if (CONST_INT_P (x)
15162 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15163 || GET_CODE (x) == CONST_VECTOR)
15164 return true;
15165
15166 /* Do not allow vector struct mode constants for Advanced SIMD.
15167 We could support 0 and -1 easily, but they need support in
15168 aarch64-simd.md. */
15169 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15170 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15171 return false;
15172
15173 /* Only accept variable-length vector constants if they can be
15174 handled directly.
15175
15176 ??? It would be possible to handle rematerialization of other
15177 constants via secondary reloads. */
15178 if (vec_flags & VEC_ANY_SVE)
15179 return aarch64_simd_valid_immediate (x, NULL);
15180
15181 if (GET_CODE (x) == HIGH)
15182 x = XEXP (x, 0);
15183
15184 /* Accept polynomial constants that can be calculated by using the
15185 destination of a move as the sole temporary. Constants that
15186 require a second temporary cannot be rematerialized (they can't be
15187 forced to memory and also aren't legitimate constants). */
15188 poly_int64 offset;
15189 if (poly_int_rtx_p (x, &offset))
15190 return aarch64_offset_temporaries (false, offset) <= 1;
15191
15192 /* If an offset is being added to something else, we need to allow the
15193 base to be moved into the destination register, meaning that there
15194 are no free temporaries for the offset. */
15195 x = strip_offset (x, &offset);
15196 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15197 return false;
15198
15199 /* Do not allow const (plus (anchor_symbol, const_int)). */
15200 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15201 return false;
15202
15203 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15204 so spilling them is better than rematerialization. */
15205 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15206 return true;
15207
15208 /* Label references are always constant. */
15209 if (GET_CODE (x) == LABEL_REF)
15210 return true;
15211
15212 return false;
15213 }
15214
15215 rtx
15216 aarch64_load_tp (rtx target)
15217 {
15218 if (!target
15219 || GET_MODE (target) != Pmode
15220 || !register_operand (target, Pmode))
15221 target = gen_reg_rtx (Pmode);
15222
15223 /* Can return in any reg. */
15224 emit_insn (gen_aarch64_load_tp_hard (target));
15225 return target;
15226 }
15227
15228 /* On AAPCS systems, this is the "struct __va_list". */
15229 static GTY(()) tree va_list_type;
15230
15231 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15232 Return the type to use as __builtin_va_list.
15233
15234 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15235
15236 struct __va_list
15237 {
15238 void *__stack;
15239 void *__gr_top;
15240 void *__vr_top;
15241 int __gr_offs;
15242 int __vr_offs;
15243 }; */
15244
15245 static tree
15246 aarch64_build_builtin_va_list (void)
15247 {
15248 tree va_list_name;
15249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15250
15251 /* Create the type. */
15252 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15253 /* Give it the required name. */
15254 va_list_name = build_decl (BUILTINS_LOCATION,
15255 TYPE_DECL,
15256 get_identifier ("__va_list"),
15257 va_list_type);
15258 DECL_ARTIFICIAL (va_list_name) = 1;
15259 TYPE_NAME (va_list_type) = va_list_name;
15260 TYPE_STUB_DECL (va_list_type) = va_list_name;
15261
15262 /* Create the fields. */
15263 f_stack = build_decl (BUILTINS_LOCATION,
15264 FIELD_DECL, get_identifier ("__stack"),
15265 ptr_type_node);
15266 f_grtop = build_decl (BUILTINS_LOCATION,
15267 FIELD_DECL, get_identifier ("__gr_top"),
15268 ptr_type_node);
15269 f_vrtop = build_decl (BUILTINS_LOCATION,
15270 FIELD_DECL, get_identifier ("__vr_top"),
15271 ptr_type_node);
15272 f_groff = build_decl (BUILTINS_LOCATION,
15273 FIELD_DECL, get_identifier ("__gr_offs"),
15274 integer_type_node);
15275 f_vroff = build_decl (BUILTINS_LOCATION,
15276 FIELD_DECL, get_identifier ("__vr_offs"),
15277 integer_type_node);
15278
15279 /* Tell tree-stdarg pass about our internal offset fields.
15280 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15281 purposes, to identify whether the code is updating va_list internal
15282 offset fields in an irregular way. */
15283 va_list_gpr_counter_field = f_groff;
15284 va_list_fpr_counter_field = f_vroff;
15285
15286 DECL_ARTIFICIAL (f_stack) = 1;
15287 DECL_ARTIFICIAL (f_grtop) = 1;
15288 DECL_ARTIFICIAL (f_vrtop) = 1;
15289 DECL_ARTIFICIAL (f_groff) = 1;
15290 DECL_ARTIFICIAL (f_vroff) = 1;
15291
15292 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15293 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15294 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15295 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15296 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15297
15298 TYPE_FIELDS (va_list_type) = f_stack;
15299 DECL_CHAIN (f_stack) = f_grtop;
15300 DECL_CHAIN (f_grtop) = f_vrtop;
15301 DECL_CHAIN (f_vrtop) = f_groff;
15302 DECL_CHAIN (f_groff) = f_vroff;
15303
15304 /* Compute its layout. */
15305 layout_type (va_list_type);
15306
15307 return va_list_type;
15308 }
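/* Size sketch (an assumption for the LP64 ABI, not derived from this file
   alone): the record built above has three pointers followed by two ints,
   so sizeof (__builtin_va_list) is 3 * 8 + 2 * 4 = 32 bytes with 8-byte
   alignment under -mabi=lp64.  */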
15309
15310 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15311 static void
15312 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15313 {
15314 const CUMULATIVE_ARGS *cum;
15315 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15316 tree stack, grtop, vrtop, groff, vroff;
15317 tree t;
15318 int gr_save_area_size = cfun->va_list_gpr_size;
15319 int vr_save_area_size = cfun->va_list_fpr_size;
15320 int vr_offset;
15321
15322 cum = &crtl->args.info;
15323 if (cfun->va_list_gpr_size)
15324 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15325 cfun->va_list_gpr_size);
15326 if (cfun->va_list_fpr_size)
15327 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15328 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15329
15330 if (!TARGET_FLOAT)
15331 {
15332 gcc_assert (cum->aapcs_nvrn == 0);
15333 vr_save_area_size = 0;
15334 }
15335
15336 f_stack = TYPE_FIELDS (va_list_type_node);
15337 f_grtop = DECL_CHAIN (f_stack);
15338 f_vrtop = DECL_CHAIN (f_grtop);
15339 f_groff = DECL_CHAIN (f_vrtop);
15340 f_vroff = DECL_CHAIN (f_groff);
15341
15342 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15343 NULL_TREE);
15344 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15345 NULL_TREE);
15346 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15347 NULL_TREE);
15348 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15349 NULL_TREE);
15350 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15351 NULL_TREE);
15352
15353 /* Emit code to initialize STACK, which points to the next varargs stack
15354 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15355 by named arguments. STACK is 8-byte aligned. */
15356 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15357 if (cum->aapcs_stack_size > 0)
15358 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15359 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15360 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15361
15362 /* Emit code to initialize GRTOP, the top of the GR save area.
15363 virtual_incoming_args_rtx should have been 16 byte aligned. */
15364 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15365 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15366 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15367
15368 /* Emit code to initialize VRTOP, the top of the VR save area.
15369 This address is gr_save_area_bytes below GRTOP, rounded
15370 down to the next 16-byte boundary. */
15371 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15372 vr_offset = ROUND_UP (gr_save_area_size,
15373 STACK_BOUNDARY / BITS_PER_UNIT);
15374
15375 if (vr_offset)
15376 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15377 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15378 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15379
15380 /* Emit code to initialize GROFF, the offset from GRTOP of the
15381 next GPR argument. */
15382 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15383 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15384 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15385
15386 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15387 of the next VR argument. */
15388 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15389 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15390 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15391 }
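/* Rough picture of the layout set up above, relative to
   virtual_incoming_args_rtx (VIA), listed from higher to lower addresses:

     VIA + aapcs_stack_size * 8           <- __stack (next stack vararg)
     VIA                                  <- __gr_top
     VIA - ROUND_UP (gr_save_area_size, 16)
                                          <- __vr_top

   with __gr_offs = -gr_save_area_size and __vr_offs = -vr_save_area_size,
   both negative and counting up towards zero as register arguments are
   consumed.  */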
15392
15393 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15394
15395 static tree
15396 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15397 gimple_seq *post_p ATTRIBUTE_UNUSED)
15398 {
15399 tree addr;
15400 bool indirect_p;
15401 bool is_ha; /* is HFA or HVA. */
15402 bool dw_align; /* double-word align. */
15403 machine_mode ag_mode = VOIDmode;
15404 int nregs;
15405 machine_mode mode;
15406
15407 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15408 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15409 HOST_WIDE_INT size, rsize, adjust, align;
15410 tree t, u, cond1, cond2;
15411
15412 indirect_p = pass_va_arg_by_reference (type);
15413 if (indirect_p)
15414 type = build_pointer_type (type);
15415
15416 mode = TYPE_MODE (type);
15417
15418 f_stack = TYPE_FIELDS (va_list_type_node);
15419 f_grtop = DECL_CHAIN (f_stack);
15420 f_vrtop = DECL_CHAIN (f_grtop);
15421 f_groff = DECL_CHAIN (f_vrtop);
15422 f_vroff = DECL_CHAIN (f_groff);
15423
15424 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15425 f_stack, NULL_TREE);
15426 size = int_size_in_bytes (type);
15427
15428 bool abi_break;
15429 align
15430 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15431
15432 dw_align = false;
15433 adjust = 0;
15434 if (aarch64_vfp_is_call_or_return_candidate (mode,
15435 type,
15436 &ag_mode,
15437 &nregs,
15438 &is_ha))
15439 {
15440 /* No frontends can create types with variable-sized modes, so we
15441 shouldn't be asked to pass or return them. */
15442 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15443
15444 /* TYPE passed in fp/simd registers. */
15445 if (!TARGET_FLOAT)
15446 aarch64_err_no_fpadvsimd (mode);
15447
15448 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15449 unshare_expr (valist), f_vrtop, NULL_TREE);
15450 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15451 unshare_expr (valist), f_vroff, NULL_TREE);
15452
15453 rsize = nregs * UNITS_PER_VREG;
15454
15455 if (is_ha)
15456 {
15457 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15458 adjust = UNITS_PER_VREG - ag_size;
15459 }
15460 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15461 && size < UNITS_PER_VREG)
15462 {
15463 adjust = UNITS_PER_VREG - size;
15464 }
15465 }
15466 else
15467 {
15468 /* TYPE passed in general registers. */
15469 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15470 unshare_expr (valist), f_grtop, NULL_TREE);
15471 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15472 unshare_expr (valist), f_groff, NULL_TREE);
15473 rsize = ROUND_UP (size, UNITS_PER_WORD);
15474 nregs = rsize / UNITS_PER_WORD;
15475
15476 if (align > 8)
15477 {
15478 if (abi_break && warn_psabi)
15479 inform (input_location, "parameter passing for argument of type "
15480 "%qT changed in GCC 9.1", type);
15481 dw_align = true;
15482 }
15483
15484 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15485 && size < UNITS_PER_WORD)
15486 {
15487 adjust = UNITS_PER_WORD - size;
15488 }
15489 }
15490
15491 /* Get a local temporary for the field value. */
15492 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15493
15494 /* Emit code to branch if off >= 0. */
15495 t = build2 (GE_EXPR, boolean_type_node, off,
15496 build_int_cst (TREE_TYPE (off), 0));
15497 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15498
15499 if (dw_align)
15500 {
15501 /* Emit: offs = (offs + 15) & -16. */
15502 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15503 build_int_cst (TREE_TYPE (off), 15));
15504 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15505 build_int_cst (TREE_TYPE (off), -16));
15506 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15507 }
15508 else
15509 roundup = NULL;
15510
15511 /* Update ap.__[g|v]r_offs */
15512 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15513 build_int_cst (TREE_TYPE (off), rsize));
15514 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15515
15516 /* String up. */
15517 if (roundup)
15518 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15519
15520 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15521 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15522 build_int_cst (TREE_TYPE (f_off), 0));
15523 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15524
15525 /* String up: make sure the assignment happens before the use. */
15526 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15527 COND_EXPR_ELSE (cond1) = t;
15528
15529 /* Prepare the trees handling the argument that is passed on the stack;
15530 the top level node will store in ON_STACK. */
15531 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15532 if (align > 8)
15533 {
15534 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15535 t = fold_build_pointer_plus_hwi (arg, 15);
15536 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15537 build_int_cst (TREE_TYPE (t), -16));
15538 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15539 }
15540 else
15541 roundup = NULL;
15542 /* Advance ap.__stack */
15543 t = fold_build_pointer_plus_hwi (arg, size + 7);
15544 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15545 build_int_cst (TREE_TYPE (t), -8));
15546 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15547 /* String up roundup and advance. */
15548 if (roundup)
15549 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15550 /* String up with arg */
15551 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15552 /* Big-endianness related address adjustment. */
15553 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15554 && size < UNITS_PER_WORD)
15555 {
15556 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15557 size_int (UNITS_PER_WORD - size));
15558 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15559 }
15560
15561 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15562 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15563
15564 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15565 t = off;
15566 if (adjust)
15567 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15568 build_int_cst (TREE_TYPE (off), adjust));
15569
15570 t = fold_convert (sizetype, t);
15571 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15572
15573 if (is_ha)
15574 {
15575 /* type ha; // treat as "struct {ftype field[n];}"
15576 ... [computing offs]
15577 for (i = 0; i < nregs; ++i, offs += 16)
15578 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15579 return ha; */
15580 int i;
15581 tree tmp_ha, field_t, field_ptr_t;
15582
15583 /* Declare a local variable. */
15584 tmp_ha = create_tmp_var_raw (type, "ha");
15585 gimple_add_tmp_var (tmp_ha);
15586
15587 /* Establish the base type. */
15588 switch (ag_mode)
15589 {
15590 case E_SFmode:
15591 field_t = float_type_node;
15592 field_ptr_t = float_ptr_type_node;
15593 break;
15594 case E_DFmode:
15595 field_t = double_type_node;
15596 field_ptr_t = double_ptr_type_node;
15597 break;
15598 case E_TFmode:
15599 field_t = long_double_type_node;
15600 field_ptr_t = long_double_ptr_type_node;
15601 break;
15602 case E_HFmode:
15603 field_t = aarch64_fp16_type_node;
15604 field_ptr_t = aarch64_fp16_ptr_type_node;
15605 break;
15606 case E_V2SImode:
15607 case E_V4SImode:
15608 {
15609 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15610 field_t = build_vector_type_for_mode (innertype, ag_mode);
15611 field_ptr_t = build_pointer_type (field_t);
15612 }
15613 break;
15614 default:
15615 gcc_assert (0);
15616 }
15617
15618 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15619 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15620 addr = t;
15621 t = fold_convert (field_ptr_t, addr);
15622 t = build2 (MODIFY_EXPR, field_t,
15623 build1 (INDIRECT_REF, field_t, tmp_ha),
15624 build1 (INDIRECT_REF, field_t, t));
15625
15626 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15627 for (i = 1; i < nregs; ++i)
15628 {
15629 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15630 u = fold_convert (field_ptr_t, addr);
15631 u = build2 (MODIFY_EXPR, field_t,
15632 build2 (MEM_REF, field_t, tmp_ha,
15633 build_int_cst (field_ptr_t,
15634 (i *
15635 int_size_in_bytes (field_t)))),
15636 build1 (INDIRECT_REF, field_t, u));
15637 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15638 }
15639
15640 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15641 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15642 }
15643
15644 COND_EXPR_ELSE (cond2) = t;
15645 addr = fold_convert (build_pointer_type (type), cond1);
15646 addr = build_va_arg_indirect_ref (addr);
15647
15648 if (indirect_p)
15649 addr = build_va_arg_indirect_ref (addr);
15650
15651 return addr;
15652 }
15653
15654 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15655
15656 static void
15657 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15658 const function_arg_info &arg,
15659 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15660 {
15661 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15662 CUMULATIVE_ARGS local_cum;
15663 int gr_saved = cfun->va_list_gpr_size;
15664 int vr_saved = cfun->va_list_fpr_size;
15665
15666 /* The caller has advanced CUM up to, but not beyond, the last named
15667 argument. Advance a local copy of CUM past the last "real" named
15668 argument, to find out how many registers are left over. */
15669 local_cum = *cum;
15670 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
15671
15672 /* Find out how many registers we need to save.
15673 Honor the results of the tree-stdarg analysis. */
15674 if (cfun->va_list_gpr_size)
15675 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15676 cfun->va_list_gpr_size / UNITS_PER_WORD);
15677 if (cfun->va_list_fpr_size)
15678 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15679 cfun->va_list_fpr_size / UNITS_PER_VREG);
15680
15681 if (!TARGET_FLOAT)
15682 {
15683 gcc_assert (local_cum.aapcs_nvrn == 0);
15684 vr_saved = 0;
15685 }
15686
15687 if (!no_rtl)
15688 {
15689 if (gr_saved > 0)
15690 {
15691 rtx ptr, mem;
15692
15693 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15694 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15695 - gr_saved * UNITS_PER_WORD);
15696 mem = gen_frame_mem (BLKmode, ptr);
15697 set_mem_alias_set (mem, get_varargs_alias_set ());
15698
15699 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15700 mem, gr_saved);
15701 }
15702 if (vr_saved > 0)
15703 {
15704 /* We can't use move_block_from_reg, because it will use
15705 the wrong mode, storing D regs only. */
15706 machine_mode mode = TImode;
15707 int off, i, vr_start;
15708
15709 /* Set OFF to the offset from virtual_incoming_args_rtx of
15710 the first vector register. The VR save area lies below
15711 the GR one, and is aligned to 16 bytes. */
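/* For example (an illustrative layout, not emitted code): with
gr_saved == 3 and vr_saved == 2, offsets in bytes relative to
virtual_incoming_args_rtx are:

[-64, -32)  Q-register save area (2 * UNITS_PER_VREG)
[-32, -24)  padding from rounding the GR area up to 16 bytes
[-24,   0)  X-register save area (3 * UNITS_PER_WORD)
[  0, ...)  arguments passed on the stack  */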
15712 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15713 STACK_BOUNDARY / BITS_PER_UNIT);
15714 off -= vr_saved * UNITS_PER_VREG;
15715
15716 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15717 for (i = 0; i < vr_saved; ++i)
15718 {
15719 rtx ptr, mem;
15720
15721 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15722 mem = gen_frame_mem (mode, ptr);
15723 set_mem_alias_set (mem, get_varargs_alias_set ());
15724 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15725 off += UNITS_PER_VREG;
15726 }
15727 }
15728 }
15729
15730 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15731 any complication of having crtl->args.pretend_args_size changed. */
15732 cfun->machine->frame.saved_varargs_size
15733 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15734 STACK_BOUNDARY / BITS_PER_UNIT)
15735 + vr_saved * UNITS_PER_VREG);
15736 }
15737
15738 static void
15739 aarch64_conditional_register_usage (void)
15740 {
15741 int i;
15742 if (!TARGET_FLOAT)
15743 {
15744 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15745 {
15746 fixed_regs[i] = 1;
15747 call_used_regs[i] = 1;
15748 }
15749 }
15750 if (!TARGET_SVE)
15751 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15752 {
15753 fixed_regs[i] = 1;
15754 call_used_regs[i] = 1;
15755 }
15756
15757 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15758 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15759 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15760
15761 /* When tracking speculation, we need a couple of call-clobbered registers
15762 to track the speculation state. It would be nice to just use
15763 IP0 and IP1, but currently there are numerous places that just
15764 assume these registers are free for other uses (e.g. pointer
15765 authentication). */
15766 if (aarch64_track_speculation)
15767 {
15768 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15769 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15770 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15771 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15772 }
15773 }
15774
15775 /* Walk down the type tree of TYPE counting consecutive base elements.
15776 If *MODEP is VOIDmode, then set it to the first valid floating point
15777 type. If a non-floating point type is found, or if a floating point
15778 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15779 otherwise return the count in the sub-tree. */
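/* For example (using hypothetical type names), struct { double x, y; }
yields a count of 2 with *MODEP set to DFmode, whereas
struct { double d; float f; } returns -1 because the SFmode member
does not match the DFmode already recorded in *MODEP.  */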
15780 static int
15781 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15782 {
15783 machine_mode mode;
15784 HOST_WIDE_INT size;
15785
15786 /* SVE types (and types containing SVE types) must be handled
15787 before calling this function. */
15788 gcc_assert (!aarch64_sve::builtin_type_p (type));
15789
15790 switch (TREE_CODE (type))
15791 {
15792 case REAL_TYPE:
15793 mode = TYPE_MODE (type);
15794 if (mode != DFmode && mode != SFmode
15795 && mode != TFmode && mode != HFmode)
15796 return -1;
15797
15798 if (*modep == VOIDmode)
15799 *modep = mode;
15800
15801 if (*modep == mode)
15802 return 1;
15803
15804 break;
15805
15806 case COMPLEX_TYPE:
15807 mode = TYPE_MODE (TREE_TYPE (type));
15808 if (mode != DFmode && mode != SFmode
15809 && mode != TFmode && mode != HFmode)
15810 return -1;
15811
15812 if (*modep == VOIDmode)
15813 *modep = mode;
15814
15815 if (*modep == mode)
15816 return 2;
15817
15818 break;
15819
15820 case VECTOR_TYPE:
15821 /* Use V2SImode and V4SImode as representatives of all 64-bit
15822 and 128-bit vector types. */
15823 size = int_size_in_bytes (type);
15824 switch (size)
15825 {
15826 case 8:
15827 mode = V2SImode;
15828 break;
15829 case 16:
15830 mode = V4SImode;
15831 break;
15832 default:
15833 return -1;
15834 }
15835
15836 if (*modep == VOIDmode)
15837 *modep = mode;
15838
15839 /* Vector modes are considered to be opaque: two vectors are
15840 equivalent for the purposes of being homogeneous aggregates
15841 if they are the same size. */
15842 if (*modep == mode)
15843 return 1;
15844
15845 break;
15846
15847 case ARRAY_TYPE:
15848 {
15849 int count;
15850 tree index = TYPE_DOMAIN (type);
15851
15852 /* Can't handle incomplete types nor sizes that are not
15853 fixed. */
15854 if (!COMPLETE_TYPE_P (type)
15855 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15856 return -1;
15857
15858 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15859 if (count == -1
15860 || !index
15861 || !TYPE_MAX_VALUE (index)
15862 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15863 || !TYPE_MIN_VALUE (index)
15864 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15865 || count < 0)
15866 return -1;
15867
15868 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15869 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15870
15871 /* There must be no padding. */
15872 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15873 count * GET_MODE_BITSIZE (*modep)))
15874 return -1;
15875
15876 return count;
15877 }
15878
15879 case RECORD_TYPE:
15880 {
15881 int count = 0;
15882 int sub_count;
15883 tree field;
15884
15885 /* Can't handle incomplete types nor sizes that are not
15886 fixed. */
15887 if (!COMPLETE_TYPE_P (type)
15888 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15889 return -1;
15890
15891 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15892 {
15893 if (TREE_CODE (field) != FIELD_DECL)
15894 continue;
15895
15896 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15897 if (sub_count < 0)
15898 return -1;
15899 count += sub_count;
15900 }
15901
15902 /* There must be no padding. */
15903 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15904 count * GET_MODE_BITSIZE (*modep)))
15905 return -1;
15906
15907 return count;
15908 }
15909
15910 case UNION_TYPE:
15911 case QUAL_UNION_TYPE:
15912 {
15913 /* These aren't very interesting except in a degenerate case. */
15914 int count = 0;
15915 int sub_count;
15916 tree field;
15917
15918 /* Can't handle incomplete types nor sizes that are not
15919 fixed. */
15920 if (!COMPLETE_TYPE_P (type)
15921 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15922 return -1;
15923
15924 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15925 {
15926 if (TREE_CODE (field) != FIELD_DECL)
15927 continue;
15928
15929 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15930 if (sub_count < 0)
15931 return -1;
15932 count = count > sub_count ? count : sub_count;
15933 }
15934
15935 /* There must be no padding. */
15936 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15937 count * GET_MODE_BITSIZE (*modep)))
15938 return -1;
15939
15940 return count;
15941 }
15942
15943 default:
15944 break;
15945 }
15946
15947 return -1;
15948 }
15949
15950 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15951 type as described in AAPCS64 \S 4.1.2.
15952
15953 See the comment above aarch64_composite_type_p for the notes on MODE. */
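/* For example, a 16-byte Advanced SIMD vector such as int32x4_t is a
short vector, whereas SVE vector types and GNU vectors wider than
16 bytes are not.  */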
15954
15955 static bool
15956 aarch64_short_vector_p (const_tree type,
15957 machine_mode mode)
15958 {
15959 poly_int64 size = -1;
15960
15961 if (type && aarch64_sve::builtin_type_p (type))
15962 return false;
15963
15964 if (type && TREE_CODE (type) == VECTOR_TYPE)
15965 size = int_size_in_bytes (type);
15966 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15967 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15968 size = GET_MODE_SIZE (mode);
15969
15970 return known_eq (size, 8) || known_eq (size, 16);
15971 }
15972
15973 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15974 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15975 array types. The C99 floating-point complex types are also considered
15976 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15977 types, which are GCC extensions and out of the scope of AAPCS64, are
15978 treated as composite types here as well.
15979
15980 Note that MODE itself is not sufficient in determining whether a type
15981 is such a composite type or not. This is because
15982 stor-layout.c:compute_record_mode may have already changed the MODE
15983 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15984 structure with only one field may have its MODE set to the mode of the
15985 field. Also an integer mode whose size matches the size of the
15986 RECORD_TYPE type may be used to substitute the original mode
15987 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15988 solely relied on. */
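/* For example, struct { int i; } may be given SImode by
compute_record_mode but is still a composite type, and _Complex double
is composite even though its mode is not BLKmode.  */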
15989
15990 static bool
15991 aarch64_composite_type_p (const_tree type,
15992 machine_mode mode)
15993 {
15994 if (aarch64_short_vector_p (type, mode))
15995 return false;
15996
15997 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15998 return true;
15999
16000 if (mode == BLKmode
16001 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16002 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16003 return true;
16004
16005 return false;
16006 }
16007
16008 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16009 shall be passed or returned in simd/fp register(s) (providing these
16010 parameter passing registers are available).
16011
16012 Upon successful return, *COUNT returns the number of needed registers,
16013 *BASE_MODE returns the mode of the individual register and when IS_HA
16014 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16015 floating-point aggregate or a homogeneous short-vector aggregate. */
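/* For example, struct { float x, y, z; } is a homogeneous floating-point
aggregate: *BASE_MODE becomes SFmode, *COUNT becomes 3 and *IS_HA is
set, so the argument can use three consecutive SIMD/FP registers.
Aggregates with more than HA_MAX_NUM_FLDS elements are rejected.  */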
16016
16017 static bool
16018 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16019 const_tree type,
16020 machine_mode *base_mode,
16021 int *count,
16022 bool *is_ha)
16023 {
16024 if (is_ha != NULL) *is_ha = false;
16025
16026 if (type && aarch64_sve::builtin_type_p (type))
16027 return false;
16028
16029 machine_mode new_mode = VOIDmode;
16030 bool composite_p = aarch64_composite_type_p (type, mode);
16031
16032 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16033 || aarch64_short_vector_p (type, mode))
16034 {
16035 *count = 1;
16036 new_mode = mode;
16037 }
16038 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16039 {
16040 if (is_ha != NULL) *is_ha = true;
16041 *count = 2;
16042 new_mode = GET_MODE_INNER (mode);
16043 }
16044 else if (type && composite_p)
16045 {
16046 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16047
16048 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16049 {
16050 if (is_ha != NULL) *is_ha = true;
16051 *count = ag_count;
16052 }
16053 else
16054 return false;
16055 }
16056 else
16057 return false;
16058
16059 *base_mode = new_mode;
16060 return true;
16061 }
16062
16063 /* Implement TARGET_STRUCT_VALUE_RTX. */
16064
16065 static rtx
16066 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16067 int incoming ATTRIBUTE_UNUSED)
16068 {
16069 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16070 }
16071
16072 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
16073 static bool
16074 aarch64_vector_mode_supported_p (machine_mode mode)
16075 {
16076 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16077 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16078 }
16079
16080 /* Return the full-width SVE vector mode for element mode MODE, if one
16081 exists. */
16082 opt_machine_mode
16083 aarch64_full_sve_mode (scalar_mode mode)
16084 {
16085 switch (mode)
16086 {
16087 case E_DFmode:
16088 return VNx2DFmode;
16089 case E_SFmode:
16090 return VNx4SFmode;
16091 case E_HFmode:
16092 return VNx8HFmode;
16093 case E_DImode:
16094 return VNx2DImode;
16095 case E_SImode:
16096 return VNx4SImode;
16097 case E_HImode:
16098 return VNx8HImode;
16099 case E_QImode:
16100 return VNx16QImode;
16101 default:
16102 return opt_machine_mode ();
16103 }
16104 }
16105
16106 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16107 if it exists. */
16108 opt_machine_mode
16109 aarch64_vq_mode (scalar_mode mode)
16110 {
16111 switch (mode)
16112 {
16113 case E_DFmode:
16114 return V2DFmode;
16115 case E_SFmode:
16116 return V4SFmode;
16117 case E_HFmode:
16118 return V8HFmode;
16119 case E_SImode:
16120 return V4SImode;
16121 case E_HImode:
16122 return V8HImode;
16123 case E_QImode:
16124 return V16QImode;
16125 case E_DImode:
16126 return V2DImode;
16127 default:
16128 return opt_machine_mode ();
16129 }
16130 }
16131
16132 /* Return appropriate SIMD container
16133 for MODE within a vector of WIDTH bits. */
16134 static machine_mode
16135 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16136 {
16137 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
16138 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16139
16140 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16141 if (TARGET_SIMD)
16142 {
16143 if (known_eq (width, 128))
16144 return aarch64_vq_mode (mode).else_mode (word_mode);
16145 else
16146 switch (mode)
16147 {
16148 case E_SFmode:
16149 return V2SFmode;
16150 case E_HFmode:
16151 return V4HFmode;
16152 case E_SImode:
16153 return V2SImode;
16154 case E_HImode:
16155 return V4HImode;
16156 case E_QImode:
16157 return V8QImode;
16158 default:
16159 break;
16160 }
16161 }
16162 return word_mode;
16163 }
16164
16165 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is
16166 enabled, otherwise the 128-bit Advanced SIMD container. */
16166 static machine_mode
16167 aarch64_preferred_simd_mode (scalar_mode mode)
16168 {
16169 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16170 return aarch64_simd_container_mode (mode, bits);
16171 }
16172
16173 /* Return a list of possible vector sizes for the vectorizer
16174 to iterate over. */
16175 static unsigned int
16176 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16177 {
16178 static const machine_mode sve_modes[] = {
16179 /* Try using full vectors for all element types. */
16180 VNx16QImode,
16181
16182 /* Try using 16-bit containers for 8-bit elements and full vectors
16183 for wider elements. */
16184 VNx8QImode,
16185
16186 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16187 full vectors for wider elements. */
16188 VNx4QImode,
16189
16190 /* Try using 64-bit containers for all element types. */
16191 VNx2QImode
16192 };
16193
16194 static const machine_mode advsimd_modes[] = {
16195 /* Try using 128-bit vectors for all element types. */
16196 V16QImode,
16197
16198 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16199 for wider elements. */
16200 V8QImode,
16201
16202 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16203 for wider elements.
16204
16205 TODO: We could support a limited form of V4QImode too, so that
16206 we use 32-bit vectors for 8-bit elements. */
16207 V4HImode,
16208
16209 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16210 for 64-bit elements.
16211
16212 TODO: We could similarly support limited forms of V2QImode and V2HImode
16213 for this case. */
16214 V2SImode
16215 };
16216
16217 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16218 This is because:
16219
16220 - If we can't use N-byte Advanced SIMD vectors then the placement
16221 doesn't matter; we'll just continue as though the Advanced SIMD
16222 entry didn't exist.
16223
16224 - If an SVE main loop with N bytes ends up being cheaper than an
16225 Advanced SIMD main loop with N bytes then by default we'll replace
16226 the Advanced SIMD version with the SVE one.
16227
16228 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16229 than an SVE main loop with N bytes then by default we'll try to
16230 use the SVE loop to vectorize the epilogue instead. */
16231 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16232 unsigned int advsimd_i = 0;
16233 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16234 {
16235 if (sve_i < ARRAY_SIZE (sve_modes)
16236 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16237 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16238 modes->safe_push (sve_modes[sve_i++]);
16239 else
16240 modes->safe_push (advsimd_modes[advsimd_i++]);
16241 }
16242 while (sve_i < ARRAY_SIZE (sve_modes))
16243 modes->safe_push (sve_modes[sve_i++]);
16244
16245 unsigned int flags = 0;
16246 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16247 can compare SVE against Advanced SIMD and so that we can compare
16248 multiple SVE vectorization approaches against each other. There's
16249 not really any point doing this for Advanced SIMD only, since the
16250 first mode that works should always be the best. */
16251 if (TARGET_SVE && aarch64_sve_compare_costs)
16252 flags |= VECT_COMPARE_COSTS;
16253 return flags;
16254 }
16255
16256 /* Implement TARGET_MANGLE_TYPE. */
16257
16258 static const char *
16259 aarch64_mangle_type (const_tree type)
16260 {
16261 /* The AArch64 ABI documents say that "__va_list" has to be
16262 mangled as if it is in the "std" namespace. */
16263 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16264 return "St9__va_list";
16265
16266 /* Half-precision float. */
16267 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16268 return "Dh";
16269
16270 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16271 builtin types. */
16272 if (TYPE_NAME (type) != NULL)
16273 {
16274 const char *res;
16275 if ((res = aarch64_general_mangle_builtin_type (type))
16276 || (res = aarch64_sve::mangle_builtin_type (type)))
16277 return res;
16278 }
16279
16280 /* Use the default mangling. */
16281 return NULL;
16282 }
16283
16284 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16285
16286 static bool
16287 aarch64_verify_type_context (location_t loc, type_context_kind context,
16288 const_tree type, bool silent_p)
16289 {
16290 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16291 }
16292
16293 /* Find the first rtx_insn before insn that will generate an assembly
16294 instruction. */
16295
16296 static rtx_insn *
16297 aarch64_prev_real_insn (rtx_insn *insn)
16298 {
16299 if (!insn)
16300 return NULL;
16301
16302 do
16303 {
16304 insn = prev_real_insn (insn);
16305 }
16306 while (insn && recog_memoized (insn) < 0);
16307
16308 return insn;
16309 }
16310
16311 static bool
16312 is_madd_op (enum attr_type t1)
16313 {
16314 unsigned int i;
16315 /* A number of these may be AArch32 only. */
16316 enum attr_type mlatypes[] = {
16317 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16318 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16319 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16320 };
16321
16322 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16323 {
16324 if (t1 == mlatypes[i])
16325 return true;
16326 }
16327
16328 return false;
16329 }
16330
16331 /* Check if there is a register dependency between a load and the insn
16332 for which we hold recog_data. */
16333
16334 static bool
16335 dep_between_memop_and_curr (rtx memop)
16336 {
16337 rtx load_reg;
16338 int opno;
16339
16340 gcc_assert (GET_CODE (memop) == SET);
16341
16342 if (!REG_P (SET_DEST (memop)))
16343 return false;
16344
16345 load_reg = SET_DEST (memop);
16346 for (opno = 1; opno < recog_data.n_operands; opno++)
16347 {
16348 rtx operand = recog_data.operand[opno];
16349 if (REG_P (operand)
16350 && reg_overlap_mentioned_p (load_reg, operand))
16351 return true;
16352
16353 }
16354 return false;
16355 }
16356
16357
16358 /* When working around the Cortex-A53 erratum 835769,
16359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16360 instruction and has a preceding memory instruction such that a NOP
16361 should be inserted between them. */
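/* For example (an illustrative sketch of the emitted assembly), with
-mfix-cortex-a53-835769 the sequence

ldr  x1, [x2]
madd x0, x3, x4, x5

is output with a NOP between the two instructions; see
aarch64_final_prescan_insn below.  */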
16362
16363 bool
16364 aarch64_madd_needs_nop (rtx_insn* insn)
16365 {
16366 enum attr_type attr_type;
16367 rtx_insn *prev;
16368 rtx body;
16369
16370 if (!TARGET_FIX_ERR_A53_835769)
16371 return false;
16372
16373 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16374 return false;
16375
16376 attr_type = get_attr_type (insn);
16377 if (!is_madd_op (attr_type))
16378 return false;
16379
16380 prev = aarch64_prev_real_insn (insn);
16381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16382 Restore recog state to INSN to avoid state corruption. */
16383 extract_constrain_insn_cached (insn);
16384
16385 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16386 return false;
16387
16388 body = single_set (prev);
16389
16390 /* If the previous insn is a memory op and there is no dependency between
16391 it and the DImode madd, emit a NOP between them. If body is NULL then we
16392 have a complex memory operation, probably a load/store pair.
16393 Be conservative for now and emit a NOP. */
16394 if (GET_MODE (recog_data.operand[0]) == DImode
16395 && (!body || !dep_between_memop_and_curr (body)))
16396 return true;
16397
16398 return false;
16399
16400 }
16401
16402
16403 /* Implement FINAL_PRESCAN_INSN. */
16404
16405 void
16406 aarch64_final_prescan_insn (rtx_insn *insn)
16407 {
16408 if (aarch64_madd_needs_nop (insn))
16409 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16410 }
16411
16412
16413 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16414 instruction. */
16415
16416 bool
16417 aarch64_sve_index_immediate_p (rtx base_or_step)
16418 {
16419 return (CONST_INT_P (base_or_step)
16420 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16421 }
16422
16423 /* Return true if X is a valid immediate for the SVE ADD and SUB
16424 instructions. Negate X first if NEGATE_P is true. */
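/* The immediate must be an unsigned 8-bit value, optionally shifted left
by 8: for example 0x07 and 0x0700 are accepted, but 0x0107 is not,
matching the #imm{, shift} form of the SVE ADD/SUB immediates.  */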
16425
16426 bool
16427 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
16428 {
16429 rtx elt;
16430
16431 if (!const_vec_duplicate_p (x, &elt)
16432 || !CONST_INT_P (elt))
16433 return false;
16434
16435 HOST_WIDE_INT val = INTVAL (elt);
16436 if (negate_p)
16437 val = -val;
16438 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
16439
16440 if (val & 0xff)
16441 return IN_RANGE (val, 0, 0xff);
16442 return IN_RANGE (val, 0, 0xff00);
16443 }
16444
16445 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16446 instructions. Negate X first if NEGATE_P is true. */
16447
16448 bool
16449 aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p)
16450 {
16451 rtx elt;
16452
16453 if (!const_vec_duplicate_p (x, &elt)
16454 || !CONST_INT_P (elt))
16455 return false;
16456
16457 if (!aarch64_sve_arith_immediate_p (x, negate_p))
16458 return false;
16459
16460 /* After the optional negation, the immediate must be nonnegative.
16461 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16462 instead of SQADD Zn.B, Zn.B, #129. */
16463 return negate_p == (INTVAL (elt) < 0);
16464 }
16465
16466 /* Return true if X is a valid immediate operand for an SVE logical
16467 instruction such as AND. */
16468
16469 bool
16470 aarch64_sve_bitmask_immediate_p (rtx x)
16471 {
16472 rtx elt;
16473
16474 return (const_vec_duplicate_p (x, &elt)
16475 && CONST_INT_P (elt)
16476 && aarch64_bitmask_imm (INTVAL (elt),
16477 GET_MODE_INNER (GET_MODE (x))));
16478 }
16479
16480 /* Return true if X is a valid immediate for the SVE DUP and CPY
16481 instructions. */
16482
16483 bool
16484 aarch64_sve_dup_immediate_p (rtx x)
16485 {
16486 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16487 if (!CONST_INT_P (x))
16488 return false;
16489
16490 HOST_WIDE_INT val = INTVAL (x);
16491 if (val & 0xff)
16492 return IN_RANGE (val, -0x80, 0x7f);
16493 return IN_RANGE (val, -0x8000, 0x7f00);
16494 }
16495
16496 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16497 SIGNED_P says whether the operand is signed rather than unsigned. */
16498
16499 bool
16500 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16501 {
16502 x = unwrap_const_vec_duplicate (x);
16503 return (CONST_INT_P (x)
16504 && (signed_p
16505 ? IN_RANGE (INTVAL (x), -16, 15)
16506 : IN_RANGE (INTVAL (x), 0, 127)));
16507 }
16508
16509 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16510 instruction. Negate X first if NEGATE_P is true. */
16511
16512 bool
16513 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16514 {
16515 rtx elt;
16516 REAL_VALUE_TYPE r;
16517
16518 if (!const_vec_duplicate_p (x, &elt)
16519 || GET_CODE (elt) != CONST_DOUBLE)
16520 return false;
16521
16522 r = *CONST_DOUBLE_REAL_VALUE (elt);
16523
16524 if (negate_p)
16525 r = real_value_negate (&r);
16526
16527 if (real_equal (&r, &dconst1))
16528 return true;
16529 if (real_equal (&r, &dconsthalf))
16530 return true;
16531 return false;
16532 }
16533
16534 /* Return true if X is a valid immediate operand for an SVE FMUL
16535 instruction. */
16536
16537 bool
16538 aarch64_sve_float_mul_immediate_p (rtx x)
16539 {
16540 rtx elt;
16541
16542 return (const_vec_duplicate_p (x, &elt)
16543 && GET_CODE (elt) == CONST_DOUBLE
16544 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16545 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16546 }
16547
16548 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16549 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16550 is nonnull, use it to describe valid immediates. */
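/* For example, replicating 0x00ab0000 is handled as the byte 0xab with
LSL #16, while 0x0000abff (a byte with ones filled in below it) uses
the MSL #8 form, which is only tried for AARCH64_CHECK_MOV.  */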
16551 static bool
16552 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16553 simd_immediate_info *info,
16554 enum simd_immediate_check which,
16555 simd_immediate_info::insn_type insn)
16556 {
16557 /* Try a 4-byte immediate with LSL. */
16558 for (unsigned int shift = 0; shift < 32; shift += 8)
16559 if ((val32 & (0xff << shift)) == val32)
16560 {
16561 if (info)
16562 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16563 simd_immediate_info::LSL, shift);
16564 return true;
16565 }
16566
16567 /* Try a 2-byte immediate with LSL. */
16568 unsigned int imm16 = val32 & 0xffff;
16569 if (imm16 == (val32 >> 16))
16570 for (unsigned int shift = 0; shift < 16; shift += 8)
16571 if ((imm16 & (0xff << shift)) == imm16)
16572 {
16573 if (info)
16574 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16575 simd_immediate_info::LSL, shift);
16576 return true;
16577 }
16578
16579 /* Try a 4-byte immediate with MSL, except for cases that MVN
16580 can handle. */
16581 if (which == AARCH64_CHECK_MOV)
16582 for (unsigned int shift = 8; shift < 24; shift += 8)
16583 {
16584 unsigned int low = (1 << shift) - 1;
16585 if (((val32 & (0xff << shift)) | low) == val32)
16586 {
16587 if (info)
16588 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16589 simd_immediate_info::MSL, shift);
16590 return true;
16591 }
16592 }
16593
16594 return false;
16595 }
16596
16597 /* Return true if replicating VAL64 is a valid immediate for the
16598 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16599 use it to describe valid immediates. */
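/* For example, when checking MOV immediates, 0x0101010101010101 is
accepted as a replicated byte and 0x00ff00ff00ff00ff is accepted by
the bit-to-bytemask test, since every byte is 0x00 or 0xff.  */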
16600 static bool
16601 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16602 simd_immediate_info *info,
16603 enum simd_immediate_check which)
16604 {
16605 unsigned int val32 = val64 & 0xffffffff;
16606 unsigned int val16 = val64 & 0xffff;
16607 unsigned int val8 = val64 & 0xff;
16608
16609 if (val32 == (val64 >> 32))
16610 {
16611 if ((which & AARCH64_CHECK_ORR) != 0
16612 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16613 simd_immediate_info::MOV))
16614 return true;
16615
16616 if ((which & AARCH64_CHECK_BIC) != 0
16617 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16618 simd_immediate_info::MVN))
16619 return true;
16620
16621 /* Try using a replicated byte. */
16622 if (which == AARCH64_CHECK_MOV
16623 && val16 == (val32 >> 16)
16624 && val8 == (val16 >> 8))
16625 {
16626 if (info)
16627 *info = simd_immediate_info (QImode, val8);
16628 return true;
16629 }
16630 }
16631
16632 /* Try using a bit-to-bytemask. */
16633 if (which == AARCH64_CHECK_MOV)
16634 {
16635 unsigned int i;
16636 for (i = 0; i < 64; i += 8)
16637 {
16638 unsigned char byte = (val64 >> i) & 0xff;
16639 if (byte != 0 && byte != 0xff)
16640 break;
16641 }
16642 if (i == 64)
16643 {
16644 if (info)
16645 *info = simd_immediate_info (DImode, val64);
16646 return true;
16647 }
16648 }
16649 return false;
16650 }
16651
16652 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16653 instruction. If INFO is nonnull, use it to describe valid immediates. */
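/* For example, 0x0101010101010101 narrows to the QImode value 1 and is a
DUP with no shift, 0x0100010001000100 narrows to HImode and is a DUP
with LSL #8, and 0x00ff00ff00ff00ff is handled as a DUPM bitmask
immediate.  */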
16654
16655 static bool
16656 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16657 simd_immediate_info *info)
16658 {
16659 scalar_int_mode mode = DImode;
16660 unsigned int val32 = val64 & 0xffffffff;
16661 if (val32 == (val64 >> 32))
16662 {
16663 mode = SImode;
16664 unsigned int val16 = val32 & 0xffff;
16665 if (val16 == (val32 >> 16))
16666 {
16667 mode = HImode;
16668 unsigned int val8 = val16 & 0xff;
16669 if (val8 == (val16 >> 8))
16670 mode = QImode;
16671 }
16672 }
16673 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16674 if (IN_RANGE (val, -0x80, 0x7f))
16675 {
16676 /* DUP with no shift. */
16677 if (info)
16678 *info = simd_immediate_info (mode, val);
16679 return true;
16680 }
16681 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16682 {
16683 /* DUP with LSL #8. */
16684 if (info)
16685 *info = simd_immediate_info (mode, val);
16686 return true;
16687 }
16688 if (aarch64_bitmask_imm (val64, mode))
16689 {
16690 /* DUPM. */
16691 if (info)
16692 *info = simd_immediate_info (mode, val);
16693 return true;
16694 }
16695 return false;
16696 }
16697
16698 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16699
16700 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16701
16702 where PATTERN is the svpattern as a CONST_INT and where ZERO
16703 is a zero constant of the required PTRUE mode (which can have
16704 fewer elements than X's mode, if zero bits are significant).
16705
16706 If so, and if INFO is nonnull, describe the immediate in INFO. */
16707 bool
16708 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16709 {
16710 if (GET_CODE (x) != CONST)
16711 return false;
16712
16713 x = XEXP (x, 0);
16714 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16715 return false;
16716
16717 if (info)
16718 {
16719 aarch64_svpattern pattern
16720 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16721 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16722 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16723 *info = simd_immediate_info (int_mode, pattern);
16724 }
16725 return true;
16726 }
16727
16728 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16729 it to describe valid immediates. */
16730
16731 static bool
16732 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16733 {
16734 if (aarch64_sve_ptrue_svpattern_p (x, info))
16735 return true;
16736
16737 if (x == CONST0_RTX (GET_MODE (x)))
16738 {
16739 if (info)
16740 *info = simd_immediate_info (DImode, 0);
16741 return true;
16742 }
16743
16744 /* Analyze the value as a VNx16BImode. This should be relatively
16745 efficient, since rtx_vector_builder has enough built-in capacity
16746 to store all VLA predicate constants without needing the heap. */
16747 rtx_vector_builder builder;
16748 if (!aarch64_get_sve_pred_bits (builder, x))
16749 return false;
16750
16751 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16752 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16753 {
16754 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16755 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16756 if (pattern != AARCH64_NUM_SVPATTERNS)
16757 {
16758 if (info)
16759 {
16760 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16761 *info = simd_immediate_info (int_mode, pattern);
16762 }
16763 return true;
16764 }
16765 }
16766 return false;
16767 }
16768
16769 /* Return true if OP is a valid SIMD immediate for the operation
16770 described by WHICH. If INFO is nonnull, use it to describe valid
16771 immediates. */
16772 bool
16773 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16774 enum simd_immediate_check which)
16775 {
16776 machine_mode mode = GET_MODE (op);
16777 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16778 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16779 return false;
16780
16781 if (vec_flags & VEC_SVE_PRED)
16782 return aarch64_sve_pred_valid_immediate (op, info);
16783
16784 scalar_mode elt_mode = GET_MODE_INNER (mode);
16785 rtx base, step;
16786 unsigned int n_elts;
16787 if (GET_CODE (op) == CONST_VECTOR
16788 && CONST_VECTOR_DUPLICATE_P (op))
16789 n_elts = CONST_VECTOR_NPATTERNS (op);
16790 else if ((vec_flags & VEC_SVE_DATA)
16791 && const_vec_series_p (op, &base, &step))
16792 {
16793 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16794 if (!aarch64_sve_index_immediate_p (base)
16795 || !aarch64_sve_index_immediate_p (step))
16796 return false;
16797
16798 if (info)
16799 {
16800 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16801 should yield two integer values per 128-bit block, meaning
16802 that we need to treat it in the same way as V2DI and then
16803 ignore the upper 32 bits of each element. */
16804 elt_mode = aarch64_sve_container_int_mode (mode);
16805 *info = simd_immediate_info (elt_mode, base, step);
16806 }
16807 return true;
16808 }
16809 else if (GET_CODE (op) == CONST_VECTOR
16810 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16811 /* N_ELTS set above. */;
16812 else
16813 return false;
16814
16815 scalar_float_mode elt_float_mode;
16816 if (n_elts == 1
16817 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16818 {
16819 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16820 if (aarch64_float_const_zero_rtx_p (elt)
16821 || aarch64_float_const_representable_p (elt))
16822 {
16823 if (info)
16824 *info = simd_immediate_info (elt_float_mode, elt);
16825 return true;
16826 }
16827 }
16828
16829 /* If all elements in an SVE vector have the same value, we have a free
16830 choice between using the element mode and using the container mode.
16831 Using the element mode means that unused parts of the vector are
16832 duplicates of the used elements, while using the container mode means
16833 that the unused parts are an extension of the used elements. Using the
16834 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16835 for its container mode VNx4SI while 0x00000101 isn't.
16836
16837 If not all elements in an SVE vector have the same value, we need the
16838 transition from one element to the next to occur at container boundaries.
16839 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16840 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16841 scalar_int_mode elt_int_mode;
16842 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16843 elt_int_mode = aarch64_sve_container_int_mode (mode);
16844 else
16845 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16846
16847 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16848 if (elt_size > 8)
16849 return false;
16850
16851 /* Expand the vector constant out into a byte vector, with the least
16852 significant byte of the register first. */
16853 auto_vec<unsigned char, 16> bytes;
16854 bytes.reserve (n_elts * elt_size);
16855 for (unsigned int i = 0; i < n_elts; i++)
16856 {
16857 /* The vector is provided in gcc endian-neutral fashion.
16858 For aarch64_be Advanced SIMD, it must be laid out in the vector
16859 register in reverse order. */
16860 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16861 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16862
16863 if (elt_mode != elt_int_mode)
16864 elt = gen_lowpart (elt_int_mode, elt);
16865
16866 if (!CONST_INT_P (elt))
16867 return false;
16868
16869 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16870 for (unsigned int byte = 0; byte < elt_size; byte++)
16871 {
16872 bytes.quick_push (elt_val & 0xff);
16873 elt_val >>= BITS_PER_UNIT;
16874 }
16875 }
16876
16877 /* The immediate must repeat every eight bytes. */
16878 unsigned int nbytes = bytes.length ();
16879 for (unsigned i = 8; i < nbytes; ++i)
16880 if (bytes[i] != bytes[i - 8])
16881 return false;
16882
16883 /* Get the repeating 8-byte value as an integer. No endian correction
16884 is needed here because bytes is already in lsb-first order. */
16885 unsigned HOST_WIDE_INT val64 = 0;
16886 for (unsigned int i = 0; i < 8; i++)
16887 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16888 << (i * BITS_PER_UNIT));
16889
16890 if (vec_flags & VEC_SVE_DATA)
16891 return aarch64_sve_valid_immediate (val64, info);
16892 else
16893 return aarch64_advsimd_valid_immediate (val64, info, which);
16894 }
16895
16896 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16897 has a step in the range accepted by the SVE INDEX instruction.
16898 Return the step if so, otherwise return null. */
16899 rtx
16900 aarch64_check_zero_based_sve_index_immediate (rtx x)
16901 {
16902 rtx base, step;
16903 if (const_vec_series_p (x, &base, &step)
16904 && base == const0_rtx
16905 && aarch64_sve_index_immediate_p (step))
16906 return step;
16907 return NULL_RTX;
16908 }
16909
16910 /* Check if immediate shift constants are within range. */
16911 bool
16912 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16913 {
16914 x = unwrap_const_vec_duplicate (x);
16915 if (!CONST_INT_P (x))
16916 return false;
16917 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16918 if (left)
16919 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16920 else
16921 return IN_RANGE (INTVAL (x), 1, bit_width);
16922 }
16923
16924 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16925 operation of width WIDTH at bit position POS. */
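/* For example, WIDTH == 4 and POS == 8 give the mask 0xf00.  */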
16926
16927 rtx
16928 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16929 {
16930 gcc_assert (CONST_INT_P (width));
16931 gcc_assert (CONST_INT_P (pos));
16932
16933 unsigned HOST_WIDE_INT mask
16934 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16935 return GEN_INT (mask << UINTVAL (pos));
16936 }
16937
16938 bool
16939 aarch64_mov_operand_p (rtx x, machine_mode mode)
16940 {
16941 if (GET_CODE (x) == HIGH
16942 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16943 return true;
16944
16945 if (CONST_INT_P (x))
16946 return true;
16947
16948 if (VECTOR_MODE_P (GET_MODE (x)))
16949 {
16950 /* Require predicate constants to be VNx16BI before RA, so that we
16951 force everything to have a canonical form. */
16952 if (!lra_in_progress
16953 && !reload_completed
16954 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16955 && GET_MODE (x) != VNx16BImode)
16956 return false;
16957
16958 return aarch64_simd_valid_immediate (x, NULL);
16959 }
16960
16961 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16962 return true;
16963
16964 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
16965 return true;
16966
16967 return aarch64_classify_symbolic_expression (x)
16968 == SYMBOL_TINY_ABSOLUTE;
16969 }
16970
16971 /* Return a const_int vector of VAL. */
16972 rtx
16973 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16974 {
16975 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16976 return gen_const_vec_duplicate (mode, c);
16977 }
16978
16979 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16980
16981 bool
16982 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16983 {
16984 machine_mode vmode;
16985
16986 vmode = aarch64_simd_container_mode (mode, 64);
16987 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16988 return aarch64_simd_valid_immediate (op_v, NULL);
16989 }
16990
16991 /* Construct and return a PARALLEL RTX vector with elements numbering the
16992 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16993 the vector - from the perspective of the architecture. This does not
16994 line up with GCC's perspective on lane numbers, so we end up with
16995 different masks depending on our target endian-ness. The diagram
16996 below may help. We must draw the distinction when building masks
16997 which select one half of the vector. An instruction selecting
16998 architectural low-lanes for a big-endian target, must be described using
16999 a mask selecting GCC high-lanes.
17000
17001 Big-Endian Little-Endian
17002
17003 GCC 0 1 2 3 3 2 1 0
17004 | x | x | x | x | | x | x | x | x |
17005 Architecture 3 2 1 0 3 2 1 0
17006
17007 Low Mask: { 2, 3 } { 0, 1 }
17008 High Mask: { 0, 1 } { 2, 3 }
17009
17010 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17011
17012 rtx
17013 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17014 {
17015 rtvec v = rtvec_alloc (nunits / 2);
17016 int high_base = nunits / 2;
17017 int low_base = 0;
17018 int base;
17019 rtx t1;
17020 int i;
17021
17022 if (BYTES_BIG_ENDIAN)
17023 base = high ? low_base : high_base;
17024 else
17025 base = high ? high_base : low_base;
17026
17027 for (i = 0; i < nunits / 2; i++)
17028 RTVEC_ELT (v, i) = GEN_INT (base + i);
17029
17030 t1 = gen_rtx_PARALLEL (mode, v);
17031 return t1;
17032 }
17033
17034 /* Check OP for validity as a PARALLEL RTX vector with elements
17035 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17036 from the perspective of the architecture. See the diagram above
17037 aarch64_simd_vect_par_cnst_half for more details. */
17038
17039 bool
17040 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17041 bool high)
17042 {
17043 int nelts;
17044 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17045 return false;
17046
17047 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17048 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17049 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17050 int i = 0;
17051
17052 if (count_op != count_ideal)
17053 return false;
17054
17055 for (i = 0; i < count_ideal; i++)
17056 {
17057 rtx elt_op = XVECEXP (op, 0, i);
17058 rtx elt_ideal = XVECEXP (ideal, 0, i);
17059
17060 if (!CONST_INT_P (elt_op)
17061 || INTVAL (elt_ideal) != INTVAL (elt_op))
17062 return false;
17063 }
17064 return true;
17065 }
17066
17067 /* Return a PARALLEL containing NELTS elements, with element I equal
17068 to BASE + I * STEP. */
17069
17070 rtx
17071 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17072 {
17073 rtvec vec = rtvec_alloc (nelts);
17074 for (unsigned int i = 0; i < nelts; ++i)
17075 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17076 return gen_rtx_PARALLEL (VOIDmode, vec);
17077 }
17078
17079 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17080 series with step STEP. */
17081
17082 bool
17083 aarch64_stepped_int_parallel_p (rtx op, int step)
17084 {
17085 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17086 return false;
17087
17088 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17089 for (int i = 1; i < XVECLEN (op, 0); ++i)
17090 if (!CONST_INT_P (XVECEXP (op, 0, i))
17091 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17092 return false;
17093
17094 return true;
17095 }
17096
17097 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17098 HIGH (exclusive). */
17099 void
17100 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17101 const_tree exp)
17102 {
17103 HOST_WIDE_INT lane;
17104 gcc_assert (CONST_INT_P (operand));
17105 lane = INTVAL (operand);
17106
17107 if (lane < low || lane >= high)
17108 {
17109 if (exp)
17110 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17111 else
17112 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17113 }
17114 }
17115
17116 /* Perform endian correction on lane number N, which indexes a vector
17117 of mode MODE, and return the result as an SImode rtx. */
17118
17119 rtx
17120 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17121 {
17122 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17123 }
17124
17125 /* Return TRUE if OP is a valid vector addressing mode. */
17126
17127 bool
17128 aarch64_simd_mem_operand_p (rtx op)
17129 {
17130 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17131 || REG_P (XEXP (op, 0)));
17132 }
17133
17134 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17135
17136 bool
17137 aarch64_sve_ld1r_operand_p (rtx op)
17138 {
17139 struct aarch64_address_info addr;
17140 scalar_mode mode;
17141
17142 return (MEM_P (op)
17143 && is_a <scalar_mode> (GET_MODE (op), &mode)
17144 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17145 && addr.type == ADDRESS_REG_IMM
17146 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17147 }
17148
17149 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17150 bool
17151 aarch64_sve_ld1rq_operand_p (rtx op)
17152 {
17153 struct aarch64_address_info addr;
17154 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
17155 if (!MEM_P (op)
17156 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17157 return false;
17158
17159 if (addr.type == ADDRESS_REG_IMM)
17160 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
17161
17162 if (addr.type == ADDRESS_REG_REG)
17163 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17164
17165 return false;
17166 }
17167
17168 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17169 bool
17170 aarch64_sve_ldff1_operand_p (rtx op)
17171 {
17172 if (!MEM_P (op))
17173 return false;
17174
17175 struct aarch64_address_info addr;
17176 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17177 return false;
17178
17179 if (addr.type == ADDRESS_REG_IMM)
17180 return known_eq (addr.const_offset, 0);
17181
17182 return addr.type == ADDRESS_REG_REG;
17183 }
17184
17185 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17186 bool
17187 aarch64_sve_ldnf1_operand_p (rtx op)
17188 {
17189 struct aarch64_address_info addr;
17190
17191 return (MEM_P (op)
17192 && aarch64_classify_address (&addr, XEXP (op, 0),
17193 GET_MODE (op), false)
17194 && addr.type == ADDRESS_REG_IMM);
17195 }
17196
17197 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17198 The conditions for STR are the same. */
17199 bool
17200 aarch64_sve_ldr_operand_p (rtx op)
17201 {
17202 struct aarch64_address_info addr;
17203
17204 return (MEM_P (op)
17205 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17206 false, ADDR_QUERY_ANY)
17207 && addr.type == ADDRESS_REG_IMM);
17208 }
17209
17210 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17211 addressing memory of mode MODE. */
17212 bool
17213 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17214 {
17215 struct aarch64_address_info addr;
17216 if (!aarch64_classify_address (&addr, op, mode, false))
17217 return false;
17218
17219 if (addr.type == ADDRESS_REG_IMM)
17220 return known_eq (addr.const_offset, 0);
17221
17222 return addr.type == ADDRESS_REG_REG;
17223 }
17224
17225 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17226 We need to be able to access the individual pieces, so the range
17227 is different from LD[234] and ST[234]. */
17228 bool
17229 aarch64_sve_struct_memory_operand_p (rtx op)
17230 {
17231 if (!MEM_P (op))
17232 return false;
17233
17234 machine_mode mode = GET_MODE (op);
17235 struct aarch64_address_info addr;
17236 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17237 ADDR_QUERY_ANY)
17238 || addr.type != ADDRESS_REG_IMM)
17239 return false;
17240
17241 poly_int64 first = addr.const_offset;
17242 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17243 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17244 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17245 }
17246
17247 /* Emit a register copy from operand to operand, taking care not to
17248 early-clobber source registers in the process.
17249
17250 COUNT is the number of components into which the copy needs to be
17251 decomposed. */
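/* For example, copying an OImode value from v1:v2 to v2:v3 must move v2
into v3 before moving v1 into v2; the reversed loop below handles this
overlapping case.  */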
17252 void
17253 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17254 unsigned int count)
17255 {
17256 unsigned int i;
17257 int rdest = REGNO (operands[0]);
17258 int rsrc = REGNO (operands[1]);
17259
17260 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17261 || rdest < rsrc)
17262 for (i = 0; i < count; i++)
17263 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17264 gen_rtx_REG (mode, rsrc + i));
17265 else
17266 for (i = 0; i < count; i++)
17267 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17268 gen_rtx_REG (mode, rsrc + count - i - 1));
17269 }
17270
17271 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17272 one of VSTRUCT modes: OI, CI, or XI. */
17273 int
17274 aarch64_simd_attr_length_rglist (machine_mode mode)
17275 {
17276 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17277 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17278 }
17279
17280 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17281 alignment of a vector to 128 bits. SVE predicates have an alignment of
17282 16 bits. */
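/* For example, int32x4_t keeps its natural 128-bit alignment, a 256-bit
GNU vector type is capped at 128 bits, and SVE predicate types such as
svbool_t get 16-bit alignment.  */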
17283 static HOST_WIDE_INT
17284 aarch64_simd_vector_alignment (const_tree type)
17285 {
17286 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17287 be set for non-predicate vectors of booleans. Modes are the most
17288 direct way we have of identifying real SVE predicate types. */
17289 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17290 return 16;
17291 widest_int min_size
17292 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17293 return wi::umin (min_size, 128).to_uhwi ();
17294 }
17295
17296 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17297 static poly_uint64
17298 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17299 {
17300 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17301 {
17302 /* If the length of the vector is fixed, try to align to that length,
17303 otherwise don't try to align at all. */
17304 HOST_WIDE_INT result;
17305 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17306 result = TYPE_ALIGN (TREE_TYPE (type));
17307 return result;
17308 }
17309 return TYPE_ALIGN (type);
17310 }
17311
17312 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17313 static bool
17314 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17315 {
17316 if (is_packed)
17317 return false;
17318
17319 /* For fixed-length vectors, check that the vectorizer will aim for
17320 full-vector alignment. This isn't true for generic GCC vectors
17321 that are wider than the ABI maximum of 128 bits. */
17322 poly_uint64 preferred_alignment =
17323 aarch64_vectorize_preferred_vector_alignment (type);
17324 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17325 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17326 preferred_alignment))
17327 return false;
17328
17329 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17330 return true;
17331 }
17332
17333 /* Return true if the vector misalignment factor is supported by the
17334 target. */
17335 static bool
17336 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17337 const_tree type, int misalignment,
17338 bool is_packed)
17339 {
17340 if (TARGET_SIMD && STRICT_ALIGNMENT)
17341 {
17342 /* Return false if the movmisalign pattern is not supported for this mode. */
17343 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17344 return false;
17345
17346 /* Misalignment factor is unknown at compile time. */
17347 if (misalignment == -1)
17348 return false;
17349 }
17350 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17351 is_packed);
17352 }
17353
17354 /* If VALS is a vector constant that can be loaded into a register
17355 using DUP, generate instructions to do so and return an RTX to
17356 assign to the register. Otherwise return NULL_RTX. */
17357 static rtx
17358 aarch64_simd_dup_constant (rtx vals)
17359 {
17360 machine_mode mode = GET_MODE (vals);
17361 machine_mode inner_mode = GET_MODE_INNER (mode);
17362 rtx x;
17363
17364 if (!const_vec_duplicate_p (vals, &x))
17365 return NULL_RTX;
17366
17367 /* We can load this constant by using DUP and a constant in a
17368 single general-purpose register. This will be cheaper than a vector
17369 load. */
17370 x = copy_to_mode_reg (inner_mode, x);
17371 return gen_vec_duplicate (mode, x);
17372 }
17373
17374
17375 /* Generate code to load VALS, which is a PARALLEL containing only
17376 constants (for vec_init) or CONST_VECTOR, efficiently into a
17377 register. Returns an RTX to copy into the register, or NULL_RTX
17378 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17379 static rtx
17380 aarch64_simd_make_constant (rtx vals)
17381 {
17382 machine_mode mode = GET_MODE (vals);
17383 rtx const_dup;
17384 rtx const_vec = NULL_RTX;
17385 int n_const = 0;
17386 int i;
17387
17388 if (GET_CODE (vals) == CONST_VECTOR)
17389 const_vec = vals;
17390 else if (GET_CODE (vals) == PARALLEL)
17391 {
17392 /* A CONST_VECTOR must contain only CONST_INTs and
17393 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17394 Only store valid constants in a CONST_VECTOR. */
17395 int n_elts = XVECLEN (vals, 0);
17396 for (i = 0; i < n_elts; ++i)
17397 {
17398 rtx x = XVECEXP (vals, 0, i);
17399 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17400 n_const++;
17401 }
17402 if (n_const == n_elts)
17403 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17404 }
17405 else
17406 gcc_unreachable ();
17407
17408 if (const_vec != NULL_RTX
17409 && aarch64_simd_valid_immediate (const_vec, NULL))
17410 /* Load using MOVI/MVNI. */
17411 return const_vec;
17412 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17413 /* Loaded using DUP. */
17414 return const_dup;
17415 else if (const_vec != NULL_RTX)
17416 /* Load from constant pool. We cannot take advantage of single-cycle
17417 LD1 because we need a PC-relative addressing mode. */
17418 return const_vec;
17419 else
17420 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17421 We cannot construct an initializer. */
17422 return NULL_RTX;
17423 }
17424
17425 /* Expand a vector initialisation sequence, such that TARGET is
17426 initialised to contain VALS. */
17427
17428 void
17429 aarch64_expand_vector_init (rtx target, rtx vals)
17430 {
17431 machine_mode mode = GET_MODE (target);
17432 scalar_mode inner_mode = GET_MODE_INNER (mode);
17433 /* The number of vector elements. */
17434 int n_elts = XVECLEN (vals, 0);
17435 /* The number of vector elements which are not constant. */
17436 int n_var = 0;
17437 rtx any_const = NULL_RTX;
17438 /* The first element of vals. */
17439 rtx v0 = XVECEXP (vals, 0, 0);
17440 bool all_same = true;
17441
17442 /* This is a special vec_init<M><N> where N is not an element mode but a
17443 vector mode with half the elements of M. We expect to find two entries
17444 of mode N in VALS and we must put their concatenation into TARGET. */
17445 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17446 {
17447 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17448 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17449 rtx lo = XVECEXP (vals, 0, 0);
17450 rtx hi = XVECEXP (vals, 0, 1);
17451 machine_mode narrow_mode = GET_MODE (lo);
17452 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17453 gcc_assert (narrow_mode == GET_MODE (hi));
17454
17455 /* When we want to concatenate a half-width vector with zeroes we can
17456 use the aarch64_combinez[_be] patterns. Just make sure that the
17457 zeroes are in the right half. */
17458 if (BYTES_BIG_ENDIAN
17459 && aarch64_simd_imm_zero (lo, narrow_mode)
17460 && general_operand (hi, narrow_mode))
17461 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17462 else if (!BYTES_BIG_ENDIAN
17463 && aarch64_simd_imm_zero (hi, narrow_mode)
17464 && general_operand (lo, narrow_mode))
17465 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17466 else
17467 {
17468 /* Else create the two half-width registers and combine them. */
17469 if (!REG_P (lo))
17470 lo = force_reg (GET_MODE (lo), lo);
17471 if (!REG_P (hi))
17472 hi = force_reg (GET_MODE (hi), hi);
17473
17474 if (BYTES_BIG_ENDIAN)
17475 std::swap (lo, hi);
17476 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17477 }
17478 return;
17479 }
17480
17481 /* Count the number of variable elements to initialise. */
17482 for (int i = 0; i < n_elts; ++i)
17483 {
17484 rtx x = XVECEXP (vals, 0, i);
17485 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17486 ++n_var;
17487 else
17488 any_const = x;
17489
17490 all_same &= rtx_equal_p (x, v0);
17491 }
17492
17493 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17494 how best to handle this. */
17495 if (n_var == 0)
17496 {
17497 rtx constant = aarch64_simd_make_constant (vals);
17498 if (constant != NULL_RTX)
17499 {
17500 emit_move_insn (target, constant);
17501 return;
17502 }
17503 }
17504
17505 /* Splat a single non-constant element if we can. */
17506 if (all_same)
17507 {
17508 rtx x = copy_to_mode_reg (inner_mode, v0);
17509 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17510 return;
17511 }
17512
17513 enum insn_code icode = optab_handler (vec_set_optab, mode);
17514 gcc_assert (icode != CODE_FOR_nothing);
17515
17516 /* If there are only variable elements, try to optimize
17517 the insertion using dup for the most common element
17518 followed by insertions. */
17519
17520 /* The algorithm will fill matches[*][0] with the earliest matching element,
17521 and matches[X][1] with the count of duplicate elements (if X is the
17522 earliest element which has duplicates). */
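/* For example, for lanes {a, b, a, a} this gives matches[0] = {0, 3},
   matches[1] = {1, 1}, matches[2] = {0, 0} and matches[3] = {0, 0}, so
   lane 0 (value a) is chosen as the element to duplicate.  */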
17523
17524 if (n_var == n_elts && n_elts <= 16)
17525 {
17526 int matches[16][2] = {0};
17527 for (int i = 0; i < n_elts; i++)
17528 {
17529 for (int j = 0; j <= i; j++)
17530 {
17531 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17532 {
17533 matches[i][0] = j;
17534 matches[j][1]++;
17535 break;
17536 }
17537 }
17538 }
17539 int maxelement = 0;
17540 int maxv = 0;
17541 for (int i = 0; i < n_elts; i++)
17542 if (matches[i][1] > maxv)
17543 {
17544 maxelement = i;
17545 maxv = matches[i][1];
17546 }
17547
17548 /* Create a duplicate of the most common element, unless all elements
17549 are equally useless to us, in which case just immediately set the
17550 vector register using the first element. */
17551
17552 if (maxv == 1)
17553 {
17554 /* For vectors of two 64-bit elements, we can do even better. */
17555 if (n_elts == 2
17556 && (inner_mode == E_DImode
17557 || inner_mode == E_DFmode))
17558
17559 {
17560 rtx x0 = XVECEXP (vals, 0, 0);
17561 rtx x1 = XVECEXP (vals, 0, 1);
17562 /* Combine can pick up this case, but handling it directly
17563 here leaves clearer RTL.
17564
17565 This is load_pair_lanes<mode>, and also gives us a clean-up
17566 for store_pair_lanes<mode>. */
17567 if (memory_operand (x0, inner_mode)
17568 && memory_operand (x1, inner_mode)
17569 && !STRICT_ALIGNMENT
17570 && rtx_equal_p (XEXP (x1, 0),
17571 plus_constant (Pmode,
17572 XEXP (x0, 0),
17573 GET_MODE_SIZE (inner_mode))))
17574 {
17575 rtx t;
17576 if (inner_mode == DFmode)
17577 t = gen_load_pair_lanesdf (target, x0, x1);
17578 else
17579 t = gen_load_pair_lanesdi (target, x0, x1);
17580 emit_insn (t);
17581 return;
17582 }
17583 }
17584 /* The subreg-move sequence below will move into lane zero of the
17585 vector register. For big-endian we want that position to hold
17586 the last element of VALS. */
17587 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17588 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17589 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17590 }
17591 else
17592 {
17593 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17594 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17595 }
17596
17597 /* Insert the rest. */
17598 for (int i = 0; i < n_elts; i++)
17599 {
17600 rtx x = XVECEXP (vals, 0, i);
17601 if (matches[i][0] == maxelement)
17602 continue;
17603 x = copy_to_mode_reg (inner_mode, x);
17604 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17605 }
17606 return;
17607 }
17608
17609 /* Initialise a vector which is part-variable. We want to first try
17610 to build those lanes which are constant in the most efficient way we
17611 can. */
17612 if (n_var != n_elts)
17613 {
17614 rtx copy = copy_rtx (vals);
17615
17616 /* Load constant part of vector. We really don't care what goes into the
17617 parts we will overwrite, but we're more likely to be able to load the
17618 constant efficiently if it has fewer, larger, repeating parts
17619 (see aarch64_simd_valid_immediate). */
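/* For example, for { x, 1, x, 1 } the loop below substitutes the constant 1
   for both variable lanes, so the constant part is loaded as { 1, 1, 1, 1 }
   (a single repeating constant) before x is inserted into lanes 0 and 2.  */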
17620 for (int i = 0; i < n_elts; i++)
17621 {
17622 rtx x = XVECEXP (vals, 0, i);
17623 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17624 continue;
17625 rtx subst = any_const;
17626 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17627 {
17628 /* Look in the copied vector, as more elements are const. */
17629 rtx test = XVECEXP (copy, 0, i ^ bit);
17630 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17631 {
17632 subst = test;
17633 break;
17634 }
17635 }
17636 XVECEXP (copy, 0, i) = subst;
17637 }
17638 aarch64_expand_vector_init (target, copy);
17639 }
17640
17641 /* Insert the variable lanes directly. */
17642 for (int i = 0; i < n_elts; i++)
17643 {
17644 rtx x = XVECEXP (vals, 0, i);
17645 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17646 continue;
17647 x = copy_to_mode_reg (inner_mode, x);
17648 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17649 }
17650 }
17651
17652 /* Emit RTL corresponding to:
17653 insr TARGET, ELEM. */
17654
17655 static void
17656 emit_insr (rtx target, rtx elem)
17657 {
17658 machine_mode mode = GET_MODE (target);
17659 scalar_mode elem_mode = GET_MODE_INNER (mode);
17660 elem = force_reg (elem_mode, elem);
17661
17662 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17663 gcc_assert (icode != CODE_FOR_nothing);
17664 emit_insn (GEN_FCN (icode) (target, target, elem));
17665 }
17666
17667 /* Subroutine of aarch64_sve_expand_vector_init for handling
17668 trailing constants.
17669 This function works as follows:
17670 (a) Create a new vector consisting of trailing constants.
17671 (b) Initialize TARGET with the constant vector using emit_move_insn.
17672 (c) Insert remaining elements in TARGET using insr.
17673 NELTS is the total number of elements in the original vector, while
17674 NELTS_REQD is the number of elements that are actually
17675 significant.
17676
17677 ??? The heuristic used is to do the above only if the number of constants
17678 is at least half the total number of elements. May need fine tuning. */
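/* For example, if the significant lanes are { x, y, 1, 2 }, TARGET is first
   set from the constant vector built out of the trailing { 1, 2 }, after
   which INSR inserts y and then x at element 0, giving { x, y, 1, 2, ... }.  */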
17679
17680 static bool
17681 aarch64_sve_expand_vector_init_handle_trailing_constants
17682 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17683 {
17684 machine_mode mode = GET_MODE (target);
17685 scalar_mode elem_mode = GET_MODE_INNER (mode);
17686 int n_trailing_constants = 0;
17687
17688 for (int i = nelts_reqd - 1;
17689 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17690 i--)
17691 n_trailing_constants++;
17692
17693 if (n_trailing_constants >= nelts_reqd / 2)
17694 {
17695 rtx_vector_builder v (mode, 1, nelts);
17696 for (int i = 0; i < nelts; i++)
17697 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17698 rtx const_vec = v.build ();
17699 emit_move_insn (target, const_vec);
17700
17701 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17702 emit_insr (target, builder.elt (i));
17703
17704 return true;
17705 }
17706
17707 return false;
17708 }
17709
17710 /* Subroutine of aarch64_sve_expand_vector_init.
17711 Works as follows:
17712 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17713 (b) Skip trailing elements from BUILDER, which are the same as
17714 element NELTS_REQD - 1.
17715 (c) Insert earlier elements in reverse order in TARGET using insr. */
17716
17717 static void
17718 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17719 const rtx_vector_builder &builder,
17720 int nelts_reqd)
17721 {
17722 machine_mode mode = GET_MODE (target);
17723 scalar_mode elem_mode = GET_MODE_INNER (mode);
17724
17725 struct expand_operand ops[2];
17726 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17727 gcc_assert (icode != CODE_FOR_nothing);
17728
17729 create_output_operand (&ops[0], target, mode);
17730 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17731 expand_insn (icode, 2, ops);
17732
17733 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17734 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17735 emit_insr (target, builder.elt (i));
17736 }
17737
17738 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17739 when all trailing elements of builder are same.
17740 This works as follows:
17741 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17742 (b) Insert remaining elements in TARGET using insr.
17743
17744 ??? The heuristic used is to do the above if the number of identical trailing
17745 elements is at least 3/4 of the total number of elements, loosely based on
17746 heuristic from mostly_zeros_p. May need fine-tuning. */
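/* For example, for significant lanes { a, b, b, b }, b is broadcast into
   TARGET and a single "insr target, a" then gives { a, b, b, b, ... }.  */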
17747
17748 static bool
17749 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17750 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17751 {
17752 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17753 if (ndups >= (3 * nelts_reqd) / 4)
17754 {
17755 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17756 nelts_reqd - ndups + 1);
17757 return true;
17758 }
17759
17760 return false;
17761 }
17762
17763 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17764 of elements in BUILDER.
17765
17766 The function tries to initialize TARGET from BUILDER if it fits one
17767 of the special cases outlined below.
17768
17769 Failing that, the function divides BUILDER into two sub-vectors:
17770 v_even = even elements of BUILDER;
17771 v_odd = odd elements of BUILDER;
17772
17773 and recursively calls itself with v_even and v_odd.
17774
17775 if (recursive call succeeded for v_even or v_odd)
17776 TARGET = zip (v_even, v_odd)
17777
17778 The function returns true if it managed to build TARGET from BUILDER
17779 with one of the special cases, false otherwise.
17780
17781 Example: {a, 1, b, 2, c, 3, d, 4}
17782
17783 The vector gets divided into:
17784 v_even = {a, b, c, d}
17785 v_odd = {1, 2, 3, 4}
17786
17787 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17788 initializes tmp2 from the constant vector v_odd using emit_move_insn.
17789
17790 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17791 4 elements, so we construct tmp1 from v_even using insr:
17792 tmp1 = dup(d)
17793 insr tmp1, c
17794 insr tmp1, b
17795 insr tmp1, a
17796
17797 And finally:
17798 TARGET = zip (tmp1, tmp2)
17799 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17800
17801 static bool
17802 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17803 int nelts, int nelts_reqd)
17804 {
17805 machine_mode mode = GET_MODE (target);
17806
17807 /* Case 1: Vector contains trailing constants. */
17808
17809 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17810 (target, builder, nelts, nelts_reqd))
17811 return true;
17812
17813 /* Case 2: Vector contains leading constants. */
17814
17815 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17816 for (int i = 0; i < nelts_reqd; i++)
17817 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17818 rev_builder.finalize ();
17819
17820 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17821 (target, rev_builder, nelts, nelts_reqd))
17822 {
17823 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17824 return true;
17825 }
17826
17827 /* Case 3: Vector contains trailing same element. */
17828
17829 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17830 (target, builder, nelts_reqd))
17831 return true;
17832
17833 /* Case 4: Vector contains leading same element. */
17834
17835 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17836 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17837 {
17838 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17839 return true;
17840 }
17841
17842 /* Avoid recursing below 4 elements.
17843 ??? The threshold 4 may need fine-tuning. */
17844
17845 if (nelts_reqd <= 4)
17846 return false;
17847
17848 rtx_vector_builder v_even (mode, 1, nelts);
17849 rtx_vector_builder v_odd (mode, 1, nelts);
17850
17851 for (int i = 0; i < nelts * 2; i += 2)
17852 {
17853 v_even.quick_push (builder.elt (i));
17854 v_odd.quick_push (builder.elt (i + 1));
17855 }
17856
17857 v_even.finalize ();
17858 v_odd.finalize ();
17859
17860 rtx tmp1 = gen_reg_rtx (mode);
17861 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17862 nelts, nelts_reqd / 2);
17863
17864 rtx tmp2 = gen_reg_rtx (mode);
17865 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17866 nelts, nelts_reqd / 2);
17867
17868 if (!did_even_p && !did_odd_p)
17869 return false;
17870
17871 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17872 special cases and zip v_even, v_odd. */
17873
17874 if (!did_even_p)
17875 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17876
17877 if (!did_odd_p)
17878 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17879
17880 rtvec v = gen_rtvec (2, tmp1, tmp2);
17881 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17882 return true;
17883 }
17884
17885 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17886
17887 void
17888 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17889 {
17890 machine_mode mode = GET_MODE (target);
17891 int nelts = XVECLEN (vals, 0);
17892
17893 rtx_vector_builder v (mode, 1, nelts);
17894 for (int i = 0; i < nelts; i++)
17895 v.quick_push (XVECEXP (vals, 0, i));
17896 v.finalize ();
17897
17898 /* If neither sub-vectors of v could be initialized specially,
17899 then use INSR to insert all elements from v into TARGET.
17900 ??? This might not be optimal for vectors with large
17901 initializers like 16-element or above.
17902 For nelts < 4, it probably isn't useful to handle specially. */
17903
17904 if (nelts < 4
17905 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17906 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17907 }
17908
17909 /* Check whether VALUE is a vector constant in which every element
17910 is either a power of 2 or a negated power of 2. If so, return
17911 a constant vector of log2s, and flip CODE between PLUS and MINUS
17912 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
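/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE unchanged,
   while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE flipped between
   PLUS and MINUS.  */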
17913
17914 static rtx
17915 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17916 {
17917 if (GET_CODE (value) != CONST_VECTOR)
17918 return NULL_RTX;
17919
17920 rtx_vector_builder builder;
17921 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17922 return NULL_RTX;
17923
17924 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17925 /* 1 if the result of the multiplication must be negated,
17926 0 if it mustn't, or -1 if we don't yet care. */
17927 int negate = -1;
17928 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17929 for (unsigned int i = 0; i < encoded_nelts; ++i)
17930 {
17931 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17932 if (!CONST_SCALAR_INT_P (elt))
17933 return NULL_RTX;
17934 rtx_mode_t val (elt, int_mode);
17935 wide_int pow2 = wi::neg (val);
17936 if (val != pow2)
17937 {
17938 /* It matters whether we negate or not. Make that choice,
17939 and make sure that it's consistent with previous elements. */
17940 if (negate == !wi::neg_p (val))
17941 return NULL_RTX;
17942 negate = wi::neg_p (val);
17943 if (!negate)
17944 pow2 = val;
17945 }
17946 /* POW2 is now the value that we want to be a power of 2. */
17947 int shift = wi::exact_log2 (pow2);
17948 if (shift < 0)
17949 return NULL_RTX;
17950 builder.quick_push (gen_int_mode (shift, int_mode));
17951 }
17952 if (negate == -1)
17953 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17954 code = PLUS;
17955 else if (negate == 1)
17956 code = code == PLUS ? MINUS : PLUS;
17957 return builder.build ();
17958 }
17959
17960 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17961 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17962 operands array, in the same order as for fma_optab. Return true if
17963 the function emitted all the necessary instructions, false if the caller
17964 should generate the pattern normally with the new OPERANDS array. */
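/* For example, if the multiplier operands[2] is the constant vector
   { 4, 4, ... }, the operation is emitted as a vector shift left by 2
   followed by an add (or a subtract, if the multiplier had been
   { -4, -4, ... }).  */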
17965
17966 bool
17967 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17968 {
17969 machine_mode mode = GET_MODE (operands[0]);
17970 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17971 {
17972 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17973 NULL_RTX, true, OPTAB_DIRECT);
17974 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17975 operands[3], product, operands[0], true,
17976 OPTAB_DIRECT);
17977 return true;
17978 }
17979 operands[2] = force_reg (mode, operands[2]);
17980 return false;
17981 }
17982
17983 /* Likewise, but for a conditional pattern. */
17984
17985 bool
17986 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
17987 {
17988 machine_mode mode = GET_MODE (operands[0]);
17989 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
17990 {
17991 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
17992 NULL_RTX, true, OPTAB_DIRECT);
17993 emit_insn (gen_cond (code, mode, operands[0], operands[1],
17994 operands[4], product, operands[5]));
17995 return true;
17996 }
17997 operands[3] = force_reg (mode, operands[3]);
17998 return false;
17999 }
18000
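/* Return the mask to apply to shift counts of MODE: zero when shift counts
   are not truncated (!SHIFT_COUNT_TRUNCATED or a vector data mode), otherwise
   one less than the element width in bits.  */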
18001 static unsigned HOST_WIDE_INT
18002 aarch64_shift_truncation_mask (machine_mode mode)
18003 {
18004 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18005 return 0;
18006 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18007 }
18008
18009 /* Select a format to encode pointers in exception handling data. */
18010 int
18011 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18012 {
18013 int type;
18014 switch (aarch64_cmodel)
18015 {
18016 case AARCH64_CMODEL_TINY:
18017 case AARCH64_CMODEL_TINY_PIC:
18018 case AARCH64_CMODEL_SMALL:
18019 case AARCH64_CMODEL_SMALL_PIC:
18020 case AARCH64_CMODEL_SMALL_SPIC:
18021 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18022 for everything. */
18023 type = DW_EH_PE_sdata4;
18024 break;
18025 default:
18026 /* No assumptions here. 8-byte relocs required. */
18027 type = DW_EH_PE_sdata8;
18028 break;
18029 }
18030 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18031 }
18032
18033 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18034
18035 static void
18036 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18037 {
18038 if (TREE_CODE (decl) == FUNCTION_DECL)
18039 {
18040 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18041 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18042 {
18043 fprintf (stream, "\t.variant_pcs\t");
18044 assemble_name (stream, name);
18045 fprintf (stream, "\n");
18046 }
18047 }
18048 }
18049
18050 /* The last .arch and .tune assembly strings that we printed. */
18051 static std::string aarch64_last_printed_arch_string;
18052 static std::string aarch64_last_printed_tune_string;
18053
18054 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18055 by the function fndecl. */
18056
18057 void
18058 aarch64_declare_function_name (FILE *stream, const char* name,
18059 tree fndecl)
18060 {
18061 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18062
18063 struct cl_target_option *targ_options;
18064 if (target_parts)
18065 targ_options = TREE_TARGET_OPTION (target_parts);
18066 else
18067 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18068 gcc_assert (targ_options);
18069
18070 const struct processor *this_arch
18071 = aarch64_get_arch (targ_options->x_explicit_arch);
18072
18073 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18074 std::string extension
18075 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18076 this_arch->flags);
18077 /* Only update the assembler .arch string if it is distinct from the last
18078 such string we printed. */
18079 std::string to_print = this_arch->name + extension;
18080 if (to_print != aarch64_last_printed_arch_string)
18081 {
18082 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18083 aarch64_last_printed_arch_string = to_print;
18084 }
18085
18086 /* Print the cpu name we're tuning for in the comments; it might be
18087 useful to readers of the generated asm. Do it only when it changes
18088 from function to function and verbose assembly is requested. */
18089 const struct processor *this_tune
18090 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18091
18092 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18093 {
18094 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18095 this_tune->name);
18096 aarch64_last_printed_tune_string = this_tune->name;
18097 }
18098
18099 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18100
18101 /* Don't forget the type directive for ELF. */
18102 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18103 ASM_OUTPUT_LABEL (stream, name);
18104 }
18105
18106 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18107
18108 void
18109 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18110 {
18111 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18112 const char *value = IDENTIFIER_POINTER (target);
18113 aarch64_asm_output_variant_pcs (stream, decl, name);
18114 ASM_OUTPUT_DEF (stream, name, value);
18115 }
18116
18117 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18118 function symbol references. */
18119
18120 void
18121 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18122 {
18123 default_elf_asm_output_external (stream, decl, name);
18124 aarch64_asm_output_variant_pcs (stream, decl, name);
18125 }
18126
18127 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18128 Used to output the .cfi_b_key_frame directive when signing the current
18129 function with the B key. */
18130
18131 void
18132 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18133 {
18134 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18135 && aarch64_ra_sign_key == AARCH64_KEY_B)
18136 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18137 }
18138
18139 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18140
18141 static void
18142 aarch64_start_file (void)
18143 {
18144 struct cl_target_option *default_options
18145 = TREE_TARGET_OPTION (target_option_default_node);
18146
18147 const struct processor *default_arch
18148 = aarch64_get_arch (default_options->x_explicit_arch);
18149 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18150 std::string extension
18151 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18152 default_arch->flags);
18153
18154 aarch64_last_printed_arch_string = default_arch->name + extension;
18155 aarch64_last_printed_tune_string = "";
18156 asm_fprintf (asm_out_file, "\t.arch %s\n",
18157 aarch64_last_printed_arch_string.c_str ());
18158
18159 default_file_start ();
18160 }
18161
18162 /* Emit load exclusive. */
18163
18164 static void
18165 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18166 rtx mem, rtx model_rtx)
18167 {
18168 if (mode == TImode)
18169 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18170 gen_highpart (DImode, rval),
18171 mem, model_rtx));
18172 else
18173 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18174 }
18175
18176 /* Emit store exclusive. */
18177
18178 static void
18179 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18180 rtx mem, rtx rval, rtx model_rtx)
18181 {
18182 if (mode == TImode)
18183 emit_insn (gen_aarch64_store_exclusive_pair
18184 (bval, mem, operand_subword (rval, 0, 0, TImode),
18185 operand_subword (rval, 1, 0, TImode), model_rtx));
18186 else
18187 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18188 }
18189
18190 /* Mark the previous jump instruction as unlikely. */
18191
18192 static void
18193 aarch64_emit_unlikely_jump (rtx insn)
18194 {
18195 rtx_insn *jump = emit_jump_insn (insn);
18196 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18197 }
18198
18199 /* We store the names of the various atomic helpers in a 5x4 array.
18200 Return the libcall function given MODE, MODEL and NAMES. */
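/* For example, an SImode compare-and-swap with MEMMODEL_SEQ_CST resolves to
   the "__aarch64_cas4_acq_rel" helper.  */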
18201
18202 rtx
18203 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18204 const atomic_ool_names *names)
18205 {
18206 memmodel model = memmodel_base (INTVAL (model_rtx));
18207 int mode_idx, model_idx;
18208
18209 switch (mode)
18210 {
18211 case E_QImode:
18212 mode_idx = 0;
18213 break;
18214 case E_HImode:
18215 mode_idx = 1;
18216 break;
18217 case E_SImode:
18218 mode_idx = 2;
18219 break;
18220 case E_DImode:
18221 mode_idx = 3;
18222 break;
18223 case E_TImode:
18224 mode_idx = 4;
18225 break;
18226 default:
18227 gcc_unreachable ();
18228 }
18229
18230 switch (model)
18231 {
18232 case MEMMODEL_RELAXED:
18233 model_idx = 0;
18234 break;
18235 case MEMMODEL_CONSUME:
18236 case MEMMODEL_ACQUIRE:
18237 model_idx = 1;
18238 break;
18239 case MEMMODEL_RELEASE:
18240 model_idx = 2;
18241 break;
18242 case MEMMODEL_ACQ_REL:
18243 case MEMMODEL_SEQ_CST:
18244 model_idx = 3;
18245 break;
18246 default:
18247 gcc_unreachable ();
18248 }
18249
18250 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18251 VISIBILITY_HIDDEN);
18252 }
18253
18254 #define DEF0(B, N) \
18255 { "__aarch64_" #B #N "_relax", \
18256 "__aarch64_" #B #N "_acq", \
18257 "__aarch64_" #B #N "_rel", \
18258 "__aarch64_" #B #N "_acq_rel" }
18259
18260 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18261 { NULL, NULL, NULL, NULL }
18262 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18263
18264 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18265 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18266 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18267 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18268 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18269 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18270
18271 #undef DEF0
18272 #undef DEF4
18273 #undef DEF5
18274
18275 /* Expand a compare and swap pattern. */
18276
18277 void
18278 aarch64_expand_compare_and_swap (rtx operands[])
18279 {
18280 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18281 machine_mode mode, r_mode;
18282
18283 bval = operands[0];
18284 rval = operands[1];
18285 mem = operands[2];
18286 oldval = operands[3];
18287 newval = operands[4];
18288 is_weak = operands[5];
18289 mod_s = operands[6];
18290 mod_f = operands[7];
18291 mode = GET_MODE (mem);
18292
18293 /* Normally the succ memory model must be stronger than fail, but in the
18294 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18295 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18296 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18297 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18298 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18299
18300 r_mode = mode;
18301 if (mode == QImode || mode == HImode)
18302 {
18303 r_mode = SImode;
18304 rval = gen_reg_rtx (r_mode);
18305 }
18306
18307 if (TARGET_LSE)
18308 {
18309 /* The CAS insn requires oldval and rval overlap, but we need to
18310 have a copy of oldval saved across the operation to tell if
18311 the operation is successful. */
18312 if (reg_overlap_mentioned_p (rval, oldval))
18313 rval = copy_to_mode_reg (r_mode, oldval);
18314 else
18315 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18316
18317 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18318 newval, mod_s));
18319 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18320 }
18321 else if (TARGET_OUTLINE_ATOMICS)
18322 {
18323 /* Oldval must satisfy compare afterward. */
18324 if (!aarch64_plus_operand (oldval, mode))
18325 oldval = force_reg (mode, oldval);
18326 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18327 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18328 oldval, mode, newval, mode,
18329 XEXP (mem, 0), Pmode);
18330 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18331 }
18332 else
18333 {
18334 /* The oldval predicate varies by mode. Test it and force to reg. */
18335 insn_code code = code_for_aarch64_compare_and_swap (mode);
18336 if (!insn_data[code].operand[2].predicate (oldval, mode))
18337 oldval = force_reg (mode, oldval);
18338
18339 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18340 is_weak, mod_s, mod_f));
18341 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18342 }
18343
18344 if (r_mode != mode)
18345 rval = gen_lowpart (mode, rval);
18346 emit_move_insn (operands[1], rval);
18347
18348 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18349 emit_insn (gen_rtx_SET (bval, x));
18350 }
18351
18352 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
18353 sequence implementing an atomic operation. */
18354
18355 static void
18356 aarch64_emit_post_barrier (enum memmodel model)
18357 {
18358 const enum memmodel base_model = memmodel_base (model);
18359
18360 if (is_mm_sync (model)
18361 && (base_model == MEMMODEL_ACQUIRE
18362 || base_model == MEMMODEL_ACQ_REL
18363 || base_model == MEMMODEL_SEQ_CST))
18364 {
18365 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18366 }
18367 }
18368
18369 /* Split a compare and swap pattern. */
18370
18371 void
18372 aarch64_split_compare_and_swap (rtx operands[])
18373 {
18374 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18375 machine_mode mode;
18376 bool is_weak;
18377 rtx_code_label *label1, *label2;
18378 enum memmodel model;
18379
18380 rval = operands[0];
18381 mem = operands[1];
18382 oldval = operands[2];
18383 newval = operands[3];
18384 is_weak = (operands[4] != const0_rtx);
18385 model_rtx = operands[5];
18386 scratch = operands[7];
18387 mode = GET_MODE (mem);
18388 model = memmodel_from_int (INTVAL (model_rtx));
18389
18390 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18391 loop:
18392 .label1:
18393 LD[A]XR rval, [mem]
18394 CBNZ rval, .label2
18395 ST[L]XR scratch, newval, [mem]
18396 CBNZ scratch, .label1
18397 .label2:
18398 CMP rval, 0. */
18399 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
18400 && oldval == const0_rtx && mode != TImode);
18401
18402 label1 = NULL;
18403 if (!is_weak)
18404 {
18405 label1 = gen_label_rtx ();
18406 emit_label (label1);
18407 }
18408 label2 = gen_label_rtx ();
18409
18410 /* The initial load can be relaxed for a __sync operation since a final
18411 barrier will be emitted to stop code hoisting. */
18412 if (is_mm_sync (model))
18413 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18414 else
18415 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18416
18417 if (strong_zero_p)
18418 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18419 else
18420 {
18421 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18422 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18423 }
18424 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18425 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18426 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18427
18428 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18429
18430 if (!is_weak)
18431 {
18432 if (aarch64_track_speculation)
18433 {
18434 /* Emit an explicit compare instruction, so that we can correctly
18435 track the condition codes. */
18436 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18437 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18438 }
18439 else
18440 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18441
18442 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18443 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18444 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18445 }
18446 else
18447 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18448
18449 emit_label (label2);
18450
18451 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18452 to set the condition flags. If this is not used it will be removed by
18453 later passes. */
18454 if (strong_zero_p)
18455 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18456
18457 /* Emit any final barrier needed for a __sync operation. */
18458 if (is_mm_sync (model))
18459 aarch64_emit_post_barrier (model);
18460 }
18461
18462 /* Split an atomic operation. */
18463
18464 void
18465 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18466 rtx value, rtx model_rtx, rtx cond)
18467 {
18468 machine_mode mode = GET_MODE (mem);
18469 machine_mode wmode = (mode == DImode ? DImode : SImode);
18470 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18471 const bool is_sync = is_mm_sync (model);
18472 rtx_code_label *label;
18473 rtx x;
18474
18475 /* Split the atomic operation into a sequence. */
18476 label = gen_label_rtx ();
18477 emit_label (label);
18478
18479 if (new_out)
18480 new_out = gen_lowpart (wmode, new_out);
18481 if (old_out)
18482 old_out = gen_lowpart (wmode, old_out);
18483 else
18484 old_out = new_out;
18485 value = simplify_gen_subreg (wmode, value, mode, 0);
18486
18487 /* The initial load can be relaxed for a __sync operation since a final
18488 barrier will be emitted to stop code hoisting. */
18489 if (is_sync)
18490 aarch64_emit_load_exclusive (mode, old_out, mem,
18491 GEN_INT (MEMMODEL_RELAXED));
18492 else
18493 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18494
18495 switch (code)
18496 {
18497 case SET:
18498 new_out = value;
18499 break;
18500
18501 case NOT:
18502 x = gen_rtx_AND (wmode, old_out, value);
18503 emit_insn (gen_rtx_SET (new_out, x));
18504 x = gen_rtx_NOT (wmode, new_out);
18505 emit_insn (gen_rtx_SET (new_out, x));
18506 break;
18507
18508 case MINUS:
18509 if (CONST_INT_P (value))
18510 {
18511 value = GEN_INT (-INTVAL (value));
18512 code = PLUS;
18513 }
18514 /* Fall through. */
18515
18516 default:
18517 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18518 emit_insn (gen_rtx_SET (new_out, x));
18519 break;
18520 }
18521
18522 aarch64_emit_store_exclusive (mode, cond, mem,
18523 gen_lowpart (mode, new_out), model_rtx);
18524
18525 if (aarch64_track_speculation)
18526 {
18527 /* Emit an explicit compare instruction, so that we can correctly
18528 track the condition codes. */
18529 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18530 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18531 }
18532 else
18533 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18534
18535 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18536 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18537 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18538
18539 /* Emit any final barrier needed for a __sync operation. */
18540 if (is_sync)
18541 aarch64_emit_post_barrier (model);
18542 }
18543
18544 static void
18545 aarch64_init_libfuncs (void)
18546 {
18547 /* Half-precision float operations. The compiler handles all operations
18548 with NULL libfuncs by converting to SFmode. */
18549
18550 /* Conversions. */
18551 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18552 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18553
18554 /* Arithmetic. */
18555 set_optab_libfunc (add_optab, HFmode, NULL);
18556 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18557 set_optab_libfunc (smul_optab, HFmode, NULL);
18558 set_optab_libfunc (neg_optab, HFmode, NULL);
18559 set_optab_libfunc (sub_optab, HFmode, NULL);
18560
18561 /* Comparisons. */
18562 set_optab_libfunc (eq_optab, HFmode, NULL);
18563 set_optab_libfunc (ne_optab, HFmode, NULL);
18564 set_optab_libfunc (lt_optab, HFmode, NULL);
18565 set_optab_libfunc (le_optab, HFmode, NULL);
18566 set_optab_libfunc (ge_optab, HFmode, NULL);
18567 set_optab_libfunc (gt_optab, HFmode, NULL);
18568 set_optab_libfunc (unord_optab, HFmode, NULL);
18569 }
18570
18571 /* Target hook for c_mode_for_suffix. */
18572 static machine_mode
18573 aarch64_c_mode_for_suffix (char suffix)
18574 {
18575 if (suffix == 'q')
18576 return TFmode;
18577
18578 return VOIDmode;
18579 }
18580
18581 /* We can only represent floating point constants which will fit in
18582 "quarter-precision" values. These values are characterised by
18583 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
18584 by:
18585
18586 (-1)^s * (n/16) * 2^r
18587
18588 Where:
18589 's' is the sign bit.
18590 'n' is an integer in the range 16 <= n <= 31.
18591 'r' is an integer in the range -3 <= r <= 4. */
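/* For example, the smallest positive representable value is
   16/16 * 2^-3 = 0.125 and the largest is 31/16 * 2^4 = 31.0.  */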
18592
18593 /* Return true iff X can be represented by a quarter-precision
18594 floating point immediate operand. Note, we cannot represent 0.0. */
18595 bool
18596 aarch64_float_const_representable_p (rtx x)
18597 {
18598 /* This represents our current view of how many bits
18599 make up the mantissa. */
18600 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18601 int exponent;
18602 unsigned HOST_WIDE_INT mantissa, mask;
18603 REAL_VALUE_TYPE r, m;
18604 bool fail;
18605
18606 x = unwrap_const_vec_duplicate (x);
18607 if (!CONST_DOUBLE_P (x))
18608 return false;
18609
18610 if (GET_MODE (x) == VOIDmode
18611 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18612 return false;
18613
18614 r = *CONST_DOUBLE_REAL_VALUE (x);
18615
18616 /* We cannot represent infinities, NaNs or +/-zero. We won't
18617 know if we have +zero until we analyse the mantissa, but we
18618 can reject the other invalid values. */
18619 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18620 || REAL_VALUE_MINUS_ZERO (r))
18621 return false;
18622
18623 /* Extract exponent. */
18624 r = real_value_abs (&r);
18625 exponent = REAL_EXP (&r);
18626
18627 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18628 highest (sign) bit, with a fixed binary point at bit point_pos.
18629 m1 holds the low part of the mantissa, m2 the high part.
18630 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18631 bits for the mantissa, this can fail (low bits will be lost). */
18632 real_ldexp (&m, &r, point_pos - exponent);
18633 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18634
18635 /* If the low part of the mantissa has bits set we cannot represent
18636 the value. */
18637 if (w.ulow () != 0)
18638 return false;
18639 /* We have rejected the lower HOST_WIDE_INT, so update our
18640 understanding of how many bits lie in the mantissa and
18641 look only at the high HOST_WIDE_INT. */
18642 mantissa = w.elt (1);
18643 point_pos -= HOST_BITS_PER_WIDE_INT;
18644
18645 /* We can only represent values with a mantissa of the form 1.xxxx. */
18646 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18647 if ((mantissa & mask) != 0)
18648 return false;
18649
18650 /* Having filtered unrepresentable values, we may now remove all
18651 but the highest 5 bits. */
18652 mantissa >>= point_pos - 5;
18653
18654 /* We cannot represent the value 0.0, so reject it. This is handled
18655 elsewhere. */
18656 if (mantissa == 0)
18657 return false;
18658
18659 /* Then, as bit 4 is always set, we can mask it off, leaving
18660 the mantissa in the range [0, 15]. */
18661 mantissa &= ~(1 << 4);
18662 gcc_assert (mantissa <= 15);
18663
18664 /* GCC internally does not use IEEE754-like encoding (where normalized
18665 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18666 Our mantissa values are shifted 4 places to the left relative to
18667 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18668 by 5 places to correct for GCC's representation. */
18669 exponent = 5 - exponent;
18670
18671 return (exponent >= 0 && exponent <= 7);
18672 }
18673
18674 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18675 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18676 output MOVI/MVNI, ORR or BIC immediate. */
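/* For example, a V8HImode vector with every lane equal to 0x1200 is output
   as "movi\t%0.8h, 0x12, lsl 8".  */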
18677 char*
18678 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18679 enum simd_immediate_check which)
18680 {
18681 bool is_valid;
18682 static char templ[40];
18683 const char *mnemonic;
18684 const char *shift_op;
18685 unsigned int lane_count = 0;
18686 char element_char;
18687
18688 struct simd_immediate_info info;
18689
18690 /* This will return true to show const_vector is legal for use as either
18691 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18692 It will also update INFO to show how the immediate should be generated.
18693 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18694 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18695 gcc_assert (is_valid);
18696
18697 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18698 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18699
18700 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18701 {
18702 gcc_assert (info.insn == simd_immediate_info::MOV
18703 && info.u.mov.shift == 0);
18704 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18705 move immediate path. */
18706 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18707 info.u.mov.value = GEN_INT (0);
18708 else
18709 {
18710 const unsigned int buf_size = 20;
18711 char float_buf[buf_size] = {'\0'};
18712 real_to_decimal_for_mode (float_buf,
18713 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18714 buf_size, buf_size, 1, info.elt_mode);
18715
18716 if (lane_count == 1)
18717 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18718 else
18719 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18720 lane_count, element_char, float_buf);
18721 return templ;
18722 }
18723 }
18724
18725 gcc_assert (CONST_INT_P (info.u.mov.value));
18726
18727 if (which == AARCH64_CHECK_MOV)
18728 {
18729 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18730 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18731 ? "msl" : "lsl");
18732 if (lane_count == 1)
18733 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18734 mnemonic, UINTVAL (info.u.mov.value));
18735 else if (info.u.mov.shift)
18736 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18737 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18738 element_char, UINTVAL (info.u.mov.value), shift_op,
18739 info.u.mov.shift);
18740 else
18741 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18742 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18743 element_char, UINTVAL (info.u.mov.value));
18744 }
18745 else
18746 {
18747 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18748 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18749 if (info.u.mov.shift)
18750 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18751 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18752 element_char, UINTVAL (info.u.mov.value), "lsl",
18753 info.u.mov.shift);
18754 else
18755 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18756 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18757 element_char, UINTVAL (info.u.mov.value));
18758 }
18759 return templ;
18760 }
18761
18762 char*
18763 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18764 {
18765
18766 /* If a floating point number was passed and we want to use it in an
18767 integer mode, do the conversion to integer. */
18768 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18769 {
18770 unsigned HOST_WIDE_INT ival;
18771 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18772 gcc_unreachable ();
18773 immediate = gen_int_mode (ival, mode);
18774 }
18775
18776 machine_mode vmode;
18777 /* Use a 64-bit mode for everything except DI/DF mode, where we use
18778 a 128-bit vector mode. */
18779 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18780
18781 vmode = aarch64_simd_container_mode (mode, width);
18782 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18783 return aarch64_output_simd_mov_immediate (v_op, width);
18784 }
18785
18786 /* Return the output string to use for moving immediate CONST_VECTOR
18787 into an SVE register. */
18788
18789 char *
18790 aarch64_output_sve_mov_immediate (rtx const_vector)
18791 {
18792 static char templ[40];
18793 struct simd_immediate_info info;
18794 char element_char;
18795
18796 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18797 gcc_assert (is_valid);
18798
18799 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18800
18801 machine_mode vec_mode = GET_MODE (const_vector);
18802 if (aarch64_sve_pred_mode_p (vec_mode))
18803 {
18804 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18805 if (info.insn == simd_immediate_info::MOV)
18806 {
18807 gcc_assert (info.u.mov.value == const0_rtx);
18808 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18809 }
18810 else
18811 {
18812 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18813 unsigned int total_bytes;
18814 if (info.u.pattern == AARCH64_SV_ALL
18815 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18816 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18817 total_bytes / GET_MODE_SIZE (info.elt_mode));
18818 else
18819 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18820 svpattern_token (info.u.pattern));
18821 }
18822 return buf;
18823 }
18824
18825 if (info.insn == simd_immediate_info::INDEX)
18826 {
18827 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18828 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18829 element_char, INTVAL (info.u.index.base),
18830 INTVAL (info.u.index.step));
18831 return templ;
18832 }
18833
18834 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18835 {
18836 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18837 info.u.mov.value = GEN_INT (0);
18838 else
18839 {
18840 const int buf_size = 20;
18841 char float_buf[buf_size] = {};
18842 real_to_decimal_for_mode (float_buf,
18843 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18844 buf_size, buf_size, 1, info.elt_mode);
18845
18846 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18847 element_char, float_buf);
18848 return templ;
18849 }
18850 }
18851
18852 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18853 element_char, INTVAL (info.u.mov.value));
18854 return templ;
18855 }
18856
18857 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18858 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18859 pattern. */
18860
18861 char *
18862 aarch64_output_sve_ptrues (rtx const_unspec)
18863 {
18864 static char templ[40];
18865
18866 struct simd_immediate_info info;
18867 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18868 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18869
18870 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18871 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18872 svpattern_token (info.u.pattern));
18873 return templ;
18874 }
18875
18876 /* Split operands into moves from op[1] + op[2] into op[0]. */
18877
18878 void
18879 aarch64_split_combinev16qi (rtx operands[3])
18880 {
18881 unsigned int dest = REGNO (operands[0]);
18882 unsigned int src1 = REGNO (operands[1]);
18883 unsigned int src2 = REGNO (operands[2]);
18884 machine_mode halfmode = GET_MODE (operands[1]);
18885 unsigned int halfregs = REG_NREGS (operands[1]);
18886 rtx destlo, desthi;
18887
18888 gcc_assert (halfmode == V16QImode);
18889
18890 if (src1 == dest && src2 == dest + halfregs)
18891 {
18892 /* No-op move. Can't split to nothing; emit something. */
18893 emit_note (NOTE_INSN_DELETED);
18894 return;
18895 }
18896
18897 /* Preserve register attributes for variable tracking. */
18898 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18899 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18900 GET_MODE_SIZE (halfmode));
18901
18902 /* Special case of reversed high/low parts. */
18903 if (reg_overlap_mentioned_p (operands[2], destlo)
18904 && reg_overlap_mentioned_p (operands[1], desthi))
18905 {
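/* Swap the two source registers in place using the classic three-XOR
   trick, so no scratch register is needed.  */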
18906 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18907 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18908 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18909 }
18910 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18911 {
18912 /* Try to avoid unnecessary moves if part of the result
18913 is in the right place already. */
18914 if (src1 != dest)
18915 emit_move_insn (destlo, operands[1]);
18916 if (src2 != dest + halfregs)
18917 emit_move_insn (desthi, operands[2]);
18918 }
18919 else
18920 {
18921 if (src2 != dest + halfregs)
18922 emit_move_insn (desthi, operands[2]);
18923 if (src1 != dest)
18924 emit_move_insn (destlo, operands[1]);
18925 }
18926 }
18927
18928 /* vec_perm support. */
18929
18930 struct expand_vec_perm_d
18931 {
18932 rtx target, op0, op1;
18933 vec_perm_indices perm;
18934 machine_mode vmode;
18935 unsigned int vec_flags;
18936 bool one_vector_p;
18937 bool testing_p;
18938 };
18939
18940 /* Generate a variable permutation. */
18941
18942 static void
18943 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18944 {
18945 machine_mode vmode = GET_MODE (target);
18946 bool one_vector_p = rtx_equal_p (op0, op1);
18947
18948 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18949 gcc_checking_assert (GET_MODE (op0) == vmode);
18950 gcc_checking_assert (GET_MODE (op1) == vmode);
18951 gcc_checking_assert (GET_MODE (sel) == vmode);
18952 gcc_checking_assert (TARGET_SIMD);
18953
18954 if (one_vector_p)
18955 {
18956 if (vmode == V8QImode)
18957 {
18958 /* Expand the argument to a V16QI mode by duplicating it. */
18959 rtx pair = gen_reg_rtx (V16QImode);
18960 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18961 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18962 }
18963 else
18964 {
18965 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18966 }
18967 }
18968 else
18969 {
18970 rtx pair;
18971
18972 if (vmode == V8QImode)
18973 {
18974 pair = gen_reg_rtx (V16QImode);
18975 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
18976 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18977 }
18978 else
18979 {
18980 pair = gen_reg_rtx (OImode);
18981 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
18982 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
18983 }
18984 }
18985 }
18986
18987 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
18988 NELT is the number of elements in the vector. */
18989
18990 void
18991 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
18992 unsigned int nelt)
18993 {
18994 machine_mode vmode = GET_MODE (target);
18995 bool one_vector_p = rtx_equal_p (op0, op1);
18996 rtx mask;
18997
18998 /* The TBL instruction does not use a modulo index, so we must take care
18999 of that ourselves. */
19000 mask = aarch64_simd_gen_const_vector_dup (vmode,
19001 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19002 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19003
19004 /* For big-endian, we also need to reverse the index within the vector
19005 (but not which vector). */
19006 if (BYTES_BIG_ENDIAN)
19007 {
19008 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19009 if (!one_vector_p)
19010 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19011 sel = expand_simple_binop (vmode, XOR, sel, mask,
19012 NULL, 0, OPTAB_LIB_WIDEN);
19013 }
19014 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19015 }
19016
19017 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19018
19019 static void
19020 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19021 {
19022 emit_insn (gen_rtx_SET (target,
19023 gen_rtx_UNSPEC (GET_MODE (target),
19024 gen_rtvec (2, op0, op1), code)));
19025 }
19026
19027 /* Expand an SVE vec_perm with the given operands. */
19028
19029 void
19030 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19031 {
19032 machine_mode data_mode = GET_MODE (target);
19033 machine_mode sel_mode = GET_MODE (sel);
19034 /* Enforced by the pattern condition. */
19035 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19036
19037 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19038 size of the two value vectors, i.e. the upper bits of the indices
19039 are effectively ignored. SVE TBL instead produces 0 for any
19040 out-of-range indices, so we need to modulo all the vec_perm indices
19041 to ensure they are all in range. */
19042 rtx sel_reg = force_reg (sel_mode, sel);
19043
19044 /* Check if the sel only references the first values vector. */
19045 if (GET_CODE (sel) == CONST_VECTOR
19046 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19047 {
19048 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19049 return;
19050 }
19051
19052 /* Check if the two values vectors are the same. */
19053 if (rtx_equal_p (op0, op1))
19054 {
19055 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19056 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19057 NULL, 0, OPTAB_DIRECT);
19058 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19059 return;
19060 }
19061
19062 /* Run TBL for each value vector and combine the results. */
19063
19064 rtx res0 = gen_reg_rtx (data_mode);
19065 rtx res1 = gen_reg_rtx (data_mode);
19066 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19067 if (GET_CODE (sel) != CONST_VECTOR
19068 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19069 {
19070 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19071 2 * nunits - 1);
19072 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19073 NULL, 0, OPTAB_DIRECT);
19074 }
19075 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19076 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19077 NULL, 0, OPTAB_DIRECT);
19078 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19079 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19080 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19081 else
19082 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19083 }
19084
19085 /* Recognize patterns suitable for the TRN instructions. */
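/* For example, on a 4-element vector TRN1 corresponds to the permutation
   { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */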
19086 static bool
19087 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19088 {
19089 HOST_WIDE_INT odd;
19090 poly_uint64 nelt = d->perm.length ();
19091 rtx out, in0, in1, x;
19092 machine_mode vmode = d->vmode;
19093
19094 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19095 return false;
19096
19097 /* Note that these are little-endian tests.
19098 We correct for big-endian later. */
19099 if (!d->perm[0].is_constant (&odd)
19100 || (odd != 0 && odd != 1)
19101 || !d->perm.series_p (0, 2, odd, 2)
19102 || !d->perm.series_p (1, 2, nelt + odd, 2))
19103 return false;
19104
19105 /* Success! */
19106 if (d->testing_p)
19107 return true;
19108
19109 in0 = d->op0;
19110 in1 = d->op1;
19111 /* We don't need a big-endian lane correction for SVE; see the comment
19112 at the head of aarch64-sve.md for details. */
19113 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19114 {
19115 x = in0, in0 = in1, in1 = x;
19116 odd = !odd;
19117 }
19118 out = d->target;
19119
19120 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19121 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19122 return true;
19123 }
19124
19125 /* Recognize patterns suitable for the UZP instructions. */
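/* For example, with two V4SI inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, UZP1 produces { a0, a2, b0, b2 } (selector
   { 0, 2, 4, 6 }) and UZP2 produces { a1, a3, b1, b3 } (selector
   { 1, 3, 5, 7 }).  */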
19126 static bool
19127 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19128 {
19129 HOST_WIDE_INT odd;
19130 rtx out, in0, in1, x;
19131 machine_mode vmode = d->vmode;
19132
19133 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19134 return false;
19135
19136 /* Note that these are little-endian tests.
19137 We correct for big-endian later. */
19138 if (!d->perm[0].is_constant (&odd)
19139 || (odd != 0 && odd != 1)
19140 || !d->perm.series_p (0, 1, odd, 2))
19141 return false;
19142
19143 /* Success! */
19144 if (d->testing_p)
19145 return true;
19146
19147 in0 = d->op0;
19148 in1 = d->op1;
19149 /* We don't need a big-endian lane correction for SVE; see the comment
19150 at the head of aarch64-sve.md for details. */
19151 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19152 {
19153 x = in0, in0 = in1, in1 = x;
19154 odd = !odd;
19155 }
19156 out = d->target;
19157
19158 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19159 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19160 return true;
19161 }
19162
19163 /* Recognize patterns suitable for the ZIP instructions. */
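/* For example, with two V4SI inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, ZIP1 produces { a0, b0, a1, b1 } (selector
   { 0, 4, 1, 5 }) and ZIP2 produces { a2, b2, a3, b3 } (selector
   { 2, 6, 3, 7 }).  */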
19164 static bool
19165 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19166 {
19167 unsigned int high;
19168 poly_uint64 nelt = d->perm.length ();
19169 rtx out, in0, in1, x;
19170 machine_mode vmode = d->vmode;
19171
19172 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19173 return false;
19174
19175 /* Note that these are little-endian tests.
19176 We correct for big-endian later. */
19177 poly_uint64 first = d->perm[0];
19178 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19179 || !d->perm.series_p (0, 2, first, 1)
19180 || !d->perm.series_p (1, 2, first + nelt, 1))
19181 return false;
19182 high = maybe_ne (first, 0U);
19183
19184 /* Success! */
19185 if (d->testing_p)
19186 return true;
19187
19188 in0 = d->op0;
19189 in1 = d->op1;
19190 /* We don't need a big-endian lane correction for SVE; see the comment
19191 at the head of aarch64-sve.md for details. */
19192 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19193 {
19194 x = in0, in0 = in1, in1 = x;
19195 high = !high;
19196 }
19197 out = d->target;
19198
19199 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19200 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19201 return true;
19202 }
19203
19204 /* Recognize patterns for the EXT insn. */
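/* EXT concatenates the two input vectors and extracts a contiguous run
   of elements starting at a given position, so the selector must be
   { LOCATION, LOCATION + 1, ... }.  For example, with two V4SI inputs
   { a0, a1, a2, a3 } and { b0, b1, b2, b3 } and LOCATION == 3, the
   result is { a3, b0, b1, b2 }.  */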
19205
19206 static bool
19207 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19208 {
19209 HOST_WIDE_INT location;
19210 rtx offset;
19211
19212 /* The first element always refers to the first vector.
19213 Check if the extracted indices are increasing by one. */
19214 if (d->vec_flags == VEC_SVE_PRED
19215 || !d->perm[0].is_constant (&location)
19216 || !d->perm.series_p (0, 1, location, 1))
19217 return false;
19218
19219 /* Success! */
19220 if (d->testing_p)
19221 return true;
19222
19223 /* The case where (location == 0) is a no-op for both big- and little-endian,
19224 and is removed by the mid-end at optimization levels -O1 and higher.
19225
19226 We don't need a big-endian lane correction for SVE; see the comment
19227 at the head of aarch64-sve.md for details. */
19228 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19229 {
19230 /* After setup, we want the high elements of the first vector (stored
19231 at the LSB end of the register), and the low elements of the second
19232 vector (stored at the MSB end of the register). So swap. */
19233 std::swap (d->op0, d->op1);
19234 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19235 to_constant () is safe since this is restricted to Advanced SIMD
19236 vectors. */
19237 location = d->perm.length ().to_constant () - location;
19238 }
19239
19240 offset = GEN_INT (location);
19241 emit_set_insn (d->target,
19242 gen_rtx_UNSPEC (d->vmode,
19243 gen_rtvec (3, d->op0, d->op1, offset),
19244 UNSPEC_EXT));
19245 return true;
19246 }
19247
19248 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19249 within each 64-bit, 32-bit or 16-bit granule. */
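/* For example, REV32 on a V8HI vector { a0, a1, a2, a3, a4, a5, a6, a7 }
   swaps the 16-bit elements within each 32-bit word, giving
   { a1, a0, a3, a2, a5, a4, a7, a6 } (selector { 1, 0, 3, 2, 5, 4, 7, 6 }).  */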
19250
19251 static bool
19252 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19253 {
19254 HOST_WIDE_INT diff;
19255 unsigned int i, size, unspec;
19256 machine_mode pred_mode;
19257
19258 if (d->vec_flags == VEC_SVE_PRED
19259 || !d->one_vector_p
19260 || !d->perm[0].is_constant (&diff))
19261 return false;
19262
19263 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19264 if (size == 8)
19265 {
19266 unspec = UNSPEC_REV64;
19267 pred_mode = VNx2BImode;
19268 }
19269 else if (size == 4)
19270 {
19271 unspec = UNSPEC_REV32;
19272 pred_mode = VNx4BImode;
19273 }
19274 else if (size == 2)
19275 {
19276 unspec = UNSPEC_REV16;
19277 pred_mode = VNx8BImode;
19278 }
19279 else
19280 return false;
19281
19282 unsigned int step = diff + 1;
19283 for (i = 0; i < step; ++i)
19284 if (!d->perm.series_p (i, step, diff - i, step))
19285 return false;
19286
19287 /* Success! */
19288 if (d->testing_p)
19289 return true;
19290
19291 if (d->vec_flags == VEC_SVE_DATA)
19292 {
19293 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19294 rtx target = gen_reg_rtx (int_mode);
19295 if (BYTES_BIG_ENDIAN)
19296 /* The act of taking a subreg between INT_MODE and d->vmode
19297 is itself a reversing operation on big-endian targets;
19298 see the comment at the head of aarch64-sve.md for details.
19299 First reinterpret OP0 as INT_MODE without using a subreg
19300 and without changing the contents. */
19301 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19302 else
19303 {
19304 /* For SVE we use REV[BHW] unspecs derived from the element size
19305 of d->vmode and vector modes whose elements have SIZE bytes.
19306 This ensures that the vector modes match the predicate modes. */
19307 int unspec = aarch64_sve_rev_unspec (d->vmode);
19308 rtx pred = aarch64_ptrue_reg (pred_mode);
19309 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19310 gen_lowpart (int_mode, d->op0)));
19311 }
19312 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19313 return true;
19314 }
19315 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19316 emit_set_insn (d->target, src);
19317 return true;
19318 }
19319
19320 /* Recognize patterns for the REV insn, which reverses elements within
19321 a full vector. */
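/* For example, on a vector of N elements the selector must be
   { N-1, N-2, ..., 1, 0 }.  This is only used for SVE, where the REV
   instruction reverses a full (variable-length) vector.  */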
19322
19323 static bool
19324 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19325 {
19326 poly_uint64 nelt = d->perm.length ();
19327
19328 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19329 return false;
19330
19331 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19332 return false;
19333
19334 /* Success! */
19335 if (d->testing_p)
19336 return true;
19337
19338 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19339 emit_set_insn (d->target, src);
19340 return true;
19341 }
19342
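/* Recognize broadcast patterns, in which every element of the result is
   a copy of a single element of the first input vector; these map to the
   DUP (element) instructions.  */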
19343 static bool
19344 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19345 {
19346 rtx out = d->target;
19347 rtx in0;
19348 HOST_WIDE_INT elt;
19349 machine_mode vmode = d->vmode;
19350 rtx lane;
19351
19352 if (d->vec_flags == VEC_SVE_PRED
19353 || d->perm.encoding ().encoded_nelts () != 1
19354 || !d->perm[0].is_constant (&elt))
19355 return false;
19356
19357 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19358 return false;
19359
19360 /* Success! */
19361 if (d->testing_p)
19362 return true;
19363
19364 /* The generic preparation in aarch64_expand_vec_perm_const_1
19365 swaps the operand order and the permute indices if it finds
19366 d->perm[0] to be in the second operand. Thus, we can always
19367 use d->op0 and need not do any extra arithmetic to get the
19368 correct lane number. */
19369 in0 = d->op0;
19370 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19371
19372 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19373 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19374 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19375 return true;
19376 }
19377
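/* Fall back to a general permute using a single Advanced SIMD TBL
   instruction, with a constant selector vector of byte indices into the
   concatenation of the two input vectors.  */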
19378 static bool
19379 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19380 {
19381 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19382 machine_mode vmode = d->vmode;
19383
19384 /* Make sure that the indices are constant. */
19385 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19386 for (unsigned int i = 0; i < encoded_nelts; ++i)
19387 if (!d->perm[i].is_constant ())
19388 return false;
19389
19390 if (d->testing_p)
19391 return true;
19392
19393 /* Generic code will try constant permutation twice: once with the
19394 original mode and again with the elements lowered to QImode.
19395 So wait and don't do the selector expansion ourselves. */
19396 if (vmode != V8QImode && vmode != V16QImode)
19397 return false;
19398
19399 /* to_constant is safe since this routine is specific to Advanced SIMD
19400 vectors. */
19401 unsigned int nelt = d->perm.length ().to_constant ();
19402 for (unsigned int i = 0; i < nelt; ++i)
19403 /* If big-endian and two vectors we end up with a weird mixed-endian
19404 mode on NEON. Reverse the index within each word but not the word
19405 itself. to_constant is safe because we checked is_constant above. */
19406 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19407 ? d->perm[i].to_constant () ^ (nelt - 1)
19408 : d->perm[i].to_constant ());
19409
19410 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19411 sel = force_reg (vmode, sel);
19412
19413 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19414 return true;
19415 }
19416
19417 /* Try to implement D using an SVE TBL instruction. */
19418
19419 static bool
19420 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19421 {
19422 unsigned HOST_WIDE_INT nelt;
19423
19424 /* Permuting two variable-length vectors could overflow the
19425 index range. */
19426 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19427 return false;
19428
19429 if (d->testing_p)
19430 return true;
19431
19432 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19433 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19434 if (d->one_vector_p)
19435 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19436 else
19437 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19438 return true;
19439 }
19440
19441 /* Try to implement D using the SVE SEL instruction. */
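/* This handles permutations in which every element keeps its own lane
   but can come from either input, e.g. selector { 0, 5, 2, 7 } for
   4-element vectors, which takes lanes 0 and 2 from the first input and
   lanes 1 and 3 from the second.  Such a permutation is a simple
   predicated select.  */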
19442
19443 static bool
19444 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19445 {
19446 machine_mode vmode = d->vmode;
19447 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19448
19449 if (d->vec_flags != VEC_SVE_DATA
19450 || unit_size > 8)
19451 return false;
19452
19453 int n_patterns = d->perm.encoding ().npatterns ();
19454 poly_int64 vec_len = d->perm.length ();
19455
19456 for (int i = 0; i < n_patterns; ++i)
19457 if (!known_eq (d->perm[i], i)
19458 && !known_eq (d->perm[i], vec_len + i))
19459 return false;
19460
19461 for (int i = n_patterns; i < n_patterns * 2; i++)
19462 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19463 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19464 return false;
19465
19466 if (d->testing_p)
19467 return true;
19468
19469 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19470
19471 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19472 for (int i = 0; i < n_patterns * 2; i++)
19473 {
19474 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19475 : CONST0_RTX (BImode);
19476 builder.quick_push (elem);
19477 }
19478
19479 rtx const_vec = builder.build ();
19480 rtx pred = force_reg (pred_mode, const_vec);
19481 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
19482 return true;
19483 }
19484
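/* Try to expand the constant permutation described by D, trying each of
   the special-case recognizers above before falling back to a general
   TBL sequence.  Return true on success.  */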
19485 static bool
19486 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19487 {
19488 /* The pattern matching functions above are written to look for a small
19489 number to begin the sequence (0, 1, N/2). If we begin with an index
19490 from the second operand, we can swap the operands. */
19491 poly_int64 nelt = d->perm.length ();
19492 if (known_ge (d->perm[0], nelt))
19493 {
19494 d->perm.rotate_inputs (1);
19495 std::swap (d->op0, d->op1);
19496 }
19497
19498 if ((d->vec_flags == VEC_ADVSIMD
19499 || d->vec_flags == VEC_SVE_DATA
19500 || d->vec_flags == VEC_SVE_PRED)
19501 && known_gt (nelt, 1))
19502 {
19503 if (aarch64_evpc_rev_local (d))
19504 return true;
19505 else if (aarch64_evpc_rev_global (d))
19506 return true;
19507 else if (aarch64_evpc_ext (d))
19508 return true;
19509 else if (aarch64_evpc_dup (d))
19510 return true;
19511 else if (aarch64_evpc_zip (d))
19512 return true;
19513 else if (aarch64_evpc_uzp (d))
19514 return true;
19515 else if (aarch64_evpc_trn (d))
19516 return true;
19517 else if (aarch64_evpc_sel (d))
19518 return true;
19519 if (d->vec_flags == VEC_SVE_DATA)
19520 return aarch64_evpc_sve_tbl (d);
19521 else if (d->vec_flags == VEC_ADVSIMD)
19522 return aarch64_evpc_tbl (d);
19523 }
19524 return false;
19525 }
19526
19527 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19528
19529 static bool
19530 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19531 rtx op1, const vec_perm_indices &sel)
19532 {
19533 struct expand_vec_perm_d d;
19534
19535 /* Check whether the mask can be applied to a single vector. */
19536 if (sel.ninputs () == 1
19537 || (op0 && rtx_equal_p (op0, op1)))
19538 d.one_vector_p = true;
19539 else if (sel.all_from_input_p (0))
19540 {
19541 d.one_vector_p = true;
19542 op1 = op0;
19543 }
19544 else if (sel.all_from_input_p (1))
19545 {
19546 d.one_vector_p = true;
19547 op0 = op1;
19548 }
19549 else
19550 d.one_vector_p = false;
19551
19552 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19553 sel.nelts_per_input ());
19554 d.vmode = vmode;
19555 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19556 d.target = target;
19557 d.op0 = op0;
19558 d.op1 = op1;
19559 d.testing_p = !target;
19560
19561 if (!d.testing_p)
19562 return aarch64_expand_vec_perm_const_1 (&d);
19563
19564 rtx_insn *last = get_last_insn ();
19565 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19566 gcc_assert (last == get_last_insn ());
19567
19568 return ret;
19569 }
19570
19571 /* Generate a byte permute mask for a register of mode MODE,
19572 which has NUNITS units. */
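/* For example, for V4SImode (NUNITS == 4, 4-byte units) the mask is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. the
   bytes within each element are reversed.  */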
19573
19574 rtx
19575 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19576 {
19577 /* We have to reverse each vector because we don't have
19578 a permuted load that can reverse-load according to ABI rules. */
19579 rtx mask;
19580 rtvec v = rtvec_alloc (16);
19581 unsigned int i, j;
19582 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19583
19584 gcc_assert (BYTES_BIG_ENDIAN);
19585 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19586
19587 for (i = 0; i < nunits; i++)
19588 for (j = 0; j < usize; j++)
19589 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19590 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19591 return force_reg (V16QImode, mask);
19592 }
19593
19594 /* Expand an SVE integer comparison using the SVE equivalent of:
19595
19596 (set TARGET (CODE OP0 OP1)). */
19597
19598 void
19599 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19600 {
19601 machine_mode pred_mode = GET_MODE (target);
19602 machine_mode data_mode = GET_MODE (op0);
19603 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19604 op0, op1);
19605 if (!rtx_equal_p (target, res))
19606 emit_move_insn (target, res);
19607 }
19608
19609 /* Return the UNSPEC_COND_* code for comparison CODE. */
19610
19611 static unsigned int
19612 aarch64_unspec_cond_code (rtx_code code)
19613 {
19614 switch (code)
19615 {
19616 case NE:
19617 return UNSPEC_COND_FCMNE;
19618 case EQ:
19619 return UNSPEC_COND_FCMEQ;
19620 case LT:
19621 return UNSPEC_COND_FCMLT;
19622 case GT:
19623 return UNSPEC_COND_FCMGT;
19624 case LE:
19625 return UNSPEC_COND_FCMLE;
19626 case GE:
19627 return UNSPEC_COND_FCMGE;
19628 case UNORDERED:
19629 return UNSPEC_COND_FCMUO;
19630 default:
19631 gcc_unreachable ();
19632 }
19633 }
19634
19635 /* Emit:
19636
19637 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19638
19639 where <X> is the operation associated with comparison CODE.
19640 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19641
19642 static void
19643 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19644 bool known_ptrue_p, rtx op0, rtx op1)
19645 {
19646 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19647 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19648 gen_rtvec (4, pred, flag, op0, op1),
19649 aarch64_unspec_cond_code (code));
19650 emit_set_insn (target, unspec);
19651 }
19652
19653 /* Emit the SVE equivalent of:
19654
19655 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19656 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19657 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19658
19659 where <Xi> is the operation associated with comparison CODEi.
19660 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19661
19662 static void
19663 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19664 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19665 {
19666 machine_mode pred_mode = GET_MODE (pred);
19667 rtx tmp1 = gen_reg_rtx (pred_mode);
19668 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19669 rtx tmp2 = gen_reg_rtx (pred_mode);
19670 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19671 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19672 }
19673
19674 /* Emit the SVE equivalent of:
19675
19676 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19677 (set TARGET (not TMP))
19678
19679 where <X> is the operation associated with comparison CODE.
19680 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19681
19682 static void
19683 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19684 bool known_ptrue_p, rtx op0, rtx op1)
19685 {
19686 machine_mode pred_mode = GET_MODE (pred);
19687 rtx tmp = gen_reg_rtx (pred_mode);
19688 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19689 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19690 }
19691
19692 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19693
19694 (set TARGET (CODE OP0 OP1))
19695
19696 If CAN_INVERT_P is true, the caller can also handle inverted results;
19697 return true if the result is in fact inverted. */
19698
19699 bool
19700 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19701 rtx op0, rtx op1, bool can_invert_p)
19702 {
19703 machine_mode pred_mode = GET_MODE (target);
19704 machine_mode data_mode = GET_MODE (op0);
19705
19706 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19707 switch (code)
19708 {
19709 case UNORDERED:
19710 /* UNORDERED has no immediate form. */
19711 op1 = force_reg (data_mode, op1);
19712 /* fall through */
19713 case LT:
19714 case LE:
19715 case GT:
19716 case GE:
19717 case EQ:
19718 case NE:
19719 {
19720 /* There is native support for the comparison. */
19721 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19722 return false;
19723 }
19724
19725 case LTGT:
19726 /* This is a trapping operation (LT or GT). */
19727 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19728 return false;
19729
19730 case UNEQ:
19731 if (!flag_trapping_math)
19732 {
19733 /* This would trap for signaling NaNs. */
19734 op1 = force_reg (data_mode, op1);
19735 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19736 ptrue, true, op0, op1);
19737 return false;
19738 }
19739 /* fall through */
19740 case UNLT:
19741 case UNLE:
19742 case UNGT:
19743 case UNGE:
19744 if (flag_trapping_math)
19745 {
19746 /* Work out which elements are ordered. */
19747 rtx ordered = gen_reg_rtx (pred_mode);
19748 op1 = force_reg (data_mode, op1);
19749 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19750 ptrue, true, op0, op1);
19751
19752 /* Test the opposite condition for the ordered elements,
19753 then invert the result. */
19754 if (code == UNEQ)
19755 code = NE;
19756 else
19757 code = reverse_condition_maybe_unordered (code);
19758 if (can_invert_p)
19759 {
19760 aarch64_emit_sve_fp_cond (target, code,
19761 ordered, false, op0, op1);
19762 return true;
19763 }
19764 aarch64_emit_sve_invert_fp_cond (target, code,
19765 ordered, false, op0, op1);
19766 return false;
19767 }
19768 break;
19769
19770 case ORDERED:
19771 /* ORDERED has no immediate form. */
19772 op1 = force_reg (data_mode, op1);
19773 break;
19774
19775 default:
19776 gcc_unreachable ();
19777 }
19778
19779 /* There is native support for the inverse comparison. */
19780 code = reverse_condition_maybe_unordered (code);
19781 if (can_invert_p)
19782 {
19783 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19784 return true;
19785 }
19786 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19787 return false;
19788 }
19789
19790 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19791 of the data being selected and CMP_MODE is the mode of the values being
19792 compared. */
19793
19794 void
19795 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19796 rtx *ops)
19797 {
19798 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19799 rtx pred = gen_reg_rtx (pred_mode);
19800 if (FLOAT_MODE_P (cmp_mode))
19801 {
19802 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19803 ops[4], ops[5], true))
19804 std::swap (ops[1], ops[2]);
19805 }
19806 else
19807 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19808
19809 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19810 ops[1] = force_reg (data_mode, ops[1]);
19811 /* The "false" value can only be zero if the "true" value is a constant. */
19812 if (register_operand (ops[1], data_mode)
19813 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19814 ops[2] = force_reg (data_mode, ops[2]);
19815
19816 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19817 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19818 }
19819
19820 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19821 true. However, due to issues with register allocation it is preferable
19822 to avoid tying integer scalar and FP scalar modes. Executing integer
19823 operations in general registers is better than treating them as scalar
19824 vector operations. This reduces latency and avoids redundant int<->FP
19825 moves. So tie modes if they are either in the same class, or are vector
19826 modes paired with other vector modes, vector structs or any scalar mode. */
19827
19828 static bool
19829 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19830 {
19831 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19832 return true;
19833
19834 /* We specifically want to allow elements of "structure" modes to
19835 be tieable to the structure. This more general condition allows
19836 other rarer situations too. The reason we don't extend this to
19837 predicate modes is that there are no predicate structure modes
19838 nor any specific instructions for extracting part of a predicate
19839 register. */
19840 if (aarch64_vector_data_mode_p (mode1)
19841 && aarch64_vector_data_mode_p (mode2))
19842 return true;
19843
19844 /* Also allow any scalar modes with vectors. */
19845 if (aarch64_vector_mode_supported_p (mode1)
19846 || aarch64_vector_mode_supported_p (mode2))
19847 return true;
19848
19849 return false;
19850 }
19851
19852 /* Return a new RTX holding the result of moving POINTER forward by
19853 AMOUNT bytes. */
19854
19855 static rtx
19856 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19857 {
19858 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19859
19860 return adjust_automodify_address (pointer, GET_MODE (pointer),
19861 next, amount);
19862 }
19863
19864 /* Return a new RTX holding the result of moving POINTER forward by the
19865 size of the mode it points to. */
19866
19867 static rtx
19868 aarch64_progress_pointer (rtx pointer)
19869 {
19870 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19871 }
19872
19873 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19874 MODE bytes. */
19875
19876 static void
19877 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19878 machine_mode mode)
19879 {
19880 rtx reg = gen_reg_rtx (mode);
19881
19882 /* "Cast" the pointers to the correct mode. */
19883 *src = adjust_address (*src, mode, 0);
19884 *dst = adjust_address (*dst, mode, 0);
19885 /* Emit the memcpy. */
19886 emit_move_insn (reg, *src);
19887 emit_move_insn (*dst, reg);
19888 /* Move the pointers forward. */
19889 *src = aarch64_progress_pointer (*src);
19890 *dst = aarch64_progress_pointer (*dst);
19891 }
19892
19893 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19894 we succeed, otherwise return false. */
19895
19896 bool
19897 aarch64_expand_cpymem (rtx *operands)
19898 {
19899 int n, mode_bits;
19900 rtx dst = operands[0];
19901 rtx src = operands[1];
19902 rtx base;
19903 machine_mode cur_mode = BLKmode, next_mode;
19904 bool speed_p = !optimize_function_for_size_p (cfun);
19905
19906 /* When optimizing for size, give a better estimate of the length of a
19907 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19908 will always require an even number of instructions to perform. And each
19909 operation requires both a load and a store, so divide the max number by 2. */
19910 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19911
19912 /* We can't do anything smart if the amount to copy is not constant. */
19913 if (!CONST_INT_P (operands[2]))
19914 return false;
19915
19916 n = INTVAL (operands[2]);
19917
19918 /* Try to keep the number of instructions low. For all cases we will do at
19919 most two moves for the residual amount, since we'll always overlap the
19920 remainder. */
19921 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19922 return false;
19923
19924 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19925 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19926
19927 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19928 src = adjust_automodify_address (src, VOIDmode, base, 0);
19929
19930 /* Convert n to bits to make the rest of the code simpler. */
19931 n = n * BITS_PER_UNIT;
19932
19933 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19934 larger than TImode, but we should not use them for loads/stores here. */
19935 const int copy_limit = GET_MODE_BITSIZE (TImode);
19936
19937 while (n > 0)
19938 {
19939 /* Find the largest mode in which to do the copy without over-reading
19940 or over-writing. */
19941 opt_scalar_int_mode mode_iter;
19942 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19943 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19944 cur_mode = mode_iter.require ();
19945
19946 gcc_assert (cur_mode != BLKmode);
19947
19948 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19949 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19950
19951 n -= mode_bits;
19952
19953 /* Do certain trailing copies as overlapping if it's going to be
19954 cheaper, i.e. fewer instructions to do so. For instance, for a 15
19955 byte copy it is more efficient to do two overlapping 8 byte copies than
19956 copies of 8 + 6 + 1 bytes. */
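/* Concretely, for a 15-byte copy the loop first emits an 8-byte (DImode)
   copy, leaving 7 bytes.  The remainder is then rounded back up to DImode
   and the pointers are moved back by one byte, so the second 8-byte copy
   overlaps the first by one byte.  */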
19957 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19958 {
19959 next_mode = smallest_mode_for_size (n, MODE_INT);
19960 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19961 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19962 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19963 n = n_bits;
19964 }
19965 }
19966
19967 return true;
19968 }
19969
19970 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
19971 SImode stores. Handle the case when the constant has identical
19972 bottom and top halves. This is beneficial when the two stores can be
19973 merged into an STP and we avoid synthesising potentially expensive
19974 immediates twice. Return true if such a split is possible. */
19975
19976 bool
19977 aarch64_split_dimode_const_store (rtx dst, rtx src)
19978 {
19979 rtx lo = gen_lowpart (SImode, src);
19980 rtx hi = gen_highpart_mode (SImode, DImode, src);
19981
19982 bool size_p = optimize_function_for_size_p (cfun);
19983
19984 if (!rtx_equal_p (lo, hi))
19985 return false;
19986
19987 unsigned int orig_cost
19988 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
19989 unsigned int lo_cost
19990 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
19991
19992 /* We want to transform:
19993 MOV x1, 49370
19994 MOVK x1, 0x140, lsl 16
19995 MOVK x1, 0xc0da, lsl 32
19996 MOVK x1, 0x140, lsl 48
19997 STR x1, [x0]
19998 into:
19999 MOV w1, 49370
20000 MOVK w1, 0x140, lsl 16
20001 STP w1, w1, [x0]
20002 So we want to perform this only when we save two instructions
20003 or more. When optimizing for size, however, accept any code size
20004 savings we can. */
20005 if (size_p && orig_cost <= lo_cost)
20006 return false;
20007
20008 if (!size_p
20009 && (orig_cost <= lo_cost + 1))
20010 return false;
20011
20012 rtx mem_lo = adjust_address (dst, SImode, 0);
20013 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20014 return false;
20015
20016 rtx tmp_reg = gen_reg_rtx (SImode);
20017 aarch64_expand_mov_immediate (tmp_reg, lo);
20018 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20019 /* Don't emit an explicit store pair as this may not always be profitable.
20020 Let the sched-fusion logic decide whether to merge them. */
20021 emit_move_insn (mem_lo, tmp_reg);
20022 emit_move_insn (mem_hi, tmp_reg);
20023
20024 return true;
20025 }
20026
20027 /* Generate RTL for a conditional branch with rtx comparison CODE in
20028 mode CC_MODE. The destination of the unlikely conditional branch
20029 is LABEL_REF. */
20030
20031 void
20032 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20033 rtx label_ref)
20034 {
20035 rtx x;
20036 x = gen_rtx_fmt_ee (code, VOIDmode,
20037 gen_rtx_REG (cc_mode, CC_REGNUM),
20038 const0_rtx);
20039
20040 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20041 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20042 pc_rtx);
20043 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20044 }
20045
20046 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20047
20048 OP1 represents the TImode destination operand 1
20049 OP2 represents the TImode destination operand 2
20050 LOW_DEST represents the low half (DImode) of TImode operand 0
20051 LOW_IN1 represents the low half (DImode) of TImode operand 1
20052 LOW_IN2 represents the low half (DImode) of TImode operand 2
20053 HIGH_DEST represents the high half (DImode) of TImode operand 0
20054 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20055 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20056
20057 void
20058 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20059 rtx *low_in1, rtx *low_in2,
20060 rtx *high_dest, rtx *high_in1,
20061 rtx *high_in2)
20062 {
20063 *low_dest = gen_reg_rtx (DImode);
20064 *low_in1 = gen_lowpart (DImode, op1);
20065 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20066 subreg_lowpart_offset (DImode, TImode));
20067 *high_dest = gen_reg_rtx (DImode);
20068 *high_in1 = gen_highpart (DImode, op1);
20069 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20070 subreg_highpart_offset (DImode, TImode));
20071 }
20072
20073 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20074
20075 This function differs from 'aarch64_addti_scratch_regs' in that
20076 OP1 can be an immediate constant (zero). We must call
20077 subreg_highpart_offset with DImode and TImode arguments, otherwise
20078 VOIDmode will be used for the const_int, which generates an internal
20079 error from subreg_size_highpart_offset, which does not expect a size of zero.
20080
20081 OP1 represents the TImode destination operand 1
20082 OP2 represents the TImode destination operand 2
20083 LOW_DEST represents the low half (DImode) of TImode operand 0
20084 LOW_IN1 represents the low half (DImode) of TImode operand 1
20085 LOW_IN2 represents the low half (DImode) of TImode operand 2
20086 HIGH_DEST represents the high half (DImode) of TImode operand 0
20087 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20088 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20089
20090
20091 void
20092 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20093 rtx *low_in1, rtx *low_in2,
20094 rtx *high_dest, rtx *high_in1,
20095 rtx *high_in2)
20096 {
20097 *low_dest = gen_reg_rtx (DImode);
20098 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20099 subreg_lowpart_offset (DImode, TImode));
20100
20101 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20102 subreg_lowpart_offset (DImode, TImode));
20103 *high_dest = gen_reg_rtx (DImode);
20104
20105 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20106 subreg_highpart_offset (DImode, TImode));
20107 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20108 subreg_highpart_offset (DImode, TImode));
20109 }
20110
20111 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20112
20113 OP0 represents the TImode destination operand 0
20114 LOW_DEST represents the low half (DImode) of TImode operand 0
20115 LOW_IN1 represents the low half (DImode) of TImode operand 1
20116 LOW_IN2 represents the low half (DImode) of TImode operand 2
20117 HIGH_DEST represents the high half (DImode) of TImode operand 0
20118 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20119 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20120 UNSIGNED_P is true if the operation is being performed on unsigned
20121 values. */
20122 void
20123 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20124 rtx low_in2, rtx high_dest, rtx high_in1,
20125 rtx high_in2, bool unsigned_p)
20126 {
20127 if (low_in2 == const0_rtx)
20128 {
20129 low_dest = low_in1;
20130 high_in2 = force_reg (DImode, high_in2);
20131 if (unsigned_p)
20132 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20133 else
20134 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20135 }
20136 else
20137 {
20138 if (CONST_INT_P (low_in2))
20139 {
20140 high_in2 = force_reg (DImode, high_in2);
20141 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20142 GEN_INT (-INTVAL (low_in2))));
20143 }
20144 else
20145 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20146
20147 if (unsigned_p)
20148 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20149 else
20150 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20151 }
20152
20153 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20154 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20155
20156 }
20157
20158 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
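/* AddressSanitizer computes the shadow address of a byte as
   (address >> 3) plus this offset, so the value returned here determines
   where the shadow memory region is placed for each ABI.  */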
20159
20160 static unsigned HOST_WIDE_INT
20161 aarch64_asan_shadow_offset (void)
20162 {
20163 if (TARGET_ILP32)
20164 return (HOST_WIDE_INT_1 << 29);
20165 else
20166 return (HOST_WIDE_INT_1 << 36);
20167 }
20168
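/* Implement TARGET_GEN_CCMP_FIRST.  Emit the first compare of a
   conditional-compare (CCMP) sequence for comparing TREEOP0 and TREEOP1
   with code CODE.  On success, store the preparation and compare
   sequences in *PREP_SEQ and *GEN_SEQ and return a comparison against
   the CC register; return NULL_RTX if the comparison cannot be
   handled.  */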
20169 static rtx
20170 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20171 int code, tree treeop0, tree treeop1)
20172 {
20173 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20174 rtx op0, op1;
20175 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20176 insn_code icode;
20177 struct expand_operand ops[4];
20178
20179 start_sequence ();
20180 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20181
20182 op_mode = GET_MODE (op0);
20183 if (op_mode == VOIDmode)
20184 op_mode = GET_MODE (op1);
20185
20186 switch (op_mode)
20187 {
20188 case E_QImode:
20189 case E_HImode:
20190 case E_SImode:
20191 cmp_mode = SImode;
20192 icode = CODE_FOR_cmpsi;
20193 break;
20194
20195 case E_DImode:
20196 cmp_mode = DImode;
20197 icode = CODE_FOR_cmpdi;
20198 break;
20199
20200 case E_SFmode:
20201 cmp_mode = SFmode;
20202 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20203 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20204 break;
20205
20206 case E_DFmode:
20207 cmp_mode = DFmode;
20208 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20209 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20210 break;
20211
20212 default:
20213 end_sequence ();
20214 return NULL_RTX;
20215 }
20216
20217 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20218 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20219 if (!op0 || !op1)
20220 {
20221 end_sequence ();
20222 return NULL_RTX;
20223 }
20224 *prep_seq = get_insns ();
20225 end_sequence ();
20226
20227 create_fixed_operand (&ops[0], op0);
20228 create_fixed_operand (&ops[1], op1);
20229
20230 start_sequence ();
20231 if (!maybe_expand_insn (icode, 2, ops))
20232 {
20233 end_sequence ();
20234 return NULL_RTX;
20235 }
20236 *gen_seq = get_insns ();
20237 end_sequence ();
20238
20239 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20240 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20241 }
20242
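/* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare that
   chains onto PREV, the comparison produced by an earlier
   TARGET_GEN_CCMP_FIRST/NEXT call, combining it with the comparison of
   TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the new
   comparison against the CC register, or NULL_RTX on failure.  */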
20243 static rtx
20244 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20245 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20246 {
20247 rtx op0, op1, target;
20248 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20249 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20250 insn_code icode;
20251 struct expand_operand ops[6];
20252 int aarch64_cond;
20253
20254 push_to_sequence (*prep_seq);
20255 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20256
20257 op_mode = GET_MODE (op0);
20258 if (op_mode == VOIDmode)
20259 op_mode = GET_MODE (op1);
20260
20261 switch (op_mode)
20262 {
20263 case E_QImode:
20264 case E_HImode:
20265 case E_SImode:
20266 cmp_mode = SImode;
20267 icode = CODE_FOR_ccmpsi;
20268 break;
20269
20270 case E_DImode:
20271 cmp_mode = DImode;
20272 icode = CODE_FOR_ccmpdi;
20273 break;
20274
20275 case E_SFmode:
20276 cmp_mode = SFmode;
20277 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20278 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
20279 break;
20280
20281 case E_DFmode:
20282 cmp_mode = DFmode;
20283 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20284 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
20285 break;
20286
20287 default:
20288 end_sequence ();
20289 return NULL_RTX;
20290 }
20291
20292 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20293 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20294 if (!op0 || !op1)
20295 {
20296 end_sequence ();
20297 return NULL_RTX;
20298 }
20299 *prep_seq = get_insns ();
20300 end_sequence ();
20301
20302 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20303 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20304
20305 if (bit_code != AND)
20306 {
20307 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
20308 GET_MODE (XEXP (prev, 0))),
20309 VOIDmode, XEXP (prev, 0), const0_rtx);
20310 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20311 }
20312
20313 create_fixed_operand (&ops[0], XEXP (prev, 0));
20314 create_fixed_operand (&ops[1], target);
20315 create_fixed_operand (&ops[2], op0);
20316 create_fixed_operand (&ops[3], op1);
20317 create_fixed_operand (&ops[4], prev);
20318 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20319
20320 push_to_sequence (*gen_seq);
20321 if (!maybe_expand_insn (icode, 6, ops))
20322 {
20323 end_sequence ();
20324 return NULL_RTX;
20325 }
20326
20327 *gen_seq = get_insns ();
20328 end_sequence ();
20329
20330 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
20331 }
20332
20333 #undef TARGET_GEN_CCMP_FIRST
20334 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20335
20336 #undef TARGET_GEN_CCMP_NEXT
20337 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20338
20339 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20340 instruction fusion of some sort. */
20341
20342 static bool
20343 aarch64_macro_fusion_p (void)
20344 {
20345 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20346 }
20347
20348
20349 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20350 should be kept together during scheduling. */
20351
20352 static bool
20353 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20354 {
20355 rtx set_dest;
20356 rtx prev_set = single_set (prev);
20357 rtx curr_set = single_set (curr);
20358 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20359 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20360
20361 if (!aarch64_macro_fusion_p ())
20362 return false;
20363
20364 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20365 {
20366 /* We are trying to match:
20367 prev (mov) == (set (reg r0) (const_int imm16))
20368 curr (movk) == (set (zero_extract (reg r0)
20369 (const_int 16)
20370 (const_int 16))
20371 (const_int imm16_1)) */
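/* In assembly terms this is a pair such as:
   mov  x0, #0x1234
   movk x0, #0x5678, lsl 16  */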
20372
20373 set_dest = SET_DEST (curr_set);
20374
20375 if (GET_CODE (set_dest) == ZERO_EXTRACT
20376 && CONST_INT_P (SET_SRC (curr_set))
20377 && CONST_INT_P (SET_SRC (prev_set))
20378 && CONST_INT_P (XEXP (set_dest, 2))
20379 && INTVAL (XEXP (set_dest, 2)) == 16
20380 && REG_P (XEXP (set_dest, 0))
20381 && REG_P (SET_DEST (prev_set))
20382 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20383 {
20384 return true;
20385 }
20386 }
20387
20388 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20389 {
20390
20391 /* We're trying to match:
20392 prev (adrp) == (set (reg r1)
20393 (high (symbol_ref ("SYM"))))
20394 curr (add) == (set (reg r0)
20395 (lo_sum (reg r1)
20396 (symbol_ref ("SYM"))))
20397 Note that r0 need not necessarily be the same as r1, especially
20398 during pre-regalloc scheduling. */
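/* In assembly terms this is a pair such as:
   adrp x1, sym
   add  x0, x1, :lo12:sym  */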
20399
20400 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20401 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20402 {
20403 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20404 && REG_P (XEXP (SET_SRC (curr_set), 0))
20405 && REGNO (XEXP (SET_SRC (curr_set), 0))
20406 == REGNO (SET_DEST (prev_set))
20407 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20408 XEXP (SET_SRC (curr_set), 1)))
20409 return true;
20410 }
20411 }
20412
20413 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20414 {
20415
20416 /* We're trying to match:
20417 prev (movk) == (set (zero_extract (reg r0)
20418 (const_int 16)
20419 (const_int 32))
20420 (const_int imm16_1))
20421 curr (movk) == (set (zero_extract (reg r0)
20422 (const_int 16)
20423 (const_int 48))
20424 (const_int imm16_2)) */
20425
20426 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20427 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20428 && REG_P (XEXP (SET_DEST (prev_set), 0))
20429 && REG_P (XEXP (SET_DEST (curr_set), 0))
20430 && REGNO (XEXP (SET_DEST (prev_set), 0))
20431 == REGNO (XEXP (SET_DEST (curr_set), 0))
20432 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20433 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20434 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20435 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20436 && CONST_INT_P (SET_SRC (prev_set))
20437 && CONST_INT_P (SET_SRC (curr_set)))
20438 return true;
20439
20440 }
20441 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20442 {
20443 /* We're trying to match:
20444 prev (adrp) == (set (reg r0)
20445 (high (symbol_ref ("SYM"))))
20446 curr (ldr) == (set (reg r1)
20447 (mem (lo_sum (reg r0)
20448 (symbol_ref ("SYM")))))
20449 or
20450 curr (ldr) == (set (reg r1)
20451 (zero_extend (mem
20452 (lo_sum (reg r0)
20453 (symbol_ref ("SYM")))))) */
20454 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20455 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20456 {
20457 rtx curr_src = SET_SRC (curr_set);
20458
20459 if (GET_CODE (curr_src) == ZERO_EXTEND)
20460 curr_src = XEXP (curr_src, 0);
20461
20462 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20463 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20464 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20465 == REGNO (SET_DEST (prev_set))
20466 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20467 XEXP (SET_SRC (prev_set), 0)))
20468 return true;
20469 }
20470 }
20471
20472 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20473 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20474 && prev_set && curr_set && any_condjump_p (curr)
20475 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20476 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20477 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20478 return true;
20479
20480 /* Fuse flag-setting ALU instructions and conditional branch. */
20481 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20482 && any_condjump_p (curr))
20483 {
20484 unsigned int condreg1, condreg2;
20485 rtx cc_reg_1;
20486 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20487 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20488
20489 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20490 && prev
20491 && modified_in_p (cc_reg_1, prev))
20492 {
20493 enum attr_type prev_type = get_attr_type (prev);
20494
20495 /* FIXME: this misses some instructions that ThunderX considers to be
20496 simple arithmetic instructions. Simple shifts are missed here. */
20497 if (prev_type == TYPE_ALUS_SREG
20498 || prev_type == TYPE_ALUS_IMM
20499 || prev_type == TYPE_LOGICS_REG
20500 || prev_type == TYPE_LOGICS_IMM)
20501 return true;
20502 }
20503 }
20504
20505 /* Fuse ALU instructions and CBZ/CBNZ. */
20506 if (prev_set
20507 && curr_set
20508 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20509 && any_condjump_p (curr))
20510 {
20511 /* We're trying to match:
20512 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20513 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20514 (const_int 0))
20515 (label_ref ("SYM"))
20516 (pc)) */
20517 if (SET_DEST (curr_set) == (pc_rtx)
20518 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20519 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20520 && REG_P (SET_DEST (prev_set))
20521 && REGNO (SET_DEST (prev_set))
20522 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20523 {
20524 /* Fuse ALU operations followed by conditional branch instruction. */
20525 switch (get_attr_type (prev))
20526 {
20527 case TYPE_ALU_IMM:
20528 case TYPE_ALU_SREG:
20529 case TYPE_ADC_REG:
20530 case TYPE_ADC_IMM:
20531 case TYPE_ADCS_REG:
20532 case TYPE_ADCS_IMM:
20533 case TYPE_LOGIC_REG:
20534 case TYPE_LOGIC_IMM:
20535 case TYPE_CSEL:
20536 case TYPE_ADR:
20537 case TYPE_MOV_IMM:
20538 case TYPE_SHIFT_REG:
20539 case TYPE_SHIFT_IMM:
20540 case TYPE_BFM:
20541 case TYPE_RBIT:
20542 case TYPE_REV:
20543 case TYPE_EXTEND:
20544 return true;
20545
20546 default:;
20547 }
20548 }
20549 }
20550
20551 return false;
20552 }
20553
20554 /* Return true iff the instruction fusion described by OP is enabled. */
20555
20556 bool
20557 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20558 {
20559 return (aarch64_tune_params.fusible_ops & op) != 0;
20560 }
20561
20562 /* If MEM is in the form of [base+offset], extract the two parts
20563 of the address and store them in BASE and OFFSET; otherwise return false
20564 after clearing BASE and OFFSET. */
20565
20566 bool
20567 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20568 {
20569 rtx addr;
20570
20571 gcc_assert (MEM_P (mem));
20572
20573 addr = XEXP (mem, 0);
20574
20575 if (REG_P (addr))
20576 {
20577 *base = addr;
20578 *offset = const0_rtx;
20579 return true;
20580 }
20581
20582 if (GET_CODE (addr) == PLUS
20583 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20584 {
20585 *base = XEXP (addr, 0);
20586 *offset = XEXP (addr, 1);
20587 return true;
20588 }
20589
20590 *base = NULL_RTX;
20591 *offset = NULL_RTX;
20592
20593 return false;
20594 }
20595
20596 /* Types for scheduling fusion. */
20597 enum sched_fusion_type
20598 {
20599 SCHED_FUSION_NONE = 0,
20600 SCHED_FUSION_LD_SIGN_EXTEND,
20601 SCHED_FUSION_LD_ZERO_EXTEND,
20602 SCHED_FUSION_LD,
20603 SCHED_FUSION_ST,
20604 SCHED_FUSION_NUM
20605 };
20606
20607 /* If INSN is a load or store of an address in the form of [base+offset],
20608 extract the two parts and store them in BASE and OFFSET. Return the
20609 scheduling fusion type of this INSN. */
20610
20611 static enum sched_fusion_type
20612 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20613 {
20614 rtx x, dest, src;
20615 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20616
20617 gcc_assert (INSN_P (insn));
20618 x = PATTERN (insn);
20619 if (GET_CODE (x) != SET)
20620 return SCHED_FUSION_NONE;
20621
20622 src = SET_SRC (x);
20623 dest = SET_DEST (x);
20624
20625 machine_mode dest_mode = GET_MODE (dest);
20626
20627 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20628 return SCHED_FUSION_NONE;
20629
20630 if (GET_CODE (src) == SIGN_EXTEND)
20631 {
20632 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20633 src = XEXP (src, 0);
20634 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20635 return SCHED_FUSION_NONE;
20636 }
20637 else if (GET_CODE (src) == ZERO_EXTEND)
20638 {
20639 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20640 src = XEXP (src, 0);
20641 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20642 return SCHED_FUSION_NONE;
20643 }
20644
20645 if (GET_CODE (src) == MEM && REG_P (dest))
20646 extract_base_offset_in_addr (src, base, offset);
20647 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20648 {
20649 fusion = SCHED_FUSION_ST;
20650 extract_base_offset_in_addr (dest, base, offset);
20651 }
20652 else
20653 return SCHED_FUSION_NONE;
20654
20655 if (*base == NULL_RTX || *offset == NULL_RTX)
20656 fusion = SCHED_FUSION_NONE;
20657
20658 return fusion;
20659 }
20660
20661 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20662
20663 Currently we only support fusing ldr or str instructions, so FUSION_PRI
20664 and PRI are only calculated for these instructions. For other instructions,
20665 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20666 types of instruction fusion can be added by returning different priorities.
20667
20668 It's important that irrelevant instructions get the largest FUSION_PRI. */
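/* For example, two loads such as ldr w0, [x1, 8] and ldr w2, [x1, 12]
   get the same FUSION_PRI (same fusion type and base register) but
   different PRI values, so the scheduler keeps them together and orders
   them by increasing offset, ready for the ldp/stp peepholes.  */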
20669
20670 static void
20671 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20672 int *fusion_pri, int *pri)
20673 {
20674 int tmp, off_val;
20675 rtx base, offset;
20676 enum sched_fusion_type fusion;
20677
20678 gcc_assert (INSN_P (insn));
20679
20680 tmp = max_pri - 1;
20681 fusion = fusion_load_store (insn, &base, &offset);
20682 if (fusion == SCHED_FUSION_NONE)
20683 {
20684 *pri = tmp;
20685 *fusion_pri = tmp;
20686 return;
20687 }
20688
20689 /* Set FUSION_PRI according to fusion type and base register. */
20690 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20691
20692 /* Calculate PRI. */
20693 tmp /= 2;
20694
20695 /* INSN with smaller offset goes first. */
20696 off_val = (int)(INTVAL (offset));
20697 if (off_val >= 0)
20698 tmp -= (off_val & 0xfffff);
20699 else
20700 tmp += ((- off_val) & 0xfffff);
20701
20702 *pri = tmp;
20703 return;
20704 }
20705
20706 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20707 Adjust priority of sha1h instructions so they are scheduled before
20708 other SHA1 instructions. */
20709
20710 static int
20711 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20712 {
20713 rtx x = PATTERN (insn);
20714
20715 if (GET_CODE (x) == SET)
20716 {
20717 x = SET_SRC (x);
20718
20719 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20720 return priority + 10;
20721 }
20722
20723 return priority;
20724 }
20725
20726 /* Given OPERANDS of consecutive load/store, check if we can merge
20727 them into ldp/stp. LOAD is true if they are load instructions.
20728 MODE is the mode of memory operands. */
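/* For example, ldr w0, [x2] and ldr w1, [x2, 4] can be merged into
   ldp w0, w1, [x2], provided the checks below succeed (distinct
   destination registers, non-volatile mems, matching register classes,
   and so on).  */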
20729
20730 bool
20731 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20732 machine_mode mode)
20733 {
20734 HOST_WIDE_INT offval_1, offval_2, msize;
20735 enum reg_class rclass_1, rclass_2;
20736 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20737
20738 if (load)
20739 {
20740 mem_1 = operands[1];
20741 mem_2 = operands[3];
20742 reg_1 = operands[0];
20743 reg_2 = operands[2];
20744 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20745 if (REGNO (reg_1) == REGNO (reg_2))
20746 return false;
20747 }
20748 else
20749 {
20750 mem_1 = operands[0];
20751 mem_2 = operands[2];
20752 reg_1 = operands[1];
20753 reg_2 = operands[3];
20754 }
20755
20756 /* The mems cannot be volatile. */
20757 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20758 return false;
20759
20760 /* If we have SImode and slow unaligned ldp,
20761 check that the alignment is at least 8 bytes. */
20762 if (mode == SImode
20763 && (aarch64_tune_params.extra_tuning_flags
20764 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20765 && !optimize_size
20766 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20767 return false;
20768
20769 /* Check if the addresses are in the form of [base+offset]. */
20770 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20771 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20772 return false;
20773 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20774 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20775 return false;
20776
20777 /* Check if the bases are the same. */
20778 if (!rtx_equal_p (base_1, base_2))
20779 return false;
20780
20781 /* The operands must be of the same size. */
20782 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20783 GET_MODE_SIZE (GET_MODE (mem_2))));
20784
20785 offval_1 = INTVAL (offset_1);
20786 offval_2 = INTVAL (offset_2);
20787 /* We should only be trying this for fixed-sized modes. There is no
20788 SVE LDP/STP instruction. */
20789 msize = GET_MODE_SIZE (mode).to_constant ();
20790 /* Check if the offsets are consecutive. */
20791 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20792 return false;
20793
20794 /* Check if the addresses are clobbered by load. */
20795 if (load)
20796 {
20797 if (reg_mentioned_p (reg_1, mem_1))
20798 return false;
20799
20800 /* In increasing order, the last load can clobber the address. */
20801 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20802 return false;
20803 }
20804
20805 /* One of the memory accesses must be a mempair operand.
20806 If it is not the first one, they need to be swapped by the
20807 peephole. */
20808 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20809 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20810 return false;
20811
20812 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20813 rclass_1 = FP_REGS;
20814 else
20815 rclass_1 = GENERAL_REGS;
20816
20817 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20818 rclass_2 = FP_REGS;
20819 else
20820 rclass_2 = GENERAL_REGS;
20821
20822 /* Check if the registers are of the same class. */
20823 if (rclass_1 != rclass_2)
20824 return false;
20825
20826 return true;
20827 }
20828
20829 /* Given OPERANDS of consecutive load/store that can be merged,
20830 swap them if they are not in ascending order. */
20831 void
20832 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20833 {
20834 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20835 HOST_WIDE_INT offval_1, offval_2;
20836
20837 if (load)
20838 {
20839 mem_1 = operands[1];
20840 mem_2 = operands[3];
20841 }
20842 else
20843 {
20844 mem_1 = operands[0];
20845 mem_2 = operands[2];
20846 }
20847
20848 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20849 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20850
20851 offval_1 = INTVAL (offset_1);
20852 offval_2 = INTVAL (offset_2);
20853
20854 if (offval_1 > offval_2)
20855 {
20856 /* Irrespective of whether this is a load or a store,
20857 we do the same swap. */
20858 std::swap (operands[0], operands[2]);
20859 std::swap (operands[1], operands[3]);
20860 }
20861 }
20862
20863 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20864 comparison between the two. */
20865 int
20866 aarch64_host_wide_int_compare (const void *x, const void *y)
20867 {
20868 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20869 * ((const HOST_WIDE_INT *) y));
20870 }
20871
20872 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20873 other pointing to a REG rtx containing an offset, compare the offsets
20874 of the two pairs.
20875
20876 Return:
20877
20878 1 iff offset (X) > offset (Y)
20879 0 iff offset (X) == offset (Y)
20880 -1 iff offset (X) < offset (Y) */
20881 int
20882 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20883 {
20884 const rtx * operands_1 = (const rtx *) x;
20885 const rtx * operands_2 = (const rtx *) y;
20886 rtx mem_1, mem_2, base, offset_1, offset_2;
20887
20888 if (MEM_P (operands_1[0]))
20889 mem_1 = operands_1[0];
20890 else
20891 mem_1 = operands_1[1];
20892
20893 if (MEM_P (operands_2[0]))
20894 mem_2 = operands_2[0];
20895 else
20896 mem_2 = operands_2[1];
20897
20898 /* Extract the offsets. */
20899 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20900 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20901
20902 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20903
20904 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20905 }
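/* This comparator is meant to be used with qsort on an array of four
   register/memory operand pairs, as done in aarch64_gen_adjusted_ldpstp
   below, e.g.:

       qsort (temp_operands, 4, 2 * sizeof (rtx *),
	      aarch64_ldrstr_offset_compare);

   which leaves the pairs ordered by ascending memory offset.  */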
20906
20907 /* Given OPERANDS of consecutive load/store, check if we can merge
20908 them into ldp/stp by adjusting the offset. LOAD is true if they
20909 are load instructions. MODE is the mode of memory operands.
20910
20911 For example, given the following consecutive stores:
20912
20913 str w1, [xb, 0x100]
20914 str w1, [xb, 0x104]
20915 str w1, [xb, 0x108]
20916 str w1, [xb, 0x10c]
20917
20918 Though the offsets are out of the range supported by stp, we can
20919 still pair them after adjusting the offset, like:
20920
20921 add scratch, xb, 0x100
20922 stp w1, w1, [scratch]
20923 stp w1, w1, [scratch, 0x8]
20924
20925 The peephole patterns detecting this opportunity should guarantee
20926 the scratch register is available. */
20927
20928 bool
20929 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20930 scalar_mode mode)
20931 {
20932 const int num_insns = 4;
20933 enum reg_class rclass;
20934 HOST_WIDE_INT offvals[num_insns], msize;
20935 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20936
20937 if (load)
20938 {
20939 for (int i = 0; i < num_insns; i++)
20940 {
20941 reg[i] = operands[2 * i];
20942 mem[i] = operands[2 * i + 1];
20943
20944 gcc_assert (REG_P (reg[i]));
20945 }
20946
20947 /* Do not attempt to merge the loads if the loads clobber each other. */
20948 for (int i = 0; i < 8; i += 2)
20949 for (int j = i + 2; j < 8; j += 2)
20950 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20951 return false;
20952 }
20953 else
20954 for (int i = 0; i < num_insns; i++)
20955 {
20956 mem[i] = operands[2 * i];
20957 reg[i] = operands[2 * i + 1];
20958 }
20959
20960 /* Skip if the memory operand is by itself already valid for ldp/stp. */
20961 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
20962 return false;
20963
20964 for (int i = 0; i < num_insns; i++)
20965 {
20966 /* The mems cannot be volatile. */
20967 if (MEM_VOLATILE_P (mem[i]))
20968 return false;
20969
20970 /* Check if the addresses are in the form of [base+offset]. */
20971 extract_base_offset_in_addr (mem[i], base + i, offset + i);
20972 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
20973 return false;
20974 }
20975
20976 /* Check if the registers are of the same class. */
20977 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
20978 ? FP_REGS : GENERAL_REGS;
20979
20980 for (int i = 1; i < num_insns; i++)
20981 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
20982 {
20983 if (rclass != FP_REGS)
20984 return false;
20985 }
20986 else
20987 {
20988 if (rclass != GENERAL_REGS)
20989 return false;
20990 }
20991
20992 /* Only the last register in the order in which they occur
20993 may be clobbered by the load. */
20994 if (rclass == GENERAL_REGS && load)
20995 for (int i = 0; i < num_insns - 1; i++)
20996 if (reg_mentioned_p (reg[i], mem[i]))
20997 return false;
20998
20999 /* Check if the bases are the same. */
21000 for (int i = 0; i < num_insns - 1; i++)
21001 if (!rtx_equal_p (base[i], base[i + 1]))
21002 return false;
21003
21004 for (int i = 0; i < num_insns; i++)
21005 offvals[i] = INTVAL (offset[i]);
21006
21007 msize = GET_MODE_SIZE (mode);
21008
21009 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21010 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21011 aarch64_host_wide_int_compare);
21012
21013 if (!(offvals[1] == offvals[0] + msize
21014 && offvals[3] == offvals[2] + msize))
21015 return false;
21016
21017 /* Check that the offsets are within range of each other. The ldp/stp
21018 instructions have 7-bit immediate offsets, so use 0x80. */
21019 if (offvals[2] - offvals[0] >= msize * 0x80)
21020 return false;
21021
21022 /* The offsets must be aligned with respect to each other. */
21023 if (offvals[0] % msize != offvals[2] % msize)
21024 return false;
21025
21026 /* If we have SImode and slow unaligned ldp,
21027 check that the alignment is at least 8 bytes. */
21028 if (mode == SImode
21029 && (aarch64_tune_params.extra_tuning_flags
21030 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21031 && !optimize_size
21032 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21033 return false;
21034
21035 return true;
21036 }
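/* Worked example of the offset checks above (hypothetical values): for
   SImode (msize == 4) with offsets 0x100, 0x104, 0x108 and 0x10c, the
   sorted offsets satisfy

       offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
       offvals[2] - offvals[0] == 8 < 4 * 0x80,
       offvals[0] % 4 == offvals[2] % 4,

   so the four accesses are accepted for pairing.  */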
21037
21038 /* Given OPERANDS of consecutive load/store, this function pairs them
21039 into LDP/STP after adjusting the offset. It depends on the fact
21040 that the operands can be sorted so the offsets are correct for STP.
21041 MODE is the mode of memory operands. CODE is the rtl operator
21042 which should be applied to all memory operands; it is SIGN_EXTEND,
21043 ZERO_EXTEND or UNKNOWN. */
21044
21045 bool
21046 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21047 scalar_mode mode, RTX_CODE code)
21048 {
21049 rtx base, offset_1, offset_3, t1, t2;
21050 rtx mem_1, mem_2, mem_3, mem_4;
21051 rtx temp_operands[8];
21052 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21053 stp_off_upper_limit, stp_off_lower_limit, msize;
21054
21055 /* We make changes on a copy as we may still bail out. */
21056 for (int i = 0; i < 8; i ++)
21057 temp_operands[i] = operands[i];
21058
21059 /* Sort the operands. */
21060 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21061
21062 /* Copy the memory operands so that if we have to bail for some
21063 reason the original addresses are unchanged. */
21064 if (load)
21065 {
21066 mem_1 = copy_rtx (temp_operands[1]);
21067 mem_2 = copy_rtx (temp_operands[3]);
21068 mem_3 = copy_rtx (temp_operands[5]);
21069 mem_4 = copy_rtx (temp_operands[7]);
21070 }
21071 else
21072 {
21073 mem_1 = copy_rtx (temp_operands[0]);
21074 mem_2 = copy_rtx (temp_operands[2]);
21075 mem_3 = copy_rtx (temp_operands[4]);
21076 mem_4 = copy_rtx (temp_operands[6]);
21077 gcc_assert (code == UNKNOWN);
21078 }
21079
21080 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21081 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21082 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21083 && offset_3 != NULL_RTX);
21084
21085 /* Adjust offset so it can fit in LDP/STP instruction. */
21086 msize = GET_MODE_SIZE (mode);
21087 stp_off_upper_limit = msize * (0x40 - 1);
21088 stp_off_lower_limit = - msize * 0x40;
21089
21090 off_val_1 = INTVAL (offset_1);
21091 off_val_3 = INTVAL (offset_3);
21092
21093 /* The base offset is optimally halfway between the two STP/LDP offsets. */
21094 if (msize <= 4)
21095 base_off = (off_val_1 + off_val_3) / 2;
21096 else
21097 /* However, due to issues with negative LDP/STP offset generation for
21098 larger modes (DF, DI and vector modes), we must not use negative
21099 addresses smaller than what 9 signed unadjusted bits can store. This
21100 provides the most range in this case. */
21101 base_off = off_val_1;
21102
21103 /* Adjust the base so that it is aligned with the addresses but still
21104 optimal. */
21105 if (base_off % msize != off_val_1 % msize)
21106 /* Fix the offset, bearing in mind we want to make it bigger, not
21107 smaller. */
21108 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21109 else if (msize <= 4)
21110 /* The negative range of LDP/STP is one larger than the positive range. */
21111 base_off += msize;
21112
21113 /* Check if base offset is too big or too small. We can attempt to resolve
21114 this issue by setting it to the maximum value and seeing if the offsets
21115 still fit. */
21116 if (base_off >= 0x1000)
21117 {
21118 base_off = 0x1000 - 1;
21119 /* We must still make sure that the base offset is aligned with respect
21120 to the address. But it may not be made any bigger. */
21121 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21122 }
21123
21124 /* Likewise for the case where the base is too small. */
21125 if (base_off <= -0x1000)
21126 {
21127 base_off = -0x1000 + 1;
21128 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21129 }
21130
21131 /* Offset of the first STP/LDP. */
21132 new_off_1 = off_val_1 - base_off;
21133
21134 /* Offset of the second STP/LDP. */
21135 new_off_3 = off_val_3 - base_off;
21136
21137 /* The offsets must be within the range of the LDP/STP instructions. */
21138 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21139 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21140 return false;
21141
21142 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21143 new_off_1), true);
21144 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21145 new_off_1 + msize), true);
21146 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21147 new_off_3), true);
21148 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21149 new_off_3 + msize), true);
21150
21151 if (!aarch64_mem_pair_operand (mem_1, mode)
21152 || !aarch64_mem_pair_operand (mem_3, mode))
21153 return false;
21154
21155 if (code == ZERO_EXTEND)
21156 {
21157 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21158 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21159 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21160 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21161 }
21162 else if (code == SIGN_EXTEND)
21163 {
21164 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21165 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21166 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21167 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21168 }
21169
21170 if (load)
21171 {
21172 operands[0] = temp_operands[0];
21173 operands[1] = mem_1;
21174 operands[2] = temp_operands[2];
21175 operands[3] = mem_2;
21176 operands[4] = temp_operands[4];
21177 operands[5] = mem_3;
21178 operands[6] = temp_operands[6];
21179 operands[7] = mem_4;
21180 }
21181 else
21182 {
21183 operands[0] = mem_1;
21184 operands[1] = temp_operands[1];
21185 operands[2] = mem_2;
21186 operands[3] = temp_operands[3];
21187 operands[4] = mem_3;
21188 operands[5] = temp_operands[5];
21189 operands[6] = mem_4;
21190 operands[7] = temp_operands[7];
21191 }
21192
21193 /* Emit adjusting instruction. */
21194 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21195 /* Emit ldp/stp instructions. */
21196 t1 = gen_rtx_SET (operands[0], operands[1]);
21197 t2 = gen_rtx_SET (operands[2], operands[3]);
21198 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21199 t1 = gen_rtx_SET (operands[4], operands[5]);
21200 t2 = gen_rtx_SET (operands[6], operands[7]);
21201 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21202 return true;
21203 }
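/* Worked example of the offset adjustment above (hypothetical values):
   for SImode (msize == 4) with sorted offsets 0x100 .. 0x10c,
   off_val_1 == 0x100 and off_val_3 == 0x108, so

       base_off = (0x100 + 0x108) / 2 = 0x104,

   which is already aligned with off_val_1, so base_off += 4 -> 0x108;
   then new_off_1 = 0x100 - 0x108 = -8 and new_off_3 = 0.  Both lie
   within [-0x100, 0xfc], so the emitted sequence is roughly

       add  scratch, base, 0x108
       stp  w1, w1, [scratch, -8]
       stp  w1, w1, [scratch]

   (register names are purely illustrative).  */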
21204
21205 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21206 it isn't worth branching around empty masked ops (including masked
21207 stores). */
21208
21209 static bool
21210 aarch64_empty_mask_is_expensive (unsigned)
21211 {
21212 return false;
21213 }
21214
21215 /* Return 1 if pseudo register should be created and used to hold
21216 GOT address for PIC code. */
21217
21218 bool
21219 aarch64_use_pseudo_pic_reg (void)
21220 {
21221 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21222 }
21223
21224 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21225
21226 static int
21227 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21228 {
21229 switch (XINT (x, 1))
21230 {
21231 case UNSPEC_GOTSMALLPIC:
21232 case UNSPEC_GOTSMALLPIC28K:
21233 case UNSPEC_GOTTINYPIC:
21234 return 0;
21235 default:
21236 break;
21237 }
21238
21239 return default_unspec_may_trap_p (x, flags);
21240 }
21241
21242
21243 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21244 return the log2 of that value. Otherwise return -1. */
21245
21246 int
21247 aarch64_fpconst_pow_of_2 (rtx x)
21248 {
21249 const REAL_VALUE_TYPE *r;
21250
21251 if (!CONST_DOUBLE_P (x))
21252 return -1;
21253
21254 r = CONST_DOUBLE_REAL_VALUE (x);
21255
21256 if (REAL_VALUE_NEGATIVE (*r)
21257 || REAL_VALUE_ISNAN (*r)
21258 || REAL_VALUE_ISINF (*r)
21259 || !real_isinteger (r, DFmode))
21260 return -1;
21261
21262 return exact_log2 (real_to_integer (r));
21263 }
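/* For example (hypothetical constants): 8.0 yields 3, 1.0 yields 0,
   and 0.75, -4.0 or 3.0 all yield -1 (not a positive integral power
   of 2).  */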
21264
21265 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21266 power of 2 (i.e. 1/2^n), return the exponent n; e.g. for X == 1/2^n
21267 return n. Otherwise return -1. */
21268
21269 int
21270 aarch64_fpconst_pow2_recip (rtx x)
21271 {
21272 REAL_VALUE_TYPE r0;
21273
21274 if (!CONST_DOUBLE_P (x))
21275 return -1;
21276
21277 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21278 if (exact_real_inverse (DFmode, &r0)
21279 && !REAL_VALUE_NEGATIVE (r0))
21280 {
21281 int ret = exact_log2 (real_to_integer (&r0));
21282 if (ret >= 1 && ret <= 32)
21283 return ret;
21284 }
21285 return -1;
21286 }
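/* For example (hypothetical constants): 0.25 yields 2 (since 1/0.25 == 4
   == 2^2) and 0.5 yields 1, while 1.0 yields -1 (the exponent 0 is
   outside the accepted range [1, 32]) and 0.3 yields -1 (its reciprocal
   is not an exact power of 2).  */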
21287
21288 /* If X is a vector of equal CONST_DOUBLE values and that value is
21289 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21290
21291 int
21292 aarch64_vec_fpconst_pow_of_2 (rtx x)
21293 {
21294 int nelts;
21295 if (GET_CODE (x) != CONST_VECTOR
21296 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21297 return -1;
21298
21299 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21300 return -1;
21301
21302 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21303 if (firstval <= 0)
21304 return -1;
21305
21306 for (int i = 1; i < nelts; i++)
21307 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21308 return -1;
21309
21310 return firstval;
21311 }
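/* For example (hypothetical vector constants): { 4.0, 4.0, 4.0, 4.0 }
   yields 2, whereas { 4.0, 8.0, 4.0, 8.0 } and { 1.0, 1.0 } yield -1
   (the elements differ, or the common value 1.0 gives a non-positive
   log2).  */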
21312
21313 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21314 to float.
21315
21316 __fp16 always promotes through this hook.
21317 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21318 through the generic excess precision logic rather than here. */
21319
21320 static tree
21321 aarch64_promoted_type (const_tree t)
21322 {
21323 if (SCALAR_FLOAT_TYPE_P (t)
21324 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21325 return float_type_node;
21326
21327 return NULL_TREE;
21328 }
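/* For example (illustrative), given

       __fp16 a, b;
       ... a + b ...

   the addition is performed in float, and the result is narrowed back
   only if it is assigned to an __fp16 object, matching the ACLE
   storage-only semantics of __fp16.  */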
21329
21330 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21331
21332 static bool
21333 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21334 optimization_type opt_type)
21335 {
21336 switch (op)
21337 {
21338 case rsqrt_optab:
21339 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21340
21341 default:
21342 return true;
21343 }
21344 }
21345
21346 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21347
21348 static unsigned int
21349 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21350 int *offset)
21351 {
21352 /* Polynomial invariant 1 == (VG / 2) - 1. */
21353 gcc_assert (i == 1);
21354 *factor = 2;
21355 *offset = 1;
21356 return AARCH64_DWARF_VG;
21357 }
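/* As a concrete reading of the invariant above (illustrative): VG holds
   the number of 64-bit granules in an SVE vector, so for 128-bit vectors
   VG == 2 and indeterminate 1 == 2/2 - 1 == 0, while for 256-bit vectors
   VG == 4 and indeterminate 1 == 1.  */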
21358
21359 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21360 if MODE is HFmode, and punt to the generic implementation otherwise. */
21361
21362 static bool
21363 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21364 {
21365 return (mode == HFmode
21366 ? true
21367 : default_libgcc_floating_mode_supported_p (mode));
21368 }
21369
21370 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21371 if MODE is HFmode, and punt to the generic implementation otherwise. */
21372
21373 static bool
21374 aarch64_scalar_mode_supported_p (scalar_mode mode)
21375 {
21376 return (mode == HFmode
21377 ? true
21378 : default_scalar_mode_supported_p (mode));
21379 }
21380
21381 /* Set the value of FLT_EVAL_METHOD.
21382 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21383
21384 0: evaluate all operations and constants, whose semantic type has at
21385 most the range and precision of type float, to the range and
21386 precision of float; evaluate all other operations and constants to
21387 the range and precision of the semantic type;
21388
21389 N, where _FloatN is a supported interchange floating type:
21390 evaluate all operations and constants, whose semantic type has at
21391 most the range and precision of _FloatN type, to the range and
21392 precision of the _FloatN type; evaluate all other operations and
21393 constants to the range and precision of the semantic type;
21394
21395 If we have the ARMv8.2-A extensions then we support _Float16 in native
21396 precision, so we should set this to 16. Otherwise, we support the type,
21397 but want to evaluate expressions in float precision, so set this to
21398 0. */
21399
21400 static enum flt_eval_method
21401 aarch64_excess_precision (enum excess_precision_type type)
21402 {
21403 switch (type)
21404 {
21405 case EXCESS_PRECISION_TYPE_FAST:
21406 case EXCESS_PRECISION_TYPE_STANDARD:
21407 /* We can calculate either in 16-bit range and precision or
21408 32-bit range and precision. Make that decision based on whether
21409 we have native support for the ARMv8.2-A 16-bit floating-point
21410 instructions or not. */
21411 return (TARGET_FP_F16INST
21412 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21413 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21414 case EXCESS_PRECISION_TYPE_IMPLICIT:
21415 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21416 default:
21417 gcc_unreachable ();
21418 }
21419 return FLT_EVAL_METHOD_UNPREDICTABLE;
21420 }
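/* A sketch of the effect (illustrative source code): with native FP16
   instructions (TARGET_FP_F16INST) an expression such as

       _Float16 x = a * b + c;   /+ a, b, c are _Float16 +/

   can be evaluated entirely in half precision (FLT_EVAL_METHOD == 16),
   whereas without them a, b and c are first converted to float, the
   arithmetic is done in single precision and the result converted back
   (FLT_EVAL_METHOD == 0).  */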
21421
21422 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21423 scheduled for speculative execution. Reject the long-running division
21424 and square-root instructions. */
21425
21426 static bool
21427 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21428 {
21429 switch (get_attr_type (insn))
21430 {
21431 case TYPE_SDIV:
21432 case TYPE_UDIV:
21433 case TYPE_FDIVS:
21434 case TYPE_FDIVD:
21435 case TYPE_FSQRTS:
21436 case TYPE_FSQRTD:
21437 case TYPE_NEON_FP_SQRT_S:
21438 case TYPE_NEON_FP_SQRT_D:
21439 case TYPE_NEON_FP_SQRT_S_Q:
21440 case TYPE_NEON_FP_SQRT_D_Q:
21441 case TYPE_NEON_FP_DIV_S:
21442 case TYPE_NEON_FP_DIV_D:
21443 case TYPE_NEON_FP_DIV_S_Q:
21444 case TYPE_NEON_FP_DIV_D_Q:
21445 return false;
21446 default:
21447 return true;
21448 }
21449 }
21450
21451 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21452
21453 static int
21454 aarch64_compute_pressure_classes (reg_class *classes)
21455 {
21456 int i = 0;
21457 classes[i++] = GENERAL_REGS;
21458 classes[i++] = FP_REGS;
21459 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21460 registers need to go in PR_LO_REGS at some point during their
21461 lifetime. Splitting it into two halves has the effect of making
21462 all predicates count against PR_LO_REGS, so that we try whenever
21463 possible to restrict the number of live predicates to 8. This
21464 greatly reduces the amount of spilling in certain loops. */
21465 classes[i++] = PR_LO_REGS;
21466 classes[i++] = PR_HI_REGS;
21467 return i;
21468 }
21469
21470 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21471
21472 static bool
21473 aarch64_can_change_mode_class (machine_mode from,
21474 machine_mode to, reg_class_t)
21475 {
21476 unsigned int from_flags = aarch64_classify_vector_mode (from);
21477 unsigned int to_flags = aarch64_classify_vector_mode (to);
21478
21479 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21480 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21481
21482 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21483 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21484
21485 /* Don't allow changes between partial SVE modes and other modes.
21486 The contents of partial SVE modes are distributed evenly across
21487 the register, whereas GCC expects them to be clustered together. */
21488 if (from_partial_sve_p != to_partial_sve_p)
21489 return false;
21490
21491 /* Similarly reject changes between partial SVE modes that have
21492 different patterns of significant and insignificant bits. */
21493 if (from_partial_sve_p
21494 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21495 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21496 return false;
21497
21498 if (BYTES_BIG_ENDIAN)
21499 {
21500 /* Don't allow changes between SVE data modes and non-SVE modes.
21501 See the comment at the head of aarch64-sve.md for details. */
21502 if (from_sve_p != to_sve_p)
21503 return false;
21504
21505 /* Don't allow changes in element size: lane 0 of the new vector
21506 would not then be lane 0 of the old vector. See the comment
21507 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21508 description.
21509
21510 In the worst case, this forces a register to be spilled in
21511 one mode and reloaded in the other, which handles the
21512 endianness correctly. */
21513 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21514 return false;
21515 }
21516 return true;
21517 }
21518
21519 /* Implement TARGET_EARLY_REMAT_MODES. */
21520
21521 static void
21522 aarch64_select_early_remat_modes (sbitmap modes)
21523 {
21524 /* SVE values are not normally live across a call, so it should be
21525 worth doing early rematerialization even in VL-specific mode. */
21526 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21527 if (aarch64_sve_mode_p ((machine_mode) i))
21528 bitmap_set_bit (modes, i);
21529 }
21530
21531 /* Override the default target speculation_safe_value. */
21532 static rtx
21533 aarch64_speculation_safe_value (machine_mode mode,
21534 rtx result, rtx val, rtx failval)
21535 {
21536 /* Maybe we should warn if falling back to hard barriers. They are
21537 likely to be noticeably more expensive than the alternative below. */
21538 if (!aarch64_track_speculation)
21539 return default_speculation_safe_value (mode, result, val, failval);
21540
21541 if (!REG_P (val))
21542 val = copy_to_mode_reg (mode, val);
21543
21544 if (!aarch64_reg_or_zero (failval, mode))
21545 failval = copy_to_mode_reg (mode, failval);
21546
21547 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21548 return result;
21549 }
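/* Usage sketch (illustrative; the builtin is the generic GCC one):
   source code such as

       if (i < len)
	 val = __builtin_speculation_safe_value (array[i]);

   is expanded through this hook.  With -mtrack-speculation the copy is
   guarded using the speculation tracker register; otherwise the default
   implementation above is used, which relies on a full speculation
   barrier.  */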
21550
21551 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21552 Look into the tuning structure for an estimate.
21553 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21554 Advanced SIMD 128 bits. */
21555
21556 static HOST_WIDE_INT
21557 aarch64_estimated_poly_value (poly_int64 val)
21558 {
21559 enum aarch64_sve_vector_bits_enum width_source
21560 = aarch64_tune_params.sve_width;
21561
21562 /* If we still don't have an estimate, use the default. */
21563 if (width_source == SVE_SCALABLE)
21564 return default_estimated_poly_value (val);
21565
21566 HOST_WIDE_INT over_128 = width_source - 128;
21567 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21568 }
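/* Worked example (hypothetical tuning): if the tuning structure gives
   sve_width == 256, then over_128 == 128 and a poly_int64 value of
   2 + 2x (coeffs[0] == 2, coeffs[1] == 2) is estimated as
   2 + 2 * 128 / 128 == 4.  */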
21569
21570
21571 /* Return true for types that could be supported as SIMD return or
21572 argument types. */
21573
21574 static bool
21575 supported_simd_type (tree t)
21576 {
21577 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21578 {
21579 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21580 return s == 1 || s == 2 || s == 4 || s == 8;
21581 }
21582 return false;
21583 }
21584
21585 /* Return true for types that currently are supported as SIMD return
21586 or argument types. */
21587
21588 static bool
21589 currently_supported_simd_type (tree t, tree b)
21590 {
21591 if (COMPLEX_FLOAT_TYPE_P (t))
21592 return false;
21593
21594 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21595 return false;
21596
21597 return supported_simd_type (t);
21598 }
21599
21600 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21601
21602 static int
21603 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21604 struct cgraph_simd_clone *clonei,
21605 tree base_type, int num)
21606 {
21607 tree t, ret_type, arg_type;
21608 unsigned int elt_bits, vec_bits, count;
21609
21610 if (!TARGET_SIMD)
21611 return 0;
21612
21613 if (clonei->simdlen
21614 && (clonei->simdlen < 2
21615 || clonei->simdlen > 1024
21616 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21617 {
21618 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21619 "unsupported simdlen %d", clonei->simdlen);
21620 return 0;
21621 }
21622
21623 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21624 if (TREE_CODE (ret_type) != VOID_TYPE
21625 && !currently_supported_simd_type (ret_type, base_type))
21626 {
21627 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21628 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21629 "GCC does not currently support mixed size types "
21630 "for %<simd%> functions");
21631 else if (supported_simd_type (ret_type))
21632 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21633 "GCC does not currently support return type %qT "
21634 "for %<simd%> functions", ret_type);
21635 else
21636 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21637 "unsupported return type %qT for %<simd%> functions",
21638 ret_type);
21639 return 0;
21640 }
21641
21642 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21643 {
21644 arg_type = TREE_TYPE (t);
21645
21646 if (!currently_supported_simd_type (arg_type, base_type))
21647 {
21648 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21649 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21650 "GCC does not currently support mixed size types "
21651 "for %<simd%> functions");
21652 else
21653 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21654 "GCC does not currently support argument type %qT "
21655 "for %<simd%> functions", arg_type);
21656 return 0;
21657 }
21658 }
21659
21660 clonei->vecsize_mangle = 'n';
21661 clonei->mask_mode = VOIDmode;
21662 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21663 if (clonei->simdlen == 0)
21664 {
21665 count = 2;
21666 vec_bits = (num == 0 ? 64 : 128);
21667 clonei->simdlen = vec_bits / elt_bits;
21668 }
21669 else
21670 {
21671 count = 1;
21672 vec_bits = clonei->simdlen * elt_bits;
21673 if (vec_bits != 64 && vec_bits != 128)
21674 {
21675 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21676 "GCC does not currently support simdlen %d for type %qT",
21677 clonei->simdlen, base_type);
21678 return 0;
21679 }
21680 }
21681 clonei->vecsize_int = vec_bits;
21682 clonei->vecsize_float = vec_bits;
21683 return count;
21684 }
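/* For example (illustrative declaration):

       #pragma omp declare simd
       double f (double x);

   gives elt_bits == 64 with no simdlen specified, so two clones are
   created: one using 64-bit vectors (simdlen 1) and one using 128-bit
   Advanced SIMD vectors (simdlen 2).  */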
21685
21686 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21687
21688 static void
21689 aarch64_simd_clone_adjust (struct cgraph_node *node)
21690 {
21691 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21692 use the correct ABI. */
21693
21694 tree t = TREE_TYPE (node->decl);
21695 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21696 TYPE_ATTRIBUTES (t));
21697 }
21698
21699 /* Implement TARGET_SIMD_CLONE_USABLE. */
21700
21701 static int
21702 aarch64_simd_clone_usable (struct cgraph_node *node)
21703 {
21704 switch (node->simdclone->vecsize_mangle)
21705 {
21706 case 'n':
21707 if (!TARGET_SIMD)
21708 return -1;
21709 return 0;
21710 default:
21711 gcc_unreachable ();
21712 }
21713 }
21714
21715 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21716
21717 static int
21718 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21719 {
21720 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21721 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21722 return 0;
21723 return 1;
21724 }
21725
21726 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21727
21728 static const char *
21729 aarch64_get_multilib_abi_name (void)
21730 {
21731 if (TARGET_BIG_END)
21732 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21733 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21734 }
21735
21736 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
21737 global-variable-based guard, use the default; otherwise
21738 return a null tree. */
21739 static tree
21740 aarch64_stack_protect_guard (void)
21741 {
21742 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21743 return default_stack_protect_guard ();
21744
21745 return NULL_TREE;
21746 }
21747
21748 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21749 section at the end if needed. */
21750 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21751 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21752 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21753 void
21754 aarch64_file_end_indicate_exec_stack ()
21755 {
21756 file_end_indicate_exec_stack ();
21757
21758 unsigned feature_1_and = 0;
21759 if (aarch64_bti_enabled ())
21760 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21761
21762 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21763 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21764
21765 if (feature_1_and)
21766 {
21767 /* Generate .note.gnu.property section. */
21768 switch_to_section (get_section (".note.gnu.property",
21769 SECTION_NOTYPE, NULL));
21770
21771 /* PT_NOTE header: namesz, descsz, type.
21772 namesz = 4 ("GNU\0")
21773 descsz = 16 (Size of the program property array)
21774 [(12 + padding) * Number of array elements]
21775 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21776 assemble_align (POINTER_SIZE);
21777 assemble_integer (GEN_INT (4), 4, 32, 1);
21778 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21779 assemble_integer (GEN_INT (5), 4, 32, 1);
21780
21781 /* PT_NOTE name. */
21782 assemble_string ("GNU", 4);
21783
21784 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21785 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21786 datasz = 4
21787 data = feature_1_and. */
21788 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21789 assemble_integer (GEN_INT (4), 4, 32, 1);
21790 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21791
21792 /* Pad the size of the note to the required alignment. */
21793 assemble_align (POINTER_SIZE);
21794 }
21795 }
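/* The output is roughly the following assembly when both BTI and PAC
   are enabled (illustrative, assuming LP64 and hence 8-byte alignment):

       .section .note.gnu.property
       .p2align 3
       .word 4                    // namesz ("GNU\0")
       .word 16                   // descsz
       .word 5                    // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word 0xc0000000           // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word 4                    // datasz
       .word 3                    // BTI | PAC
       .p2align 3
*/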
21796 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21797 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21798 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
21799
21800 /* Target-specific selftests. */
21801
21802 #if CHECKING_P
21803
21804 namespace selftest {
21805
21806 /* Selftest for the RTL loader.
21807 Verify that the RTL loader copes with a dump from
21808 print_rtx_function. This is essentially just a test that class
21809 function_reader can handle a real dump, but it also verifies
21810 that lookup_reg_by_dump_name correctly handles hard regs.
21811 The presence of hard reg names in the dump means that the test is
21812 target-specific, hence it is in this file. */
21813
21814 static void
21815 aarch64_test_loading_full_dump ()
21816 {
21817 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21818
21819 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21820
21821 rtx_insn *insn_1 = get_insn_by_uid (1);
21822 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21823
21824 rtx_insn *insn_15 = get_insn_by_uid (15);
21825 ASSERT_EQ (INSN, GET_CODE (insn_15));
21826 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21827
21828 /* Verify crtl->return_rtx. */
21829 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21830 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21831 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21832 }
21833
21834 /* Run all target-specific selftests. */
21835
21836 static void
21837 aarch64_run_selftests (void)
21838 {
21839 aarch64_test_loading_full_dump ();
21840 }
21841
21842 } // namespace selftest
21843
21844 #endif /* #if CHECKING_P */
21845
21846 #undef TARGET_STACK_PROTECT_GUARD
21847 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21848
21849 #undef TARGET_ADDRESS_COST
21850 #define TARGET_ADDRESS_COST aarch64_address_cost
21851
21852 /* This hook determines whether unnamed bitfields affect the alignment
21853 of the containing structure. The hook returns true if the structure
21854 should inherit the alignment requirements of an unnamed bitfield's
21855 type. */
21856 #undef TARGET_ALIGN_ANON_BITFIELD
21857 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21858
21859 #undef TARGET_ASM_ALIGNED_DI_OP
21860 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21861
21862 #undef TARGET_ASM_ALIGNED_HI_OP
21863 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21864
21865 #undef TARGET_ASM_ALIGNED_SI_OP
21866 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21867
21868 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21869 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21870 hook_bool_const_tree_hwi_hwi_const_tree_true
21871
21872 #undef TARGET_ASM_FILE_START
21873 #define TARGET_ASM_FILE_START aarch64_start_file
21874
21875 #undef TARGET_ASM_OUTPUT_MI_THUNK
21876 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21877
21878 #undef TARGET_ASM_SELECT_RTX_SECTION
21879 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21880
21881 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21882 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21883
21884 #undef TARGET_BUILD_BUILTIN_VA_LIST
21885 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21886
21887 #undef TARGET_CALLEE_COPIES
21888 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21889
21890 #undef TARGET_CAN_ELIMINATE
21891 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21892
21893 #undef TARGET_CAN_INLINE_P
21894 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21895
21896 #undef TARGET_CANNOT_FORCE_CONST_MEM
21897 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21898
21899 #undef TARGET_CASE_VALUES_THRESHOLD
21900 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21901
21902 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21903 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21904
21905 /* Only the least significant bit is used for initialization guard
21906 variables. */
21907 #undef TARGET_CXX_GUARD_MASK_BIT
21908 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21909
21910 #undef TARGET_C_MODE_FOR_SUFFIX
21911 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
21912
21913 #ifdef TARGET_BIG_ENDIAN_DEFAULT
21914 #undef TARGET_DEFAULT_TARGET_FLAGS
21915 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
21916 #endif
21917
21918 #undef TARGET_CLASS_MAX_NREGS
21919 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
21920
21921 #undef TARGET_BUILTIN_DECL
21922 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
21923
21924 #undef TARGET_BUILTIN_RECIPROCAL
21925 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
21926
21927 #undef TARGET_C_EXCESS_PRECISION
21928 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
21929
21930 #undef TARGET_EXPAND_BUILTIN
21931 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
21932
21933 #undef TARGET_EXPAND_BUILTIN_VA_START
21934 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
21935
21936 #undef TARGET_FOLD_BUILTIN
21937 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
21938
21939 #undef TARGET_FUNCTION_ARG
21940 #define TARGET_FUNCTION_ARG aarch64_function_arg
21941
21942 #undef TARGET_FUNCTION_ARG_ADVANCE
21943 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
21944
21945 #undef TARGET_FUNCTION_ARG_BOUNDARY
21946 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
21947
21948 #undef TARGET_FUNCTION_ARG_PADDING
21949 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
21950
21951 #undef TARGET_GET_RAW_RESULT_MODE
21952 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
21953 #undef TARGET_GET_RAW_ARG_MODE
21954 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
21955
21956 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
21957 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
21958
21959 #undef TARGET_FUNCTION_VALUE
21960 #define TARGET_FUNCTION_VALUE aarch64_function_value
21961
21962 #undef TARGET_FUNCTION_VALUE_REGNO_P
21963 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
21964
21965 #undef TARGET_GIMPLE_FOLD_BUILTIN
21966 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
21967
21968 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
21969 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
21970
21971 #undef TARGET_INIT_BUILTINS
21972 #define TARGET_INIT_BUILTINS aarch64_init_builtins
21973
21974 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
21975 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
21976 aarch64_ira_change_pseudo_allocno_class
21977
21978 #undef TARGET_LEGITIMATE_ADDRESS_P
21979 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
21980
21981 #undef TARGET_LEGITIMATE_CONSTANT_P
21982 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
21983
21984 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
21985 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
21986 aarch64_legitimize_address_displacement
21987
21988 #undef TARGET_LIBGCC_CMP_RETURN_MODE
21989 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
21990
21991 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
21992 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
21993 aarch64_libgcc_floating_mode_supported_p
21994
21995 #undef TARGET_MANGLE_TYPE
21996 #define TARGET_MANGLE_TYPE aarch64_mangle_type
21997
21998 #undef TARGET_VERIFY_TYPE_CONTEXT
21999 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22000
22001 #undef TARGET_MEMORY_MOVE_COST
22002 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22003
22004 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22005 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22006
22007 #undef TARGET_MUST_PASS_IN_STACK
22008 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22009
22010 /* This target hook should return true if accesses to volatile bitfields
22011 should use the narrowest mode possible. It should return false if these
22012 accesses should use the bitfield container type. */
22013 #undef TARGET_NARROW_VOLATILE_BITFIELD
22014 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22015
22016 #undef TARGET_OPTION_OVERRIDE
22017 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22018
22019 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22020 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22021 aarch64_override_options_after_change
22022
22023 #undef TARGET_OPTION_SAVE
22024 #define TARGET_OPTION_SAVE aarch64_option_save
22025
22026 #undef TARGET_OPTION_RESTORE
22027 #define TARGET_OPTION_RESTORE aarch64_option_restore
22028
22029 #undef TARGET_OPTION_PRINT
22030 #define TARGET_OPTION_PRINT aarch64_option_print
22031
22032 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22033 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22034
22035 #undef TARGET_SET_CURRENT_FUNCTION
22036 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22037
22038 #undef TARGET_PASS_BY_REFERENCE
22039 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22040
22041 #undef TARGET_PREFERRED_RELOAD_CLASS
22042 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22043
22044 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22045 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22046
22047 #undef TARGET_PROMOTED_TYPE
22048 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22049
22050 #undef TARGET_SECONDARY_RELOAD
22051 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22052
22053 #undef TARGET_SHIFT_TRUNCATION_MASK
22054 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22055
22056 #undef TARGET_SETUP_INCOMING_VARARGS
22057 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22058
22059 #undef TARGET_STRUCT_VALUE_RTX
22060 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22061
22062 #undef TARGET_REGISTER_MOVE_COST
22063 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22064
22065 #undef TARGET_RETURN_IN_MEMORY
22066 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22067
22068 #undef TARGET_RETURN_IN_MSB
22069 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22070
22071 #undef TARGET_RTX_COSTS
22072 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22073
22074 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22075 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22076
22077 #undef TARGET_SCHED_ISSUE_RATE
22078 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22079
22080 #undef TARGET_SCHED_VARIABLE_ISSUE
22081 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22082
22083 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22084 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22085 aarch64_sched_first_cycle_multipass_dfa_lookahead
22086
22087 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22088 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22089 aarch64_first_cycle_multipass_dfa_lookahead_guard
22090
22091 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22092 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22093 aarch64_get_separate_components
22094
22095 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22096 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22097 aarch64_components_for_bb
22098
22099 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22100 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22101 aarch64_disqualify_components
22102
22103 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22104 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22105 aarch64_emit_prologue_components
22106
22107 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22108 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22109 aarch64_emit_epilogue_components
22110
22111 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22112 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22113 aarch64_set_handled_components
22114
22115 #undef TARGET_TRAMPOLINE_INIT
22116 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22117
22118 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22119 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22120
22121 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22122 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22123
22124 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22125 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22126 aarch64_builtin_support_vector_misalignment
22127
22128 #undef TARGET_ARRAY_MODE
22129 #define TARGET_ARRAY_MODE aarch64_array_mode
22130
22131 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22132 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22133
22134 #undef TARGET_VECTORIZE_ADD_STMT_COST
22135 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22136
22137 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22138 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22139 aarch64_builtin_vectorization_cost
22140
22141 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22142 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22143
22144 #undef TARGET_VECTORIZE_BUILTINS
22145 #define TARGET_VECTORIZE_BUILTINS
22146
22147 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22148 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22149 aarch64_builtin_vectorized_function
22150
22151 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22152 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22153 aarch64_autovectorize_vector_modes
22154
22155 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22156 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22157 aarch64_atomic_assign_expand_fenv
22158
22159 /* Section anchor support. */
22160
22161 #undef TARGET_MIN_ANCHOR_OFFSET
22162 #define TARGET_MIN_ANCHOR_OFFSET -256
22163
22164 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22165 byte offset; we can do much more for larger data types, but have no way
22166 to determine the size of the access. We assume accesses are aligned. */
22167 #undef TARGET_MAX_ANCHOR_OFFSET
22168 #define TARGET_MAX_ANCHOR_OFFSET 4095
22169
22170 #undef TARGET_VECTOR_ALIGNMENT
22171 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22172
22173 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22174 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22175 aarch64_vectorize_preferred_vector_alignment
22176 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22177 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22178 aarch64_simd_vector_alignment_reachable
22179
22180 /* vec_perm support. */
22181
22182 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22183 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22184 aarch64_vectorize_vec_perm_const
22185
22186 #undef TARGET_VECTORIZE_RELATED_MODE
22187 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22188 #undef TARGET_VECTORIZE_GET_MASK_MODE
22189 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22190 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22191 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22192 aarch64_empty_mask_is_expensive
22193 #undef TARGET_PREFERRED_ELSE_VALUE
22194 #define TARGET_PREFERRED_ELSE_VALUE \
22195 aarch64_preferred_else_value
22196
22197 #undef TARGET_INIT_LIBFUNCS
22198 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22199
22200 #undef TARGET_FIXED_CONDITION_CODE_REGS
22201 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22202
22203 #undef TARGET_FLAGS_REGNUM
22204 #define TARGET_FLAGS_REGNUM CC_REGNUM
22205
22206 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22207 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22208
22209 #undef TARGET_ASAN_SHADOW_OFFSET
22210 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22211
22212 #undef TARGET_LEGITIMIZE_ADDRESS
22213 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22214
22215 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22216 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22217
22218 #undef TARGET_CAN_USE_DOLOOP_P
22219 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22220
22221 #undef TARGET_SCHED_ADJUST_PRIORITY
22222 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22223
22224 #undef TARGET_SCHED_MACRO_FUSION_P
22225 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22226
22227 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22228 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22229
22230 #undef TARGET_SCHED_FUSION_PRIORITY
22231 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22232
22233 #undef TARGET_UNSPEC_MAY_TRAP_P
22234 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22235
22236 #undef TARGET_USE_PSEUDO_PIC_REG
22237 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22238
22239 #undef TARGET_PRINT_OPERAND
22240 #define TARGET_PRINT_OPERAND aarch64_print_operand
22241
22242 #undef TARGET_PRINT_OPERAND_ADDRESS
22243 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22244
22245 #undef TARGET_OPTAB_SUPPORTED_P
22246 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22247
22248 #undef TARGET_OMIT_STRUCT_RETURN_REG
22249 #define TARGET_OMIT_STRUCT_RETURN_REG true
22250
22251 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22252 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22253 aarch64_dwarf_poly_indeterminate_value
22254
22255 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
22256 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22257 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
22258
22259 #undef TARGET_HARD_REGNO_NREGS
22260 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22261 #undef TARGET_HARD_REGNO_MODE_OK
22262 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22263
22264 #undef TARGET_MODES_TIEABLE_P
22265 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22266
22267 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22268 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22269 aarch64_hard_regno_call_part_clobbered
22270
22271 #undef TARGET_INSN_CALLEE_ABI
22272 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22273
22274 #undef TARGET_CONSTANT_ALIGNMENT
22275 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22276
22277 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22278 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22279 aarch64_stack_clash_protection_alloca_probe_range
22280
22281 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22282 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22283
22284 #undef TARGET_CAN_CHANGE_MODE_CLASS
22285 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22286
22287 #undef TARGET_SELECT_EARLY_REMAT_MODES
22288 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22289
22290 #undef TARGET_SPECULATION_SAFE_VALUE
22291 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22292
22293 #undef TARGET_ESTIMATED_POLY_VALUE
22294 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22295
22296 #undef TARGET_ATTRIBUTE_TABLE
22297 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22298
22299 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22300 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22301 aarch64_simd_clone_compute_vecsize_and_simdlen
22302
22303 #undef TARGET_SIMD_CLONE_ADJUST
22304 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22305
22306 #undef TARGET_SIMD_CLONE_USABLE
22307 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22308
22309 #undef TARGET_COMP_TYPE_ATTRIBUTES
22310 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22311
22312 #undef TARGET_GET_MULTILIB_ABI_NAME
22313 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22314
22315 #undef TARGET_FNTYPE_ABI
22316 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22317
22318 #if CHECKING_P
22319 #undef TARGET_RUN_TARGET_SELFTESTS
22320 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22321 #endif /* #if CHECKING_P */
22322
22323 #undef TARGET_ASM_POST_CFI_STARTPROC
22324 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22325
22326 #undef TARGET_STRICT_ARGUMENT_NAMING
22327 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22328
22329 #undef TARGET_MD_ASM_ADJUST
22330 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22331
22332 struct gcc_target targetm = TARGET_INITIALIZER;
22333
22334 #include "gt-aarch64.h"