1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
175
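/* An illustrative sketch (not taken from a specific call site) of how the
   constructors above might be used by the immediate-classification code
   later in this file, such as aarch64_simd_valid_immediate:

     simd_immediate_info info;
     info = simd_immediate_info (QImode, 0x55);        // MOV, (LSL, 0)
     info = simd_immediate_info (SImode, 0xab,
                                 simd_immediate_info::MOV,
                                 simd_immediate_info::LSL, 8);
     info = simd_immediate_info (HImode, const0_rtx, const1_rtx);  // INDEX #0, #1

   The real callers only fill in an object like this after checking that
   the corresponding instruction can actually encode the constant.  */
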
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
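/* For example, an entry in aarch64-fusion-pairs.def along the lines of

     AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)

   (the exact spelling lives in the .def file) expands via the macro above
   into

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   so the names accepted by the "fuse=" tuning override further down line
   up with the AARCH64_FUSE_* flags used in the tune_params tables.  */
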
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (the actual costs are 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Cortex-A57 costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* X-Gene 1 costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1241
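/* A worked example of the XOR trick above: the enumerators are laid out so
   that each condition and its logical inverse differ only in bit 0, e.g.

     AARCH64_EQ (0)  ^ 1 -> AARCH64_NE (1)
     AARCH64_CS (2)  ^ 1 -> AARCH64_CC (3)
     AARCH64_GE (10) ^ 1 -> AARCH64_LT (11)

   so AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) is AARCH64_LE.  (AL and
   NV have no meaningful inverse, so the macro is not intended for them.)  */
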
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1253 caller should print an error.
1254 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1255 prints its own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
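/* As a rough illustration of how the tables above are used by
   aarch64_parse_branch_protection (declared earlier in this file): for
   -mbranch-protection=pac-ret+leaf+bti the tokens are looked up first in
   aarch64_branch_protect_types and then in aarch64_pac_ret_subtypes, so
   the handlers run in sequence:

     aarch64_handle_pac_ret_protection  ->  sign non-leaf returns, key A
     aarch64_handle_pac_ret_leaf        ->  widen signing to all functions
     aarch64_handle_bti_protection      ->  aarch64_enable_bti = 1

   The exact token-splitting logic lives in the parsing code rather than
   in these tables.  */
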
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value PATTERN. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
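/* For instance, assuming the usual ALL and VL4 entries in the
   AARCH64_FOR_SVPATTERN list, svpattern_token (AARCH64_SV_ALL) returns
   "all" and svpattern_token (AARCH64_SV_VL4) returns "vl4", matching the
   assembler spelling of the PTRUE pattern operand.  */
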
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
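/* To illustrate the sequence aarch64_gen_far_branch emits: callers pass
   BRANCH_FORMAT as the already-inverted branch mnemonic, so an
   out-of-range "tbnz x0, 3, .Ldest" ends up being output roughly as

     tbz   x0, 3, .Ltmp      // inverted test skips the far branch
     b     .Ldest            // unconditional B has a +/-128 MiB range
   .Ltmp:

   where .Ltmp is the internal label generated above (the actual label
   names come from ASM_GENERATE_INTERNAL_LABEL).  */
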
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1488 {
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1491 }
1492
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1496 {
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1502 }
1503
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1514
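/* For example, when the relevant target features are enabled,
   aarch64_classify_vector_mode below maps:

     V4SImode    -> VEC_ADVSIMD
     XImode      -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode  -> VEC_SVE_DATA
     VNx8DFmode  -> VEC_SVE_DATA | VEC_STRUCT   (an x4 tuple)
     VNx4BImode  -> VEC_SVE_PRED

   so a typical caller simply tests bits, e.g. "flags & VEC_ANY_SVE" to
   catch both SVE data and SVE predicate modes.  */
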
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1519 {
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1522
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1525
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1530 {
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1540
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1566
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1584
1585 default:
1586 return 0;
1587 }
1588 }
1589
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1594 {
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1596 }
1597
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1602 {
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1604 }
1605
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1609 {
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1614
1615 return opt_machine_mode ();
1616 }
1617
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1622 {
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1628
1629 return false;
1630 }
1631
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1634
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1637 {
1638 if (TARGET_SVE)
1639 {
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1648 }
1649 return opt_machine_mode ();
1650 }
1651
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1653
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1656 {
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1658 {
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1663 }
1664
1665 return default_get_mask_mode (nunits, nbytes);
1666 }
1667
1668 /* Return the integer element mode associated with SVE mode MODE. */
1669
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1672 {
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
1676 }
1677
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1685
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1688 {
1689 return nops == 3 ? ops[2] : ops[0];
1690 }
1691
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1693
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1696 {
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1703 {
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1717 }
1718 gcc_unreachable ();
1719 }
1720
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1722
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1725 {
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1728
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1732
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1736
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1739
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1745
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1748
1749 if (GP_REGNUM_P (regno))
1750 {
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1755 }
1756 else if (FP_REGNUM_P (regno))
1757 {
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1762 }
1763
1764 return false;
1765 }
1766
1767 /* Return true if this is a definition of a vectorized simd function. */
1768
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1771 {
1772 tree fntype;
1773
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1779
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1791
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1794 {
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1798 }
1799
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1803
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1806 {
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1810
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1819
1820 return aarch64_simd_decl_p (fndecl);
1821 }
1822
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1826
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1830 {
1831 if (aarch64_simd_call_p (insn))
1832 {
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1836 }
1837 }
1838
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1842
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1846 {
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1850 }
1851
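/* For example, a 16-byte V4SImode value in v8 is partially clobbered by
   an ordinary call (only the low 64 bits survive) but not by a call to
   an aarch64_vector_pcs function, which preserves the full 128 bits of
   v8-v23.  A variable-length SVE vector is partially clobbered in both
   cases, since it may be wider than 128 bits.  */
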
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1853
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1856 {
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1858
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1863 }
1864
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1868 {
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1877 {
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1883 }
1884 return UNITS_PER_WORD;
1885 }
1886
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1891 {
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1902 }
1903
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1907 {
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1909 }
1910
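/* For example, 0xffffffffffff0000 negates to 0x10000, an exact power of
   two, so the function returns true, whereas 0xff00ff0000000000 negates
   to a value with several bits set, exact_log2 returns HOST_WIDE_INT_M1
   and the result is false.  */
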
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1913
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1916 {
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1920 }
1921
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1926 {
1927 return false;
1928 }
1929
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1934 {
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1936 }
1937
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1940
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1943 {
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1945
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1952
1953 return false;
1954 }
1955
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1959
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1964 {
1965 HOST_WIDE_INT mult_val, extract_val;
1966
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1969
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1972
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1979
1980 return false;
1981 }
1982
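/* For example, MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode satisfy the
   checks above: 34 & ~7 is 32 (a power of two), 34 & 7 is 2, and
   4 == 1 << 2.  The extract then describes a zero-extended 32-bit value
   shifted left by 2, roughly the operand form used by
   "add x0, x1, w2, uxtw #2"-style address arithmetic.  MULT_IMM == 8
   with the same EXTRACT_IMM is rejected because 8 != 1 << 2.  */
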
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1987 {
1988 return emit_insn (gen_rtx_SET (x, y));
1989 }
1990
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1995 {
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1998
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2001 }
2002
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2004
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2008 {
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2010 {
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2014 {
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2017
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2024 }
2025 }
2026
2027 return aarch64_gen_compare_reg (code, x, y);
2028 }
2029
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2031
2032 static GTY(()) rtx tls_get_addr_libfunc;
2033
2034 rtx
2035 aarch64_tls_get_addr (void)
2036 {
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2040 }
2041
2042 /* Return the TLS model to use for ADDR. */
2043
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2046 {
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2049 {
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2054 }
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2057
2058 return tls_kind;
2059 }
2060
2061 /* We allow LO_SUMs in our legitimate addresses so that combine
2062 can take care of combining addresses where necessary, but for
2063 generation purposes we generate the address as follows:
2064
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2068 nop
2069
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2074 nop
2075
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2077
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2082
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2090
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2096
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2101 */
2102
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2106 {
2107 switch (type)
2108 {
2109 case SYMBOL_SMALL_ABSOLUTE:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2114
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2123 }
2124
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2128
2129 case SYMBOL_SMALL_GOT_28K:
2130 {
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2135
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2138 decide rtx costs, in which case pic_offset_table_rtx is not
2139 initialized. In that case there is no need to generate the first adrp
2140 instruction, as the final cost of a global variable access is
2141 one instruction. */
2142 if (gp_rtx != NULL)
2143 {
2144 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case there is only 28K of space for the GOT).
2147
2148 The generated instruction sequence for accessing a global variable
2149 is:
2150
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2152
2153 Only one instruction is needed. But we must initialize
2154 pic_offset_table_rtx properly. We generate an initialization insn for
2155 every global access, and allow CSE to remove all redundant ones.
2156
2157 The final instruction sequence will look like the following
2158 for multiple global variable accesses.
2159
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2161
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2166
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2170
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2173
2174 }
2175
2176 if (mode == ptr_mode)
2177 {
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2182
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2184 }
2185 else
2186 {
2187 gcc_assert (mode == Pmode);
2188
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2191 }
2192
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2201 }
2202
2203 case SYMBOL_SMALL_GOT_4G:
2204 {
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2212
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2217
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2220
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2223 {
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2228
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2230 }
2231 else
2232 {
2233 gcc_assert (mode == Pmode);
2234
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2237 }
2238
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2244 }
2245
2246 case SYMBOL_SMALL_TLSGD:
2247 {
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2251
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2259
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2263 }
2264
2265 case SYMBOL_SMALL_TLSDESC:
2266 {
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2270
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2272
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2280
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2283
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2288 }
2289
2290 case SYMBOL_SMALL_TLSIE:
2291 {
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2302
2303 if (mode == ptr_mode)
2304 {
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2308 {
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2311 }
2312 }
2313 else
2314 {
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2317 }
2318
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2323 }
2324
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2329 {
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2332
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2335
2336 switch (type)
2337 {
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2360 }
2361
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2365 }
2366
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2370
2371 case SYMBOL_TINY_TLSIE:
2372 {
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2375
2376 if (mode == ptr_mode)
2377 {
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2381 {
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2384 }
2385 }
2386 else
2387 {
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2390 }
2391
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 default:
2398 gcc_unreachable ();
2399 }
2400 }
2401
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2409 {
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2413 }
2414
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2416
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2419 {
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2423 }
2424
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2429 {
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2434 }
2435
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2444 {
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2447
2448 machine_mode mode = GET_MODE (dst);
2449
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2453
2454 if (REG_P (dst) && REG_P (src))
2455 {
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2458
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2461 {
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2464
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2468 }
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2470 {
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2473
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2477 }
2478 }
2479
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2484
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2487 {
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2490 }
2491 else
2492 {
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2495 }
2496 }
2497
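/* For example, when a TImode value is copied from the x0/x1 pair to the
   x1/x2 pair, the low half of the destination (x1) overlaps the high
   half of the source, so the function moves the high word first
   (x2 = x1) and the low word second (x1 = x0); the opposite order would
   overwrite x1 before it had been read.  */
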
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2500 {
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2503 }
2504
2505 /* Split a complex SIMD combine. */
2506
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2509 {
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2512
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2517
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2520 }
2521
2522 /* Split a complex SIMD move. */
2523
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2526 {
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2529
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2531
2532 if (REG_P (dst) && REG_P (src))
2533 {
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2536 }
2537 }
2538
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2542 {
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2546 }
2547
2548
2549 static rtx
2550 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2551 {
2552 if (can_create_pseudo_p ())
2553 return force_reg (mode, value);
2554 else
2555 {
2556 gcc_assert (x);
2557 aarch64_emit_move (x, value);
2558 return x;
2559 }
2560 }
2561
2562 /* Return true if predicate value X is a constant in which every element
2563 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2564 value, i.e. as a predicate in which all bits are significant. */
2565
2566 static bool
2567 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2568 {
2569 if (GET_CODE (x) != CONST_VECTOR)
2570 return false;
2571
2572 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2573 GET_MODE_NUNITS (GET_MODE (x)));
2574 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2575 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2576 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2577
2578 unsigned int nelts = const_vector_encoded_nelts (x);
2579 for (unsigned int i = 0; i < nelts; ++i)
2580 {
2581 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2582 if (!CONST_INT_P (elt))
2583 return false;
2584
2585 builder.quick_push (elt);
2586 for (unsigned int j = 1; j < factor; ++j)
2587 builder.quick_push (const0_rtx);
2588 }
2589 builder.finalize ();
2590 return true;
2591 }
2592
2593 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2594 widest predicate element size it can have (that is, the largest size
2595 for which each element would still be 0 or 1). */
2596
2597 unsigned int
2598 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2599 {
2600 /* Start with the most optimistic assumption: that we only need
2601 one bit per pattern. This is what we will use if only the first
2602 bit in each pattern is ever set. */
2603 unsigned int mask = GET_MODE_SIZE (DImode);
2604 mask |= builder.npatterns ();
2605
2606 /* Look for set bits. */
2607 unsigned int nelts = builder.encoded_nelts ();
2608 for (unsigned int i = 1; i < nelts; ++i)
2609 if (INTVAL (builder.elt (i)) != 0)
2610 {
2611 if (i & 1)
2612 return 1;
2613 mask |= i;
2614 }
2615 return mask & -mask;
2616 }
2617
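/* For example, the VNx16BI image of an all-true predicate for .s
   elements has set bits only at indices 0, 4, 8, ..., so the function
   returns 4 and the constant can be interpreted with element sizes of
   up to four bytes.  If any odd-numbered bit is set, only a byte
   interpretation is possible and the result is 1.  */
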
2618 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2619 that the constant would have with predicate element size ELT_SIZE
2620 (ignoring the upper bits in each element) and return:
2621
2622 * -1 if all bits are set
2623 * N if the predicate has N leading set bits followed by all clear bits
2624 * 0 if the predicate does not have any of these forms. */
2625
2626 int
2627 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2628 unsigned int elt_size)
2629 {
2630 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2631 followed by set bits. */
2632 if (builder.nelts_per_pattern () == 3)
2633 return 0;
2634
2635 /* Skip over leading set bits. */
2636 unsigned int nelts = builder.encoded_nelts ();
2637 unsigned int i = 0;
2638 for (; i < nelts; i += elt_size)
2639 if (INTVAL (builder.elt (i)) == 0)
2640 break;
2641 unsigned int vl = i / elt_size;
2642
2643 /* Check for the all-true case. */
2644 if (i == nelts)
2645 return -1;
2646
2647 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2648 repeating pattern of set bits followed by clear bits. */
2649 if (builder.nelts_per_pattern () != 2)
2650 return 0;
2651
2652 /* We have a "foreground" value and a duplicated "background" value.
2653 If the background might repeat and the last set bit belongs to it,
2654 we might have set bits followed by clear bits followed by set bits. */
2655 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2656 return 0;
2657
2658 /* Make sure that the rest are all clear. */
2659 for (; i < nelts; i += elt_size)
2660 if (INTVAL (builder.elt (i)) != 0)
2661 return 0;
2662
2663 return vl;
2664 }
2665
2666 /* See if there is an svpattern that encodes an SVE predicate of mode
2667 PRED_MODE in which the first VL bits are set and the rest are clear.
2668 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2669 A VL of -1 indicates an all-true vector. */
2670
2671 aarch64_svpattern
2672 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2673 {
2674 if (vl < 0)
2675 return AARCH64_SV_ALL;
2676
2677 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2678 return AARCH64_NUM_SVPATTERNS;
2679
2680 if (vl >= 1 && vl <= 8)
2681 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2682
2683 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2684 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2685
2686 int max_vl;
2687 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2688 {
2689 if (vl == (max_vl / 3) * 3)
2690 return AARCH64_SV_MUL3;
2691 /* These would only trigger for non-power-of-2 lengths. */
2692 if (vl == (max_vl & -4))
2693 return AARCH64_SV_MUL4;
2694 if (vl == (1 << floor_log2 (max_vl)))
2695 return AARCH64_SV_POW2;
2696 if (vl == max_vl)
2697 return AARCH64_SV_ALL;
2698 }
2699 return AARCH64_NUM_SVPATTERNS;
2700 }
2701
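/* For example, for VNx16BImode a VL of 7 maps to AARCH64_SV_VL7 and a
   VL of 16 to AARCH64_SV_VL16.  A VL of 15 has no dedicated VLnn
   pattern and is only representable when the element count of PRED_MODE
   is a compile-time constant, e.g. as AARCH64_SV_MUL3 when the mode is
   known to hold exactly 16 elements.  */
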
2702 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2703 bits has the lowest bit set and the upper bits clear. This is the
2704 VNx16BImode equivalent of a PTRUE for controlling elements of
2705 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2706 all bits are significant, even the upper zeros. */
2707
2708 rtx
2709 aarch64_ptrue_all (unsigned int elt_size)
2710 {
2711 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2712 builder.quick_push (const1_rtx);
2713 for (unsigned int i = 1; i < elt_size; ++i)
2714 builder.quick_push (const0_rtx);
2715 return builder.build ();
2716 }
2717
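/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BI
   pattern { 1, 0, 0, 0, ... }: every .s element of a vector governed by
   this predicate is active, and every bit of the constant, including
   the upper zeros in each element, is significant.  */
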
2718 /* Return an all-true predicate register of mode MODE. */
2719
2720 rtx
2721 aarch64_ptrue_reg (machine_mode mode)
2722 {
2723 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2724 return force_reg (mode, CONSTM1_RTX (mode));
2725 }
2726
2727 /* Return an all-false predicate register of mode MODE. */
2728
2729 rtx
2730 aarch64_pfalse_reg (machine_mode mode)
2731 {
2732 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2733 return force_reg (mode, CONST0_RTX (mode));
2734 }
2735
2736 /* Return true if we can move VALUE into a register using a single
2737 CNT[BHWD] instruction. */
2738
2739 static bool
2740 aarch64_sve_cnt_immediate_p (poly_int64 value)
2741 {
2742 HOST_WIDE_INT factor = value.coeffs[0];
2743 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2744 return (value.coeffs[1] == factor
2745 && IN_RANGE (factor, 2, 16 * 16)
2746 && (factor & 1) == 0
2747 && factor <= 16 * (factor & -factor));
2748 }
2749
2750 /* Likewise for rtx X. */
2751
2752 bool
2753 aarch64_sve_cnt_immediate_p (rtx x)
2754 {
2755 poly_int64 value;
2756 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2757 }
2758
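/* For example, poly_int64 (8, 8) -- eight for each 128 bits of vector,
   i.e. the value produced by CNTH -- satisfies the checks above,
   whereas a plain constant 8 (coefficients 8 and 0) or a value with
   mismatched coefficients does not.  */
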
2759 /* Return the asm string for an instruction with a CNT-like vector size
2760 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2761 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2762 first part of the operands template (the part that comes before the
2763 vector size itself). FACTOR is the number of quadwords.
2764 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2765 If it is zero, we can use any element size. */
2766
2767 static char *
2768 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2769 unsigned int factor,
2770 unsigned int nelts_per_vq)
2771 {
2772 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2773
2774 if (nelts_per_vq == 0)
2775 /* There is some overlap in the ranges of the four CNT instructions.
2776 Here we always use the smallest possible element size, so that the
2777 multiplier is 1 wherever possible. */
2778 nelts_per_vq = factor & -factor;
2779 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2780 gcc_assert (IN_RANGE (shift, 1, 4));
2781 char suffix = "dwhb"[shift - 1];
2782
2783 factor >>= shift;
2784 unsigned int written;
2785 if (factor == 1)
2786 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2787 prefix, suffix, operands);
2788 else
2789 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2790 prefix, suffix, operands, factor);
2791 gcc_assert (written < sizeof (buffer));
2792 return buffer;
2793 }
2794
2795 /* Return the asm string for an instruction with a CNT-like vector size
2796 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2797 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2798 first part of the operands template (the part that comes before the
2799 vector size itself). X is the value of the vector size operand,
2800 as a polynomial integer rtx. */
2801
2802 char *
2803 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2804 rtx x)
2805 {
2806 poly_int64 value = rtx_to_poly_int64 (x);
2807 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2808 return aarch64_output_sve_cnt_immediate (prefix, operands,
2809 value.coeffs[1], 0);
2810 }
2811
2812 /* Return true if we can add VALUE to a register using a single ADDVL
2813 or ADDPL instruction. */
2814
2815 static bool
2816 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2817 {
2818 HOST_WIDE_INT factor = value.coeffs[0];
2819 if (factor == 0 || value.coeffs[1] != factor)
2820 return false;
2821 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2822 and a value of 16 is one vector width. */
2823 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2824 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2825 }
2826
2827 /* Likewise for rtx X. */
2828
2829 bool
2830 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2831 {
2832 poly_int64 value;
2833 return (poly_int_rtx_p (x, &value)
2834 && aarch64_sve_addvl_addpl_immediate_p (value));
2835 }
2836
2837 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2838 and storing the result in operand 0. */
2839
2840 char *
2841 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2842 {
2843 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2844 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2845 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2846
2847 /* Use INC or DEC if possible. */
2848 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2849 {
2850 if (aarch64_sve_cnt_immediate_p (offset_value))
2851 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2852 offset_value.coeffs[1], 0);
2853 if (aarch64_sve_cnt_immediate_p (-offset_value))
2854 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2855 -offset_value.coeffs[1], 0);
2856 }
2857
2858 int factor = offset_value.coeffs[1];
2859 if ((factor & 15) == 0)
2860 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2861 else
2862 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2863 return buffer;
2864 }
2865
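/* For example, assuming operands x0 and x1, an OFFSET of one full
   vector (poly_int64 (16, 16)) prints as "addvl x0, x1, #1", or as
   "incb x0" when DEST and BASE are both x0, while minus one predicate
   width (poly_int64 (-2, -2)) prints as "addpl x0, x1, #-1".  */
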
2866 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2867 instruction. If it is, store the number of elements in each vector
2868 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2869 factor in *FACTOR_OUT (if nonnull). */
2870
2871 bool
2872 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2873 unsigned int *nelts_per_vq_out)
2874 {
2875 rtx elt;
2876 poly_int64 value;
2877
2878 if (!const_vec_duplicate_p (x, &elt)
2879 || !poly_int_rtx_p (elt, &value))
2880 return false;
2881
2882 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2883 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2884 /* There's no vector INCB. */
2885 return false;
2886
2887 HOST_WIDE_INT factor = value.coeffs[0];
2888 if (value.coeffs[1] != factor)
2889 return false;
2890
2891 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2892 if ((factor % nelts_per_vq) != 0
2893 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2894 return false;
2895
2896 if (factor_out)
2897 *factor_out = factor;
2898 if (nelts_per_vq_out)
2899 *nelts_per_vq_out = nelts_per_vq;
2900 return true;
2901 }
2902
2903 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2904 instruction. */
2905
2906 bool
2907 aarch64_sve_inc_dec_immediate_p (rtx x)
2908 {
2909 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2910 }
2911
2912 /* Return the asm template for an SVE vector INC or DEC instruction.
2913 OPERANDS gives the operands before the vector count and X is the
2914 value of the vector count operand itself. */
2915
2916 char *
2917 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2918 {
2919 int factor;
2920 unsigned int nelts_per_vq;
2921 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2922 gcc_unreachable ();
2923 if (factor < 0)
2924 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2925 nelts_per_vq);
2926 else
2927 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2928 nelts_per_vq);
2929 }
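
/* For example, with OPERANDS == "%0", a VNx4SImode duplicate of
   poly_int64 (8, 8) -- twice the number of 32-bit elements -- produces
   the template "incw\t%0, all, mul #2", and a duplicate of
   poly_int64 (-4, -4) produces "decw\t%0".  */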
2930
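/* Set DEST to the integer constant IMM of mode MODE and return the
   number of instructions required to do so.  Only emit the instructions
   if GENERATE is true; otherwise just compute the cost.  */
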
2931 static int
2932 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2933 scalar_int_mode mode)
2934 {
2935 int i;
2936 unsigned HOST_WIDE_INT val, val2, mask;
2937 int one_match, zero_match;
2938 int num_insns;
2939
2940 val = INTVAL (imm);
2941
2942 if (aarch64_move_imm (val, mode))
2943 {
2944 if (generate)
2945 emit_insn (gen_rtx_SET (dest, imm));
2946 return 1;
2947 }
2948
2949 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2950 (with XXXX non-zero). In that case check to see if the move can be done in
2951 a smaller mode. */
2952 val2 = val & 0xffffffff;
2953 if (mode == DImode
2954 && aarch64_move_imm (val2, SImode)
2955 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2956 {
2957 if (generate)
2958 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2959
2960 /* Check whether we have to emit a second instruction by seeing
2961 whether any of the upper 32 bits of the original DImode value are set. */
2962 if (val == val2)
2963 return 1;
2964
2965 i = (val >> 48) ? 48 : 32;
2966
2967 if (generate)
2968 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2969 GEN_INT ((val >> i) & 0xffff)));
2970
2971 return 2;
2972 }
2973
2974 if ((val >> 32) == 0 || mode == SImode)
2975 {
2976 if (generate)
2977 {
2978 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2979 if (mode == SImode)
2980 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2981 GEN_INT ((val >> 16) & 0xffff)));
2982 else
2983 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2984 GEN_INT ((val >> 16) & 0xffff)));
2985 }
2986 return 2;
2987 }
2988
2989 /* Remaining cases are all for DImode. */
2990
2991 mask = 0xffff;
2992 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2993 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2994 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2995 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2996
2997 if (zero_match != 2 && one_match != 2)
2998 {
2999 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3000 For a 64-bit bitmask try whether changing 16 bits to all ones or
3001 zeroes creates a valid bitmask. To check any repeated bitmask,
3002 try using 16 bits from the other 32-bit half of val. */
3003
3004 for (i = 0; i < 64; i += 16, mask <<= 16)
3005 {
3006 val2 = val & ~mask;
3007 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3008 break;
3009 val2 = val | mask;
3010 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3011 break;
3012 val2 = val2 & ~mask;
3013 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3014 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3015 break;
3016 }
3017 if (i != 64)
3018 {
3019 if (generate)
3020 {
3021 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3022 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3023 GEN_INT ((val >> i) & 0xffff)));
3024 }
3025 return 2;
3026 }
3027 }
3028
3029 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3030 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3031 otherwise skip zero bits. */
3032
3033 num_insns = 1;
3034 mask = 0xffff;
3035 val2 = one_match > zero_match ? ~val : val;
3036 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3037
3038 if (generate)
3039 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3040 ? (val | ~(mask << i))
3041 : (val & (mask << i)))));
3042 for (i += 16; i < 64; i += 16)
3043 {
3044 if ((val2 & (mask << i)) == 0)
3045 continue;
3046 if (generate)
3047 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3048 GEN_INT ((val >> i) & 0xffff)));
3049 num_insns ++;
3050 }
3051
3052 return num_insns;
3053 }
3054
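/* For example, moving 0x1234000000005678 into a DImode register (say x0)
   takes the 32-bit path above and costs two instructions, roughly:

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 48

   because the upper half has only one non-zero 16-bit chunk.  A value
   whose four 16-bit chunks are all different and unrelated generally
   falls through to the final loop and needs a MOV plus three MOVKs.  */
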
3055 /* Return whether imm is a 128-bit immediate which is simple enough to
3056 expand inline. */
3057 bool
3058 aarch64_mov128_immediate (rtx imm)
3059 {
3060 if (GET_CODE (imm) == CONST_INT)
3061 return true;
3062
3063 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3064
3065 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3066 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3067
3068 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3069 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3070 }
3071
3072
3073 /* Return the number of temporary registers that aarch64_add_offset_1
3074 would need to add OFFSET to a register. */
3075
3076 static unsigned int
3077 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3078 {
3079 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3080 }
3081
3082 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3083 a non-polynomial OFFSET. MODE is the mode of the addition.
3084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3085 be set and CFA adjustments added to the generated instructions.
3086
3087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3088 temporary if register allocation is already complete. This temporary
3089 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3090 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3091 the immediate again.
3092
3093 Since this function may be used to adjust the stack pointer, we must
3094 ensure that it cannot cause transient stack deallocation (for example
3095 by first incrementing SP and then decrementing when adjusting by a
3096 large immediate). */
3097
3098 static void
3099 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3100 rtx src, HOST_WIDE_INT offset, rtx temp1,
3101 bool frame_related_p, bool emit_move_imm)
3102 {
3103 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3104 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3105
3106 HOST_WIDE_INT moffset = abs_hwi (offset);
3107 rtx_insn *insn;
3108
3109 if (!moffset)
3110 {
3111 if (!rtx_equal_p (dest, src))
3112 {
3113 insn = emit_insn (gen_rtx_SET (dest, src));
3114 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3115 }
3116 return;
3117 }
3118
3119 /* Single instruction adjustment. */
3120 if (aarch64_uimm12_shift (moffset))
3121 {
3122 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3123 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3124 return;
3125 }
3126
3127 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3128 and either:
3129
3130 a) the offset cannot be loaded by a 16-bit move or
3131 b) there is no spare register into which we can move it. */
3132 if (moffset < 0x1000000
3133 && ((!temp1 && !can_create_pseudo_p ())
3134 || !aarch64_move_imm (moffset, mode)))
3135 {
3136 HOST_WIDE_INT low_off = moffset & 0xfff;
3137
3138 low_off = offset < 0 ? -low_off : low_off;
3139 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3140 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3141 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3142 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3143 return;
3144 }
3145
3146 /* Emit a move immediate if required and an addition/subtraction. */
3147 if (emit_move_imm)
3148 {
3149 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3150 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3151 }
3152 insn = emit_insn (offset < 0
3153 ? gen_sub3_insn (dest, src, temp1)
3154 : gen_add3_insn (dest, src, temp1));
3155 if (frame_related_p)
3156 {
3157 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3158 rtx adj = plus_constant (mode, src, offset);
3159 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3160 }
3161 }
3162
3163 /* Return the number of temporary registers that aarch64_add_offset
3164 would need to move OFFSET into a register or add OFFSET to a register;
3165 ADD_P is true if we want the latter rather than the former. */
3166
3167 static unsigned int
3168 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3169 {
3170 /* This follows the same structure as aarch64_add_offset. */
3171 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3172 return 0;
3173
3174 unsigned int count = 0;
3175 HOST_WIDE_INT factor = offset.coeffs[1];
3176 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3177 poly_int64 poly_offset (factor, factor);
3178 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3179 /* Need one register for the ADDVL/ADDPL result. */
3180 count += 1;
3181 else if (factor != 0)
3182 {
3183 factor = abs (factor);
3184 if (factor > 16 * (factor & -factor))
3185 /* Need one register for the CNT result and one for the multiplication
3186 factor. If necessary, the second temporary can be reused for the
3187 constant part of the offset. */
3188 return 2;
3189 /* Need one register for the CNT result (which might then
3190 be shifted). */
3191 count += 1;
3192 }
3193 return count + aarch64_add_offset_1_temporaries (constant);
3194 }
3195
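/* For example, adding one full vector, poly_int64 (16, 16), needs no
   temporaries because a single ADDVL can do it, whereas one vector plus
   100 bytes, poly_int64 (116, 16), needs one temporary to hold the
   ADDVL result before the constant 100 is added.  */
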
3196 /* If X can be represented as a poly_int64, return the number
3197 of temporaries that are required to add it to a register.
3198 Return -1 otherwise. */
3199
3200 int
3201 aarch64_add_offset_temporaries (rtx x)
3202 {
3203 poly_int64 offset;
3204 if (!poly_int_rtx_p (x, &offset))
3205 return -1;
3206 return aarch64_offset_temporaries (true, offset);
3207 }
3208
3209 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3210 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3211 be set and CFA adjustments added to the generated instructions.
3212
3213 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3214 temporary if register allocation is already complete. This temporary
3215 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3216 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3217 false to avoid emitting the immediate again.
3218
3219 TEMP2, if nonnull, is a second temporary register that doesn't
3220 overlap either DEST or REG.
3221
3222 Since this function may be used to adjust the stack pointer, we must
3223 ensure that it cannot cause transient stack deallocation (for example
3224 by first incrementing SP and then decrementing when adjusting by a
3225 large immediate). */
3226
3227 static void
3228 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3229 poly_int64 offset, rtx temp1, rtx temp2,
3230 bool frame_related_p, bool emit_move_imm = true)
3231 {
3232 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3233 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3234 gcc_assert (temp1 == NULL_RTX
3235 || !frame_related_p
3236 || !reg_overlap_mentioned_p (temp1, dest));
3237 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3238
3239 /* Try using ADDVL or ADDPL to add the whole value. */
3240 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3241 {
3242 rtx offset_rtx = gen_int_mode (offset, mode);
3243 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3244 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3245 return;
3246 }
3247
3248 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3249 SVE vector register, over and above the minimum size of 128 bits.
3250 This is equivalent to half the value returned by CNTD with a
3251 vector shape of ALL. */
3252 HOST_WIDE_INT factor = offset.coeffs[1];
3253 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3254
3255 /* Try using ADDVL or ADDPL to add the VG-based part. */
3256 poly_int64 poly_offset (factor, factor);
3257 if (src != const0_rtx
3258 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3259 {
3260 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3261 if (frame_related_p)
3262 {
3263 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3264 RTX_FRAME_RELATED_P (insn) = true;
3265 src = dest;
3266 }
3267 else
3268 {
3269 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3270 src = aarch64_force_temporary (mode, temp1, addr);
3271 temp1 = temp2;
3272 temp2 = NULL_RTX;
3273 }
3274 }
3275 /* Otherwise use a CNT-based sequence. */
3276 else if (factor != 0)
3277 {
3278 /* Use a subtraction if we have a negative factor. */
3279 rtx_code code = PLUS;
3280 if (factor < 0)
3281 {
3282 factor = -factor;
3283 code = MINUS;
3284 }
3285
3286 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3287 into the multiplication. */
3288 rtx val;
3289 int shift = 0;
3290 if (factor & 1)
3291 /* Use a right shift by 1. */
3292 shift = -1;
3293 else
3294 factor /= 2;
3295 HOST_WIDE_INT low_bit = factor & -factor;
3296 if (factor <= 16 * low_bit)
3297 {
3298 if (factor > 16 * 8)
3299 {
3300 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3301 the value with the minimum multiplier and shift it into
3302 position. */
3303 int extra_shift = exact_log2 (low_bit);
3304 shift += extra_shift;
3305 factor >>= extra_shift;
3306 }
3307 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3308 }
3309 else
3310 {
3311 /* Use CNTD, then multiply it by FACTOR. */
3312 val = gen_int_mode (poly_int64 (2, 2), mode);
3313 val = aarch64_force_temporary (mode, temp1, val);
3314
3315 /* Go back to using a negative multiplication factor if we have
3316 no register from which to subtract. */
3317 if (code == MINUS && src == const0_rtx)
3318 {
3319 factor = -factor;
3320 code = PLUS;
3321 }
3322 rtx coeff1 = gen_int_mode (factor, mode);
3323 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3324 val = gen_rtx_MULT (mode, val, coeff1);
3325 }
3326
3327 if (shift > 0)
3328 {
3329 /* Multiply by 1 << SHIFT. */
3330 val = aarch64_force_temporary (mode, temp1, val);
3331 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3332 }
3333 else if (shift == -1)
3334 {
3335 /* Divide by 2. */
3336 val = aarch64_force_temporary (mode, temp1, val);
3337 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3338 }
3339
3340 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3341 if (src != const0_rtx)
3342 {
3343 val = aarch64_force_temporary (mode, temp1, val);
3344 val = gen_rtx_fmt_ee (code, mode, src, val);
3345 }
3346 else if (code == MINUS)
3347 {
3348 val = aarch64_force_temporary (mode, temp1, val);
3349 val = gen_rtx_NEG (mode, val);
3350 }
3351
3352 if (constant == 0 || frame_related_p)
3353 {
3354 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3355 if (frame_related_p)
3356 {
3357 RTX_FRAME_RELATED_P (insn) = true;
3358 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3359 gen_rtx_SET (dest, plus_constant (Pmode, src,
3360 poly_offset)));
3361 }
3362 src = dest;
3363 if (constant == 0)
3364 return;
3365 }
3366 else
3367 {
3368 src = aarch64_force_temporary (mode, temp1, val);
3369 temp1 = temp2;
3370 temp2 = NULL_RTX;
3371 }
3372
3373 emit_move_imm = true;
3374 }
3375
3376 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3377 frame_related_p, emit_move_imm);
3378 }
3379
3380 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3381 than a poly_int64. */
3382
3383 void
3384 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3385 rtx offset_rtx, rtx temp1, rtx temp2)
3386 {
3387 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3388 temp1, temp2, false);
3389 }
3390
3391 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3392 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3393 if TEMP1 already contains abs (DELTA). */
3394
3395 static inline void
3396 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3397 {
3398 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3399 temp1, temp2, true, emit_move_imm);
3400 }
3401
3402 /* Subtract DELTA from the stack pointer, marking the instructions
3403 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3404 if nonnull. */
3405
3406 static inline void
3407 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3408 bool emit_move_imm = true)
3409 {
3410 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3411 temp1, temp2, frame_related_p, emit_move_imm);
3412 }
3413
3414 /* Set DEST to (vec_series BASE STEP). */
3415
3416 static void
3417 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3418 {
3419 machine_mode mode = GET_MODE (dest);
3420 scalar_mode inner = GET_MODE_INNER (mode);
3421
3422 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3423 if (!aarch64_sve_index_immediate_p (base))
3424 base = force_reg (inner, base);
3425 if (!aarch64_sve_index_immediate_p (step))
3426 step = force_reg (inner, step);
3427
3428 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3429 }
3430
3431 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3432 register of mode MODE. Use TARGET for the result if it's nonnull
3433 and convenient.
3434
3435 The two vector modes must have the same element mode. The behavior
3436 is to duplicate architectural lane N of SRC into architectural lanes
3437 N + I * STEP of the result. On big-endian targets, architectural
3438 lane 0 of an Advanced SIMD vector is the last element of the vector
3439 in memory layout, so for big-endian targets this operation has the
3440 effect of reversing SRC before duplicating it. Callers need to
3441 account for this. */
3442
3443 rtx
3444 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3445 {
3446 machine_mode src_mode = GET_MODE (src);
3447 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3448 insn_code icode = (BYTES_BIG_ENDIAN
3449 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3450 : code_for_aarch64_vec_duplicate_vq_le (mode));
3451
3452 unsigned int i = 0;
3453 expand_operand ops[3];
3454 create_output_operand (&ops[i++], target, mode);
3455 create_output_operand (&ops[i++], src, src_mode);
3456 if (BYTES_BIG_ENDIAN)
3457 {
3458 /* Create a PARALLEL describing the reversal of SRC. */
3459 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3460 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3461 nelts_per_vq - 1, -1);
3462 create_fixed_operand (&ops[i++], sel);
3463 }
3464 expand_insn (icode, i, ops);
3465 return ops[0].value;
3466 }
3467
3468 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3469 the memory image into DEST. Return true on success. */
3470
3471 static bool
3472 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3473 {
3474 src = force_const_mem (GET_MODE (src), src);
3475 if (!src)
3476 return false;
3477
3478 /* Make sure that the address is legitimate. */
3479 if (!aarch64_sve_ld1rq_operand_p (src))
3480 {
3481 rtx addr = force_reg (Pmode, XEXP (src, 0));
3482 src = replace_equiv_address (src, addr);
3483 }
3484
3485 machine_mode mode = GET_MODE (dest);
3486 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3487 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3488 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3489 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3490 return true;
3491 }
3492
3493 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3494 SVE data mode and isn't a legitimate constant. Use TARGET for the
3495 result if convenient.
3496
3497 The returned register can have whatever mode seems most natural
3498 given the contents of SRC. */
3499
3500 static rtx
3501 aarch64_expand_sve_const_vector (rtx target, rtx src)
3502 {
3503 machine_mode mode = GET_MODE (src);
3504 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3505 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3506 scalar_mode elt_mode = GET_MODE_INNER (mode);
3507 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3508 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3509
3510 if (nelts_per_pattern == 1 && encoded_bits == 128)
3511 {
3512 /* The constant is a duplicated quadword but can't be narrowed
3513 beyond a quadword. Get the memory image of the first quadword
3514 as a 128-bit vector and try using LD1RQ to load it from memory.
3515
3516 The effect for both endiannesses is to load memory lane N into
3517 architectural lanes N + I * STEP of the result. On big-endian
3518 targets, the layout of the 128-bit vector in an Advanced SIMD
3519 register would be different from its layout in an SVE register,
3520 but this 128-bit vector is a memory value only. */
3521 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3522 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3523 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3524 return target;
3525 }
3526
3527 if (nelts_per_pattern == 1 && encoded_bits < 128)
3528 {
3529 /* The vector is a repeating sequence of 64 bits or fewer.
3530 See if we can load them using an Advanced SIMD move and then
3531 duplicate it to fill a vector. This is better than using a GPR
3532 move because it keeps everything in the same register file. */
3533 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3534 rtx_vector_builder builder (vq_mode, npatterns, 1);
3535 for (unsigned int i = 0; i < npatterns; ++i)
3536 {
3537 /* We want memory lane N to go into architectural lane N,
3538 so reverse for big-endian targets. The DUP .Q pattern
3539 has a compensating reverse built-in. */
3540 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3541 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3542 }
3543 rtx vq_src = builder.build ();
3544 if (aarch64_simd_valid_immediate (vq_src, NULL))
3545 {
3546 vq_src = force_reg (vq_mode, vq_src);
3547 return aarch64_expand_sve_dupq (target, mode, vq_src);
3548 }
3549
3550 /* Get an integer representation of the repeating part of Advanced
3551 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3552 which for big-endian targets is lane-swapped wrt a normal
3553 Advanced SIMD vector. This means that for both endiannesses,
3554 memory lane N of SVE vector SRC corresponds to architectural
3555 lane N of a register holding VQ_SRC. This in turn means that
3556 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3557 as a single 128-bit value) and thus that memory lane 0 of SRC is
3558 in the lsb of the integer. Duplicating the integer therefore
3559 ensures that memory lane N of SRC goes into architectural lane
3560 N + I * INDEX of the SVE register. */
3561 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3562 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3563 if (elt_value)
3564 {
3565 /* Pretend that we had a vector of INT_MODE to start with. */
3566 elt_mode = int_mode;
3567 mode = aarch64_full_sve_mode (int_mode).require ();
3568
3569 /* If the integer can be moved into a general register by a
3570 single instruction, do that and duplicate the result. */
3571 if (CONST_INT_P (elt_value)
3572 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3573 {
3574 elt_value = force_reg (elt_mode, elt_value);
3575 return expand_vector_broadcast (mode, elt_value);
3576 }
3577 }
3578 else if (npatterns == 1)
3579 /* We're duplicating a single value, but can't do better than
3580 force it to memory and load from there. This handles things
3581 like symbolic constants. */
3582 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3583
3584 if (elt_value)
3585 {
3586 /* Load the element from memory if we can, otherwise move it into
3587 a register and use a DUP. */
3588 rtx op = force_const_mem (elt_mode, elt_value);
3589 if (!op)
3590 op = force_reg (elt_mode, elt_value);
3591 return expand_vector_broadcast (mode, op);
3592 }
3593 }
3594
3595 /* Try using INDEX. */
3596 rtx base, step;
3597 if (const_vec_series_p (src, &base, &step))
3598 {
3599 aarch64_expand_vec_series (target, base, step);
3600 return target;
3601 }
3602
3603 /* From here on, it's better to force the whole constant to memory
3604 if we can. */
3605 if (GET_MODE_NUNITS (mode).is_constant ())
3606 return NULL_RTX;
3607
3608 /* Expand each pattern individually. */
3609 gcc_assert (npatterns > 1);
3610 rtx_vector_builder builder;
3611 auto_vec<rtx, 16> vectors (npatterns);
3612 for (unsigned int i = 0; i < npatterns; ++i)
3613 {
3614 builder.new_vector (mode, 1, nelts_per_pattern);
3615 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3616 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3617 vectors.quick_push (force_reg (mode, builder.build ()));
3618 }
3619
3620 /* Use permutes to interleave the separate vectors. */
3621 while (npatterns > 1)
3622 {
3623 npatterns /= 2;
3624 for (unsigned int i = 0; i < npatterns; ++i)
3625 {
3626 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3627 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3628 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3629 vectors[i] = tmp;
3630 }
3631 }
3632 gcc_assert (vectors[0] == target);
3633 return target;
3634 }
3635
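/* Illustrative sketch (editorial, not part of GCC): a scalar model of the
   permute step in aarch64_expand_sve_const_vector above.  ZIP1 of two
   N-lane vectors interleaves their low halves; repeating that while
   halving NPATTERNS, exactly as the loop above does, turns NPATTERNS
   per-pattern vectors back into the interleaved constant, so that output
   lane L comes from pattern L % NPATTERNS.  Assumes N <= 16 and that each
   input vector repeats its pattern to fill all N lanes (the
   nelts_per_pattern <= 2 case).  */

static void
sketch_zip1 (const int *a, const int *b, int *out, unsigned int n)
{
  /* Model of SVE ZIP1: lanes a[0], b[0], a[1], b[1], ... taken from the
     low halves of A and B.  */
  for (unsigned int i = 0; i < n / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
sketch_interleave (int vecs[][16], unsigned int npatterns, unsigned int n)
{
  int tmp[16];
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
	{
	  /* Zip vector I with vector I + NPATTERNS, as the loop above
	     zips VECTORS[i] with VECTORS[i + npatterns].  */
	  sketch_zip1 (vecs[i], vecs[i + npatterns], tmp, n);
	  for (unsigned int l = 0; l < n; ++l)
	    vecs[i][l] = tmp[l];
	}
    }
}
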
3636 /* Use WHILE to set predicate register DEST so that the first VL bits
3637 are set and the rest are clear. */
3638
3639 static void
3640 aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
3641 {
3642 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3643 emit_insn (gen_while_ult (DImode, GET_MODE (dest),
3644 dest, const0_rtx, limit));
3645 }
3646
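/* Illustrative sketch (editorial, not part of GCC): the effect of the
   WHILELO emitted above on an NLANES-wide predicate.  Lanes 0 .. VL-1
   become true and the rest false, as if each lane index were compared
   against VL.  */

static void
sketch_pred_via_while (unsigned char *pred, unsigned int nlanes,
		       unsigned int vl)
{
  for (unsigned int i = 0; i < nlanes; ++i)
    pred[i] = (i < vl);
}
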
3647 /* Set DEST to immediate IMM. */
3648
3649 void
3650 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3651 {
3652 machine_mode mode = GET_MODE (dest);
3653
3654 /* Check on what type of symbol it is. */
3655 scalar_int_mode int_mode;
3656 if ((GET_CODE (imm) == SYMBOL_REF
3657 || GET_CODE (imm) == LABEL_REF
3658 || GET_CODE (imm) == CONST
3659 || GET_CODE (imm) == CONST_POLY_INT)
3660 && is_a <scalar_int_mode> (mode, &int_mode))
3661 {
3662 rtx mem;
3663 poly_int64 offset;
3664 HOST_WIDE_INT const_offset;
3665 enum aarch64_symbol_type sty;
3666
3667 /* If we have (const (plus symbol offset)), separate out the offset
3668 before we start classifying the symbol. */
3669 rtx base = strip_offset (imm, &offset);
3670
3671 /* We must always add an offset involving VL separately, rather than
3672 folding it into the relocation. */
3673 if (!offset.is_constant (&const_offset))
3674 {
3675 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3676 emit_insn (gen_rtx_SET (dest, imm));
3677 else
3678 {
3679 /* Do arithmetic on 32-bit values if the result is smaller
3680 than that. */
3681 if (partial_subreg_p (int_mode, SImode))
3682 {
3683 /* It is invalid to do symbol calculations in modes
3684 narrower than SImode. */
3685 gcc_assert (base == const0_rtx);
3686 dest = gen_lowpart (SImode, dest);
3687 int_mode = SImode;
3688 }
3689 if (base != const0_rtx)
3690 {
3691 base = aarch64_force_temporary (int_mode, dest, base);
3692 aarch64_add_offset (int_mode, dest, base, offset,
3693 NULL_RTX, NULL_RTX, false);
3694 }
3695 else
3696 aarch64_add_offset (int_mode, dest, base, offset,
3697 dest, NULL_RTX, false);
3698 }
3699 return;
3700 }
3701
3702 sty = aarch64_classify_symbol (base, const_offset);
3703 switch (sty)
3704 {
3705 case SYMBOL_FORCE_TO_MEM:
3706 if (const_offset != 0
3707 && targetm.cannot_force_const_mem (int_mode, imm))
3708 {
3709 gcc_assert (can_create_pseudo_p ());
3710 base = aarch64_force_temporary (int_mode, dest, base);
3711 aarch64_add_offset (int_mode, dest, base, const_offset,
3712 NULL_RTX, NULL_RTX, false);
3713 return;
3714 }
3715
3716 mem = force_const_mem (ptr_mode, imm);
3717 gcc_assert (mem);
3718
3719 /* If we aren't generating PC relative literals, then
3720 we need to expand the literal pool access carefully.
3721 This is something that needs to be done in a number
3722 of places, so could well live as a separate function. */
3723 if (!aarch64_pcrelative_literal_loads)
3724 {
3725 gcc_assert (can_create_pseudo_p ());
3726 base = gen_reg_rtx (ptr_mode);
3727 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3728 if (ptr_mode != Pmode)
3729 base = convert_memory_address (Pmode, base);
3730 mem = gen_rtx_MEM (ptr_mode, base);
3731 }
3732
3733 if (int_mode != ptr_mode)
3734 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3735
3736 emit_insn (gen_rtx_SET (dest, mem));
3737
3738 return;
3739
3740 case SYMBOL_SMALL_TLSGD:
3741 case SYMBOL_SMALL_TLSDESC:
3742 case SYMBOL_SMALL_TLSIE:
3743 case SYMBOL_SMALL_GOT_28K:
3744 case SYMBOL_SMALL_GOT_4G:
3745 case SYMBOL_TINY_GOT:
3746 case SYMBOL_TINY_TLSIE:
3747 if (const_offset != 0)
3748 {
3749 gcc_assert (can_create_pseudo_p ());
3750 base = aarch64_force_temporary (int_mode, dest, base);
3751 aarch64_add_offset (int_mode, dest, base, const_offset,
3752 NULL_RTX, NULL_RTX, false);
3753 return;
3754 }
3755 /* FALLTHRU */
3756
3757 case SYMBOL_SMALL_ABSOLUTE:
3758 case SYMBOL_TINY_ABSOLUTE:
3759 case SYMBOL_TLSLE12:
3760 case SYMBOL_TLSLE24:
3761 case SYMBOL_TLSLE32:
3762 case SYMBOL_TLSLE48:
3763 aarch64_load_symref_appropriately (dest, imm, sty);
3764 return;
3765
3766 default:
3767 gcc_unreachable ();
3768 }
3769 }
3770
3771 if (!CONST_INT_P (imm))
3772 {
3773 if (GET_CODE (imm) == HIGH
3774 || aarch64_simd_valid_immediate (imm, NULL))
3775 {
3776 emit_insn (gen_rtx_SET (dest, imm));
3777 return;
3778 }
3779
3780 rtx_vector_builder builder;
3781 if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
3782 && aarch64_get_sve_pred_bits (builder, imm))
3783 {
3784 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3785 int vl = aarch64_partial_ptrue_length (builder, elt_size);
3786 if (vl > 0)
3787 {
3788 aarch64_sve_move_pred_via_while (dest, vl);
3789 return;
3790 }
3791 }
3792
3793 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
3794 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
3795 {
3796 if (dest != res)
3797 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
3798 return;
3799 }
3800
3801 rtx mem = force_const_mem (mode, imm);
3802 gcc_assert (mem);
3803 emit_move_insn (dest, mem);
3804 return;
3805 }
3806
3807 aarch64_internal_mov_immediate (dest, imm, true,
3808 as_a <scalar_int_mode> (mode));
3809 }
3810
3811 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3812 that is known to contain PTRUE. */
3813
3814 void
3815 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3816 {
3817 expand_operand ops[3];
3818 machine_mode mode = GET_MODE (dest);
3819 create_output_operand (&ops[0], dest, mode);
3820 create_input_operand (&ops[1], pred, GET_MODE (pred));
3821 create_input_operand (&ops[2], src, mode);
3822 temporary_volatile_ok v (true);
3823 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3824 }
3825
3826 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3827 operand is in memory. In this case we need to use the predicated LD1
3828 and ST1 instead of LDR and STR, both for correctness on big-endian
3829 targets and because LD1 and ST1 support a wider range of addressing modes.
3830 PRED_MODE is the mode of the predicate.
3831
3832 See the comment at the head of aarch64-sve.md for details about the
3833 big-endian handling. */
3834
3835 void
3836 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3837 {
3838 machine_mode mode = GET_MODE (dest);
3839 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3840 if (!register_operand (src, mode)
3841 && !register_operand (dest, mode))
3842 {
3843 rtx tmp = gen_reg_rtx (mode);
3844 if (MEM_P (src))
3845 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3846 else
3847 emit_move_insn (tmp, src);
3848 src = tmp;
3849 }
3850 aarch64_emit_sve_pred_move (dest, ptrue, src);
3851 }
3852
3853 /* Called only on big-endian targets. See whether an SVE vector move
3854 from SRC to DEST is effectively a REV[BHW] instruction, because at
3855 least one operand is a subreg of an SVE vector that has wider or
3856 narrower elements. Return true and emit the instruction if so.
3857
3858 For example:
3859
3860 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3861
3862 represents a VIEW_CONVERT between the following vectors, viewed
3863 in memory order:
3864
3865 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3866 R1: { [0], [1], [2], [3], ... }
3867
3868 The high part of lane X in R2 should therefore correspond to lane X*2
3869 of R1, but the register representations are:
3870
3871 msb lsb
3872 R2: ...... [1].high [1].low [0].high [0].low
3873 R1: ...... [3] [2] [1] [0]
3874
3875 where the low part of lane X in R2 instead corresponds to lane X*2 in R1.
3876 We therefore need a reverse operation to swap the high and low values
3877 around.
3878
3879 This is purely an optimization. Without it we would spill the
3880 subreg operand to the stack in one mode and reload it in the
3881 other mode, which has the same effect as the REV. */
3882
3883 bool
3884 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3885 {
3886 gcc_assert (BYTES_BIG_ENDIAN);
3887 if (GET_CODE (dest) == SUBREG)
3888 dest = SUBREG_REG (dest);
3889 if (GET_CODE (src) == SUBREG)
3890 src = SUBREG_REG (src);
3891
3892 /* The optimization handles two single SVE REGs with different element
3893 sizes. */
3894 if (!REG_P (dest)
3895 || !REG_P (src)
3896 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3897 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3898 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3899 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3900 return false;
3901
3902 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3903 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3904 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3905 UNSPEC_REV_SUBREG);
3906 emit_insn (gen_rtx_SET (dest, unspec));
3907 return true;
3908 }
3909
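/* Illustrative sketch (editorial, not part of GCC): a byte-level model of
   the REV[BHW] that the pattern emitted above performs.  On big-endian
   targets, reinterpreting an SVE register between element sizes amounts
   to reversing the NARROW-byte units within each WIDE-byte element.  BUF
   holds NBYTES bytes in register order; WIDE and NARROW are the two
   element sizes in bytes, with WIDE a multiple of NARROW and NBYTES a
   multiple of WIDE.  */

static void
sketch_rev_subreg (unsigned char *buf, unsigned int nbytes,
		   unsigned int wide, unsigned int narrow)
{
  for (unsigned int elt = 0; elt < nbytes; elt += wide)
    /* Reverse the NARROW-byte units of this WIDE-byte element.  */
    for (unsigned int lo = 0, hi = wide - narrow; lo < hi;
	 lo += narrow, hi -= narrow)
      for (unsigned int b = 0; b < narrow; ++b)
	{
	  unsigned char tmp = buf[elt + lo + b];
	  buf[elt + lo + b] = buf[elt + hi + b];
	  buf[elt + hi + b] = tmp;
	}
}
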
3910 /* Return a copy of X with mode MODE, without changing its other
3911 attributes. Unlike gen_lowpart, this doesn't care whether the
3912 mode change is valid. */
3913
3914 static rtx
3915 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3916 {
3917 if (GET_MODE (x) == mode)
3918 return x;
3919
3920 x = shallow_copy_rtx (x);
3921 set_mode_and_regno (x, mode, REGNO (x));
3922 return x;
3923 }
3924
3925 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3926 operands. */
3927
3928 void
3929 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3930 {
3931 /* Decide which REV operation we need. The mode with narrower elements
3932 determines the mode of the operands and the mode with the wider
3933 elements determines the reverse width. */
3934 machine_mode mode_with_wider_elts = GET_MODE (dest);
3935 machine_mode mode_with_narrower_elts = GET_MODE (src);
3936 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3937 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3938 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3939
3940 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3941 unsigned int unspec;
3942 if (wider_bytes == 8)
3943 unspec = UNSPEC_REV64;
3944 else if (wider_bytes == 4)
3945 unspec = UNSPEC_REV32;
3946 else if (wider_bytes == 2)
3947 unspec = UNSPEC_REV16;
3948 else
3949 gcc_unreachable ();
3950 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3951
3952 /* Emit:
3953
3954 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3955 UNSPEC_MERGE_PTRUE))
3956
3957 with the appropriate modes. */
3958 ptrue = gen_lowpart (pred_mode, ptrue);
3959 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3960 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3961 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3962 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3963 UNSPEC_MERGE_PTRUE);
3964 emit_insn (gen_rtx_SET (dest, src));
3965 }
3966
3967 static bool
3968 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3969 tree exp ATTRIBUTE_UNUSED)
3970 {
3971 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3972 return false;
3973
3974 return true;
3975 }
3976
3977 /* Implement TARGET_PASS_BY_REFERENCE. */
3978
3979 static bool
3980 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3981 machine_mode mode,
3982 const_tree type,
3983 bool named ATTRIBUTE_UNUSED)
3984 {
3985 HOST_WIDE_INT size;
3986 machine_mode dummymode;
3987 int nregs;
3988
3989 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3990 if (mode == BLKmode && type)
3991 size = int_size_in_bytes (type);
3992 else
3993 /* No frontends can create types with variable-sized modes, so we
3994 shouldn't be asked to pass or return them. */
3995 size = GET_MODE_SIZE (mode).to_constant ();
3996
3997 /* Aggregates are passed by reference based on their size. */
3998 if (type && AGGREGATE_TYPE_P (type))
3999 {
4000 size = int_size_in_bytes (type);
4001 }
4002
4003 /* Variable-sized arguments are always passed by reference. */
4004 if (size < 0)
4005 return true;
4006
4007 /* Can this be a candidate to be passed in fp/simd register(s)? */
4008 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4009 &dummymode, &nregs,
4010 NULL))
4011 return false;
4012
4013 /* Arguments which are variable-sized or larger than 2 registers are
4014 passed by reference unless they are a homogeneous floating-point
4015 aggregate. */
4016 return size > 2 * UNITS_PER_WORD;
4017 }
4018
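/* Illustrative sketch (editorial, not part of GCC): the AAPCS64 decision
   made by aarch64_pass_by_reference above, reduced to plain integers.
   SIZE is the argument size in bytes (negative for variable-sized types)
   and IS_HFA_OR_HVA says whether the type is a homogeneous
   floating-point or short-vector aggregate; 16 stands for
   2 * UNITS_PER_WORD on AArch64.  */

static int
sketch_pass_by_reference (long size, int is_hfa_or_hva)
{
  if (size < 0)
    return 1;		/* Variable-sized: always by reference.  */
  if (is_hfa_or_hva)
    return 0;		/* HFA/HVA: candidate for FP/SIMD registers.  */
  return size > 16;	/* Larger than two registers: by reference.  */
}
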
4019 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4020 static bool
4021 aarch64_return_in_msb (const_tree valtype)
4022 {
4023 machine_mode dummy_mode;
4024 int dummy_int;
4025
4026 /* Never happens in little-endian mode. */
4027 if (!BYTES_BIG_ENDIAN)
4028 return false;
4029
4030 /* Only composite types smaller than or equal to 16 bytes can
4031 be potentially returned in registers. */
4032 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4033 || int_size_in_bytes (valtype) <= 0
4034 || int_size_in_bytes (valtype) > 16)
4035 return false;
4036
4037 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4038 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4039 is always passed/returned in the least significant bits of fp/simd
4040 register(s). */
4041 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4042 &dummy_mode, &dummy_int, NULL))
4043 return false;
4044
4045 return true;
4046 }
4047
4048 /* Implement TARGET_FUNCTION_VALUE.
4049 Define how to find the value returned by a function. */
4050
4051 static rtx
4052 aarch64_function_value (const_tree type, const_tree func,
4053 bool outgoing ATTRIBUTE_UNUSED)
4054 {
4055 machine_mode mode;
4056 int unsignedp;
4057 int count;
4058 machine_mode ag_mode;
4059
4060 mode = TYPE_MODE (type);
4061 if (INTEGRAL_TYPE_P (type))
4062 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4063
4064 if (aarch64_return_in_msb (type))
4065 {
4066 HOST_WIDE_INT size = int_size_in_bytes (type);
4067
4068 if (size % UNITS_PER_WORD != 0)
4069 {
4070 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4071 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4072 }
4073 }
4074
4075 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4076 &ag_mode, &count, NULL))
4077 {
4078 if (!aarch64_composite_type_p (type, mode))
4079 {
4080 gcc_assert (count == 1 && mode == ag_mode);
4081 return gen_rtx_REG (mode, V0_REGNUM);
4082 }
4083 else
4084 {
4085 int i;
4086 rtx par;
4087
4088 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4089 for (i = 0; i < count; i++)
4090 {
4091 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4092 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4093 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4094 XVECEXP (par, 0, i) = tmp;
4095 }
4096 return par;
4097 }
4098 }
4099 else
4100 return gen_rtx_REG (mode, R0_REGNUM);
4101 }
4102
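/* Illustrative sketch (editorial, not part of GCC): the big-endian
   "return in MSB" size adjustment performed above.  A small composite
   returned in the most significant bits is widened to a whole number of
   8-byte words so that the padding sits below the value; for example a
   12-byte struct is treated as a 16-byte value.  */

static unsigned int
sketch_msb_return_size (unsigned int size_in_bytes)
{
  const unsigned int word = 8;		/* UNITS_PER_WORD on AArch64.  */
  if (size_in_bytes % word != 0)
    size_in_bytes += word - size_in_bytes % word;
  return size_in_bytes;
}
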
4103 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4104 Return true if REGNO is the number of a hard register in which the values
4105 of called function may come back. */
4106
4107 static bool
4108 aarch64_function_value_regno_p (const unsigned int regno)
4109 {
4110 /* Maximum of 16 bytes can be returned in the general registers. Examples
4111 of 16-byte return values are: 128-bit integers and 16-byte small
4112 structures (excluding homogeneous floating-point aggregates). */
4113 if (regno == R0_REGNUM || regno == R1_REGNUM)
4114 return true;
4115
4116 /* Up to four fp/simd registers can return a function value, e.g. a
4117 homogeneous floating-point aggregate having four members. */
4118 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4119 return TARGET_FLOAT;
4120
4121 return false;
4122 }
4123
4124 /* Implement TARGET_RETURN_IN_MEMORY.
4125
4126 If the type T of the result of a function is such that
4127 void func (T arg)
4128 would require that arg be passed as a value in a register (or set of
4129 registers) according to the parameter passing rules, then the result
4130 is returned in the same registers as would be used for such an
4131 argument. */
4132
4133 static bool
4134 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4135 {
4136 HOST_WIDE_INT size;
4137 machine_mode ag_mode;
4138 int count;
4139
4140 if (!AGGREGATE_TYPE_P (type)
4141 && TREE_CODE (type) != COMPLEX_TYPE
4142 && TREE_CODE (type) != VECTOR_TYPE)
4143 /* Simple scalar types are always returned in registers. */
4144 return false;
4145
4146 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4147 type,
4148 &ag_mode,
4149 &count,
4150 NULL))
4151 return false;
4152
4153 /* Types larger than 2 registers are returned in memory. */
4154 size = int_size_in_bytes (type);
4155 return (size < 0 || size > 2 * UNITS_PER_WORD);
4156 }
4157
4158 static bool
4159 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4160 const_tree type, int *nregs)
4161 {
4162 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4163 return aarch64_vfp_is_call_or_return_candidate (mode,
4164 type,
4165 &pcum->aapcs_vfp_rmode,
4166 nregs,
4167 NULL);
4168 }
4169
4170 /* Given MODE and TYPE of a function argument, return the alignment in
4171 bits. The idea is to suppress any stronger alignment requested by
4172 the user and opt for the natural alignment (specified in AAPCS64 \S
4173 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4174 calculated in versions of GCC prior to GCC-9. This is a helper
4175 function for local use only. */
4176
4177 static unsigned int
4178 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4179 bool *abi_break)
4180 {
4181 *abi_break = false;
4182 if (!type)
4183 return GET_MODE_ALIGNMENT (mode);
4184
4185 if (integer_zerop (TYPE_SIZE (type)))
4186 return 0;
4187
4188 gcc_assert (TYPE_MODE (type) == mode);
4189
4190 if (!AGGREGATE_TYPE_P (type))
4191 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4192
4193 if (TREE_CODE (type) == ARRAY_TYPE)
4194 return TYPE_ALIGN (TREE_TYPE (type));
4195
4196 unsigned int alignment = 0;
4197 unsigned int bitfield_alignment = 0;
4198 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4199 if (TREE_CODE (field) == FIELD_DECL)
4200 {
4201 alignment = std::max (alignment, DECL_ALIGN (field));
4202 if (DECL_BIT_FIELD_TYPE (field))
4203 bitfield_alignment
4204 = std::max (bitfield_alignment,
4205 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4206 }
4207
4208 if (bitfield_alignment > alignment)
4209 {
4210 *abi_break = true;
4211 return bitfield_alignment;
4212 }
4213
4214 return alignment;
4215 }
4216
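/* Illustrative sketch (editorial, not part of GCC): the record walk in
   aarch64_function_arg_alignment above, reduced to arrays.
   FIELD_ALIGN[I] is DECL_ALIGN of field I and BITFIELD_TYPE_ALIGN[I] is
   the alignment of the declaring type when field I is a bit-field
   (0 otherwise).  The return value is the alignment AAPCS64 uses;
   *ABI_BREAK is set when GCC 8 and earlier would have used the smaller
   value because they ignored the bit-field's declared type.  */

static unsigned int
sketch_arg_alignment (const unsigned int *field_align,
		      const unsigned int *bitfield_type_align,
		      unsigned int nfields, int *abi_break)
{
  unsigned int alignment = 0, bitfield_alignment = 0;
  *abi_break = 0;
  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (field_align[i] > alignment)
	alignment = field_align[i];
      if (bitfield_type_align[i] > bitfield_alignment)
	bitfield_alignment = bitfield_type_align[i];
    }
  if (bitfield_alignment > alignment)
    {
      *abi_break = 1;
      return bitfield_alignment;
    }
  return alignment;
}
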
4217 /* Layout a function argument according to the AAPCS64 rules. The rule
4218 numbers refer to the rule numbers in the AAPCS64. */
4219
4220 static void
4221 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4222 const_tree type,
4223 bool named ATTRIBUTE_UNUSED)
4224 {
4225 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4226 int ncrn, nvrn, nregs;
4227 bool allocate_ncrn, allocate_nvrn;
4228 HOST_WIDE_INT size;
4229 bool abi_break;
4230
4231 /* We need to do this once per argument. */
4232 if (pcum->aapcs_arg_processed)
4233 return;
4234
4235 pcum->aapcs_arg_processed = true;
4236
4237 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4238 if (type)
4239 size = int_size_in_bytes (type);
4240 else
4241 /* No frontends can create types with variable-sized modes, so we
4242 shouldn't be asked to pass or return them. */
4243 size = GET_MODE_SIZE (mode).to_constant ();
4244 size = ROUND_UP (size, UNITS_PER_WORD);
4245
4246 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4247 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4248 mode,
4249 type,
4250 &nregs);
4251
4252 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4253 The following code thus handles passing by SIMD/FP registers first. */
4254
4255 nvrn = pcum->aapcs_nvrn;
4256
4257 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4258 and homogeneous short-vector aggregates (HVA). */
4259 if (allocate_nvrn)
4260 {
4261 if (!TARGET_FLOAT)
4262 aarch64_err_no_fpadvsimd (mode);
4263
4264 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4265 {
4266 pcum->aapcs_nextnvrn = nvrn + nregs;
4267 if (!aarch64_composite_type_p (type, mode))
4268 {
4269 gcc_assert (nregs == 1);
4270 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4271 }
4272 else
4273 {
4274 rtx par;
4275 int i;
4276 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4277 for (i = 0; i < nregs; i++)
4278 {
4279 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4280 V0_REGNUM + nvrn + i);
4281 rtx offset = gen_int_mode
4282 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4283 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4284 XVECEXP (par, 0, i) = tmp;
4285 }
4286 pcum->aapcs_reg = par;
4287 }
4288 return;
4289 }
4290 else
4291 {
4292 /* C.3 NSRN is set to 8. */
4293 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4294 goto on_stack;
4295 }
4296 }
4297
4298 ncrn = pcum->aapcs_ncrn;
4299 nregs = size / UNITS_PER_WORD;
4300
4301 /* C6 - C9, though the sign and zero extension semantics are
4302 handled elsewhere. This is the case where the argument fits
4303 entirely in general registers. */
4304 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4305 {
4306 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4307
4308 /* C.8 if the argument has an alignment of 16 then the NGRN is
4309 rounded up to the next even number. */
4310 if (nregs == 2
4311 && ncrn % 2
4312 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4313 comparison is there because for > 16 * BITS_PER_UNIT
4314 alignment nregs should be > 2 and therefore it should be
4315 passed by reference rather than value. */
4316 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4317 == 16 * BITS_PER_UNIT))
4318 {
4319 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4320 inform (input_location, "parameter passing for argument of type "
4321 "%qT changed in GCC 9.1", type);
4322 ++ncrn;
4323 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4324 }
4325
4326 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4327 A reg is still generated for it, but the caller should be smart
4328 enough not to use it. */
4329 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4330 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4331 else
4332 {
4333 rtx par;
4334 int i;
4335
4336 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4337 for (i = 0; i < nregs; i++)
4338 {
4339 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4340 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4341 GEN_INT (i * UNITS_PER_WORD));
4342 XVECEXP (par, 0, i) = tmp;
4343 }
4344 pcum->aapcs_reg = par;
4345 }
4346
4347 pcum->aapcs_nextncrn = ncrn + nregs;
4348 return;
4349 }
4350
4351 /* C.11 */
4352 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4353
4354 /* The argument is passed on the stack; record the needed number of words
4355 for this argument and align the total size if necessary. */
4356 on_stack:
4357 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4358
4359 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4360 == 16 * BITS_PER_UNIT)
4361 {
4362 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4363 if (pcum->aapcs_stack_size != new_size)
4364 {
4365 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4366 inform (input_location, "parameter passing for argument of type "
4367 "%qT changed in GCC 9.1", type);
4368 pcum->aapcs_stack_size = new_size;
4369 }
4370 }
4371 return;
4372 }
4373
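/* Illustrative sketch (editorial, not part of GCC): rule C.8 as applied
   above.  NCRN is the next core register number (x0 = 0), SIZE_BYTES the
   rounded argument size and ALIGN_BITS its AAPCS64 alignment.  The
   return value is the register the argument actually starts in: a
   16-byte-aligned two-register argument skips an odd register so that it
   lands on an even/odd pair.  */

static unsigned int
sketch_round_ngrn (unsigned int ncrn, unsigned int size_bytes,
		   unsigned int align_bits)
{
  unsigned int nregs = (size_bytes + 7) / 8;	/* ROUND_UP to 8-byte words.  */
  if (nregs == 2 && (ncrn % 2) != 0 && align_bits == 128)
    ++ncrn;
  return ncrn;
}
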
4374 /* Implement TARGET_FUNCTION_ARG. */
4375
4376 static rtx
4377 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4378 const_tree type, bool named)
4379 {
4380 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4381 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4382
4383 if (mode == VOIDmode)
4384 return NULL_RTX;
4385
4386 aarch64_layout_arg (pcum_v, mode, type, named);
4387 return pcum->aapcs_reg;
4388 }
4389
4390 void
4391 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4392 const_tree fntype ATTRIBUTE_UNUSED,
4393 rtx libname ATTRIBUTE_UNUSED,
4394 const_tree fndecl ATTRIBUTE_UNUSED,
4395 unsigned n_named ATTRIBUTE_UNUSED)
4396 {
4397 pcum->aapcs_ncrn = 0;
4398 pcum->aapcs_nvrn = 0;
4399 pcum->aapcs_nextncrn = 0;
4400 pcum->aapcs_nextnvrn = 0;
4401 pcum->pcs_variant = ARM_PCS_AAPCS64;
4402 pcum->aapcs_reg = NULL_RTX;
4403 pcum->aapcs_arg_processed = false;
4404 pcum->aapcs_stack_words = 0;
4405 pcum->aapcs_stack_size = 0;
4406
4407 if (!TARGET_FLOAT
4408 && fndecl && TREE_PUBLIC (fndecl)
4409 && fntype && fntype != error_mark_node)
4410 {
4411 const_tree type = TREE_TYPE (fntype);
4412 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4413 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4414 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4415 &mode, &nregs, NULL))
4416 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4417 }
4418 return;
4419 }
4420
4421 static void
4422 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4423 machine_mode mode,
4424 const_tree type,
4425 bool named)
4426 {
4427 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4428 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4429 {
4430 aarch64_layout_arg (pcum_v, mode, type, named);
4431 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4432 != (pcum->aapcs_stack_words != 0));
4433 pcum->aapcs_arg_processed = false;
4434 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4435 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4436 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4437 pcum->aapcs_stack_words = 0;
4438 pcum->aapcs_reg = NULL_RTX;
4439 }
4440 }
4441
4442 bool
4443 aarch64_function_arg_regno_p (unsigned regno)
4444 {
4445 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4446 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4447 }
4448
4449 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4450 PARM_BOUNDARY bits of alignment, but will be given anything up
4451 to STACK_BOUNDARY bits if the type requires it. This makes sure
4452 that both before and after the layout of each argument, the Next
4453 Stacked Argument Address (NSAA) will have a minimum alignment of
4454 8 bytes. */
4455
4456 static unsigned int
4457 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4458 {
4459 bool abi_break;
4460 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4461 &abi_break);
4462 if (abi_break && warn_psabi)
4463 inform (input_location, "parameter passing for argument of type "
4464 "%qT changed in GCC 9.1", type);
4465
4466 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4467 }
4468
4469 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4470
4471 static fixed_size_mode
4472 aarch64_get_reg_raw_mode (int regno)
4473 {
4474 if (TARGET_SVE && FP_REGNUM_P (regno))
4475 /* Don't use the SVE part of the register for __builtin_apply and
4476 __builtin_return. The SVE registers aren't used by the normal PCS,
4477 so using them there would be a waste of time. The PCS extensions
4478 for SVE types are fundamentally incompatible with the
4479 __builtin_return/__builtin_apply interface. */
4480 return as_a <fixed_size_mode> (V16QImode);
4481 return default_get_reg_raw_mode (regno);
4482 }
4483
4484 /* Implement TARGET_FUNCTION_ARG_PADDING.
4485
4486 Small aggregate types are placed in the lowest memory address.
4487
4488 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4489
4490 static pad_direction
4491 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4492 {
4493 /* On little-endian targets, the least significant byte of every stack
4494 argument is passed at the lowest byte address of the stack slot. */
4495 if (!BYTES_BIG_ENDIAN)
4496 return PAD_UPWARD;
4497
4498 /* Otherwise, integral, floating-point and pointer types are padded downward:
4499 the least significant byte of a stack argument is passed at the highest
4500 byte address of the stack slot. */
4501 if (type
4502 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4503 || POINTER_TYPE_P (type))
4504 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4505 return PAD_DOWNWARD;
4506
4507 /* Everything else is padded upward, i.e. data in first byte of stack slot. */
4508 return PAD_UPWARD;
4509 }
4510
4511 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4512
4513 It specifies the padding for the last (possibly the only) element
4514 of a block move between registers and memory. Viewing the block as
4515 it sits in memory, padding upward means that the last element is
4516 padded beyond its most significant byte, while padding downward
4517 means that the last element is padded on its least significant
4518 byte side.
4519
4520 Small aggregates and small complex types are always padded
4521 upwards.
4522
4523 We don't need to worry about homogeneous floating-point or
4524 short-vector aggregates; their move is not affected by the
4525 padding direction determined here. Regardless of endianness,
4526 each element of such an aggregate is put in the least
4527 significant bits of a fp/simd register.
4528
4529 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4530 register has useful data, and return the opposite if the most
4531 significant byte does. */
4532
4533 bool
4534 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4535 bool first ATTRIBUTE_UNUSED)
4536 {
4537
4538 /* Small composite types are always padded upward. */
4539 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4540 {
4541 HOST_WIDE_INT size;
4542 if (type)
4543 size = int_size_in_bytes (type);
4544 else
4545 /* No frontends can create types with variable-sized modes, so we
4546 shouldn't be asked to pass or return them. */
4547 size = GET_MODE_SIZE (mode).to_constant ();
4548 if (size < 2 * UNITS_PER_WORD)
4549 return true;
4550 }
4551
4552 /* Otherwise, use the default padding. */
4553 return !BYTES_BIG_ENDIAN;
4554 }
4555
4556 static scalar_int_mode
4557 aarch64_libgcc_cmp_return_mode (void)
4558 {
4559 return SImode;
4560 }
4561
4562 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4563
4564 /* We use the 12-bit shifted immediate arithmetic instructions so values
4565 must be multiple of (1 << 12), i.e. 4096. */
4566 #define ARITH_FACTOR 4096
4567
4568 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4569 #error Cannot use simple address calculation for stack probing
4570 #endif
4571
4572 /* The pair of scratch registers used for stack probing. */
4573 #define PROBE_STACK_FIRST_REG R9_REGNUM
4574 #define PROBE_STACK_SECOND_REG R10_REGNUM
4575
4576 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4577 inclusive. These are offsets from the current stack pointer. */
4578
4579 static void
4580 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4581 {
4582 HOST_WIDE_INT size;
4583 if (!poly_size.is_constant (&size))
4584 {
4585 sorry ("stack probes for SVE frames");
4586 return;
4587 }
4588
4589 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4590
4591 /* See the same assertion on PROBE_INTERVAL above. */
4592 gcc_assert ((first % ARITH_FACTOR) == 0);
4593
4594 /* See if we have a constant small number of probes to generate. If so,
4595 that's the easy case. */
4596 if (size <= PROBE_INTERVAL)
4597 {
4598 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4599
4600 emit_set_insn (reg1,
4601 plus_constant (Pmode,
4602 stack_pointer_rtx, -(first + base)));
4603 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4604 }
4605
4606 /* The run-time loop is made up of 8 insns in the generic case while the
4607 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4608 else if (size <= 4 * PROBE_INTERVAL)
4609 {
4610 HOST_WIDE_INT i, rem;
4611
4612 emit_set_insn (reg1,
4613 plus_constant (Pmode,
4614 stack_pointer_rtx,
4615 -(first + PROBE_INTERVAL)));
4616 emit_stack_probe (reg1);
4617
4618 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4619 it exceeds SIZE. If only two probes are needed, this will not
4620 generate any code. Then probe at FIRST + SIZE. */
4621 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4622 {
4623 emit_set_insn (reg1,
4624 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4625 emit_stack_probe (reg1);
4626 }
4627
4628 rem = size - (i - PROBE_INTERVAL);
4629 if (rem > 256)
4630 {
4631 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4632
4633 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4634 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4635 }
4636 else
4637 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4638 }
4639
4640 /* Otherwise, do the same as above, but in a loop. Note that we must be
4641 extra careful with variables wrapping around because we might be at
4642 the very top (or the very bottom) of the address space and we have
4643 to be able to handle this case properly; in particular, we use an
4644 equality test for the loop condition. */
4645 else
4646 {
4647 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4648
4649 /* Step 1: round SIZE to the previous multiple of the interval. */
4650
4651 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4652
4653
4654 /* Step 2: compute initial and final value of the loop counter. */
4655
4656 /* TEST_ADDR = SP + FIRST. */
4657 emit_set_insn (reg1,
4658 plus_constant (Pmode, stack_pointer_rtx, -first));
4659
4660 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4661 HOST_WIDE_INT adjustment = - (first + rounded_size);
4662 if (! aarch64_uimm12_shift (adjustment))
4663 {
4664 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4665 true, Pmode);
4666 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4667 }
4668 else
4669 emit_set_insn (reg2,
4670 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4671
4672 /* Step 3: the loop
4673
4674 do
4675 {
4676 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4677 probe at TEST_ADDR
4678 }
4679 while (TEST_ADDR != LAST_ADDR)
4680
4681 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4682 until it is equal to ROUNDED_SIZE. */
4683
4684 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4685
4686
4687 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4688 that SIZE is equal to ROUNDED_SIZE. */
4689
4690 if (size != rounded_size)
4691 {
4692 HOST_WIDE_INT rem = size - rounded_size;
4693
4694 if (rem > 256)
4695 {
4696 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4697
4698 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4699 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4700 }
4701 else
4702 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4703 }
4704 }
4705
4706 /* Make sure nothing is scheduled before we are done. */
4707 emit_insn (gen_blockage ());
4708 }
4709
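/* Illustrative sketch (editorial, not part of GCC): the probe offsets,
   measured from the incoming stack pointer, that the two non-looping
   cases above touch.  FIRST and SIZE are as in the function; up to five
   offsets are written to OFFSETS[] and the count is returned (0 means
   the looping case applies).  Assumes the default PROBE_INTERVAL of
   4096 bytes.  */

static unsigned int
sketch_probe_offsets (unsigned long first, unsigned long size,
		      unsigned long offsets[5])
{
  const unsigned long interval = 4096;
  unsigned int n = 0;
  if (size <= interval)
    offsets[n++] = first + size;	/* Single probe at FIRST + SIZE.  */
  else if (size <= 4 * interval)
    {
      unsigned long i;
      for (i = interval; i < size; i += interval)
	offsets[n++] = first + i;	/* FIRST + N * PROBE_INTERVAL.  */
      offsets[n++] = first + size;	/* Final probe at FIRST + SIZE.  */
    }
  return n;
}
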
4710 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4711 absolute addresses. */
4712
4713 const char *
4714 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4715 {
4716 static int labelno = 0;
4717 char loop_lab[32];
4718 rtx xops[2];
4719
4720 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4721
4722 /* Loop. */
4723 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4724
4725 HOST_WIDE_INT stack_clash_probe_interval
4726 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4727
4728 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4729 xops[0] = reg1;
4730 HOST_WIDE_INT interval;
4731 if (flag_stack_clash_protection)
4732 interval = stack_clash_probe_interval;
4733 else
4734 interval = PROBE_INTERVAL;
4735
4736 gcc_assert (aarch64_uimm12_shift (interval));
4737 xops[1] = GEN_INT (interval);
4738
4739 output_asm_insn ("sub\t%0, %0, %1", xops);
4740
4741 /* If doing stack clash protection then we probe up by the ABI specified
4742 amount. We do this because we're dropping full pages at a time in the
4743 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4744 if (flag_stack_clash_protection)
4745 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4746 else
4747 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4748
4749 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4750 by this amount for each iteration. */
4751 output_asm_insn ("str\txzr, [%0, %1]", xops);
4752
4753 /* Test if TEST_ADDR == LAST_ADDR. */
4754 xops[1] = reg2;
4755 output_asm_insn ("cmp\t%0, %1", xops);
4756
4757 /* Branch. */
4758 fputs ("\tb.ne\t", asm_out_file);
4759 assemble_name_raw (asm_out_file, loop_lab);
4760 fputc ('\n', asm_out_file);
4761
4762 return "";
4763 }
4764
4765 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4766 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4767 of GUARD_SIZE. When a probe is emitted it is done at most
4768 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4769 at most MIN_PROBE_THRESHOLD. By the end of this function
4770 BASE = BASE - ADJUSTMENT. */
4771
4772 const char *
4773 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4774 rtx min_probe_threshold, rtx guard_size)
4775 {
4776 /* This function is not allowed to use any instruction generation function
4777 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4778 so instead emit the code you want using output_asm_insn. */
4779 gcc_assert (flag_stack_clash_protection);
4780 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4781 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4782
4783 /* The minimum required allocation before the residual requires probing. */
4784 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4785
4786 /* Clamp the value down to the nearest value that can be used with a cmp. */
4787 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4788 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4789
4790 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4791 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4792
4793 static int labelno = 0;
4794 char loop_start_lab[32];
4795 char loop_end_lab[32];
4796 rtx xops[2];
4797
4798 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4799 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4800
4801 /* Emit loop start label. */
4802 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4803
4804 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4805 xops[0] = adjustment;
4806 xops[1] = probe_offset_value_rtx;
4807 output_asm_insn ("cmp\t%0, %1", xops);
4808
4809 /* Branch to end if not enough adjustment to probe. */
4810 fputs ("\tb.lt\t", asm_out_file);
4811 assemble_name_raw (asm_out_file, loop_end_lab);
4812 fputc ('\n', asm_out_file);
4813
4814 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4815 xops[0] = base;
4816 xops[1] = probe_offset_value_rtx;
4817 output_asm_insn ("sub\t%0, %0, %1", xops);
4818
4819 /* Probe at BASE. */
4820 xops[1] = const0_rtx;
4821 output_asm_insn ("str\txzr, [%0, %1]", xops);
4822
4823 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4824 xops[0] = adjustment;
4825 xops[1] = probe_offset_value_rtx;
4826 output_asm_insn ("sub\t%0, %0, %1", xops);
4827
4828 /* Branch to start if still more bytes to allocate. */
4829 fputs ("\tb\t", asm_out_file);
4830 assemble_name_raw (asm_out_file, loop_start_lab);
4831 fputc ('\n', asm_out_file);
4832
4833 /* No probe needed; leave the loop. */
4834 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4835
4836 /* BASE = BASE - ADJUSTMENT. */
4837 xops[0] = base;
4838 xops[1] = adjustment;
4839 output_asm_insn ("sub\t%0, %0, %1", xops);
4840 return "";
4841 }
4842
4843 /* Determine whether a frame chain needs to be generated. */
4844 static bool
4845 aarch64_needs_frame_chain (void)
4846 {
4847 /* Force a frame chain for EH returns so the return address is at FP+8. */
4848 if (frame_pointer_needed || crtl->calls_eh_return)
4849 return true;
4850
4851 /* A leaf function cannot have calls or write LR. */
4852 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4853
4854 /* Don't use a frame chain in leaf functions if leaf frame pointers
4855 are disabled. */
4856 if (flag_omit_leaf_frame_pointer && is_leaf)
4857 return false;
4858
4859 return aarch64_use_frame_pointer;
4860 }
4861
4862 /* Mark the registers that need to be saved by the callee and calculate
4863 the size of the callee-saved registers area and frame record (both FP
4864 and LR may be omitted). */
4865 static void
4866 aarch64_layout_frame (void)
4867 {
4868 HOST_WIDE_INT offset = 0;
4869 int regno, last_fp_reg = INVALID_REGNUM;
4870 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4871
4872 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4873
4874 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4875 the mid-end is doing. */
4876 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4877
4878 #define SLOT_NOT_REQUIRED (-2)
4879 #define SLOT_REQUIRED (-1)
4880
4881 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4882 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4883
4884 /* If this is a non-leaf simd function with calls we assume that
4885 at least one of those calls is to a non-simd function and thus
4886 we must save V8 to V23 in the prologue. */
4887
4888 if (simd_function && !crtl->is_leaf)
4889 {
4890 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4891 if (FP_SIMD_SAVED_REGNUM_P (regno))
4892 df_set_regs_ever_live (regno, true);
4893 }
4894
4895 /* First mark all the registers that really need to be saved... */
4896 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4897 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4898
4899 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4900 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4901
4902 /* ... that includes the eh data registers (if needed)... */
4903 if (crtl->calls_eh_return)
4904 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4905 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4906 = SLOT_REQUIRED;
4907
4908 /* ... and any callee saved register that dataflow says is live. */
4909 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4910 if (df_regs_ever_live_p (regno)
4911 && (regno == R30_REGNUM
4912 || !call_used_regs[regno]))
4913 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4914
4915 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4916 if (df_regs_ever_live_p (regno)
4917 && (!call_used_regs[regno]
4918 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4919 {
4920 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4921 last_fp_reg = regno;
4922 }
4923
4924 if (cfun->machine->frame.emit_frame_chain)
4925 {
4926 /* FP and LR are placed in the linkage record. */
4927 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4928 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4929 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4930 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4931 offset = 2 * UNITS_PER_WORD;
4932 }
4933
4934 /* With stack-clash, LR must be saved in non-leaf functions. */
4935 gcc_assert (crtl->is_leaf
4936 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4937 != SLOT_NOT_REQUIRED));
4938
4939 /* Now assign stack slots for them. */
4940 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4941 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4942 {
4943 cfun->machine->frame.reg_offset[regno] = offset;
4944 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4945 cfun->machine->frame.wb_candidate1 = regno;
4946 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4947 cfun->machine->frame.wb_candidate2 = regno;
4948 offset += UNITS_PER_WORD;
4949 }
4950
4951 HOST_WIDE_INT max_int_offset = offset;
4952 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4953 bool has_align_gap = offset != max_int_offset;
4954
4955 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4956 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4957 {
4958 /* If there is an alignment gap between integer and fp callee-saves,
4959 allocate the last fp register to it if possible. */
4960 if (regno == last_fp_reg
4961 && has_align_gap
4962 && !simd_function
4963 && (offset & 8) == 0)
4964 {
4965 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4966 break;
4967 }
4968
4969 cfun->machine->frame.reg_offset[regno] = offset;
4970 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4971 cfun->machine->frame.wb_candidate1 = regno;
4972 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4973 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4974 cfun->machine->frame.wb_candidate2 = regno;
4975 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4976 }
4977
4978 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4979
4980 cfun->machine->frame.saved_regs_size = offset;
4981
4982 HOST_WIDE_INT varargs_and_saved_regs_size
4983 = offset + cfun->machine->frame.saved_varargs_size;
4984
4985 cfun->machine->frame.hard_fp_offset
4986 = aligned_upper_bound (varargs_and_saved_regs_size
4987 + get_frame_size (),
4988 STACK_BOUNDARY / BITS_PER_UNIT);
4989
4990 /* Both these values are already aligned. */
4991 gcc_assert (multiple_p (crtl->outgoing_args_size,
4992 STACK_BOUNDARY / BITS_PER_UNIT));
4993 cfun->machine->frame.frame_size
4994 = (cfun->machine->frame.hard_fp_offset
4995 + crtl->outgoing_args_size);
4996
4997 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4998
4999 cfun->machine->frame.initial_adjust = 0;
5000 cfun->machine->frame.final_adjust = 0;
5001 cfun->machine->frame.callee_adjust = 0;
5002 cfun->machine->frame.callee_offset = 0;
5003
5004 HOST_WIDE_INT max_push_offset = 0;
5005 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5006 max_push_offset = 512;
5007 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5008 max_push_offset = 256;
5009
5010 HOST_WIDE_INT const_size, const_fp_offset;
5011 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5012 && const_size < max_push_offset
5013 && known_eq (crtl->outgoing_args_size, 0))
5014 {
5015 /* Simple, small frame with no outgoing arguments:
5016 stp reg1, reg2, [sp, -frame_size]!
5017 stp reg3, reg4, [sp, 16] */
5018 cfun->machine->frame.callee_adjust = const_size;
5019 }
5020 else if (known_lt (crtl->outgoing_args_size
5021 + cfun->machine->frame.saved_regs_size, 512)
5022 && !(cfun->calls_alloca
5023 && known_lt (cfun->machine->frame.hard_fp_offset,
5024 max_push_offset)))
5025 {
5026 /* Frame with small outgoing arguments:
5027 sub sp, sp, frame_size
5028 stp reg1, reg2, [sp, outgoing_args_size]
5029 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5030 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5031 cfun->machine->frame.callee_offset
5032 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5033 }
5034 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5035 && const_fp_offset < max_push_offset)
5036 {
5037 /* Frame with large outgoing arguments but a small local area:
5038 stp reg1, reg2, [sp, -hard_fp_offset]!
5039 stp reg3, reg4, [sp, 16]
5040 sub sp, sp, outgoing_args_size */
5041 cfun->machine->frame.callee_adjust = const_fp_offset;
5042 cfun->machine->frame.final_adjust
5043 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5044 }
5045 else
5046 {
5047 /* Frame with large local area and outgoing arguments using frame pointer:
5048 sub sp, sp, hard_fp_offset
5049 stp x29, x30, [sp, 0]
5050 add x29, sp, 0
5051 stp reg3, reg4, [sp, 16]
5052 sub sp, sp, outgoing_args_size */
5053 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5054 cfun->machine->frame.final_adjust
5055 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5056 }
5057
5058 cfun->machine->frame.laid_out = true;
5059 }
5060
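/* Illustrative sketch (editorial, not part of GCC): the four prologue
   shapes chosen at the end of aarch64_layout_frame above, using plain
   integers (so it models only constant-sized, non-SVE frames).  All
   values are byte counts; MAX_PUSH_OFFSET is 512 or 256 depending on
   whether a pair or a single register can be pushed with write-back.  */

struct sketch_frame_layout
{
  long initial_adjust;	/* First "sub sp, sp, #N".  */
  long callee_adjust;	/* Pre-decrement folded into the first stp.  */
  long callee_offset;	/* Offset of the frame record from sp.  */
  long final_adjust;	/* Final "sub sp, sp, #N" for outgoing args.  */
};

static struct sketch_frame_layout
sketch_choose_frame_layout (long frame_size, long outgoing_args,
			    long saved_regs, long hard_fp_offset,
			    long max_push_offset, int calls_alloca)
{
  struct sketch_frame_layout l = { 0, 0, 0, 0 };
  if (frame_size < max_push_offset && outgoing_args == 0)
    /* Simple, small frame: fold the whole allocation into the push.  */
    l.callee_adjust = frame_size;
  else if (outgoing_args + saved_regs < 512
	   && !(calls_alloca && hard_fp_offset < max_push_offset))
    {
      /* Small outgoing arguments: allocate everything up front.  */
      l.initial_adjust = frame_size;
      l.callee_offset = frame_size - hard_fp_offset;
    }
  else if (hard_fp_offset < max_push_offset)
    {
      /* Large outgoing arguments, small local area.  */
      l.callee_adjust = hard_fp_offset;
      l.final_adjust = frame_size - hard_fp_offset;
    }
  else
    {
      /* General case: large locals and outgoing arguments.  */
      l.initial_adjust = hard_fp_offset;
      l.final_adjust = frame_size - hard_fp_offset;
    }
  return l;
}
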
5061 /* Return true if the register REGNO is saved on entry to
5062 the current function. */
5063
5064 static bool
5065 aarch64_register_saved_on_entry (int regno)
5066 {
5067 return cfun->machine->frame.reg_offset[regno] >= 0;
5068 }
5069
5070 /* Return the first register, starting at REGNO and not exceeding LIMIT,
5071 that the callee needs to save. */
5072
5073 static unsigned
5074 aarch64_next_callee_save (unsigned regno, unsigned limit)
5075 {
5076 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5077 regno ++;
5078 return regno;
5079 }
5080
5081 /* Push the register number REGNO of mode MODE to the stack with write-back
5082 adjusting the stack by ADJUSTMENT. */
5083
5084 static void
5085 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5086 HOST_WIDE_INT adjustment)
5087 {
5088 rtx base_rtx = stack_pointer_rtx;
5089 rtx insn, reg, mem;
5090
5091 reg = gen_rtx_REG (mode, regno);
5092 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5093 plus_constant (Pmode, base_rtx, -adjustment));
5094 mem = gen_frame_mem (mode, mem);
5095
5096 insn = emit_move_insn (mem, reg);
5097 RTX_FRAME_RELATED_P (insn) = 1;
5098 }
5099
5100 /* Generate and return an instruction to store the pair of registers
5101 REG and REG2 of mode MODE to location BASE with write-back adjusting
5102 the stack location BASE by ADJUSTMENT. */
5103
5104 static rtx
5105 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5106 HOST_WIDE_INT adjustment)
5107 {
5108 switch (mode)
5109 {
5110 case E_DImode:
5111 return gen_storewb_pairdi_di (base, base, reg, reg2,
5112 GEN_INT (-adjustment),
5113 GEN_INT (UNITS_PER_WORD - adjustment));
5114 case E_DFmode:
5115 return gen_storewb_pairdf_di (base, base, reg, reg2,
5116 GEN_INT (-adjustment),
5117 GEN_INT (UNITS_PER_WORD - adjustment));
5118 case E_TFmode:
5119 return gen_storewb_pairtf_di (base, base, reg, reg2,
5120 GEN_INT (-adjustment),
5121 GEN_INT (UNITS_PER_VREG - adjustment));
5122 default:
5123 gcc_unreachable ();
5124 }
5125 }
5126
5127 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5128 stack pointer by ADJUSTMENT. */
5129
5130 static void
5131 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5132 {
5133 rtx_insn *insn;
5134 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5135
5136 if (regno2 == INVALID_REGNUM)
5137 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5138
5139 rtx reg1 = gen_rtx_REG (mode, regno1);
5140 rtx reg2 = gen_rtx_REG (mode, regno2);
5141
5142 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5143 reg2, adjustment));
5144 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5145 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5146 RTX_FRAME_RELATED_P (insn) = 1;
5147 }
5148
5149 /* Generate and return an instruction to load the pair of registers
5150 REG and REG2 of mode MODE from stack location BASE, with write-back
5151 adjusting BASE by ADJUSTMENT afterwards. */
5151
5152 static rtx
5153 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5154 HOST_WIDE_INT adjustment)
5155 {
5156 switch (mode)
5157 {
5158 case E_DImode:
5159 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5160 GEN_INT (UNITS_PER_WORD));
5161 case E_DFmode:
5162 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5163 GEN_INT (UNITS_PER_WORD));
5164 case E_TFmode:
5165 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5166 GEN_INT (UNITS_PER_VREG));
5167 default:
5168 gcc_unreachable ();
5169 }
5170 }
5171
5172 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5173 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5174 into CFI_OPS. */
5175
5176 static void
5177 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5178 rtx *cfi_ops)
5179 {
5180 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5181 rtx reg1 = gen_rtx_REG (mode, regno1);
5182
5183 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5184
5185 if (regno2 == INVALID_REGNUM)
5186 {
5187 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5188 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5189 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5190 }
5191 else
5192 {
5193 rtx reg2 = gen_rtx_REG (mode, regno2);
5194 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5195 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5196 reg2, adjustment));
5197 }
5198 }
5199
5200 /* Generate and return a store pair instruction of mode MODE to store
5201 register REG1 to MEM1 and register REG2 to MEM2. */
5202
5203 static rtx
5204 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5205 rtx reg2)
5206 {
5207 switch (mode)
5208 {
5209 case E_DImode:
5210 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5211
5212 case E_DFmode:
5213 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5214
5215 case E_TFmode:
5216 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5217
5218 default:
5219 gcc_unreachable ();
5220 }
5221 }
5222
5223 /* Generate and return a load pair instruction of mode MODE to load register
5224 REG1 from MEM1 and register REG2 from MEM2. */
5225
5226 static rtx
5227 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5228 rtx mem2)
5229 {
5230 switch (mode)
5231 {
5232 case E_DImode:
5233 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5234
5235 case E_DFmode:
5236 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5237
5238 case E_TFmode:
5239 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5240
5241 default:
5242 gcc_unreachable ();
5243 }
5244 }
5245
5246 /* Return TRUE if return address signing should be enabled for the current
5247 function, otherwise return FALSE. */
5248
5249 bool
5250 aarch64_return_address_signing_enabled (void)
5251 {
5252 /* This function should only be called after the frame is laid out. */
5253 gcc_assert (cfun->machine->frame.laid_out);
5254
5255 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5256 if its LR is pushed onto stack. */
5257 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5258 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5259 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5260 }
5261
5262 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5263 bool
5264 aarch64_bti_enabled (void)
5265 {
5266 return (aarch64_enable_bti == 1);
5267 }
5268
5269 /* Emit code to save the callee-saved registers of mode MODE from register
5270 number START to LIMIT to the stack at the location starting at offset
5271 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. */
5272
5273 static void
5274 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5275 unsigned start, unsigned limit, bool skip_wb)
5276 {
5277 rtx_insn *insn;
5278 unsigned regno;
5279 unsigned regno2;
5280
5281 for (regno = aarch64_next_callee_save (start, limit);
5282 regno <= limit;
5283 regno = aarch64_next_callee_save (regno + 1, limit))
5284 {
5285 rtx reg, mem;
5286 poly_int64 offset;
5287 int offset_diff;
5288
5289 if (skip_wb
5290 && (regno == cfun->machine->frame.wb_candidate1
5291 || regno == cfun->machine->frame.wb_candidate2))
5292 continue;
5293
5294 if (cfun->machine->reg_is_wrapped_separately[regno])
5295 continue;
5296
5297 reg = gen_rtx_REG (mode, regno);
5298 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5299 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5300 offset));
5301
5302 regno2 = aarch64_next_callee_save (regno + 1, limit);
5303 offset_diff = cfun->machine->frame.reg_offset[regno2]
5304 - cfun->machine->frame.reg_offset[regno];
5305
5306 if (regno2 <= limit
5307 && !cfun->machine->reg_is_wrapped_separately[regno2]
5308 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5309 {
5310 rtx reg2 = gen_rtx_REG (mode, regno2);
5311 rtx mem2;
5312
5313 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5314 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5315 offset));
5316 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5317 reg2));
5318
5319 /* The first part of a frame-related parallel insn is
5320 always assumed to be relevant to the frame
5321 calculations; subsequent parts are only
5322 frame-related if explicitly marked. */
5323 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5324 regno = regno2;
5325 }
5326 else
5327 insn = emit_move_insn (mem, reg);
5328
5329 RTX_FRAME_RELATED_P (insn) = 1;
5330 }
5331 }
5332
5333 /* Emit code to restore the callee-saved registers of mode MODE from register
5334 number START up to and including LIMIT. Restore from the stack offset
5335 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5336 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5337
5338 static void
5339 aarch64_restore_callee_saves (machine_mode mode,
5340 poly_int64 start_offset, unsigned start,
5341 unsigned limit, bool skip_wb, rtx *cfi_ops)
5342 {
5343 rtx base_rtx = stack_pointer_rtx;
5344 unsigned regno;
5345 unsigned regno2;
5346 poly_int64 offset;
5347
5348 for (regno = aarch64_next_callee_save (start, limit);
5349 regno <= limit;
5350 regno = aarch64_next_callee_save (regno + 1, limit))
5351 {
5352 if (cfun->machine->reg_is_wrapped_separately[regno])
5353 continue;
5354
5355 rtx reg, mem;
5356 int offset_diff;
5357
5358 if (skip_wb
5359 && (regno == cfun->machine->frame.wb_candidate1
5360 || regno == cfun->machine->frame.wb_candidate2))
5361 continue;
5362
5363 reg = gen_rtx_REG (mode, regno);
5364 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5365 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5366
5367 regno2 = aarch64_next_callee_save (regno + 1, limit);
5368 offset_diff = cfun->machine->frame.reg_offset[regno2]
5369 - cfun->machine->frame.reg_offset[regno];
5370
5371 if (regno2 <= limit
5372 && !cfun->machine->reg_is_wrapped_separately[regno2]
5373 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5374 {
5375 rtx reg2 = gen_rtx_REG (mode, regno2);
5376 rtx mem2;
5377
5378 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5379 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5380 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5381
5382 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5383 regno = regno2;
5384 }
5385 else
5386 emit_move_insn (reg, mem);
5387 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5388 }
5389 }
5390
5391 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5392 of MODE. */
5393
5394 static inline bool
5395 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5396 {
5397 HOST_WIDE_INT multiple;
5398 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5399 && IN_RANGE (multiple, -8, 7));
5400 }
5401
5402 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5403 of MODE. */
5404
5405 static inline bool
5406 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5407 {
5408 HOST_WIDE_INT multiple;
5409 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5410 && IN_RANGE (multiple, 0, 63));
5411 }
5412
5413 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5414 of MODE. */
5415
5416 bool
5417 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5418 {
5419 HOST_WIDE_INT multiple;
5420 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5421 && IN_RANGE (multiple, -64, 63));
5422 }
5423
5424 /* Return true if OFFSET is a signed 9-bit value. */
5425
5426 bool
5427 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5428 poly_int64 offset)
5429 {
5430 HOST_WIDE_INT const_offset;
5431 return (offset.is_constant (&const_offset)
5432 && IN_RANGE (const_offset, -256, 255));
5433 }
5434
5435 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5436 of MODE. */
5437
5438 static inline bool
5439 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5440 {
5441 HOST_WIDE_INT multiple;
5442 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5443 && IN_RANGE (multiple, -256, 255));
5444 }
5445
5446 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5447 of MODE. */
5448
5449 static inline bool
5450 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5451 {
5452 HOST_WIDE_INT multiple;
5453 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5454 && IN_RANGE (multiple, 0, 4095));
5455 }
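
/* A standalone sketch of the byte-offset ranges implied by the predicates
   above for a DImode (8-byte) access.  The helper name is hypothetical and
   the function is never called; it only records a few boundary values.  */

static ATTRIBUTE_UNUSED void
example_dimode_offset_ranges (void)
{
  /* 12-bit unsigned scaled: offsets 0, 8, ..., 4095 * 8 = 32760.  */
  gcc_checking_assert (offset_12bit_unsigned_scaled_p (DImode, 32760));
  gcc_checking_assert (!offset_12bit_unsigned_scaled_p (DImode, 32768));
  /* 7-bit signed scaled (LDP/STP): -64 * 8 = -512 up to 63 * 8 = 504.  */
  gcc_checking_assert (aarch64_offset_7bit_signed_scaled_p (DImode, -512));
  gcc_checking_assert (!aarch64_offset_7bit_signed_scaled_p (DImode, 512));
  /* 9-bit signed unscaled (LDUR/STUR): any byte offset in [-256, 255].  */
  gcc_checking_assert (aarch64_offset_9bit_signed_unscaled_p (DImode, -256));
}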
5456
5457 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5458
5459 static sbitmap
5460 aarch64_get_separate_components (void)
5461 {
5462 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5463 bitmap_clear (components);
5464
5465 /* The registers we need saved to the frame. */
5466 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5467 if (aarch64_register_saved_on_entry (regno))
5468 {
5469 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5470 if (!frame_pointer_needed)
5471 offset += cfun->machine->frame.frame_size
5472 - cfun->machine->frame.hard_fp_offset;
5473 /* Check that we can access the stack slot of the register with one
5474 direct load with no adjustments needed. */
5475 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5476 bitmap_set_bit (components, regno);
5477 }
5478
5479 /* Don't mess with the hard frame pointer. */
5480 if (frame_pointer_needed)
5481 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5482
5483 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5484 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5485 /* If registers have been chosen to be stored/restored with
5486 writeback, don't interfere with them to avoid having to output explicit
5487 stack adjustment instructions. */
5488 if (reg2 != INVALID_REGNUM)
5489 bitmap_clear_bit (components, reg2);
5490 if (reg1 != INVALID_REGNUM)
5491 bitmap_clear_bit (components, reg1);
5492
5493 bitmap_clear_bit (components, LR_REGNUM);
5494 bitmap_clear_bit (components, SP_REGNUM);
5495
5496 return components;
5497 }
5498
5499 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5500
5501 static sbitmap
5502 aarch64_components_for_bb (basic_block bb)
5503 {
5504 bitmap in = DF_LIVE_IN (bb);
5505 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5506 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5507 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5508
5509 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5510 bitmap_clear (components);
5511
5512 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5513 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5514 if ((!call_used_regs[regno]
5515 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5516 && (bitmap_bit_p (in, regno)
5517 || bitmap_bit_p (gen, regno)
5518 || bitmap_bit_p (kill, regno)))
5519 {
5520 unsigned regno2, offset, offset2;
5521 bitmap_set_bit (components, regno);
5522
5523 /* If there is a callee-save at an adjacent offset, add it as well,
5524 to increase the use of LDP/STP. */
5525 offset = cfun->machine->frame.reg_offset[regno];
5526 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5527
5528 if (regno2 <= LAST_SAVED_REGNUM)
5529 {
5530 offset2 = cfun->machine->frame.reg_offset[regno2];
5531 if ((offset & ~8) == (offset2 & ~8))
5532 bitmap_set_bit (components, regno2);
5533 }
5534 }
5535
5536 return components;
5537 }
5538
5539 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5540 Nothing to do for aarch64. */
5541
5542 static void
5543 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5544 {
5545 }
5546
5547 /* Return the next set bit in BMP from START onwards. Return the total number
5548 of bits in BMP if no set bit is found at or after START. */
5549
5550 static unsigned int
5551 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5552 {
5553 unsigned int nbits = SBITMAP_SIZE (bmp);
5554 if (start == nbits)
5555 return start;
5556
5557 gcc_assert (start < nbits);
5558 for (unsigned int i = start; i < nbits; i++)
5559 if (bitmap_bit_p (bmp, i))
5560 return i;
5561
5562 return nbits;
5563 }
5564
5565 /* Do the work for aarch64_emit_prologue_components and
5566 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5567 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5568 for these components or the epilogue sequence. That is, it determines
5569 whether we should emit stores or loads and what kind of CFA notes to attach
5570 to the insns. Otherwise the logic for the two sequences is very
5571 similar. */
5572
5573 static void
5574 aarch64_process_components (sbitmap components, bool prologue_p)
5575 {
5576 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5577 ? HARD_FRAME_POINTER_REGNUM
5578 : STACK_POINTER_REGNUM);
5579
5580 unsigned last_regno = SBITMAP_SIZE (components);
5581 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5582 rtx_insn *insn = NULL;
5583
5584 while (regno != last_regno)
5585 {
5586 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5587 so DFmode for the vector registers is enough. For simd functions
5588 we want to save the low 128 bits. */
5589 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5590
5591 rtx reg = gen_rtx_REG (mode, regno);
5592 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5593 if (!frame_pointer_needed)
5594 offset += cfun->machine->frame.frame_size
5595 - cfun->machine->frame.hard_fp_offset;
5596 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5597 rtx mem = gen_frame_mem (mode, addr);
5598
5599 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5600 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5601 /* No more registers to handle after REGNO.
5602 Emit a single save/restore and exit. */
5603 if (regno2 == last_regno)
5604 {
5605 insn = emit_insn (set);
5606 RTX_FRAME_RELATED_P (insn) = 1;
5607 if (prologue_p)
5608 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5609 else
5610 add_reg_note (insn, REG_CFA_RESTORE, reg);
5611 break;
5612 }
5613
5614 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5615 /* The next register is not of the same class or its offset is not
5616 mergeable with the current one into a pair. */
5617 if (!satisfies_constraint_Ump (mem)
5618 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5619 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5620 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5621 GET_MODE_SIZE (mode)))
5622 {
5623 insn = emit_insn (set);
5624 RTX_FRAME_RELATED_P (insn) = 1;
5625 if (prologue_p)
5626 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5627 else
5628 add_reg_note (insn, REG_CFA_RESTORE, reg);
5629
5630 regno = regno2;
5631 continue;
5632 }
5633
5634 /* REGNO2 can be saved/restored in a pair with REGNO. */
5635 rtx reg2 = gen_rtx_REG (mode, regno2);
5636 if (!frame_pointer_needed)
5637 offset2 += cfun->machine->frame.frame_size
5638 - cfun->machine->frame.hard_fp_offset;
5639 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5640 rtx mem2 = gen_frame_mem (mode, addr2);
5641 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5642 : gen_rtx_SET (reg2, mem2);
5643
5644 if (prologue_p)
5645 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5646 else
5647 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5648
5649 RTX_FRAME_RELATED_P (insn) = 1;
5650 if (prologue_p)
5651 {
5652 add_reg_note (insn, REG_CFA_OFFSET, set);
5653 add_reg_note (insn, REG_CFA_OFFSET, set2);
5654 }
5655 else
5656 {
5657 add_reg_note (insn, REG_CFA_RESTORE, reg);
5658 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5659 }
5660
5661 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5662 }
5663 }
5664
5665 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5666
5667 static void
5668 aarch64_emit_prologue_components (sbitmap components)
5669 {
5670 aarch64_process_components (components, true);
5671 }
5672
5673 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5674
5675 static void
5676 aarch64_emit_epilogue_components (sbitmap components)
5677 {
5678 aarch64_process_components (components, false);
5679 }
5680
5681 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5682
5683 static void
5684 aarch64_set_handled_components (sbitmap components)
5685 {
5686 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5687 if (bitmap_bit_p (components, regno))
5688 cfun->machine->reg_is_wrapped_separately[regno] = true;
5689 }
5690
5691 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5692 determine the probe offset for alloca. */
5693
5694 static HOST_WIDE_INT
5695 aarch64_stack_clash_protection_alloca_probe_range (void)
5696 {
5697 return STACK_CLASH_CALLER_GUARD;
5698 }
5699
5700
5701 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5702 registers. If POLY_SIZE is not large enough to require a probe, this
5703 function only adjusts the stack. When allocating the stack space,
5704 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5705 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5706 arguments area. If we are, we ensure that any allocation larger than the ABI
5707 defined buffer requires a probe, so that the invariant of having a 1KB buffer
5708 is maintained.
5709
5710 We emit barriers after each stack adjustment to prevent optimizations from
5711 breaking the invariant that we never drop the stack by more than a page.
5712 This invariant makes it easier to handle asynchronous events correctly:
5713 if we allowed the stack to be dropped by more than a page before the
5714 corresponding probes, and a signal arrived somewhere in between, the signal
5715 handler would not know the state of the stack and could make no assumptions
5716 about which pages had been probed. */
5717
5718 static void
5719 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5720 poly_int64 poly_size,
5721 bool frame_related_p,
5722 bool final_adjustment_p)
5723 {
5724 HOST_WIDE_INT guard_size
5725 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5726 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5727 /* When doing the final adjustment for the outgoing argument size we can't
5728 assume that LR was saved at position 0. So subtract its offset from the
5729 ABI safe buffer so that we don't accidentally allow an adjustment that
5730 would result in an allocation larger than the ABI buffer without
5731 probing. */
5732 HOST_WIDE_INT min_probe_threshold
5733 = final_adjustment_p
5734 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5735 : guard_size - guard_used_by_caller;
5736
5737 poly_int64 frame_size = cfun->machine->frame.frame_size;
5738
5739 /* We should always have a positive probe threshold. */
5740 gcc_assert (min_probe_threshold > 0);
5741
5742 if (flag_stack_clash_protection && !final_adjustment_p)
5743 {
5744 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5745 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5746
5747 if (known_eq (frame_size, 0))
5748 {
5749 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5750 }
5751 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5752 && known_lt (final_adjust, guard_used_by_caller))
5753 {
5754 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5755 }
5756 }
5757
5758 /* If SIZE is not large enough to require probing, just adjust the stack and
5759 exit. */
5760 if (known_lt (poly_size, min_probe_threshold)
5761 || !flag_stack_clash_protection)
5762 {
5763 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5764 return;
5765 }
5766
5767 HOST_WIDE_INT size;
5768 /* Handle the SVE non-constant case first. */
5769 if (!poly_size.is_constant (&size))
5770 {
5771 if (dump_file)
5772 {
5773 fprintf (dump_file, "Stack clash SVE prologue: ");
5774 print_dec (poly_size, dump_file);
5775 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5776 }
5777
5778 /* First calculate the amount of bytes we're actually spilling. */
5779 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5780 poly_size, temp1, temp2, false, true);
5781
5782 rtx_insn *insn = get_last_insn ();
5783
5784 if (frame_related_p)
5785 {
5786 /* This is done to provide unwinding information for the stack
5787 adjustments we're about to do, however to prevent the optimizers
5788 from removing the R11 move and leaving the CFA note (which would be
5789 very wrong) we tie the old and new stack pointer together.
5790 The tie will expand to nothing but the optimizers will not touch
5791 the instruction. */
5792 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5793 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5794 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5795
5796 /* We want the CFA independent of the stack pointer for the
5797 duration of the loop. */
5798 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5799 RTX_FRAME_RELATED_P (insn) = 1;
5800 }
5801
5802 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5803 rtx guard_const = gen_int_mode (guard_size, Pmode);
5804
5805 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5806 stack_pointer_rtx, temp1,
5807 probe_const, guard_const));
5808
5809 /* Now reset the CFA register if needed. */
5810 if (frame_related_p)
5811 {
5812 add_reg_note (insn, REG_CFA_DEF_CFA,
5813 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5814 gen_int_mode (poly_size, Pmode)));
5815 RTX_FRAME_RELATED_P (insn) = 1;
5816 }
5817
5818 return;
5819 }
5820
5821 if (dump_file)
5822 fprintf (dump_file,
5823 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5824 " bytes, probing will be required.\n", size);
5825
5826 /* Round size down to a multiple of guard_size, and calculate the
5827 residual as the difference between the original size and the rounded
5828 size. */
5829 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5830 HOST_WIDE_INT residual = size - rounded_size;
5831
5832 /* We can handle a small number of allocations/probes inline. Otherwise
5833 punt to a loop. */
5834 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5835 {
5836 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5837 {
5838 aarch64_sub_sp (NULL, temp2, guard_size, true);
5839 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5840 guard_used_by_caller));
5841 emit_insn (gen_blockage ());
5842 }
5843 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5844 }
5845 else
5846 {
5847 /* Compute the ending address. */
5848 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5849 temp1, NULL, false, true);
5850 rtx_insn *insn = get_last_insn ();
5851
5852 /* For the initial allocation, we don't have a frame pointer
5853 set up, so we always need CFI notes. If we're doing the
5854 final allocation, then we may have a frame pointer, in which
5855 case it is the CFA, otherwise we need CFI notes.
5856
5857 We can determine which allocation we are doing by looking at
5858 the value of FRAME_RELATED_P since the final allocations are not
5859 frame related. */
5860 if (frame_related_p)
5861 {
5862 /* We want the CFA independent of the stack pointer for the
5863 duration of the loop. */
5864 add_reg_note (insn, REG_CFA_DEF_CFA,
5865 plus_constant (Pmode, temp1, rounded_size));
5866 RTX_FRAME_RELATED_P (insn) = 1;
5867 }
5868
5869 /* This allocates and probes the stack. Note that this re-uses some of
5870 the existing Ada stack protection code. However, we are guaranteed not
5871 to enter the non-loop or residual branches of that code.
5872
5873 The non-loop part won't be entered because if our allocation amount
5874 doesn't require a loop, the case above would handle it.
5875
5876 The residual amount won't be entered because TEMP1 is a multiple of
5877 the allocation size. The residual will always be 0. As such, the only
5878 part we are actually using from that code is the loop setup. The
5879 actual probing is done in aarch64_output_probe_stack_range. */
5880 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5881 stack_pointer_rtx, temp1));
5882
5883 /* Now reset the CFA register if needed. */
5884 if (frame_related_p)
5885 {
5886 add_reg_note (insn, REG_CFA_DEF_CFA,
5887 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5888 RTX_FRAME_RELATED_P (insn) = 1;
5889 }
5890
5891 emit_insn (gen_blockage ());
5892 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5893 }
5894
5895 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5896 be probed. This maintains the requirement that each page is probed at
5897 least once. For initial probing we probe only if the allocation is
5898 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5899 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5900 GUARD_SIZE. This means that for any allocation that is large enough to
5901 trigger a probe here, we'll have at least one, and if an allocation is not
5902 large enough for this code to emit anything for it, the page would have been
5903 probed by the saving of FP/LR, either by this function or any callees. If
5904 we don't have any callees then we won't have more stack adjustments and so
5905 are still safe. */
5906 if (residual)
5907 {
5908 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5909 /* If we're doing final adjustments, and we've done any full page
5910 allocations then any residual needs to be probed. */
5911 if (final_adjustment_p && rounded_size != 0)
5912 min_probe_threshold = 0;
5913 /* If doing a small final adjustment, we always probe at offset 0.
5914 This is done to avoid issues when LR is not at position 0 or when
5915 the final adjustment is smaller than the probing offset. */
5916 else if (final_adjustment_p && rounded_size == 0)
5917 residual_probe_offset = 0;
5918
5919 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5920 if (residual >= min_probe_threshold)
5921 {
5922 if (dump_file)
5923 fprintf (dump_file,
5924 "Stack clash AArch64 prologue residuals: "
5925 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5926 "\n", residual);
5927
5928 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5929 residual_probe_offset));
5930 emit_insn (gen_blockage ());
5931 }
5932 }
5933 }
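
/* A worked example of the splitting arithmetic above, under the default
   assumptions of a 64KB guard and the 1KB caller buffer (the helper name and
   the 260KB figure are purely illustrative).  260KB splits into a 256KB part
   that is probed page by page and a 4KB residual; the residual is below the
   63KB threshold used for the initial adjustment and so relies on the
   subsequent FP/LR saves acting as implicit probes.  */

static ATTRIBUTE_UNUSED void
example_stack_clash_split (void)
{
  HOST_WIDE_INT guard_size = 64 * 1024;
  HOST_WIDE_INT size = 260 * 1024;
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;
  gcc_checking_assert (rounded_size == 256 * 1024);
  gcc_checking_assert (residual == 4 * 1024);
}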
5934
5935 /* Return 1 if the register is used by the epilogue. We need to say the
5936 return register is used, but only after epilogue generation is complete.
5937 Note that in the case of sibcalls, the values "used by the epilogue" are
5938 considered live at the start of the called function.
5939
5940 For SIMD functions we need to return 1 for FP registers that are saved and
5941 restored by a function but are not zero in call_used_regs. If we do not do
5942 this, optimizations may remove the restore of the register. */
5943
5944 int
5945 aarch64_epilogue_uses (int regno)
5946 {
5947 if (epilogue_completed)
5948 {
5949 if (regno == LR_REGNUM)
5950 return 1;
5951 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5952 return 1;
5953 }
5954 return 0;
5955 }
5956
5957 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5958 is saved at BASE + OFFSET. */
5959
5960 static void
5961 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5962 rtx base, poly_int64 offset)
5963 {
5964 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5965 add_reg_note (insn, REG_CFA_EXPRESSION,
5966 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5967 }
5968
5969 /* AArch64 stack frames generated by this compiler look like:
5970
5971 +-------------------------------+
5972 | |
5973 | incoming stack arguments |
5974 | |
5975 +-------------------------------+
5976 | | <-- incoming stack pointer (aligned)
5977 | callee-allocated save area |
5978 | for register varargs |
5979 | |
5980 +-------------------------------+
5981 | local variables | <-- frame_pointer_rtx
5982 | |
5983 +-------------------------------+
5984 | padding | \
5985 +-------------------------------+ |
5986 | callee-saved registers | | frame.saved_regs_size
5987 +-------------------------------+ |
5988 | LR' | |
5989 +-------------------------------+ |
5990 | FP' | / <- hard_frame_pointer_rtx (aligned)
5991 +-------------------------------+
5992 | dynamic allocation |
5993 +-------------------------------+
5994 | padding |
5995 +-------------------------------+
5996 | outgoing stack arguments | <-- arg_pointer
5997 | |
5998 +-------------------------------+
5999 | | <-- stack_pointer_rtx (aligned)
6000
6001 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6002 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6003 unchanged.
6004
6005 By default for stack-clash we assume the guard is at least 64KB, but this
6006 value is configurable to either 4KB or 64KB. We also force the guard size to
6007 be the same as the probing interval and both values are kept in sync.
6008
6009 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6010 on the guard size) of stack space without probing.
6011
6012 When probing is needed, we emit a probe at the start of the prologue
6013 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6014
6015 We have to track how much space has been allocated and the only stores
6016 to the stack we track as implicit probes are the FP/LR stores.
6017
6018 For outgoing arguments we probe if the size is larger than 1KB, such that
6019 the ABI specified buffer is maintained for the next callee.
6020
6021 The following registers are reserved during frame layout and should not be
6022 used for any other purpose:
6023
6024 - r11: Used by stack clash protection when SVE is enabled.
6025 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6026 - r14 and r15: Used for speculation tracking.
6027 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6028 - r30(LR), r29(FP): Used by standard frame layout.
6029
6030 These registers must be avoided in frame layout related code unless the
6031 explicit intention is to interact with one of the features listed above. */
6032
6033 /* Generate the prologue instructions for entry into a function.
6034 Establish the stack frame by decreasing the stack pointer with a
6035 properly calculated size and, if necessary, create a frame record
6036 filled with the values of LR and previous frame pointer. The
6037 current FP is also set up if it is in use. */
6038
6039 void
6040 aarch64_expand_prologue (void)
6041 {
6042 poly_int64 frame_size = cfun->machine->frame.frame_size;
6043 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6044 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6045 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6046 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6047 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6048 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6049 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6050 rtx_insn *insn;
6051
6052 /* Sign return address for functions. */
6053 if (aarch64_return_address_signing_enabled ())
6054 {
6055 switch (aarch64_ra_sign_key)
6056 {
6057 case AARCH64_KEY_A:
6058 insn = emit_insn (gen_paciasp ());
6059 break;
6060 case AARCH64_KEY_B:
6061 insn = emit_insn (gen_pacibsp ());
6062 break;
6063 default:
6064 gcc_unreachable ();
6065 }
6066 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6067 RTX_FRAME_RELATED_P (insn) = 1;
6068 }
6069
6070 if (flag_stack_usage_info)
6071 current_function_static_stack_size = constant_lower_bound (frame_size);
6072
6073 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6074 {
6075 if (crtl->is_leaf && !cfun->calls_alloca)
6076 {
6077 if (maybe_gt (frame_size, PROBE_INTERVAL)
6078 && maybe_gt (frame_size, get_stack_check_protect ()))
6079 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6080 (frame_size
6081 - get_stack_check_protect ()));
6082 }
6083 else if (maybe_gt (frame_size, 0))
6084 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6085 }
6086
6087 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6088 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6089
6090 /* In theory we should never have both an initial adjustment
6091 and a callee save adjustment. Verify that is the case since the
6092 code below does not handle it for -fstack-clash-protection. */
6093 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6094
6095 /* Will only probe if the initial adjustment is larger than the guard
6096 less the amount of the guard reserved for use by the caller's
6097 outgoing args. */
6098 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6099 true, false);
6100
6101 if (callee_adjust != 0)
6102 aarch64_push_regs (reg1, reg2, callee_adjust);
6103
6104 if (emit_frame_chain)
6105 {
6106 poly_int64 reg_offset = callee_adjust;
6107 if (callee_adjust == 0)
6108 {
6109 reg1 = R29_REGNUM;
6110 reg2 = R30_REGNUM;
6111 reg_offset = callee_offset;
6112 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6113 }
6114 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6115 stack_pointer_rtx, callee_offset,
6116 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6117 if (frame_pointer_needed && !frame_size.is_constant ())
6118 {
6119 /* Variable-sized frames need to describe the save slot
6120 address using DW_CFA_expression rather than DW_CFA_offset.
6121 This means that, without taking further action, the
6122 locations of the registers that we've already saved would
6123 remain based on the stack pointer even after we redefine
6124 the CFA based on the frame pointer. We therefore need new
6125 DW_CFA_expressions to re-express the save slots with addresses
6126 based on the frame pointer. */
6127 rtx_insn *insn = get_last_insn ();
6128 gcc_assert (RTX_FRAME_RELATED_P (insn));
6129
6130 /* Add an explicit CFA definition if this was previously
6131 implicit. */
6132 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6133 {
6134 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6135 callee_offset);
6136 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6137 gen_rtx_SET (hard_frame_pointer_rtx, src));
6138 }
6139
6140 /* Change the save slot expressions for the registers that
6141 we've already saved. */
6142 reg_offset -= callee_offset;
6143 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6144 reg_offset + UNITS_PER_WORD);
6145 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6146 reg_offset);
6147 }
6148 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6149 }
6150
6151 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6152 callee_adjust != 0 || emit_frame_chain);
6153 if (aarch64_simd_decl_p (cfun->decl))
6154 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6155 callee_adjust != 0 || emit_frame_chain);
6156 else
6157 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6158 callee_adjust != 0 || emit_frame_chain);
6159
6160 /* We may need to probe the final adjustment if it is larger than the guard
6161 that is assumed by the callee. */
6162 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6163 !frame_pointer_needed, true);
6164 }
6165
6166 /* Return TRUE if we can use a simple_return insn.
6167
6168 This function checks whether the callee-saved stack is empty, which
6169 means no restore actions are needed. The pro_and_epilogue pass will use
6170 this to check whether the shrink-wrapping optimization is feasible. */
6171
6172 bool
6173 aarch64_use_return_insn_p (void)
6174 {
6175 if (!reload_completed)
6176 return false;
6177
6178 if (crtl->profile)
6179 return false;
6180
6181 return known_eq (cfun->machine->frame.frame_size, 0);
6182 }
6183
6184 /* Return false for non-leaf SIMD functions in order to avoid
6185 shrink-wrapping them. Doing this will lose the necessary
6186 save/restore of FP registers. */
6187
6188 bool
6189 aarch64_use_simple_return_insn_p (void)
6190 {
6191 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6192 return false;
6193
6194 return true;
6195 }
6196
6197 /* Generate the epilogue instructions for returning from a function.
6198 This is almost exactly the reverse of the prologue sequence, except
6199 that we need to insert barriers to avoid scheduling loads that read
6200 from a deallocated stack, and we optimize the unwind records by
6201 emitting them all together if possible. */
6202 void
6203 aarch64_expand_epilogue (bool for_sibcall)
6204 {
6205 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6206 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6207 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6208 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6209 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6210 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6211 rtx cfi_ops = NULL;
6212 rtx_insn *insn;
6213 /* A stack clash protection prologue may not have left EP0_REGNUM or
6214 EP1_REGNUM in a usable state. The same is true for allocations
6215 with an SVE component, since we then need both temporary registers
6216 for each allocation. For stack clash we are in a usable state if
6217 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6218 HOST_WIDE_INT guard_size
6219 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6220 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6221
6222 /* We can re-use the registers when the allocation amount is smaller than
6223 guard_size - guard_used_by_caller because we won't be doing any probes
6224 then. In such situations the register should remain live with the correct
6225 value. */
6226 bool can_inherit_p = (initial_adjust.is_constant ()
6227 && final_adjust.is_constant ())
6228 && (!flag_stack_clash_protection
6229 || known_lt (initial_adjust,
6230 guard_size - guard_used_by_caller));
6231
6232 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6233 bool need_barrier_p
6234 = maybe_ne (get_frame_size ()
6235 + cfun->machine->frame.saved_varargs_size, 0);
6236
6237 /* Emit a barrier to prevent loads from a deallocated stack. */
6238 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6239 || cfun->calls_alloca
6240 || crtl->calls_eh_return)
6241 {
6242 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6243 need_barrier_p = false;
6244 }
6245
6246 /* Restore the stack pointer from the frame pointer if it may not
6247 be the same as the stack pointer. */
6248 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6249 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6250 if (frame_pointer_needed
6251 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6252 /* If writeback is used when restoring callee-saves, the CFA
6253 is restored on the instruction doing the writeback. */
6254 aarch64_add_offset (Pmode, stack_pointer_rtx,
6255 hard_frame_pointer_rtx, -callee_offset,
6256 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6257 else
6258 /* The case where we need to re-use the register here is very rare, so
6259 avoid the complicated condition and just always emit a move if the
6260 immediate doesn't fit. */
6261 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6262
6263 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6264 callee_adjust != 0, &cfi_ops);
6265 if (aarch64_simd_decl_p (cfun->decl))
6266 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6267 callee_adjust != 0, &cfi_ops);
6268 else
6269 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6270 callee_adjust != 0, &cfi_ops);
6271
6272 if (need_barrier_p)
6273 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6274
6275 if (callee_adjust != 0)
6276 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6277
6278 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6279 {
6280 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6281 insn = get_last_insn ();
6282 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6283 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6284 RTX_FRAME_RELATED_P (insn) = 1;
6285 cfi_ops = NULL;
6286 }
6287
6288 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6289 restrict the emit_move optimization to leaf functions. */
6290 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6291 (!can_inherit_p || !crtl->is_leaf
6292 || df_regs_ever_live_p (EP0_REGNUM)));
6293
6294 if (cfi_ops)
6295 {
6296 /* Emit delayed restores and reset the CFA to be SP. */
6297 insn = get_last_insn ();
6298 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6299 REG_NOTES (insn) = cfi_ops;
6300 RTX_FRAME_RELATED_P (insn) = 1;
6301 }
6302
6303 /* We prefer to emit the combined return/authenticate instruction RETAA,
6304 however there are three cases in which we must instead emit an explicit
6305 authentication instruction.
6306
6307 1) Sibcalls don't return in a normal way, so if we're about to call one
6308 we must authenticate.
6309
6310 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6311 generating code for !TARGET_ARMV8_3 we can't use it and must
6312 explicitly authenticate.
6313
6314 3) On an eh_return path we make extra stack adjustments to update the
6315 canonical frame address to be the exception handler's CFA. We want
6316 to authenticate using the CFA of the function which calls eh_return.
6317 */
6318 if (aarch64_return_address_signing_enabled ()
6319 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6320 {
6321 switch (aarch64_ra_sign_key)
6322 {
6323 case AARCH64_KEY_A:
6324 insn = emit_insn (gen_autiasp ());
6325 break;
6326 case AARCH64_KEY_B:
6327 insn = emit_insn (gen_autibsp ());
6328 break;
6329 default:
6330 gcc_unreachable ();
6331 }
6332 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6333 RTX_FRAME_RELATED_P (insn) = 1;
6334 }
6335
6336 /* Stack adjustment for exception handler. */
6337 if (crtl->calls_eh_return && !for_sibcall)
6338 {
6339 /* We need to unwind the stack by the offset computed by
6340 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6341 to be SP; letting the CFA move during this adjustment
6342 is just as correct as retaining the CFA from the body
6343 of the function. Therefore, do nothing special. */
6344 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6345 }
6346
6347 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6348 if (!for_sibcall)
6349 emit_jump_insn (ret_rtx);
6350 }
6351
6352 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6353 normally or return to a previous frame after unwinding.
6354
6355 An EH return uses a single shared return sequence. The epilogue is
6356 exactly like a normal epilogue except that it has an extra input
6357 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6358 that must be applied after the frame has been destroyed. An extra label
6359 is inserted before the epilogue which initializes this register to zero,
6360 and this is the entry point for a normal return.
6361
6362 An actual EH return updates the return address, initializes the stack
6363 adjustment and jumps directly into the epilogue (bypassing the zeroing
6364 of the adjustment). Since the return address is typically saved on the
6365 stack when a function makes a call, the saved LR must be updated outside
6366 the epilogue.
6367
6368 This poses problems as the store is generated well before the epilogue,
6369 so the offset of LR is not known yet. Also optimizations will remove the
6370 store as it appears dead, even after the epilogue is generated (as the
6371 base or offset for loading LR is different in many cases).
6372
6373 To avoid these problems this implementation forces the frame pointer
6374 in eh_return functions so that the location of LR is fixed and known early.
6375 It also marks the store volatile, so no optimization is permitted to
6376 remove the store. */
6377 rtx
6378 aarch64_eh_return_handler_rtx (void)
6379 {
6380 rtx tmp = gen_frame_mem (Pmode,
6381 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6382
6383 /* Mark the store volatile, so no optimization is permitted to remove it. */
6384 MEM_VOLATILE_P (tmp) = true;
6385 return tmp;
6386 }
6387
6388 /* Output code to add DELTA to the first argument, and then jump
6389 to FUNCTION. Used for C++ multiple inheritance. */
6390 static void
6391 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6392 HOST_WIDE_INT delta,
6393 HOST_WIDE_INT vcall_offset,
6394 tree function)
6395 {
6396 /* The this pointer is always in x0. Note that this differs from
6397 Arm where the this pointer may be bumped to r1 if r0 is required
6398 to return a pointer to an aggregate. On AArch64 a result value
6399 pointer will be in x8. */
6400 int this_regno = R0_REGNUM;
6401 rtx this_rtx, temp0, temp1, addr, funexp;
6402 rtx_insn *insn;
6403 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6404
6405 if (aarch64_bti_enabled ())
6406 emit_insn (gen_bti_c ());
6407
6408 reload_completed = 1;
6409 emit_note (NOTE_INSN_PROLOGUE_END);
6410
6411 this_rtx = gen_rtx_REG (Pmode, this_regno);
6412 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6413 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6414
6415 if (vcall_offset == 0)
6416 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6417 else
6418 {
6419 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6420
6421 addr = this_rtx;
6422 if (delta != 0)
6423 {
6424 if (delta >= -256 && delta < 256)
6425 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6426 plus_constant (Pmode, this_rtx, delta));
6427 else
6428 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6429 temp1, temp0, false);
6430 }
6431
6432 if (Pmode == ptr_mode)
6433 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6434 else
6435 aarch64_emit_move (temp0,
6436 gen_rtx_ZERO_EXTEND (Pmode,
6437 gen_rtx_MEM (ptr_mode, addr)));
6438
6439 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6440 addr = plus_constant (Pmode, temp0, vcall_offset);
6441 else
6442 {
6443 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6444 Pmode);
6445 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6446 }
6447
6448 if (Pmode == ptr_mode)
6449 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6450 else
6451 aarch64_emit_move (temp1,
6452 gen_rtx_SIGN_EXTEND (Pmode,
6453 gen_rtx_MEM (ptr_mode, addr)));
6454
6455 emit_insn (gen_add2_insn (this_rtx, temp1));
6456 }
6457
6458 /* Generate a tail call to the target function. */
6459 if (!TREE_USED (function))
6460 {
6461 assemble_external (function);
6462 TREE_USED (function) = 1;
6463 }
6464 funexp = XEXP (DECL_RTL (function), 0);
6465 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6466 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6467 SIBLING_CALL_P (insn) = 1;
6468
6469 insn = get_insns ();
6470 shorten_branches (insn);
6471
6472 assemble_start_function (thunk, fnname);
6473 final_start_function (insn, file, 1);
6474 final (insn, file, 1);
6475 final_end_function ();
6476 assemble_end_function (thunk, fnname);
6477
6478 /* Stop pretending to be a post-reload pass. */
6479 reload_completed = 0;
6480 }
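
/* For reference, a C model of the "this"-pointer adjustment that the RTL
   above performs (the helper name is hypothetical and the real thunk is
   emitted as RTL, not compiled from C; ILP32 extension handling is omitted):
   add DELTA, optionally add the adjustment found in the vtable at
   VCALL_OFFSET, then tail-call FUNCTION.  */

static ATTRIBUTE_UNUSED void *
example_thunk_this_adjustment (void *this_ptr, HOST_WIDE_INT delta,
			       HOST_WIDE_INT vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* Load the vtable pointer from the (already delta-adjusted) object,
	 then the extra adjustment stored in the vtable.  */
      char *vtable = *(char **) p;
      HOST_WIDE_INT extra = *(HOST_WIDE_INT *) (vtable + vcall_offset);
      p += extra;
    }
  return p;
}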
6481
6482 static bool
6483 aarch64_tls_referenced_p (rtx x)
6484 {
6485 if (!TARGET_HAVE_TLS)
6486 return false;
6487 subrtx_iterator::array_type array;
6488 FOR_EACH_SUBRTX (iter, array, x, ALL)
6489 {
6490 const_rtx x = *iter;
6491 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6492 return true;
6493 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6494 TLS offsets, not real symbol references. */
6495 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6496 iter.skip_subrtxes ();
6497 }
6498 return false;
6499 }
6500
6501
6502 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6503 a left shift of 0 or 12 bits. */
6504 bool
6505 aarch64_uimm12_shift (HOST_WIDE_INT val)
6506 {
6507 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6508 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6509 );
6510 }
6511
6512 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6513 that can be created with a left shift of 0 or 12. */
6514 static HOST_WIDE_INT
6515 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6516 {
6517 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6518 handle correctly. */
6519 gcc_assert ((val & 0xffffff) == val);
6520
6521 if (((val & 0xfff) << 0) == val)
6522 return val;
6523
6524 return val & (0xfff << 12);
6525 }
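
/* Illustrative values for the two helpers above (the function name is
   hypothetical and the routine is never called): 0xabc and 0xabc000 are
   directly encodable ADD/SUB immediates, while 0xabcdef is not and clamps
   to 0xabc000, the part a single ADD can handle.  */

static ATTRIBUTE_UNUSED void
example_uimm12_values (void)
{
  gcc_checking_assert (aarch64_uimm12_shift (0xabc));
  gcc_checking_assert (aarch64_uimm12_shift (0xabc000));
  gcc_checking_assert (!aarch64_uimm12_shift (0xabcdef));
  gcc_checking_assert (aarch64_clamp_to_uimm12_shift (0xabcdef) == 0xabc000);
}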
6526
6527 /* Return true if val is an immediate that can be loaded into a
6528 register by a MOVZ instruction. */
6529 static bool
6530 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6531 {
6532 if (GET_MODE_SIZE (mode) > 4)
6533 {
6534 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6535 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6536 return 1;
6537 }
6538 else
6539 {
6540 /* Ignore sign extension. */
6541 val &= (HOST_WIDE_INT) 0xffffffff;
6542 }
6543 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6544 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6545 }
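
/* A few example values for the MOVZ test above (hypothetical helper, never
   called): any single aligned 16-bit chunk is accepted, while a value that
   spans two chunks is rejected and must instead be built via MOVN, a bitmask
   immediate or a MOV/MOVK sequence.  */

static ATTRIBUTE_UNUSED void
example_movw_values (void)
{
  gcc_checking_assert (aarch64_movw_imm (0x1234, DImode));
  gcc_checking_assert (aarch64_movw_imm (((HOST_WIDE_INT) 0xffff) << 32,
					 DImode));
  gcc_checking_assert (!aarch64_movw_imm (0x123456, DImode));
}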
6546
6547 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6548 64-bit (DImode) integer. */
6549
6550 static unsigned HOST_WIDE_INT
6551 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6552 {
6553 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6554 while (size < 64)
6555 {
6556 val &= (HOST_WIDE_INT_1U << size) - 1;
6557 val |= val << size;
6558 size *= 2;
6559 }
6560 return val;
6561 }
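
/* Example of the replication above (hypothetical, never-called helper):
   a QImode element 0xa5 fills all eight bytes of the DImode result.  */

static ATTRIBUTE_UNUSED void
example_replicate_bitmask (void)
{
  gcc_checking_assert (aarch64_replicate_bitmask_imm (0xa5, QImode)
		       == HOST_WIDE_INT_UC (0xa5a5a5a5a5a5a5a5));
}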
6562
6563 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6564
6565 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6566 {
6567 0x0000000100000001ull,
6568 0x0001000100010001ull,
6569 0x0101010101010101ull,
6570 0x1111111111111111ull,
6571 0x5555555555555555ull,
6572 };
6573
6574
6575 /* Return true if val is a valid bitmask immediate. */
6576
6577 bool
6578 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6579 {
6580 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6581 int bits;
6582
6583 /* Check for a single sequence of one bits and return quickly if so.
6584 The special cases of all ones and all zeroes return false. */
6585 val = aarch64_replicate_bitmask_imm (val_in, mode);
6586 tmp = val + (val & -val);
6587
6588 if (tmp == (tmp & -tmp))
6589 return (val + 1) > 1;
6590
6591 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6592 if (mode == SImode)
6593 val = (val << 32) | (val & 0xffffffff);
6594
6595 /* Invert if the immediate doesn't start with a zero bit - this means we
6596 only need to search for sequences of one bits. */
6597 if (val & 1)
6598 val = ~val;
6599
6600 /* Find the first set bit and set tmp to val with the first sequence of one
6601 bits removed. Return success if there is a single sequence of ones. */
6602 first_one = val & -val;
6603 tmp = val & (val + first_one);
6604
6605 if (tmp == 0)
6606 return true;
6607
6608 /* Find the next set bit and compute the difference in bit position. */
6609 next_one = tmp & -tmp;
6610 bits = clz_hwi (first_one) - clz_hwi (next_one);
6611 mask = val ^ tmp;
6612
6613 /* Check the bit position difference is a power of 2, and that the first
6614 sequence of one bits fits within 'bits' bits. */
6615 if ((mask >> bits) != 0 || bits != (bits & -bits))
6616 return false;
6617
6618 /* Check the sequence of one bits is repeated 64/bits times. */
6619 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6620 }
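
/* Some concrete inputs for the test above (hypothetical helper, not part of
   the normal compilation path): a repeated 16-bit element of eight ones and
   a single contiguous run are accepted, while all-zeros and all-ones are
   rejected as documented.  */

static ATTRIBUTE_UNUSED void
example_bitmask_values (void)
{
  gcc_checking_assert (aarch64_bitmask_imm (HOST_WIDE_INT_UC (0x00ff00ff00ff00ff),
					    DImode));
  gcc_checking_assert (aarch64_bitmask_imm (0x0ff0, DImode));
  gcc_checking_assert (!aarch64_bitmask_imm (0, DImode));
  gcc_checking_assert (!aarch64_bitmask_imm (-1, DImode));
}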
6621
6622 /* Create a mask of ones covering the range from the lowest to the highest
6623 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6624
6625 unsigned HOST_WIDE_INT
6626 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6627 {
6628 int lowest_bit_set = ctz_hwi (val_in);
6629 int highest_bit_set = floor_log2 (val_in);
6630 gcc_assert (val_in != 0);
6631
6632 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6633 (HOST_WIDE_INT_1U << lowest_bit_set));
6634 }
6635
6636 /* Create a constant in which all bits outside the range from the lowest to
6637 the highest bit set in VAL_IN are set to 1. */
6638
6639 unsigned HOST_WIDE_INT
6640 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6641 {
6642 return val_in | ~aarch64_and_split_imm1 (val_in);
6643 }
6644
6645 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6646
6647 bool
6648 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6649 {
6650 scalar_int_mode int_mode;
6651 if (!is_a <scalar_int_mode> (mode, &int_mode))
6652 return false;
6653
6654 if (aarch64_bitmask_imm (val_in, int_mode))
6655 return false;
6656
6657 if (aarch64_move_imm (val_in, int_mode))
6658 return false;
6659
6660 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6661
6662 return aarch64_bitmask_imm (imm2, int_mode);
6663 }
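
/* A worked example of the AND splitting above (hypothetical helper, purely
   illustrative): 0x00f000f0 is neither a bitmask nor a MOV immediate, but
   both halves of the split are valid bitmasks, so "and x0, x1, #0x00f000f0"
   can be rewritten as two AND-immediate instructions whose masks intersect
   in the original value.  */

static ATTRIBUTE_UNUSED void
example_and_split (void)
{
  unsigned HOST_WIDE_INT val = 0x00f000f0;
  unsigned HOST_WIDE_INT imm1 = aarch64_and_split_imm1 (val);
  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val);
  gcc_checking_assert (imm1 == 0x00fffff0);
  gcc_checking_assert (imm2 == HOST_WIDE_INT_UC (0xfffffffffff000ff));
  gcc_checking_assert ((imm1 & imm2) == val);
  gcc_checking_assert (aarch64_and_bitmask_imm (val, DImode));
}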
6664
6665 /* Return true if val is an immediate that can be loaded into a
6666 register in a single instruction. */
6667 bool
6668 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6669 {
6670 scalar_int_mode int_mode;
6671 if (!is_a <scalar_int_mode> (mode, &int_mode))
6672 return false;
6673
6674 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6675 return 1;
6676 return aarch64_bitmask_imm (val, int_mode);
6677 }
6678
6679 static bool
6680 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6681 {
6682 rtx base, offset;
6683
6684 if (GET_CODE (x) == HIGH)
6685 return true;
6686
6687 /* There's no way to calculate VL-based values using relocations. */
6688 subrtx_iterator::array_type array;
6689 FOR_EACH_SUBRTX (iter, array, x, ALL)
6690 if (GET_CODE (*iter) == CONST_POLY_INT)
6691 return true;
6692
6693 split_const (x, &base, &offset);
6694 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6695 {
6696 if (aarch64_classify_symbol (base, INTVAL (offset))
6697 != SYMBOL_FORCE_TO_MEM)
6698 return true;
6699 else
6700 /* Avoid generating a 64-bit relocation in ILP32; leave
6701 to aarch64_expand_mov_immediate to handle it properly. */
6702 return mode != ptr_mode;
6703 }
6704
6705 return aarch64_tls_referenced_p (x);
6706 }
6707
6708 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6709 The expansion for a table switch is quite expensive due to the number
6710 of instructions, the table lookup and the hard-to-predict indirect jump.
6711 When optimizing for speed with -O3 enabled, use the per-core tuning if
6712 set, otherwise use tables for > 16 cases as a tradeoff between size and
6713 performance. When optimizing for size, use the default setting. */
6714
6715 static unsigned int
6716 aarch64_case_values_threshold (void)
6717 {
6718 /* Use the specified limit for the number of cases before using jump
6719 tables at higher optimization levels. */
6720 if (optimize > 2
6721 && selected_cpu->tune->max_case_values != 0)
6722 return selected_cpu->tune->max_case_values;
6723 else
6724 return optimize_size ? default_case_values_threshold () : 17;
6725 }
6726
6727 /* Return true if register REGNO is a valid index register.
6728 STRICT_P is true if REG_OK_STRICT is in effect. */
6729
6730 bool
6731 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6732 {
6733 if (!HARD_REGISTER_NUM_P (regno))
6734 {
6735 if (!strict_p)
6736 return true;
6737
6738 if (!reg_renumber)
6739 return false;
6740
6741 regno = reg_renumber[regno];
6742 }
6743 return GP_REGNUM_P (regno);
6744 }
6745
6746 /* Return true if register REGNO is a valid base register for mode MODE.
6747 STRICT_P is true if REG_OK_STRICT is in effect. */
6748
6749 bool
6750 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6751 {
6752 if (!HARD_REGISTER_NUM_P (regno))
6753 {
6754 if (!strict_p)
6755 return true;
6756
6757 if (!reg_renumber)
6758 return false;
6759
6760 regno = reg_renumber[regno];
6761 }
6762
6763 /* The fake registers will be eliminated to either the stack or
6764 hard frame pointer, both of which are usually valid base registers.
6765 Reload deals with the cases where the eliminated form isn't valid. */
6766 return (GP_REGNUM_P (regno)
6767 || regno == SP_REGNUM
6768 || regno == FRAME_POINTER_REGNUM
6769 || regno == ARG_POINTER_REGNUM);
6770 }
6771
6772 /* Return true if X is a valid base register for mode MODE.
6773 STRICT_P is true if REG_OK_STRICT is in effect. */
6774
6775 static bool
6776 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6777 {
6778 if (!strict_p
6779 && GET_CODE (x) == SUBREG
6780 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6781 x = SUBREG_REG (x);
6782
6783 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6784 }
6785
6786 /* Return true if address offset is a valid index. If it is, fill in INFO
6787 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6788
6789 static bool
6790 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6791 machine_mode mode, bool strict_p)
6792 {
6793 enum aarch64_address_type type;
6794 rtx index;
6795 int shift;
6796
6797 /* (reg:P) */
6798 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6799 && GET_MODE (x) == Pmode)
6800 {
6801 type = ADDRESS_REG_REG;
6802 index = x;
6803 shift = 0;
6804 }
6805 /* (sign_extend:DI (reg:SI)) */
6806 else if ((GET_CODE (x) == SIGN_EXTEND
6807 || GET_CODE (x) == ZERO_EXTEND)
6808 && GET_MODE (x) == DImode
6809 && GET_MODE (XEXP (x, 0)) == SImode)
6810 {
6811 type = (GET_CODE (x) == SIGN_EXTEND)
6812 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6813 index = XEXP (x, 0);
6814 shift = 0;
6815 }
6816 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6817 else if (GET_CODE (x) == MULT
6818 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6819 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6820 && GET_MODE (XEXP (x, 0)) == DImode
6821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6822 && CONST_INT_P (XEXP (x, 1)))
6823 {
6824 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6825 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6826 index = XEXP (XEXP (x, 0), 0);
6827 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6828 }
6829 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6830 else if (GET_CODE (x) == ASHIFT
6831 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6832 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6833 && GET_MODE (XEXP (x, 0)) == DImode
6834 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6835 && CONST_INT_P (XEXP (x, 1)))
6836 {
6837 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6838 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6839 index = XEXP (XEXP (x, 0), 0);
6840 shift = INTVAL (XEXP (x, 1));
6841 }
6842 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6843 else if ((GET_CODE (x) == SIGN_EXTRACT
6844 || GET_CODE (x) == ZERO_EXTRACT)
6845 && GET_MODE (x) == DImode
6846 && GET_CODE (XEXP (x, 0)) == MULT
6847 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6848 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6849 {
6850 type = (GET_CODE (x) == SIGN_EXTRACT)
6851 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6852 index = XEXP (XEXP (x, 0), 0);
6853 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6854 if (INTVAL (XEXP (x, 1)) != 32 + shift
6855 || INTVAL (XEXP (x, 2)) != 0)
6856 shift = -1;
6857 }
6858 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6859 (const_int 0xffffffff<<shift)) */
6860 else if (GET_CODE (x) == AND
6861 && GET_MODE (x) == DImode
6862 && GET_CODE (XEXP (x, 0)) == MULT
6863 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6864 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6865 && CONST_INT_P (XEXP (x, 1)))
6866 {
6867 type = ADDRESS_REG_UXTW;
6868 index = XEXP (XEXP (x, 0), 0);
6869 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6870 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6871 shift = -1;
6872 }
6873 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6874 else if ((GET_CODE (x) == SIGN_EXTRACT
6875 || GET_CODE (x) == ZERO_EXTRACT)
6876 && GET_MODE (x) == DImode
6877 && GET_CODE (XEXP (x, 0)) == ASHIFT
6878 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6879 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6880 {
6881 type = (GET_CODE (x) == SIGN_EXTRACT)
6882 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6883 index = XEXP (XEXP (x, 0), 0);
6884 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6885 if (INTVAL (XEXP (x, 1)) != 32 + shift
6886 || INTVAL (XEXP (x, 2)) != 0)
6887 shift = -1;
6888 }
6889 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6890 (const_int 0xffffffff<<shift)) */
6891 else if (GET_CODE (x) == AND
6892 && GET_MODE (x) == DImode
6893 && GET_CODE (XEXP (x, 0)) == ASHIFT
6894 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6895 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6896 && CONST_INT_P (XEXP (x, 1)))
6897 {
6898 type = ADDRESS_REG_UXTW;
6899 index = XEXP (XEXP (x, 0), 0);
6900 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6901 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6902 shift = -1;
6903 }
6904 /* (mult:P (reg:P) (const_int scale)) */
6905 else if (GET_CODE (x) == MULT
6906 && GET_MODE (x) == Pmode
6907 && GET_MODE (XEXP (x, 0)) == Pmode
6908 && CONST_INT_P (XEXP (x, 1)))
6909 {
6910 type = ADDRESS_REG_REG;
6911 index = XEXP (x, 0);
6912 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6913 }
6914 /* (ashift:P (reg:P) (const_int shift)) */
6915 else if (GET_CODE (x) == ASHIFT
6916 && GET_MODE (x) == Pmode
6917 && GET_MODE (XEXP (x, 0)) == Pmode
6918 && CONST_INT_P (XEXP (x, 1)))
6919 {
6920 type = ADDRESS_REG_REG;
6921 index = XEXP (x, 0);
6922 shift = INTVAL (XEXP (x, 1));
6923 }
6924 else
6925 return false;
6926
6927 if (!strict_p
6928 && GET_CODE (index) == SUBREG
6929 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6930 index = SUBREG_REG (index);
6931
6932 if (aarch64_sve_data_mode_p (mode))
6933 {
6934 if (type != ADDRESS_REG_REG
6935 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6936 return false;
6937 }
6938 else
6939 {
6940 if (shift != 0
6941 && !(IN_RANGE (shift, 1, 3)
6942 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6943 return false;
6944 }
6945
6946 if (REG_P (index)
6947 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6948 {
6949 info->type = type;
6950 info->offset = index;
6951 info->shift = shift;
6952 return true;
6953 }
6954
6955 return false;
6956 }
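/* For example (illustrative only): under LP64, an index rtx such as
(ashift:DI (reg:DI x1) (const_int 3)) used with an 8-byte MODE is accepted
above as ADDRESS_REG_REG with shift 3, i.e. the [Xn, Xm, LSL #3] form. */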
6957
6958 /* Return true if MODE is one of the modes for which we
6959 support LDP/STP operations. */
6960
6961 static bool
6962 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6963 {
6964 return mode == SImode || mode == DImode
6965 || mode == SFmode || mode == DFmode
6966 || (aarch64_vector_mode_supported_p (mode)
6967 && (known_eq (GET_MODE_SIZE (mode), 8)
6968 || (known_eq (GET_MODE_SIZE (mode), 16)
6969 && (aarch64_tune_params.extra_tuning_flags
6970 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6971 }
6972
6973 /* Return true if REGNO is a virtual pointer register, or an eliminable
6974 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6975 include stack_pointer or hard_frame_pointer. */
6976 static bool
6977 virt_or_elim_regno_p (unsigned regno)
6978 {
6979 return ((regno >= FIRST_VIRTUAL_REGISTER
6980 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6981 || regno == FRAME_POINTER_REGNUM
6982 || regno == ARG_POINTER_REGNUM);
6983 }
6984
6985 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6986 If it is, fill in INFO appropriately. STRICT_P is true if
6987 REG_OK_STRICT is in effect. */
6988
6989 bool
6990 aarch64_classify_address (struct aarch64_address_info *info,
6991 rtx x, machine_mode mode, bool strict_p,
6992 aarch64_addr_query_type type)
6993 {
6994 enum rtx_code code = GET_CODE (x);
6995 rtx op0, op1;
6996 poly_int64 offset;
6997
6998 HOST_WIDE_INT const_size;
6999
7000 /* On BE, we use load/store pair for all large int mode load/stores.
7001 TI/TFmode may also use a load/store pair. */
7002 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7003 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7004 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7005 || type == ADDR_QUERY_LDP_STP_N
7006 || mode == TImode
7007 || mode == TFmode
7008 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7009
7010 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7011 to the full size of the memory being loaded/stored, and the mode used for
7012 the address calculation is half that size. */
7013 if (type == ADDR_QUERY_LDP_STP_N
7014 && known_eq (GET_MODE_SIZE (mode), 16))
7015 mode = DFmode;
7016
7017 bool allow_reg_index_p = (!load_store_pair_p
7018 && (known_lt (GET_MODE_SIZE (mode), 16)
7019 || vec_flags == VEC_ADVSIMD
7020 || vec_flags & VEC_SVE_DATA));
7021
7022 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7023 [Rn, #offset, MUL VL]. */
7024 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7025 && (code != REG && code != PLUS))
7026 return false;
7027
7028 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7029 REG addressing. */
7030 if (advsimd_struct_p
7031 && !BYTES_BIG_ENDIAN
7032 && (code != POST_INC && code != REG))
7033 return false;
7034
7035 gcc_checking_assert (GET_MODE (x) == VOIDmode
7036 || SCALAR_INT_MODE_P (GET_MODE (x)));
7037
7038 switch (code)
7039 {
7040 case REG:
7041 case SUBREG:
7042 info->type = ADDRESS_REG_IMM;
7043 info->base = x;
7044 info->offset = const0_rtx;
7045 info->const_offset = 0;
7046 return aarch64_base_register_rtx_p (x, strict_p);
7047
7048 case PLUS:
7049 op0 = XEXP (x, 0);
7050 op1 = XEXP (x, 1);
7051
7052 if (! strict_p
7053 && REG_P (op0)
7054 && virt_or_elim_regno_p (REGNO (op0))
7055 && poly_int_rtx_p (op1, &offset))
7056 {
7057 info->type = ADDRESS_REG_IMM;
7058 info->base = op0;
7059 info->offset = op1;
7060 info->const_offset = offset;
7061
7062 return true;
7063 }
7064
7065 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7066 && aarch64_base_register_rtx_p (op0, strict_p)
7067 && poly_int_rtx_p (op1, &offset))
7068 {
7069 info->type = ADDRESS_REG_IMM;
7070 info->base = op0;
7071 info->offset = op1;
7072 info->const_offset = offset;
7073
7074 /* TImode and TFmode values are allowed in both pairs of X
7075 registers and individual Q registers. The available
7076 address modes are:
7077 X,X: 7-bit signed scaled offset
7078 Q: 9-bit signed offset
7079 We conservatively require an offset representable in either mode.
7080 When performing the check for pairs of X registers, i.e. LDP/STP,
7081 pass down DImode since that is the natural size of the LDP/STP
7082 instruction memory accesses. */
7083 if (mode == TImode || mode == TFmode)
7084 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7085 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7086 || offset_12bit_unsigned_scaled_p (mode, offset)));
7087
7088 /* A 7-bit offset check because OImode will emit an ldp/stp
7089 instruction (only big endian will get here).
7090 For ldp/stp instructions, the offset is scaled for the size of a
7091 single element of the pair. */
7092 if (mode == OImode)
7093 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7094
7095 /* Three 9/12-bit offset checks because CImode will emit three
7096 ldr/str instructions (only big endian will get here). */
7097 if (mode == CImode)
7098 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7099 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7100 offset + 32)
7101 || offset_12bit_unsigned_scaled_p (V16QImode,
7102 offset + 32)));
7103
7104 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7105 instructions (only big endian will get here). */
7106 if (mode == XImode)
7107 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7108 && aarch64_offset_7bit_signed_scaled_p (TImode,
7109 offset + 32));
7110
7111 /* Make "m" use the LD1 offset range for SVE data modes, so
7112 that pre-RTL optimizers like ivopts will work to that range
7113 rather than to the wider LDR/STR range. */
7114 if (vec_flags == VEC_SVE_DATA)
7115 return (type == ADDR_QUERY_M
7116 ? offset_4bit_signed_scaled_p (mode, offset)
7117 : offset_9bit_signed_scaled_p (mode, offset));
7118
7119 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7120 {
7121 poly_int64 end_offset = (offset
7122 + GET_MODE_SIZE (mode)
7123 - BYTES_PER_SVE_VECTOR);
7124 return (type == ADDR_QUERY_M
7125 ? offset_4bit_signed_scaled_p (mode, offset)
7126 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7127 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7128 end_offset)));
7129 }
7130
7131 if (vec_flags == VEC_SVE_PRED)
7132 return offset_9bit_signed_scaled_p (mode, offset);
7133
7134 if (load_store_pair_p)
7135 return ((known_eq (GET_MODE_SIZE (mode), 4)
7136 || known_eq (GET_MODE_SIZE (mode), 8)
7137 || known_eq (GET_MODE_SIZE (mode), 16))
7138 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7139 else
7140 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7141 || offset_12bit_unsigned_scaled_p (mode, offset));
7142 }
7143
7144 if (allow_reg_index_p)
7145 {
7146 /* Look for base + (scaled/extended) index register. */
7147 if (aarch64_base_register_rtx_p (op0, strict_p)
7148 && aarch64_classify_index (info, op1, mode, strict_p))
7149 {
7150 info->base = op0;
7151 return true;
7152 }
7153 if (aarch64_base_register_rtx_p (op1, strict_p)
7154 && aarch64_classify_index (info, op0, mode, strict_p))
7155 {
7156 info->base = op1;
7157 return true;
7158 }
7159 }
7160
7161 return false;
7162
7163 case POST_INC:
7164 case POST_DEC:
7165 case PRE_INC:
7166 case PRE_DEC:
7167 info->type = ADDRESS_REG_WB;
7168 info->base = XEXP (x, 0);
7169 info->offset = NULL_RTX;
7170 return aarch64_base_register_rtx_p (info->base, strict_p);
7171
7172 case POST_MODIFY:
7173 case PRE_MODIFY:
7174 info->type = ADDRESS_REG_WB;
7175 info->base = XEXP (x, 0);
7176 if (GET_CODE (XEXP (x, 1)) == PLUS
7177 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7178 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7179 && aarch64_base_register_rtx_p (info->base, strict_p))
7180 {
7181 info->offset = XEXP (XEXP (x, 1), 1);
7182 info->const_offset = offset;
7183
7184 /* TImode and TFmode values are allowed in both pairs of X
7185 registers and individual Q registers. The available
7186 address modes are:
7187 X,X: 7-bit signed scaled offset
7188 Q: 9-bit signed offset
7189 We conservatively require an offset representable in either mode.
7190 */
7191 if (mode == TImode || mode == TFmode)
7192 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7193 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7194
7195 if (load_store_pair_p)
7196 return ((known_eq (GET_MODE_SIZE (mode), 4)
7197 || known_eq (GET_MODE_SIZE (mode), 8)
7198 || known_eq (GET_MODE_SIZE (mode), 16))
7199 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7200 else
7201 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7202 }
7203 return false;
7204
7205 case CONST:
7206 case SYMBOL_REF:
7207 case LABEL_REF:
7208 /* Load literal: PC-relative constant pool entry. Only supported
7209 for SImode or larger. */
7210 info->type = ADDRESS_SYMBOLIC;
7211
7212 if (!load_store_pair_p
7213 && GET_MODE_SIZE (mode).is_constant (&const_size)
7214 && const_size >= 4)
7215 {
7216 rtx sym, addend;
7217
7218 split_const (x, &sym, &addend);
7219 return ((GET_CODE (sym) == LABEL_REF
7220 || (GET_CODE (sym) == SYMBOL_REF
7221 && CONSTANT_POOL_ADDRESS_P (sym)
7222 && aarch64_pcrelative_literal_loads)));
7223 }
7224 return false;
7225
7226 case LO_SUM:
7227 info->type = ADDRESS_LO_SUM;
7228 info->base = XEXP (x, 0);
7229 info->offset = XEXP (x, 1);
7230 if (allow_reg_index_p
7231 && aarch64_base_register_rtx_p (info->base, strict_p))
7232 {
7233 rtx sym, offs;
7234 split_const (info->offset, &sym, &offs);
7235 if (GET_CODE (sym) == SYMBOL_REF
7236 && (aarch64_classify_symbol (sym, INTVAL (offs))
7237 == SYMBOL_SMALL_ABSOLUTE))
7238 {
7239 /* The symbol and offset must be aligned to the access size. */
7240 unsigned int align;
7241
7242 if (CONSTANT_POOL_ADDRESS_P (sym))
7243 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7244 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7245 {
7246 tree exp = SYMBOL_REF_DECL (sym);
7247 align = TYPE_ALIGN (TREE_TYPE (exp));
7248 align = aarch64_constant_alignment (exp, align);
7249 }
7250 else if (SYMBOL_REF_DECL (sym))
7251 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7252 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7253 && SYMBOL_REF_BLOCK (sym) != NULL)
7254 align = SYMBOL_REF_BLOCK (sym)->alignment;
7255 else
7256 align = BITS_PER_UNIT;
7257
7258 poly_int64 ref_size = GET_MODE_SIZE (mode);
7259 if (known_eq (ref_size, 0))
7260 ref_size = GET_MODE_SIZE (DImode);
7261
7262 return (multiple_p (INTVAL (offs), ref_size)
7263 && multiple_p (align / BITS_PER_UNIT, ref_size));
7264 }
7265 }
7266 return false;
7267
7268 default:
7269 return false;
7270 }
7271 }
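/* For example (illustrative only): with MODE == DImode, an address such as
(plus:DI (reg:DI x0) (const_int 16)) is classified above as ADDRESS_REG_IMM
with const_offset 16, i.e. the [x0, 16] form, since 16 fits both the signed
9-bit unscaled and the unsigned 12-bit scaled offset ranges. */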
7272
7273 /* Return true if the address X is valid for a PRFM instruction.
7274 STRICT_P is true if we should do strict checking with
7275 aarch64_classify_address. */
7276
7277 bool
7278 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7279 {
7280 struct aarch64_address_info addr;
7281
7282 /* PRFM accepts the same addresses as DImode... */
7283 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7284 if (!res)
7285 return false;
7286
7287 /* ... except writeback forms. */
7288 return addr.type != ADDRESS_REG_WB;
7289 }
7290
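/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
possibly with a constant offset added. */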
7291 bool
7292 aarch64_symbolic_address_p (rtx x)
7293 {
7294 rtx offset;
7295
7296 split_const (x, &x, &offset);
7297 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7298 }
7299
7300 /* Classify the base of symbolic expression X. */
7301
7302 enum aarch64_symbol_type
7303 aarch64_classify_symbolic_expression (rtx x)
7304 {
7305 rtx offset;
7306
7307 split_const (x, &x, &offset);
7308 return aarch64_classify_symbol (x, INTVAL (offset));
7309 }
7310
7311
7312 /* Return TRUE if X is a legitimate address for accessing memory in
7313 mode MODE. */
7314 static bool
7315 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7316 {
7317 struct aarch64_address_info addr;
7318
7319 return aarch64_classify_address (&addr, x, mode, strict_p);
7320 }
7321
7322 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7323 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7324 bool
7325 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7326 aarch64_addr_query_type type)
7327 {
7328 struct aarch64_address_info addr;
7329
7330 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7331 }
7332
7333 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7334
7335 static bool
7336 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7337 poly_int64 orig_offset,
7338 machine_mode mode)
7339 {
7340 HOST_WIDE_INT size;
7341 if (GET_MODE_SIZE (mode).is_constant (&size))
7342 {
7343 HOST_WIDE_INT const_offset, second_offset;
7344
7345 /* A general SVE offset is A * VQ + B. Remove the A component from
7346 coefficient 0 in order to get the constant B. */
7347 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7348
7349 /* Split an out-of-range address displacement into a base and
7350 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
7351 range otherwise, to increase the opportunities for sharing the
7352 base address between accesses of different sizes. Unaligned
7353 accesses use the signed 9-bit range; TImode/TFmode use the
7354 intersection of the signed scaled 7-bit and signed 9-bit ranges. */
7355 if (mode == TImode || mode == TFmode)
7356 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7357 else if ((const_offset & (size - 1)) != 0)
7358 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7359 else
7360 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7361
7362 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7363 return false;
7364
7365 /* Split the offset into second_offset and the rest. */
7366 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7367 *offset2 = gen_int_mode (second_offset, Pmode);
7368 return true;
7369 }
7370 else
7371 {
7372 /* Get the mode we should use as the basis of the range. For structure
7373 modes this is the mode of one vector. */
7374 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7375 machine_mode step_mode
7376 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7377
7378 /* Get the "mul vl" multiplier we'd like to use. */
7379 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7380 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7381 if (vec_flags & VEC_SVE_DATA)
7382 /* LDR supports a 9-bit range, but the move patterns for
7383 structure modes require all vectors to be in range of the
7384 same base. The simplest way of accommodating that while still
7385 promoting reuse of anchor points between different modes is
7386 to use an 8-bit range unconditionally. */
7387 vnum = ((vnum + 128) & 255) - 128;
7388 else
7389 /* Predicates are only handled singly, so we might as well use
7390 the full range. */
7391 vnum = ((vnum + 256) & 511) - 256;
7392 if (vnum == 0)
7393 return false;
7394
7395 /* Convert the "mul vl" multiplier into a byte offset. */
7396 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7397 if (known_eq (second_offset, orig_offset))
7398 return false;
7399
7400 /* Split the offset into second_offset and the rest. */
7401 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7402 *offset2 = gen_int_mode (second_offset, Pmode);
7403 return true;
7404 }
7405 }
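/* Worked example of the constant-size case above (illustrative only): for
DImode (size 8) and an aligned displacement of 0x10008, second_offset is
0x10008 & 0x3ffc == 0x8, so the displacement is split into 0x10000 (folded
into the base) and 0x8 (kept in the address). */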
7406
7407 /* Store the binary representation of floating-point constant VALUE in *INTVAL
7408 and return true. If the value cannot be converted, return false without
7409 setting *INTVAL. The conversion is done in the mode of VALUE. */
7410 bool
7411 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7412 {
7413
7414 /* We make a general exception for 0. */
7415 if (aarch64_float_const_zero_rtx_p (value))
7416 {
7417 *intval = 0;
7418 return true;
7419 }
7420
7421 scalar_float_mode mode;
7422 if (GET_CODE (value) != CONST_DOUBLE
7423 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7424 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7425 /* Only support up to DF mode. */
7426 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7427 return false;
7428
7429 unsigned HOST_WIDE_INT ival = 0;
7430
7431 long res[2];
7432 real_to_target (res,
7433 CONST_DOUBLE_REAL_VALUE (value),
7434 REAL_MODE_FORMAT (mode));
7435
7436 if (mode == DFmode)
7437 {
7438 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7439 ival = zext_hwi (res[order], 32);
7440 ival |= (zext_hwi (res[1 - order], 32) << 32);
7441 }
7442 else
7443 ival = zext_hwi (res[0], 32);
7444
7445 *intval = ival;
7446 return true;
7447 }
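/* For example (illustrative only): for the DFmode constant 1.0 the function
above sets *INTVAL to 0x3ff0000000000000. */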
7448
7449 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7450 single MOV(+MOVK) followed by an FMOV. */
7451 bool
7452 aarch64_float_const_rtx_p (rtx x)
7453 {
7454 machine_mode mode = GET_MODE (x);
7455 if (mode == VOIDmode)
7456 return false;
7457
7458 /* Determine whether it's cheaper to materialize float constants as
7459 mov/movk sequences rather than as adrp/ldr literal loads. */
7460 unsigned HOST_WIDE_INT ival;
7461
7462 if (GET_CODE (x) == CONST_DOUBLE
7463 && SCALAR_FLOAT_MODE_P (mode)
7464 && aarch64_reinterpret_float_as_int (x, &ival))
7465 {
7466 scalar_int_mode imode = (mode == HFmode
7467 ? SImode
7468 : int_mode_for_mode (mode).require ());
7469 int num_instr = aarch64_internal_mov_immediate
7470 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7471 return num_instr < 3;
7472 }
7473
7474 return false;
7475 }
7476
7477 /* Return TRUE if rtx X is the immediate constant 0.0. */
7478 bool
7479 aarch64_float_const_zero_rtx_p (rtx x)
7480 {
7481 if (GET_MODE (x) == VOIDmode)
7482 return false;
7483
7484 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7485 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7486 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7487 }
7488
7489 /* Return TRUE if rtx X is an immediate constant that fits in a single
7490 MOVI immediate operation. */
7491 bool
7492 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7493 {
7494 if (!TARGET_SIMD)
7495 return false;
7496
7497 machine_mode vmode;
7498 scalar_int_mode imode;
7499 unsigned HOST_WIDE_INT ival;
7500
7501 if (GET_CODE (x) == CONST_DOUBLE
7502 && SCALAR_FLOAT_MODE_P (mode))
7503 {
7504 if (!aarch64_reinterpret_float_as_int (x, &ival))
7505 return false;
7506
7507 /* We make a general exception for 0. */
7508 if (aarch64_float_const_zero_rtx_p (x))
7509 return true;
7510
7511 imode = int_mode_for_mode (mode).require ();
7512 }
7513 else if (GET_CODE (x) == CONST_INT
7514 && is_a <scalar_int_mode> (mode, &imode))
7515 ival = INTVAL (x);
7516 else
7517 return false;
7518
7519 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7520 a 128-bit vector mode. */
7521 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7522
7523 vmode = aarch64_simd_container_mode (imode, width);
7524 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7525
7526 return aarch64_simd_valid_immediate (v_op, NULL);
7527 }
7528
7529
7530 /* Return the fixed registers used for condition codes. */
7531
7532 static bool
7533 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7534 {
7535 *p1 = CC_REGNUM;
7536 *p2 = INVALID_REGNUM;
7537 return true;
7538 }
7539
7540 /* This function is used by the call expanders of the machine description.
7541 RESULT is the register in which the result is returned. It's NULL for
7542 "call" and "sibcall".
7543 MEM is the location of the function call.
7544 SIBCALL indicates whether this function call is a normal call or a sibling
7545 call; a different pattern is generated accordingly. */
7546
7547 void
7548 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7549 {
7550 rtx call, callee, tmp;
7551 rtvec vec;
7552 machine_mode mode;
7553
7554 gcc_assert (MEM_P (mem));
7555 callee = XEXP (mem, 0);
7556 mode = GET_MODE (callee);
7557 gcc_assert (mode == Pmode);
7558
7559 /* Decide if we should generate indirect calls by loading the
7560 address of the callee into a register before performing
7561 the branch-and-link. */
7562 if (SYMBOL_REF_P (callee)
7563 ? (aarch64_is_long_call_p (callee)
7564 || aarch64_is_noplt_call_p (callee))
7565 : !REG_P (callee))
7566 XEXP (mem, 0) = force_reg (mode, callee);
7567
7568 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7569
7570 if (result != NULL_RTX)
7571 call = gen_rtx_SET (result, call);
7572
7573 if (sibcall)
7574 tmp = ret_rtx;
7575 else
7576 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7577
7578 vec = gen_rtvec (2, call, tmp);
7579 call = gen_rtx_PARALLEL (VOIDmode, vec);
7580
7581 aarch64_emit_call_insn (call);
7582 }
7583
7584 /* Emit call insn with PAT and do aarch64-specific handling. */
7585
7586 void
7587 aarch64_emit_call_insn (rtx pat)
7588 {
7589 rtx insn = emit_call_insn (pat);
7590
7591 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7592 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7593 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7594 }
7595
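/* Return the CC mode to use when comparing X and Y using relational
operator CODE; this implements SELECT_CC_MODE. */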
7596 machine_mode
7597 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7598 {
7599 machine_mode mode_x = GET_MODE (x);
7600 rtx_code code_x = GET_CODE (x);
7601
7602 /* All floating-point comparisons return CCFPmode if they are equality
7603 comparisons, and CCFPEmode otherwise. */
7604 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7605 {
7606 switch (code)
7607 {
7608 case EQ:
7609 case NE:
7610 case UNORDERED:
7611 case ORDERED:
7612 case UNLT:
7613 case UNLE:
7614 case UNGT:
7615 case UNGE:
7616 case UNEQ:
7617 return CCFPmode;
7618
7619 case LT:
7620 case LE:
7621 case GT:
7622 case GE:
7623 case LTGT:
7624 return CCFPEmode;
7625
7626 default:
7627 gcc_unreachable ();
7628 }
7629 }
7630
7631 /* Equality comparisons of short modes against zero can be performed
7632 using the TST instruction with the appropriate bitmask. */
7633 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7634 && (code == EQ || code == NE)
7635 && (mode_x == HImode || mode_x == QImode))
7636 return CC_NZmode;
7637
7638 /* Similarly, comparisons of zero_extends from shorter modes can
7639 be performed using an ANDS with an immediate mask. */
7640 if (y == const0_rtx && code_x == ZERO_EXTEND
7641 && (mode_x == SImode || mode_x == DImode)
7642 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7643 && (code == EQ || code == NE))
7644 return CC_NZmode;
7645
7646 if ((mode_x == SImode || mode_x == DImode)
7647 && y == const0_rtx
7648 && (code == EQ || code == NE || code == LT || code == GE)
7649 && (code_x == PLUS || code_x == MINUS || code_x == AND
7650 || code_x == NEG
7651 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7652 && CONST_INT_P (XEXP (x, 2)))))
7653 return CC_NZmode;
7654
7655 /* A compare with a shifted operand. Because of canonicalization,
7656 the comparison will have to be swapped when we emit the assembly
7657 code. */
7658 if ((mode_x == SImode || mode_x == DImode)
7659 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7660 && (code_x == ASHIFT || code_x == ASHIFTRT
7661 || code_x == LSHIFTRT
7662 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7663 return CC_SWPmode;
7664
7665 /* Similarly for a negated operand, but we can only do this for
7666 equalities. */
7667 if ((mode_x == SImode || mode_x == DImode)
7668 && (REG_P (y) || GET_CODE (y) == SUBREG)
7669 && (code == EQ || code == NE)
7670 && code_x == NEG)
7671 return CC_Zmode;
7672
7673 /* A test for unsigned overflow from an addition. */
7674 if ((mode_x == DImode || mode_x == TImode)
7675 && (code == LTU || code == GEU)
7676 && code_x == PLUS
7677 && rtx_equal_p (XEXP (x, 0), y))
7678 return CC_Cmode;
7679
7680 /* A test for unsigned overflow from an add with carry. */
7681 if ((mode_x == DImode || mode_x == TImode)
7682 && (code == LTU || code == GEU)
7683 && code_x == PLUS
7684 && CONST_SCALAR_INT_P (y)
7685 && (rtx_mode_t (y, mode_x)
7686 == (wi::shwi (1, mode_x)
7687 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7688 return CC_ADCmode;
7689
7690 /* A test for signed overflow. */
7691 if ((mode_x == DImode || mode_x == TImode)
7692 && code == NE
7693 && code_x == PLUS
7694 && GET_CODE (y) == SIGN_EXTEND)
7695 return CC_Vmode;
7696
7697 /* For everything else, return CCmode. */
7698 return CCmode;
7699 }
7700
7701 static int
7702 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7703
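/* Return the AArch64 condition code (AARCH64_EQ etc.) corresponding to
comparison rtx X, or -1 if there is no direct equivalent. */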
7704 int
7705 aarch64_get_condition_code (rtx x)
7706 {
7707 machine_mode mode = GET_MODE (XEXP (x, 0));
7708 enum rtx_code comp_code = GET_CODE (x);
7709
7710 if (GET_MODE_CLASS (mode) != MODE_CC)
7711 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7712 return aarch64_get_condition_code_1 (mode, comp_code);
7713 }
7714
7715 static int
7716 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7717 {
7718 switch (mode)
7719 {
7720 case E_CCFPmode:
7721 case E_CCFPEmode:
7722 switch (comp_code)
7723 {
7724 case GE: return AARCH64_GE;
7725 case GT: return AARCH64_GT;
7726 case LE: return AARCH64_LS;
7727 case LT: return AARCH64_MI;
7728 case NE: return AARCH64_NE;
7729 case EQ: return AARCH64_EQ;
7730 case ORDERED: return AARCH64_VC;
7731 case UNORDERED: return AARCH64_VS;
7732 case UNLT: return AARCH64_LT;
7733 case UNLE: return AARCH64_LE;
7734 case UNGT: return AARCH64_HI;
7735 case UNGE: return AARCH64_PL;
7736 default: return -1;
7737 }
7738 break;
7739
7740 case E_CCmode:
7741 switch (comp_code)
7742 {
7743 case NE: return AARCH64_NE;
7744 case EQ: return AARCH64_EQ;
7745 case GE: return AARCH64_GE;
7746 case GT: return AARCH64_GT;
7747 case LE: return AARCH64_LE;
7748 case LT: return AARCH64_LT;
7749 case GEU: return AARCH64_CS;
7750 case GTU: return AARCH64_HI;
7751 case LEU: return AARCH64_LS;
7752 case LTU: return AARCH64_CC;
7753 default: return -1;
7754 }
7755 break;
7756
7757 case E_CC_SWPmode:
7758 switch (comp_code)
7759 {
7760 case NE: return AARCH64_NE;
7761 case EQ: return AARCH64_EQ;
7762 case GE: return AARCH64_LE;
7763 case GT: return AARCH64_LT;
7764 case LE: return AARCH64_GE;
7765 case LT: return AARCH64_GT;
7766 case GEU: return AARCH64_LS;
7767 case GTU: return AARCH64_CC;
7768 case LEU: return AARCH64_CS;
7769 case LTU: return AARCH64_HI;
7770 default: return -1;
7771 }
7772 break;
7773
7774 case E_CC_NZCmode:
7775 switch (comp_code)
7776 {
7777 case NE: return AARCH64_NE; /* = any */
7778 case EQ: return AARCH64_EQ; /* = none */
7779 case GE: return AARCH64_PL; /* = nfrst */
7780 case LT: return AARCH64_MI; /* = first */
7781 case GEU: return AARCH64_CS; /* = nlast */
7782 case GTU: return AARCH64_HI; /* = pmore */
7783 case LEU: return AARCH64_LS; /* = plast */
7784 case LTU: return AARCH64_CC; /* = last */
7785 default: return -1;
7786 }
7787 break;
7788
7789 case E_CC_NZmode:
7790 switch (comp_code)
7791 {
7792 case NE: return AARCH64_NE;
7793 case EQ: return AARCH64_EQ;
7794 case GE: return AARCH64_PL;
7795 case LT: return AARCH64_MI;
7796 default: return -1;
7797 }
7798 break;
7799
7800 case E_CC_Zmode:
7801 switch (comp_code)
7802 {
7803 case NE: return AARCH64_NE;
7804 case EQ: return AARCH64_EQ;
7805 default: return -1;
7806 }
7807 break;
7808
7809 case E_CC_Cmode:
7810 switch (comp_code)
7811 {
7812 case LTU: return AARCH64_CS;
7813 case GEU: return AARCH64_CC;
7814 default: return -1;
7815 }
7816 break;
7817
7818 case E_CC_ADCmode:
7819 switch (comp_code)
7820 {
7821 case GEU: return AARCH64_CS;
7822 case LTU: return AARCH64_CC;
7823 default: return -1;
7824 }
7825 break;
7826
7827 case E_CC_Vmode:
7828 switch (comp_code)
7829 {
7830 case NE: return AARCH64_VS;
7831 case EQ: return AARCH64_VC;
7832 default: return -1;
7833 }
7834 break;
7835
7836 default:
7837 return -1;
7838 }
7839
7840 return -1;
7841 }
7842
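/* Return true if X is a constant vector whose elements are all the same
CONST_INT, with that value in the range [MINVAL, MAXVAL]. */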
7843 bool
7844 aarch64_const_vec_all_same_in_range_p (rtx x,
7845 HOST_WIDE_INT minval,
7846 HOST_WIDE_INT maxval)
7847 {
7848 rtx elt;
7849 return (const_vec_duplicate_p (x, &elt)
7850 && CONST_INT_P (elt)
7851 && IN_RANGE (INTVAL (elt), minval, maxval));
7852 }
7853
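/* Return true if X is a constant vector in which every element equals the
integer VAL. */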
7854 bool
7855 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7856 {
7857 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7858 }
7859
7860 /* Return true if VEC is a constant in which every element is in the range
7861 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7862
7863 static bool
7864 aarch64_const_vec_all_in_range_p (rtx vec,
7865 HOST_WIDE_INT minval,
7866 HOST_WIDE_INT maxval)
7867 {
7868 if (GET_CODE (vec) != CONST_VECTOR
7869 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7870 return false;
7871
7872 int nunits;
7873 if (!CONST_VECTOR_STEPPED_P (vec))
7874 nunits = const_vector_encoded_nelts (vec);
7875 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7876 return false;
7877
7878 for (int i = 0; i < nunits; i++)
7879 {
7880 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7881 if (!CONST_INT_P (vec_elem)
7882 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7883 return false;
7884 }
7885 return true;
7886 }
7887
7888 /* N Z C V. */
7889 #define AARCH64_CC_V 1
7890 #define AARCH64_CC_C (1 << 1)
7891 #define AARCH64_CC_Z (1 << 2)
7892 #define AARCH64_CC_N (1 << 3)
7893
7894 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7895 static const int aarch64_nzcv_codes[] =
7896 {
7897 0, /* EQ, Z == 1. */
7898 AARCH64_CC_Z, /* NE, Z == 0. */
7899 0, /* CS, C == 1. */
7900 AARCH64_CC_C, /* CC, C == 0. */
7901 0, /* MI, N == 1. */
7902 AARCH64_CC_N, /* PL, N == 0. */
7903 0, /* VS, V == 1. */
7904 AARCH64_CC_V, /* VC, V == 0. */
7905 0, /* HI, C == 1 && Z == 0. */
7906 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7907 AARCH64_CC_V, /* GE, N == V. */
7908 0, /* LT, N != V. */
7909 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7910 0, /* LE, !(Z == 0 && N == V). */
7911 0, /* AL, Any. */
7912 0 /* NV, Any. */
7913 };
7914
7915 /* Print floating-point vector immediate operand X to F, negating it
7916 first if NEGATE is true. Return true on success, false if it isn't
7917 a constant we can handle. */
7918
7919 static bool
7920 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7921 {
7922 rtx elt;
7923
7924 if (!const_vec_duplicate_p (x, &elt))
7925 return false;
7926
7927 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7928 if (negate)
7929 r = real_value_negate (&r);
7930
7931 /* We only handle the SVE single-bit immediates here. */
7932 if (real_equal (&r, &dconst0))
7933 asm_fprintf (f, "0.0");
7934 else if (real_equal (&r, &dconst1))
7935 asm_fprintf (f, "1.0");
7936 else if (real_equal (&r, &dconsthalf))
7937 asm_fprintf (f, "0.5");
7938 else
7939 return false;
7940
7941 return true;
7942 }
7943
7944 /* Return the element size suffix letter ('b', 'h', 's' or 'd') for SIZE bits. */
7945 static char
7946 sizetochar (int size)
7947 {
7948 switch (size)
7949 {
7950 case 64: return 'd';
7951 case 32: return 's';
7952 case 16: return 'h';
7953 case 8 : return 'b';
7954 default: gcc_unreachable ();
7955 }
7956 }
7957
7958 /* Print operand X to file F in a target specific manner according to CODE.
7959 The acceptable formatting commands given by CODE are:
7960 'c': An integer or symbol address without a preceding #
7961 sign.
7962 'C': Take the duplicated element in a vector constant
7963 and print it in hex.
7964 'D': Take the duplicated element in a vector constant
7965 and print it as an unsigned integer, in decimal.
7966 'e': Print the sign/zero-extend size as a character 8->b,
7967 16->h, 32->w.
7968 'p': Prints N such that 2^N == X (X must be a power of 2 and
7969 a const_int).
7970 'P': Print the number of non-zero bits in X (a const_int).
7971 'H': Print the higher numbered register of a pair (TImode)
7972 of regs.
7973 'm': Print a condition (eq, ne, etc).
7974 'M': Same as 'm', but invert condition.
7975 'N': Take the duplicated element in a vector constant
7976 and print the negative of it in decimal.
7977 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7978 'S/T/U/V': Print a FP/SIMD register name for a register list.
7979 The register printed is the FP/SIMD register name
7980 of X + 0/1/2/3 for S/T/U/V.
7981 'R': Print a scalar FP/SIMD register name + 1.
7982 'X': Print bottom 16 bits of integer constant in hex.
7983 'w/x': Print a general register name or the zero register
7984 (32-bit or 64-bit).
7985 '0': Print a normal operand; if it's a general register,
7986 then DImode is assumed.
7987 'k': Print NZCV for conditional compare instructions.
7988 'A': Output address constant representing the first
7989 argument of X, specifying a relocation offset
7990 if appropriate.
7991 'L': Output constant address specified by X
7992 with a relocation offset if appropriate.
7993 'G': Prints address of X, specifying a PC relative
7994 relocation mode if appropriate.
7995 'y': Output address of LDP or STP - this is used for
7996 some LDP/STPs which don't use a PARALLEL in their
7997 pattern (so the mode needs to be adjusted).
7998 'z': Output address of a typical LDP or STP. */
7999
8000 static void
8001 aarch64_print_operand (FILE *f, rtx x, int code)
8002 {
8003 rtx elt;
8004 switch (code)
8005 {
8006 case 'c':
8007 switch (GET_CODE (x))
8008 {
8009 case CONST_INT:
8010 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8011 break;
8012
8013 case SYMBOL_REF:
8014 output_addr_const (f, x);
8015 break;
8016
8017 case CONST:
8018 if (GET_CODE (XEXP (x, 0)) == PLUS
8019 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8020 {
8021 output_addr_const (f, x);
8022 break;
8023 }
8024 /* Fall through. */
8025
8026 default:
8027 output_operand_lossage ("unsupported operand for code '%c'", code);
8028 }
8029 break;
8030
8031 case 'e':
8032 {
8033 int n;
8034
8035 if (!CONST_INT_P (x)
8036 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8037 {
8038 output_operand_lossage ("invalid operand for '%%%c'", code);
8039 return;
8040 }
8041
8042 switch (n)
8043 {
8044 case 3:
8045 fputc ('b', f);
8046 break;
8047 case 4:
8048 fputc ('h', f);
8049 break;
8050 case 5:
8051 fputc ('w', f);
8052 break;
8053 default:
8054 output_operand_lossage ("invalid operand for '%%%c'", code);
8055 return;
8056 }
8057 }
8058 break;
8059
8060 case 'p':
8061 {
8062 int n;
8063
8064 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8065 {
8066 output_operand_lossage ("invalid operand for '%%%c'", code);
8067 return;
8068 }
8069
8070 asm_fprintf (f, "%d", n);
8071 }
8072 break;
8073
8074 case 'P':
8075 if (!CONST_INT_P (x))
8076 {
8077 output_operand_lossage ("invalid operand for '%%%c'", code);
8078 return;
8079 }
8080
8081 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8082 break;
8083
8084 case 'H':
8085 if (x == const0_rtx)
8086 {
8087 asm_fprintf (f, "xzr");
8088 break;
8089 }
8090
8091 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8092 {
8093 output_operand_lossage ("invalid operand for '%%%c'", code);
8094 return;
8095 }
8096
8097 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8098 break;
8099
8100 case 'M':
8101 case 'm':
8102 {
8103 int cond_code;
8104 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8105 if (x == const_true_rtx)
8106 {
8107 if (code == 'M')
8108 fputs ("nv", f);
8109 return;
8110 }
8111
8112 if (!COMPARISON_P (x))
8113 {
8114 output_operand_lossage ("invalid operand for '%%%c'", code);
8115 return;
8116 }
8117
8118 cond_code = aarch64_get_condition_code (x);
8119 gcc_assert (cond_code >= 0);
8120 if (code == 'M')
8121 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8122 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8123 fputs (aarch64_sve_condition_codes[cond_code], f);
8124 else
8125 fputs (aarch64_condition_codes[cond_code], f);
8126 }
8127 break;
8128
8129 case 'N':
8130 if (!const_vec_duplicate_p (x, &elt))
8131 {
8132 output_operand_lossage ("invalid vector constant");
8133 return;
8134 }
8135
8136 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8137 asm_fprintf (f, "%wd", -INTVAL (elt));
8138 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8139 && aarch64_print_vector_float_operand (f, x, true))
8140 ;
8141 else
8142 {
8143 output_operand_lossage ("invalid vector constant");
8144 return;
8145 }
8146 break;
8147
8148 case 'b':
8149 case 'h':
8150 case 's':
8151 case 'd':
8152 case 'q':
8153 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8154 {
8155 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8156 return;
8157 }
8158 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8159 break;
8160
8161 case 'S':
8162 case 'T':
8163 case 'U':
8164 case 'V':
8165 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8166 {
8167 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8168 return;
8169 }
8170 asm_fprintf (f, "%c%d",
8171 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8172 REGNO (x) - V0_REGNUM + (code - 'S'));
8173 break;
8174
8175 case 'R':
8176 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8177 {
8178 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8179 return;
8180 }
8181 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8182 break;
8183
8184 case 'X':
8185 if (!CONST_INT_P (x))
8186 {
8187 output_operand_lossage ("invalid operand for '%%%c'", code);
8188 return;
8189 }
8190 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8191 break;
8192
8193 case 'C':
8194 {
8195 /* Print a replicated constant in hex. */
8196 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8197 {
8198 output_operand_lossage ("invalid operand for '%%%c'", code);
8199 return;
8200 }
8201 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8202 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8203 }
8204 break;
8205
8206 case 'D':
8207 {
8208 /* Print a replicated constant in decimal, treating it as
8209 unsigned. */
8210 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8211 {
8212 output_operand_lossage ("invalid operand for '%%%c'", code);
8213 return;
8214 }
8215 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8216 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8217 }
8218 break;
8219
8220 case 'w':
8221 case 'x':
8222 if (x == const0_rtx
8223 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8224 {
8225 asm_fprintf (f, "%czr", code);
8226 break;
8227 }
8228
8229 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8230 {
8231 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8232 break;
8233 }
8234
8235 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8236 {
8237 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8238 break;
8239 }
8240
8241 /* Fall through */
8242
8243 case 0:
8244 if (x == NULL)
8245 {
8246 output_operand_lossage ("missing operand");
8247 return;
8248 }
8249
8250 switch (GET_CODE (x))
8251 {
8252 case REG:
8253 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8254 {
8255 if (REG_NREGS (x) == 1)
8256 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8257 else
8258 {
8259 char suffix
8260 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8261 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8262 REGNO (x) - V0_REGNUM, suffix,
8263 END_REGNO (x) - V0_REGNUM - 1, suffix);
8264 }
8265 }
8266 else
8267 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8268 break;
8269
8270 case MEM:
8271 output_address (GET_MODE (x), XEXP (x, 0));
8272 break;
8273
8274 case LABEL_REF:
8275 case SYMBOL_REF:
8276 output_addr_const (asm_out_file, x);
8277 break;
8278
8279 case CONST_INT:
8280 asm_fprintf (f, "%wd", INTVAL (x));
8281 break;
8282
8283 case CONST:
8284 if (!VECTOR_MODE_P (GET_MODE (x)))
8285 {
8286 output_addr_const (asm_out_file, x);
8287 break;
8288 }
8289 /* fall through */
8290
8291 case CONST_VECTOR:
8292 if (!const_vec_duplicate_p (x, &elt))
8293 {
8294 output_operand_lossage ("invalid vector constant");
8295 return;
8296 }
8297
8298 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8299 asm_fprintf (f, "%wd", INTVAL (elt));
8300 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8301 && aarch64_print_vector_float_operand (f, x, false))
8302 ;
8303 else
8304 {
8305 output_operand_lossage ("invalid vector constant");
8306 return;
8307 }
8308 break;
8309
8310 case CONST_DOUBLE:
8311 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8312 be getting CONST_DOUBLEs holding integers. */
8313 gcc_assert (GET_MODE (x) != VOIDmode);
8314 if (aarch64_float_const_zero_rtx_p (x))
8315 {
8316 fputc ('0', f);
8317 break;
8318 }
8319 else if (aarch64_float_const_representable_p (x))
8320 {
8321 #define buf_size 20
8322 char float_buf[buf_size] = {'\0'};
8323 real_to_decimal_for_mode (float_buf,
8324 CONST_DOUBLE_REAL_VALUE (x),
8325 buf_size, buf_size,
8326 1, GET_MODE (x));
8327 asm_fprintf (asm_out_file, "%s", float_buf);
8328 break;
8329 #undef buf_size
8330 }
8331 output_operand_lossage ("invalid constant");
8332 return;
8333 default:
8334 output_operand_lossage ("invalid operand");
8335 return;
8336 }
8337 break;
8338
8339 case 'A':
8340 if (GET_CODE (x) == HIGH)
8341 x = XEXP (x, 0);
8342
8343 switch (aarch64_classify_symbolic_expression (x))
8344 {
8345 case SYMBOL_SMALL_GOT_4G:
8346 asm_fprintf (asm_out_file, ":got:");
8347 break;
8348
8349 case SYMBOL_SMALL_TLSGD:
8350 asm_fprintf (asm_out_file, ":tlsgd:");
8351 break;
8352
8353 case SYMBOL_SMALL_TLSDESC:
8354 asm_fprintf (asm_out_file, ":tlsdesc:");
8355 break;
8356
8357 case SYMBOL_SMALL_TLSIE:
8358 asm_fprintf (asm_out_file, ":gottprel:");
8359 break;
8360
8361 case SYMBOL_TLSLE24:
8362 asm_fprintf (asm_out_file, ":tprel:");
8363 break;
8364
8365 case SYMBOL_TINY_GOT:
8366 gcc_unreachable ();
8367 break;
8368
8369 default:
8370 break;
8371 }
8372 output_addr_const (asm_out_file, x);
8373 break;
8374
8375 case 'L':
8376 switch (aarch64_classify_symbolic_expression (x))
8377 {
8378 case SYMBOL_SMALL_GOT_4G:
8379 asm_fprintf (asm_out_file, ":lo12:");
8380 break;
8381
8382 case SYMBOL_SMALL_TLSGD:
8383 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8384 break;
8385
8386 case SYMBOL_SMALL_TLSDESC:
8387 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8388 break;
8389
8390 case SYMBOL_SMALL_TLSIE:
8391 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8392 break;
8393
8394 case SYMBOL_TLSLE12:
8395 asm_fprintf (asm_out_file, ":tprel_lo12:");
8396 break;
8397
8398 case SYMBOL_TLSLE24:
8399 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8400 break;
8401
8402 case SYMBOL_TINY_GOT:
8403 asm_fprintf (asm_out_file, ":got:");
8404 break;
8405
8406 case SYMBOL_TINY_TLSIE:
8407 asm_fprintf (asm_out_file, ":gottprel:");
8408 break;
8409
8410 default:
8411 break;
8412 }
8413 output_addr_const (asm_out_file, x);
8414 break;
8415
8416 case 'G':
8417 switch (aarch64_classify_symbolic_expression (x))
8418 {
8419 case SYMBOL_TLSLE24:
8420 asm_fprintf (asm_out_file, ":tprel_hi12:");
8421 break;
8422 default:
8423 break;
8424 }
8425 output_addr_const (asm_out_file, x);
8426 break;
8427
8428 case 'k':
8429 {
8430 HOST_WIDE_INT cond_code;
8431
8432 if (!CONST_INT_P (x))
8433 {
8434 output_operand_lossage ("invalid operand for '%%%c'", code);
8435 return;
8436 }
8437
8438 cond_code = INTVAL (x);
8439 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8440 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8441 }
8442 break;
8443
8444 case 'y':
8445 case 'z':
8446 {
8447 machine_mode mode = GET_MODE (x);
8448
8449 if (GET_CODE (x) != MEM
8450 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8451 {
8452 output_operand_lossage ("invalid operand for '%%%c'", code);
8453 return;
8454 }
8455
8456 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8457 code == 'y'
8458 ? ADDR_QUERY_LDP_STP_N
8459 : ADDR_QUERY_LDP_STP))
8460 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8461 }
8462 break;
8463
8464 default:
8465 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8466 return;
8467 }
8468 }
8469
8470 /* Print address 'x' of a memory access with mode 'mode'.
8471 'type' is the aarch64_addr_query_type context required by aarch64_classify_address:
8472 ADDR_QUERY_ANY for a normal access, or an LDP/STP query for pair addresses. */
8473 static bool
8474 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8475 aarch64_addr_query_type type)
8476 {
8477 struct aarch64_address_info addr;
8478 unsigned int size;
8479
8480 /* Check that all addresses are Pmode, including for ILP32. */
8481 if (GET_MODE (x) != Pmode
8482 && (!CONST_INT_P (x)
8483 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8484 {
8485 output_operand_lossage ("invalid address mode");
8486 return false;
8487 }
8488
8489 if (aarch64_classify_address (&addr, x, mode, true, type))
8490 switch (addr.type)
8491 {
8492 case ADDRESS_REG_IMM:
8493 if (known_eq (addr.const_offset, 0))
8494 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8495 else if (aarch64_sve_data_mode_p (mode))
8496 {
8497 HOST_WIDE_INT vnum
8498 = exact_div (addr.const_offset,
8499 BYTES_PER_SVE_VECTOR).to_constant ();
8500 asm_fprintf (f, "[%s, #%wd, mul vl]",
8501 reg_names[REGNO (addr.base)], vnum);
8502 }
8503 else if (aarch64_sve_pred_mode_p (mode))
8504 {
8505 HOST_WIDE_INT vnum
8506 = exact_div (addr.const_offset,
8507 BYTES_PER_SVE_PRED).to_constant ();
8508 asm_fprintf (f, "[%s, #%wd, mul vl]",
8509 reg_names[REGNO (addr.base)], vnum);
8510 }
8511 else
8512 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8513 INTVAL (addr.offset));
8514 return true;
8515
8516 case ADDRESS_REG_REG:
8517 if (addr.shift == 0)
8518 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8519 reg_names [REGNO (addr.offset)]);
8520 else
8521 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8522 reg_names [REGNO (addr.offset)], addr.shift);
8523 return true;
8524
8525 case ADDRESS_REG_UXTW:
8526 if (addr.shift == 0)
8527 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8528 REGNO (addr.offset) - R0_REGNUM);
8529 else
8530 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8531 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8532 return true;
8533
8534 case ADDRESS_REG_SXTW:
8535 if (addr.shift == 0)
8536 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8537 REGNO (addr.offset) - R0_REGNUM);
8538 else
8539 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8540 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8541 return true;
8542
8543 case ADDRESS_REG_WB:
8544 /* Writeback is only supported for fixed-width modes. */
8545 size = GET_MODE_SIZE (mode).to_constant ();
8546 switch (GET_CODE (x))
8547 {
8548 case PRE_INC:
8549 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8550 return true;
8551 case POST_INC:
8552 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8553 return true;
8554 case PRE_DEC:
8555 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8556 return true;
8557 case POST_DEC:
8558 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8559 return true;
8560 case PRE_MODIFY:
8561 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8562 INTVAL (addr.offset));
8563 return true;
8564 case POST_MODIFY:
8565 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8566 INTVAL (addr.offset));
8567 return true;
8568 default:
8569 break;
8570 }
8571 break;
8572
8573 case ADDRESS_LO_SUM:
8574 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8575 output_addr_const (f, addr.offset);
8576 asm_fprintf (f, "]");
8577 return true;
8578
8579 case ADDRESS_SYMBOLIC:
8580 output_addr_const (f, x);
8581 return true;
8582 }
8583
8584 return false;
8585 }
8586
8587 /* Print address 'x' of a memory access with mode 'mode'. */
8588 static void
8589 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8590 {
8591 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8592 output_addr_const (f, x);
8593 }
8594
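/* Return true if X (or any sub-rtx of X) mentions a label. */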
8595 bool
8596 aarch64_label_mentioned_p (rtx x)
8597 {
8598 const char *fmt;
8599 int i;
8600
8601 if (GET_CODE (x) == LABEL_REF)
8602 return true;
8603
8604 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8605 referencing instruction, but they are constant offsets, not
8606 symbols. */
8607 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8608 return false;
8609
8610 fmt = GET_RTX_FORMAT (GET_CODE (x));
8611 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8612 {
8613 if (fmt[i] == 'E')
8614 {
8615 int j;
8616
8617 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8618 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8619 return 1;
8620 }
8621 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8622 return 1;
8623 }
8624
8625 return 0;
8626 }
8627
8628 /* Implement REGNO_REG_CLASS. */
8629
8630 enum reg_class
8631 aarch64_regno_regclass (unsigned regno)
8632 {
8633 if (GP_REGNUM_P (regno))
8634 return GENERAL_REGS;
8635
8636 if (regno == SP_REGNUM)
8637 return STACK_REG;
8638
8639 if (regno == FRAME_POINTER_REGNUM
8640 || regno == ARG_POINTER_REGNUM)
8641 return POINTER_REGS;
8642
8643 if (FP_REGNUM_P (regno))
8644 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8645 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8646
8647 if (PR_REGNUM_P (regno))
8648 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8649
8650 return NO_REGS;
8651 }
8652
8653 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8654 If OFFSET is out of range, return an offset of an anchor point
8655 that is in range. Return 0 otherwise. */
8656
8657 static HOST_WIDE_INT
8658 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8659 machine_mode mode)
8660 {
8661 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8662 if (size > 16)
8663 return (offset + 0x400) & ~0x7f0;
8664
8665 /* For offsets that aren't a multiple of the access size, the limit is
8666 -256...255. */
8667 if (offset & (size - 1))
8668 {
8669 /* BLKmode typically uses LDP of X-registers. */
8670 if (mode == BLKmode)
8671 return (offset + 512) & ~0x3ff;
8672 return (offset + 0x100) & ~0x1ff;
8673 }
8674
8675 /* Small negative offsets are supported. */
8676 if (IN_RANGE (offset, -256, 0))
8677 return 0;
8678
8679 if (mode == TImode || mode == TFmode)
8680 return (offset + 0x100) & ~0x1ff;
8681
8682 /* Use the unsigned 12-bit offset range, scaled by the access size. */
8683 return offset & (~0xfff * size);
8684 }
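/* Worked example of the anchoring above (illustrative only): for an SImode
access at offset 0x101, the offset is misaligned, so the anchor is
(0x101 + 0x100) & ~0x1ff == 0x200; the caller is then left with a residual
offset of -0xff, which fits the signed 9-bit unscaled range. */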
8685
8686 static rtx
8687 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8688 {
8689 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8690 where mask is selected by alignment and size of the offset.
8691 We try to pick as large a range for the offset as possible to
8692 maximize the chance of a CSE. However, for aligned addresses
8693 we limit the range to 4k so that structures with different sized
8694 elements are likely to use the same base. We need to be careful
8695 not to split a CONST for some forms of address expression, otherwise
8696 it will generate sub-optimal code. */
8697
8698 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8699 {
8700 rtx base = XEXP (x, 0);
8701 rtx offset_rtx = XEXP (x, 1);
8702 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8703
8704 if (GET_CODE (base) == PLUS)
8705 {
8706 rtx op0 = XEXP (base, 0);
8707 rtx op1 = XEXP (base, 1);
8708
8709 /* Force any scaling into a temp for CSE. */
8710 op0 = force_reg (Pmode, op0);
8711 op1 = force_reg (Pmode, op1);
8712
8713 /* Let the pointer register be in op0. */
8714 if (REG_POINTER (op1))
8715 std::swap (op0, op1);
8716
8717 /* If the pointer is virtual or frame related, then we know that
8718 virtual register instantiation or register elimination is going
8719 to apply a second constant. We want the two constants folded
8720 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8721 if (virt_or_elim_regno_p (REGNO (op0)))
8722 {
8723 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8724 NULL_RTX, true, OPTAB_DIRECT);
8725 return gen_rtx_PLUS (Pmode, base, op1);
8726 }
8727
8728 /* Otherwise, in order to encourage CSE (and thence loop strength
8729 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8730 base = expand_binop (Pmode, add_optab, op0, op1,
8731 NULL_RTX, true, OPTAB_DIRECT);
8732 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8733 }
8734
8735 HOST_WIDE_INT size;
8736 if (GET_MODE_SIZE (mode).is_constant (&size))
8737 {
8738 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8739 mode);
8740 if (base_offset != 0)
8741 {
8742 base = plus_constant (Pmode, base, base_offset);
8743 base = force_operand (base, NULL_RTX);
8744 return plus_constant (Pmode, base, offset - base_offset);
8745 }
8746 }
8747 }
8748
8749 return x;
8750 }
8751
8752 static reg_class_t
8753 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8754 reg_class_t rclass,
8755 machine_mode mode,
8756 secondary_reload_info *sri)
8757 {
8758 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8759 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8760 comment at the head of aarch64-sve.md for more details about the
8761 big-endian handling. */
8762 if (BYTES_BIG_ENDIAN
8763 && reg_class_subset_p (rclass, FP_REGS)
8764 && !((REG_P (x) && HARD_REGISTER_P (x))
8765 || aarch64_simd_valid_immediate (x, NULL))
8766 && aarch64_sve_data_mode_p (mode))
8767 {
8768 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8769 return NO_REGS;
8770 }
8771
8772 /* If we have to disable direct literal pool loads and stores because the
8773 function is too big, then we need a scratch register. */
8774 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8775 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8776 || targetm.vector_mode_supported_p (GET_MODE (x)))
8777 && !aarch64_pcrelative_literal_loads)
8778 {
8779 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8780 return NO_REGS;
8781 }
8782
8783 /* Without the TARGET_SIMD instructions we cannot move a Q register
8784 to a Q register directly. We need a scratch. */
8785 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8786 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8787 && reg_class_subset_p (rclass, FP_REGS))
8788 {
8789 sri->icode = code_for_aarch64_reload_mov (mode);
8790 return NO_REGS;
8791 }
8792
8793 /* A TFmode or TImode memory access should be handled via FP_REGS
8794 because AArch64 has richer addressing modes for LDR/STR instructions
8795 than for LDP/STP instructions. */
8796 if (TARGET_FLOAT && rclass == GENERAL_REGS
8797 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8798 return FP_REGS;
8799
8800 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8801 return GENERAL_REGS;
8802
8803 return NO_REGS;
8804 }
8805
8806 static bool
8807 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8808 {
8809 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8810
8811 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8812 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8813 if (frame_pointer_needed)
8814 return to == HARD_FRAME_POINTER_REGNUM;
8815 return true;
8816 }
8817
8818 poly_int64
8819 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8820 {
8821 if (to == HARD_FRAME_POINTER_REGNUM)
8822 {
8823 if (from == ARG_POINTER_REGNUM)
8824 return cfun->machine->frame.hard_fp_offset;
8825
8826 if (from == FRAME_POINTER_REGNUM)
8827 return cfun->machine->frame.hard_fp_offset
8828 - cfun->machine->frame.locals_offset;
8829 }
8830
8831 if (to == STACK_POINTER_REGNUM)
8832 {
8833 if (from == FRAME_POINTER_REGNUM)
8834 return cfun->machine->frame.frame_size
8835 - cfun->machine->frame.locals_offset;
8836 }
8837
8838 return cfun->machine->frame.frame_size;
8839 }
8840
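/* Editor's note: a worked example of the formulas above, using purely
   hypothetical frame values.  If frame_size == 96, hard_fp_offset == 80
   and locals_offset == 16, then:
     ARG_POINTER   -> HARD_FRAME_POINTER : 80
     FRAME_POINTER -> HARD_FRAME_POINTER : 80 - 16 = 64
     FRAME_POINTER -> STACK_POINTER      : 96 - 16 = 80
   and any other elimination simply returns frame_size (96).  */
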
8841 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8842 previous frame. */
8843
8844 rtx
8845 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8846 {
8847 if (count != 0)
8848 return const0_rtx;
8849 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8850 }
8851
8852
8853 static void
8854 aarch64_asm_trampoline_template (FILE *f)
8855 {
8856 int offset1 = 16;
8857 int offset2 = 20;
8858
8859 if (aarch64_bti_enabled ())
8860 {
8861 asm_fprintf (f, "\thint\t34 // bti c\n");
8862 offset1 -= 4;
8863 offset2 -= 4;
8864 }
8865
8866 if (TARGET_ILP32)
8867 {
8868 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8869 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8870 offset1);
8871 }
8872 else
8873 {
8874 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8875 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8876 offset2);
8877 }
8878 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8879
8880 /* The trampoline needs an extra padding instruction.  If BTI is
8881 enabled, the padding instruction is replaced by the BTI instruction
8882 at the beginning. */
8883 if (!aarch64_bti_enabled ())
8884 assemble_aligned_integer (4, const0_rtx);
8885
8886 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8887 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8888 }
8889
8890 static void
8891 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8892 {
8893 rtx fnaddr, mem, a_tramp;
8894 const int tramp_code_sz = 16;
8895
8896 /* Don't need to copy the trailing D-words; we fill those in below. */
8897 emit_block_move (m_tramp, assemble_trampoline_template (),
8898 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8899 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8900 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8901 if (GET_MODE (fnaddr) != ptr_mode)
8902 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8903 emit_move_insn (mem, fnaddr);
8904
8905 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8906 emit_move_insn (mem, chain_value);
8907
8908 /* XXX We should really define a "clear_cache" pattern and use
8909 gen_clear_cache(). */
8910 a_tramp = XEXP (m_tramp, 0);
8911 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8912 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8913 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8914 ptr_mode);
8915 }
8916
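/* Editor's note: an illustrative sketch of the LP64 trampoline layout
   implied by the two routines above (tramp_code_sz == 16 and
   POINTER_BYTES == 8).  Byte offsets are for the non-BTI case and are
   meant as a reading aid, not a specification; register names are kept
   symbolic.

	 0: ldr	xIP1, .+16		; load the target address
	 4: ldr	xSTATIC_CHAIN, .+20	; load the static chain value
	 8: br	xIP1
	12: <padding word>
	16: <address of the nested function>
	24: <static chain value>	*/
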
8917 static unsigned char
8918 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8919 {
8920 /* ??? Logically we should only need to provide a value when
8921 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8922 can hold MODE, but at the moment we need to handle all modes.
8923 Just ignore any runtime parts for registers that can't store them. */
8924 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8925 unsigned int nregs;
8926 switch (regclass)
8927 {
8928 case TAILCALL_ADDR_REGS:
8929 case POINTER_REGS:
8930 case GENERAL_REGS:
8931 case ALL_REGS:
8932 case POINTER_AND_FP_REGS:
8933 case FP_REGS:
8934 case FP_LO_REGS:
8935 case FP_LO8_REGS:
8936 if (aarch64_sve_data_mode_p (mode)
8937 && constant_multiple_p (GET_MODE_SIZE (mode),
8938 BYTES_PER_SVE_VECTOR, &nregs))
8939 return nregs;
8940 return (aarch64_vector_data_mode_p (mode)
8941 ? CEIL (lowest_size, UNITS_PER_VREG)
8942 : CEIL (lowest_size, UNITS_PER_WORD));
8943 case STACK_REG:
8944 case PR_REGS:
8945 case PR_LO_REGS:
8946 case PR_HI_REGS:
8947 return 1;
8948
8949 case NO_REGS:
8950 return 0;
8951
8952 default:
8953 break;
8954 }
8955 gcc_unreachable ();
8956 }
8957
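/* Editor's note: two worked examples of the calculation above, assuming
   the usual UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16 for AArch64:
     - TImode (16 bytes) in GENERAL_REGS needs CEIL (16, 8)  == 2 registers;
     - V4SImode (16 bytes) in FP_REGS needs    CEIL (16, 16) == 1 register.  */
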
8958 static reg_class_t
8959 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8960 {
8961 if (regclass == POINTER_REGS)
8962 return GENERAL_REGS;
8963
8964 if (regclass == STACK_REG)
8965 {
8966 if (REG_P(x)
8967 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8968 return regclass;
8969
8970 return NO_REGS;
8971 }
8972
8973 /* Register elimination can result in a request for
8974 SP+constant->FP_REGS. We cannot support such operations, which
8975 use SP as the source and an FP_REG as the destination, so reject
8976 them outright. */
8977 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8978 {
8979 rtx lhs = XEXP (x, 0);
8980
8981 /* Look through a possible SUBREG introduced by ILP32. */
8982 if (GET_CODE (lhs) == SUBREG)
8983 lhs = SUBREG_REG (lhs);
8984
8985 gcc_assert (REG_P (lhs));
8986 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8987 POINTER_REGS));
8988 return NO_REGS;
8989 }
8990
8991 return regclass;
8992 }
8993
8994 void
8995 aarch64_asm_output_labelref (FILE* f, const char *name)
8996 {
8997 asm_fprintf (f, "%U%s", name);
8998 }
8999
9000 static void
9001 aarch64_elf_asm_constructor (rtx symbol, int priority)
9002 {
9003 if (priority == DEFAULT_INIT_PRIORITY)
9004 default_ctor_section_asm_out_constructor (symbol, priority);
9005 else
9006 {
9007 section *s;
9008 /* While priority is known to be in the range [0, 65535], so 18 bytes
9009 would be enough, the compiler might not know that. To avoid a
9010 -Wformat-truncation false positive, use a larger size. */
9011 char buf[23];
9012 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9013 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9014 switch_to_section (s);
9015 assemble_align (POINTER_SIZE);
9016 assemble_aligned_integer (POINTER_BYTES, symbol);
9017 }
9018 }
9019
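/* Editor's note: as an example of the section name built above, a
   constructor with priority 101 is placed in ".init_array.00101"
   (the "%.5u" conversion zero-pads the priority to five digits).  */
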
9020 static void
9021 aarch64_elf_asm_destructor (rtx symbol, int priority)
9022 {
9023 if (priority == DEFAULT_INIT_PRIORITY)
9024 default_dtor_section_asm_out_destructor (symbol, priority);
9025 else
9026 {
9027 section *s;
9028 /* While priority is known to be in the range [0, 65535], so 18 bytes
9029 would be enough, the compiler might not know that. To avoid a
9030 -Wformat-truncation false positive, use a larger size. */
9031 char buf[23];
9032 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9033 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9034 switch_to_section (s);
9035 assemble_align (POINTER_SIZE);
9036 assemble_aligned_integer (POINTER_BYTES, symbol);
9037 }
9038 }
9039
9040 const char*
9041 aarch64_output_casesi (rtx *operands)
9042 {
9043 char buf[100];
9044 char label[100];
9045 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9046 int index;
9047 static const char *const patterns[4][2] =
9048 {
9049 {
9050 "ldrb\t%w3, [%0,%w1,uxtw]",
9051 "add\t%3, %4, %w3, sxtb #2"
9052 },
9053 {
9054 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9055 "add\t%3, %4, %w3, sxth #2"
9056 },
9057 {
9058 "ldr\t%w3, [%0,%w1,uxtw #2]",
9059 "add\t%3, %4, %w3, sxtw #2"
9060 },
9061 /* We assume that DImode is only generated when not optimizing and
9062 that we don't really need 64-bit address offsets. That would
9063 imply an object file with 8GB of code in a single function! */
9064 {
9065 "ldr\t%w3, [%0,%w1,uxtw #2]",
9066 "add\t%3, %4, %w3, sxtw #2"
9067 }
9068 };
9069
9070 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9071
9072 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9073 index = exact_log2 (GET_MODE_SIZE (mode));
9074
9075 gcc_assert (index >= 0 && index <= 3);
9076
9077 /* Need to implement table size reduction, by changing the code below. */
9078 output_asm_insn (patterns[index][0], operands);
9079 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9080 snprintf (buf, sizeof (buf),
9081 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9082 output_asm_insn (buf, operands);
9083 output_asm_insn (patterns[index][1], operands);
9084 output_asm_insn ("br\t%3", operands);
9085 assemble_label (asm_out_file, label);
9086 return "";
9087 }
9088
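/* Editor's note: an illustrative example of the dispatch sequence emitted
   above for a HImode difference vector (index == 1).  The operand numbers
   have been replaced by hypothetical registers and the label is made up:

	ldrh	w3, [x0, w1, uxtw #1]	; load the scaled table entry
	adr	x4, .Lrtx7		; address of the table
	add	x3, x4, w3, sxth #2	; entry is a scaled offset from there
	br	x3
   .Lrtx7:				*/
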
9089
9090 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9091 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9092 operator. */
9093
9094 int
9095 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9096 {
9097 if (shift >= 0 && shift <= 3)
9098 {
9099 int size;
9100 for (size = 8; size <= 32; size *= 2)
9101 {
9102 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9103 if (mask == bits << shift)
9104 return size;
9105 }
9106 }
9107 return 0;
9108 }
9109
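/* Editor's note: some worked examples of the mapping implemented above:
     aarch64_uxt_size (0, 0xff)   == 8	  (0xff   == 0xff   << 0)
     aarch64_uxt_size (1, 0x1fe)  == 8	  (0x1fe  == 0xff   << 1)
     aarch64_uxt_size (0, 0xffff) == 16	  (0xffff == 0xffff << 0)
     aarch64_uxt_size (3, 0x7f8)  == 8	  (0x7f8  == 0xff   << 3)
   Any mask/shift pair that does not match one of the three widths, or a
   shift outside 0..3, yields 0.  */
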
9110 /* Constant pools are per function only when PC relative
9111 literal loads are true or we are in the large memory
9112 model. */
9113
9114 static inline bool
9115 aarch64_can_use_per_function_literal_pools_p (void)
9116 {
9117 return (aarch64_pcrelative_literal_loads
9118 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9119 }
9120
9121 static bool
9122 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9123 {
9124 /* We can't use blocks for constants when we're using a per-function
9125 constant pool. */
9126 return !aarch64_can_use_per_function_literal_pools_p ();
9127 }
9128
9129 /* Select appropriate section for constants depending
9130 on where we place literal pools. */
9131
9132 static section *
9133 aarch64_select_rtx_section (machine_mode mode,
9134 rtx x,
9135 unsigned HOST_WIDE_INT align)
9136 {
9137 if (aarch64_can_use_per_function_literal_pools_p ())
9138 return function_section (current_function_decl);
9139
9140 return default_elf_select_rtx_section (mode, x, align);
9141 }
9142
9143 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9144 void
9145 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9146 HOST_WIDE_INT offset)
9147 {
9148 /* When using per-function literal pools, we must ensure that any code
9149 section is aligned to the minimal instruction length, lest we get
9150 errors from the assembler re "unaligned instructions". */
9151 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9152 ASM_OUTPUT_ALIGN (f, 2);
9153 }
9154
9155 /* Costs. */
9156
9157 /* Helper function for rtx cost calculation. Strip a shift expression
9158 from X. Returns the inner operand if successful, or the original
9159 expression on failure. */
9160 static rtx
9161 aarch64_strip_shift (rtx x)
9162 {
9163 rtx op = x;
9164
9165 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9166 we can convert both to ROR during final output. */
9167 if ((GET_CODE (op) == ASHIFT
9168 || GET_CODE (op) == ASHIFTRT
9169 || GET_CODE (op) == LSHIFTRT
9170 || GET_CODE (op) == ROTATERT
9171 || GET_CODE (op) == ROTATE)
9172 && CONST_INT_P (XEXP (op, 1)))
9173 return XEXP (op, 0);
9174
9175 if (GET_CODE (op) == MULT
9176 && CONST_INT_P (XEXP (op, 1))
9177 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9178 return XEXP (op, 0);
9179
9180 return x;
9181 }
9182
9183 /* Helper function for rtx cost calculation. Strip an extend
9184 expression from X. Returns the inner operand if successful, or the
9185 original expression on failure. We deal with a number of possible
9186 canonicalization variations here. If STRIP_SHIFT is true, then
9187 we can strip off a shift also. */
9188 static rtx
9189 aarch64_strip_extend (rtx x, bool strip_shift)
9190 {
9191 scalar_int_mode mode;
9192 rtx op = x;
9193
9194 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9195 return op;
9196
9197 /* Zero and sign extraction of a widened value. */
9198 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9199 && XEXP (op, 2) == const0_rtx
9200 && GET_CODE (XEXP (op, 0)) == MULT
9201 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9202 XEXP (op, 1)))
9203 return XEXP (XEXP (op, 0), 0);
9204
9205 /* It can also be represented (for zero-extend) as an AND with an
9206 immediate. */
9207 if (GET_CODE (op) == AND
9208 && GET_CODE (XEXP (op, 0)) == MULT
9209 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9210 && CONST_INT_P (XEXP (op, 1))
9211 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9212 INTVAL (XEXP (op, 1))) != 0)
9213 return XEXP (XEXP (op, 0), 0);
9214
9215 /* Now handle extended register, as this may also have an optional
9216 left shift by 1..4. */
9217 if (strip_shift
9218 && GET_CODE (op) == ASHIFT
9219 && CONST_INT_P (XEXP (op, 1))
9220 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9221 op = XEXP (op, 0);
9222
9223 if (GET_CODE (op) == ZERO_EXTEND
9224 || GET_CODE (op) == SIGN_EXTEND)
9225 op = XEXP (op, 0);
9226
9227 if (op != x)
9228 return op;
9229
9230 return x;
9231 }
9232
9233 /* Return true iff CODE is a shift supported in combination
9234 with arithmetic instructions. */
9235
9236 static bool
9237 aarch64_shift_p (enum rtx_code code)
9238 {
9239 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9240 }
9241
9242
9243 /* Return true iff X is a cheap shift without a sign extend. */
9244
9245 static bool
9246 aarch64_cheap_mult_shift_p (rtx x)
9247 {
9248 rtx op0, op1;
9249
9250 op0 = XEXP (x, 0);
9251 op1 = XEXP (x, 1);
9252
9253 if (!(aarch64_tune_params.extra_tuning_flags
9254 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9255 return false;
9256
9257 if (GET_CODE (op0) == SIGN_EXTEND)
9258 return false;
9259
9260 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9261 && UINTVAL (op1) <= 4)
9262 return true;
9263
9264 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9265 return false;
9266
9267 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9268
9269 if (l2 > 0 && l2 <= 4)
9270 return true;
9271
9272 return false;
9273 }
9274
9275 /* Helper function for rtx cost calculation. Calculate the cost of
9276 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9277 Return the calculated cost of the expression, recursing manually in to
9278 operands where needed. */
9279
9280 static int
9281 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9282 {
9283 rtx op0, op1;
9284 const struct cpu_cost_table *extra_cost
9285 = aarch64_tune_params.insn_extra_cost;
9286 int cost = 0;
9287 bool compound_p = (outer == PLUS || outer == MINUS);
9288 machine_mode mode = GET_MODE (x);
9289
9290 gcc_checking_assert (code == MULT);
9291
9292 op0 = XEXP (x, 0);
9293 op1 = XEXP (x, 1);
9294
9295 if (VECTOR_MODE_P (mode))
9296 mode = GET_MODE_INNER (mode);
9297
9298 /* Integer multiply/fma. */
9299 if (GET_MODE_CLASS (mode) == MODE_INT)
9300 {
9301 /* The multiply will be canonicalized as a shift, cost it as such. */
9302 if (aarch64_shift_p (GET_CODE (x))
9303 || (CONST_INT_P (op1)
9304 && exact_log2 (INTVAL (op1)) > 0))
9305 {
9306 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9307 || GET_CODE (op0) == SIGN_EXTEND;
9308 if (speed)
9309 {
9310 if (compound_p)
9311 {
9312 /* If the shift is considered cheap,
9313 then don't add any cost. */
9314 if (aarch64_cheap_mult_shift_p (x))
9315 ;
9316 else if (REG_P (op1))
9317 /* ARITH + shift-by-register. */
9318 cost += extra_cost->alu.arith_shift_reg;
9319 else if (is_extend)
9320 /* ARITH + extended register. We don't have a cost field
9321 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9322 cost += extra_cost->alu.extend_arith;
9323 else
9324 /* ARITH + shift-by-immediate. */
9325 cost += extra_cost->alu.arith_shift;
9326 }
9327 else
9328 /* LSL (immediate). */
9329 cost += extra_cost->alu.shift;
9330
9331 }
9332 /* Strip extends as we will have costed them in the case above. */
9333 if (is_extend)
9334 op0 = aarch64_strip_extend (op0, true);
9335
9336 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9337
9338 return cost;
9339 }
9340
9341 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9342 compound and let the below cases handle it. After all, MNEG is a
9343 special-case alias of MSUB. */
9344 if (GET_CODE (op0) == NEG)
9345 {
9346 op0 = XEXP (op0, 0);
9347 compound_p = true;
9348 }
9349
9350 /* Integer multiplies or FMAs have zero/sign extending variants. */
9351 if ((GET_CODE (op0) == ZERO_EXTEND
9352 && GET_CODE (op1) == ZERO_EXTEND)
9353 || (GET_CODE (op0) == SIGN_EXTEND
9354 && GET_CODE (op1) == SIGN_EXTEND))
9355 {
9356 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9357 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9358
9359 if (speed)
9360 {
9361 if (compound_p)
9362 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9363 cost += extra_cost->mult[0].extend_add;
9364 else
9365 /* MUL/SMULL/UMULL. */
9366 cost += extra_cost->mult[0].extend;
9367 }
9368
9369 return cost;
9370 }
9371
9372 /* This is either an integer multiply or a MADD. In both cases
9373 we want to recurse and cost the operands. */
9374 cost += rtx_cost (op0, mode, MULT, 0, speed);
9375 cost += rtx_cost (op1, mode, MULT, 1, speed);
9376
9377 if (speed)
9378 {
9379 if (compound_p)
9380 /* MADD/MSUB. */
9381 cost += extra_cost->mult[mode == DImode].add;
9382 else
9383 /* MUL. */
9384 cost += extra_cost->mult[mode == DImode].simple;
9385 }
9386
9387 return cost;
9388 }
9389 else
9390 {
9391 if (speed)
9392 {
9393 /* Floating-point FMA/FMUL can also support negations of the
9394 operands, unless the rounding mode is upward or downward, in
9395 which case FNMUL is different from FMUL with operand negation. */
9396 bool neg0 = GET_CODE (op0) == NEG;
9397 bool neg1 = GET_CODE (op1) == NEG;
9398 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9399 {
9400 if (neg0)
9401 op0 = XEXP (op0, 0);
9402 if (neg1)
9403 op1 = XEXP (op1, 0);
9404 }
9405
9406 if (compound_p)
9407 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9408 cost += extra_cost->fp[mode == DFmode].fma;
9409 else
9410 /* FMUL/FNMUL. */
9411 cost += extra_cost->fp[mode == DFmode].mult;
9412 }
9413
9414 cost += rtx_cost (op0, mode, MULT, 0, speed);
9415 cost += rtx_cost (op1, mode, MULT, 1, speed);
9416 return cost;
9417 }
9418 }
9419
9420 static int
9421 aarch64_address_cost (rtx x,
9422 machine_mode mode,
9423 addr_space_t as ATTRIBUTE_UNUSED,
9424 bool speed)
9425 {
9426 enum rtx_code c = GET_CODE (x);
9427 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9428 struct aarch64_address_info info;
9429 int cost = 0;
9430 info.shift = 0;
9431
9432 if (!aarch64_classify_address (&info, x, mode, false))
9433 {
9434 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9435 {
9436 /* This is a CONST or SYMBOL ref which will be split
9437 in a different way depending on the code model in use.
9438 Cost it through the generic infrastructure. */
9439 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9440 /* Divide through by the cost of one instruction to
9441 bring it to the same units as the address costs. */
9442 cost_symbol_ref /= COSTS_N_INSNS (1);
9443 /* The cost is then the cost of preparing the address,
9444 followed by an immediate (possibly 0) offset. */
9445 return cost_symbol_ref + addr_cost->imm_offset;
9446 }
9447 else
9448 {
9449 /* This is most likely a jump table from a case
9450 statement. */
9451 return addr_cost->register_offset;
9452 }
9453 }
9454
9455 switch (info.type)
9456 {
9457 case ADDRESS_LO_SUM:
9458 case ADDRESS_SYMBOLIC:
9459 case ADDRESS_REG_IMM:
9460 cost += addr_cost->imm_offset;
9461 break;
9462
9463 case ADDRESS_REG_WB:
9464 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9465 cost += addr_cost->pre_modify;
9466 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9467 cost += addr_cost->post_modify;
9468 else
9469 gcc_unreachable ();
9470
9471 break;
9472
9473 case ADDRESS_REG_REG:
9474 cost += addr_cost->register_offset;
9475 break;
9476
9477 case ADDRESS_REG_SXTW:
9478 cost += addr_cost->register_sextend;
9479 break;
9480
9481 case ADDRESS_REG_UXTW:
9482 cost += addr_cost->register_zextend;
9483 break;
9484
9485 default:
9486 gcc_unreachable ();
9487 }
9488
9489
9490 if (info.shift > 0)
9491 {
9492 /* For the sake of calculating the cost of the shifted register
9493 component, we can treat same sized modes in the same way. */
9494 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9495 cost += addr_cost->addr_scale_costs.hi;
9496 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9497 cost += addr_cost->addr_scale_costs.si;
9498 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9499 cost += addr_cost->addr_scale_costs.di;
9500 else
9501 /* We can't tell, or this is a 128-bit vector. */
9502 cost += addr_cost->addr_scale_costs.ti;
9503 }
9504
9505 return cost;
9506 }
9507
9508 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9509 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9510 to be taken. */
9511
9512 int
9513 aarch64_branch_cost (bool speed_p, bool predictable_p)
9514 {
9515 /* When optimizing for speed, use the cost of unpredictable branches. */
9516 const struct cpu_branch_cost *branch_costs =
9517 aarch64_tune_params.branch_costs;
9518
9519 if (!speed_p || predictable_p)
9520 return branch_costs->predictable;
9521 else
9522 return branch_costs->unpredictable;
9523 }
9524
9525 /* Return true if the RTX X in mode MODE is a zero or sign extract
9526 usable in an ADD or SUB (extended register) instruction. */
9527 static bool
9528 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9529 {
9530 /* Catch add with a sign extract.
9531 This is add_<optab><mode>_multp2. */
9532 if (GET_CODE (x) == SIGN_EXTRACT
9533 || GET_CODE (x) == ZERO_EXTRACT)
9534 {
9535 rtx op0 = XEXP (x, 0);
9536 rtx op1 = XEXP (x, 1);
9537 rtx op2 = XEXP (x, 2);
9538
9539 if (GET_CODE (op0) == MULT
9540 && CONST_INT_P (op1)
9541 && op2 == const0_rtx
9542 && CONST_INT_P (XEXP (op0, 1))
9543 && aarch64_is_extend_from_extract (mode,
9544 XEXP (op0, 1),
9545 op1))
9546 {
9547 return true;
9548 }
9549 }
9550 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9551 No shift. */
9552 else if (GET_CODE (x) == SIGN_EXTEND
9553 || GET_CODE (x) == ZERO_EXTEND)
9554 return REG_P (XEXP (x, 0));
9555
9556 return false;
9557 }
9558
9559 static bool
9560 aarch64_frint_unspec_p (unsigned int u)
9561 {
9562 switch (u)
9563 {
9564 case UNSPEC_FRINTZ:
9565 case UNSPEC_FRINTP:
9566 case UNSPEC_FRINTM:
9567 case UNSPEC_FRINTA:
9568 case UNSPEC_FRINTN:
9569 case UNSPEC_FRINTX:
9570 case UNSPEC_FRINTI:
9571 return true;
9572
9573 default:
9574 return false;
9575 }
9576 }
9577
9578 /* Return true iff X is an rtx that will match an extr instruction
9579 i.e. as described in the *extr<mode>5_insn family of patterns.
9580 OP0 and OP1 will be set to the operands of the shifts involved
9581 on success and will be NULL_RTX otherwise. */
9582
9583 static bool
9584 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9585 {
9586 rtx op0, op1;
9587 scalar_int_mode mode;
9588 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9589 return false;
9590
9591 *res_op0 = NULL_RTX;
9592 *res_op1 = NULL_RTX;
9593
9594 if (GET_CODE (x) != IOR)
9595 return false;
9596
9597 op0 = XEXP (x, 0);
9598 op1 = XEXP (x, 1);
9599
9600 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9601 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9602 {
9603 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9604 if (GET_CODE (op1) == ASHIFT)
9605 std::swap (op0, op1);
9606
9607 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9608 return false;
9609
9610 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9611 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9612
9613 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9614 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9615 {
9616 *res_op0 = XEXP (op0, 0);
9617 *res_op1 = XEXP (op1, 0);
9618 return true;
9619 }
9620 }
9621
9622 return false;
9623 }
9624
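/* Editor's note: a minimal standalone sketch (not part of GCC) of the
   value computed by the IOR-of-shifts form recognized above, written for
   DImode; the function name is made up and the block is guarded with
   #if 0.  When the two shift amounts sum to the register width, this is
   what a single EXTR produces (or an ROR when both inputs are the same
   register).  */
#if 0
#include <stdint.h>

static uint64_t
extr_like (uint64_t x, uint64_t y, unsigned ashift_amount)
{
  /* Assumes 0 < ashift_amount < 64 so that both shifts are well defined
     in C.  */
  return (x << ashift_amount) | (y >> (64 - ashift_amount));
}
#endif
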
9625 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9626 storing it in *COST. Result is true if the total cost of the operation
9627 has now been calculated. */
9628 static bool
9629 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9630 {
9631 rtx inner;
9632 rtx comparator;
9633 enum rtx_code cmpcode;
9634
9635 if (COMPARISON_P (op0))
9636 {
9637 inner = XEXP (op0, 0);
9638 comparator = XEXP (op0, 1);
9639 cmpcode = GET_CODE (op0);
9640 }
9641 else
9642 {
9643 inner = op0;
9644 comparator = const0_rtx;
9645 cmpcode = NE;
9646 }
9647
9648 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9649 {
9650 /* Conditional branch. */
9651 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9652 return true;
9653 else
9654 {
9655 if (cmpcode == NE || cmpcode == EQ)
9656 {
9657 if (comparator == const0_rtx)
9658 {
9659 /* TBZ/TBNZ/CBZ/CBNZ. */
9660 if (GET_CODE (inner) == ZERO_EXTRACT)
9661 /* TBZ/TBNZ. */
9662 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9663 ZERO_EXTRACT, 0, speed);
9664 else
9665 /* CBZ/CBNZ. */
9666 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9667
9668 return true;
9669 }
9670 }
9671 else if (cmpcode == LT || cmpcode == GE)
9672 {
9673 /* TBZ/TBNZ. */
9674 if (comparator == const0_rtx)
9675 return true;
9676 }
9677 }
9678 }
9679 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9680 {
9681 /* CCMP. */
9682 if (GET_CODE (op1) == COMPARE)
9683 {
9684 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9685 if (XEXP (op1, 1) == const0_rtx)
9686 *cost += 1;
9687 if (speed)
9688 {
9689 machine_mode mode = GET_MODE (XEXP (op1, 0));
9690 const struct cpu_cost_table *extra_cost
9691 = aarch64_tune_params.insn_extra_cost;
9692
9693 if (GET_MODE_CLASS (mode) == MODE_INT)
9694 *cost += extra_cost->alu.arith;
9695 else
9696 *cost += extra_cost->fp[mode == DFmode].compare;
9697 }
9698 return true;
9699 }
9700
9701 /* It's a conditional operation based on the status flags,
9702 so it must be some flavor of CSEL. */
9703
9704 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9705 if (GET_CODE (op1) == NEG
9706 || GET_CODE (op1) == NOT
9707 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9708 op1 = XEXP (op1, 0);
9709 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9710 {
9711 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9712 op1 = XEXP (op1, 0);
9713 op2 = XEXP (op2, 0);
9714 }
9715
9716 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9717 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9718 return true;
9719 }
9720
9721 /* We don't know what this is, cost all operands. */
9722 return false;
9723 }
9724
9725 /* Check whether X is a bitfield operation of the form shift + extend that
9726 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9727 operand to which the bitfield operation is applied. Otherwise return
9728 NULL_RTX. */
9729
9730 static rtx
9731 aarch64_extend_bitfield_pattern_p (rtx x)
9732 {
9733 rtx_code outer_code = GET_CODE (x);
9734 machine_mode outer_mode = GET_MODE (x);
9735
9736 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9737 && outer_mode != SImode && outer_mode != DImode)
9738 return NULL_RTX;
9739
9740 rtx inner = XEXP (x, 0);
9741 rtx_code inner_code = GET_CODE (inner);
9742 machine_mode inner_mode = GET_MODE (inner);
9743 rtx op = NULL_RTX;
9744
9745 switch (inner_code)
9746 {
9747 case ASHIFT:
9748 if (CONST_INT_P (XEXP (inner, 1))
9749 && (inner_mode == QImode || inner_mode == HImode))
9750 op = XEXP (inner, 0);
9751 break;
9752 case LSHIFTRT:
9753 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9754 && (inner_mode == QImode || inner_mode == HImode))
9755 op = XEXP (inner, 0);
9756 break;
9757 case ASHIFTRT:
9758 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9759 && (inner_mode == QImode || inner_mode == HImode))
9760 op = XEXP (inner, 0);
9761 break;
9762 default:
9763 break;
9764 }
9765
9766 return op;
9767 }
9768
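/* Editor's note: as an example of the shapes matched above,
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) maps to a UBFX
   of the underlying register, while
   (sign_extend:SI (ashiftrt:HI (reg:HI x) (const_int 3))) maps to an SBFX;
   in both cases the function returns the operand of the inner shift
   (the register).  */
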
9769 /* Return true if the mask and a shift amount from an RTX of the form
9770 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9771 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9772
9773 bool
9774 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9775 rtx shft_amnt)
9776 {
9777 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9778 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9779 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9780 && (INTVAL (mask)
9781 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9782 }
9783
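/* Editor's note: a standalone sketch (not part of GCC) of the check above,
   written for a fixed 64-bit mask and guarded with #if 0; the function
   name is made up.  For example, with mode size 32, mask == 0x00ffff00 and
   shift == 8 the test succeeds: (0x00ffff00 >> 8) + 1 == 0x10000 is a
   power of two and no mask bits lie below the shift amount.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
ubfiz_mask_and_shift_ok (unsigned bitsize, uint64_t mask, unsigned shift)
{
  uint64_t high = mask >> shift;
  return shift < bitsize
	 /* (mask >> shift) + 1 is a power of two, i.e. the shifted mask is
	    a contiguous run of set bits starting at bit 0.  */
	 && (high & (high + 1)) == 0
	 /* No stray bits below the shift amount.  */
	 && (mask & ((UINT64_C (1) << shift) - 1)) == 0;
}
#endif
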
9784 /* Return true if the masks and a shift amount from an RTX of the form
9785 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9786 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9787
9788 bool
9789 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9790 unsigned HOST_WIDE_INT mask1,
9791 unsigned HOST_WIDE_INT shft_amnt,
9792 unsigned HOST_WIDE_INT mask2)
9793 {
9794 unsigned HOST_WIDE_INT t;
9795
9796 /* Verify that there is no overlap in what bits are set in the two masks. */
9797 if (mask1 != ~mask2)
9798 return false;
9799
9800 /* Verify that mask2 is not all zeros or ones. */
9801 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9802 return false;
9803
9804 /* The shift amount should always be less than the mode size. */
9805 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9806
9807 /* Verify that the mask being shifted is contiguous and would be in the
9808 least significant bits after shifting by shft_amnt. */
9809 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9810 return (t == (t & -t));
9811 }
9812
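/* Editor's note: a worked example of the contiguity test above.  For
   DImode with shft_amnt == 8 and mask2 == 0xff00 (so mask1 == ~0xff00):
     t = 0xff00 + (1 << 8) = 0x10000,
   and 0x10000 == (0x10000 & -0x10000), i.e. t is a power of two, so the
   shifted-in field is a contiguous run of bits starting at bit 8 and the
   combination is accepted as a BFI.  */
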
9813 /* Calculate the cost of calculating X, storing it in *COST. Result
9814 is true if the total cost of the operation has now been calculated. */
9815 static bool
9816 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9817 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9818 {
9819 rtx op0, op1, op2;
9820 const struct cpu_cost_table *extra_cost
9821 = aarch64_tune_params.insn_extra_cost;
9822 int code = GET_CODE (x);
9823 scalar_int_mode int_mode;
9824
9825 /* By default, assume that everything has equivalent cost to the
9826 cheapest instruction. Any additional costs are applied as a delta
9827 above this default. */
9828 *cost = COSTS_N_INSNS (1);
9829
9830 switch (code)
9831 {
9832 case SET:
9833 /* The cost depends entirely on the operands to SET. */
9834 *cost = 0;
9835 op0 = SET_DEST (x);
9836 op1 = SET_SRC (x);
9837
9838 switch (GET_CODE (op0))
9839 {
9840 case MEM:
9841 if (speed)
9842 {
9843 rtx address = XEXP (op0, 0);
9844 if (VECTOR_MODE_P (mode))
9845 *cost += extra_cost->ldst.storev;
9846 else if (GET_MODE_CLASS (mode) == MODE_INT)
9847 *cost += extra_cost->ldst.store;
9848 else if (mode == SFmode)
9849 *cost += extra_cost->ldst.storef;
9850 else if (mode == DFmode)
9851 *cost += extra_cost->ldst.stored;
9852
9853 *cost +=
9854 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9855 0, speed));
9856 }
9857
9858 *cost += rtx_cost (op1, mode, SET, 1, speed);
9859 return true;
9860
9861 case SUBREG:
9862 if (! REG_P (SUBREG_REG (op0)))
9863 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9864
9865 /* Fall through. */
9866 case REG:
9867 /* The cost is one per vector-register copied. */
9868 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9869 {
9870 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9871 *cost = COSTS_N_INSNS (nregs);
9872 }
9873 /* const0_rtx is in general free, but we will use an
9874 instruction to set a register to 0. */
9875 else if (REG_P (op1) || op1 == const0_rtx)
9876 {
9877 /* The cost is 1 per register copied. */
9878 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9879 *cost = COSTS_N_INSNS (nregs);
9880 }
9881 else
9882 /* Cost is just the cost of the RHS of the set. */
9883 *cost += rtx_cost (op1, mode, SET, 1, speed);
9884 return true;
9885
9886 case ZERO_EXTRACT:
9887 case SIGN_EXTRACT:
9888 /* Bit-field insertion. Strip any redundant widening of
9889 the RHS to meet the width of the target. */
9890 if (GET_CODE (op1) == SUBREG)
9891 op1 = SUBREG_REG (op1);
9892 if ((GET_CODE (op1) == ZERO_EXTEND
9893 || GET_CODE (op1) == SIGN_EXTEND)
9894 && CONST_INT_P (XEXP (op0, 1))
9895 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9896 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9897 op1 = XEXP (op1, 0);
9898
9899 if (CONST_INT_P (op1))
9900 {
9901 /* MOV immediate is assumed to always be cheap. */
9902 *cost = COSTS_N_INSNS (1);
9903 }
9904 else
9905 {
9906 /* BFM. */
9907 if (speed)
9908 *cost += extra_cost->alu.bfi;
9909 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9910 }
9911
9912 return true;
9913
9914 default:
9915 /* We can't make sense of this, assume default cost. */
9916 *cost = COSTS_N_INSNS (1);
9917 return false;
9918 }
9919 return false;
9920
9921 case CONST_INT:
9922 /* If an instruction can incorporate a constant within the
9923 instruction, the instruction's expression avoids calling
9924 rtx_cost() on the constant. If rtx_cost() is called on a
9925 constant, then it is usually because the constant must be
9926 moved into a register by one or more instructions.
9927
9928 The exception is constant 0, which can be expressed
9929 as XZR/WZR and is therefore free. The exception to this is
9930 if we have (set (reg) (const0_rtx)) in which case we must cost
9931 the move. However, we can catch that when we cost the SET, so
9932 we don't need to consider that here. */
9933 if (x == const0_rtx)
9934 *cost = 0;
9935 else
9936 {
9937 /* To an approximation, building any other constant is
9938 proportionally expensive to the number of instructions
9939 required to build that constant. This is true whether we
9940 are compiling for SPEED or otherwise. */
9941 if (!is_a <scalar_int_mode> (mode, &int_mode))
9942 int_mode = word_mode;
9943 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9944 (NULL_RTX, x, false, int_mode));
9945 }
9946 return true;
9947
9948 case CONST_DOUBLE:
9949
9950 /* First determine number of instructions to do the move
9951 as an integer constant. */
9952 if (!aarch64_float_const_representable_p (x)
9953 && !aarch64_can_const_movi_rtx_p (x, mode)
9954 && aarch64_float_const_rtx_p (x))
9955 {
9956 unsigned HOST_WIDE_INT ival;
9957 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9958 gcc_assert (succeed);
9959
9960 scalar_int_mode imode = (mode == HFmode
9961 ? SImode
9962 : int_mode_for_mode (mode).require ());
9963 int ncost = aarch64_internal_mov_immediate
9964 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9965 *cost += COSTS_N_INSNS (ncost);
9966 return true;
9967 }
9968
9969 if (speed)
9970 {
9971 /* mov[df,sf]_aarch64. */
9972 if (aarch64_float_const_representable_p (x))
9973 /* FMOV (scalar immediate). */
9974 *cost += extra_cost->fp[mode == DFmode].fpconst;
9975 else if (!aarch64_float_const_zero_rtx_p (x))
9976 {
9977 /* This will be a load from memory. */
9978 if (mode == DFmode)
9979 *cost += extra_cost->ldst.loadd;
9980 else
9981 *cost += extra_cost->ldst.loadf;
9982 }
9983 else
9984 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9985 or MOV v0.s[0], wzr - neither of which are modeled by the
9986 cost tables. Just use the default cost. */
9987 {
9988 }
9989 }
9990
9991 return true;
9992
9993 case MEM:
9994 if (speed)
9995 {
9996 /* For loads we want the base cost of a load, plus an
9997 approximation for the additional cost of the addressing
9998 mode. */
9999 rtx address = XEXP (x, 0);
10000 if (VECTOR_MODE_P (mode))
10001 *cost += extra_cost->ldst.loadv;
10002 else if (GET_MODE_CLASS (mode) == MODE_INT)
10003 *cost += extra_cost->ldst.load;
10004 else if (mode == SFmode)
10005 *cost += extra_cost->ldst.loadf;
10006 else if (mode == DFmode)
10007 *cost += extra_cost->ldst.loadd;
10008
10009 *cost +=
10010 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10011 0, speed));
10012 }
10013
10014 return true;
10015
10016 case NEG:
10017 op0 = XEXP (x, 0);
10018
10019 if (VECTOR_MODE_P (mode))
10020 {
10021 if (speed)
10022 {
10023 /* FNEG. */
10024 *cost += extra_cost->vect.alu;
10025 }
10026 return false;
10027 }
10028
10029 if (GET_MODE_CLASS (mode) == MODE_INT)
10030 {
10031 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10032 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10033 {
10034 /* CSETM. */
10035 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10036 return true;
10037 }
10038
10039 /* Cost this as SUB wzr, X. */
10040 op0 = CONST0_RTX (mode);
10041 op1 = XEXP (x, 0);
10042 goto cost_minus;
10043 }
10044
10045 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10046 {
10047 /* Support (neg(fma...)) as a single instruction only if
10048 sign of zeros is unimportant. This matches the decision
10049 making in aarch64.md. */
10050 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10051 {
10052 /* FNMADD. */
10053 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10054 return true;
10055 }
10056 if (GET_CODE (op0) == MULT)
10057 {
10058 /* FNMUL. */
10059 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10060 return true;
10061 }
10062 if (speed)
10063 /* FNEG. */
10064 *cost += extra_cost->fp[mode == DFmode].neg;
10065 return false;
10066 }
10067
10068 return false;
10069
10070 case CLRSB:
10071 case CLZ:
10072 if (speed)
10073 {
10074 if (VECTOR_MODE_P (mode))
10075 *cost += extra_cost->vect.alu;
10076 else
10077 *cost += extra_cost->alu.clz;
10078 }
10079
10080 return false;
10081
10082 case COMPARE:
10083 op0 = XEXP (x, 0);
10084 op1 = XEXP (x, 1);
10085
10086 if (op1 == const0_rtx
10087 && GET_CODE (op0) == AND)
10088 {
10089 x = op0;
10090 mode = GET_MODE (op0);
10091 goto cost_logic;
10092 }
10093
10094 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10095 {
10096 /* TODO: A write to the CC flags possibly costs extra; this
10097 needs encoding in the cost tables. */
10098
10099 mode = GET_MODE (op0);
10100 /* ANDS. */
10101 if (GET_CODE (op0) == AND)
10102 {
10103 x = op0;
10104 goto cost_logic;
10105 }
10106
10107 if (GET_CODE (op0) == PLUS)
10108 {
10109 /* ADDS (and CMN alias). */
10110 x = op0;
10111 goto cost_plus;
10112 }
10113
10114 if (GET_CODE (op0) == MINUS)
10115 {
10116 /* SUBS. */
10117 x = op0;
10118 goto cost_minus;
10119 }
10120
10121 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10122 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10123 && CONST_INT_P (XEXP (op0, 2)))
10124 {
10125 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10126 Handle it here directly rather than going to cost_logic
10127 since we know the immediate generated for the TST is valid
10128 so we can avoid creating an intermediate rtx for it only
10129 for costing purposes. */
10130 if (speed)
10131 *cost += extra_cost->alu.logical;
10132
10133 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10134 ZERO_EXTRACT, 0, speed);
10135 return true;
10136 }
10137
10138 if (GET_CODE (op1) == NEG)
10139 {
10140 /* CMN. */
10141 if (speed)
10142 *cost += extra_cost->alu.arith;
10143
10144 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10145 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10146 return true;
10147 }
10148
10149 /* CMP.
10150
10151 Compare can freely swap the order of operands, and
10152 canonicalization puts the more complex operation first.
10153 But the integer MINUS logic expects the shift/extend
10154 operation in op1. */
10155 if (! (REG_P (op0)
10156 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10157 {
10158 op0 = XEXP (x, 1);
10159 op1 = XEXP (x, 0);
10160 }
10161 goto cost_minus;
10162 }
10163
10164 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10165 {
10166 /* FCMP. */
10167 if (speed)
10168 *cost += extra_cost->fp[mode == DFmode].compare;
10169
10170 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10171 {
10172 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10173 /* FCMP supports constant 0.0 for no extra cost. */
10174 return true;
10175 }
10176 return false;
10177 }
10178
10179 if (VECTOR_MODE_P (mode))
10180 {
10181 /* Vector compare. */
10182 if (speed)
10183 *cost += extra_cost->vect.alu;
10184
10185 if (aarch64_float_const_zero_rtx_p (op1))
10186 {
10187 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10188 cost. */
10189 return true;
10190 }
10191 return false;
10192 }
10193 return false;
10194
10195 case MINUS:
10196 {
10197 op0 = XEXP (x, 0);
10198 op1 = XEXP (x, 1);
10199
10200 cost_minus:
10201 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10202
10203 /* Detect valid immediates. */
10204 if ((GET_MODE_CLASS (mode) == MODE_INT
10205 || (GET_MODE_CLASS (mode) == MODE_CC
10206 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10207 && CONST_INT_P (op1)
10208 && aarch64_uimm12_shift (INTVAL (op1)))
10209 {
10210 if (speed)
10211 /* SUB(S) (immediate). */
10212 *cost += extra_cost->alu.arith;
10213 return true;
10214 }
10215
10216 /* Look for SUB (extended register). */
10217 if (is_a <scalar_int_mode> (mode, &int_mode)
10218 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10219 {
10220 if (speed)
10221 *cost += extra_cost->alu.extend_arith;
10222
10223 op1 = aarch64_strip_extend (op1, true);
10224 *cost += rtx_cost (op1, VOIDmode,
10225 (enum rtx_code) GET_CODE (op1), 0, speed);
10226 return true;
10227 }
10228
10229 rtx new_op1 = aarch64_strip_extend (op1, false);
10230
10231 /* Cost this as an FMA-alike operation. */
10232 if ((GET_CODE (new_op1) == MULT
10233 || aarch64_shift_p (GET_CODE (new_op1)))
10234 && code != COMPARE)
10235 {
10236 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10237 (enum rtx_code) code,
10238 speed);
10239 return true;
10240 }
10241
10242 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10243
10244 if (speed)
10245 {
10246 if (VECTOR_MODE_P (mode))
10247 {
10248 /* Vector SUB. */
10249 *cost += extra_cost->vect.alu;
10250 }
10251 else if (GET_MODE_CLASS (mode) == MODE_INT)
10252 {
10253 /* SUB(S). */
10254 *cost += extra_cost->alu.arith;
10255 }
10256 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10257 {
10258 /* FSUB. */
10259 *cost += extra_cost->fp[mode == DFmode].addsub;
10260 }
10261 }
10262 return true;
10263 }
10264
10265 case PLUS:
10266 {
10267 rtx new_op0;
10268
10269 op0 = XEXP (x, 0);
10270 op1 = XEXP (x, 1);
10271
10272 cost_plus:
10273 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10274 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10275 {
10276 /* CSINC. */
10277 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10278 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10279 return true;
10280 }
10281
10282 if (GET_MODE_CLASS (mode) == MODE_INT
10283 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10284 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10285 {
10286 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10287
10288 if (speed)
10289 /* ADD (immediate). */
10290 *cost += extra_cost->alu.arith;
10291 return true;
10292 }
10293
10294 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10295
10296 /* Look for ADD (extended register). */
10297 if (is_a <scalar_int_mode> (mode, &int_mode)
10298 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10299 {
10300 if (speed)
10301 *cost += extra_cost->alu.extend_arith;
10302
10303 op0 = aarch64_strip_extend (op0, true);
10304 *cost += rtx_cost (op0, VOIDmode,
10305 (enum rtx_code) GET_CODE (op0), 0, speed);
10306 return true;
10307 }
10308
10309 /* Strip any extend, leave shifts behind as we will
10310 cost them through mult_cost. */
10311 new_op0 = aarch64_strip_extend (op0, false);
10312
10313 if (GET_CODE (new_op0) == MULT
10314 || aarch64_shift_p (GET_CODE (new_op0)))
10315 {
10316 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10317 speed);
10318 return true;
10319 }
10320
10321 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10322
10323 if (speed)
10324 {
10325 if (VECTOR_MODE_P (mode))
10326 {
10327 /* Vector ADD. */
10328 *cost += extra_cost->vect.alu;
10329 }
10330 else if (GET_MODE_CLASS (mode) == MODE_INT)
10331 {
10332 /* ADD. */
10333 *cost += extra_cost->alu.arith;
10334 }
10335 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10336 {
10337 /* FADD. */
10338 *cost += extra_cost->fp[mode == DFmode].addsub;
10339 }
10340 }
10341 return true;
10342 }
10343
10344 case BSWAP:
10345 *cost = COSTS_N_INSNS (1);
10346
10347 if (speed)
10348 {
10349 if (VECTOR_MODE_P (mode))
10350 *cost += extra_cost->vect.alu;
10351 else
10352 *cost += extra_cost->alu.rev;
10353 }
10354 return false;
10355
10356 case IOR:
10357 if (aarch_rev16_p (x))
10358 {
10359 *cost = COSTS_N_INSNS (1);
10360
10361 if (speed)
10362 {
10363 if (VECTOR_MODE_P (mode))
10364 *cost += extra_cost->vect.alu;
10365 else
10366 *cost += extra_cost->alu.rev;
10367 }
10368 return true;
10369 }
10370
10371 if (aarch64_extr_rtx_p (x, &op0, &op1))
10372 {
10373 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10374 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10375 if (speed)
10376 *cost += extra_cost->alu.shift;
10377
10378 return true;
10379 }
10380 /* Fall through. */
10381 case XOR:
10382 case AND:
10383 cost_logic:
10384 op0 = XEXP (x, 0);
10385 op1 = XEXP (x, 1);
10386
10387 if (VECTOR_MODE_P (mode))
10388 {
10389 if (speed)
10390 *cost += extra_cost->vect.alu;
10391 return true;
10392 }
10393
10394 if (code == AND
10395 && GET_CODE (op0) == MULT
10396 && CONST_INT_P (XEXP (op0, 1))
10397 && CONST_INT_P (op1)
10398 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10399 INTVAL (op1)) != 0)
10400 {
10401 /* This is a UBFM/SBFM. */
10402 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10403 if (speed)
10404 *cost += extra_cost->alu.bfx;
10405 return true;
10406 }
10407
10408 if (is_int_mode (mode, &int_mode))
10409 {
10410 if (CONST_INT_P (op1))
10411 {
10412 /* We have a mask + shift version of a UBFIZ
10413 i.e. the *andim_ashift<mode>_bfiz pattern. */
10414 if (GET_CODE (op0) == ASHIFT
10415 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10416 XEXP (op0, 1)))
10417 {
10418 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10419 (enum rtx_code) code, 0, speed);
10420 if (speed)
10421 *cost += extra_cost->alu.bfx;
10422
10423 return true;
10424 }
10425 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10426 {
10427 /* We possibly get the immediate for free; this is not
10428 modelled. */
10429 *cost += rtx_cost (op0, int_mode,
10430 (enum rtx_code) code, 0, speed);
10431 if (speed)
10432 *cost += extra_cost->alu.logical;
10433
10434 return true;
10435 }
10436 }
10437 else
10438 {
10439 rtx new_op0 = op0;
10440
10441 /* Handle ORN, EON, or BIC. */
10442 if (GET_CODE (op0) == NOT)
10443 op0 = XEXP (op0, 0);
10444
10445 new_op0 = aarch64_strip_shift (op0);
10446
10447 /* If we had a shift on op0 then this is a logical-shift-
10448 by-register/immediate operation. Otherwise, this is just
10449 a logical operation. */
10450 if (speed)
10451 {
10452 if (new_op0 != op0)
10453 {
10454 /* Shift by immediate. */
10455 if (CONST_INT_P (XEXP (op0, 1)))
10456 *cost += extra_cost->alu.log_shift;
10457 else
10458 *cost += extra_cost->alu.log_shift_reg;
10459 }
10460 else
10461 *cost += extra_cost->alu.logical;
10462 }
10463
10464 /* In both cases we want to cost both operands. */
10465 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10466 0, speed);
10467 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10468 1, speed);
10469
10470 return true;
10471 }
10472 }
10473 return false;
10474
10475 case NOT:
10476 x = XEXP (x, 0);
10477 op0 = aarch64_strip_shift (x);
10478
10479 if (VECTOR_MODE_P (mode))
10480 {
10481 /* Vector NOT. */
10482 *cost += extra_cost->vect.alu;
10483 return false;
10484 }
10485
10486 /* MVN-shifted-reg. */
10487 if (op0 != x)
10488 {
10489 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10490
10491 if (speed)
10492 *cost += extra_cost->alu.log_shift;
10493
10494 return true;
10495 }
10496 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10497 Handle the second form here taking care that 'a' in the above can
10498 be a shift. */
10499 else if (GET_CODE (op0) == XOR)
10500 {
10501 rtx newop0 = XEXP (op0, 0);
10502 rtx newop1 = XEXP (op0, 1);
10503 rtx op0_stripped = aarch64_strip_shift (newop0);
10504
10505 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10506 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10507
10508 if (speed)
10509 {
10510 if (op0_stripped != newop0)
10511 *cost += extra_cost->alu.log_shift;
10512 else
10513 *cost += extra_cost->alu.logical;
10514 }
10515
10516 return true;
10517 }
10518 /* MVN. */
10519 if (speed)
10520 *cost += extra_cost->alu.logical;
10521
10522 return false;
10523
10524 case ZERO_EXTEND:
10525
10526 op0 = XEXP (x, 0);
10527 /* If a value is written in SI mode, then zero extended to DI
10528 mode, the operation will in general be free as a write to
10529 a 'w' register implicitly zeroes the upper bits of an 'x'
10530 register. However, if this is
10531
10532 (set (reg) (zero_extend (reg)))
10533
10534 we must cost the explicit register move. */
10535 if (mode == DImode
10536 && GET_MODE (op0) == SImode
10537 && outer == SET)
10538 {
10539 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10540
10541 /* If OP_COST is non-zero, then the cost of the zero extend
10542 is effectively the cost of the inner operation. Otherwise
10543 we have a MOV instruction and we take the cost from the MOV
10544 itself. This is true independently of whether we are
10545 optimizing for space or time. */
10546 if (op_cost)
10547 *cost = op_cost;
10548
10549 return true;
10550 }
10551 else if (MEM_P (op0))
10552 {
10553 /* All loads can zero extend to any size for free. */
10554 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10555 return true;
10556 }
10557
10558 op0 = aarch64_extend_bitfield_pattern_p (x);
10559 if (op0)
10560 {
10561 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10562 if (speed)
10563 *cost += extra_cost->alu.bfx;
10564 return true;
10565 }
10566
10567 if (speed)
10568 {
10569 if (VECTOR_MODE_P (mode))
10570 {
10571 /* UMOV. */
10572 *cost += extra_cost->vect.alu;
10573 }
10574 else
10575 {
10576 /* We generate an AND instead of UXTB/UXTH. */
10577 *cost += extra_cost->alu.logical;
10578 }
10579 }
10580 return false;
10581
10582 case SIGN_EXTEND:
10583 if (MEM_P (XEXP (x, 0)))
10584 {
10585 /* LDRSH. */
10586 if (speed)
10587 {
10588 rtx address = XEXP (XEXP (x, 0), 0);
10589 *cost += extra_cost->ldst.load_sign_extend;
10590
10591 *cost +=
10592 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10593 0, speed));
10594 }
10595 return true;
10596 }
10597
10598 op0 = aarch64_extend_bitfield_pattern_p (x);
10599 if (op0)
10600 {
10601 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10602 if (speed)
10603 *cost += extra_cost->alu.bfx;
10604 return true;
10605 }
10606
10607 if (speed)
10608 {
10609 if (VECTOR_MODE_P (mode))
10610 *cost += extra_cost->vect.alu;
10611 else
10612 *cost += extra_cost->alu.extend;
10613 }
10614 return false;
10615
10616 case ASHIFT:
10617 op0 = XEXP (x, 0);
10618 op1 = XEXP (x, 1);
10619
10620 if (CONST_INT_P (op1))
10621 {
10622 if (speed)
10623 {
10624 if (VECTOR_MODE_P (mode))
10625 {
10626 /* Vector shift (immediate). */
10627 *cost += extra_cost->vect.alu;
10628 }
10629 else
10630 {
10631 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10632 aliases. */
10633 *cost += extra_cost->alu.shift;
10634 }
10635 }
10636
10637 /* We can incorporate zero/sign extend for free. */
10638 if (GET_CODE (op0) == ZERO_EXTEND
10639 || GET_CODE (op0) == SIGN_EXTEND)
10640 op0 = XEXP (op0, 0);
10641
10642 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10643 return true;
10644 }
10645 else
10646 {
10647 if (VECTOR_MODE_P (mode))
10648 {
10649 if (speed)
10650 /* Vector shift (register). */
10651 *cost += extra_cost->vect.alu;
10652 }
10653 else
10654 {
10655 if (speed)
10656 /* LSLV. */
10657 *cost += extra_cost->alu.shift_reg;
10658
10659 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10660 && CONST_INT_P (XEXP (op1, 1))
10661 && known_eq (INTVAL (XEXP (op1, 1)),
10662 GET_MODE_BITSIZE (mode) - 1))
10663 {
10664 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10665 /* We already demanded XEXP (op1, 0) to be REG_P, so
10666 don't recurse into it. */
10667 return true;
10668 }
10669 }
10670 return false; /* All arguments need to be in registers. */
10671 }
10672
10673 case ROTATE:
10674 case ROTATERT:
10675 case LSHIFTRT:
10676 case ASHIFTRT:
10677 op0 = XEXP (x, 0);
10678 op1 = XEXP (x, 1);
10679
10680 if (CONST_INT_P (op1))
10681 {
10682 /* ASR (immediate) and friends. */
10683 if (speed)
10684 {
10685 if (VECTOR_MODE_P (mode))
10686 *cost += extra_cost->vect.alu;
10687 else
10688 *cost += extra_cost->alu.shift;
10689 }
10690
10691 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10692 return true;
10693 }
10694 else
10695 {
10696 if (VECTOR_MODE_P (mode))
10697 {
10698 if (speed)
10699 /* Vector shift (register). */
10700 *cost += extra_cost->vect.alu;
10701 }
10702 else
10703 {
10704 if (speed)
10705 /* ASR (register) and friends. */
10706 *cost += extra_cost->alu.shift_reg;
10707
10708 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10709 && CONST_INT_P (XEXP (op1, 1))
10710 && known_eq (INTVAL (XEXP (op1, 1)),
10711 GET_MODE_BITSIZE (mode) - 1))
10712 {
10713 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10714 /* We already demanded XEXP (op1, 0) to be REG_P, so
10715 don't recurse into it. */
10716 return true;
10717 }
10718 }
10719 return false; /* All arguments need to be in registers. */
10720 }
10721
10722 case SYMBOL_REF:
10723
10724 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10725 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10726 {
10727 /* LDR. */
10728 if (speed)
10729 *cost += extra_cost->ldst.load;
10730 }
10731 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10732 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10733 {
10734 /* ADRP, followed by ADD. */
10735 *cost += COSTS_N_INSNS (1);
10736 if (speed)
10737 *cost += 2 * extra_cost->alu.arith;
10738 }
10739 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10740 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10741 {
10742 /* ADR. */
10743 if (speed)
10744 *cost += extra_cost->alu.arith;
10745 }
10746
10747 if (flag_pic)
10748 {
10749 /* One extra load instruction, after accessing the GOT. */
10750 *cost += COSTS_N_INSNS (1);
10751 if (speed)
10752 *cost += extra_cost->ldst.load;
10753 }
10754 return true;
10755
10756 case HIGH:
10757 case LO_SUM:
10758 /* ADRP/ADD (immediate). */
10759 if (speed)
10760 *cost += extra_cost->alu.arith;
10761 return true;
10762
10763 case ZERO_EXTRACT:
10764 case SIGN_EXTRACT:
10765 /* UBFX/SBFX. */
10766 if (speed)
10767 {
10768 if (VECTOR_MODE_P (mode))
10769 *cost += extra_cost->vect.alu;
10770 else
10771 *cost += extra_cost->alu.bfx;
10772 }
10773
10774 /* We can trust that the immediates used will be correct (there
10775 are no by-register forms), so we need only cost op0. */
10776 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10777 return true;
10778
10779 case MULT:
10780 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10781 /* aarch64_rtx_mult_cost always handles recursion to its
10782 operands. */
10783 return true;
10784
10785 case MOD:
10786 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10787 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10788 an unconditional negate. This case should only ever be reached through
10789 the set_smod_pow2_cheap check in expmed.c. */
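/* Editor's note: as a hedged illustration (the exact registers and the
   condition used are up to the expander), x % 8 for a signed 32-bit x is
   expected to become a sequence along the lines of:
	negs	w1, w0
	and	w0, w0, #7
	and	w1, w1, #7
	csneg	w0, w0, w1, mi
   i.e. the NEGS + two ANDs + CSNEG costed as four instructions below.  */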
10790 if (CONST_INT_P (XEXP (x, 1))
10791 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10792 && (mode == SImode || mode == DImode))
10793 {
10794 /* We expand to 4 instructions. Reset the baseline. */
10795 *cost = COSTS_N_INSNS (4);
10796
10797 if (speed)
10798 *cost += 2 * extra_cost->alu.logical
10799 + 2 * extra_cost->alu.arith;
10800
10801 return true;
10802 }
10803
10804 /* Fall-through. */
10805 case UMOD:
10806 if (speed)
10807 {
10808 /* Slightly prefer UMOD over SMOD. */
10809 if (VECTOR_MODE_P (mode))
10810 *cost += extra_cost->vect.alu;
10811 else if (GET_MODE_CLASS (mode) == MODE_INT)
10812 *cost += (extra_cost->mult[mode == DImode].add
10813 + extra_cost->mult[mode == DImode].idiv
10814 + (code == MOD ? 1 : 0));
10815 }
10816 return false; /* All arguments need to be in registers. */
10817
10818 case DIV:
10819 case UDIV:
10820 case SQRT:
10821 if (speed)
10822 {
10823 if (VECTOR_MODE_P (mode))
10824 *cost += extra_cost->vect.alu;
10825 else if (GET_MODE_CLASS (mode) == MODE_INT)
10826 /* There is no integer SQRT, so only DIV and UDIV can get
10827 here. */
10828 *cost += (extra_cost->mult[mode == DImode].idiv
10829 /* Slightly prefer UDIV over SDIV. */
10830 + (code == DIV ? 1 : 0));
10831 else
10832 *cost += extra_cost->fp[mode == DFmode].div;
10833 }
10834 return false; /* All arguments need to be in registers. */
10835
10836 case IF_THEN_ELSE:
10837 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10838 XEXP (x, 2), cost, speed);
10839
10840 case EQ:
10841 case NE:
10842 case GT:
10843 case GTU:
10844 case LT:
10845 case LTU:
10846 case GE:
10847 case GEU:
10848 case LE:
10849 case LEU:
10850
10851 return false; /* All arguments must be in registers. */
10852
10853 case FMA:
10854 op0 = XEXP (x, 0);
10855 op1 = XEXP (x, 1);
10856 op2 = XEXP (x, 2);
10857
10858 if (speed)
10859 {
10860 if (VECTOR_MODE_P (mode))
10861 *cost += extra_cost->vect.alu;
10862 else
10863 *cost += extra_cost->fp[mode == DFmode].fma;
10864 }
10865
10866 /* FMSUB, FNMADD, and FNMSUB are free. */
10867 if (GET_CODE (op0) == NEG)
10868 op0 = XEXP (op0, 0);
10869
10870 if (GET_CODE (op2) == NEG)
10871 op2 = XEXP (op2, 0);
10872
10873 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10874 and the by-element operand as operand 0. */
10875 if (GET_CODE (op1) == NEG)
10876 op1 = XEXP (op1, 0);
10877
10878 /* Catch vector-by-element operations. The by-element operand can
10879 either be (vec_duplicate (vec_select (x))) or just
10880 (vec_select (x)), depending on whether we are multiplying by
10881 a vector or a scalar.
10882
10883 Canonicalization is not very good in these cases: FMA4 will put the
10884 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
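 /* Informally, a by-element multiply-accumulate might look like
 (a sketch, not an exact pattern from the .md files):

 (fma:V2DF
 (vec_duplicate:V2DF
 (vec_select:DF (reg:V2DF) (parallel [(const_int 0)])))
 (reg:V2DF)
 (reg:V2DF))

 and the code below strips the duplicate/select wrappers so that only the
 underlying register operands are costed. */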
10885 if (GET_CODE (op0) == VEC_DUPLICATE)
10886 op0 = XEXP (op0, 0);
10887 else if (GET_CODE (op1) == VEC_DUPLICATE)
10888 op1 = XEXP (op1, 0);
10889
10890 if (GET_CODE (op0) == VEC_SELECT)
10891 op0 = XEXP (op0, 0);
10892 else if (GET_CODE (op1) == VEC_SELECT)
10893 op1 = XEXP (op1, 0);
10894
10895 /* If the remaining parameters are not registers,
10896 get the cost to put them into registers. */
10897 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10898 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10899 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10900 return true;
10901
10902 case FLOAT:
10903 case UNSIGNED_FLOAT:
10904 if (speed)
10905 *cost += extra_cost->fp[mode == DFmode].fromint;
10906 return false;
10907
10908 case FLOAT_EXTEND:
10909 if (speed)
10910 {
10911 if (VECTOR_MODE_P (mode))
10912 {
10913 /* Vector extend. */
10914 *cost += extra_cost->vect.alu;
10915 }
10916 else
10917 *cost += extra_cost->fp[mode == DFmode].widen;
10918 }
10919 return false;
10920
10921 case FLOAT_TRUNCATE:
10922 if (speed)
10923 {
10924 if (VECTOR_MODE_P (mode))
10925 {
10926 /* Vector truncate. */
10927 *cost += extra_cost->vect.alu;
10928 }
10929 else
10930 *cost += extra_cost->fp[mode == DFmode].narrow;
10931 }
10932 return false;
10933
10934 case FIX:
10935 case UNSIGNED_FIX:
10936 x = XEXP (x, 0);
10937 /* Strip the rounding part. They will all be implemented
10938 by the fcvt* family of instructions anyway. */
10939 if (GET_CODE (x) == UNSPEC)
10940 {
10941 unsigned int uns_code = XINT (x, 1);
10942
10943 if (uns_code == UNSPEC_FRINTA
10944 || uns_code == UNSPEC_FRINTM
10945 || uns_code == UNSPEC_FRINTN
10946 || uns_code == UNSPEC_FRINTP
10947 || uns_code == UNSPEC_FRINTZ)
10948 x = XVECEXP (x, 0, 0);
10949 }
10950
10951 if (speed)
10952 {
10953 if (VECTOR_MODE_P (mode))
10954 *cost += extra_cost->vect.alu;
10955 else
10956 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10957 }
10958
10959 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10960 fixed-point fcvt. */
10961 if (GET_CODE (x) == MULT
10962 && ((VECTOR_MODE_P (mode)
10963 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10964 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10965 {
10966 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10967 0, speed);
10968 return true;
10969 }
10970
10971 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10972 return true;
10973
10974 case ABS:
10975 if (VECTOR_MODE_P (mode))
10976 {
10977 /* ABS (vector). */
10978 if (speed)
10979 *cost += extra_cost->vect.alu;
10980 }
10981 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10982 {
10983 op0 = XEXP (x, 0);
10984
10985 /* FABD, which is analogous to FADD. */
10986 if (GET_CODE (op0) == MINUS)
10987 {
10988 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10989 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10990 if (speed)
10991 *cost += extra_cost->fp[mode == DFmode].addsub;
10992
10993 return true;
10994 }
10995 /* Simple FABS is analogous to FNEG. */
10996 if (speed)
10997 *cost += extra_cost->fp[mode == DFmode].neg;
10998 }
10999 else
11000 {
11001 /* Integer ABS will either be split to
11002 two arithmetic instructions, or will be an ABS
11003 (scalar), which we don't model. */
11004 *cost = COSTS_N_INSNS (2);
11005 if (speed)
11006 *cost += 2 * extra_cost->alu.arith;
11007 }
11008 return false;
11009
11010 case SMAX:
11011 case SMIN:
11012 if (speed)
11013 {
11014 if (VECTOR_MODE_P (mode))
11015 *cost += extra_cost->vect.alu;
11016 else
11017 {
11018 /* FMAXNM/FMINNM/FMAX/FMIN.
11019 TODO: This may not be accurate for all implementations, but
11020 we do not model this in the cost tables. */
11021 *cost += extra_cost->fp[mode == DFmode].addsub;
11022 }
11023 }
11024 return false;
11025
11026 case UNSPEC:
11027 /* The floating point round to integer frint* instructions. */
11028 if (aarch64_frint_unspec_p (XINT (x, 1)))
11029 {
11030 if (speed)
11031 *cost += extra_cost->fp[mode == DFmode].roundint;
11032
11033 return false;
11034 }
11035
11036 if (XINT (x, 1) == UNSPEC_RBIT)
11037 {
11038 if (speed)
11039 *cost += extra_cost->alu.rev;
11040
11041 return false;
11042 }
11043 break;
11044
11045 case TRUNCATE:
11046
11047 /* Decompose <su>muldi3_highpart. */
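 /* Informally, the shape matched piece by piece below is

 (truncate:DI
 (lshiftrt:TI
 (mult:TI (ANY_EXTEND:TI (reg:DI))
 (ANY_EXTEND:TI (reg:DI)))
 (const_int 64)))

 with both extensions of the same signedness. */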
11048 if (/* (truncate:DI */
11049 mode == DImode
11050 /* (lshiftrt:TI */
11051 && GET_MODE (XEXP (x, 0)) == TImode
11052 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11053 /* (mult:TI */
11054 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11055 /* (ANY_EXTEND:TI (reg:DI))
11056 (ANY_EXTEND:TI (reg:DI))) */
11057 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11058 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11059 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11060 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11061 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11062 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11063 /* (const_int 64) */
11064 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11065 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11066 {
11067 /* UMULH/SMULH. */
11068 if (speed)
11069 *cost += extra_cost->mult[mode == DImode].extend;
11070 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11071 mode, MULT, 0, speed);
11072 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11073 mode, MULT, 1, speed);
11074 return true;
11075 }
11076
11077 /* Fall through. */
11078 default:
11079 break;
11080 }
11081
11082 if (dump_file
11083 && flag_aarch64_verbose_cost)
11084 fprintf (dump_file,
11085 "\nFailed to cost RTX. Assuming default cost.\n");
11086
11087 return true;
11088 }
11089
11090 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11091 calculated for X. This cost is stored in *COST. Returns true
11092 if the total cost of X was calculated. */
11093 static bool
11094 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11095 int param, int *cost, bool speed)
11096 {
11097 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11098
11099 if (dump_file
11100 && flag_aarch64_verbose_cost)
11101 {
11102 print_rtl_single (dump_file, x);
11103 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11104 speed ? "Hot" : "Cold",
11105 *cost, result ? "final" : "partial");
11106 }
11107
11108 return result;
11109 }
11110
11111 static int
11112 aarch64_register_move_cost (machine_mode mode,
11113 reg_class_t from_i, reg_class_t to_i)
11114 {
11115 enum reg_class from = (enum reg_class) from_i;
11116 enum reg_class to = (enum reg_class) to_i;
11117 const struct cpu_regmove_cost *regmove_cost
11118 = aarch64_tune_params.regmove_cost;
11119
11120 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11121 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11122 to = GENERAL_REGS;
11123
11124 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11125 from = GENERAL_REGS;
11126
11127 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11128 if ((from == GENERAL_REGS && to == STACK_REG)
11129 || (to == GENERAL_REGS && from == STACK_REG))
11130 return regmove_cost->GP2GP;
11131
11132 /* To/From the stack register, we move via the gprs. */
11133 if (to == STACK_REG || from == STACK_REG)
11134 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11135 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11136
11137 if (known_eq (GET_MODE_SIZE (mode), 16))
11138 {
11139 /* 128-bit operations on general registers require 2 instructions. */
11140 if (from == GENERAL_REGS && to == GENERAL_REGS)
11141 return regmove_cost->GP2GP * 2;
11142 else if (from == GENERAL_REGS)
11143 return regmove_cost->GP2FP * 2;
11144 else if (to == GENERAL_REGS)
11145 return regmove_cost->FP2GP * 2;
11146
11147 /* When AdvSIMD instructions are disabled it is not possible to move
11148 a 128-bit value directly between Q registers. This is handled in
11149 secondary reload. A general register is used as a scratch to move
11150 the upper DI value and the lower DI value is moved directly,
11151 hence the cost is the sum of three moves. */
11152 if (! TARGET_SIMD)
11153 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11154
11155 return regmove_cost->FP2FP;
11156 }
11157
11158 if (from == GENERAL_REGS && to == GENERAL_REGS)
11159 return regmove_cost->GP2GP;
11160 else if (from == GENERAL_REGS)
11161 return regmove_cost->GP2FP;
11162 else if (to == GENERAL_REGS)
11163 return regmove_cost->FP2GP;
11164
11165 return regmove_cost->FP2FP;
11166 }
11167
11168 static int
11169 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11170 reg_class_t rclass ATTRIBUTE_UNUSED,
11171 bool in ATTRIBUTE_UNUSED)
11172 {
11173 return aarch64_tune_params.memmov_cost;
11174 }
11175
11176 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11177 to optimize 1.0/sqrt. */
11178
11179 static bool
11180 use_rsqrt_p (machine_mode mode)
11181 {
11182 return (!flag_trapping_math
11183 && flag_unsafe_math_optimizations
11184 && ((aarch64_tune_params.approx_modes->recip_sqrt
11185 & AARCH64_APPROX_MODE (mode))
11186 || flag_mrecip_low_precision_sqrt));
11187 }
11188
11189 /* Function to decide when to use the approximate reciprocal square root
11190 builtin. */
11191
11192 static tree
11193 aarch64_builtin_reciprocal (tree fndecl)
11194 {
11195 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11196
11197 if (!use_rsqrt_p (mode))
11198 return NULL_TREE;
11199 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11200 }
11201
11202 /* Emit instruction sequence to compute either the approximate square root
11203 or its approximate reciprocal, depending on the flag RECP, and return
11204 whether the sequence was emitted or not. */
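/* Background (informal): FRSQRTE provides an initial estimate E of
 1/sqrt(X), and each FRSQRTS step computes (3 - A*B)/2, so the loop below
 effectively performs the Newton-Raphson refinement

 E' = E * (3 - X * E * E) / 2

 which roughly doubles the number of accurate bits per iteration. */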
11205
11206 bool
11207 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11208 {
11209 machine_mode mode = GET_MODE (dst);
11210
11211 if (GET_MODE_INNER (mode) == HFmode)
11212 {
11213 gcc_assert (!recp);
11214 return false;
11215 }
11216
11217 if (!recp)
11218 {
11219 if (!(flag_mlow_precision_sqrt
11220 || (aarch64_tune_params.approx_modes->sqrt
11221 & AARCH64_APPROX_MODE (mode))))
11222 return false;
11223
11224 if (flag_finite_math_only
11225 || flag_trapping_math
11226 || !flag_unsafe_math_optimizations
11227 || optimize_function_for_size_p (cfun))
11228 return false;
11229 }
11230 else
11231 /* Caller assumes we cannot fail. */
11232 gcc_assert (use_rsqrt_p (mode));
11233
11234 machine_mode mmsk = mode_for_int_vector (mode).require ();
11235 rtx xmsk = gen_reg_rtx (mmsk);
11236 if (!recp)
11237 /* When calculating the approximate square root, compare the
11238 argument with 0.0 and create a mask. */
11239 emit_insn (gen_rtx_SET (xmsk,
11240 gen_rtx_NEG (mmsk,
11241 gen_rtx_EQ (mmsk, src,
11242 CONST0_RTX (mode)))));
11243
11244 /* Estimate the approximate reciprocal square root. */
11245 rtx xdst = gen_reg_rtx (mode);
11246 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11247
11248 /* Iterate over the series twice for SF and thrice for DF. */
11249 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11250
11251 /* Optionally iterate over the series one time fewer, trading some
11252 accuracy for faster performance. */
11253 if ((recp && flag_mrecip_low_precision_sqrt)
11254 || (!recp && flag_mlow_precision_sqrt))
11255 iterations--;
11256
11257 /* Iterate over the series to calculate the approximate reciprocal square
11258 root. */
11259 rtx x1 = gen_reg_rtx (mode);
11260 while (iterations--)
11261 {
11262 rtx x2 = gen_reg_rtx (mode);
11263 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11264
11265 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11266
11267 if (iterations > 0)
11268 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11269 }
11270
11271 if (!recp)
11272 {
11273 /* Qualify the approximate reciprocal square root when the argument is
11274 0.0 by squashing the intermediary result to 0.0. */
11275 rtx xtmp = gen_reg_rtx (mmsk);
11276 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11277 gen_rtx_SUBREG (mmsk, xdst, 0)));
11278 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11279
11280 /* Calculate the approximate square root. */
11281 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11282 }
11283
11284 /* Finalize the approximation. */
11285 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11286
11287 return true;
11288 }
11289
11290 /* Emit the instruction sequence to compute the approximation for the division
11291 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
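/* Background (informal): FRECPE provides an initial estimate E of 1/DEN,
 and each FRECPS step computes 2 - A*B, so the loop below performs the
 Newton-Raphson refinement

 E' = E * (2 - DEN * E)

 after which the quotient is approximated as NUM * (1/DEN). */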
11292
11293 bool
11294 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11295 {
11296 machine_mode mode = GET_MODE (quo);
11297
11298 if (GET_MODE_INNER (mode) == HFmode)
11299 return false;
11300
11301 bool use_approx_division_p = (flag_mlow_precision_div
11302 || (aarch64_tune_params.approx_modes->division
11303 & AARCH64_APPROX_MODE (mode)));
11304
11305 if (!flag_finite_math_only
11306 || flag_trapping_math
11307 || !flag_unsafe_math_optimizations
11308 || optimize_function_for_size_p (cfun)
11309 || !use_approx_division_p)
11310 return false;
11311
11312 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11313 return false;
11314
11315 /* Estimate the approximate reciprocal. */
11316 rtx xrcp = gen_reg_rtx (mode);
11317 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11318
11319 /* Iterate over the series twice for SF and thrice for DF. */
11320 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11321
11322 /* Optionally iterate over the series one time fewer, trading some
11323 accuracy for faster performance. */
11324 if (flag_mlow_precision_div)
11325 iterations--;
11326
11327 /* Iterate over the series to calculate the approximate reciprocal. */
11328 rtx xtmp = gen_reg_rtx (mode);
11329 while (iterations--)
11330 {
11331 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11332
11333 if (iterations > 0)
11334 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11335 }
11336
11337 if (num != CONST1_RTX (mode))
11338 {
11339 /* As the approximate reciprocal of DEN is already calculated, only
11340 calculate the approximate division when NUM is not 1.0. */
11341 rtx xnum = force_reg (mode, num);
11342 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11343 }
11344
11345 /* Finalize the approximation. */
11346 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11347 return true;
11348 }
11349
11350 /* Return the number of instructions that can be issued per cycle. */
11351 static int
11352 aarch64_sched_issue_rate (void)
11353 {
11354 return aarch64_tune_params.issue_rate;
11355 }
11356
11357 static int
11358 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11359 {
11360 int issue_rate = aarch64_sched_issue_rate ();
11361
11362 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11363 }
11364
11365
11366 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11367 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11368 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11369
11370 static int
11371 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11372 int ready_index)
11373 {
11374 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11375 }
11376
11377
11378 /* Vectorizer cost model target hooks. */
11379
11380 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11381 static int
11382 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11383 tree vectype,
11384 int misalign ATTRIBUTE_UNUSED)
11385 {
11386 unsigned elements;
11387 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11388 bool fp = false;
11389
11390 if (vectype != NULL)
11391 fp = FLOAT_TYPE_P (vectype);
11392
11393 switch (type_of_cost)
11394 {
11395 case scalar_stmt:
11396 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11397
11398 case scalar_load:
11399 return costs->scalar_load_cost;
11400
11401 case scalar_store:
11402 return costs->scalar_store_cost;
11403
11404 case vector_stmt:
11405 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11406
11407 case vector_load:
11408 return costs->vec_align_load_cost;
11409
11410 case vector_store:
11411 return costs->vec_store_cost;
11412
11413 case vec_to_scalar:
11414 return costs->vec_to_scalar_cost;
11415
11416 case scalar_to_vec:
11417 return costs->scalar_to_vec_cost;
11418
11419 case unaligned_load:
11420 case vector_gather_load:
11421 return costs->vec_unalign_load_cost;
11422
11423 case unaligned_store:
11424 case vector_scatter_store:
11425 return costs->vec_unalign_store_cost;
11426
11427 case cond_branch_taken:
11428 return costs->cond_taken_branch_cost;
11429
11430 case cond_branch_not_taken:
11431 return costs->cond_not_taken_branch_cost;
11432
11433 case vec_perm:
11434 return costs->vec_permute_cost;
11435
11436 case vec_promote_demote:
11437 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11438
11439 case vec_construct:
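 /* Rough heuristic: presumably about one statement per pair of
 element inserts, plus one to set up the vector. */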
11440 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11441 return elements / 2 + 1;
11442
11443 default:
11444 gcc_unreachable ();
11445 }
11446 }
11447
11448 /* Implement targetm.vectorize.add_stmt_cost. */
11449 static unsigned
11450 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11451 struct _stmt_vec_info *stmt_info, int misalign,
11452 enum vect_cost_model_location where)
11453 {
11454 unsigned *cost = (unsigned *) data;
11455 unsigned retval = 0;
11456
11457 if (flag_vect_cost_model)
11458 {
11459 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11460 int stmt_cost =
11461 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11462
11463 /* Statements in an inner loop relative to the loop being
11464 vectorized are weighted more heavily. The value here is
11465 arbitrary and could potentially be improved with analysis. */
11466 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11467 count *= 50; /* FIXME */
11468
11469 retval = (unsigned) (count * stmt_cost);
11470 cost[where] += retval;
11471 }
11472
11473 return retval;
11474 }
11475
11476 static void initialize_aarch64_code_model (struct gcc_options *);
11477
11478 /* Parse the TO_PARSE string and put the architecture struct that it
11479 selects into RES and the architectural features into ISA_FLAGS.
11480 Return an aarch64_parse_opt_result describing the parse result.
11481 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11482 When the TO_PARSE string contains an invalid extension,
11483 a copy of the string is created and stored to INVALID_EXTENSION. */
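/* For example (illustrative only), given TO_PARSE == "armv8.2-a+crypto",
 LEN covers "armv8.2-a" and EXT points at "+crypto", which is handed off
 to aarch64_parse_extension. */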
11484
11485 static enum aarch64_parse_opt_result
11486 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11487 uint64_t *isa_flags, std::string *invalid_extension)
11488 {
11489 const char *ext;
11490 const struct processor *arch;
11491 size_t len;
11492
11493 ext = strchr (to_parse, '+');
11494
11495 if (ext != NULL)
11496 len = ext - to_parse;
11497 else
11498 len = strlen (to_parse);
11499
11500 if (len == 0)
11501 return AARCH64_PARSE_MISSING_ARG;
11502
11503
11504 /* Loop through the list of supported ARCHes to find a match. */
11505 for (arch = all_architectures; arch->name != NULL; arch++)
11506 {
11507 if (strlen (arch->name) == len
11508 && strncmp (arch->name, to_parse, len) == 0)
11509 {
11510 uint64_t isa_temp = arch->flags;
11511
11512 if (ext != NULL)
11513 {
11514 /* TO_PARSE string contains at least one extension. */
11515 enum aarch64_parse_opt_result ext_res
11516 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11517
11518 if (ext_res != AARCH64_PARSE_OK)
11519 return ext_res;
11520 }
11521 /* Extension parsing was successful. Confirm the result
11522 arch and ISA flags. */
11523 *res = arch;
11524 *isa_flags = isa_temp;
11525 return AARCH64_PARSE_OK;
11526 }
11527 }
11528
11529 /* ARCH name not found in list. */
11530 return AARCH64_PARSE_INVALID_ARG;
11531 }
11532
11533 /* Parse the TO_PARSE string and put the result tuning in RES and the
11534 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11535 describing the parse result. If there is an error parsing, RES and
11536 ISA_FLAGS are left unchanged.
11537 When the TO_PARSE string contains an invalid extension,
11538 a copy of the string is created and stored to INVALID_EXTENSION. */
11539
11540 static enum aarch64_parse_opt_result
11541 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11542 uint64_t *isa_flags, std::string *invalid_extension)
11543 {
11544 const char *ext;
11545 const struct processor *cpu;
11546 size_t len;
11547
11548 ext = strchr (to_parse, '+');
11549
11550 if (ext != NULL)
11551 len = ext - to_parse;
11552 else
11553 len = strlen (to_parse);
11554
11555 if (len == 0)
11556 return AARCH64_PARSE_MISSING_ARG;
11557
11558
11559 /* Loop through the list of supported CPUs to find a match. */
11560 for (cpu = all_cores; cpu->name != NULL; cpu++)
11561 {
11562 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11563 {
11564 uint64_t isa_temp = cpu->flags;
11565
11566
11567 if (ext != NULL)
11568 {
11569 /* TO_PARSE string contains at least one extension. */
11570 enum aarch64_parse_opt_result ext_res
11571 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11572
11573 if (ext_res != AARCH64_PARSE_OK)
11574 return ext_res;
11575 }
11576 /* Extension parsing was successful. Confirm the result
11577 cpu and ISA flags. */
11578 *res = cpu;
11579 *isa_flags = isa_temp;
11580 return AARCH64_PARSE_OK;
11581 }
11582 }
11583
11584 /* CPU name not found in list. */
11585 return AARCH64_PARSE_INVALID_ARG;
11586 }
11587
11588 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11589 Return an aarch64_parse_opt_result describing the parse result.
11590 If the parsing fails the RES does not change. */
11591
11592 static enum aarch64_parse_opt_result
11593 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11594 {
11595 const struct processor *cpu;
11596
11597 /* Loop through the list of supported CPUs to find a match. */
11598 for (cpu = all_cores; cpu->name != NULL; cpu++)
11599 {
11600 if (strcmp (cpu->name, to_parse) == 0)
11601 {
11602 *res = cpu;
11603 return AARCH64_PARSE_OK;
11604 }
11605 }
11606
11607 /* CPU name not found in list. */
11608 return AARCH64_PARSE_INVALID_ARG;
11609 }
11610
11611 /* Parse TOKEN, which has length LENGTH to see if it is an option
11612 described in FLAG. If it is, return the index bit for that fusion type.
11613 If not, error (printing OPTION_NAME) and return zero. */
11614
11615 static unsigned int
11616 aarch64_parse_one_option_token (const char *token,
11617 size_t length,
11618 const struct aarch64_flag_desc *flag,
11619 const char *option_name)
11620 {
11621 for (; flag->name != NULL; flag++)
11622 {
11623 if (length == strlen (flag->name)
11624 && !strncmp (flag->name, token, length))
11625 return flag->flag;
11626 }
11627
11628 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11629 return 0;
11630 }
11631
11632 /* Parse OPTION which is a comma-separated list of flags to enable.
11633 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11634 default state we inherit from the CPU tuning structures. OPTION_NAME
11635 gives the top-level option we are parsing in the -moverride string,
11636 for use in error messages. */
11637
11638 static unsigned int
11639 aarch64_parse_boolean_options (const char *option,
11640 const struct aarch64_flag_desc *flags,
11641 unsigned int initial_state,
11642 const char *option_name)
11643 {
11644 const char separator = '.';
11645 const char* specs = option;
11646 const char* ntoken = option;
11647 unsigned int found_flags = initial_state;
11648
11649 while ((ntoken = strchr (specs, separator)))
11650 {
11651 size_t token_length = ntoken - specs;
11652 unsigned token_ops = aarch64_parse_one_option_token (specs,
11653 token_length,
11654 flags,
11655 option_name);
11656 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11657 in the token stream, reset the supported operations. So:
11658
11659 adrp+add.cmp+branch.none.adrp+add
11660
11661 would have the result of turning on only adrp+add fusion. */
11662 if (!token_ops)
11663 found_flags = 0;
11664
11665 found_flags |= token_ops;
11666 specs = ++ntoken;
11667 }
11668
11669 /* The string ended with a trailing separator; diagnose it. */
11670 if (!(*specs))
11671 {
11672 error ("%s string ill-formed", option_name);
11673 return 0;
11674 }
11675
11676 /* We still have one more token to parse. */
11677 size_t token_length = strlen (specs);
11678 unsigned token_ops = aarch64_parse_one_option_token (specs,
11679 token_length,
11680 flags,
11681 option_name);
11682 if (!token_ops)
11683 found_flags = 0;
11684
11685 found_flags |= token_ops;
11686 return found_flags;
11687 }
11688
11689 /* Support for overriding instruction fusion. */
11690
11691 static void
11692 aarch64_parse_fuse_string (const char *fuse_string,
11693 struct tune_params *tune)
11694 {
11695 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11696 aarch64_fusible_pairs,
11697 tune->fusible_ops,
11698 "fuse=");
11699 }
11700
11701 /* Support for overriding other tuning flags. */
11702
11703 static void
11704 aarch64_parse_tune_string (const char *tune_string,
11705 struct tune_params *tune)
11706 {
11707 tune->extra_tuning_flags
11708 = aarch64_parse_boolean_options (tune_string,
11709 aarch64_tuning_flags,
11710 tune->extra_tuning_flags,
11711 "tune=");
11712 }
11713
11714 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11715 Accept the valid SVE vector widths allowed by
11716 aarch64_sve_vector_bits_enum and use it to override sve_width
11717 in TUNE. */
11718
11719 static void
11720 aarch64_parse_sve_width_string (const char *tune_string,
11721 struct tune_params *tune)
11722 {
11723 int width = -1;
11724
11725 int n = sscanf (tune_string, "%d", &width);
11726 if (n == EOF)
11727 {
11728 error ("invalid format for sve_width");
11729 return;
11730 }
11731 switch (width)
11732 {
11733 case SVE_128:
11734 case SVE_256:
11735 case SVE_512:
11736 case SVE_1024:
11737 case SVE_2048:
11738 break;
11739 default:
11740 error ("invalid sve_width value: %d", width);
11741 }
11742 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11743 }
11744
11745 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11746 we understand. If it is, extract the option string and handoff to
11747 the appropriate function. */
11748
11749 void
11750 aarch64_parse_one_override_token (const char* token,
11751 size_t length,
11752 struct tune_params *tune)
11753 {
11754 const struct aarch64_tuning_override_function *fn
11755 = aarch64_tuning_override_functions;
11756
11757 const char *option_part = strchr (token, '=');
11758 if (!option_part)
11759 {
11760 error ("tuning string missing in option (%s)", token);
11761 return;
11762 }
11763
11764 /* Get the length of the option name. */
11765 length = option_part - token;
11766 /* Skip the '=' to get to the option string. */
11767 option_part++;
11768
11769 for (; fn->name != NULL; fn++)
11770 {
11771 if (!strncmp (fn->name, token, length))
11772 {
11773 fn->parse_override (option_part, tune);
11774 return;
11775 }
11776 }
11777
11778 error ("unknown tuning option (%s)", token);
11779 return;
11780 }
11781
11782 /* Validate and clamp the TLS size based on the code model in OPTS. */
11783
11784 static void
11785 initialize_aarch64_tls_size (struct gcc_options *opts)
11786 {
11787 if (aarch64_tls_size == 0)
11788 aarch64_tls_size = 24;
11789
11790 switch (opts->x_aarch64_cmodel_var)
11791 {
11792 case AARCH64_CMODEL_TINY:
11793 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11794 which needs two instructions to address, so we clamp the size to 24 bits. */
11795 if (aarch64_tls_size > 24)
11796 aarch64_tls_size = 24;
11797 break;
11798 case AARCH64_CMODEL_SMALL:
11799 /* The maximum TLS size allowed under small is 4G. */
11800 if (aarch64_tls_size > 32)
11801 aarch64_tls_size = 32;
11802 break;
11803 case AARCH64_CMODEL_LARGE:
11804 /* The maximum TLS size allowed under large is 16E.
11805 FIXME: 16E would need 64-bit offsets; we only support 48-bit offsets for now. */
11806 if (aarch64_tls_size > 48)
11807 aarch64_tls_size = 48;
11808 break;
11809 default:
11810 gcc_unreachable ();
11811 }
11812
11813 return;
11814 }
11815
11816 /* Parse STRING looking for options in the format:
11817 string :: option:string
11818 option :: name=substring
11819 name :: {a-z}
11820 substring :: defined by option. */
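/* For example (illustrative only), a string such as

 fuse=adrp+add.cmp+branch:sve_width=256

 selects two fusion pairs for the "fuse=" option and then overrides
 sve_width, with ':' separating options and '.' separating the boolean
 flags within an option. */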
11821
11822 static void
11823 aarch64_parse_override_string (const char* input_string,
11824 struct tune_params* tune)
11825 {
11826 const char separator = ':';
11827 size_t string_length = strlen (input_string) + 1;
11828 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11829 char *string = string_root;
11830 strncpy (string, input_string, string_length);
11831 string[string_length - 1] = '\0';
11832
11833 char* ntoken = string;
11834
11835 while ((ntoken = strchr (string, separator)))
11836 {
11837 size_t token_length = ntoken - string;
11838 /* Make this substring look like a string. */
11839 *ntoken = '\0';
11840 aarch64_parse_one_override_token (string, token_length, tune);
11841 string = ++ntoken;
11842 }
11843
11844 /* One last option to parse. */
11845 aarch64_parse_one_override_token (string, strlen (string), tune);
11846 free (string_root);
11847 }
11848
11849
11850 static void
11851 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11852 {
11853 if (accepted_branch_protection_string)
11854 {
11855 opts->x_aarch64_branch_protection_string
11856 = xstrdup (accepted_branch_protection_string);
11857 }
11858
11859 /* PR 70044: We have to be careful about being called multiple times for the
11860 same function. This means all changes should be repeatable. */
11861
11862 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11863 Disable the frame pointer flag so the mid-end will not use a frame
11864 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11865 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11866 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11867 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11868 if (opts->x_flag_omit_frame_pointer == 0)
11869 opts->x_flag_omit_frame_pointer = 2;
11870
11871 /* If not optimizing for size, set the default
11872 alignment to what the target wants. */
11873 if (!opts->x_optimize_size)
11874 {
11875 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11876 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11877 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11878 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11879 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11880 opts->x_str_align_functions = aarch64_tune_params.function_align;
11881 }
11882
11883 /* We default to no pc-relative literal loads. */
11884
11885 aarch64_pcrelative_literal_loads = false;
11886
11887 /* If -mpc-relative-literal-loads is set on the command line, this
11888 implies that the user asked for PC relative literal loads. */
11889 if (opts->x_pcrelative_literal_loads == 1)
11890 aarch64_pcrelative_literal_loads = true;
11891
11892 /* In the tiny memory model it makes no sense to disallow PC relative
11893 literal pool loads. */
11894 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11895 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11896 aarch64_pcrelative_literal_loads = true;
11897
11898 /* When enabling the lower precision Newton series for the square root, also
11899 enable it for the reciprocal square root, since the latter is an
11900 intermediary step for the former. */
11901 if (flag_mlow_precision_sqrt)
11902 flag_mrecip_low_precision_sqrt = true;
11903 }
11904
11905 /* 'Unpack' the internal tuning structs and update the options
11906 in OPTS. The caller must have set up selected_tune and selected_arch,
11907 as all the other target-specific codegen decisions are
11908 derived from them. */
11909
11910 void
11911 aarch64_override_options_internal (struct gcc_options *opts)
11912 {
11913 aarch64_tune_flags = selected_tune->flags;
11914 aarch64_tune = selected_tune->sched_core;
11915 /* Make a copy of the tuning parameters attached to the core, which
11916 we may later overwrite. */
11917 aarch64_tune_params = *(selected_tune->tune);
11918 aarch64_architecture_version = selected_arch->architecture_version;
11919
11920 if (opts->x_aarch64_override_tune_string)
11921 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11922 &aarch64_tune_params);
11923
11924 /* This target defaults to strict volatile bitfields. */
11925 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11926 opts->x_flag_strict_volatile_bitfields = 1;
11927
11928 if (aarch64_stack_protector_guard == SSP_GLOBAL
11929 && opts->x_aarch64_stack_protector_guard_offset_str)
11930 {
11931 error ("incompatible options %<-mstack-protector-guard=global%> and "
11932 "%<-mstack-protector-guard-offset=%s%>",
11933 aarch64_stack_protector_guard_offset_str);
11934 }
11935
11936 if (aarch64_stack_protector_guard == SSP_SYSREG
11937 && !(opts->x_aarch64_stack_protector_guard_offset_str
11938 && opts->x_aarch64_stack_protector_guard_reg_str))
11939 {
11940 error ("both %<-mstack-protector-guard-offset%> and "
11941 "%<-mstack-protector-guard-reg%> must be used "
11942 "with %<-mstack-protector-guard=sysreg%>");
11943 }
11944
11945 if (opts->x_aarch64_stack_protector_guard_reg_str)
11946 {
11947 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11948 error ("specify a system register with a small string length");
11949 }
11950
11951 if (opts->x_aarch64_stack_protector_guard_offset_str)
11952 {
11953 char *end;
11954 const char *str = aarch64_stack_protector_guard_offset_str;
11955 errno = 0;
11956 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11957 if (!*str || *end || errno)
11958 error ("%qs is not a valid offset in %qs", str,
11959 "-mstack-protector-guard-offset=");
11960 aarch64_stack_protector_guard_offset = offs;
11961 }
11962
11963 initialize_aarch64_code_model (opts);
11964 initialize_aarch64_tls_size (opts);
11965
11966 int queue_depth = 0;
11967 switch (aarch64_tune_params.autoprefetcher_model)
11968 {
11969 case tune_params::AUTOPREFETCHER_OFF:
11970 queue_depth = -1;
11971 break;
11972 case tune_params::AUTOPREFETCHER_WEAK:
11973 queue_depth = 0;
11974 break;
11975 case tune_params::AUTOPREFETCHER_STRONG:
11976 queue_depth = max_insn_queue_index + 1;
11977 break;
11978 default:
11979 gcc_unreachable ();
11980 }
11981
11982 /* We don't mind passing in global_options_set here as we don't use
11983 the *options_set structs anyway. */
11984 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11985 queue_depth,
11986 opts->x_param_values,
11987 global_options_set.x_param_values);
11988
11989 /* Set up parameters to be used in prefetching algorithm. Do not
11990 override the defaults unless we are tuning for a core we have
11991 researched values for. */
11992 if (aarch64_tune_params.prefetch->num_slots > 0)
11993 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11994 aarch64_tune_params.prefetch->num_slots,
11995 opts->x_param_values,
11996 global_options_set.x_param_values);
11997 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11998 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11999 aarch64_tune_params.prefetch->l1_cache_size,
12000 opts->x_param_values,
12001 global_options_set.x_param_values);
12002 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12003 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12004 aarch64_tune_params.prefetch->l1_cache_line_size,
12005 opts->x_param_values,
12006 global_options_set.x_param_values);
12007 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12008 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12009 aarch64_tune_params.prefetch->l2_cache_size,
12010 opts->x_param_values,
12011 global_options_set.x_param_values);
12012 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12013 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12014 0,
12015 opts->x_param_values,
12016 global_options_set.x_param_values);
12017 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12018 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12019 aarch64_tune_params.prefetch->minimum_stride,
12020 opts->x_param_values,
12021 global_options_set.x_param_values);
12022
12023 /* Use the alternative scheduling-pressure algorithm by default. */
12024 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12025 opts->x_param_values,
12026 global_options_set.x_param_values);
12027
12028 /* If the user hasn't changed it via configure then set the default to 64 KB
12029 for the backend (the value is a power-of-two exponent, so 16 means 2^16 bytes). */
12030 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12031 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12032 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12033 opts->x_param_values,
12034 global_options_set.x_param_values);
12035
12036 /* Validate the guard size. */
12037 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12038
12039 /* Enforce that interval is the same size as size so the mid-end does the
12040 right thing. */
12041 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12042 guard_size,
12043 opts->x_param_values,
12044 global_options_set.x_param_values);
12045
12046 /* The maybe_set calls won't update the value if the user has explicitly set
12047 one. Which means we need to validate that probing interval and guard size
12048 are equal. */
12049 int probe_interval
12050 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12051 if (guard_size != probe_interval)
12052 error ("stack clash guard size %<%d%> must be equal to probing interval "
12053 "%<%d%>", guard_size, probe_interval);
12054
12055 /* Enable sw prefetching at specified optimization level for
12056 CPUS that have prefetch. Lower optimization level threshold by 1
12057 when profiling is enabled. */
12058 if (opts->x_flag_prefetch_loop_arrays < 0
12059 && !opts->x_optimize_size
12060 && aarch64_tune_params.prefetch->default_opt_level >= 0
12061 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12062 opts->x_flag_prefetch_loop_arrays = 1;
12063
12064 if (opts->x_aarch64_arch_string == NULL)
12065 opts->x_aarch64_arch_string = selected_arch->name;
12066 if (opts->x_aarch64_cpu_string == NULL)
12067 opts->x_aarch64_cpu_string = selected_cpu->name;
12068 if (opts->x_aarch64_tune_string == NULL)
12069 opts->x_aarch64_tune_string = selected_tune->name;
12070
12071 aarch64_override_options_after_change_1 (opts);
12072 }
12073
12074 /* Print a hint with a suggestion for a core or architecture name that
12075 most closely resembles what the user passed in STR. ARCH is true if
12076 the user is asking for an architecture name. ARCH is false if the user
12077 is asking for a core name. */
12078
12079 static void
12080 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12081 {
12082 auto_vec<const char *> candidates;
12083 const struct processor *entry = arch ? all_architectures : all_cores;
12084 for (; entry->name != NULL; entry++)
12085 candidates.safe_push (entry->name);
12086
12087 #ifdef HAVE_LOCAL_CPU_DETECT
12088 /* Add also "native" as possible value. */
12089 if (arch)
12090 candidates.safe_push ("native");
12091 #endif
12092
12093 char *s;
12094 const char *hint = candidates_list_and_hint (str, s, candidates);
12095 if (hint)
12096 inform (input_location, "valid arguments are: %s;"
12097 " did you mean %qs?", s, hint);
12098 else
12099 inform (input_location, "valid arguments are: %s", s);
12100
12101 XDELETEVEC (s);
12102 }
12103
12104 /* Print a hint with a suggestion for a core name that most closely resembles
12105 what the user passed in STR. */
12106
12107 inline static void
12108 aarch64_print_hint_for_core (const char *str)
12109 {
12110 aarch64_print_hint_for_core_or_arch (str, false);
12111 }
12112
12113 /* Print a hint with a suggestion for an architecture name that most closely
12114 resembles what the user passed in STR. */
12115
12116 inline static void
12117 aarch64_print_hint_for_arch (const char *str)
12118 {
12119 aarch64_print_hint_for_core_or_arch (str, true);
12120 }
12121
12122
12123 /* Print a hint with a suggestion for an extension name
12124 that most closely resembles what the user passed in STR. */
12125
12126 void
12127 aarch64_print_hint_for_extensions (const std::string &str)
12128 {
12129 auto_vec<const char *> candidates;
12130 aarch64_get_all_extension_candidates (&candidates);
12131 char *s;
12132 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12133 if (hint)
12134 inform (input_location, "valid arguments are: %s;"
12135 " did you mean %qs?", s, hint);
12136 else
12137 inform (input_location, "valid arguments are: %s", s);
12138
12139 XDELETEVEC (s);
12140 }
12141
12142 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12143 specified in STR and throw errors if appropriate. Put the results if
12144 they are valid in RES and ISA_FLAGS. Return whether the option is
12145 valid. */
12146
12147 static bool
12148 aarch64_validate_mcpu (const char *str, const struct processor **res,
12149 uint64_t *isa_flags)
12150 {
12151 std::string invalid_extension;
12152 enum aarch64_parse_opt_result parse_res
12153 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12154
12155 if (parse_res == AARCH64_PARSE_OK)
12156 return true;
12157
12158 switch (parse_res)
12159 {
12160 case AARCH64_PARSE_MISSING_ARG:
12161 error ("missing cpu name in %<-mcpu=%s%>", str);
12162 break;
12163 case AARCH64_PARSE_INVALID_ARG:
12164 error ("unknown value %qs for %<-mcpu%>", str);
12165 aarch64_print_hint_for_core (str);
12166 break;
12167 case AARCH64_PARSE_INVALID_FEATURE:
12168 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12169 invalid_extension.c_str (), str);
12170 aarch64_print_hint_for_extensions (invalid_extension);
12171 break;
12172 default:
12173 gcc_unreachable ();
12174 }
12175
12176 return false;
12177 }
12178
12179 /* Parses CONST_STR for branch protection features specified in
12180 aarch64_branch_protect_types, and set any global variables required. Returns
12181 the parsing result and assigns LAST_STR to the last processed token from
12182 CONST_STR so that it can be used for error reporting. */
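/* For example (assuming the usual -mbranch-protection spellings), a string
 such as "pac-ret+leaf+bti" is split on '+' into the type "pac-ret", its
 subtype "leaf", and the further type "bti", each dispatched to its handler
 in aarch64_branch_protect_types. */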
12183
12184 static enum aarch64_parse_opt_result
12185 aarch64_parse_branch_protection (const char *const_str,
12186 char **last_str)
12187 {
12188 char *str_root = xstrdup (const_str);
12189 char* token_save = NULL;
12190 char *str = strtok_r (str_root, "+", &token_save);
12191 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12192 if (!str)
12193 res = AARCH64_PARSE_MISSING_ARG;
12194 else
12195 {
12196 char *next_str = strtok_r (NULL, "+", &token_save);
12197 /* Reset the branch protection features to their defaults. */
12198 aarch64_handle_no_branch_protection (NULL, NULL);
12199
12200 while (str && res == AARCH64_PARSE_OK)
12201 {
12202 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12203 bool found = false;
12204 /* Search for this type. */
12205 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12206 {
12207 if (strcmp (str, type->name) == 0)
12208 {
12209 found = true;
12210 res = type->handler (str, next_str);
12211 str = next_str;
12212 next_str = strtok_r (NULL, "+", &token_save);
12213 }
12214 else
12215 type++;
12216 }
12217 if (found && res == AARCH64_PARSE_OK)
12218 {
12219 bool found_subtype = true;
12220 /* Loop through each token until we find one that isn't a
12221 subtype. */
12222 while (found_subtype)
12223 {
12224 found_subtype = false;
12225 const aarch64_branch_protect_type *subtype = type->subtypes;
12226 /* Search for the subtype. */
12227 while (str && subtype && subtype->name && !found_subtype
12228 && res == AARCH64_PARSE_OK)
12229 {
12230 if (strcmp (str, subtype->name) == 0)
12231 {
12232 found_subtype = true;
12233 res = subtype->handler (str, next_str);
12234 str = next_str;
12235 next_str = strtok_r (NULL, "+", &token_save);
12236 }
12237 else
12238 subtype++;
12239 }
12240 }
12241 }
12242 else if (!found)
12243 res = AARCH64_PARSE_INVALID_ARG;
12244 }
12245 }
12246 /* Copy the last processed token into the argument to pass it back.
12247 Used by option and attribute validation to print the offending token. */
12248 if (last_str)
12249 {
12250 if (str) strcpy (*last_str, str);
12251 else *last_str = NULL;
12252 }
12253 if (res == AARCH64_PARSE_OK)
12254 {
12255 /* If needed, alloc the accepted string then copy in const_str.
12256 Used by override_option_after_change_1. */
12257 if (!accepted_branch_protection_string)
12258 accepted_branch_protection_string = (char *) xmalloc (
12259 BRANCH_PROTECT_STR_MAX
12260 + 1);
12261 strncpy (accepted_branch_protection_string, const_str,
12262 BRANCH_PROTECT_STR_MAX + 1);
12263 /* Forcibly null-terminate. */
12264 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12265 }
12266 return res;
12267 }
12268
12269 static bool
12270 aarch64_validate_mbranch_protection (const char *const_str)
12271 {
12272 char *str = (char *) xmalloc (strlen (const_str) + 1);
12273 enum aarch64_parse_opt_result res =
12274 aarch64_parse_branch_protection (const_str, &str);
12275 if (res == AARCH64_PARSE_INVALID_ARG)
12276 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12277 else if (res == AARCH64_PARSE_MISSING_ARG)
12278 error ("missing argument for %<-mbranch-protection=%>");
12279 free (str);
12280 return res == AARCH64_PARSE_OK;
12281 }
12282
12283 /* Validate a command-line -march option. Parse the arch and extensions
12284 (if any) specified in STR and throw errors if appropriate. Put the
12285 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12286 option is valid. */
12287
12288 static bool
12289 aarch64_validate_march (const char *str, const struct processor **res,
12290 uint64_t *isa_flags)
12291 {
12292 std::string invalid_extension;
12293 enum aarch64_parse_opt_result parse_res
12294 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12295
12296 if (parse_res == AARCH64_PARSE_OK)
12297 return true;
12298
12299 switch (parse_res)
12300 {
12301 case AARCH64_PARSE_MISSING_ARG:
12302 error ("missing arch name in %<-march=%s%>", str);
12303 break;
12304 case AARCH64_PARSE_INVALID_ARG:
12305 error ("unknown value %qs for %<-march%>", str);
12306 aarch64_print_hint_for_arch (str);
12307 break;
12308 case AARCH64_PARSE_INVALID_FEATURE:
12309 error ("invalid feature modifier %qs in %<-march=%s%>",
12310 invalid_extension.c_str (), str);
12311 aarch64_print_hint_for_extensions (invalid_extension);
12312 break;
12313 default:
12314 gcc_unreachable ();
12315 }
12316
12317 return false;
12318 }
12319
12320 /* Validate a command-line -mtune option. Parse the cpu
12321 specified in STR and throw errors if appropriate. Put the
12322 result, if it is valid, in RES. Return whether the option is
12323 valid. */
12324
12325 static bool
12326 aarch64_validate_mtune (const char *str, const struct processor **res)
12327 {
12328 enum aarch64_parse_opt_result parse_res
12329 = aarch64_parse_tune (str, res);
12330
12331 if (parse_res == AARCH64_PARSE_OK)
12332 return true;
12333
12334 switch (parse_res)
12335 {
12336 case AARCH64_PARSE_MISSING_ARG:
12337 error ("missing cpu name in %<-mtune=%s%>", str);
12338 break;
12339 case AARCH64_PARSE_INVALID_ARG:
12340 error ("unknown value %qs for %<-mtune%>", str);
12341 aarch64_print_hint_for_core (str);
12342 break;
12343 default:
12344 gcc_unreachable ();
12345 }
12346 return false;
12347 }
12348
12349 /* Return the CPU corresponding to the enum CPU.
12350 If it doesn't specify a cpu, return the default. */
12351
12352 static const struct processor *
12353 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12354 {
12355 if (cpu != aarch64_none)
12356 return &all_cores[cpu];
12357
12358 /* The & 0x3f is to extract the bottom 6 bits that encode the
12359 default cpu as selected by the --with-cpu GCC configure option
12360 in config.gcc.
12361 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12362 flags mechanism should be reworked to make it more sane. */
12363 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12364 }
12365
12366 /* Return the architecture corresponding to the enum ARCH.
12367 If it doesn't specify a valid architecture, return the default. */
12368
12369 static const struct processor *
12370 aarch64_get_arch (enum aarch64_arch arch)
12371 {
12372 if (arch != aarch64_no_arch)
12373 return &all_architectures[arch];
12374
12375 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12376
12377 return &all_architectures[cpu->arch];
12378 }
12379
12380 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
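/* For example, -msve-vector-bits=256 yields 256 / 64 = 4 granules, while
 both SVE_SCALABLE and (for now) SVE_128 yield the poly_uint16 (2, 2)
 used for vector-length-agnostic code. */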
12381
12382 static poly_uint16
12383 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12384 {
12385 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12386 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12387 deciding which .md file patterns to use and when deciding whether
12388 something is a legitimate address or constant. */
12389 if (value == SVE_SCALABLE || value == SVE_128)
12390 return poly_uint16 (2, 2);
12391 else
12392 return (int) value / 64;
12393 }
12394
12395 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12396 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12397 tuning structs. In particular it must set selected_tune and
12398 aarch64_isa_flags that define the available ISA features and tuning
12399 decisions. It must also set selected_arch as this will be used to
12400 output the .arch asm tags for each function. */
12401
12402 static void
12403 aarch64_override_options (void)
12404 {
12405 uint64_t cpu_isa = 0;
12406 uint64_t arch_isa = 0;
12407 aarch64_isa_flags = 0;
12408
12409 bool valid_cpu = true;
12410 bool valid_tune = true;
12411 bool valid_arch = true;
12412
12413 selected_cpu = NULL;
12414 selected_arch = NULL;
12415 selected_tune = NULL;
12416
12417 if (aarch64_branch_protection_string)
12418 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12419
12420 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12421 If either of -march or -mtune is given, they override their
12422 respective component of -mcpu. */
12423 if (aarch64_cpu_string)
12424 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12425 &cpu_isa);
12426
12427 if (aarch64_arch_string)
12428 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12429 &arch_isa);
12430
12431 if (aarch64_tune_string)
12432 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12433
12434 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12435 SUBTARGET_OVERRIDE_OPTIONS;
12436 #endif
12437
12438 /* If the user did not specify a processor, choose the default
12439 one for them. This will be the CPU set during configuration using
12440 --with-cpu, otherwise it is "generic". */
12441 if (!selected_cpu)
12442 {
12443 if (selected_arch)
12444 {
12445 selected_cpu = &all_cores[selected_arch->ident];
12446 aarch64_isa_flags = arch_isa;
12447 explicit_arch = selected_arch->arch;
12448 }
12449 else
12450 {
12451 /* Get default configure-time CPU. */
12452 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12453 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12454 }
12455
12456 if (selected_tune)
12457 explicit_tune_core = selected_tune->ident;
12458 }
12459 /* If both -mcpu and -march are specified check that they are architecturally
12460 compatible, warn if they're not and prefer the -march ISA flags. */
12461 else if (selected_arch)
12462 {
12463 if (selected_arch->arch != selected_cpu->arch)
12464 {
12465 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12466 all_architectures[selected_cpu->arch].name,
12467 selected_arch->name);
12468 }
12469 aarch64_isa_flags = arch_isa;
12470 explicit_arch = selected_arch->arch;
12471 explicit_tune_core = selected_tune ? selected_tune->ident
12472 : selected_cpu->ident;
12473 }
12474 else
12475 {
12476 /* -mcpu but no -march. */
12477 aarch64_isa_flags = cpu_isa;
12478 explicit_tune_core = selected_tune ? selected_tune->ident
12479 : selected_cpu->ident;
12480 gcc_assert (selected_cpu);
12481 selected_arch = &all_architectures[selected_cpu->arch];
12482 explicit_arch = selected_arch->arch;
12483 }
12484
12485 /* Set the arch as well, as we will need it when outputting
12486 the .arch directive in assembly. */
12487 if (!selected_arch)
12488 {
12489 gcc_assert (selected_cpu);
12490 selected_arch = &all_architectures[selected_cpu->arch];
12491 }
12492
12493 if (!selected_tune)
12494 selected_tune = selected_cpu;
12495
12496 if (aarch64_enable_bti == 2)
12497 {
12498 #ifdef TARGET_ENABLE_BTI
12499 aarch64_enable_bti = 1;
12500 #else
12501 aarch64_enable_bti = 0;
12502 #endif
12503 }
12504
12505 /* Return address signing is currently not supported for ILP32 targets. For
12506 LP64 targets use the configured option in the absence of a command-line
12507 option for -mbranch-protection. */
12508 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12509 {
12510 #ifdef TARGET_ENABLE_PAC_RET
12511 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12512 #else
12513 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12514 #endif
12515 }
12516
12517 #ifndef HAVE_AS_MABI_OPTION
12518 /* The compiler may have been configured with 2.23.* binutils, which does
12519 not have support for ILP32. */
12520 if (TARGET_ILP32)
12521 error ("assembler does not support %<-mabi=ilp32%>");
12522 #endif
12523
12524 /* Convert -msve-vector-bits to a VG count. */
12525 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12526
12527 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12528 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12529
12530 /* Make sure we properly set up the explicit options. */
12531 if ((aarch64_cpu_string && valid_cpu)
12532 || (aarch64_tune_string && valid_tune))
12533 gcc_assert (explicit_tune_core != aarch64_none);
12534
12535 if ((aarch64_cpu_string && valid_cpu)
12536 || (aarch64_arch_string && valid_arch))
12537 gcc_assert (explicit_arch != aarch64_no_arch);
12538
12539 /* The pass to insert speculation tracking runs before
12540 shrink-wrapping and the latter does not know how to update the
12541 tracking status. So disable it in this case. */
12542 if (aarch64_track_speculation)
12543 flag_shrink_wrap = 0;
12544
12545 aarch64_override_options_internal (&global_options);
12546
12547 /* Save these options as the default ones in case we push and pop them later
12548 while processing functions with potential target attributes. */
12549 target_option_default_node = target_option_current_node
12550 = build_target_option_node (&global_options);
12551 }
12552
12553 /* Implement targetm.override_options_after_change. */
12554
12555 static void
12556 aarch64_override_options_after_change (void)
12557 {
12558 aarch64_override_options_after_change_1 (&global_options);
12559 }
12560
12561 static struct machine_function *
12562 aarch64_init_machine_status (void)
12563 {
12564 struct machine_function *machine;
12565 machine = ggc_cleared_alloc<machine_function> ();
12566 return machine;
12567 }
12568
12569 void
12570 aarch64_init_expanders (void)
12571 {
12572 init_machine_status = aarch64_init_machine_status;
12573 }
12574
12575 /* A checking mechanism for the implementation of the various code models. */
12576 static void
12577 initialize_aarch64_code_model (struct gcc_options *opts)
12578 {
12579 if (opts->x_flag_pic)
12580 {
12581 switch (opts->x_aarch64_cmodel_var)
12582 {
12583 case AARCH64_CMODEL_TINY:
12584 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12585 break;
12586 case AARCH64_CMODEL_SMALL:
12587 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12588 aarch64_cmodel = (flag_pic == 2
12589 ? AARCH64_CMODEL_SMALL_PIC
12590 : AARCH64_CMODEL_SMALL_SPIC);
12591 #else
12592 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12593 #endif
12594 break;
12595 case AARCH64_CMODEL_LARGE:
12596 sorry ("code model %qs with %<-f%s%>", "large",
12597 opts->x_flag_pic > 1 ? "PIC" : "pic");
12598 break;
12599 default:
12600 gcc_unreachable ();
12601 }
12602 }
12603 else
12604 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12605 }
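/* A worked example of the mapping above (illustrative only): with
   -mcmodel=small, -fpic (flag_pic == 1) selects AARCH64_CMODEL_SMALL_SPIC
   and -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC when the
   assembler supports the small PIC relocations (HAVE_AS_SMALL_PIC_RELOCS);
   otherwise both fall back to AARCH64_CMODEL_SMALL_PIC.  -mcmodel=large
   combined with any PIC flag is rejected with a "sorry".  */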
12606
12607 /* Implement TARGET_OPTION_SAVE. */
12608
12609 static void
12610 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12611 {
12612 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12613 ptr->x_aarch64_branch_protection_string
12614 = opts->x_aarch64_branch_protection_string;
12615 }
12616
12617 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12618 using the information saved in PTR. */
12619
12620 static void
12621 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12622 {
12623 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12624 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12625 opts->x_explicit_arch = ptr->x_explicit_arch;
12626 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12627 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12628 opts->x_aarch64_branch_protection_string
12629 = ptr->x_aarch64_branch_protection_string;
12630 if (opts->x_aarch64_branch_protection_string)
12631 {
12632 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12633 NULL);
12634 }
12635
12636 aarch64_override_options_internal (opts);
12637 }
12638
12639 /* Implement TARGET_OPTION_PRINT. */
12640
12641 static void
12642 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12643 {
12644 const struct processor *cpu
12645 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12646 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12647 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12648 std::string extension
12649 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12650
12651 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12652 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12653 arch->name, extension.c_str ());
12654 }
12655
12656 static GTY(()) tree aarch64_previous_fndecl;
12657
12658 void
12659 aarch64_reset_previous_fndecl (void)
12660 {
12661 aarch64_previous_fndecl = NULL;
12662 }
12663
12664 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12665 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12666 make sure optab availability predicates are recomputed when necessary. */
12667
12668 void
12669 aarch64_save_restore_target_globals (tree new_tree)
12670 {
12671 if (TREE_TARGET_GLOBALS (new_tree))
12672 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12673 else if (new_tree == target_option_default_node)
12674 restore_target_globals (&default_target_globals);
12675 else
12676 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12677 }
12678
12679 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12680 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12681 of the function, if such exists. This function may be called multiple
12682 times on a single function so use aarch64_previous_fndecl to avoid
12683 setting up identical state. */
12684
12685 static void
12686 aarch64_set_current_function (tree fndecl)
12687 {
12688 if (!fndecl || fndecl == aarch64_previous_fndecl)
12689 return;
12690
12691 tree old_tree = (aarch64_previous_fndecl
12692 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12693 : NULL_TREE);
12694
12695 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12696
12697 /* If current function has no attributes but the previous one did,
12698 use the default node. */
12699 if (!new_tree && old_tree)
12700 new_tree = target_option_default_node;
12701
12702 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12703 the default have been handled by aarch64_save_restore_target_globals from
12704 aarch64_pragma_target_parse. */
12705 if (old_tree == new_tree)
12706 return;
12707
12708 aarch64_previous_fndecl = fndecl;
12709
12710 /* First set the target options. */
12711 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12712
12713 aarch64_save_restore_target_globals (new_tree);
12714 }
12715
12716 /* Enum describing the various ways we can handle attributes.
12717 In many cases we can reuse the generic option handling machinery. */
12718
12719 enum aarch64_attr_opt_type
12720 {
12721 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12722 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12723 aarch64_attr_enum, /* Attribute sets an enum variable. */
12724 aarch64_attr_custom /* Attribute requires a custom handling function. */
12725 };
12726
12727 /* All the information needed to handle a target attribute.
12728 NAME is the name of the attribute.
12729 ATTR_TYPE specifies the type of behavior of the attribute as described
12730 in the definition of enum aarch64_attr_opt_type.
12731 ALLOW_NEG is true if the attribute supports a "no-" form.
12732 HANDLER is the function that takes the attribute string as an argument.
12733 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12734 OPT_NUM is the enum specifying the option that the attribute modifies.
12735 This is needed for attributes that mirror the behavior of a command-line
12736 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
12737 or aarch64_attr_enum. */
12738
12739 struct aarch64_attribute_info
12740 {
12741 const char *name;
12742 enum aarch64_attr_opt_type attr_type;
12743 bool allow_neg;
12744 bool (*handler) (const char *);
12745 enum opt_code opt_num;
12746 };
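/* As an illustration (added commentary, not part of the original source),
   the table entry

     { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align }

   further below lets users write

     __attribute__ ((target ("strict-align"))) void f (void);
     __attribute__ ((target ("no-strict-align"))) void g (void);

   and routes both spellings through the generic option machinery for
   OPT_mstrict_align, mirroring -mstrict-align on the command line.  */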
12747
12748 /* Handle the ARCH_STR argument to the arch= target attribute. */
12749
12750 static bool
12751 aarch64_handle_attr_arch (const char *str)
12752 {
12753 const struct processor *tmp_arch = NULL;
12754 std::string invalid_extension;
12755 enum aarch64_parse_opt_result parse_res
12756 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12757
12758 if (parse_res == AARCH64_PARSE_OK)
12759 {
12760 gcc_assert (tmp_arch);
12761 selected_arch = tmp_arch;
12762 explicit_arch = selected_arch->arch;
12763 return true;
12764 }
12765
12766 switch (parse_res)
12767 {
12768 case AARCH64_PARSE_MISSING_ARG:
12769 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12770 break;
12771 case AARCH64_PARSE_INVALID_ARG:
12772 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12773 aarch64_print_hint_for_arch (str);
12774 break;
12775 case AARCH64_PARSE_INVALID_FEATURE:
12776 error ("invalid feature modifier %s of value (\"%s\") in "
12777 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12778 aarch64_print_hint_for_extensions (invalid_extension);
12779 break;
12780 default:
12781 gcc_unreachable ();
12782 }
12783
12784 return false;
12785 }
12786
12787 /* Handle the argument CPU_STR to the cpu= target attribute. */
12788
12789 static bool
12790 aarch64_handle_attr_cpu (const char *str)
12791 {
12792 const struct processor *tmp_cpu = NULL;
12793 std::string invalid_extension;
12794 enum aarch64_parse_opt_result parse_res
12795 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12796
12797 if (parse_res == AARCH64_PARSE_OK)
12798 {
12799 gcc_assert (tmp_cpu);
12800 selected_tune = tmp_cpu;
12801 explicit_tune_core = selected_tune->ident;
12802
12803 selected_arch = &all_architectures[tmp_cpu->arch];
12804 explicit_arch = selected_arch->arch;
12805 return true;
12806 }
12807
12808 switch (parse_res)
12809 {
12810 case AARCH64_PARSE_MISSING_ARG:
12811 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12812 break;
12813 case AARCH64_PARSE_INVALID_ARG:
12814 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12815 aarch64_print_hint_for_core (str);
12816 break;
12817 case AARCH64_PARSE_INVALID_FEATURE:
12818 error ("invalid feature modifier %s of value (\"%s\") in "
12819 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12820 aarch64_print_hint_for_extensions (invalid_extension);
12821 break;
12822 default:
12823 gcc_unreachable ();
12824 }
12825
12826 return false;
12827 }
12828
12829 /* Handle the argument STR to the branch-protection= attribute. */
12830
12831 static bool
12832 aarch64_handle_attr_branch_protection (const char* str)
12833 {
12834 char *err_str = (char *) xmalloc (strlen (str) + 1);
12835 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12836 &err_str);
12837 bool success = false;
12838 switch (res)
12839 {
12840 case AARCH64_PARSE_MISSING_ARG:
12841 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12842 " attribute");
12843 break;
12844 case AARCH64_PARSE_INVALID_ARG:
12845 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12846 "=\")%> pragma or attribute", err_str);
12847 break;
12848 case AARCH64_PARSE_OK:
12849 success = true;
12850 /* Fall through. */
12851 case AARCH64_PARSE_INVALID_FEATURE:
12852 break;
12853 default:
12854 gcc_unreachable ();
12855 }
12856 free (err_str);
12857 return success;
12858 }
12859
12860 /* Handle the argument STR to the tune= target attribute. */
12861
12862 static bool
12863 aarch64_handle_attr_tune (const char *str)
12864 {
12865 const struct processor *tmp_tune = NULL;
12866 enum aarch64_parse_opt_result parse_res
12867 = aarch64_parse_tune (str, &tmp_tune);
12868
12869 if (parse_res == AARCH64_PARSE_OK)
12870 {
12871 gcc_assert (tmp_tune);
12872 selected_tune = tmp_tune;
12873 explicit_tune_core = selected_tune->ident;
12874 return true;
12875 }
12876
12877 switch (parse_res)
12878 {
12879 case AARCH64_PARSE_INVALID_ARG:
12880 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12881 aarch64_print_hint_for_core (str);
12882 break;
12883 default:
12884 gcc_unreachable ();
12885 }
12886
12887 return false;
12888 }
12889
12890 /* Parse an architecture extensions target attribute string specified in STR.
12891 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12892 if successful. Update aarch64_isa_flags to reflect the ISA features
12893 modified. */
12894
12895 static bool
12896 aarch64_handle_attr_isa_flags (char *str)
12897 {
12898 enum aarch64_parse_opt_result parse_res;
12899 uint64_t isa_flags = aarch64_isa_flags;
12900
12901 /* We allow "+nothing" in the beginning to clear out all architectural
12902 features if the user wants to handpick specific features. */
12903 if (strncmp ("+nothing", str, 8) == 0)
12904 {
12905 isa_flags = 0;
12906 str += 8;
12907 }
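/* For example (illustrative only), a declaration such as

     __attribute__ ((target ("+nothing+fp"))) void f (void);

   starts from an empty feature set and then enables just the "fp"
   extension (plus whatever it implies), instead of adding to the
   features already implied by -march/-mcpu.  */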
12908
12909 std::string invalid_extension;
12910 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12911
12912 if (parse_res == AARCH64_PARSE_OK)
12913 {
12914 aarch64_isa_flags = isa_flags;
12915 return true;
12916 }
12917
12918 switch (parse_res)
12919 {
12920 case AARCH64_PARSE_MISSING_ARG:
12921 error ("missing value in %<target()%> pragma or attribute");
12922 break;
12923
12924 case AARCH64_PARSE_INVALID_FEATURE:
12925 error ("invalid feature modifier %s of value (\"%s\") in "
12926 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12927 break;
12928
12929 default:
12930 gcc_unreachable ();
12931 }
12932
12933 return false;
12934 }
12935
12936 /* The target attributes that we support. On top of these we also support just
12937 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12938 handled explicitly in aarch64_process_one_target_attr. */
12939
12940 static const struct aarch64_attribute_info aarch64_attributes[] =
12941 {
12942 { "general-regs-only", aarch64_attr_mask, false, NULL,
12943 OPT_mgeneral_regs_only },
12944 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12945 OPT_mfix_cortex_a53_835769 },
12946 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12947 OPT_mfix_cortex_a53_843419 },
12948 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12949 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12950 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12951 OPT_momit_leaf_frame_pointer },
12952 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12953 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12954 OPT_march_ },
12955 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12956 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12957 OPT_mtune_ },
12958 { "branch-protection", aarch64_attr_custom, false,
12959 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12960 { "sign-return-address", aarch64_attr_enum, false, NULL,
12961 OPT_msign_return_address_ },
12962 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12963 };
12964
12965 /* Parse ARG_STR, which contains the definition of one target attribute.
12966 Show appropriate errors if any, and return true if the attribute is valid. */
12967
12968 static bool
12969 aarch64_process_one_target_attr (char *arg_str)
12970 {
12971 bool invert = false;
12972
12973 size_t len = strlen (arg_str);
12974
12975 if (len == 0)
12976 {
12977 error ("malformed %<target()%> pragma or attribute");
12978 return false;
12979 }
12980
12981 char *str_to_check = (char *) alloca (len + 1);
12982 strcpy (str_to_check, arg_str);
12983
12984 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12985 It is easier to detect and handle it explicitly here rather than going
12986 through the machinery for the rest of the target attributes in this
12987 function. */
12988 if (*str_to_check == '+')
12989 return aarch64_handle_attr_isa_flags (str_to_check);
12990
12991 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12992 {
12993 invert = true;
12994 str_to_check += 3;
12995 }
12996 char *arg = strchr (str_to_check, '=');
12997
12998 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12999 and point ARG to "foo". */
13000 if (arg)
13001 {
13002 *arg = '\0';
13003 arg++;
13004 }
13005 const struct aarch64_attribute_info *p_attr;
13006 bool found = false;
13007 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13008 {
13009 /* If the names don't match up, or the user has given an argument
13010 to an attribute that doesn't accept one, or didn't give an argument
13011 to an attribute that expects one, fail to match. */
13012 if (strcmp (str_to_check, p_attr->name) != 0)
13013 continue;
13014
13015 found = true;
13016 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13017 || p_attr->attr_type == aarch64_attr_enum;
13018
13019 if (attr_need_arg_p ^ (arg != NULL))
13020 {
13021 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13022 return false;
13023 }
13024
13025 /* If the name matches but the attribute does not allow "no-" versions
13026 then we can't match. */
13027 if (invert && !p_attr->allow_neg)
13028 {
13029 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13030 return false;
13031 }
13032
13033 switch (p_attr->attr_type)
13034 {
13035 /* Has a custom handler registered.
13036 For example, cpu=, arch=, tune=. */
13037 case aarch64_attr_custom:
13038 gcc_assert (p_attr->handler);
13039 if (!p_attr->handler (arg))
13040 return false;
13041 break;
13042
13043 /* Either set or unset a boolean option. */
13044 case aarch64_attr_bool:
13045 {
13046 struct cl_decoded_option decoded;
13047
13048 generate_option (p_attr->opt_num, NULL, !invert,
13049 CL_TARGET, &decoded);
13050 aarch64_handle_option (&global_options, &global_options_set,
13051 &decoded, input_location);
13052 break;
13053 }
13054 /* Set or unset a bit in the target_flags. aarch64_handle_option
13055 should know what mask to apply given the option number. */
13056 case aarch64_attr_mask:
13057 {
13058 struct cl_decoded_option decoded;
13059 /* We only need to specify the option number.
13060 aarch64_handle_option will know which mask to apply. */
13061 decoded.opt_index = p_attr->opt_num;
13062 decoded.value = !invert;
13063 aarch64_handle_option (&global_options, &global_options_set,
13064 &decoded, input_location);
13065 break;
13066 }
13067 /* Use the option setting machinery to set an option to an enum. */
13068 case aarch64_attr_enum:
13069 {
13070 gcc_assert (arg);
13071 bool valid;
13072 int value;
13073 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13074 &value, CL_TARGET);
13075 if (valid)
13076 {
13077 set_option (&global_options, NULL, p_attr->opt_num, value,
13078 NULL, DK_UNSPECIFIED, input_location,
13079 global_dc);
13080 }
13081 else
13082 {
13083 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13084 }
13085 break;
13086 }
13087 default:
13088 gcc_unreachable ();
13089 }
13090 }
13091
13092 /* If we reached here we either have found an attribute and validated
13093 it or didn't match any. If we matched an attribute but its arguments
13094 were malformed we will have returned false already. */
13095 return found;
13096 }
13097
13098 /* Count how many times the character C appears in
13099 NULL-terminated string STR. */
13100
13101 static unsigned int
13102 num_occurences_in_str (char c, char *str)
13103 {
13104 unsigned int res = 0;
13105 while (*str != '\0')
13106 {
13107 if (*str == c)
13108 res++;
13109
13110 str++;
13111 }
13112
13113 return res;
13114 }
13115
13116 /* Parse the tree in ARGS that contains the target attribute information
13117 and update the global target options space. */
13118
13119 bool
13120 aarch64_process_target_attr (tree args)
13121 {
13122 if (TREE_CODE (args) == TREE_LIST)
13123 {
13124 do
13125 {
13126 tree head = TREE_VALUE (args);
13127 if (head)
13128 {
13129 if (!aarch64_process_target_attr (head))
13130 return false;
13131 }
13132 args = TREE_CHAIN (args);
13133 } while (args);
13134
13135 return true;
13136 }
13137
13138 if (TREE_CODE (args) != STRING_CST)
13139 {
13140 error ("attribute %<target%> argument not a string");
13141 return false;
13142 }
13143
13144 size_t len = strlen (TREE_STRING_POINTER (args));
13145 char *str_to_check = (char *) alloca (len + 1);
13146 strcpy (str_to_check, TREE_STRING_POINTER (args));
13147
13148 if (len == 0)
13149 {
13150 error ("malformed %<target()%> pragma or attribute");
13151 return false;
13152 }
13153
13154 /* Used to catch empty strings between commas, e.g.
13155 attribute ((target ("attr1,,attr2"))). */
13156 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13157
13158 /* Handle multiple target attributes separated by ','. */
13159 char *token = strtok_r (str_to_check, ",", &str_to_check);
13160
13161 unsigned int num_attrs = 0;
13162 while (token)
13163 {
13164 num_attrs++;
13165 if (!aarch64_process_one_target_attr (token))
13166 {
13167 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13168 return false;
13169 }
13170
13171 token = strtok_r (NULL, ",", &str_to_check);
13172 }
13173
13174 if (num_attrs != num_commas + 1)
13175 {
13176 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13177 return false;
13178 }
13179
13180 return true;
13181 }
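/* Note (added commentary): strtok_r collapses consecutive delimiters, so
   for "attr1,,attr2" it produces only two tokens even though the string
   contains two commas; the num_attrs != num_commas + 1 check above is
   what rejects the empty entry.  A minimal standalone sketch of the same
   technique, using hypothetical names that are not part of GCC:

     static bool
     tokens_match_commas (char *s)
     {
       unsigned int commas = 0, tokens = 0;
       for (char *p = s; *p != '\0'; p++)
         if (*p == ',')
           commas++;
       char *save;
       for (char *tok = strtok_r (s, ",", &save); tok;
            tok = strtok_r (NULL, ",", &save))
         tokens++;
       return tokens == commas + 1;
     }  */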
13182
13183 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13184 process attribute ((target ("..."))). */
13185
13186 static bool
13187 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13188 {
13189 struct cl_target_option cur_target;
13190 bool ret;
13191 tree old_optimize;
13192 tree new_target, new_optimize;
13193 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13194
13195 /* If what we're processing is the current pragma string then the
13196 target option node is already stored in target_option_current_node
13197 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13198 having to re-parse the string. This is especially useful to keep
13199 arm_neon.h compile times down since that header contains a lot
13200 of intrinsics enclosed in pragmas. */
13201 if (!existing_target && args == current_target_pragma)
13202 {
13203 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13204 return true;
13205 }
13206 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13207
13208 old_optimize = build_optimization_node (&global_options);
13209 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13210
13211 /* If the function changed the optimization levels as well as setting
13212 target options, start with the optimizations specified. */
13213 if (func_optimize && func_optimize != old_optimize)
13214 cl_optimization_restore (&global_options,
13215 TREE_OPTIMIZATION (func_optimize));
13216
13217 /* Save the current target options to restore at the end. */
13218 cl_target_option_save (&cur_target, &global_options);
13219
13220 /* If fndecl already has some target attributes applied to it, unpack
13221 them so that we add this attribute on top of them, rather than
13222 overwriting them. */
13223 if (existing_target)
13224 {
13225 struct cl_target_option *existing_options
13226 = TREE_TARGET_OPTION (existing_target);
13227
13228 if (existing_options)
13229 cl_target_option_restore (&global_options, existing_options);
13230 }
13231 else
13232 cl_target_option_restore (&global_options,
13233 TREE_TARGET_OPTION (target_option_current_node));
13234
13235 ret = aarch64_process_target_attr (args);
13236
13237 /* Set up any additional state. */
13238 if (ret)
13239 {
13240 aarch64_override_options_internal (&global_options);
13241 /* Initialize SIMD builtins if we haven't already.
13242 Set current_target_pragma to NULL for the duration so that
13243 the builtin initialization code doesn't try to tag the functions
13244 being built with the attributes specified by any current pragma, thus
13245 going into an infinite recursion. */
13246 if (TARGET_SIMD)
13247 {
13248 tree saved_current_target_pragma = current_target_pragma;
13249 current_target_pragma = NULL;
13250 aarch64_init_simd_builtins ();
13251 current_target_pragma = saved_current_target_pragma;
13252 }
13253 new_target = build_target_option_node (&global_options);
13254 }
13255 else
13256 new_target = NULL;
13257
13258 new_optimize = build_optimization_node (&global_options);
13259
13260 if (fndecl && ret)
13261 {
13262 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13263
13264 if (old_optimize != new_optimize)
13265 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13266 }
13267
13268 cl_target_option_restore (&global_options, &cur_target);
13269
13270 if (old_optimize != new_optimize)
13271 cl_optimization_restore (&global_options,
13272 TREE_OPTIMIZATION (old_optimize));
13273 return ret;
13274 }
13275
13276 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13277 tri-bool options (yes, no, don't care) and the default value is
13278 DEF, determine whether to reject inlining. */
13279
13280 static bool
13281 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13282 int dont_care, int def)
13283 {
13284 /* If the callee doesn't care, always allow inlining. */
13285 if (callee == dont_care)
13286 return true;
13287
13288 /* If the caller doesn't care, always allow inlining. */
13289 if (caller == dont_care)
13290 return true;
13291
13292 /* Otherwise, allow inlining if either the callee and caller values
13293 agree, or if the callee is using the default value. */
13294 return (callee == caller || callee == def);
13295 }
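/* For example (illustrative only), with DONT_CARE == 2:
   caller == 1, callee == 2             -> inline (callee doesn't care);
   caller == 2, callee == 0             -> inline (caller doesn't care);
   caller == 0, callee == 1, DEF == 1   -> inline (callee uses the default);
   caller == 0, callee == 1, DEF == 0   -> reject (explicit mismatch).  */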
13296
13297 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13298 to inline CALLEE into CALLER based on target-specific info.
13299 Make sure that the caller and callee have compatible architectural
13300 features. Then go through the other possible target attributes
13301 and see if they can block inlining. Try not to reject always_inline
13302 callees unless they are incompatible architecturally. */
13303
13304 static bool
13305 aarch64_can_inline_p (tree caller, tree callee)
13306 {
13307 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13308 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13309
13310 struct cl_target_option *caller_opts
13311 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13312 : target_option_default_node);
13313
13314 struct cl_target_option *callee_opts
13315 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13316 : target_option_default_node);
13317
13318 /* Callee's ISA flags should be a subset of the caller's. */
13319 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13320 != callee_opts->x_aarch64_isa_flags)
13321 return false;
13322
13323 /* Allow non-strict aligned functions inlining into strict
13324 aligned ones. */
13325 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13326 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13327 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13328 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13329 return false;
13330
13331 bool always_inline = lookup_attribute ("always_inline",
13332 DECL_ATTRIBUTES (callee));
13333
13334 /* If the architectural features match up and the callee is always_inline
13335 then the other attributes don't matter. */
13336 if (always_inline)
13337 return true;
13338
13339 if (caller_opts->x_aarch64_cmodel_var
13340 != callee_opts->x_aarch64_cmodel_var)
13341 return false;
13342
13343 if (caller_opts->x_aarch64_tls_dialect
13344 != callee_opts->x_aarch64_tls_dialect)
13345 return false;
13346
13347 /* Honour explicit requests to work around errata. */
13348 if (!aarch64_tribools_ok_for_inlining_p (
13349 caller_opts->x_aarch64_fix_a53_err835769,
13350 callee_opts->x_aarch64_fix_a53_err835769,
13351 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13352 return false;
13353
13354 if (!aarch64_tribools_ok_for_inlining_p (
13355 caller_opts->x_aarch64_fix_a53_err843419,
13356 callee_opts->x_aarch64_fix_a53_err843419,
13357 2, TARGET_FIX_ERR_A53_843419))
13358 return false;
13359
13360 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13361 caller and callee and they don't match up, reject inlining. */
13362 if (!aarch64_tribools_ok_for_inlining_p (
13363 caller_opts->x_flag_omit_leaf_frame_pointer,
13364 callee_opts->x_flag_omit_leaf_frame_pointer,
13365 2, 1))
13366 return false;
13367
13368 /* If the callee has specific tuning overrides, respect them. */
13369 if (callee_opts->x_aarch64_override_tune_string != NULL
13370 && caller_opts->x_aarch64_override_tune_string == NULL)
13371 return false;
13372
13373 /* If the user specified tuning override strings for the
13374 caller and callee and they don't match up, reject inlining.
13375 We just do a string compare here, we don't analyze the meaning
13376 of the string, as it would be too costly for little gain. */
13377 if (callee_opts->x_aarch64_override_tune_string
13378 && caller_opts->x_aarch64_override_tune_string
13379 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13380 caller_opts->x_aarch64_override_tune_string) != 0))
13381 return false;
13382
13383 return true;
13384 }
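/* An illustrative consequence of the subset check above (added
   commentary): a callee declared with, say,
   __attribute__ ((target ("+crc"))) can be inlined into a caller built
   with -march=armv8-a+crc (or carrying the same attribute), because the
   callee's ISA flags are then a subset of the caller's, but not into a
   caller built with plain -march=armv8-a.  */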
13385
13386 /* Return true if SYMBOL_REF X binds locally. */
13387
13388 static bool
13389 aarch64_symbol_binds_local_p (const_rtx x)
13390 {
13391 return (SYMBOL_REF_DECL (x)
13392 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13393 : SYMBOL_REF_LOCAL_P (x));
13394 }
13395
13396 /* Return true if SYMBOL_REF X is thread-local. */
13397 static bool
13398 aarch64_tls_symbol_p (rtx x)
13399 {
13400 if (! TARGET_HAVE_TLS)
13401 return false;
13402
13403 if (GET_CODE (x) != SYMBOL_REF)
13404 return false;
13405
13406 return SYMBOL_REF_TLS_MODEL (x) != 0;
13407 }
13408
13409 /* Classify a TLS symbol into one of the TLS kinds. */
13410 enum aarch64_symbol_type
13411 aarch64_classify_tls_symbol (rtx x)
13412 {
13413 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13414
13415 switch (tls_kind)
13416 {
13417 case TLS_MODEL_GLOBAL_DYNAMIC:
13418 case TLS_MODEL_LOCAL_DYNAMIC:
13419 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13420
13421 case TLS_MODEL_INITIAL_EXEC:
13422 switch (aarch64_cmodel)
13423 {
13424 case AARCH64_CMODEL_TINY:
13425 case AARCH64_CMODEL_TINY_PIC:
13426 return SYMBOL_TINY_TLSIE;
13427 default:
13428 return SYMBOL_SMALL_TLSIE;
13429 }
13430
13431 case TLS_MODEL_LOCAL_EXEC:
13432 if (aarch64_tls_size == 12)
13433 return SYMBOL_TLSLE12;
13434 else if (aarch64_tls_size == 24)
13435 return SYMBOL_TLSLE24;
13436 else if (aarch64_tls_size == 32)
13437 return SYMBOL_TLSLE32;
13438 else if (aarch64_tls_size == 48)
13439 return SYMBOL_TLSLE48;
13440 else
13441 gcc_unreachable ();
13442
13443 case TLS_MODEL_EMULATED:
13444 case TLS_MODEL_NONE:
13445 return SYMBOL_FORCE_TO_MEM;
13446
13447 default:
13448 gcc_unreachable ();
13449 }
13450 }
13451
13452 /* Return the correct method for accessing X + OFFSET, where X is either
13453 a SYMBOL_REF or LABEL_REF. */
13454
13455 enum aarch64_symbol_type
13456 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13457 {
13458 if (GET_CODE (x) == LABEL_REF)
13459 {
13460 switch (aarch64_cmodel)
13461 {
13462 case AARCH64_CMODEL_LARGE:
13463 return SYMBOL_FORCE_TO_MEM;
13464
13465 case AARCH64_CMODEL_TINY_PIC:
13466 case AARCH64_CMODEL_TINY:
13467 return SYMBOL_TINY_ABSOLUTE;
13468
13469 case AARCH64_CMODEL_SMALL_SPIC:
13470 case AARCH64_CMODEL_SMALL_PIC:
13471 case AARCH64_CMODEL_SMALL:
13472 return SYMBOL_SMALL_ABSOLUTE;
13473
13474 default:
13475 gcc_unreachable ();
13476 }
13477 }
13478
13479 if (GET_CODE (x) == SYMBOL_REF)
13480 {
13481 if (aarch64_tls_symbol_p (x))
13482 return aarch64_classify_tls_symbol (x);
13483
13484 switch (aarch64_cmodel)
13485 {
13486 case AARCH64_CMODEL_TINY:
13487 /* When we retrieve symbol + offset address, we have to make sure
13488 the offset does not cause overflow of the final address. But
13489 we have no way of knowing the address of the symbol at compile time,
13490 so we can't accurately say if the distance between the PC and
13491 symbol + offset is outside the addressable range of +/-1M in the
13492 TINY code model. So we rely on images not being greater than
13493 1M, cap the offset at 1M, and require anything beyond 1M to
13494 be loaded using an alternative mechanism. Furthermore, if the
13495 symbol is a weak reference to something that isn't known to
13496 resolve to a symbol in this module, then force to memory. */
13497 if ((SYMBOL_REF_WEAK (x)
13498 && !aarch64_symbol_binds_local_p (x))
13499 || !IN_RANGE (offset, -1048575, 1048575))
13500 return SYMBOL_FORCE_TO_MEM;
13501 return SYMBOL_TINY_ABSOLUTE;
13502
13503 case AARCH64_CMODEL_SMALL:
13504 /* Same reasoning as the tiny code model, but the offset cap here is
13505 4G. */
13506 if ((SYMBOL_REF_WEAK (x)
13507 && !aarch64_symbol_binds_local_p (x))
13508 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13509 HOST_WIDE_INT_C (4294967264)))
13510 return SYMBOL_FORCE_TO_MEM;
13511 return SYMBOL_SMALL_ABSOLUTE;
13512
13513 case AARCH64_CMODEL_TINY_PIC:
13514 if (!aarch64_symbol_binds_local_p (x))
13515 return SYMBOL_TINY_GOT;
13516 return SYMBOL_TINY_ABSOLUTE;
13517
13518 case AARCH64_CMODEL_SMALL_SPIC:
13519 case AARCH64_CMODEL_SMALL_PIC:
13520 if (!aarch64_symbol_binds_local_p (x))
13521 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13522 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13523 return SYMBOL_SMALL_ABSOLUTE;
13524
13525 case AARCH64_CMODEL_LARGE:
13526 /* This is alright even in PIC code as the constant
13527 pool reference is always PC relative and within
13528 the same translation unit. */
13529 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13530 return SYMBOL_SMALL_ABSOLUTE;
13531 else
13532 return SYMBOL_FORCE_TO_MEM;
13533
13534 default:
13535 gcc_unreachable ();
13536 }
13537 }
13538
13539 /* By default push everything into the constant pool. */
13540 return SYMBOL_FORCE_TO_MEM;
13541 }
13542
13543 bool
13544 aarch64_constant_address_p (rtx x)
13545 {
13546 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13547 }
13548
13549 bool
13550 aarch64_legitimate_pic_operand_p (rtx x)
13551 {
13552 if (GET_CODE (x) == SYMBOL_REF
13553 || (GET_CODE (x) == CONST
13554 && GET_CODE (XEXP (x, 0)) == PLUS
13555 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13556 return false;
13557
13558 return true;
13559 }
13560
13561 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13562 that should be rematerialized rather than spilled. */
13563
13564 static bool
13565 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13566 {
13567 /* Support CSE and rematerialization of common constants. */
13568 if (CONST_INT_P (x)
13569 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13570 || GET_CODE (x) == CONST_VECTOR)
13571 return true;
13572
13573 /* Do not allow vector struct mode constants for Advanced SIMD.
13574 We could support 0 and -1 easily, but they need support in
13575 aarch64-simd.md. */
13576 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13577 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13578 return false;
13579
13580 /* Only accept variable-length vector constants if they can be
13581 handled directly.
13582
13583 ??? It would be possible to handle rematerialization of other
13584 constants via secondary reloads. */
13585 if (vec_flags & VEC_ANY_SVE)
13586 return aarch64_simd_valid_immediate (x, NULL);
13587
13588 if (GET_CODE (x) == HIGH)
13589 x = XEXP (x, 0);
13590
13591 /* Accept polynomial constants that can be calculated by using the
13592 destination of a move as the sole temporary. Constants that
13593 require a second temporary cannot be rematerialized (they can't be
13594 forced to memory and also aren't legitimate constants). */
13595 poly_int64 offset;
13596 if (poly_int_rtx_p (x, &offset))
13597 return aarch64_offset_temporaries (false, offset) <= 1;
13598
13599 /* If an offset is being added to something else, we need to allow the
13600 base to be moved into the destination register, meaning that there
13601 are no free temporaries for the offset. */
13602 x = strip_offset (x, &offset);
13603 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13604 return false;
13605
13606 /* Do not allow const (plus (anchor_symbol, const_int)). */
13607 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13608 return false;
13609
13610 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13611 so spilling them is better than rematerialization. */
13612 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13613 return true;
13614
13615 /* Label references are always constant. */
13616 if (GET_CODE (x) == LABEL_REF)
13617 return true;
13618
13619 return false;
13620 }
13621
13622 rtx
13623 aarch64_load_tp (rtx target)
13624 {
13625 if (!target
13626 || GET_MODE (target) != Pmode
13627 || !register_operand (target, Pmode))
13628 target = gen_reg_rtx (Pmode);
13629
13630 /* Can return in any reg. */
13631 emit_insn (gen_aarch64_load_tp_hard (target));
13632 return target;
13633 }
13634
13635 /* On AAPCS systems, this is the "struct __va_list". */
13636 static GTY(()) tree va_list_type;
13637
13638 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13639 Return the type to use as __builtin_va_list.
13640
13641 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13642
13643 struct __va_list
13644 {
13645 void *__stack;
13646 void *__gr_top;
13647 void *__vr_top;
13648 int __gr_offs;
13649 int __vr_offs;
13650 }; */
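/* Added commentary on the field semantics, as set up by
   aarch64_expand_builtin_va_start below: __stack points to the next
   anonymous argument passed on the stack; __gr_top and __vr_top point
   just past the general-register and vector-register save areas; and
   __gr_offs/__vr_offs hold the (negative) offset from the corresponding
   top to the next unconsumed register argument, increasing towards zero
   as va_arg consumes arguments.  */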
13651
13652 static tree
13653 aarch64_build_builtin_va_list (void)
13654 {
13655 tree va_list_name;
13656 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13657
13658 /* Create the type. */
13659 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13660 /* Give it the required name. */
13661 va_list_name = build_decl (BUILTINS_LOCATION,
13662 TYPE_DECL,
13663 get_identifier ("__va_list"),
13664 va_list_type);
13665 DECL_ARTIFICIAL (va_list_name) = 1;
13666 TYPE_NAME (va_list_type) = va_list_name;
13667 TYPE_STUB_DECL (va_list_type) = va_list_name;
13668
13669 /* Create the fields. */
13670 f_stack = build_decl (BUILTINS_LOCATION,
13671 FIELD_DECL, get_identifier ("__stack"),
13672 ptr_type_node);
13673 f_grtop = build_decl (BUILTINS_LOCATION,
13674 FIELD_DECL, get_identifier ("__gr_top"),
13675 ptr_type_node);
13676 f_vrtop = build_decl (BUILTINS_LOCATION,
13677 FIELD_DECL, get_identifier ("__vr_top"),
13678 ptr_type_node);
13679 f_groff = build_decl (BUILTINS_LOCATION,
13680 FIELD_DECL, get_identifier ("__gr_offs"),
13681 integer_type_node);
13682 f_vroff = build_decl (BUILTINS_LOCATION,
13683 FIELD_DECL, get_identifier ("__vr_offs"),
13684 integer_type_node);
13685
13686 /* Tell the tree-stdarg pass about our internal offset fields.
13687 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13688 purposes, to identify whether the code is updating the va_list internal
13689 offset fields in an irregular way. */
13690 va_list_gpr_counter_field = f_groff;
13691 va_list_fpr_counter_field = f_vroff;
13692
13693 DECL_ARTIFICIAL (f_stack) = 1;
13694 DECL_ARTIFICIAL (f_grtop) = 1;
13695 DECL_ARTIFICIAL (f_vrtop) = 1;
13696 DECL_ARTIFICIAL (f_groff) = 1;
13697 DECL_ARTIFICIAL (f_vroff) = 1;
13698
13699 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13700 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13701 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13702 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13703 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13704
13705 TYPE_FIELDS (va_list_type) = f_stack;
13706 DECL_CHAIN (f_stack) = f_grtop;
13707 DECL_CHAIN (f_grtop) = f_vrtop;
13708 DECL_CHAIN (f_vrtop) = f_groff;
13709 DECL_CHAIN (f_groff) = f_vroff;
13710
13711 /* Compute its layout. */
13712 layout_type (va_list_type);
13713
13714 return va_list_type;
13715 }
13716
13717 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13718 static void
13719 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13720 {
13721 const CUMULATIVE_ARGS *cum;
13722 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13723 tree stack, grtop, vrtop, groff, vroff;
13724 tree t;
13725 int gr_save_area_size = cfun->va_list_gpr_size;
13726 int vr_save_area_size = cfun->va_list_fpr_size;
13727 int vr_offset;
13728
13729 cum = &crtl->args.info;
13730 if (cfun->va_list_gpr_size)
13731 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13732 cfun->va_list_gpr_size);
13733 if (cfun->va_list_fpr_size)
13734 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13735 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13736
13737 if (!TARGET_FLOAT)
13738 {
13739 gcc_assert (cum->aapcs_nvrn == 0);
13740 vr_save_area_size = 0;
13741 }
13742
13743 f_stack = TYPE_FIELDS (va_list_type_node);
13744 f_grtop = DECL_CHAIN (f_stack);
13745 f_vrtop = DECL_CHAIN (f_grtop);
13746 f_groff = DECL_CHAIN (f_vrtop);
13747 f_vroff = DECL_CHAIN (f_groff);
13748
13749 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13750 NULL_TREE);
13751 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13752 NULL_TREE);
13753 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13754 NULL_TREE);
13755 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13756 NULL_TREE);
13757 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13758 NULL_TREE);
13759
13760 /* Emit code to initialize STACK, which points to the next varargs stack
13761 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13762 by named arguments. STACK is 8-byte aligned. */
13763 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13764 if (cum->aapcs_stack_size > 0)
13765 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13766 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13767 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13768
13769 /* Emit code to initialize GRTOP, the top of the GR save area.
13770 virtual_incoming_args_rtx should have been 16 byte aligned. */
13771 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13772 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13773 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13774
13775 /* Emit code to initialize VRTOP, the top of the VR save area.
13776 This address is gr_save_area_bytes below GRTOP, rounded
13777 down to the next 16-byte boundary. */
13778 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13779 vr_offset = ROUND_UP (gr_save_area_size,
13780 STACK_BOUNDARY / BITS_PER_UNIT);
13781
13782 if (vr_offset)
13783 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13784 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13785 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13786
13787 /* Emit code to initialize GROFF, the offset from GRTOP of the
13788 next GPR argument. */
13789 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13790 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13791 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13792
13793 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13794 of the next VR argument. */
13795 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13796 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13797 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13798 }
13799
13800 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13801
13802 static tree
13803 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13804 gimple_seq *post_p ATTRIBUTE_UNUSED)
13805 {
13806 tree addr;
13807 bool indirect_p;
13808 bool is_ha; /* is HFA or HVA. */
13809 bool dw_align; /* double-word align. */
13810 machine_mode ag_mode = VOIDmode;
13811 int nregs;
13812 machine_mode mode;
13813
13814 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13815 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13816 HOST_WIDE_INT size, rsize, adjust, align;
13817 tree t, u, cond1, cond2;
13818
13819 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13820 if (indirect_p)
13821 type = build_pointer_type (type);
13822
13823 mode = TYPE_MODE (type);
13824
13825 f_stack = TYPE_FIELDS (va_list_type_node);
13826 f_grtop = DECL_CHAIN (f_stack);
13827 f_vrtop = DECL_CHAIN (f_grtop);
13828 f_groff = DECL_CHAIN (f_vrtop);
13829 f_vroff = DECL_CHAIN (f_groff);
13830
13831 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13832 f_stack, NULL_TREE);
13833 size = int_size_in_bytes (type);
13834
13835 bool abi_break;
13836 align
13837 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13838
13839 dw_align = false;
13840 adjust = 0;
13841 if (aarch64_vfp_is_call_or_return_candidate (mode,
13842 type,
13843 &ag_mode,
13844 &nregs,
13845 &is_ha))
13846 {
13847 /* No frontends can create types with variable-sized modes, so we
13848 shouldn't be asked to pass or return them. */
13849 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13850
13851 /* TYPE passed in fp/simd registers. */
13852 if (!TARGET_FLOAT)
13853 aarch64_err_no_fpadvsimd (mode);
13854
13855 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13856 unshare_expr (valist), f_vrtop, NULL_TREE);
13857 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13858 unshare_expr (valist), f_vroff, NULL_TREE);
13859
13860 rsize = nregs * UNITS_PER_VREG;
13861
13862 if (is_ha)
13863 {
13864 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13865 adjust = UNITS_PER_VREG - ag_size;
13866 }
13867 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13868 && size < UNITS_PER_VREG)
13869 {
13870 adjust = UNITS_PER_VREG - size;
13871 }
13872 }
13873 else
13874 {
13875 /* TYPE passed in general registers. */
13876 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13877 unshare_expr (valist), f_grtop, NULL_TREE);
13878 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13879 unshare_expr (valist), f_groff, NULL_TREE);
13880 rsize = ROUND_UP (size, UNITS_PER_WORD);
13881 nregs = rsize / UNITS_PER_WORD;
13882
13883 if (align > 8)
13884 {
13885 if (abi_break && warn_psabi)
13886 inform (input_location, "parameter passing for argument of type "
13887 "%qT changed in GCC 9.1", type);
13888 dw_align = true;
13889 }
13890
13891 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13892 && size < UNITS_PER_WORD)
13893 {
13894 adjust = UNITS_PER_WORD - size;
13895 }
13896 }
13897
13898 /* Get a local temporary for the field value. */
13899 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13900
13901 /* Emit code to branch if off >= 0. */
13902 t = build2 (GE_EXPR, boolean_type_node, off,
13903 build_int_cst (TREE_TYPE (off), 0));
13904 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13905
13906 if (dw_align)
13907 {
13908 /* Emit: offs = (offs + 15) & -16. */
13909 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13910 build_int_cst (TREE_TYPE (off), 15));
13911 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13912 build_int_cst (TREE_TYPE (off), -16));
13913 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13914 }
13915 else
13916 roundup = NULL;
13917
13918 /* Update ap.__[g|v]r_offs */
13919 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13920 build_int_cst (TREE_TYPE (off), rsize));
13921 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13922
13923 /* String up. */
13924 if (roundup)
13925 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13926
13927 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13928 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13929 build_int_cst (TREE_TYPE (f_off), 0));
13930 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13931
13932 /* String up: make sure the assignment happens before the use. */
13933 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13934 COND_EXPR_ELSE (cond1) = t;
13935
13936 /* Prepare the trees handling the argument that is passed on the stack;
13937 the top-level node will be stored in ON_STACK. */
13938 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13939 if (align > 8)
13940 {
13941 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13942 t = fold_build_pointer_plus_hwi (arg, 15);
13943 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13944 build_int_cst (TREE_TYPE (t), -16));
13945 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13946 }
13947 else
13948 roundup = NULL;
13949 /* Advance ap.__stack */
13950 t = fold_build_pointer_plus_hwi (arg, size + 7);
13951 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13952 build_int_cst (TREE_TYPE (t), -8));
13953 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13954 /* String up roundup and advance. */
13955 if (roundup)
13956 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13957 /* String up with arg */
13958 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13959 /* Big-endianness related address adjustment. */
13960 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13961 && size < UNITS_PER_WORD)
13962 {
13963 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13964 size_int (UNITS_PER_WORD - size));
13965 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13966 }
13967
13968 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13969 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13970
13971 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13972 t = off;
13973 if (adjust)
13974 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13975 build_int_cst (TREE_TYPE (off), adjust));
13976
13977 t = fold_convert (sizetype, t);
13978 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13979
13980 if (is_ha)
13981 {
13982 /* type ha; // treat as "struct {ftype field[n];}"
13983 ... [computing offs]
13984 for (i = 0; i < nregs; ++i, offs += 16)
13985 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13986 return ha; */
13987 int i;
13988 tree tmp_ha, field_t, field_ptr_t;
13989
13990 /* Declare a local variable. */
13991 tmp_ha = create_tmp_var_raw (type, "ha");
13992 gimple_add_tmp_var (tmp_ha);
13993
13994 /* Establish the base type. */
13995 switch (ag_mode)
13996 {
13997 case E_SFmode:
13998 field_t = float_type_node;
13999 field_ptr_t = float_ptr_type_node;
14000 break;
14001 case E_DFmode:
14002 field_t = double_type_node;
14003 field_ptr_t = double_ptr_type_node;
14004 break;
14005 case E_TFmode:
14006 field_t = long_double_type_node;
14007 field_ptr_t = long_double_ptr_type_node;
14008 break;
14009 case E_HFmode:
14010 field_t = aarch64_fp16_type_node;
14011 field_ptr_t = aarch64_fp16_ptr_type_node;
14012 break;
14013 case E_V2SImode:
14014 case E_V4SImode:
14015 {
14016 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14017 field_t = build_vector_type_for_mode (innertype, ag_mode);
14018 field_ptr_t = build_pointer_type (field_t);
14019 }
14020 break;
14021 default:
14022 gcc_assert (0);
14023 }
14024
14025 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
14026 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14027 addr = t;
14028 t = fold_convert (field_ptr_t, addr);
14029 t = build2 (MODIFY_EXPR, field_t,
14030 build1 (INDIRECT_REF, field_t, tmp_ha),
14031 build1 (INDIRECT_REF, field_t, t));
14032
14033 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14034 for (i = 1; i < nregs; ++i)
14035 {
14036 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14037 u = fold_convert (field_ptr_t, addr);
14038 u = build2 (MODIFY_EXPR, field_t,
14039 build2 (MEM_REF, field_t, tmp_ha,
14040 build_int_cst (field_ptr_t,
14041 (i *
14042 int_size_in_bytes (field_t)))),
14043 build1 (INDIRECT_REF, field_t, u));
14044 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14045 }
14046
14047 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14048 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14049 }
14050
14051 COND_EXPR_ELSE (cond2) = t;
14052 addr = fold_convert (build_pointer_type (type), cond1);
14053 addr = build_va_arg_indirect_ref (addr);
14054
14055 if (indirect_p)
14056 addr = build_va_arg_indirect_ref (addr);
14057
14058 return addr;
14059 }
14060
14061 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14062
14063 static void
14064 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14065 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14066 int no_rtl)
14067 {
14068 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14069 CUMULATIVE_ARGS local_cum;
14070 int gr_saved = cfun->va_list_gpr_size;
14071 int vr_saved = cfun->va_list_fpr_size;
14072
14073 /* The caller has advanced CUM up to, but not beyond, the last named
14074 argument. Advance a local copy of CUM past the last "real" named
14075 argument, to find out how many registers are left over. */
14076 local_cum = *cum;
14077 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14078
14079 /* Find out how many registers we need to save.
14080 Honor the tree-stdarg analysis results. */
14081 if (cfun->va_list_gpr_size)
14082 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14083 cfun->va_list_gpr_size / UNITS_PER_WORD);
14084 if (cfun->va_list_fpr_size)
14085 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14086 cfun->va_list_fpr_size / UNITS_PER_VREG);
14087
14088 if (!TARGET_FLOAT)
14089 {
14090 gcc_assert (local_cum.aapcs_nvrn == 0);
14091 vr_saved = 0;
14092 }
14093
14094 if (!no_rtl)
14095 {
14096 if (gr_saved > 0)
14097 {
14098 rtx ptr, mem;
14099
14100 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14101 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14102 - gr_saved * UNITS_PER_WORD);
14103 mem = gen_frame_mem (BLKmode, ptr);
14104 set_mem_alias_set (mem, get_varargs_alias_set ());
14105
14106 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14107 mem, gr_saved);
14108 }
14109 if (vr_saved > 0)
14110 {
14111 /* We can't use move_block_from_reg, because it will use
14112 the wrong mode, storing D regs only. */
14113 machine_mode mode = TImode;
14114 int off, i, vr_start;
14115
14116 /* Set OFF to the offset from virtual_incoming_args_rtx of
14117 the first vector register. The VR save area lies below
14118 the GR one, and is aligned to 16 bytes. */
14119 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14120 STACK_BOUNDARY / BITS_PER_UNIT);
14121 off -= vr_saved * UNITS_PER_VREG;
14122
14123 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14124 for (i = 0; i < vr_saved; ++i)
14125 {
14126 rtx ptr, mem;
14127
14128 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14129 mem = gen_frame_mem (mode, ptr);
14130 set_mem_alias_set (mem, get_varargs_alias_set ());
14131 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14132 off += UNITS_PER_VREG;
14133 }
14134 }
14135 }
14136
14137 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14138 any complication of having crtl->args.pretend_args_size changed. */
14139 cfun->machine->frame.saved_varargs_size
14140 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14141 STACK_BOUNDARY / BITS_PER_UNIT)
14142 + vr_saved * UNITS_PER_VREG);
14143 }
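/* A worked example of the size computation above (illustrative only),
   using the usual AArch64 values UNITS_PER_WORD == 8, UNITS_PER_VREG == 16
   and a 16-byte STACK_BOUNDARY: with gr_saved == 3 and vr_saved == 2, the
   GR area takes ROUND_UP (3 * 8, 16) == 32 bytes and the VR area takes
   2 * 16 == 32 bytes, so saved_varargs_size is 64.  */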
14144
14145 static void
14146 aarch64_conditional_register_usage (void)
14147 {
14148 int i;
14149 if (!TARGET_FLOAT)
14150 {
14151 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14152 {
14153 fixed_regs[i] = 1;
14154 call_used_regs[i] = 1;
14155 }
14156 }
14157 if (!TARGET_SVE)
14158 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14159 {
14160 fixed_regs[i] = 1;
14161 call_used_regs[i] = 1;
14162 }
14163
14164 /* When tracking speculation, we need a couple of call-clobbered registers
14165 to track the speculation state. It would be nice to just use
14166 IP0 and IP1, but currently there are numerous places that just
14167 assume these registers are free for other uses (e.g. pointer
14168 authentication). */
14169 if (aarch64_track_speculation)
14170 {
14171 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14172 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14173 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14174 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14175 }
14176 }
14177
14178 /* Walk down the type tree of TYPE counting consecutive base elements.
14179 If *MODEP is VOIDmode, then set it to the first valid floating point
14180 type. If a non-floating point type is found, or if a floating point
14181 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14182 otherwise return the count in the sub-tree. */
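/* For example (illustrative only), for

     struct hfa { float a; float b[2]; };

   the walk below finds three consecutive SFmode elements with no padding,
   so it returns 3 and the struct qualifies as a homogeneous floating-point
   aggregate for the AAPCS64 purposes handled here.  */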
14183 static int
14184 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14185 {
14186 machine_mode mode;
14187 HOST_WIDE_INT size;
14188
14189 switch (TREE_CODE (type))
14190 {
14191 case REAL_TYPE:
14192 mode = TYPE_MODE (type);
14193 if (mode != DFmode && mode != SFmode
14194 && mode != TFmode && mode != HFmode)
14195 return -1;
14196
14197 if (*modep == VOIDmode)
14198 *modep = mode;
14199
14200 if (*modep == mode)
14201 return 1;
14202
14203 break;
14204
14205 case COMPLEX_TYPE:
14206 mode = TYPE_MODE (TREE_TYPE (type));
14207 if (mode != DFmode && mode != SFmode
14208 && mode != TFmode && mode != HFmode)
14209 return -1;
14210
14211 if (*modep == VOIDmode)
14212 *modep = mode;
14213
14214 if (*modep == mode)
14215 return 2;
14216
14217 break;
14218
14219 case VECTOR_TYPE:
14220 /* Use V2SImode and V4SImode as representatives of all 64-bit
14221 and 128-bit vector types. */
14222 size = int_size_in_bytes (type);
14223 switch (size)
14224 {
14225 case 8:
14226 mode = V2SImode;
14227 break;
14228 case 16:
14229 mode = V4SImode;
14230 break;
14231 default:
14232 return -1;
14233 }
14234
14235 if (*modep == VOIDmode)
14236 *modep = mode;
14237
14238 /* Vector modes are considered to be opaque: two vectors are
14239 equivalent for the purposes of being homogeneous aggregates
14240 if they are the same size. */
14241 if (*modep == mode)
14242 return 1;
14243
14244 break;
14245
14246 case ARRAY_TYPE:
14247 {
14248 int count;
14249 tree index = TYPE_DOMAIN (type);
14250
14251 /* Can't handle incomplete types nor sizes that are not
14252 fixed. */
14253 if (!COMPLETE_TYPE_P (type)
14254 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14255 return -1;
14256
14257 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14258 if (count == -1
14259 || !index
14260 || !TYPE_MAX_VALUE (index)
14261 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14262 || !TYPE_MIN_VALUE (index)
14263 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14264 || count < 0)
14265 return -1;
14266
14267 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14268 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14269
14270 /* There must be no padding. */
14271 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14272 count * GET_MODE_BITSIZE (*modep)))
14273 return -1;
14274
14275 return count;
14276 }
14277
14278 case RECORD_TYPE:
14279 {
14280 int count = 0;
14281 int sub_count;
14282 tree field;
14283
14284 /* Can't handle incomplete types nor sizes that are not
14285 fixed. */
14286 if (!COMPLETE_TYPE_P (type)
14287 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14288 return -1;
14289
14290 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14291 {
14292 if (TREE_CODE (field) != FIELD_DECL)
14293 continue;
14294
14295 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14296 if (sub_count < 0)
14297 return -1;
14298 count += sub_count;
14299 }
14300
14301 /* There must be no padding. */
14302 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14303 count * GET_MODE_BITSIZE (*modep)))
14304 return -1;
14305
14306 return count;
14307 }
14308
14309 case UNION_TYPE:
14310 case QUAL_UNION_TYPE:
14311 {
14312 /* These aren't very interesting except in a degenerate case. */
14313 int count = 0;
14314 int sub_count;
14315 tree field;
14316
14317 /* Can't handle incomplete types nor sizes that are not
14318 fixed. */
14319 if (!COMPLETE_TYPE_P (type)
14320 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14321 return -1;
14322
14323 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14324 {
14325 if (TREE_CODE (field) != FIELD_DECL)
14326 continue;
14327
14328 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14329 if (sub_count < 0)
14330 return -1;
14331 count = count > sub_count ? count : sub_count;
14332 }
14333
14334 /* There must be no padding. */
14335 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14336 count * GET_MODE_BITSIZE (*modep)))
14337 return -1;
14338
14339 return count;
14340 }
14341
14342 default:
14343 break;
14344 }
14345
14346 return -1;
14347 }
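/* Worked examples for the function above: a struct with four float fields
   is counted as four SFmode elements (*MODEP == SFmode, return value 4),
   so it is a candidate homogeneous floating-point aggregate; a struct
   containing a float and a double fails the *MODEP check on the second
   field and returns -1; and "float a[3]" counts as three SFmode elements
   through the ARRAY_TYPE case.  */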
14348
14349 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14350 type as described in AAPCS64 \S 4.1.2.
14351
14352 See the comment above aarch64_composite_type_p for the notes on MODE. */
14353
14354 static bool
14355 aarch64_short_vector_p (const_tree type,
14356 machine_mode mode)
14357 {
14358 poly_int64 size = -1;
14359
14360 if (type && TREE_CODE (type) == VECTOR_TYPE)
14361 size = int_size_in_bytes (type);
14362 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14363 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14364 size = GET_MODE_SIZE (mode);
14365
14366 return known_eq (size, 8) || known_eq (size, 16);
14367 }
14368
14369 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14370 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14371 array types. The C99 floating-point complex types are also considered
14372 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14373 types, which are GCC extensions and out of the scope of AAPCS64, are
14374 treated as composite types here as well.
14375
14376 Note that MODE itself is not sufficient in determining whether a type
14377 is such a composite type or not. This is because
14378 stor-layout.c:compute_record_mode may have already changed the MODE
14379 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14380 structure with only one field may have its MODE set to the mode of the
14381 field. Also an integer mode whose size matches the size of the
14382 RECORD_TYPE may be substituted for the original mode
14383 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14384 solely relied on. */
14385
14386 static bool
14387 aarch64_composite_type_p (const_tree type,
14388 machine_mode mode)
14389 {
14390 if (aarch64_short_vector_p (type, mode))
14391 return false;
14392
14393 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14394 return true;
14395
14396 if (mode == BLKmode
14397 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14398 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14399 return true;
14400
14401 return false;
14402 }
14403
14404 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14405 shall be passed or returned in simd/fp register(s) (providing these
14406 parameter passing registers are available).
14407
14408 Upon successful return, *COUNT returns the number of needed registers,
14409 *BASE_MODE returns the mode of the individual register and when IS_HAF
14410 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14411 floating-point aggregate or a homogeneous short-vector aggregate. */
14412
14413 static bool
14414 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14415 const_tree type,
14416 machine_mode *base_mode,
14417 int *count,
14418 bool *is_ha)
14419 {
14420 machine_mode new_mode = VOIDmode;
14421 bool composite_p = aarch64_composite_type_p (type, mode);
14422
14423 if (is_ha != NULL) *is_ha = false;
14424
14425 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14426 || aarch64_short_vector_p (type, mode))
14427 {
14428 *count = 1;
14429 new_mode = mode;
14430 }
14431 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14432 {
14433 if (is_ha != NULL) *is_ha = true;
14434 *count = 2;
14435 new_mode = GET_MODE_INNER (mode);
14436 }
14437 else if (type && composite_p)
14438 {
14439 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14440
14441 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14442 {
14443 if (is_ha != NULL) *is_ha = true;
14444 *count = ag_count;
14445 }
14446 else
14447 return false;
14448 }
14449 else
14450 return false;
14451
14452 *base_mode = new_mode;
14453 return true;
14454 }
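/* Illustrative outcomes: a _Complex double argument takes the
   MODE_COMPLEX_FLOAT path, giving *COUNT == 2, *BASE_MODE == DFmode and
   *IS_HA set; a struct of four floats is a homogeneous aggregate with
   *COUNT == 4 and *BASE_MODE == SFmode (HA_MAX_NUM_FLDS is 4, so a fifth
   field would disqualify it); and a plain double hits the scalar case
   with *COUNT == 1 and *IS_HA left false.  */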
14455
14456 /* Implement TARGET_STRUCT_VALUE_RTX. */
14457
14458 static rtx
14459 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14460 int incoming ATTRIBUTE_UNUSED)
14461 {
14462 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14463 }
14464
14465 /* Implements target hook vector_mode_supported_p. */
14466 static bool
14467 aarch64_vector_mode_supported_p (machine_mode mode)
14468 {
14469 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14470 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14471 }
14472
14473 /* Return the full-width SVE vector mode for element mode MODE, if one
14474 exists. */
14475 opt_machine_mode
14476 aarch64_full_sve_mode (scalar_mode mode)
14477 {
14478 switch (mode)
14479 {
14480 case E_DFmode:
14481 return VNx2DFmode;
14482 case E_SFmode:
14483 return VNx4SFmode;
14484 case E_HFmode:
14485 return VNx8HFmode;
14486 case E_DImode:
14487 return VNx2DImode;
14488 case E_SImode:
14489 return VNx4SImode;
14490 case E_HImode:
14491 return VNx8HImode;
14492 case E_QImode:
14493 return VNx16QImode;
14494 default:
14495 return opt_machine_mode ();
14496 }
14497 }
14498
14499 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14500 if it exists. */
14501 opt_machine_mode
14502 aarch64_vq_mode (scalar_mode mode)
14503 {
14504 switch (mode)
14505 {
14506 case E_DFmode:
14507 return V2DFmode;
14508 case E_SFmode:
14509 return V4SFmode;
14510 case E_HFmode:
14511 return V8HFmode;
14512 case E_SImode:
14513 return V4SImode;
14514 case E_HImode:
14515 return V8HImode;
14516 case E_QImode:
14517 return V16QImode;
14518 case E_DImode:
14519 return V2DImode;
14520 default:
14521 return opt_machine_mode ();
14522 }
14523 }
14524
14525 /* Return appropriate SIMD container
14526 for MODE within a vector of WIDTH bits. */
14527 static machine_mode
14528 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14529 {
14530 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14531 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14532
14533 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14534 if (TARGET_SIMD)
14535 {
14536 if (known_eq (width, 128))
14537 return aarch64_vq_mode (mode).else_mode (word_mode);
14538 else
14539 switch (mode)
14540 {
14541 case E_SFmode:
14542 return V2SFmode;
14543 case E_HFmode:
14544 return V4HFmode;
14545 case E_SImode:
14546 return V2SImode;
14547 case E_HImode:
14548 return V4HImode;
14549 case E_QImode:
14550 return V8QImode;
14551 default:
14552 break;
14553 }
14554 }
14555 return word_mode;
14556 }
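/* Example mappings: (SImode, 128) -> V4SImode and (SImode, 64) -> V2SImode
   when TARGET_SIMD is enabled, (HFmode, 64) -> V4HFmode, and
   (SImode, BITS_PER_SVE_VECTOR) -> VNx4SImode when TARGET_SVE is enabled.
   Without TARGET_SIMD the function falls back to word_mode.  */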
14557
14558 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14559 static machine_mode
14560 aarch64_preferred_simd_mode (scalar_mode mode)
14561 {
14562 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14563 return aarch64_simd_container_mode (mode, bits);
14564 }
14565
14566 /* Return a list of possible vector sizes for the vectorizer
14567 to iterate over. */
14568 static void
14569 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14570 {
14571 if (TARGET_SVE)
14572 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14573 sizes->safe_push (16);
14574 sizes->safe_push (8);
14575 }
14576
14577 /* Implement TARGET_MANGLE_TYPE. */
14578
14579 static const char *
14580 aarch64_mangle_type (const_tree type)
14581 {
14582 /* The AArch64 ABI documents say that "__va_list" has to be
14583 mangled as if it is in the "std" namespace. */
14584 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14585 return "St9__va_list";
14586
14587 /* Half-precision float. */
14588 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14589 return "Dh";
14590
14591 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14592 builtin types. */
14593 if (TYPE_NAME (type) != NULL)
14594 return aarch64_mangle_builtin_type (type);
14595
14596 /* Use the default mangling. */
14597 return NULL;
14598 }
14599
14600 /* Find the first rtx_insn before insn that will generate an assembly
14601 instruction. */
14602
14603 static rtx_insn *
14604 aarch64_prev_real_insn (rtx_insn *insn)
14605 {
14606 if (!insn)
14607 return NULL;
14608
14609 do
14610 {
14611 insn = prev_real_insn (insn);
14612 }
14613 while (insn && recog_memoized (insn) < 0);
14614
14615 return insn;
14616 }
14617
14618 static bool
14619 is_madd_op (enum attr_type t1)
14620 {
14621 unsigned int i;
14622 /* A number of these may be AArch32 only. */
14623 enum attr_type mlatypes[] = {
14624 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14625 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14626 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14627 };
14628
14629 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14630 {
14631 if (t1 == mlatypes[i])
14632 return true;
14633 }
14634
14635 return false;
14636 }
14637
14638 /* Check if there is a register dependency between a load and the insn
14639 for which we hold recog_data. */
14640
14641 static bool
14642 dep_between_memop_and_curr (rtx memop)
14643 {
14644 rtx load_reg;
14645 int opno;
14646
14647 gcc_assert (GET_CODE (memop) == SET);
14648
14649 if (!REG_P (SET_DEST (memop)))
14650 return false;
14651
14652 load_reg = SET_DEST (memop);
14653 for (opno = 1; opno < recog_data.n_operands; opno++)
14654 {
14655 rtx operand = recog_data.operand[opno];
14656 if (REG_P (operand)
14657 && reg_overlap_mentioned_p (load_reg, operand))
14658 return true;
14659
14660 }
14661 return false;
14662 }
14663
14664
14665 /* When working around the Cortex-A53 erratum 835769,
14666 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14667 instruction and has a preceding memory instruction such that a NOP
14668 should be inserted between them. */
14669
14670 bool
14671 aarch64_madd_needs_nop (rtx_insn* insn)
14672 {
14673 enum attr_type attr_type;
14674 rtx_insn *prev;
14675 rtx body;
14676
14677 if (!TARGET_FIX_ERR_A53_835769)
14678 return false;
14679
14680 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14681 return false;
14682
14683 attr_type = get_attr_type (insn);
14684 if (!is_madd_op (attr_type))
14685 return false;
14686
14687 prev = aarch64_prev_real_insn (insn);
14688 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14689 Restore recog state to INSN to avoid state corruption. */
14690 extract_constrain_insn_cached (insn);
14691
14692 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14693 return false;
14694
14695 body = single_set (prev);
14696
14697 /* If the previous insn is a memory op and there is no dependency between
14698 it and the DImode madd, emit a NOP between them. If body is NULL then we
14699 have a complex memory operation, probably a load/store pair.
14700 Be conservative for now and emit a NOP. */
14701 if (GET_MODE (recog_data.operand[0]) == DImode
14702 && (!body || !dep_between_memop_and_curr (body)))
14703 return true;
14704
14705 return false;
14706
14707 }
14708
14709
14710 /* Implement FINAL_PRESCAN_INSN. */
14711
14712 void
14713 aarch64_final_prescan_insn (rtx_insn *insn)
14714 {
14715 if (aarch64_madd_needs_nop (insn))
14716 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14717 }
14718
14719
14720 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14721 instruction. */
14722
14723 bool
14724 aarch64_sve_index_immediate_p (rtx base_or_step)
14725 {
14726 return (CONST_INT_P (base_or_step)
14727 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14728 }
14729
14730 /* Return true if X is a valid immediate for the SVE ADD and SUB
14731 instructions. Negate X first if NEGATE_P is true. */
14732
14733 bool
14734 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14735 {
14736 rtx elt;
14737
14738 if (!const_vec_duplicate_p (x, &elt)
14739 || !CONST_INT_P (elt))
14740 return false;
14741
14742 HOST_WIDE_INT val = INTVAL (elt);
14743 if (negate_p)
14744 val = -val;
14745 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14746
14747 if (val & 0xff)
14748 return IN_RANGE (val, 0, 0xff);
14749 return IN_RANGE (val, 0, 0xff00);
14750 }
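/* In other words, the (possibly negated) duplicated value must be an
   unsigned 8-bit immediate, optionally shifted left by 8.  For example,
   a duplicated 37 or 0x1200 is accepted, whereas a duplicated 257
   (0x101: low byte nonzero but wider than 8 bits) is rejected.  */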
14751
14752 /* Return true if X is a valid immediate operand for an SVE logical
14753 instruction such as AND. */
14754
14755 bool
14756 aarch64_sve_bitmask_immediate_p (rtx x)
14757 {
14758 rtx elt;
14759
14760 return (const_vec_duplicate_p (x, &elt)
14761 && CONST_INT_P (elt)
14762 && aarch64_bitmask_imm (INTVAL (elt),
14763 GET_MODE_INNER (GET_MODE (x))));
14764 }
14765
14766 /* Return true if X is a valid immediate for the SVE DUP and CPY
14767 instructions. */
14768
14769 bool
14770 aarch64_sve_dup_immediate_p (rtx x)
14771 {
14772 rtx elt;
14773
14774 if (!const_vec_duplicate_p (x, &elt)
14775 || !CONST_INT_P (elt))
14776 return false;
14777
14778 HOST_WIDE_INT val = INTVAL (elt);
14779 if (val & 0xff)
14780 return IN_RANGE (val, -0x80, 0x7f);
14781 return IN_RANGE (val, -0x8000, 0x7f00);
14782 }
14783
14784 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14785 SIGNED_P says whether the operand is signed rather than unsigned. */
14786
14787 bool
14788 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14789 {
14790 rtx elt;
14791
14792 return (const_vec_duplicate_p (x, &elt)
14793 && CONST_INT_P (elt)
14794 && (signed_p
14795 ? IN_RANGE (INTVAL (elt), -16, 15)
14796 : IN_RANGE (INTVAL (elt), 0, 127)));
14797 }
14798
14799 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14800 instruction. Negate X first if NEGATE_P is true. */
14801
14802 bool
14803 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14804 {
14805 rtx elt;
14806 REAL_VALUE_TYPE r;
14807
14808 if (!const_vec_duplicate_p (x, &elt)
14809 || GET_CODE (elt) != CONST_DOUBLE)
14810 return false;
14811
14812 r = *CONST_DOUBLE_REAL_VALUE (elt);
14813
14814 if (negate_p)
14815 r = real_value_negate (&r);
14816
14817 if (real_equal (&r, &dconst1))
14818 return true;
14819 if (real_equal (&r, &dconsthalf))
14820 return true;
14821 return false;
14822 }
14823
14824 /* Return true if X is a valid immediate operand for an SVE FMUL
14825 instruction. */
14826
14827 bool
14828 aarch64_sve_float_mul_immediate_p (rtx x)
14829 {
14830 rtx elt;
14831
14832 /* GCC will never generate a multiply with an immediate of 2, so there is no
14833 point testing for it (even though it is a valid constant). */
14834 return (const_vec_duplicate_p (x, &elt)
14835 && GET_CODE (elt) == CONST_DOUBLE
14836 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14837 }
14838
14839 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14840 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14841 is nonnull, use it to describe valid immediates. */
14842 static bool
14843 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14844 simd_immediate_info *info,
14845 enum simd_immediate_check which,
14846 simd_immediate_info::insn_type insn)
14847 {
14848 /* Try a 4-byte immediate with LSL. */
14849 for (unsigned int shift = 0; shift < 32; shift += 8)
14850 if ((val32 & (0xff << shift)) == val32)
14851 {
14852 if (info)
14853 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14854 simd_immediate_info::LSL, shift);
14855 return true;
14856 }
14857
14858 /* Try a 2-byte immediate with LSL. */
14859 unsigned int imm16 = val32 & 0xffff;
14860 if (imm16 == (val32 >> 16))
14861 for (unsigned int shift = 0; shift < 16; shift += 8)
14862 if ((imm16 & (0xff << shift)) == imm16)
14863 {
14864 if (info)
14865 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14866 simd_immediate_info::LSL, shift);
14867 return true;
14868 }
14869
14870 /* Try a 4-byte immediate with MSL, except for cases that MVN
14871 can handle. */
14872 if (which == AARCH64_CHECK_MOV)
14873 for (unsigned int shift = 8; shift < 24; shift += 8)
14874 {
14875 unsigned int low = (1 << shift) - 1;
14876 if (((val32 & (0xff << shift)) | low) == val32)
14877 {
14878 if (info)
14879 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14880 simd_immediate_info::MSL, shift);
14881 return true;
14882 }
14883 }
14884
14885 return false;
14886 }
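/* Some examples for AARCH64_CHECK_MOV: VAL32 == 0x00ab0000 matches the
   4-byte form (value 0xab, LSL #16); VAL32 == 0x00cc00cc matches the
   2-byte form (value 0xcc, LSL #0); and VAL32 == 0x0000abff matches the
   MSL form (value 0xab, MSL #8), where the shifted-in low bits are
   ones.  */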
14887
14888 /* Return true if replicating VAL64 is a valid immediate for the
14889 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14890 use it to describe valid immediates. */
14891 static bool
14892 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14893 simd_immediate_info *info,
14894 enum simd_immediate_check which)
14895 {
14896 unsigned int val32 = val64 & 0xffffffff;
14897 unsigned int val16 = val64 & 0xffff;
14898 unsigned int val8 = val64 & 0xff;
14899
14900 if (val32 == (val64 >> 32))
14901 {
14902 if ((which & AARCH64_CHECK_ORR) != 0
14903 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14904 simd_immediate_info::MOV))
14905 return true;
14906
14907 if ((which & AARCH64_CHECK_BIC) != 0
14908 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14909 simd_immediate_info::MVN))
14910 return true;
14911
14912 /* Try using a replicated byte. */
14913 if (which == AARCH64_CHECK_MOV
14914 && val16 == (val32 >> 16)
14915 && val8 == (val16 >> 8))
14916 {
14917 if (info)
14918 *info = simd_immediate_info (QImode, val8);
14919 return true;
14920 }
14921 }
14922
14923 /* Try using a bit-to-bytemask. */
14924 if (which == AARCH64_CHECK_MOV)
14925 {
14926 unsigned int i;
14927 for (i = 0; i < 64; i += 8)
14928 {
14929 unsigned char byte = (val64 >> i) & 0xff;
14930 if (byte != 0 && byte != 0xff)
14931 break;
14932 }
14933 if (i == 64)
14934 {
14935 if (info)
14936 *info = simd_immediate_info (DImode, val64);
14937 return true;
14938 }
14939 }
14940 return false;
14941 }
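/* For instance, a replicated byte such as 0x4242424242424242 is accepted
   as a QImode immediate when checking a full MOV, and a value such as
   0x00ff00ffff0000ff, in which every byte is either 0x00 or 0xff, is
   accepted by the final bit-to-bytemask test as a 64-bit immediate.  */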
14942
14943 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14944 instruction. If INFO is nonnull, use it to describe valid immediates. */
14945
14946 static bool
14947 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14948 simd_immediate_info *info)
14949 {
14950 scalar_int_mode mode = DImode;
14951 unsigned int val32 = val64 & 0xffffffff;
14952 if (val32 == (val64 >> 32))
14953 {
14954 mode = SImode;
14955 unsigned int val16 = val32 & 0xffff;
14956 if (val16 == (val32 >> 16))
14957 {
14958 mode = HImode;
14959 unsigned int val8 = val16 & 0xff;
14960 if (val8 == (val16 >> 8))
14961 mode = QImode;
14962 }
14963 }
14964 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14965 if (IN_RANGE (val, -0x80, 0x7f))
14966 {
14967 /* DUP with no shift. */
14968 if (info)
14969 *info = simd_immediate_info (mode, val);
14970 return true;
14971 }
14972 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14973 {
14974 /* DUP with LSL #8. */
14975 if (info)
14976 *info = simd_immediate_info (mode, val);
14977 return true;
14978 }
14979 if (aarch64_bitmask_imm (val64, mode))
14980 {
14981 /* DUPM. */
14982 if (info)
14983 *info = simd_immediate_info (mode, val);
14984 return true;
14985 }
14986 return false;
14987 }
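/* Examples: replicating 0x25 narrows to QImode and gives a DUP of #0x25;
   replicating 0x1200 narrows to HImode and is accepted as a DUP with
   LSL #8; values that fail both DUP forms can still be accepted as DUPM
   bitmask immediates by the final test.  */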
14988
14989 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
14990 it to describe valid immediates. */
14991
14992 static bool
14993 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
14994 {
14995 if (x == CONST0_RTX (GET_MODE (x)))
14996 {
14997 if (info)
14998 *info = simd_immediate_info (DImode, 0);
14999 return true;
15000 }
15001
15002 /* Analyze the value as a VNx16BImode. This should be relatively
15003 efficient, since rtx_vector_builder has enough built-in capacity
15004 to store all VLA predicate constants without needing the heap. */
15005 rtx_vector_builder builder;
15006 if (!aarch64_get_sve_pred_bits (builder, x))
15007 return false;
15008
15009 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15010 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15011 {
15012 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15013 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15014 if (pattern != AARCH64_NUM_SVPATTERNS)
15015 {
15016 if (info)
15017 {
15018 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15019 *info = simd_immediate_info (int_mode, pattern);
15020 }
15021 return true;
15022 }
15023 }
15024 return false;
15025 }
15026
15027 /* Return true if OP is a valid SIMD immediate for the operation
15028 described by WHICH. If INFO is nonnull, use it to describe valid
15029 immediates. */
15030 bool
15031 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15032 enum simd_immediate_check which)
15033 {
15034 machine_mode mode = GET_MODE (op);
15035 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15036 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15037 return false;
15038
15039 if (vec_flags & VEC_SVE_PRED)
15040 return aarch64_sve_pred_valid_immediate (op, info);
15041
15042 scalar_mode elt_mode = GET_MODE_INNER (mode);
15043 rtx base, step;
15044 unsigned int n_elts;
15045 if (GET_CODE (op) == CONST_VECTOR
15046 && CONST_VECTOR_DUPLICATE_P (op))
15047 n_elts = CONST_VECTOR_NPATTERNS (op);
15048 else if ((vec_flags & VEC_SVE_DATA)
15049 && const_vec_series_p (op, &base, &step))
15050 {
15051 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15052 if (!aarch64_sve_index_immediate_p (base)
15053 || !aarch64_sve_index_immediate_p (step))
15054 return false;
15055
15056 if (info)
15057 *info = simd_immediate_info (elt_mode, base, step);
15058 return true;
15059 }
15060 else if (GET_CODE (op) == CONST_VECTOR
15061 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15062 /* N_ELTS set above. */;
15063 else
15064 return false;
15065
15066 scalar_float_mode elt_float_mode;
15067 if (n_elts == 1
15068 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15069 {
15070 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15071 if (aarch64_float_const_zero_rtx_p (elt)
15072 || aarch64_float_const_representable_p (elt))
15073 {
15074 if (info)
15075 *info = simd_immediate_info (elt_float_mode, elt);
15076 return true;
15077 }
15078 }
15079
15080 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15081 if (elt_size > 8)
15082 return false;
15083
15084 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15085
15086 /* Expand the vector constant out into a byte vector, with the least
15087 significant byte of the register first. */
15088 auto_vec<unsigned char, 16> bytes;
15089 bytes.reserve (n_elts * elt_size);
15090 for (unsigned int i = 0; i < n_elts; i++)
15091 {
15092 /* The vector is provided in gcc endian-neutral fashion.
15093 For aarch64_be Advanced SIMD, it must be laid out in the vector
15094 register in reverse order. */
15095 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15096 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15097
15098 if (elt_mode != elt_int_mode)
15099 elt = gen_lowpart (elt_int_mode, elt);
15100
15101 if (!CONST_INT_P (elt))
15102 return false;
15103
15104 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15105 for (unsigned int byte = 0; byte < elt_size; byte++)
15106 {
15107 bytes.quick_push (elt_val & 0xff);
15108 elt_val >>= BITS_PER_UNIT;
15109 }
15110 }
15111
15112 /* The immediate must repeat every eight bytes. */
15113 unsigned int nbytes = bytes.length ();
15114 for (unsigned i = 8; i < nbytes; ++i)
15115 if (bytes[i] != bytes[i - 8])
15116 return false;
15117
15118 /* Get the repeating 8-byte value as an integer. No endian correction
15119 is needed here because bytes is already in lsb-first order. */
15120 unsigned HOST_WIDE_INT val64 = 0;
15121 for (unsigned int i = 0; i < 8; i++)
15122 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15123 << (i * BITS_PER_UNIT));
15124
15125 if (vec_flags & VEC_SVE_DATA)
15126 return aarch64_sve_valid_immediate (val64, info);
15127 else
15128 return aarch64_advsimd_valid_immediate (val64, info, which);
15129 }
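/* As an example of the byte expansion above: a V8HImode constant with
   every element equal to 0x1234 serializes (least significant byte first)
   to the byte pattern 34 12 34 12 ..., giving VAL64 == 0x1234123412341234,
   which is then tested against the SVE or Advanced SIMD rules depending
   on the vector mode.  */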
15130
15131 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15132 has a step in the range of INDEX. Return the index expression if so,
15133 otherwise return null. */
15134 rtx
15135 aarch64_check_zero_based_sve_index_immediate (rtx x)
15136 {
15137 rtx base, step;
15138 if (const_vec_series_p (x, &base, &step)
15139 && base == const0_rtx
15140 && aarch64_sve_index_immediate_p (step))
15141 return step;
15142 return NULL_RTX;
15143 }
15144
15145 /* Check if immediate shift constants are within range. */
15146 bool
15147 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15148 {
15149 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15150 if (left)
15151 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15152 else
15153 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15154 }
15155
15156 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15157 operation of width WIDTH at bit position POS. */
15158
15159 rtx
15160 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15161 {
15162 gcc_assert (CONST_INT_P (width));
15163 gcc_assert (CONST_INT_P (pos));
15164
15165 unsigned HOST_WIDE_INT mask
15166 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15167 return GEN_INT (mask << UINTVAL (pos));
15168 }
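/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */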
15169
15170 bool
15171 aarch64_mov_operand_p (rtx x, machine_mode mode)
15172 {
15173 if (GET_CODE (x) == HIGH
15174 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15175 return true;
15176
15177 if (CONST_INT_P (x))
15178 return true;
15179
15180 if (VECTOR_MODE_P (GET_MODE (x)))
15181 return aarch64_simd_valid_immediate (x, NULL);
15182
15183 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15184 return true;
15185
15186 if (aarch64_sve_cnt_immediate_p (x))
15187 return true;
15188
15189 return aarch64_classify_symbolic_expression (x)
15190 == SYMBOL_TINY_ABSOLUTE;
15191 }
15192
15193 /* Return a const_int vector of VAL. */
15194 rtx
15195 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15196 {
15197 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15198 return gen_const_vec_duplicate (mode, c);
15199 }
15200
15201 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15202
15203 bool
15204 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15205 {
15206 machine_mode vmode;
15207
15208 vmode = aarch64_simd_container_mode (mode, 64);
15209 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15210 return aarch64_simd_valid_immediate (op_v, NULL);
15211 }
15212
15213 /* Construct and return a PARALLEL RTX vector with elements numbering the
15214 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15215 the vector - from the perspective of the architecture. This does not
15216 line up with GCC's perspective on lane numbers, so we end up with
15217 different masks depending on our target endian-ness. The diagram
15218 below may help. We must draw the distinction when building masks
15219 which select one half of the vector. An instruction selecting
15220 architectural low-lanes for a big-endian target must be described using
15221 a mask selecting GCC high-lanes.
15222
15223 Big-Endian Little-Endian
15224
15225 GCC 0 1 2 3 3 2 1 0
15226 | x | x | x | x | | x | x | x | x |
15227 Architecture 3 2 1 0 3 2 1 0
15228
15229 Low Mask: { 2, 3 } { 0, 1 }
15230 High Mask: { 0, 1 } { 2, 3 }
15231
15232 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15233
15234 rtx
15235 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15236 {
15237 rtvec v = rtvec_alloc (nunits / 2);
15238 int high_base = nunits / 2;
15239 int low_base = 0;
15240 int base;
15241 rtx t1;
15242 int i;
15243
15244 if (BYTES_BIG_ENDIAN)
15245 base = high ? low_base : high_base;
15246 else
15247 base = high ? high_base : low_base;
15248
15249 for (i = 0; i < nunits / 2; i++)
15250 RTVEC_ELT (v, i) = GEN_INT (base + i);
15251
15252 t1 = gen_rtx_PARALLEL (mode, v);
15253 return t1;
15254 }
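/* For V4SImode (NUNITS == 4) this returns (parallel [0 1]) for the low
   half and (parallel [2 3]) for the high half on little-endian, with the
   two results swapped on big-endian, matching the table above.  */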
15255
15256 /* Check OP for validity as a PARALLEL RTX vector with elements
15257 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15258 from the perspective of the architecture. See the diagram above
15259 aarch64_simd_vect_par_cnst_half for more details. */
15260
15261 bool
15262 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15263 bool high)
15264 {
15265 int nelts;
15266 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15267 return false;
15268
15269 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15270 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15271 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15272 int i = 0;
15273
15274 if (count_op != count_ideal)
15275 return false;
15276
15277 for (i = 0; i < count_ideal; i++)
15278 {
15279 rtx elt_op = XVECEXP (op, 0, i);
15280 rtx elt_ideal = XVECEXP (ideal, 0, i);
15281
15282 if (!CONST_INT_P (elt_op)
15283 || INTVAL (elt_ideal) != INTVAL (elt_op))
15284 return false;
15285 }
15286 return true;
15287 }
15288
15289 /* Return a PARALLEL containing NELTS elements, with element I equal
15290 to BASE + I * STEP. */
15291
15292 rtx
15293 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15294 {
15295 rtvec vec = rtvec_alloc (nelts);
15296 for (unsigned int i = 0; i < nelts; ++i)
15297 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15298 return gen_rtx_PARALLEL (VOIDmode, vec);
15299 }
15300
15301 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15302 series with step STEP. */
15303
15304 bool
15305 aarch64_stepped_int_parallel_p (rtx op, int step)
15306 {
15307 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15308 return false;
15309
15310 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15311 for (int i = 1; i < XVECLEN (op, 0); ++i)
15312 if (!CONST_INT_P (XVECEXP (op, 0, i))
15313 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15314 return false;
15315
15316 return true;
15317 }
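/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) produces
   (parallel [1 3 5 7]), and aarch64_stepped_int_parallel_p accepts that
   rtx for STEP == 2 but rejects it for any other step.  */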
15318
15319 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15320 HIGH (exclusive). */
15321 void
15322 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15323 const_tree exp)
15324 {
15325 HOST_WIDE_INT lane;
15326 gcc_assert (CONST_INT_P (operand));
15327 lane = INTVAL (operand);
15328
15329 if (lane < low || lane >= high)
15330 {
15331 if (exp)
15332 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15333 else
15334 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15335 }
15336 }
15337
15338 /* Perform endian correction on lane number N, which indexes a vector
15339 of mode MODE, and return the result as an SImode rtx. */
15340
15341 rtx
15342 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15343 {
15344 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15345 }
15346
15347 /* Return TRUE if OP is a valid vector addressing mode. */
15348
15349 bool
15350 aarch64_simd_mem_operand_p (rtx op)
15351 {
15352 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15353 || REG_P (XEXP (op, 0)));
15354 }
15355
15356 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15357
15358 bool
15359 aarch64_sve_ld1r_operand_p (rtx op)
15360 {
15361 struct aarch64_address_info addr;
15362 scalar_mode mode;
15363
15364 return (MEM_P (op)
15365 && is_a <scalar_mode> (GET_MODE (op), &mode)
15366 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15367 && addr.type == ADDRESS_REG_IMM
15368 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15369 }
15370
15371 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15372 bool
15373 aarch64_sve_ld1rq_operand_p (rtx op)
15374 {
15375 struct aarch64_address_info addr;
15376 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15377 if (!MEM_P (op)
15378 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15379 return false;
15380
15381 if (addr.type == ADDRESS_REG_IMM)
15382 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15383
15384 if (addr.type == ADDRESS_REG_REG)
15385 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15386
15387 return false;
15388 }
15389
15390 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15391 The conditions for STR are the same. */
15392 bool
15393 aarch64_sve_ldr_operand_p (rtx op)
15394 {
15395 struct aarch64_address_info addr;
15396
15397 return (MEM_P (op)
15398 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15399 false, ADDR_QUERY_ANY)
15400 && addr.type == ADDRESS_REG_IMM);
15401 }
15402
15403 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15404 We need to be able to access the individual pieces, so the range
15405 is different from LD[234] and ST[234]. */
15406 bool
15407 aarch64_sve_struct_memory_operand_p (rtx op)
15408 {
15409 if (!MEM_P (op))
15410 return false;
15411
15412 machine_mode mode = GET_MODE (op);
15413 struct aarch64_address_info addr;
15414 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15415 ADDR_QUERY_ANY)
15416 || addr.type != ADDRESS_REG_IMM)
15417 return false;
15418
15419 poly_int64 first = addr.const_offset;
15420 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15421 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15422 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15423 }
15424
15425 /* Emit a register copy from operand to operand, taking care not to
15426 early-clobber source registers in the process.
15427
15428 COUNT is the number of components into which the copy needs to be
15429 decomposed. */
15430 void
15431 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15432 unsigned int count)
15433 {
15434 unsigned int i;
15435 int rdest = REGNO (operands[0]);
15436 int rsrc = REGNO (operands[1]);
15437
15438 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15439 || rdest < rsrc)
15440 for (i = 0; i < count; i++)
15441 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15442 gen_rtx_REG (mode, rsrc + i));
15443 else
15444 for (i = 0; i < count; i++)
15445 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15446 gen_rtx_REG (mode, rsrc + count - i - 1));
15447 }
15448
15449 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15450 one of the VSTRUCT modes: OI, CI, or XI. */
15451 int
15452 aarch64_simd_attr_length_rglist (machine_mode mode)
15453 {
15454 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15455 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15456 }
15457
15458 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15459 alignment of a vector to 128 bits. SVE predicates have an alignment of
15460 16 bits. */
15461 static HOST_WIDE_INT
15462 aarch64_simd_vector_alignment (const_tree type)
15463 {
15464 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15465 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15466 be set for non-predicate vectors of booleans. Modes are the most
15467 direct way we have of identifying real SVE predicate types. */
15468 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15469 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15470 }
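/* So, for example, int32x2_t is aligned to 64 bits, int32x4_t and any
   wider fixed-length GNU vector to 128 bits, a variable-length SVE data
   vector to 128 bits, and an SVE predicate to 16 bits.  */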
15471
15472 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15473 static poly_uint64
15474 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15475 {
15476 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15477 {
15478 /* If the length of the vector is fixed, try to align to that length,
15479 otherwise don't try to align at all. */
15480 HOST_WIDE_INT result;
15481 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15482 result = TYPE_ALIGN (TREE_TYPE (type));
15483 return result;
15484 }
15485 return TYPE_ALIGN (type);
15486 }
15487
15488 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15489 static bool
15490 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15491 {
15492 if (is_packed)
15493 return false;
15494
15495 /* For fixed-length vectors, check that the vectorizer will aim for
15496 full-vector alignment. This isn't true for generic GCC vectors
15497 that are wider than the ABI maximum of 128 bits. */
15498 poly_uint64 preferred_alignment =
15499 aarch64_vectorize_preferred_vector_alignment (type);
15500 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15501 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15502 preferred_alignment))
15503 return false;
15504
15505 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15506 return true;
15507 }
15508
15509 /* Return true if the vector misalignment factor is supported by the
15510 target. */
15511 static bool
15512 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15513 const_tree type, int misalignment,
15514 bool is_packed)
15515 {
15516 if (TARGET_SIMD && STRICT_ALIGNMENT)
15517 {
15518 /* Return if movmisalign pattern is not supported for this mode. */
15519 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15520 return false;
15521
15522 /* Misalignment factor is unknown at compile time. */
15523 if (misalignment == -1)
15524 return false;
15525 }
15526 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15527 is_packed);
15528 }
15529
15530 /* If VALS is a vector constant that can be loaded into a register
15531 using DUP, generate instructions to do so and return an RTX to
15532 assign to the register. Otherwise return NULL_RTX. */
15533 static rtx
15534 aarch64_simd_dup_constant (rtx vals)
15535 {
15536 machine_mode mode = GET_MODE (vals);
15537 machine_mode inner_mode = GET_MODE_INNER (mode);
15538 rtx x;
15539
15540 if (!const_vec_duplicate_p (vals, &x))
15541 return NULL_RTX;
15542
15543 /* We can load this constant by using DUP and a constant in a
15544 single general-purpose register. This will be cheaper than a vector
15545 load. */
15546 x = copy_to_mode_reg (inner_mode, x);
15547 return gen_vec_duplicate (mode, x);
15548 }
15549
15550
15551 /* Generate code to load VALS, which is a PARALLEL containing only
15552 constants (for vec_init) or CONST_VECTOR, efficiently into a
15553 register. Returns an RTX to copy into the register, or NULL_RTX
15554 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15555 static rtx
15556 aarch64_simd_make_constant (rtx vals)
15557 {
15558 machine_mode mode = GET_MODE (vals);
15559 rtx const_dup;
15560 rtx const_vec = NULL_RTX;
15561 int n_const = 0;
15562 int i;
15563
15564 if (GET_CODE (vals) == CONST_VECTOR)
15565 const_vec = vals;
15566 else if (GET_CODE (vals) == PARALLEL)
15567 {
15568 /* A CONST_VECTOR must contain only CONST_INTs and
15569 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15570 Only store valid constants in a CONST_VECTOR. */
15571 int n_elts = XVECLEN (vals, 0);
15572 for (i = 0; i < n_elts; ++i)
15573 {
15574 rtx x = XVECEXP (vals, 0, i);
15575 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15576 n_const++;
15577 }
15578 if (n_const == n_elts)
15579 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15580 }
15581 else
15582 gcc_unreachable ();
15583
15584 if (const_vec != NULL_RTX
15585 && aarch64_simd_valid_immediate (const_vec, NULL))
15586 /* Load using MOVI/MVNI. */
15587 return const_vec;
15588 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15589 /* Loaded using DUP. */
15590 return const_dup;
15591 else if (const_vec != NULL_RTX)
15592 /* Load from constant pool. We cannot take advantage of single-cycle
15593 LD1 because we need a PC-relative addressing mode. */
15594 return const_vec;
15595 else
15596 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15597 We cannot construct an initializer. */
15598 return NULL_RTX;
15599 }
15600
15601 /* Expand a vector initialisation sequence, such that TARGET is
15602 initialised to contain VALS. */
15603
15604 void
15605 aarch64_expand_vector_init (rtx target, rtx vals)
15606 {
15607 machine_mode mode = GET_MODE (target);
15608 scalar_mode inner_mode = GET_MODE_INNER (mode);
15609 /* The number of vector elements. */
15610 int n_elts = XVECLEN (vals, 0);
15611 /* The number of vector elements which are not constant. */
15612 int n_var = 0;
15613 rtx any_const = NULL_RTX;
15614 /* The first element of vals. */
15615 rtx v0 = XVECEXP (vals, 0, 0);
15616 bool all_same = true;
15617
15618 /* This is a special vec_init<M><N> where N is not an element mode but a
15619 vector mode with half the elements of M. We expect to find two entries
15620 of mode N in VALS and we must put their concatenation into TARGET. */
15621 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15622 {
15623 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15624 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15625 rtx lo = XVECEXP (vals, 0, 0);
15626 rtx hi = XVECEXP (vals, 0, 1);
15627 machine_mode narrow_mode = GET_MODE (lo);
15628 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15629 gcc_assert (narrow_mode == GET_MODE (hi));
15630
15631 /* When we want to concatenate a half-width vector with zeroes we can
15632 use the aarch64_combinez[_be] patterns. Just make sure that the
15633 zeroes are in the right half. */
15634 if (BYTES_BIG_ENDIAN
15635 && aarch64_simd_imm_zero (lo, narrow_mode)
15636 && general_operand (hi, narrow_mode))
15637 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15638 else if (!BYTES_BIG_ENDIAN
15639 && aarch64_simd_imm_zero (hi, narrow_mode)
15640 && general_operand (lo, narrow_mode))
15641 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15642 else
15643 {
15644 /* Else create the two half-width registers and combine them. */
15645 if (!REG_P (lo))
15646 lo = force_reg (GET_MODE (lo), lo);
15647 if (!REG_P (hi))
15648 hi = force_reg (GET_MODE (hi), hi);
15649
15650 if (BYTES_BIG_ENDIAN)
15651 std::swap (lo, hi);
15652 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15653 }
15654 return;
15655 }
15656
15657 /* Count the number of variable elements to initialise. */
15658 for (int i = 0; i < n_elts; ++i)
15659 {
15660 rtx x = XVECEXP (vals, 0, i);
15661 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15662 ++n_var;
15663 else
15664 any_const = x;
15665
15666 all_same &= rtx_equal_p (x, v0);
15667 }
15668
15669 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15670 how best to handle this. */
15671 if (n_var == 0)
15672 {
15673 rtx constant = aarch64_simd_make_constant (vals);
15674 if (constant != NULL_RTX)
15675 {
15676 emit_move_insn (target, constant);
15677 return;
15678 }
15679 }
15680
15681 /* Splat a single non-constant element if we can. */
15682 if (all_same)
15683 {
15684 rtx x = copy_to_mode_reg (inner_mode, v0);
15685 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15686 return;
15687 }
15688
15689 enum insn_code icode = optab_handler (vec_set_optab, mode);
15690 gcc_assert (icode != CODE_FOR_nothing);
15691
15692 /* If there are only variable elements, try to optimize
15693 the insertion using dup for the most common element
15694 followed by insertions. */
15695
15696 /* The algorithm will fill matches[*][0] with the earliest matching element,
15697 and matches[X][1] with the count of duplicate elements (if X is the
15698 earliest element which has duplicates). */
15699
15700 if (n_var == n_elts && n_elts <= 16)
15701 {
15702 int matches[16][2] = {0};
15703 for (int i = 0; i < n_elts; i++)
15704 {
15705 for (int j = 0; j <= i; j++)
15706 {
15707 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15708 {
15709 matches[i][0] = j;
15710 matches[j][1]++;
15711 break;
15712 }
15713 }
15714 }
15715 int maxelement = 0;
15716 int maxv = 0;
15717 for (int i = 0; i < n_elts; i++)
15718 if (matches[i][1] > maxv)
15719 {
15720 maxelement = i;
15721 maxv = matches[i][1];
15722 }
15723
15724 /* Create a duplicate of the most common element, unless all elements
15725 are equally useless to us, in which case just immediately set the
15726 vector register using the first element. */
15727
15728 if (maxv == 1)
15729 {
15730 /* For vectors of two 64-bit elements, we can do even better. */
15731 if (n_elts == 2
15732 && (inner_mode == E_DImode
15733 || inner_mode == E_DFmode))
15734
15735 {
15736 rtx x0 = XVECEXP (vals, 0, 0);
15737 rtx x1 = XVECEXP (vals, 0, 1);
15738 /* Combine can pick up this case, but handling it directly
15739 here leaves clearer RTL.
15740
15741 This is load_pair_lanes<mode>, and also gives us a clean-up
15742 for store_pair_lanes<mode>. */
15743 if (memory_operand (x0, inner_mode)
15744 && memory_operand (x1, inner_mode)
15745 && !STRICT_ALIGNMENT
15746 && rtx_equal_p (XEXP (x1, 0),
15747 plus_constant (Pmode,
15748 XEXP (x0, 0),
15749 GET_MODE_SIZE (inner_mode))))
15750 {
15751 rtx t;
15752 if (inner_mode == DFmode)
15753 t = gen_load_pair_lanesdf (target, x0, x1);
15754 else
15755 t = gen_load_pair_lanesdi (target, x0, x1);
15756 emit_insn (t);
15757 return;
15758 }
15759 }
15760 /* The subreg-move sequence below will move into lane zero of the
15761 vector register. For big-endian we want that position to hold
15762 the last element of VALS. */
15763 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15764 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15765 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15766 }
15767 else
15768 {
15769 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15770 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15771 }
15772
15773 /* Insert the rest. */
15774 for (int i = 0; i < n_elts; i++)
15775 {
15776 rtx x = XVECEXP (vals, 0, i);
15777 if (matches[i][0] == maxelement)
15778 continue;
15779 x = copy_to_mode_reg (inner_mode, x);
15780 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15781 }
15782 return;
15783 }
15784
15785 /* Initialise a vector which is part-variable. We want to first try
15786 to build those lanes which are constant in the most efficient way we
15787 can. */
15788 if (n_var != n_elts)
15789 {
15790 rtx copy = copy_rtx (vals);
15791
15792 /* Load constant part of vector. We really don't care what goes into the
15793 parts we will overwrite, but we're more likely to be able to load the
15794 constant efficiently if it has fewer, larger, repeating parts
15795 (see aarch64_simd_valid_immediate). */
15796 for (int i = 0; i < n_elts; i++)
15797 {
15798 rtx x = XVECEXP (vals, 0, i);
15799 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15800 continue;
15801 rtx subst = any_const;
15802 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15803 {
15804 /* Look in the copied vector, as more elements are const. */
15805 rtx test = XVECEXP (copy, 0, i ^ bit);
15806 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15807 {
15808 subst = test;
15809 break;
15810 }
15811 }
15812 XVECEXP (copy, 0, i) = subst;
15813 }
15814 aarch64_expand_vector_init (target, copy);
15815 }
15816
15817 /* Insert the variable lanes directly. */
15818 for (int i = 0; i < n_elts; i++)
15819 {
15820 rtx x = XVECEXP (vals, 0, i);
15821 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15822 continue;
15823 x = copy_to_mode_reg (inner_mode, x);
15824 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15825 }
15826 }
15827
15828 /* Emit RTL corresponding to:
15829 insr TARGET, ELEM. */
15830
15831 static void
15832 emit_insr (rtx target, rtx elem)
15833 {
15834 machine_mode mode = GET_MODE (target);
15835 scalar_mode elem_mode = GET_MODE_INNER (mode);
15836 elem = force_reg (elem_mode, elem);
15837
15838 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15839 gcc_assert (icode != CODE_FOR_nothing);
15840 emit_insn (GEN_FCN (icode) (target, target, elem));
15841 }
15842
15843 /* Subroutine of aarch64_sve_expand_vector_init for handling
15844 trailing constants.
15845 This function works as follows:
15846 (a) Create a new vector consisting of trailing constants.
15847 (b) Initialize TARGET with the constant vector using emit_move_insn.
15848 (c) Insert remaining elements in TARGET using insr.
15849 NELTS is the total number of elements in the original vector, while
15850 NELTS_REQD is the number of elements that are actually
15851 significant.
15852
15853 ??? The heuristic used is to do the above only if the number of constants
15854 is at least half the total number of elements. May need fine-tuning. */
15855
15856 static bool
15857 aarch64_sve_expand_vector_init_handle_trailing_constants
15858 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15859 {
15860 machine_mode mode = GET_MODE (target);
15861 scalar_mode elem_mode = GET_MODE_INNER (mode);
15862 int n_trailing_constants = 0;
15863
15864 for (int i = nelts_reqd - 1;
15865 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15866 i--)
15867 n_trailing_constants++;
15868
15869 if (n_trailing_constants >= nelts_reqd / 2)
15870 {
15871 rtx_vector_builder v (mode, 1, nelts);
15872 for (int i = 0; i < nelts; i++)
15873 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15874 rtx const_vec = v.build ();
15875 emit_move_insn (target, const_vec);
15876
15877 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15878 emit_insr (target, builder.elt (i));
15879
15880 return true;
15881 }
15882
15883 return false;
15884 }
15885
15886 /* Subroutine of aarch64_sve_expand_vector_init.
15887 Works as follows:
15888 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15889 (b) Skip trailing elements from BUILDER, which are the same as
15890 element NELTS_REQD - 1.
15891 (c) Insert earlier elements in reverse order in TARGET using insr. */
15892
15893 static void
15894 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15895 const rtx_vector_builder &builder,
15896 int nelts_reqd)
15897 {
15898 machine_mode mode = GET_MODE (target);
15899 scalar_mode elem_mode = GET_MODE_INNER (mode);
15900
15901 struct expand_operand ops[2];
15902 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15903 gcc_assert (icode != CODE_FOR_nothing);
15904
15905 create_output_operand (&ops[0], target, mode);
15906 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15907 expand_insn (icode, 2, ops);
15908
15909 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15910 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15911 emit_insr (target, builder.elt (i));
15912 }
15913
15914 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
15915 when all trailing elements of BUILDER are the same.
15916 This works as follows:
15917 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15918 (b) Insert remaining elements in TARGET using insr.
15919
15920 ??? The heuristic used is to do the above if the number of identical trailing
15921 elements is at least 3/4 of the total number of elements, loosely based on
15922 the heuristic from mostly_zeros_p. May need fine-tuning. */
15923
15924 static bool
15925 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15926 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15927 {
15928 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15929 if (ndups >= (3 * nelts_reqd) / 4)
15930 {
15931 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15932 nelts_reqd - ndups + 1);
15933 return true;
15934 }
15935
15936 return false;
15937 }
15938
15939 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15940 of elements in BUILDER.
15941
15942 The function tries to initialize TARGET from BUILDER if it fits one
15943 of the special cases outlined below.
15944
15945 Failing that, the function divides BUILDER into two sub-vectors:
15946 v_even = even elements of BUILDER;
15947 v_odd = odd elements of BUILDER;
15948
15949 and recursively calls itself with v_even and v_odd.
15950
15951 if (recursive call succeeded for v_even or v_odd)
15952 TARGET = zip (v_even, v_odd)
15953
15954 The function returns true if it managed to build TARGET from BUILDER
15955 with one of the special cases, false otherwise.
15956
15957 Example: {a, 1, b, 2, c, 3, d, 4}
15958
15959 The vector gets divided into:
15960 v_even = {a, b, c, d}
15961 v_odd = {1, 2, 3, 4}
15962
15963 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15964 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15965
15966 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15967 4 elements, so we construct tmp1 from v_even using insr:
15968 tmp1 = dup(d)
15969 insr tmp1, c
15970 insr tmp1, b
15971 insr tmp1, a
15972
15973 And finally:
15974 TARGET = zip (tmp1, tmp2)
15975 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15976
15977 static bool
15978 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15979 int nelts, int nelts_reqd)
15980 {
15981 machine_mode mode = GET_MODE (target);
15982
15983 /* Case 1: Vector contains trailing constants. */
15984
15985 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15986 (target, builder, nelts, nelts_reqd))
15987 return true;
15988
15989 /* Case 2: Vector contains leading constants. */
15990
15991 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15992 for (int i = 0; i < nelts_reqd; i++)
15993 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15994 rev_builder.finalize ();
15995
15996 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15997 (target, rev_builder, nelts, nelts_reqd))
15998 {
15999 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16000 return true;
16001 }
16002
16003 /* Case 3: Vector contains trailing same element. */
16004
16005 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16006 (target, builder, nelts_reqd))
16007 return true;
16008
16009 /* Case 4: Vector contains leading same element. */
16010
16011 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16012 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16013 {
16014 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16015 return true;
16016 }
16017
16018 /* Avoid recursing below 4 elements.
16019 ??? The threshold of 4 may need fine-tuning. */
16020
16021 if (nelts_reqd <= 4)
16022 return false;
16023
16024 rtx_vector_builder v_even (mode, 1, nelts);
16025 rtx_vector_builder v_odd (mode, 1, nelts);
16026
16027 for (int i = 0; i < nelts * 2; i += 2)
16028 {
16029 v_even.quick_push (builder.elt (i));
16030 v_odd.quick_push (builder.elt (i + 1));
16031 }
16032
16033 v_even.finalize ();
16034 v_odd.finalize ();
16035
16036 rtx tmp1 = gen_reg_rtx (mode);
16037 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16038 nelts, nelts_reqd / 2);
16039
16040 rtx tmp2 = gen_reg_rtx (mode);
16041 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16042 nelts, nelts_reqd / 2);
16043
16044 if (!did_even_p && !did_odd_p)
16045 return false;
16046
16047 /* For whichever half did not match any of the special cases, initialize
16048 the corresponding temporary using INSR, then zip the two halves. */
16049
16050 if (!did_even_p)
16051 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16052
16053 if (!did_odd_p)
16054 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16055
16056 rtvec v = gen_rtvec (2, tmp1, tmp2);
16057 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16058 return true;
16059 }
16060
16061 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16062
16063 void
16064 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16065 {
16066 machine_mode mode = GET_MODE (target);
16067 int nelts = XVECLEN (vals, 0);
16068
16069 rtx_vector_builder v (mode, 1, nelts);
16070 for (int i = 0; i < nelts; i++)
16071 v.quick_push (XVECEXP (vals, 0, i));
16072 v.finalize ();
16073
16074 /* If neither sub-vector of V could be initialized specially,
16075 then use INSR to insert all elements from V into TARGET.
16076 ??? This might not be optimal for vectors with large
16077 initializers of 16 elements or more.
16078 For nelts < 4, it probably isn't worth handling specially. */
16079
16080 if (nelts < 4
16081 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16082 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16083 }
16084
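/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */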
16085 static unsigned HOST_WIDE_INT
16086 aarch64_shift_truncation_mask (machine_mode mode)
16087 {
16088 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16089 return 0;
16090 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16091 }
16092
16093 /* Select a format to encode pointers in exception handling data. */
16094 int
16095 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16096 {
16097 int type;
16098 switch (aarch64_cmodel)
16099 {
16100 case AARCH64_CMODEL_TINY:
16101 case AARCH64_CMODEL_TINY_PIC:
16102 case AARCH64_CMODEL_SMALL:
16103 case AARCH64_CMODEL_SMALL_PIC:
16104 case AARCH64_CMODEL_SMALL_SPIC:
16105 /* text+got+data < 4GB.  4-byte signed relocs are sufficient
16106 for everything. */
16107 type = DW_EH_PE_sdata4;
16108 break;
16109 default:
16110 /* No assumptions here. 8-byte relocs required. */
16111 type = DW_EH_PE_sdata8;
16112 break;
16113 }
16114 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16115 }
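
/* As a worked example (encodings as defined in include/dwarf2.h): for the
   small code model with GLOBAL set, the result above is
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   == 0x80 | 0x10 | 0x0b == 0x9b.  */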
16116
16117 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16118
16119 static void
16120 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16121 {
16122 if (aarch64_simd_decl_p (decl))
16123 {
16124 fprintf (stream, "\t.variant_pcs\t");
16125 assemble_name (stream, name);
16126 fprintf (stream, "\n");
16127 }
16128 }
16129
16130 /* The last .arch and .tune assembly strings that we printed. */
16131 static std::string aarch64_last_printed_arch_string;
16132 static std::string aarch64_last_printed_tune_string;
16133
16134 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16135 by the function fndecl. */
16136
16137 void
16138 aarch64_declare_function_name (FILE *stream, const char* name,
16139 tree fndecl)
16140 {
16141 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16142
16143 struct cl_target_option *targ_options;
16144 if (target_parts)
16145 targ_options = TREE_TARGET_OPTION (target_parts);
16146 else
16147 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16148 gcc_assert (targ_options);
16149
16150 const struct processor *this_arch
16151 = aarch64_get_arch (targ_options->x_explicit_arch);
16152
16153 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16154 std::string extension
16155 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16156 this_arch->flags);
16157 /* Only update the assembler .arch string if it is distinct from the last
16158 such string we printed. */
16159 std::string to_print = this_arch->name + extension;
16160 if (to_print != aarch64_last_printed_arch_string)
16161 {
16162 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16163 aarch64_last_printed_arch_string = to_print;
16164 }
16165
16166 /* Print the cpu name we're tuning for in a comment; it might be
16167 useful to readers of the generated asm.  Do it only when it changes
16168 from function to function and verbose assembly is requested. */
16169 const struct processor *this_tune
16170 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16171
16172 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16173 {
16174 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16175 this_tune->name);
16176 aarch64_last_printed_tune_string = this_tune->name;
16177 }
16178
16179 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16180
16181 /* Don't forget the type directive for ELF. */
16182 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16183 ASM_OUTPUT_LABEL (stream, name);
16184 }
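
/* For illustration only, the directives emitted above look roughly like
   this for a function foo (the exact .arch and .tune strings depend on the
   selected architecture, extensions and tuning, and the .tune comment is
   only printed under -dA):

	.arch	armv8.2-a+sve
	//.tune cortex-a75
	.variant_pcs	foo		// only for aarch64_vector_pcs functions
	.type	foo, %function
   foo:  */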
16185
16186 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16187
16188 void
16189 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16190 {
16191 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16192 const char *value = IDENTIFIER_POINTER (target);
16193 aarch64_asm_output_variant_pcs (stream, decl, name);
16194 ASM_OUTPUT_DEF (stream, name, value);
16195 }
16196
16197 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16198 function symbol references. */
16199
16200 void
16201 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16202 {
16203 default_elf_asm_output_external (stream, decl, name);
16204 aarch64_asm_output_variant_pcs (stream, decl, name);
16205 }
16206
16207 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16208 Used to output the .cfi_b_key_frame directive when signing the current
16209 function with the B key. */
16210
16211 void
16212 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16213 {
16214 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16215 && aarch64_ra_sign_key == AARCH64_KEY_B)
16216 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16217 }
16218
16219 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16220
16221 static void
16222 aarch64_start_file (void)
16223 {
16224 struct cl_target_option *default_options
16225 = TREE_TARGET_OPTION (target_option_default_node);
16226
16227 const struct processor *default_arch
16228 = aarch64_get_arch (default_options->x_explicit_arch);
16229 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16230 std::string extension
16231 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16232 default_arch->flags);
16233
16234 aarch64_last_printed_arch_string = default_arch->name + extension;
16235 aarch64_last_printed_tune_string = "";
16236 asm_fprintf (asm_out_file, "\t.arch %s\n",
16237 aarch64_last_printed_arch_string.c_str ());
16238
16239 default_file_start ();
16240 }
16241
16242 /* Emit load exclusive. */
16243
16244 static void
16245 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16246 rtx mem, rtx model_rtx)
16247 {
16248 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16249 }
16250
16251 /* Emit store exclusive. */
16252
16253 static void
16254 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16255 rtx rval, rtx mem, rtx model_rtx)
16256 {
16257 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16258 }
16259
16260 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
16261
16262 static void
16263 aarch64_emit_unlikely_jump (rtx insn)
16264 {
16265 rtx_insn *jump = emit_jump_insn (insn);
16266 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16267 }
16268
16269 /* Expand a compare and swap pattern. */
16270
16271 void
16272 aarch64_expand_compare_and_swap (rtx operands[])
16273 {
16274 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16275 machine_mode mode, r_mode;
16276
16277 bval = operands[0];
16278 rval = operands[1];
16279 mem = operands[2];
16280 oldval = operands[3];
16281 newval = operands[4];
16282 is_weak = operands[5];
16283 mod_s = operands[6];
16284 mod_f = operands[7];
16285 mode = GET_MODE (mem);
16286
16287 /* Normally the succ memory model must be stronger than fail, but in the
16288 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16289 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16290 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16291 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16292 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16293
16294 r_mode = mode;
16295 if (mode == QImode || mode == HImode)
16296 {
16297 r_mode = SImode;
16298 rval = gen_reg_rtx (r_mode);
16299 }
16300
16301 if (TARGET_LSE)
16302 {
16303 /* The CAS insn requires oldval and rval to overlap, but we need to
16304 have a copy of oldval saved across the operation to tell if
16305 the operation was successful. */
16306 if (reg_overlap_mentioned_p (rval, oldval))
16307 rval = copy_to_mode_reg (r_mode, oldval);
16308 else
16309 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16310
16311 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16312 newval, mod_s));
16313 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16314 }
16315 else
16316 {
16317 /* The oldval predicate varies by mode. Test it and force to reg. */
16318 insn_code code = code_for_aarch64_compare_and_swap (mode);
16319 if (!insn_data[code].operand[2].predicate (oldval, mode))
16320 oldval = force_reg (mode, oldval);
16321
16322 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16323 is_weak, mod_s, mod_f));
16324 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16325 }
16326
16327 if (r_mode != mode)
16328 rval = gen_lowpart (mode, rval);
16329 emit_move_insn (operands[1], rval);
16330
16331 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16332 emit_insn (gen_rtx_SET (bval, x));
16333 }
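
/* A rough sketch of the TARGET_LSE path above for a 32-bit SEQ_CST
   __atomic_compare_exchange (register numbers are illustrative only):

	mov	w3, w1			// copy oldval into rval
	casal	w3, w2, [x0]		// atomic compare-and-swap
	cmp	w3, w1			// did memory contain oldval?
	cset	w4, eq			// boolean success result

   Without LSE, the expansion defers to the exclusive-access loop that
   aarch64_split_compare_and_swap produces below.  */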
16334
16335 /* Emit a barrier appropriate for memory model MODEL at the end of a
16336 sequence implementing an atomic operation. */
16337
16338 static void
16339 aarch64_emit_post_barrier (enum memmodel model)
16340 {
16341 const enum memmodel base_model = memmodel_base (model);
16342
16343 if (is_mm_sync (model)
16344 && (base_model == MEMMODEL_ACQUIRE
16345 || base_model == MEMMODEL_ACQ_REL
16346 || base_model == MEMMODEL_SEQ_CST))
16347 {
16348 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16349 }
16350 }
16351
16352 /* Split a compare and swap pattern. */
16353
16354 void
16355 aarch64_split_compare_and_swap (rtx operands[])
16356 {
16357 rtx rval, mem, oldval, newval, scratch;
16358 machine_mode mode;
16359 bool is_weak;
16360 rtx_code_label *label1, *label2;
16361 rtx x, cond;
16362 enum memmodel model;
16363 rtx model_rtx;
16364
16365 rval = operands[0];
16366 mem = operands[1];
16367 oldval = operands[2];
16368 newval = operands[3];
16369 is_weak = (operands[4] != const0_rtx);
16370 model_rtx = operands[5];
16371 scratch = operands[7];
16372 mode = GET_MODE (mem);
16373 model = memmodel_from_int (INTVAL (model_rtx));
16374
16375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16376 loop:
16377 .label1:
16378 LD[A]XR rval, [mem]
16379 CBNZ rval, .label2
16380 ST[L]XR scratch, newval, [mem]
16381 CBNZ scratch, .label1
16382 .label2:
16383 CMP rval, 0. */
16384 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16385
16386 label1 = NULL;
16387 if (!is_weak)
16388 {
16389 label1 = gen_label_rtx ();
16390 emit_label (label1);
16391 }
16392 label2 = gen_label_rtx ();
16393
16394 /* The initial load can be relaxed for a __sync operation since a final
16395 barrier will be emitted to stop code hoisting. */
16396 if (is_mm_sync (model))
16397 aarch64_emit_load_exclusive (mode, rval, mem,
16398 GEN_INT (MEMMODEL_RELAXED));
16399 else
16400 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16401
16402 if (strong_zero_p)
16403 {
16404 if (aarch64_track_speculation)
16405 {
16406 /* Emit an explicit compare instruction, so that we can correctly
16407 track the condition codes. */
16408 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16409 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16410 }
16411 else
16412 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16413
16414 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16415 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16417 }
16418 else
16419 {
16420 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16421 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16422 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16423 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16424 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16425 }
16426
16427 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16428
16429 if (!is_weak)
16430 {
16431 if (aarch64_track_speculation)
16432 {
16433 /* Emit an explicit compare instruction, so that we can correctly
16434 track the condition codes. */
16435 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16436 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16437 }
16438 else
16439 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16440
16441 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16442 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16443 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16444 }
16445 else
16446 {
16447 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16448 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16449 emit_insn (gen_rtx_SET (cond, x));
16450 }
16451
16452 emit_label (label2);
16453 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16454 to set the condition flags.  If the result is not used it will be removed
16455 by later passes. */
16456 if (strong_zero_p)
16457 {
16458 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16459 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16460 emit_insn (gen_rtx_SET (cond, x));
16461 }
16462 /* Emit any final barrier needed for a __sync operation. */
16463 if (is_mm_sync (model))
16464 aarch64_emit_post_barrier (model);
16465 }
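
/* For reference, when OLDVAL is not known to be zero the loop split above
   has approximately this shape (same notation as the comment earlier in
   this function):

	.label1:
	LD[A]XR	rval, [mem]
	CMP	rval, oldval
	B.NE	.label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1	// strong version only
	.label2:

   followed by a final barrier for __sync operations.  */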
16466
16467 /* Split an atomic operation. */
16468
16469 void
16470 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16471 rtx value, rtx model_rtx, rtx cond)
16472 {
16473 machine_mode mode = GET_MODE (mem);
16474 machine_mode wmode = (mode == DImode ? DImode : SImode);
16475 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16476 const bool is_sync = is_mm_sync (model);
16477 rtx_code_label *label;
16478 rtx x;
16479
16480 /* Split the atomic operation into a sequence. */
16481 label = gen_label_rtx ();
16482 emit_label (label);
16483
16484 if (new_out)
16485 new_out = gen_lowpart (wmode, new_out);
16486 if (old_out)
16487 old_out = gen_lowpart (wmode, old_out);
16488 else
16489 old_out = new_out;
16490 value = simplify_gen_subreg (wmode, value, mode, 0);
16491
16492 /* The initial load can be relaxed for a __sync operation since a final
16493 barrier will be emitted to stop code hoisting. */
16494 if (is_sync)
16495 aarch64_emit_load_exclusive (mode, old_out, mem,
16496 GEN_INT (MEMMODEL_RELAXED));
16497 else
16498 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16499
16500 switch (code)
16501 {
16502 case SET:
16503 new_out = value;
16504 break;
16505
16506 case NOT:
16507 x = gen_rtx_AND (wmode, old_out, value);
16508 emit_insn (gen_rtx_SET (new_out, x));
16509 x = gen_rtx_NOT (wmode, new_out);
16510 emit_insn (gen_rtx_SET (new_out, x));
16511 break;
16512
16513 case MINUS:
16514 if (CONST_INT_P (value))
16515 {
16516 value = GEN_INT (-INTVAL (value));
16517 code = PLUS;
16518 }
16519 /* Fall through. */
16520
16521 default:
16522 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16523 emit_insn (gen_rtx_SET (new_out, x));
16524 break;
16525 }
16526
16527 aarch64_emit_store_exclusive (mode, cond, mem,
16528 gen_lowpart (mode, new_out), model_rtx);
16529
16530 if (aarch64_track_speculation)
16531 {
16532 /* Emit an explicit compare instruction, so that we can correctly
16533 track the condition codes. */
16534 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16535 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16536 }
16537 else
16538 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16539
16540 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16541 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16542 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16543
16544 /* Emit any final barrier needed for a __sync operation. */
16545 if (is_sync)
16546 aarch64_emit_post_barrier (model);
16547 }
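
/* As an illustration, a 64-bit SEQ_CST __atomic_fetch_add is split into
   roughly the following loop (registers are illustrative only):

	.loop:
	ldaxr	x1, [x0]		// old_out
	add	x2, x1, x3		// new_out = old_out + value
	stlxr	w4, x2, [x0]		// cond = store-exclusive status
	cbnz	w4, .loop  */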
16548
16549 static void
16550 aarch64_init_libfuncs (void)
16551 {
16552 /* Half-precision float operations. The compiler handles all operations
16553 with NULL libfuncs by converting to SFmode. */
16554
16555 /* Conversions. */
16556 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16557 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16558
16559 /* Arithmetic. */
16560 set_optab_libfunc (add_optab, HFmode, NULL);
16561 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16562 set_optab_libfunc (smul_optab, HFmode, NULL);
16563 set_optab_libfunc (neg_optab, HFmode, NULL);
16564 set_optab_libfunc (sub_optab, HFmode, NULL);
16565
16566 /* Comparisons. */
16567 set_optab_libfunc (eq_optab, HFmode, NULL);
16568 set_optab_libfunc (ne_optab, HFmode, NULL);
16569 set_optab_libfunc (lt_optab, HFmode, NULL);
16570 set_optab_libfunc (le_optab, HFmode, NULL);
16571 set_optab_libfunc (ge_optab, HFmode, NULL);
16572 set_optab_libfunc (gt_optab, HFmode, NULL);
16573 set_optab_libfunc (unord_optab, HFmode, NULL);
16574 }
16575
16576 /* Target hook for c_mode_for_suffix. */
16577 static machine_mode
16578 aarch64_c_mode_for_suffix (char suffix)
16579 {
16580 if (suffix == 'q')
16581 return TFmode;
16582
16583 return VOIDmode;
16584 }
16585
16586 /* We can only represent floating-point constants that fit in
16587 "quarter-precision" values.  These values are characterised by
16588 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16589 by:
16590
16591 (-1)^s * (n/16) * 2^r
16592
16593 Where:
16594 's' is the sign bit.
16595 'n' is an integer in the range 16 <= n <= 31.
16596 'r' is an integer in the range -3 <= r <= 4. */
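
/* For example, 1.0 = (16/16) * 2^0, 0.25 = (16/16) * 2^-2 and
   31.0 = (31/16) * 2^4 are representable, whereas 0.0, 32.0 and values
   needing more than four fractional mantissa bits (e.g. 1.03125) are not.  */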
16597
16598 /* Return true iff X can be represented as a quarter-precision
16599 floating point immediate operand.  Note, we cannot represent 0.0. */
16600 bool
16601 aarch64_float_const_representable_p (rtx x)
16602 {
16603 /* This represents our current view of how many bits
16604 make up the mantissa. */
16605 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16606 int exponent;
16607 unsigned HOST_WIDE_INT mantissa, mask;
16608 REAL_VALUE_TYPE r, m;
16609 bool fail;
16610
16611 if (!CONST_DOUBLE_P (x))
16612 return false;
16613
16614 if (GET_MODE (x) == VOIDmode
16615 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16616 return false;
16617
16618 r = *CONST_DOUBLE_REAL_VALUE (x);
16619
16620 /* We cannot represent infinities, NaNs or +/-zero. We won't
16621 know if we have +zero until we analyse the mantissa, but we
16622 can reject the other invalid values. */
16623 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16624 || REAL_VALUE_MINUS_ZERO (r))
16625 return false;
16626
16627 /* Extract exponent. */
16628 r = real_value_abs (&r);
16629 exponent = REAL_EXP (&r);
16630
16631 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
16632 highest (sign) bit, with a fixed binary point at bit point_pos.
16633 The low and high parts of the mantissa end up in the two halves of W.
16634 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16635 bits for the mantissa, this can fail (low bits will be lost). */
16636 real_ldexp (&m, &r, point_pos - exponent);
16637 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16638
16639 /* If the low part of the mantissa has bits set we cannot represent
16640 the value. */
16641 if (w.ulow () != 0)
16642 return false;
16643 /* We have rejected the lower HOST_WIDE_INT, so update our
16644 understanding of how many bits lie in the mantissa and
16645 look only at the high HOST_WIDE_INT. */
16646 mantissa = w.elt (1);
16647 point_pos -= HOST_BITS_PER_WIDE_INT;
16648
16649 /* We can only represent values with a mantissa of the form 1.xxxx. */
16650 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16651 if ((mantissa & mask) != 0)
16652 return false;
16653
16654 /* Having filtered unrepresentable values, we may now remove all
16655 but the highest 5 bits. */
16656 mantissa >>= point_pos - 5;
16657
16658 /* We cannot represent the value 0.0, so reject it. This is handled
16659 elsewhere. */
16660 if (mantissa == 0)
16661 return false;
16662
16663 /* Then, as bit 4 is always set, we can mask it off, leaving
16664 the mantissa in the range [0, 15]. */
16665 mantissa &= ~(1 << 4);
16666 gcc_assert (mantissa <= 15);
16667
16668 /* GCC internally does not use IEEE754-like encoding (where normalized
16669 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
16670 Our mantissa values are shifted 4 places to the left relative to
16671 normalized IEEE754, so we must adjust the exponent returned by REAL_EXP
16672 by 5 places to correct for GCC's representation. */
16673 exponent = 5 - exponent;
16674
16675 return (exponent >= 0 && exponent <= 7);
16676 }
16677
16678 /* Return the assembly string for an AdvSIMD MOVI, MVNI, ORR or BIC immediate
16679 instruction whose operand is CONST_VECTOR, a vector of total width WIDTH bits.
16680 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
16681 char*
16682 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16683 enum simd_immediate_check which)
16684 {
16685 bool is_valid;
16686 static char templ[40];
16687 const char *mnemonic;
16688 const char *shift_op;
16689 unsigned int lane_count = 0;
16690 char element_char;
16691
16692 struct simd_immediate_info info;
16693
16694 /* This will return true to show const_vector is legal for use as either
16695 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16696 It will also update INFO to show how the immediate should be generated.
16697 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16698 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16699 gcc_assert (is_valid);
16700
16701 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16702 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16703
16704 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16705 {
16706 gcc_assert (info.insn == simd_immediate_info::MOV
16707 && info.u.mov.shift == 0);
16708 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16709 move immediate path. */
16710 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16711 info.u.mov.value = GEN_INT (0);
16712 else
16713 {
16714 const unsigned int buf_size = 20;
16715 char float_buf[buf_size] = {'\0'};
16716 real_to_decimal_for_mode (float_buf,
16717 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16718 buf_size, buf_size, 1, info.elt_mode);
16719
16720 if (lane_count == 1)
16721 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16722 else
16723 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16724 lane_count, element_char, float_buf);
16725 return templ;
16726 }
16727 }
16728
16729 gcc_assert (CONST_INT_P (info.u.mov.value));
16730
16731 if (which == AARCH64_CHECK_MOV)
16732 {
16733 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16734 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16735 ? "msl" : "lsl");
16736 if (lane_count == 1)
16737 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16738 mnemonic, UINTVAL (info.u.mov.value));
16739 else if (info.u.mov.shift)
16740 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16741 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16742 element_char, UINTVAL (info.u.mov.value), shift_op,
16743 info.u.mov.shift);
16744 else
16745 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16746 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16747 element_char, UINTVAL (info.u.mov.value));
16748 }
16749 else
16750 {
16751 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16752 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16753 if (info.u.mov.shift)
16754 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16755 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16756 element_char, UINTVAL (info.u.mov.value), "lsl",
16757 info.u.mov.shift);
16758 else
16759 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16760 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16761 element_char, UINTVAL (info.u.mov.value));
16762 }
16763 return templ;
16764 }
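
/* Typical strings produced by the function above, assuming a 128-bit vector
   of 32-bit elements (operand 0 is filled in later by final; the examples
   are illustrative rather than exhaustive):

	movi	v0.4s, 0x55, lsl 8	// AARCH64_CHECK_MOV, shifted
	mvni	v0.4s, 0x1		// inverted move
	orr	v0.4s, #255, lsl #8	// AARCH64_CHECK_ORR
	bic	v0.4s, #255, lsl #8	// AARCH64_CHECK_BIC  */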
16765
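/* Return the assembly string for moving scalar immediate IMMEDIATE of mode
   MODE, implemented as the equivalent Advanced SIMD vector move (see
   aarch64_output_simd_mov_immediate above).  */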
16766 char*
16767 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16768 {
16769
16770 /* If a floating-point number was passed and we want to use it in an
16771 integer mode, do the conversion to integer. */
16772 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16773 {
16774 unsigned HOST_WIDE_INT ival;
16775 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16776 gcc_unreachable ();
16777 immediate = gen_int_mode (ival, mode);
16778 }
16779
16780 machine_mode vmode;
16781 /* Use a 64-bit container for everything except DI/DF mode, where we use
16782 a 128-bit vector mode. */
16783 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16784
16785 vmode = aarch64_simd_container_mode (mode, width);
16786 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16787 return aarch64_output_simd_mov_immediate (v_op, width);
16788 }
16789
16790 /* Return the output string to use for moving immediate CONST_VECTOR
16791 into an SVE register. */
16792
16793 char *
16794 aarch64_output_sve_mov_immediate (rtx const_vector)
16795 {
16796 static char templ[40];
16797 struct simd_immediate_info info;
16798 char element_char;
16799
16800 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16801 gcc_assert (is_valid);
16802
16803 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16804
16805 machine_mode vec_mode = GET_MODE (const_vector);
16806 if (aarch64_sve_pred_mode_p (vec_mode))
16807 {
16808 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16809 if (info.insn == simd_immediate_info::MOV)
16810 {
16811 gcc_assert (info.u.mov.value == const0_rtx);
16812 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
16813 }
16814 else
16815 {
16816 gcc_assert (info.insn == simd_immediate_info::PTRUE);
16817 unsigned int total_bytes;
16818 if (info.u.pattern == AARCH64_SV_ALL
16819 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
16820 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
16821 total_bytes / GET_MODE_SIZE (info.elt_mode));
16822 else
16823 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
16824 svpattern_token (info.u.pattern));
16825 }
16826 return buf;
16827 }
16828
16829 if (info.insn == simd_immediate_info::INDEX)
16830 {
16831 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16832 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16833 element_char, INTVAL (info.u.index.base),
16834 INTVAL (info.u.index.step));
16835 return templ;
16836 }
16837
16838 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16839 {
16840 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16841 info.u.mov.value = GEN_INT (0);
16842 else
16843 {
16844 const int buf_size = 20;
16845 char float_buf[buf_size] = {};
16846 real_to_decimal_for_mode (float_buf,
16847 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16848 buf_size, buf_size, 1, info.elt_mode);
16849
16850 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16851 element_char, float_buf);
16852 return templ;
16853 }
16854 }
16855
16856 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16857 element_char, INTVAL (info.u.mov.value));
16858 return templ;
16859 }
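
/* Illustrative strings produced by the function above (operand 0 is filled
   in later by final; FP values are printed in GCC's decimal notation):

	pfalse	p0.b			// all-false predicate
	ptrue	p0.s, vl4		// all-true, constant-length vector
	ptrue	p0.s, all		// all-true, variable-length vector
	index	z0.s, #0, #1		// linear series
	mov	z0.b, #7		// integer immediate
	fmov	z0.s, #1.0e+0		// floating-point immediate  */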
16860
16861 /* Split operands into moves from op[1] + op[2] into op[0]. */
16862
16863 void
16864 aarch64_split_combinev16qi (rtx operands[3])
16865 {
16866 unsigned int dest = REGNO (operands[0]);
16867 unsigned int src1 = REGNO (operands[1]);
16868 unsigned int src2 = REGNO (operands[2]);
16869 machine_mode halfmode = GET_MODE (operands[1]);
16870 unsigned int halfregs = REG_NREGS (operands[1]);
16871 rtx destlo, desthi;
16872
16873 gcc_assert (halfmode == V16QImode);
16874
16875 if (src1 == dest && src2 == dest + halfregs)
16876 {
16877 /* No-op move. Can't split to nothing; emit something. */
16878 emit_note (NOTE_INSN_DELETED);
16879 return;
16880 }
16881
16882 /* Preserve register attributes for variable tracking. */
16883 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16884 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16885 GET_MODE_SIZE (halfmode));
16886
16887 /* Special case of reversed high/low parts. */
16888 if (reg_overlap_mentioned_p (operands[2], destlo)
16889 && reg_overlap_mentioned_p (operands[1], desthi))
16890 {
16891 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16892 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16893 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16894 }
16895 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16896 {
16897 /* Try to avoid unnecessary moves if part of the result
16898 is in the right place already. */
16899 if (src1 != dest)
16900 emit_move_insn (destlo, operands[1]);
16901 if (src2 != dest + halfregs)
16902 emit_move_insn (desthi, operands[2]);
16903 }
16904 else
16905 {
16906 if (src2 != dest + halfregs)
16907 emit_move_insn (desthi, operands[2]);
16908 if (src1 != dest)
16909 emit_move_insn (destlo, operands[1]);
16910 }
16911 }
16912
16913 /* vec_perm support. */
16914
16915 struct expand_vec_perm_d
16916 {
16917 rtx target, op0, op1;
16918 vec_perm_indices perm;
16919 machine_mode vmode;
16920 unsigned int vec_flags;
16921 bool one_vector_p;
16922 bool testing_p;
16923 };
16924
16925 /* Generate a variable permutation. */
16926
16927 static void
16928 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16929 {
16930 machine_mode vmode = GET_MODE (target);
16931 bool one_vector_p = rtx_equal_p (op0, op1);
16932
16933 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16934 gcc_checking_assert (GET_MODE (op0) == vmode);
16935 gcc_checking_assert (GET_MODE (op1) == vmode);
16936 gcc_checking_assert (GET_MODE (sel) == vmode);
16937 gcc_checking_assert (TARGET_SIMD);
16938
16939 if (one_vector_p)
16940 {
16941 if (vmode == V8QImode)
16942 {
16943 /* Expand the argument to a V16QI mode by duplicating it. */
16944 rtx pair = gen_reg_rtx (V16QImode);
16945 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16946 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16947 }
16948 else
16949 {
16950 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16951 }
16952 }
16953 else
16954 {
16955 rtx pair;
16956
16957 if (vmode == V8QImode)
16958 {
16959 pair = gen_reg_rtx (V16QImode);
16960 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16961 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16962 }
16963 else
16964 {
16965 pair = gen_reg_rtx (OImode);
16966 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16967 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16968 }
16969 }
16970 }
16971
16972 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16973 NELT is the number of elements in the vector. */
16974
16975 void
16976 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16977 unsigned int nelt)
16978 {
16979 machine_mode vmode = GET_MODE (target);
16980 bool one_vector_p = rtx_equal_p (op0, op1);
16981 rtx mask;
16982
16983 /* The TBL instruction does not use a modulo index, so we must take care
16984 of that ourselves. */
16985 mask = aarch64_simd_gen_const_vector_dup (vmode,
16986 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16987 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16988
16989 /* For big-endian, we also need to reverse the index within the vector
16990 (but not which vector). */
16991 if (BYTES_BIG_ENDIAN)
16992 {
16993 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16994 if (!one_vector_p)
16995 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16996 sel = expand_simple_binop (vmode, XOR, sel, mask,
16997 NULL, 0, OPTAB_LIB_WIDEN);
16998 }
16999 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17000 }
17001
17002 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17003
17004 static void
17005 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17006 {
17007 emit_insn (gen_rtx_SET (target,
17008 gen_rtx_UNSPEC (GET_MODE (target),
17009 gen_rtvec (2, op0, op1), code)));
17010 }
17011
17012 /* Expand an SVE vec_perm with the given operands. */
17013
17014 void
17015 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17016 {
17017 machine_mode data_mode = GET_MODE (target);
17018 machine_mode sel_mode = GET_MODE (sel);
17019 /* Enforced by the pattern condition. */
17020 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17021
17022 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17023 size of the two value vectors, i.e. the upper bits of the indices
17024 are effectively ignored. SVE TBL instead produces 0 for any
17025 out-of-range indices, so we need to modulo all the vec_perm indices
17026 to ensure they are all in range. */
17027 rtx sel_reg = force_reg (sel_mode, sel);
17028
17029 /* Check if the sel only references the first values vector. */
17030 if (GET_CODE (sel) == CONST_VECTOR
17031 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17032 {
17033 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17034 return;
17035 }
17036
17037 /* Check if the two values vectors are the same. */
17038 if (rtx_equal_p (op0, op1))
17039 {
17040 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17041 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17042 NULL, 0, OPTAB_DIRECT);
17043 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17044 return;
17045 }
17046
17047 /* Run TBL for each value vector and combine the results. */
17048
17049 rtx res0 = gen_reg_rtx (data_mode);
17050 rtx res1 = gen_reg_rtx (data_mode);
17051 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17052 if (GET_CODE (sel) != CONST_VECTOR
17053 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17054 {
17055 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17056 2 * nunits - 1);
17057 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17058 NULL, 0, OPTAB_DIRECT);
17059 }
17060 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17061 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17062 NULL, 0, OPTAB_DIRECT);
17063 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17064 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17065 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17066 else
17067 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17068 }
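
/* Schematically, the general two-input expansion above is:

	sel    = sel & (2 * nunits - 1)		// reduce indices modulo 2*nunits
	res0   = TBL (op0, sel)			// 0 for out-of-range lanes
	res1   = TBL (op1, sel - nunits)	// likewise
	target = res0 | res1

   using IOR for integer elements and UNSPEC_IORF for FP elements.  */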
17069
17070 /* Recognize patterns suitable for the TRN instructions. */
17071 static bool
17072 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17073 {
17074 HOST_WIDE_INT odd;
17075 poly_uint64 nelt = d->perm.length ();
17076 rtx out, in0, in1, x;
17077 machine_mode vmode = d->vmode;
17078
17079 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17080 return false;
17081
17082 /* Note that these are little-endian tests.
17083 We correct for big-endian later. */
17084 if (!d->perm[0].is_constant (&odd)
17085 || (odd != 0 && odd != 1)
17086 || !d->perm.series_p (0, 2, odd, 2)
17087 || !d->perm.series_p (1, 2, nelt + odd, 2))
17088 return false;
17089
17090 /* Success! */
17091 if (d->testing_p)
17092 return true;
17093
17094 in0 = d->op0;
17095 in1 = d->op1;
17096 /* We don't need a big-endian lane correction for SVE; see the comment
17097 at the head of aarch64-sve.md for details. */
17098 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17099 {
17100 x = in0, in0 = in1, in1 = x;
17101 odd = !odd;
17102 }
17103 out = d->target;
17104
17105 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17106 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17107 return true;
17108 }
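
/* For example, on a little-endian V4SI target the two-input permutations
   { 0, 4, 2, 6 } and { 1, 5, 3, 7 } are matched here and map to TRN1 and
   TRN2 respectively.  */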
17109
17110 /* Recognize patterns suitable for the UZP instructions. */
17111 static bool
17112 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17113 {
17114 HOST_WIDE_INT odd;
17115 rtx out, in0, in1, x;
17116 machine_mode vmode = d->vmode;
17117
17118 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17119 return false;
17120
17121 /* Note that these are little-endian tests.
17122 We correct for big-endian later. */
17123 if (!d->perm[0].is_constant (&odd)
17124 || (odd != 0 && odd != 1)
17125 || !d->perm.series_p (0, 1, odd, 2))
17126 return false;
17127
17128 /* Success! */
17129 if (d->testing_p)
17130 return true;
17131
17132 in0 = d->op0;
17133 in1 = d->op1;
17134 /* We don't need a big-endian lane correction for SVE; see the comment
17135 at the head of aarch64-sve.md for details. */
17136 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17137 {
17138 x = in0, in0 = in1, in1 = x;
17139 odd = !odd;
17140 }
17141 out = d->target;
17142
17143 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17144 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17145 return true;
17146 }
17147
17148 /* Recognize patterns suitable for the ZIP instructions. */
17149 static bool
17150 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17151 {
17152 unsigned int high;
17153 poly_uint64 nelt = d->perm.length ();
17154 rtx out, in0, in1, x;
17155 machine_mode vmode = d->vmode;
17156
17157 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17158 return false;
17159
17160 /* Note that these are little-endian tests.
17161 We correct for big-endian later. */
17162 poly_uint64 first = d->perm[0];
17163 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17164 || !d->perm.series_p (0, 2, first, 1)
17165 || !d->perm.series_p (1, 2, first + nelt, 1))
17166 return false;
17167 high = maybe_ne (first, 0U);
17168
17169 /* Success! */
17170 if (d->testing_p)
17171 return true;
17172
17173 in0 = d->op0;
17174 in1 = d->op1;
17175 /* We don't need a big-endian lane correction for SVE; see the comment
17176 at the head of aarch64-sve.md for details. */
17177 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17178 {
17179 x = in0, in0 = in1, in1 = x;
17180 high = !high;
17181 }
17182 out = d->target;
17183
17184 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17185 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17186 return true;
17187 }
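
/* For example, on a little-endian V4SI target the two-input permutations
   { 0, 4, 1, 5 } and { 2, 6, 3, 7 } are matched here and map to ZIP1 and
   ZIP2 respectively.  */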
17188
17189 /* Recognize patterns for the EXT insn. */
17190
17191 static bool
17192 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17193 {
17194 HOST_WIDE_INT location;
17195 rtx offset;
17196
17197 /* The first element always refers to the first vector.
17198 Check if the extracted indices are increasing by one. */
17199 if (d->vec_flags == VEC_SVE_PRED
17200 || !d->perm[0].is_constant (&location)
17201 || !d->perm.series_p (0, 1, location, 1))
17202 return false;
17203
17204 /* Success! */
17205 if (d->testing_p)
17206 return true;
17207
17208 /* The case where (location == 0) is a no-op for both big- and little-endian,
17209 and is removed by the mid-end at optimization levels -O1 and higher.
17210
17211 We don't need a big-endian lane correction for SVE; see the comment
17212 at the head of aarch64-sve.md for details. */
17213 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17214 {
17215 /* After setup, we want the high elements of the first vector (stored
17216 at the LSB end of the register), and the low elements of the second
17217 vector (stored at the MSB end of the register). So swap. */
17218 std::swap (d->op0, d->op1);
17219 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17220 to_constant () is safe since this is restricted to Advanced SIMD
17221 vectors. */
17222 location = d->perm.length ().to_constant () - location;
17223 }
17224
17225 offset = GEN_INT (location);
17226 emit_set_insn (d->target,
17227 gen_rtx_UNSPEC (d->vmode,
17228 gen_rtvec (3, d->op0, d->op1, offset),
17229 UNSPEC_EXT));
17230 return true;
17231 }
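
/* For example, on a little-endian V4SI target the two-input permutation
   { 1, 2, 3, 4 } is matched here and becomes an EXT with an offset of one
   element (four bytes).  */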
17232
17233 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17234 within each 64-bit, 32-bit or 16-bit granule. */
17235
17236 static bool
17237 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17238 {
17239 HOST_WIDE_INT diff;
17240 unsigned int i, size, unspec;
17241 machine_mode pred_mode;
17242
17243 if (d->vec_flags == VEC_SVE_PRED
17244 || !d->one_vector_p
17245 || !d->perm[0].is_constant (&diff))
17246 return false;
17247
17248 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17249 if (size == 8)
17250 {
17251 unspec = UNSPEC_REV64;
17252 pred_mode = VNx2BImode;
17253 }
17254 else if (size == 4)
17255 {
17256 unspec = UNSPEC_REV32;
17257 pred_mode = VNx4BImode;
17258 }
17259 else if (size == 2)
17260 {
17261 unspec = UNSPEC_REV16;
17262 pred_mode = VNx8BImode;
17263 }
17264 else
17265 return false;
17266
17267 unsigned int step = diff + 1;
17268 for (i = 0; i < step; ++i)
17269 if (!d->perm.series_p (i, step, diff - i, step))
17270 return false;
17271
17272 /* Success! */
17273 if (d->testing_p)
17274 return true;
17275
17276 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17277 if (d->vec_flags == VEC_SVE_DATA)
17278 {
17279 rtx pred = aarch64_ptrue_reg (pred_mode);
17280 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17281 UNSPEC_MERGE_PTRUE);
17282 }
17283 emit_set_insn (d->target, src);
17284 return true;
17285 }
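
/* For example, reversing the elements within each 64-bit chunk of a V8HI
   vector corresponds to the permutation { 3, 2, 1, 0, 7, 6, 5, 4 } and is
   matched here as REV64.  */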
17286
17287 /* Recognize patterns for the REV insn, which reverses elements within
17288 a full vector. */
17289
17290 static bool
17291 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17292 {
17293 poly_uint64 nelt = d->perm.length ();
17294
17295 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17296 return false;
17297
17298 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17299 return false;
17300
17301 /* Success! */
17302 if (d->testing_p)
17303 return true;
17304
17305 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17306 emit_set_insn (d->target, src);
17307 return true;
17308 }
17309
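/* Recognize patterns suitable for the DUP instruction, i.e. broadcasts of a
   single element of the first input vector.  */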
17310 static bool
17311 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17312 {
17313 rtx out = d->target;
17314 rtx in0;
17315 HOST_WIDE_INT elt;
17316 machine_mode vmode = d->vmode;
17317 rtx lane;
17318
17319 if (d->vec_flags == VEC_SVE_PRED
17320 || d->perm.encoding ().encoded_nelts () != 1
17321 || !d->perm[0].is_constant (&elt))
17322 return false;
17323
17324 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17325 return false;
17326
17327 /* Success! */
17328 if (d->testing_p)
17329 return true;
17330
17331 /* The generic preparation in aarch64_expand_vec_perm_const_1
17332 swaps the operand order and the permute indices if it finds
17333 d->perm[0] to be in the second operand. Thus, we can always
17334 use d->op0 and need not do any extra arithmetic to get the
17335 correct lane number. */
17336 in0 = d->op0;
17337 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17338
17339 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17340 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17341 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17342 return true;
17343 }
17344
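/* Try to implement D using an Advanced SIMD TBL instruction with a constant
   selector.  */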
17345 static bool
17346 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17347 {
17348 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17349 machine_mode vmode = d->vmode;
17350
17351 /* Make sure that the indices are constant. */
17352 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17353 for (unsigned int i = 0; i < encoded_nelts; ++i)
17354 if (!d->perm[i].is_constant ())
17355 return false;
17356
17357 if (d->testing_p)
17358 return true;
17359
17360 /* Generic code will try constant permutation twice: once with the
17361 original mode and again with the elements lowered to QImode.
17362 So wait and don't do the selector expansion ourselves. */
17363 if (vmode != V8QImode && vmode != V16QImode)
17364 return false;
17365
17366 /* to_constant is safe since this routine is specific to Advanced SIMD
17367 vectors. */
17368 unsigned int nelt = d->perm.length ().to_constant ();
17369 for (unsigned int i = 0; i < nelt; ++i)
17370 /* If big-endian and two vectors, we end up with a weird mixed-endian
17371 mode on NEON. Reverse the index within each word but not the word
17372 itself. to_constant is safe because we checked is_constant above. */
17373 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17374 ? d->perm[i].to_constant () ^ (nelt - 1)
17375 : d->perm[i].to_constant ());
17376
17377 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17378 sel = force_reg (vmode, sel);
17379
17380 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17381 return true;
17382 }
17383
17384 /* Try to implement D using an SVE TBL instruction. */
17385
17386 static bool
17387 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17388 {
17389 unsigned HOST_WIDE_INT nelt;
17390
17391 /* Permuting two variable-length vectors could overflow the
17392 index range. */
17393 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17394 return false;
17395
17396 if (d->testing_p)
17397 return true;
17398
17399 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17400 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17401 if (d->one_vector_p)
17402 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17403 else
17404 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17405 return true;
17406 }
17407
17408 static bool
17409 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17410 {
17411 /* The pattern matching functions above are written to look for a small
17412 number to begin the sequence (0, 1, N/2). If we begin with an index
17413 from the second operand, we can swap the operands. */
17414 poly_int64 nelt = d->perm.length ();
17415 if (known_ge (d->perm[0], nelt))
17416 {
17417 d->perm.rotate_inputs (1);
17418 std::swap (d->op0, d->op1);
17419 }
17420
17421 if ((d->vec_flags == VEC_ADVSIMD
17422 || d->vec_flags == VEC_SVE_DATA
17423 || d->vec_flags == VEC_SVE_PRED)
17424 && known_gt (nelt, 1))
17425 {
17426 if (aarch64_evpc_rev_local (d))
17427 return true;
17428 else if (aarch64_evpc_rev_global (d))
17429 return true;
17430 else if (aarch64_evpc_ext (d))
17431 return true;
17432 else if (aarch64_evpc_dup (d))
17433 return true;
17434 else if (aarch64_evpc_zip (d))
17435 return true;
17436 else if (aarch64_evpc_uzp (d))
17437 return true;
17438 else if (aarch64_evpc_trn (d))
17439 return true;
17440 if (d->vec_flags == VEC_SVE_DATA)
17441 return aarch64_evpc_sve_tbl (d);
17442 else if (d->vec_flags == VEC_ADVSIMD)
17443 return aarch64_evpc_tbl (d);
17444 }
17445 return false;
17446 }
17447
17448 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17449
17450 static bool
17451 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17452 rtx op1, const vec_perm_indices &sel)
17453 {
17454 struct expand_vec_perm_d d;
17455
17456 /* Check whether the mask can be applied to a single vector. */
17457 if (sel.ninputs () == 1
17458 || (op0 && rtx_equal_p (op0, op1)))
17459 d.one_vector_p = true;
17460 else if (sel.all_from_input_p (0))
17461 {
17462 d.one_vector_p = true;
17463 op1 = op0;
17464 }
17465 else if (sel.all_from_input_p (1))
17466 {
17467 d.one_vector_p = true;
17468 op0 = op1;
17469 }
17470 else
17471 d.one_vector_p = false;
17472
17473 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17474 sel.nelts_per_input ());
17475 d.vmode = vmode;
17476 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17477 d.target = target;
17478 d.op0 = op0;
17479 d.op1 = op1;
17480 d.testing_p = !target;
17481
17482 if (!d.testing_p)
17483 return aarch64_expand_vec_perm_const_1 (&d);
17484
17485 rtx_insn *last = get_last_insn ();
17486 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17487 gcc_assert (last == get_last_insn ());
17488
17489 return ret;
17490 }
17491
17492 /* Generate a byte permute mask for a register of mode MODE,
17493 which has NUNITS units. */
17494
17495 rtx
17496 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17497 {
17498 /* We have to reverse each vector because we don't have
17499 a permuted load that can reverse-load according to ABI rules. */
17500 rtx mask;
17501 rtvec v = rtvec_alloc (16);
17502 unsigned int i, j;
17503 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17504
17505 gcc_assert (BYTES_BIG_ENDIAN);
17506 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17507
17508 for (i = 0; i < nunits; i++)
17509 for (j = 0; j < usize; j++)
17510 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17511 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17512 return force_reg (V16QImode, mask);
17513 }
17514
17515 /* Return true if X is a valid second operand for the SVE instruction
17516 that implements integer comparison OP_CODE. */
17517
17518 static bool
17519 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17520 {
17521 if (register_operand (x, VOIDmode))
17522 return true;
17523
17524 switch (op_code)
17525 {
17526 case LTU:
17527 case LEU:
17528 case GEU:
17529 case GTU:
17530 return aarch64_sve_cmp_immediate_p (x, false);
17531 case LT:
17532 case LE:
17533 case GE:
17534 case GT:
17535 case NE:
17536 case EQ:
17537 return aarch64_sve_cmp_immediate_p (x, true);
17538 default:
17539 gcc_unreachable ();
17540 }
17541 }
17542
17543 /* Use predicated SVE instructions to implement the equivalent of:
17544
17545 (set TARGET OP)
17546
17547 given that PTRUE is an all-true predicate of the appropriate mode. */
17548
17549 static void
17550 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17551 {
17552 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17553 gen_rtvec (2, ptrue, op),
17554 UNSPEC_MERGE_PTRUE);
17555 rtx_insn *insn = emit_set_insn (target, unspec);
17556 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17557 }
17558
17559 /* Likewise, but also clobber the condition codes. */
17560
17561 static void
17562 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17563 {
17564 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17565 gen_rtvec (2, ptrue, op),
17566 UNSPEC_MERGE_PTRUE);
17567 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17568 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17569 }
17570
17571 /* Return the UNSPEC_COND_* code for comparison CODE. */
17572
17573 static unsigned int
17574 aarch64_unspec_cond_code (rtx_code code)
17575 {
17576 switch (code)
17577 {
17578 case NE:
17579 return UNSPEC_COND_FCMNE;
17580 case EQ:
17581 return UNSPEC_COND_FCMEQ;
17582 case LT:
17583 return UNSPEC_COND_FCMLT;
17584 case GT:
17585 return UNSPEC_COND_FCMGT;
17586 case LE:
17587 return UNSPEC_COND_FCMLE;
17588 case GE:
17589 return UNSPEC_COND_FCMGE;
17590 default:
17591 gcc_unreachable ();
17592 }
17593 }
17594
17595 /* Emit:
17596
17597 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17598
17599 where <X> is the operation associated with comparison CODE. This form
17600 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17601 semantics, such as when PRED might not be all-true and when comparing
17602 inactive lanes could have side effects. */
17603
17604 static void
17605 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17606 rtx pred, rtx op0, rtx op1)
17607 {
17608 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17609 gen_rtvec (3, pred, op0, op1),
17610 aarch64_unspec_cond_code (code));
17611 emit_set_insn (target, unspec);
17612 }
17613
17614 /* Expand an SVE integer comparison using the SVE equivalent of:
17615
17616 (set TARGET (CODE OP0 OP1)). */
17617
17618 void
17619 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17620 {
17621 machine_mode pred_mode = GET_MODE (target);
17622 machine_mode data_mode = GET_MODE (op0);
17623
17624 if (!aarch64_sve_cmp_operand_p (code, op1))
17625 op1 = force_reg (data_mode, op1);
17626
17627 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17628 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17629 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17630 }
17631
17632 /* Emit the SVE equivalent of:
17633
17634 (set TMP1 (CODE1 OP0 OP1))
17635 (set TMP2 (CODE2 OP0 OP1))
17636 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17637
17638 PTRUE is an all-true predicate with the same mode as TARGET. */
17639
17640 static void
17641 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17642 rtx ptrue, rtx op0, rtx op1)
17643 {
17644 machine_mode pred_mode = GET_MODE (ptrue);
17645 rtx tmp1 = gen_reg_rtx (pred_mode);
17646 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17647 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17648 rtx tmp2 = gen_reg_rtx (pred_mode);
17649 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17650 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17651 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17652 }
17653
17654 /* Emit the SVE equivalent of:
17655
17656 (set TMP (CODE OP0 OP1))
17657 (set TARGET (not TMP))
17658
17659 PTRUE is an all-true predicate with the same mode as TARGET. */
17660
17661 static void
17662 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17663 rtx op0, rtx op1)
17664 {
17665 machine_mode pred_mode = GET_MODE (ptrue);
17666 rtx tmp = gen_reg_rtx (pred_mode);
17667 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17668 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17669 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17670 }
17671
17672 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17673
17674 (set TARGET (CODE OP0 OP1))
17675
17676 If CAN_INVERT_P is true, the caller can also handle inverted results;
17677 return true if the result is in fact inverted. */
17678
17679 bool
17680 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17681 rtx op0, rtx op1, bool can_invert_p)
17682 {
17683 machine_mode pred_mode = GET_MODE (target);
17684 machine_mode data_mode = GET_MODE (op0);
17685
17686 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17687 switch (code)
17688 {
17689 case UNORDERED:
17690 /* UNORDERED has no immediate form. */
17691 op1 = force_reg (data_mode, op1);
17692 /* fall through */
17693 case LT:
17694 case LE:
17695 case GT:
17696 case GE:
17697 case EQ:
17698 case NE:
17699 {
17700 /* There is native support for the comparison. */
17701 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17702 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17703 return false;
17704 }
17705
17706 case LTGT:
17707 /* This is a trapping operation (LT or GT). */
17708 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17709 return false;
17710
17711 case UNEQ:
17712 if (!flag_trapping_math)
17713 {
17714 /* This would trap for signaling NaNs. */
17715 op1 = force_reg (data_mode, op1);
17716 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17717 return false;
17718 }
17719 /* fall through */
17720 case UNLT:
17721 case UNLE:
17722 case UNGT:
17723 case UNGE:
17724 if (flag_trapping_math)
17725 {
17726 /* Work out which elements are ordered. */
17727 rtx ordered = gen_reg_rtx (pred_mode);
17728 op1 = force_reg (data_mode, op1);
17729 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17730
17731 /* Test the opposite condition for the ordered elements,
17732 then invert the result. */
17733 if (code == UNEQ)
17734 code = NE;
17735 else
17736 code = reverse_condition_maybe_unordered (code);
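/* For example, UNLT becomes GE and UNGE becomes LT, so the predicated
   comparison below only tests lanes already known to be ordered.  */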
17737 if (can_invert_p)
17738 {
17739 aarch64_emit_sve_predicated_cond (target, code,
17740 ordered, op0, op1);
17741 return true;
17742 }
17743 rtx tmp = gen_reg_rtx (pred_mode);
17744 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17745 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17746 return false;
17747 }
17748 break;
17749
17750 case ORDERED:
17751 /* ORDERED has no immediate form. */
17752 op1 = force_reg (data_mode, op1);
17753 break;
17754
17755 default:
17756 gcc_unreachable ();
17757 }
17758
17759 /* There is native support for the inverse comparison. */
17760 code = reverse_condition_maybe_unordered (code);
17761 if (can_invert_p)
17762 {
17763 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17764 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17765 return true;
17766 }
17767 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17768 return false;
17769 }
17770
17771 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17772 of the data being selected and CMP_MODE is the mode of the values being
17773 compared. */
17774
17775 void
17776 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17777 rtx *ops)
17778 {
17779 machine_mode pred_mode
17780 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17781 GET_MODE_SIZE (cmp_mode)).require ();
17782 rtx pred = gen_reg_rtx (pred_mode);
17783 if (FLOAT_MODE_P (cmp_mode))
17784 {
17785 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17786 ops[4], ops[5], true))
17787 std::swap (ops[1], ops[2]);
17788 }
17789 else
17790 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17791
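/* The UNSPEC_SEL below corresponds to the SVE SEL instruction: active
   lanes of PRED take their value from ops[1] and inactive lanes from
   ops[2].  */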
17792 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17793 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17794 }
17795
17796 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17797 true.  However, due to issues with register allocation it is preferable
17798 to avoid tying integer scalar and FP scalar modes.  Executing integer
17799 operations in general registers is better than treating them as scalar
17800 vector operations. This reduces latency and avoids redundant int<->FP
17801 moves. So tie modes if they are either the same class, or vector modes
17802 with other vector modes, vector structs or any scalar mode. */
17803
17804 static bool
17805 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17806 {
17807 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17808 return true;
17809
17810 /* We specifically want to allow elements of "structure" modes to
17811 be tieable to the structure. This more general condition allows
17812 other rarer situations too. The reason we don't extend this to
17813 predicate modes is that there are no predicate structure modes
17814 nor any specific instructions for extracting part of a predicate
17815 register. */
17816 if (aarch64_vector_data_mode_p (mode1)
17817 && aarch64_vector_data_mode_p (mode2))
17818 return true;
17819
17820 /* Also allow any scalar modes with vectors. */
17821 if (aarch64_vector_mode_supported_p (mode1)
17822 || aarch64_vector_mode_supported_p (mode2))
17823 return true;
17824
17825 return false;
17826 }
17827
17828 /* Return a new RTX holding the result of moving POINTER forward by
17829 AMOUNT bytes. */
17830
17831 static rtx
17832 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17833 {
17834 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17835
17836 return adjust_automodify_address (pointer, GET_MODE (pointer),
17837 next, amount);
17838 }
17839
17840 /* Return a new RTX holding the result of moving POINTER forward by the
17841 size of the mode it points to. */
17842
17843 static rtx
17844 aarch64_progress_pointer (rtx pointer)
17845 {
17846 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17847 }
17848
17849 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17850 MODE bytes. */
17851
17852 static void
17853 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17854 machine_mode mode)
17855 {
17856 rtx reg = gen_reg_rtx (mode);
17857
17858 /* "Cast" the pointers to the correct mode. */
17859 *src = adjust_address (*src, mode, 0);
17860 *dst = adjust_address (*dst, mode, 0);
17861 /* Emit the memcpy. */
17862 emit_move_insn (reg, *src);
17863 emit_move_insn (*dst, reg);
17864 /* Move the pointers forward. */
17865 *src = aarch64_progress_pointer (*src);
17866 *dst = aarch64_progress_pointer (*dst);
17867 }
17868
17869 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17870 we succeed, otherwise return false. */
17871
17872 bool
17873 aarch64_expand_cpymem (rtx *operands)
17874 {
17875 int n, mode_bits;
17876 rtx dst = operands[0];
17877 rtx src = operands[1];
17878 rtx base;
17879 machine_mode cur_mode = BLKmode, next_mode;
17880 bool speed_p = !optimize_function_for_size_p (cfun);
17881
17882 /* When optimizing for size, give a better estimate of the length of a
17883 memcpy call, but use the default otherwise.  Moves larger than 8 bytes
17884 will always require an even number of instructions.  And each
17885 operation requires both a load and a store, so divide the max number by 2. */
17886 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
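  /* For instance, when optimizing for speed this allows at most 8 moves;
     the size check below counts each whole 16-byte block as one move and
     any residual as two.  */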
17887
17888 /* We can't do anything smart if the amount to copy is not constant. */
17889 if (!CONST_INT_P (operands[2]))
17890 return false;
17891
17892 n = INTVAL (operands[2]);
17893
17894 /* Try to keep the number of instructions low. For all cases we will do at
17895 most two moves for the residual amount, since we'll always overlap the
17896 remainder. */
17897 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17898 return false;
17899
17900 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17901 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17902
17903 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17904 src = adjust_automodify_address (src, VOIDmode, base, 0);
17905
17906 /* Convert n to bits to make the rest of the code simpler. */
17907 n = n * BITS_PER_UNIT;
17908
17909 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17910 larger than TImode, but we should not use them for loads/stores here. */
17911 const int copy_limit = GET_MODE_BITSIZE (TImode);
17912
17913 while (n > 0)
17914 {
17915 /* Find the largest mode in which to do the copy without over-reading
17916 or over-writing.  */
17917 opt_scalar_int_mode mode_iter;
17918 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17919 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17920 cur_mode = mode_iter.require ();
17921
17922 gcc_assert (cur_mode != BLKmode);
17923
17924 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17925 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17926
17927 n -= mode_bits;
17928
17929 /* Do certain trailing copies as overlapping if it's going to be
17930 cheaper, i.e. if it needs fewer instructions.  For instance, for a
17931 15-byte copy it's more efficient to do two overlapping 8-byte copies
17932 than 8 + 4 + 2 + 1.  */
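	  /* For example, a 15-byte copy might end up as roughly:

	       ldr x1, [src]
	       str x1, [dst]
	       ldr x1, [src, 7]
	       str x1, [dst, 7]

	     where the second pair overlaps the first by one byte
	     (illustrative only; the exact registers and scheduling
	     will differ).  */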
17933 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17934 {
17935 next_mode = smallest_mode_for_size (n, MODE_INT);
17936 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17937 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17938 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17939 n = n_bits;
17940 }
17941 }
17942
17943 return true;
17944 }
17945
17946 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17947 SImode stores. Handle the case when the constant has identical
17948 bottom and top halves. This is beneficial when the two stores can be
17949 merged into an STP and we avoid synthesising potentially expensive
17950 immediates twice. Return true if such a split is possible. */
17951
17952 bool
17953 aarch64_split_dimode_const_store (rtx dst, rtx src)
17954 {
17955 rtx lo = gen_lowpart (SImode, src);
17956 rtx hi = gen_highpart_mode (SImode, DImode, src);
17957
17958 bool size_p = optimize_function_for_size_p (cfun);
17959
17960 if (!rtx_equal_p (lo, hi))
17961 return false;
17962
17963 unsigned int orig_cost
17964 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17965 unsigned int lo_cost
17966 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17967
17968 /* We want to transform:
17969 MOV x1, 49370
17970 MOVK x1, 0x140, lsl 16
17971 MOVK x1, 0xc0da, lsl 32
17972 MOVK x1, 0x140, lsl 48
17973 STR x1, [x0]
17974 into:
17975 MOV w1, 49370
17976 MOVK w1, 0x140, lsl 16
17977 STP w1, w1, [x0]
17978 So we want to perform this only when we save two instructions
17979 or more. When optimizing for size, however, accept any code size
17980 savings we can. */
17981 if (size_p && orig_cost <= lo_cost)
17982 return false;
17983
17984 if (!size_p
17985 && (orig_cost <= lo_cost + 1))
17986 return false;
17987
17988 rtx mem_lo = adjust_address (dst, SImode, 0);
17989 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17990 return false;
17991
17992 rtx tmp_reg = gen_reg_rtx (SImode);
17993 aarch64_expand_mov_immediate (tmp_reg, lo);
17994 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17995 /* Don't emit an explicit store pair as this may not be always profitable.
17996 Let the sched-fusion logic decide whether to merge them. */
17997 emit_move_insn (mem_lo, tmp_reg);
17998 emit_move_insn (mem_hi, tmp_reg);
17999
18000 return true;
18001 }
18002
18003 /* Generate RTL for a conditional branch with rtx comparison CODE in
18004 mode CC_MODE. The destination of the unlikely conditional branch
18005 is LABEL_REF. */
18006
18007 void
18008 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18009 rtx label_ref)
18010 {
18011 rtx x;
18012 x = gen_rtx_fmt_ee (code, VOIDmode,
18013 gen_rtx_REG (cc_mode, CC_REGNUM),
18014 const0_rtx);
18015
18016 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18017 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18018 pc_rtx);
18019 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18020 }
18021
18022 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18023
18024 OP1 represents the TImode destination operand 1
18025 OP2 represents the TImode destination operand 2
18026 LOW_DEST represents the low half (DImode) of TImode operand 0
18027 LOW_IN1 represents the low half (DImode) of TImode operand 1
18028 LOW_IN2 represents the low half (DImode) of TImode operand 2
18029 HIGH_DEST represents the high half (DImode) of TImode operand 0
18030 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18031 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18032
18033 void
18034 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18035 rtx *low_in1, rtx *low_in2,
18036 rtx *high_dest, rtx *high_in1,
18037 rtx *high_in2)
18038 {
18039 *low_dest = gen_reg_rtx (DImode);
18040 *low_in1 = gen_lowpart (DImode, op1);
18041 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18042 subreg_lowpart_offset (DImode, TImode));
18043 *high_dest = gen_reg_rtx (DImode);
18044 *high_in1 = gen_highpart (DImode, op1);
18045 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18046 subreg_highpart_offset (DImode, TImode));
18047 }
18048
18049 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18050
18051 This function differs from 'aarch64_addti_scratch_regs' in that
18052 OP1 can be an immediate constant (zero). We must call
18053 subreg_highpart_offset with DImode and TImode arguments, otherwise
18054 VOIDmode will be used for the const_int which generates an internal
18055 error from subreg_size_highpart_offset which does not expect a size of zero.
18056
18057 OP1 represents the TImode destination operand 1
18058 OP2 represents the TImode destination operand 2
18059 LOW_DEST represents the low half (DImode) of TImode operand 0
18060 LOW_IN1 represents the low half (DImode) of TImode operand 1
18061 LOW_IN2 represents the low half (DImode) of TImode operand 2
18062 HIGH_DEST represents the high half (DImode) of TImode operand 0
18063 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18064 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18065
18066
18067 void
18068 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18069 rtx *low_in1, rtx *low_in2,
18070 rtx *high_dest, rtx *high_in1,
18071 rtx *high_in2)
18072 {
18073 *low_dest = gen_reg_rtx (DImode);
18074 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18075 subreg_lowpart_offset (DImode, TImode));
18076
18077 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18078 subreg_lowpart_offset (DImode, TImode));
18079 *high_dest = gen_reg_rtx (DImode);
18080
18081 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18082 subreg_highpart_offset (DImode, TImode));
18083 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18084 subreg_highpart_offset (DImode, TImode));
18085 }
18086
18087 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18088
18089 OP0 represents the TImode destination operand 0
18090 LOW_DEST represents the low half (DImode) of TImode operand 0
18091 LOW_IN1 represents the low half (DImode) of TImode operand 1
18092 LOW_IN2 represents the low half (DImode) of TImode operand 2
18093 HIGH_DEST represents the high half (DImode) of TImode operand 0
18094 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18095 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18096 UNSIGNED_P is true if the operation is being performed on unsigned
18097 values. */
18098 void
18099 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18100 rtx low_in2, rtx high_dest, rtx high_in1,
18101 rtx high_in2, bool unsigned_p)
18102 {
18103 if (low_in2 == const0_rtx)
18104 {
18105 low_dest = low_in1;
18106 high_in2 = force_reg (DImode, high_in2);
18107 if (unsigned_p)
18108 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18109 else
18110 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18111 }
18112 else
18113 {
18114 if (CONST_INT_P (low_in2))
18115 {
18116 high_in2 = force_reg (DImode, high_in2);
18117 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18118 GEN_INT (-INTVAL (low_in2))));
18119 }
18120 else
18121 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18122
18123 if (unsigned_p)
18124 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18125 else
18126 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18127 }
18128
18129 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18130 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18131
18132 }
18133
18134 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18135
18136 static unsigned HOST_WIDE_INT
18137 aarch64_asan_shadow_offset (void)
18138 {
18139 if (TARGET_ILP32)
18140 return (HOST_WIDE_INT_1 << 29);
18141 else
18142 return (HOST_WIDE_INT_1 << 36);
18143 }
18144
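/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain, i.e. TREEOP0 CODE TREEOP1.  The preparation
   insns are recorded in *PREP_SEQ and the comparison itself in *GEN_SEQ.
   Return an rtx describing the resulting CC register state, or NULL_RTX
   if the comparison cannot be handled.  */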
18145 static rtx
18146 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18147 int code, tree treeop0, tree treeop1)
18148 {
18149 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18150 rtx op0, op1;
18151 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18152 insn_code icode;
18153 struct expand_operand ops[4];
18154
18155 start_sequence ();
18156 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18157
18158 op_mode = GET_MODE (op0);
18159 if (op_mode == VOIDmode)
18160 op_mode = GET_MODE (op1);
18161
18162 switch (op_mode)
18163 {
18164 case E_QImode:
18165 case E_HImode:
18166 case E_SImode:
18167 cmp_mode = SImode;
18168 icode = CODE_FOR_cmpsi;
18169 break;
18170
18171 case E_DImode:
18172 cmp_mode = DImode;
18173 icode = CODE_FOR_cmpdi;
18174 break;
18175
18176 case E_SFmode:
18177 cmp_mode = SFmode;
18178 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18179 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18180 break;
18181
18182 case E_DFmode:
18183 cmp_mode = DFmode;
18184 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18185 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18186 break;
18187
18188 default:
18189 end_sequence ();
18190 return NULL_RTX;
18191 }
18192
18193 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18194 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18195 if (!op0 || !op1)
18196 {
18197 end_sequence ();
18198 return NULL_RTX;
18199 }
18200 *prep_seq = get_insns ();
18201 end_sequence ();
18202
18203 create_fixed_operand (&ops[0], op0);
18204 create_fixed_operand (&ops[1], op1);
18205
18206 start_sequence ();
18207 if (!maybe_expand_insn (icode, 2, ops))
18208 {
18209 end_sequence ();
18210 return NULL_RTX;
18211 }
18212 *gen_seq = get_insns ();
18213 end_sequence ();
18214
18215 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18216 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18217 }
18218
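/* Implement TARGET_GEN_CCMP_NEXT.  Expand a further comparison
   (TREEOP0 CMP_CODE TREEOP1) that is chained onto PREV, the CC result of
   an earlier comparison, with BIT_CODE saying whether the two tests are
   combined with AND or IOR.  Return an rtx describing the combined CC
   result, or NULL_RTX on failure.

   As a rough illustration, for something like (a == 0 && b > 42) on
   integer operands the two hooks together build a sequence along the
   lines of:

     cmp   w0, 0
     ccmp  w1, 42, #<nzcv>, eq
     b.gt  ...

   (illustrative only; the exact NZCV immediate and conditions come from
   aarch64_get_condition_code_1 and the surrounding expansion).  */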
18219 static rtx
18220 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18221 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18222 {
18223 rtx op0, op1, target;
18224 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18225 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18226 insn_code icode;
18227 struct expand_operand ops[6];
18228 int aarch64_cond;
18229
18230 push_to_sequence (*prep_seq);
18231 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18232
18233 op_mode = GET_MODE (op0);
18234 if (op_mode == VOIDmode)
18235 op_mode = GET_MODE (op1);
18236
18237 switch (op_mode)
18238 {
18239 case E_QImode:
18240 case E_HImode:
18241 case E_SImode:
18242 cmp_mode = SImode;
18243 icode = CODE_FOR_ccmpsi;
18244 break;
18245
18246 case E_DImode:
18247 cmp_mode = DImode;
18248 icode = CODE_FOR_ccmpdi;
18249 break;
18250
18251 case E_SFmode:
18252 cmp_mode = SFmode;
18253 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18254 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18255 break;
18256
18257 case E_DFmode:
18258 cmp_mode = DFmode;
18259 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18260 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18261 break;
18262
18263 default:
18264 end_sequence ();
18265 return NULL_RTX;
18266 }
18267
18268 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18269 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18270 if (!op0 || !op1)
18271 {
18272 end_sequence ();
18273 return NULL_RTX;
18274 }
18275 *prep_seq = get_insns ();
18276 end_sequence ();
18277
18278 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18279 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18280
18281 if (bit_code != AND)
18282 {
18283 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18284 GET_MODE (XEXP (prev, 0))),
18285 VOIDmode, XEXP (prev, 0), const0_rtx);
18286 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18287 }
18288
18289 create_fixed_operand (&ops[0], XEXP (prev, 0));
18290 create_fixed_operand (&ops[1], target);
18291 create_fixed_operand (&ops[2], op0);
18292 create_fixed_operand (&ops[3], op1);
18293 create_fixed_operand (&ops[4], prev);
18294 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18295
18296 push_to_sequence (*gen_seq);
18297 if (!maybe_expand_insn (icode, 6, ops))
18298 {
18299 end_sequence ();
18300 return NULL_RTX;
18301 }
18302
18303 *gen_seq = get_insns ();
18304 end_sequence ();
18305
18306 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18307 }
18308
18309 #undef TARGET_GEN_CCMP_FIRST
18310 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18311
18312 #undef TARGET_GEN_CCMP_NEXT
18313 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18314
18315 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18316 instruction fusion of some sort. */
18317
18318 static bool
18319 aarch64_macro_fusion_p (void)
18320 {
18321 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18322 }
18323
18324
18325 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18326 should be kept together during scheduling. */
18327
18328 static bool
18329 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18330 {
18331 rtx set_dest;
18332 rtx prev_set = single_set (prev);
18333 rtx curr_set = single_set (curr);
18334 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
18335 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18336
18337 if (!aarch64_macro_fusion_p ())
18338 return false;
18339
18340 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18341 {
18342 /* We are trying to match:
18343 prev (mov) == (set (reg r0) (const_int imm16))
18344 curr (movk) == (set (zero_extract (reg r0)
18345 (const_int 16)
18346 (const_int 16))
18347 (const_int imm16_1)) */
18348
18349 set_dest = SET_DEST (curr_set);
18350
18351 if (GET_CODE (set_dest) == ZERO_EXTRACT
18352 && CONST_INT_P (SET_SRC (curr_set))
18353 && CONST_INT_P (SET_SRC (prev_set))
18354 && CONST_INT_P (XEXP (set_dest, 2))
18355 && INTVAL (XEXP (set_dest, 2)) == 16
18356 && REG_P (XEXP (set_dest, 0))
18357 && REG_P (SET_DEST (prev_set))
18358 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18359 {
18360 return true;
18361 }
18362 }
18363
18364 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18365 {
18366
18367 /* We're trying to match:
18368 prev (adrp) == (set (reg r1)
18369 (high (symbol_ref ("SYM"))))
18370 curr (add) == (set (reg r0)
18371 (lo_sum (reg r1)
18372 (symbol_ref ("SYM"))))
18373 Note that r0 need not necessarily be the same as r1, especially
18374 during pre-regalloc scheduling. */
18375
18376 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18377 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18378 {
18379 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18380 && REG_P (XEXP (SET_SRC (curr_set), 0))
18381 && REGNO (XEXP (SET_SRC (curr_set), 0))
18382 == REGNO (SET_DEST (prev_set))
18383 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18384 XEXP (SET_SRC (curr_set), 1)))
18385 return true;
18386 }
18387 }
18388
18389 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18390 {
18391
18392 /* We're trying to match:
18393 prev (movk) == (set (zero_extract (reg r0)
18394 (const_int 16)
18395 (const_int 32))
18396 (const_int imm16_1))
18397 curr (movk) == (set (zero_extract (reg r0)
18398 (const_int 16)
18399 (const_int 48))
18400 (const_int imm16_2)) */
18401
18402 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18403 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18404 && REG_P (XEXP (SET_DEST (prev_set), 0))
18405 && REG_P (XEXP (SET_DEST (curr_set), 0))
18406 && REGNO (XEXP (SET_DEST (prev_set), 0))
18407 == REGNO (XEXP (SET_DEST (curr_set), 0))
18408 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18409 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18410 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18411 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18412 && CONST_INT_P (SET_SRC (prev_set))
18413 && CONST_INT_P (SET_SRC (curr_set)))
18414 return true;
18415
18416 }
18417 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18418 {
18419 /* We're trying to match:
18420 prev (adrp) == (set (reg r0)
18421 (high (symbol_ref ("SYM"))))
18422 curr (ldr) == (set (reg r1)
18423 (mem (lo_sum (reg r0)
18424 (symbol_ref ("SYM")))))
18425 or
18426 curr (ldr) == (set (reg r1)
18427 (zero_extend (mem
18428 (lo_sum (reg r0)
18429 (symbol_ref ("SYM")))))) */
18430 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18431 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18432 {
18433 rtx curr_src = SET_SRC (curr_set);
18434
18435 if (GET_CODE (curr_src) == ZERO_EXTEND)
18436 curr_src = XEXP (curr_src, 0);
18437
18438 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18439 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18440 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18441 == REGNO (SET_DEST (prev_set))
18442 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18443 XEXP (SET_SRC (prev_set), 0)))
18444 return true;
18445 }
18446 }
18447
18448 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18449 && any_condjump_p (curr))
18450 {
18451 unsigned int condreg1, condreg2;
18452 rtx cc_reg_1;
18453 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18454 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18455
18456 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18457 && prev
18458 && modified_in_p (cc_reg_1, prev))
18459 {
18460 enum attr_type prev_type = get_attr_type (prev);
18461
18462 /* FIXME: this misses some instructions that ThunderX considers
18463 simple arithmetic instructions; simple shifts are missed here. */
18464 if (prev_type == TYPE_ALUS_SREG
18465 || prev_type == TYPE_ALUS_IMM
18466 || prev_type == TYPE_LOGICS_REG
18467 || prev_type == TYPE_LOGICS_IMM)
18468 return true;
18469 }
18470 }
18471
18472 if (prev_set
18473 && curr_set
18474 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18475 && any_condjump_p (curr))
18476 {
18477 /* We're trying to match:
18478 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18479 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18480 (const_int 0))
18481 (label_ref ("SYM"))
18482 (pc)) */
18483 if (SET_DEST (curr_set) == (pc_rtx)
18484 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18485 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18486 && REG_P (SET_DEST (prev_set))
18487 && REGNO (SET_DEST (prev_set))
18488 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18489 {
18490 /* Fuse ALU operations followed by conditional branch instruction. */
18491 switch (get_attr_type (prev))
18492 {
18493 case TYPE_ALU_IMM:
18494 case TYPE_ALU_SREG:
18495 case TYPE_ADC_REG:
18496 case TYPE_ADC_IMM:
18497 case TYPE_ADCS_REG:
18498 case TYPE_ADCS_IMM:
18499 case TYPE_LOGIC_REG:
18500 case TYPE_LOGIC_IMM:
18501 case TYPE_CSEL:
18502 case TYPE_ADR:
18503 case TYPE_MOV_IMM:
18504 case TYPE_SHIFT_REG:
18505 case TYPE_SHIFT_IMM:
18506 case TYPE_BFM:
18507 case TYPE_RBIT:
18508 case TYPE_REV:
18509 case TYPE_EXTEND:
18510 return true;
18511
18512 default:;
18513 }
18514 }
18515 }
18516
18517 return false;
18518 }
18519
18520 /* Return true iff the instruction fusion described by OP is enabled. */
18521
18522 bool
18523 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18524 {
18525 return (aarch64_tune_params.fusible_ops & op) != 0;
18526 }
18527
18528 /* If MEM is in the form of [base+offset], extract the two parts
18529 of the address into BASE and OFFSET, otherwise return false
18530 after clearing BASE and OFFSET. */
18531
18532 bool
18533 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18534 {
18535 rtx addr;
18536
18537 gcc_assert (MEM_P (mem));
18538
18539 addr = XEXP (mem, 0);
18540
18541 if (REG_P (addr))
18542 {
18543 *base = addr;
18544 *offset = const0_rtx;
18545 return true;
18546 }
18547
18548 if (GET_CODE (addr) == PLUS
18549 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18550 {
18551 *base = XEXP (addr, 0);
18552 *offset = XEXP (addr, 1);
18553 return true;
18554 }
18555
18556 *base = NULL_RTX;
18557 *offset = NULL_RTX;
18558
18559 return false;
18560 }
18561
18562 /* Types for scheduling fusion. */
18563 enum sched_fusion_type
18564 {
18565 SCHED_FUSION_NONE = 0,
18566 SCHED_FUSION_LD_SIGN_EXTEND,
18567 SCHED_FUSION_LD_ZERO_EXTEND,
18568 SCHED_FUSION_LD,
18569 SCHED_FUSION_ST,
18570 SCHED_FUSION_NUM
18571 };
18572
18573 /* If INSN is a load or store with an address in the form of [base+offset],
18574 extract the two parts into BASE and OFFSET.  Return the scheduling
18575 fusion type of this INSN.  */
18576
18577 static enum sched_fusion_type
18578 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18579 {
18580 rtx x, dest, src;
18581 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18582
18583 gcc_assert (INSN_P (insn));
18584 x = PATTERN (insn);
18585 if (GET_CODE (x) != SET)
18586 return SCHED_FUSION_NONE;
18587
18588 src = SET_SRC (x);
18589 dest = SET_DEST (x);
18590
18591 machine_mode dest_mode = GET_MODE (dest);
18592
18593 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18594 return SCHED_FUSION_NONE;
18595
18596 if (GET_CODE (src) == SIGN_EXTEND)
18597 {
18598 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18599 src = XEXP (src, 0);
18600 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18601 return SCHED_FUSION_NONE;
18602 }
18603 else if (GET_CODE (src) == ZERO_EXTEND)
18604 {
18605 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18606 src = XEXP (src, 0);
18607 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18608 return SCHED_FUSION_NONE;
18609 }
18610
18611 if (GET_CODE (src) == MEM && REG_P (dest))
18612 extract_base_offset_in_addr (src, base, offset);
18613 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18614 {
18615 fusion = SCHED_FUSION_ST;
18616 extract_base_offset_in_addr (dest, base, offset);
18617 }
18618 else
18619 return SCHED_FUSION_NONE;
18620
18621 if (*base == NULL_RTX || *offset == NULL_RTX)
18622 fusion = SCHED_FUSION_NONE;
18623
18624 return fusion;
18625 }
18626
18627 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18628
18629 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18630 and PRI are only calculated for these instructions.  For other instructions,
18631 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
18632 types of instruction fusion can be added by returning different priorities.
18633
18634 It's important that irrelevant instructions get the largest FUSION_PRI. */
18635
18636 static void
18637 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18638 int *fusion_pri, int *pri)
18639 {
18640 int tmp, off_val;
18641 rtx base, offset;
18642 enum sched_fusion_type fusion;
18643
18644 gcc_assert (INSN_P (insn));
18645
18646 tmp = max_pri - 1;
18647 fusion = fusion_load_store (insn, &base, &offset);
18648 if (fusion == SCHED_FUSION_NONE)
18649 {
18650 *pri = tmp;
18651 *fusion_pri = tmp;
18652 return;
18653 }
18654
18655 /* Set FUSION_PRI according to fusion type and base register. */
18656 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
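  /* Loads/stores of the same fusion type and base register therefore get
     the same FUSION_PRI, which is what groups them as fusion candidates.  */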
18657
18658 /* Calculate PRI. */
18659 tmp /= 2;
18660
18661 /* INSN with smaller offset goes first. */
18662 off_val = (int)(INTVAL (offset));
18663 if (off_val >= 0)
18664 tmp -= (off_val & 0xfffff);
18665 else
18666 tmp += ((- off_val) & 0xfffff);
18667
18668 *pri = tmp;
18669 return;
18670 }
18671
18672 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18673 Adjust priority of sha1h instructions so they are scheduled before
18674 other SHA1 instructions. */
18675
18676 static int
18677 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18678 {
18679 rtx x = PATTERN (insn);
18680
18681 if (GET_CODE (x) == SET)
18682 {
18683 x = SET_SRC (x);
18684
18685 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18686 return priority + 10;
18687 }
18688
18689 return priority;
18690 }
18691
18692 /* Given OPERANDS of consecutive load/store, check if we can merge
18693 them into ldp/stp. LOAD is true if they are load instructions.
18694 MODE is the mode of memory operands. */
18695
18696 bool
18697 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18698 machine_mode mode)
18699 {
18700 HOST_WIDE_INT offval_1, offval_2, msize;
18701 enum reg_class rclass_1, rclass_2;
18702 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18703
18704 if (load)
18705 {
18706 mem_1 = operands[1];
18707 mem_2 = operands[3];
18708 reg_1 = operands[0];
18709 reg_2 = operands[2];
18710 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18711 if (REGNO (reg_1) == REGNO (reg_2))
18712 return false;
18713 }
18714 else
18715 {
18716 mem_1 = operands[0];
18717 mem_2 = operands[2];
18718 reg_1 = operands[1];
18719 reg_2 = operands[3];
18720 }
18721
18722 /* The mems cannot be volatile. */
18723 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18724 return false;
18725
18726 /* If we have SImode and slow unaligned ldp,
18727 check that the alignment is at least 8 bytes.  */
18728 if (mode == SImode
18729 && (aarch64_tune_params.extra_tuning_flags
18730 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18731 && !optimize_size
18732 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18733 return false;
18734
18735 /* Check if the addresses are in the form of [base+offset]. */
18736 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18737 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18738 return false;
18739 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18740 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18741 return false;
18742
18743 /* Check if the bases are the same.  */
18744 if (!rtx_equal_p (base_1, base_2))
18745 return false;
18746
18747 /* The operands must be of the same size. */
18748 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18749 GET_MODE_SIZE (GET_MODE (mem_2))));
18750
18751 offval_1 = INTVAL (offset_1);
18752 offval_2 = INTVAL (offset_2);
18753 /* We should only be trying this for fixed-sized modes. There is no
18754 SVE LDP/STP instruction. */
18755 msize = GET_MODE_SIZE (mode).to_constant ();
18756 /* Check if the offsets are consecutive. */
18757 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18758 return false;
18759
18760 /* Check if the addresses are clobbered by the load.  */
18761 if (load)
18762 {
18763 if (reg_mentioned_p (reg_1, mem_1))
18764 return false;
18765
18766 /* In increasing order, the last load can clobber the address. */
18767 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18768 return false;
18769 }
18770
18771 /* One of the memory accesses must be a mempair operand.
18772 If it is not the first one, they need to be swapped by the
18773 peephole. */
18774 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18775 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18776 return false;
18777
18778 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18779 rclass_1 = FP_REGS;
18780 else
18781 rclass_1 = GENERAL_REGS;
18782
18783 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18784 rclass_2 = FP_REGS;
18785 else
18786 rclass_2 = GENERAL_REGS;
18787
18788 /* Check if the registers are of the same class.  */
18789 if (rclass_1 != rclass_2)
18790 return false;
18791
18792 return true;
18793 }
18794
18795 /* Given OPERANDS of consecutive load/store that can be merged,
18796 swap them if they are not in ascending order. */
18797 void
18798 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18799 {
18800 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18801 HOST_WIDE_INT offval_1, offval_2;
18802
18803 if (load)
18804 {
18805 mem_1 = operands[1];
18806 mem_2 = operands[3];
18807 }
18808 else
18809 {
18810 mem_1 = operands[0];
18811 mem_2 = operands[2];
18812 }
18813
18814 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18815 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18816
18817 offval_1 = INTVAL (offset_1);
18818 offval_2 = INTVAL (offset_2);
18819
18820 if (offval_1 > offval_2)
18821 {
18822 /* Irrespective of whether this is a load or a store,
18823 we do the same swap. */
18824 std::swap (operands[0], operands[2]);
18825 std::swap (operands[1], operands[3]);
18826 }
18827 }
18828
18829 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18830 comparison between the two. */
18831 int
18832 aarch64_host_wide_int_compare (const void *x, const void *y)
18833 {
18834 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18835 * ((const HOST_WIDE_INT *) y));
18836 }
18837
18838 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18839 other pointing to a REG rtx containing an offset, compare the offsets
18840 of the two pairs.
18841
18842 Return:
18843
18844 1 iff offset (X) > offset (Y)
18845 0 iff offset (X) == offset (Y)
18846 -1 iff offset (X) < offset (Y) */
18847 int
18848 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18849 {
18850 const rtx * operands_1 = (const rtx *) x;
18851 const rtx * operands_2 = (const rtx *) y;
18852 rtx mem_1, mem_2, base, offset_1, offset_2;
18853
18854 if (MEM_P (operands_1[0]))
18855 mem_1 = operands_1[0];
18856 else
18857 mem_1 = operands_1[1];
18858
18859 if (MEM_P (operands_2[0]))
18860 mem_2 = operands_2[0];
18861 else
18862 mem_2 = operands_2[1];
18863
18864 /* Extract the offsets. */
18865 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18866 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18867
18868 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18869
18870 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18871 }
18872
18873 /* Given OPERANDS of consecutive load/store, check if we can merge
18874 them into ldp/stp by adjusting the offset. LOAD is true if they
18875 are load instructions. MODE is the mode of memory operands.
18876
18877 Given below consecutive stores:
18878
18879 str w1, [xb, 0x100]
18880 str w1, [xb, 0x104]
18881 str w1, [xb, 0x108]
18882 str w1, [xb, 0x10c]
18883
18884 Though the offsets are out of the range supported by stp, we can
18885 still pair them after adjusting the offset, like:
18886
18887 add scratch, xb, 0x100
18888 stp w1, w1, [scratch]
18889 stp w1, w1, [scratch, 0x8]
18890
18891 The peephole patterns detecting this opportunity should guarantee
18892 the scratch register is available.  */
18893
18894 bool
18895 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18896 scalar_mode mode)
18897 {
18898 const int num_insns = 4;
18899 enum reg_class rclass;
18900 HOST_WIDE_INT offvals[num_insns], msize;
18901 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18902
18903 if (load)
18904 {
18905 for (int i = 0; i < num_insns; i++)
18906 {
18907 reg[i] = operands[2 * i];
18908 mem[i] = operands[2 * i + 1];
18909
18910 gcc_assert (REG_P (reg[i]));
18911 }
18912
18913 /* Do not attempt to merge the loads if the loads clobber each other. */
18914 for (int i = 0; i < 8; i += 2)
18915 for (int j = i + 2; j < 8; j += 2)
18916 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18917 return false;
18918 }
18919 else
18920 for (int i = 0; i < num_insns; i++)
18921 {
18922 mem[i] = operands[2 * i];
18923 reg[i] = operands[2 * i + 1];
18924 }
18925
18926 /* Skip if memory operand is by itself valid for ldp/stp. */
18927 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18928 return false;
18929
18930 for (int i = 0; i < num_insns; i++)
18931 {
18932 /* The mems cannot be volatile. */
18933 if (MEM_VOLATILE_P (mem[i]))
18934 return false;
18935
18936 /* Check if the addresses are in the form of [base+offset]. */
18937 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18938 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18939 return false;
18940 }
18941
18942 /* Check if the registers are of the same class.  */
18943 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18944 ? FP_REGS : GENERAL_REGS;
18945
18946 for (int i = 1; i < num_insns; i++)
18947 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18948 {
18949 if (rclass != FP_REGS)
18950 return false;
18951 }
18952 else
18953 {
18954 if (rclass != GENERAL_REGS)
18955 return false;
18956 }
18957
18958 /* Only the last register in the order in which they occur
18959 may be clobbered by the load. */
18960 if (rclass == GENERAL_REGS && load)
18961 for (int i = 0; i < num_insns - 1; i++)
18962 if (reg_mentioned_p (reg[i], mem[i]))
18963 return false;
18964
18965 /* Check if the bases are the same.  */
18966 for (int i = 0; i < num_insns - 1; i++)
18967 if (!rtx_equal_p (base[i], base[i + 1]))
18968 return false;
18969
18970 for (int i = 0; i < num_insns; i++)
18971 offvals[i] = INTVAL (offset[i]);
18972
18973 msize = GET_MODE_SIZE (mode);
18974
18975 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18976 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18977 aarch64_host_wide_int_compare);
18978
18979 if (!(offvals[1] == offvals[0] + msize
18980 && offvals[3] == offvals[2] + msize))
18981 return false;
18982
18983 /* Check that offsets are within range of each other. The ldp/stp
18984 instructions have 7 bit immediate offsets, so use 0x80. */
18985 if (offvals[2] - offvals[0] >= msize * 0x80)
18986 return false;
18987
18988 /* The offsets must be aligned with respect to each other. */
18989 if (offvals[0] % msize != offvals[2] % msize)
18990 return false;
18991
18992 /* If we have SImode and slow unaligned ldp,
18993 check that the alignment is at least 8 bytes.  */
18994 if (mode == SImode
18995 && (aarch64_tune_params.extra_tuning_flags
18996 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18997 && !optimize_size
18998 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18999 return false;
19000
19001 return true;
19002 }
19003
19004 /* Given OPERANDS of consecutive load/store, this function pairs them
19005 into LDP/STP after adjusting the offset. It depends on the fact
19006 that the operands can be sorted so the offsets are correct for STP.
19007 MODE is the mode of memory operands. CODE is the rtl operator
19008 which should be applied to all memory operands, it's SIGN_EXTEND,
19009 ZERO_EXTEND or UNKNOWN. */
19010
19011 bool
19012 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19013 scalar_mode mode, RTX_CODE code)
19014 {
19015 rtx base, offset_1, offset_3, t1, t2;
19016 rtx mem_1, mem_2, mem_3, mem_4;
19017 rtx temp_operands[8];
19018 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19019 stp_off_upper_limit, stp_off_lower_limit, msize;
19020
19021 /* We make changes on a copy as we may still bail out. */
19022 for (int i = 0; i < 8; i ++)
19023 temp_operands[i] = operands[i];
19024
19025 /* Sort the operands. */
19026 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19027
19028 /* Copy the memory operands so that if we have to bail for some
19029 reason the original addresses are unchanged. */
19030 if (load)
19031 {
19032 mem_1 = copy_rtx (temp_operands[1]);
19033 mem_2 = copy_rtx (temp_operands[3]);
19034 mem_3 = copy_rtx (temp_operands[5]);
19035 mem_4 = copy_rtx (temp_operands[7]);
19036 }
19037 else
19038 {
19039 mem_1 = copy_rtx (temp_operands[0]);
19040 mem_2 = copy_rtx (temp_operands[2]);
19041 mem_3 = copy_rtx (temp_operands[4]);
19042 mem_4 = copy_rtx (temp_operands[6]);
19043 gcc_assert (code == UNKNOWN);
19044 }
19045
19046 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19047 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19048 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19049 && offset_3 != NULL_RTX);
19050
19051 /* Adjust offset so it can fit in LDP/STP instruction. */
19052 msize = GET_MODE_SIZE (mode);
19053 stp_off_upper_limit = msize * (0x40 - 1);
19054 stp_off_lower_limit = - msize * 0x40;
19055
19056 off_val_1 = INTVAL (offset_1);
19057 off_val_3 = INTVAL (offset_3);
19058
19059 /* The base offset is optimally half way between the two STP/LDP offsets. */
19060 if (msize <= 4)
19061 base_off = (off_val_1 + off_val_3) / 2;
19062 else
19063 /* However, due to issues with negative LDP/STP offset generation for
19064 larger modes (DF, DI and vector modes), we must not use negative
19065 addresses smaller than 9 signed unadjusted bits can store.  This
19066 provides the most range in this case.  */
19067 base_off = off_val_1;
19068
19069 /* Adjust the base so that it is aligned with the addresses but still
19070 optimal. */
19071 if (base_off % msize != off_val_1 % msize)
19072 /* Fix the offset, bearing in mind we want to make it bigger not
19073 smaller. */
19074 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19075 else if (msize <= 4)
19076 /* The negative range of LDP/STP is one larger than the positive range. */
19077 base_off += msize;
19078
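  /* As a rough illustration, for the four 4-byte stores at xb+0x100 to
     xb+0x10c in the comment above aarch64_operands_adjust_ok_for_ldpstp,
     this picks a base offset of 0x108, giving STP offsets of -8 and 0.  */
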
19079 /* Check if base offset is too big or too small. We can attempt to resolve
19080 this issue by setting it to the maximum value and seeing if the offsets
19081 still fit. */
19082 if (base_off >= 0x1000)
19083 {
19084 base_off = 0x1000 - 1;
19085 /* We must still make sure that the base offset is aligned with respect
19086 to the address.  But it may not be made any bigger.  */
19087 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19088 }
19089
19090 /* Likewise for the case where the base is too small. */
19091 if (base_off <= -0x1000)
19092 {
19093 base_off = -0x1000 + 1;
19094 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19095 }
19096
19097 /* Offset of the first STP/LDP. */
19098 new_off_1 = off_val_1 - base_off;
19099
19100 /* Offset of the second STP/LDP. */
19101 new_off_3 = off_val_3 - base_off;
19102
19103 /* The offsets must be within the range of the LDP/STP instructions. */
19104 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19105 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19106 return false;
19107
19108 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19109 new_off_1), true);
19110 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19111 new_off_1 + msize), true);
19112 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19113 new_off_3), true);
19114 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19115 new_off_3 + msize), true);
19116
19117 if (!aarch64_mem_pair_operand (mem_1, mode)
19118 || !aarch64_mem_pair_operand (mem_3, mode))
19119 return false;
19120
19121 if (code == ZERO_EXTEND)
19122 {
19123 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19124 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19125 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19126 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19127 }
19128 else if (code == SIGN_EXTEND)
19129 {
19130 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19131 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19132 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19133 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19134 }
19135
19136 if (load)
19137 {
19138 operands[0] = temp_operands[0];
19139 operands[1] = mem_1;
19140 operands[2] = temp_operands[2];
19141 operands[3] = mem_2;
19142 operands[4] = temp_operands[4];
19143 operands[5] = mem_3;
19144 operands[6] = temp_operands[6];
19145 operands[7] = mem_4;
19146 }
19147 else
19148 {
19149 operands[0] = mem_1;
19150 operands[1] = temp_operands[1];
19151 operands[2] = mem_2;
19152 operands[3] = temp_operands[3];
19153 operands[4] = mem_3;
19154 operands[5] = temp_operands[5];
19155 operands[6] = mem_4;
19156 operands[7] = temp_operands[7];
19157 }
19158
19159 /* Emit adjusting instruction. */
19160 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19161 /* Emit ldp/stp instructions. */
19162 t1 = gen_rtx_SET (operands[0], operands[1]);
19163 t2 = gen_rtx_SET (operands[2], operands[3]);
19164 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19165 t1 = gen_rtx_SET (operands[4], operands[5]);
19166 t2 = gen_rtx_SET (operands[6], operands[7]);
19167 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19168 return true;
19169 }
19170
19171 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19172 it isn't worth branching around empty masked ops (including masked
19173 stores). */
19174
19175 static bool
19176 aarch64_empty_mask_is_expensive (unsigned)
19177 {
19178 return false;
19179 }
19180
19181 /* Return true if a pseudo register should be created and used to hold
19182 the GOT address for PIC code.  */
19183
19184 bool
19185 aarch64_use_pseudo_pic_reg (void)
19186 {
19187 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19188 }
19189
19190 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19191
19192 static int
19193 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19194 {
19195 switch (XINT (x, 1))
19196 {
19197 case UNSPEC_GOTSMALLPIC:
19198 case UNSPEC_GOTSMALLPIC28K:
19199 case UNSPEC_GOTTINYPIC:
19200 return 0;
19201 default:
19202 break;
19203 }
19204
19205 return default_unspec_may_trap_p (x, flags);
19206 }
19207
19208
19209 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19210 return the log2 of that value. Otherwise return -1. */
19211
19212 int
19213 aarch64_fpconst_pow_of_2 (rtx x)
19214 {
19215 const REAL_VALUE_TYPE *r;
19216
19217 if (!CONST_DOUBLE_P (x))
19218 return -1;
19219
19220 r = CONST_DOUBLE_REAL_VALUE (x);
19221
19222 if (REAL_VALUE_NEGATIVE (*r)
19223 || REAL_VALUE_ISNAN (*r)
19224 || REAL_VALUE_ISINF (*r)
19225 || !real_isinteger (r, DFmode))
19226 return -1;
19227
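  /* For example, 8.0 yields 3, while 0.5, 3.0 and negative or non-finite
     values yield -1 (via the checks above or exact_log2).  */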
19228 return exact_log2 (real_to_integer (r));
19229 }
19230
19231 /* If X is a vector of equal CONST_DOUBLE values and that value is
19232 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19233
19234 int
19235 aarch64_vec_fpconst_pow_of_2 (rtx x)
19236 {
19237 int nelts;
19238 if (GET_CODE (x) != CONST_VECTOR
19239 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19240 return -1;
19241
19242 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19243 return -1;
19244
19245 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19246 if (firstval <= 0)
19247 return -1;
19248
19249 for (int i = 1; i < nelts; i++)
19250 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19251 return -1;
19252
19253 return firstval;
19254 }
19255
19256 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19257 to float.
19258
19259 __fp16 always promotes through this hook.
19260 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19261 through the generic excess precision logic rather than here. */
19262
19263 static tree
19264 aarch64_promoted_type (const_tree t)
19265 {
19266 if (SCALAR_FLOAT_TYPE_P (t)
19267 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19268 return float_type_node;
19269
19270 return NULL_TREE;
19271 }
19272
19273 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19274
19275 static bool
19276 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19277 optimization_type opt_type)
19278 {
19279 switch (op)
19280 {
19281 case rsqrt_optab:
19282 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19283
19284 default:
19285 return true;
19286 }
19287 }
19288
19289 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19290
19291 static unsigned int
19292 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19293 int *offset)
19294 {
19295 /* Polynomial invariant 1 == (VG / 2) - 1. */
19296 gcc_assert (i == 1);
19297 *factor = 2;
19298 *offset = 1;
19299 return AARCH64_DWARF_VG;
19300 }
19301
19302 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19303 if MODE is HFmode, and punt to the generic implementation otherwise. */
19304
19305 static bool
19306 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19307 {
19308 return (mode == HFmode
19309 ? true
19310 : default_libgcc_floating_mode_supported_p (mode));
19311 }
19312
19313 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19314 if MODE is HFmode, and punt to the generic implementation otherwise. */
19315
19316 static bool
19317 aarch64_scalar_mode_supported_p (scalar_mode mode)
19318 {
19319 return (mode == HFmode
19320 ? true
19321 : default_scalar_mode_supported_p (mode));
19322 }
19323
19324 /* Set the value of FLT_EVAL_METHOD.
19325 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19326
19327 0: evaluate all operations and constants, whose semantic type has at
19328 most the range and precision of type float, to the range and
19329 precision of float; evaluate all other operations and constants to
19330 the range and precision of the semantic type;
19331
19332 N, where _FloatN is a supported interchange floating type
19333 evaluate all operations and constants, whose semantic type has at
19334 most the range and precision of _FloatN type, to the range and
19335 precision of the _FloatN type; evaluate all other operations and
19336 constants to the range and precision of the semantic type;
19337
19338 If we have the ARMv8.2-A extensions then we support _Float16 in native
19339 precision, so we should set this to 16. Otherwise, we support the type,
19340 but want to evaluate expressions in float precision, so set this to
19341 0. */
19342
19343 static enum flt_eval_method
19344 aarch64_excess_precision (enum excess_precision_type type)
19345 {
19346 switch (type)
19347 {
19348 case EXCESS_PRECISION_TYPE_FAST:
19349 case EXCESS_PRECISION_TYPE_STANDARD:
19350 /* We can calculate either in 16-bit range and precision or
19351 32-bit range and precision. Make that decision based on whether
19352 we have native support for the ARMv8.2-A 16-bit floating-point
19353 instructions or not. */
19354 return (TARGET_FP_F16INST
19355 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19356 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19357 case EXCESS_PRECISION_TYPE_IMPLICIT:
19358 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19359 default:
19360 gcc_unreachable ();
19361 }
19362 return FLT_EVAL_METHOD_UNPREDICTABLE;
19363 }
19364
19365 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19366 scheduled for speculative execution. Reject the long-running division
19367 and square-root instructions. */
19368
19369 static bool
19370 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19371 {
19372 switch (get_attr_type (insn))
19373 {
19374 case TYPE_SDIV:
19375 case TYPE_UDIV:
19376 case TYPE_FDIVS:
19377 case TYPE_FDIVD:
19378 case TYPE_FSQRTS:
19379 case TYPE_FSQRTD:
19380 case TYPE_NEON_FP_SQRT_S:
19381 case TYPE_NEON_FP_SQRT_D:
19382 case TYPE_NEON_FP_SQRT_S_Q:
19383 case TYPE_NEON_FP_SQRT_D_Q:
19384 case TYPE_NEON_FP_DIV_S:
19385 case TYPE_NEON_FP_DIV_D:
19386 case TYPE_NEON_FP_DIV_S_Q:
19387 case TYPE_NEON_FP_DIV_D_Q:
19388 return false;
19389 default:
19390 return true;
19391 }
19392 }
19393
19394 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19395
19396 static int
19397 aarch64_compute_pressure_classes (reg_class *classes)
19398 {
19399 int i = 0;
19400 classes[i++] = GENERAL_REGS;
19401 classes[i++] = FP_REGS;
19402 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19403 registers need to go in PR_LO_REGS at some point during their
19404 lifetime. Splitting it into two halves has the effect of making
19405 all predicates count against PR_LO_REGS, so that we try whenever
19406 possible to restrict the number of live predicates to 8. This
19407 greatly reduces the amount of spilling in certain loops. */
19408 classes[i++] = PR_LO_REGS;
19409 classes[i++] = PR_HI_REGS;
19410 return i;
19411 }
19412
19413 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19414
19415 static bool
19416 aarch64_can_change_mode_class (machine_mode from,
19417 machine_mode to, reg_class_t)
19418 {
19419 if (BYTES_BIG_ENDIAN)
19420 {
19421 bool from_sve_p = aarch64_sve_data_mode_p (from);
19422 bool to_sve_p = aarch64_sve_data_mode_p (to);
19423
19424 /* Don't allow changes between SVE data modes and non-SVE modes.
19425 See the comment at the head of aarch64-sve.md for details. */
19426 if (from_sve_p != to_sve_p)
19427 return false;
19428
19429 /* Don't allow changes in element size: lane 0 of the new vector
19430 would not then be lane 0 of the old vector. See the comment
19431 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19432 description.
19433
19434 In the worst case, this forces a register to be spilled in
19435 one mode and reloaded in the other, which handles the
19436 endianness correctly. */
19437 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19438 return false;
19439 }
19440 return true;
19441 }
19442
19443 /* Implement TARGET_EARLY_REMAT_MODES. */
19444
19445 static void
19446 aarch64_select_early_remat_modes (sbitmap modes)
19447 {
19448 /* SVE values are not normally live across a call, so it should be
19449 worth doing early rematerialization even in VL-specific mode. */
19450 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19451 {
19452 machine_mode mode = (machine_mode) i;
19453 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19454 if (vec_flags & VEC_ANY_SVE)
19455 bitmap_set_bit (modes, i);
19456 }
19457 }
19458
19459 /* Override the default target speculation_safe_value. */
19460 static rtx
19461 aarch64_speculation_safe_value (machine_mode mode,
19462 rtx result, rtx val, rtx failval)
19463 {
19464 /* Maybe we should warn if falling back to hard barriers. They are
19465 likely to be noticeably more expensive than the alternative below. */
19466 if (!aarch64_track_speculation)
19467 return default_speculation_safe_value (mode, result, val, failval);
19468
19469 if (!REG_P (val))
19470 val = copy_to_mode_reg (mode, val);
19471
19472 if (!aarch64_reg_or_zero (failval, mode))
19473 failval = copy_to_mode_reg (mode, failval);
19474
19475 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19476 return result;
19477 }
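
/* Illustrative use of the generic builtin that ends up here, with a
   hypothetical bounds-checked load:

     int
     load_checked (int *array, unsigned int i, unsigned int n)
     {
       if (i < n)
         return array[__builtin_speculation_safe_value (i, 0)];
       return 0;
     }

   With -mtrack-speculation the index is forced to the fail value (0) on a
   mispredicted path via the despeculate_copy sequence emitted above;
   otherwise the default expansion falls back to the hard barrier mentioned
   in the comment at the top of this function.  */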
19478
19479 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19480 Look into the tuning structure for an estimate.
19481 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19482 Advanced SIMD 128 bits. */
19483
19484 static HOST_WIDE_INT
19485 aarch64_estimated_poly_value (poly_int64 val)
19486 {
19487 enum aarch64_sve_vector_bits_enum width_source
19488 = aarch64_tune_params.sve_width;
19489
19490 /* If the tuning structure provides no estimate, use the default. */
19491 if (width_source == SVE_SCALABLE)
19492 return default_estimated_poly_value (val);
19493
19494 HOST_WIDE_INT over_128 = width_source - 128;
19495 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19496 }
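
/* Worked example: if the tuning structure reports sve_width == SVE_256,
   over_128 is 128, so a poly_int64 of (16, 16) -- the size in bytes of one
   VNx16QI vector -- is estimated as 16 + 16 * 128 / 128 == 32, i.e. a
   256-bit vector.  */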
19497
19498
19499 /* Return true for types that could be supported as SIMD return or
19500 argument types. */
19501
19502 static bool
19503 supported_simd_type (tree t)
19504 {
19505 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19506 {
19507 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19508 return s == 1 || s == 2 || s == 4 || s == 8;
19509 }
19510 return false;
19511 }
19512
19513 /* Return true for types that currently are supported as SIMD return
19514 or argument types. */
19515
19516 static bool
19517 currently_supported_simd_type (tree t, tree b)
19518 {
19519 if (COMPLEX_FLOAT_TYPE_P (t))
19520 return false;
19521
19522 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19523 return false;
19524
19525 return supported_simd_type (t);
19526 }
19527
19528 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19529
19530 static int
19531 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19532 struct cgraph_simd_clone *clonei,
19533 tree base_type, int num)
19534 {
19535 tree t, ret_type, arg_type;
19536 unsigned int elt_bits, vec_bits, count;
19537
19538 if (!TARGET_SIMD)
19539 return 0;
19540
19541 if (clonei->simdlen
19542 && (clonei->simdlen < 2
19543 || clonei->simdlen > 1024
19544 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19545 {
19546 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19547 "unsupported simdlen %d", clonei->simdlen);
19548 return 0;
19549 }
19550
19551 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19552 if (TREE_CODE (ret_type) != VOID_TYPE
19553 && !currently_supported_simd_type (ret_type, base_type))
19554 {
19555 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19556 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19557 "GCC does not currently support mixed size types "
19558 "for %<simd%> functions");
19559 else if (supported_simd_type (ret_type))
19560 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19561 "GCC does not currently support return type %qT "
19562 "for %<simd%> functions", ret_type);
19563 else
19564 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19565 "unsupported return type %qT for %<simd%> functions",
19566 ret_type);
19567 return 0;
19568 }
19569
19570 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19571 {
19572 arg_type = TREE_TYPE (t);
19573
19574 if (!currently_supported_simd_type (arg_type, base_type))
19575 {
19576 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19577 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19578 "GCC does not currently support mixed size types "
19579 "for %<simd%> functions");
19580 else
19581 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19582 "GCC does not currently support argument type %qT "
19583 "for %<simd%> functions", arg_type);
19584 return 0;
19585 }
19586 }
19587
19588 clonei->vecsize_mangle = 'n';
19589 clonei->mask_mode = VOIDmode;
19590 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19591 if (clonei->simdlen == 0)
19592 {
19593 count = 2;
19594 vec_bits = (num == 0 ? 64 : 128);
19595 clonei->simdlen = vec_bits / elt_bits;
19596 }
19597 else
19598 {
19599 count = 1;
19600 vec_bits = clonei->simdlen * elt_bits;
19601 if (vec_bits != 64 && vec_bits != 128)
19602 {
19603 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19604 "GCC does not currently support simdlen %d for type %qT",
19605 clonei->simdlen, base_type);
19606 return 0;
19607 }
19608 }
19609 clonei->vecsize_int = vec_bits;
19610 clonei->vecsize_float = vec_bits;
19611 return count;
19612 }
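
/* For example, given a hypothetical declaration

     #pragma omp declare simd
     float f (float x);

   with no simdlen clause, the base type is 32 bits wide, so the code above
   requests two clones: a 64-bit one with simdlen 2 (num == 0) and a 128-bit
   one with simdlen 4 (num == 1).  */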
19613
19614 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19615
19616 static void
19617 aarch64_simd_clone_adjust (struct cgraph_node *node)
19618 {
19619 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19620 use the correct ABI. */
19621
19622 tree t = TREE_TYPE (node->decl);
19623 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19624 TYPE_ATTRIBUTES (t));
19625 }
19626
19627 /* Implement TARGET_SIMD_CLONE_USABLE. */
19628
19629 static int
19630 aarch64_simd_clone_usable (struct cgraph_node *node)
19631 {
19632 switch (node->simdclone->vecsize_mangle)
19633 {
19634 case 'n':
19635 if (!TARGET_SIMD)
19636 return -1;
19637 return 0;
19638 default:
19639 gcc_unreachable ();
19640 }
19641 }
19642
19643 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19644
19645 static int
19646 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19647 {
19648 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19649 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19650 return 0;
19651 return 1;
19652 }
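
/* For illustration: with the check above, assigning a hypothetical

     __attribute__ ((aarch64_vector_pcs)) void f (void);

   to a plain "void (*p) (void)" is flagged as an incompatible pointer
   assignment, since the two function types follow different calling
   conventions.  */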
19653
19654 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19655
19656 static const char *
19657 aarch64_get_multilib_abi_name (void)
19658 {
19659 if (TARGET_BIG_END)
19660 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19661 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19662 }
19663
19664 /* Implement TARGET_STACK_PROTECT_GUARD. Use the default implementation
19665 when the guard is based on a global variable; otherwise return a null
19666 tree. */
19667 static tree
19668 aarch64_stack_protect_guard (void)
19669 {
19670 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19671 return default_stack_protect_guard ();
19672
19673 return NULL_TREE;
19674 }
19675
19676 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19677 section at the end if needed. */
19678 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19679 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19680 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19681 void
19682 aarch64_file_end_indicate_exec_stack ()
19683 {
19684 file_end_indicate_exec_stack ();
19685
19686 unsigned feature_1_and = 0;
19687 if (aarch64_bti_enabled ())
19688 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19689
19690 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19691 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19692
19693 if (feature_1_and)
19694 {
19695 /* Generate .note.gnu.property section. */
19696 switch_to_section (get_section (".note.gnu.property",
19697 SECTION_NOTYPE, NULL));
19698
19699 /* PT_NOTE header: namesz, descsz, type.
19700 namesz = 4 ("GNU\0")
19701 descsz = 16 (Size of the program property array)
19702 [(12 + padding) * Number of array elements]
19703 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19704 assemble_align (POINTER_SIZE);
19705 assemble_integer (GEN_INT (4), 4, 32, 1);
19706 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19707 assemble_integer (GEN_INT (5), 4, 32, 1);
19708
19709 /* PT_NOTE name. */
19710 assemble_string ("GNU", 4);
19711
19712 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19713 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19714 datasz = 4
19715 data = feature_1_and. */
19716 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19717 assemble_integer (GEN_INT (4), 4, 32, 1);
19718 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19719
19720 /* Pad the size of the note to the required alignment. */
19721 assemble_align (POINTER_SIZE);
19722 }
19723 }
19724 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19725 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19726 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
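
/* For illustration, when both BTI and return-address signing are enabled
   the note emitted above looks roughly like:

	.section .note.gnu.property
	.align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz (ROUND_UP (12, 8))
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3		// pad the descriptor to 8 bytes.  */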
19727
19728 /* Target-specific selftests. */
19729
19730 #if CHECKING_P
19731
19732 namespace selftest {
19733
19734 /* Selftest for the RTL loader.
19735 Verify that the RTL loader copes with a dump from
19736 print_rtx_function. This is essentially just a test that class
19737 function_reader can handle a real dump, but it also verifies
19738 that lookup_reg_by_dump_name correctly handles hard regs.
19739 The presence of hard reg names in the dump means that the test is
19740 target-specific, hence it is in this file. */
19741
19742 static void
19743 aarch64_test_loading_full_dump ()
19744 {
19745 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19746
19747 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19748
19749 rtx_insn *insn_1 = get_insn_by_uid (1);
19750 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19751
19752 rtx_insn *insn_15 = get_insn_by_uid (15);
19753 ASSERT_EQ (INSN, GET_CODE (insn_15));
19754 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19755
19756 /* Verify crtl->return_rtx. */
19757 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19758 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19759 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19760 }
19761
19762 /* Run all target-specific selftests. */
19763
19764 static void
19765 aarch64_run_selftests (void)
19766 {
19767 aarch64_test_loading_full_dump ();
19768 }
19769
19770 } // namespace selftest
19771
19772 #endif /* #if CHECKING_P */
19773
19774 #undef TARGET_STACK_PROTECT_GUARD
19775 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19776
19777 #undef TARGET_ADDRESS_COST
19778 #define TARGET_ADDRESS_COST aarch64_address_cost
19779
19780 /* This hook determines whether unnamed bitfields affect the alignment
19781 of the containing structure. The hook returns true if the structure
19782 should inherit the alignment requirements of an unnamed bitfield's
19783 type. */
19784 #undef TARGET_ALIGN_ANON_BITFIELD
19785 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19786
19787 #undef TARGET_ASM_ALIGNED_DI_OP
19788 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19789
19790 #undef TARGET_ASM_ALIGNED_HI_OP
19791 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19792
19793 #undef TARGET_ASM_ALIGNED_SI_OP
19794 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19795
19796 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19797 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19798 hook_bool_const_tree_hwi_hwi_const_tree_true
19799
19800 #undef TARGET_ASM_FILE_START
19801 #define TARGET_ASM_FILE_START aarch64_start_file
19802
19803 #undef TARGET_ASM_OUTPUT_MI_THUNK
19804 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19805
19806 #undef TARGET_ASM_SELECT_RTX_SECTION
19807 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19808
19809 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19810 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19811
19812 #undef TARGET_BUILD_BUILTIN_VA_LIST
19813 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19814
19815 #undef TARGET_CALLEE_COPIES
19816 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19817
19818 #undef TARGET_CAN_ELIMINATE
19819 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19820
19821 #undef TARGET_CAN_INLINE_P
19822 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19823
19824 #undef TARGET_CANNOT_FORCE_CONST_MEM
19825 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19826
19827 #undef TARGET_CASE_VALUES_THRESHOLD
19828 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19829
19830 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19831 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19832
19833 /* Only the least significant bit is used for initialization guard
19834 variables. */
19835 #undef TARGET_CXX_GUARD_MASK_BIT
19836 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19837
19838 #undef TARGET_C_MODE_FOR_SUFFIX
19839 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19840
19841 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19842 #undef TARGET_DEFAULT_TARGET_FLAGS
19843 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19844 #endif
19845
19846 #undef TARGET_CLASS_MAX_NREGS
19847 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19848
19849 #undef TARGET_BUILTIN_DECL
19850 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19851
19852 #undef TARGET_BUILTIN_RECIPROCAL
19853 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19854
19855 #undef TARGET_C_EXCESS_PRECISION
19856 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19857
19858 #undef TARGET_EXPAND_BUILTIN
19859 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19860
19861 #undef TARGET_EXPAND_BUILTIN_VA_START
19862 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19863
19864 #undef TARGET_FOLD_BUILTIN
19865 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19866
19867 #undef TARGET_FUNCTION_ARG
19868 #define TARGET_FUNCTION_ARG aarch64_function_arg
19869
19870 #undef TARGET_FUNCTION_ARG_ADVANCE
19871 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19872
19873 #undef TARGET_FUNCTION_ARG_BOUNDARY
19874 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19875
19876 #undef TARGET_FUNCTION_ARG_PADDING
19877 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19878
19879 #undef TARGET_GET_RAW_RESULT_MODE
19880 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19881 #undef TARGET_GET_RAW_ARG_MODE
19882 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19883
19884 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19885 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19886
19887 #undef TARGET_FUNCTION_VALUE
19888 #define TARGET_FUNCTION_VALUE aarch64_function_value
19889
19890 #undef TARGET_FUNCTION_VALUE_REGNO_P
19891 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19892
19893 #undef TARGET_GIMPLE_FOLD_BUILTIN
19894 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19895
19896 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19897 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19898
19899 #undef TARGET_INIT_BUILTINS
19900 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19901
19902 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19903 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19904 aarch64_ira_change_pseudo_allocno_class
19905
19906 #undef TARGET_LEGITIMATE_ADDRESS_P
19907 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19908
19909 #undef TARGET_LEGITIMATE_CONSTANT_P
19910 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19911
19912 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19913 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19914 aarch64_legitimize_address_displacement
19915
19916 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19917 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19918
19919 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19920 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19921 aarch64_libgcc_floating_mode_supported_p
19922
19923 #undef TARGET_MANGLE_TYPE
19924 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19925
19926 #undef TARGET_MEMORY_MOVE_COST
19927 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19928
19929 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19930 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19931
19932 #undef TARGET_MUST_PASS_IN_STACK
19933 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19934
19935 /* This target hook should return true if accesses to volatile bitfields
19936 should use the narrowest mode possible. It should return false if these
19937 accesses should use the bitfield container type. */
19938 #undef TARGET_NARROW_VOLATILE_BITFIELD
19939 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19940
19941 #undef TARGET_OPTION_OVERRIDE
19942 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19943
19944 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19945 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19946 aarch64_override_options_after_change
19947
19948 #undef TARGET_OPTION_SAVE
19949 #define TARGET_OPTION_SAVE aarch64_option_save
19950
19951 #undef TARGET_OPTION_RESTORE
19952 #define TARGET_OPTION_RESTORE aarch64_option_restore
19953
19954 #undef TARGET_OPTION_PRINT
19955 #define TARGET_OPTION_PRINT aarch64_option_print
19956
19957 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19958 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19959
19960 #undef TARGET_SET_CURRENT_FUNCTION
19961 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19962
19963 #undef TARGET_PASS_BY_REFERENCE
19964 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19965
19966 #undef TARGET_PREFERRED_RELOAD_CLASS
19967 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19968
19969 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19970 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19971
19972 #undef TARGET_PROMOTED_TYPE
19973 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19974
19975 #undef TARGET_SECONDARY_RELOAD
19976 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19977
19978 #undef TARGET_SHIFT_TRUNCATION_MASK
19979 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19980
19981 #undef TARGET_SETUP_INCOMING_VARARGS
19982 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19983
19984 #undef TARGET_STRUCT_VALUE_RTX
19985 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19986
19987 #undef TARGET_REGISTER_MOVE_COST
19988 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19989
19990 #undef TARGET_RETURN_IN_MEMORY
19991 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19992
19993 #undef TARGET_RETURN_IN_MSB
19994 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19995
19996 #undef TARGET_RTX_COSTS
19997 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19998
19999 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20000 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20001
20002 #undef TARGET_SCHED_ISSUE_RATE
20003 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20004
20005 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20006 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20007 aarch64_sched_first_cycle_multipass_dfa_lookahead
20008
20009 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20010 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20011 aarch64_first_cycle_multipass_dfa_lookahead_guard
20012
20013 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20014 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20015 aarch64_get_separate_components
20016
20017 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20018 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20019 aarch64_components_for_bb
20020
20021 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20022 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20023 aarch64_disqualify_components
20024
20025 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20026 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20027 aarch64_emit_prologue_components
20028
20029 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20030 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20031 aarch64_emit_epilogue_components
20032
20033 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20034 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20035 aarch64_set_handled_components
20036
20037 #undef TARGET_TRAMPOLINE_INIT
20038 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20039
20040 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20041 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20042
20043 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20044 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20045
20046 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20047 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20048 aarch64_builtin_support_vector_misalignment
20049
20050 #undef TARGET_ARRAY_MODE
20051 #define TARGET_ARRAY_MODE aarch64_array_mode
20052
20053 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20054 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20055
20056 #undef TARGET_VECTORIZE_ADD_STMT_COST
20057 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20058
20059 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20060 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20061 aarch64_builtin_vectorization_cost
20062
20063 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20064 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20065
20066 #undef TARGET_VECTORIZE_BUILTINS
20067 #define TARGET_VECTORIZE_BUILTINS
20068
20069 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20070 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20071 aarch64_builtin_vectorized_function
20072
20073 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20074 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20075 aarch64_autovectorize_vector_sizes
20076
20077 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20078 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20079 aarch64_atomic_assign_expand_fenv
20080
20081 /* Section anchor support. */
20082
20083 #undef TARGET_MIN_ANCHOR_OFFSET
20084 #define TARGET_MIN_ANCHOR_OFFSET -256
20085
20086 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20087 byte offset; we can do much more for larger data types, but have no way
20088 to determine the size of the access. We assume accesses are aligned. */
20089 #undef TARGET_MAX_ANCHOR_OFFSET
20090 #define TARGET_MAX_ANCHOR_OFFSET 4095
20091
20092 #undef TARGET_VECTOR_ALIGNMENT
20093 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20094
20095 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20096 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20097 aarch64_vectorize_preferred_vector_alignment
20098 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20099 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20100 aarch64_simd_vector_alignment_reachable
20101
20102 /* vec_perm support. */
20103
20104 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20105 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20106 aarch64_vectorize_vec_perm_const
20107
20108 #undef TARGET_VECTORIZE_GET_MASK_MODE
20109 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20110 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20111 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20112 aarch64_empty_mask_is_expensive
20113 #undef TARGET_PREFERRED_ELSE_VALUE
20114 #define TARGET_PREFERRED_ELSE_VALUE \
20115 aarch64_preferred_else_value
20116
20117 #undef TARGET_INIT_LIBFUNCS
20118 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20119
20120 #undef TARGET_FIXED_CONDITION_CODE_REGS
20121 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20122
20123 #undef TARGET_FLAGS_REGNUM
20124 #define TARGET_FLAGS_REGNUM CC_REGNUM
20125
20126 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20127 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20128
20129 #undef TARGET_ASAN_SHADOW_OFFSET
20130 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20131
20132 #undef TARGET_LEGITIMIZE_ADDRESS
20133 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20134
20135 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20136 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20137
20138 #undef TARGET_CAN_USE_DOLOOP_P
20139 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20140
20141 #undef TARGET_SCHED_ADJUST_PRIORITY
20142 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20143
20144 #undef TARGET_SCHED_MACRO_FUSION_P
20145 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20146
20147 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20148 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20149
20150 #undef TARGET_SCHED_FUSION_PRIORITY
20151 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20152
20153 #undef TARGET_UNSPEC_MAY_TRAP_P
20154 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20155
20156 #undef TARGET_USE_PSEUDO_PIC_REG
20157 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20158
20159 #undef TARGET_PRINT_OPERAND
20160 #define TARGET_PRINT_OPERAND aarch64_print_operand
20161
20162 #undef TARGET_PRINT_OPERAND_ADDRESS
20163 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20164
20165 #undef TARGET_OPTAB_SUPPORTED_P
20166 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20167
20168 #undef TARGET_OMIT_STRUCT_RETURN_REG
20169 #define TARGET_OMIT_STRUCT_RETURN_REG true
20170
20171 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20172 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20173 aarch64_dwarf_poly_indeterminate_value
20174
20175 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20176 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20177 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20178
20179 #undef TARGET_HARD_REGNO_NREGS
20180 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20181 #undef TARGET_HARD_REGNO_MODE_OK
20182 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20183
20184 #undef TARGET_MODES_TIEABLE_P
20185 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20186
20187 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20188 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20189 aarch64_hard_regno_call_part_clobbered
20190
20191 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20192 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20193 aarch64_remove_extra_call_preserved_regs
20194
20195 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20196 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20197 aarch64_return_call_with_max_clobbers
20198
20199 #undef TARGET_CONSTANT_ALIGNMENT
20200 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20201
20202 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20203 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20204 aarch64_stack_clash_protection_alloca_probe_range
20205
20206 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20207 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20208
20209 #undef TARGET_CAN_CHANGE_MODE_CLASS
20210 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20211
20212 #undef TARGET_SELECT_EARLY_REMAT_MODES
20213 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20214
20215 #undef TARGET_SPECULATION_SAFE_VALUE
20216 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20217
20218 #undef TARGET_ESTIMATED_POLY_VALUE
20219 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20220
20221 #undef TARGET_ATTRIBUTE_TABLE
20222 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20223
20224 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20225 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20226 aarch64_simd_clone_compute_vecsize_and_simdlen
20227
20228 #undef TARGET_SIMD_CLONE_ADJUST
20229 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20230
20231 #undef TARGET_SIMD_CLONE_USABLE
20232 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20233
20234 #undef TARGET_COMP_TYPE_ATTRIBUTES
20235 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20236
20237 #undef TARGET_GET_MULTILIB_ABI_NAME
20238 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20239
20240 #if CHECKING_P
20241 #undef TARGET_RUN_TARGET_SELFTESTS
20242 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20243 #endif /* #if CHECKING_P */
20244
20245 #undef TARGET_ASM_POST_CFI_STARTPROC
20246 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20247
20248 struct gcc_target targetm = TARGET_INITIALIZER;
20249
20250 #include "gt-aarch64.h"